Code Example #1
import urllib2
import urllister


def get_tieba_page(page):
    # fetch the page and feed the HTML to URLLister, which collects the links
    req = urllib2.Request(page)
    web = urllib2.urlopen(req)
    parser = urllister.URLLister()
    parser.feed(web.read())
    web.close()
    parser.close()
    return parser
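
All five examples feed HTML into urllister.URLLister and then read attributes such as urls. The urllister module itself is not shown on this page; the sketch below is a minimal, assumed implementation in the style of the classic Dive Into Python urllister (Python 2, sgmllib), which simply collects the href value of every <a> tag. The tags and linksList attributes used in examples #3 and #5 would be project-specific extensions of such a class.

# Minimal sketch of urllister.URLLister, assuming the classic
# Dive Into Python implementation; the module actually used by these
# projects may differ (e.g. it may also expose `tags` or `linksList`).
from sgmllib import SGMLParser


class URLLister(SGMLParser):
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []          # href values of every <a> tag seen so far

    def start_a(self, attrs):
        # attrs is a list of (name, value) pairs for the <a> tag
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)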
Code Example #2
File: main.py  Project: zgq346712481/Webcrawle
def getURL(self, url):  # getURL(url) collects the URLs found in the HTML into the urls list
    try:
        try:
            usock = urllib.urlopen(url)
        except:
            print 'get url exception'
            return []
        parser = urllister.URLLister()
        parser.feed(usock.read())
        usock.close()
        parser.close()
        urls = parser.urls
        return urls
    except Exception, e:
        print u"getURL", e
        return 0
Code Example #3
#!/usr/bin/python2
# -*-coding:Utf-8 -*
import urllib, urllister

usock = urllib.urlopen(
    "http://www.lefigaro.fr/sante/2011/06/15/01004-20110615ARTFIG00354-vers-une-vache-capable-de-produire-du-lait-maternel.php"
)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()
for url in parser.urls:
    print url
print parser.tags

# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
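
The snippets on this page are Python 2 (urllib.urlopen, print statements, an sgmllib-based urllister). For comparison, a rough Python 3 sketch of the same fetch-and-list-links pattern as example #3, using only the standard library (urllib.request and html.parser), could look like this; the LinkLister class name is an arbitrary stand-in, not part of the original urllister module.

# Rough Python 3 sketch of example #3's pattern (standard library only);
# LinkLister is a stand-in, not part of the original urllister module.
from html.parser import HTMLParser
from urllib.request import urlopen


class LinkLister(HTMLParser):
    def __init__(self):
        super().__init__()
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self.urls.extend(v for k, v in attrs if k == 'href' and v)


page = urlopen("http://www.lefigaro.fr/sante/2011/06/15/01004-20110615ARTFIG00354-vers-une-vache-capable-de-produire-du-lait-maternel.php")
parser = LinkLister()
parser.feed(page.read().decode('utf-8', errors='replace'))
page.close()
for url in parser.urls:
    print(url)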
Code Example #4
def prezzoLib(s):
    # cercaprezzo, cercaTitolo, cercaAutore and session are defined elsewhere
    # in the project; urllib and urllister are imported at the top of the file.
    one = False
    ris = -1
    good = False
    max_price = 999.99

    query = s.replace(' ', '+').encode("utf-8")
    url = "http://www.libreriauniversitaria.it/ricerca/query/" + query

    try:
        sock = urllib.urlopen(url)
    except IOError:
        return -1

    parser = urllister.URLLister()

    # keep only the search terms that are at least two characters long
    z = [w for w in s.split(' ') if len(w) >= 2]
    l = len(z)

    parser.feed(sock.read())
    if l == 0:
        sock.close()
        parser.close()
        return -1

    if l == 1:
        # single-word query: inspect the first few matching result pages
        eee = 0
        for i in parser.urls:
            if z[0] in i and "query" not in i and "libri" not in i:
                eee = eee + 1
                ris = cercaprezzo(i)
                Tp = cercaTitolo(i)
                if float(ris) < max_price and ris != -1:
                    session['TitoloLIB'] = Tp
                    max_price = float(ris)
                if eee == 4:
                    break
                ris = -1
        sock.close()
        parser.close()
        return max_price

    # multi-word query: sort the terms by length and scan the result links
    z.sort(key=len)
    second = False
    PX = 0       # number of candidate result pages whose price was fetched
    count = 0
    ki = 0
    kount = 0    # total number of links scanned
    AF = None    # author of the first accepted result
    for i in parser.urls:
        kount += 1
        if l == 2:
            if z[0] in i and z[1] in i and "query" not in i and "libri" not in i:
                ris = cercaprezzo(i)
                PX = PX + 1
        else:
            for j in range(l - 1, 0, -1):
                if z[j] in i and "query" not in i and "libri" not in i:
                    count = count + 1
                    if count == l - 2:
                        ris = cercaprezzo(i)
                        PX = PX + 1
                        ki = PX
                        count = 0
                        break
            count = 0

        jk = float(ris)
        ris = -1
        if PX >= ki - 1 and jk < max_price and jk != 0.0 and jk != -1:
            newT = cercaTitolo(i)
            newt = newT.lower()
            print(newT)

            # the result is "good" only if every search term appears in the title
            potto = 0
            for y in range(len(z)):
                if z[y].lower() in newt:
                    potto += 1
                    if potto == len(z):
                        good = True
            potto = 0
            Aut = cercaAutore(i)

            if not one:
                if good:
                    # first acceptable result: remember price, title and author
                    max_price = jk
                    session['TitoloLIB'] = newT
                    good = False
                    one = True
                    AF = Aut
                    if PX == 10 or kount == 150:
                        break
                    continue
            elif not second and Aut == AF:
                if good:
                    # second result by the same author: keep the lower price
                    max_price = jk
                    session['TitoloLIB'] = newT
                    good = False
                    second = True
                    if PX == 10 or kount == 150:
                        break
                    continue
            else:
                # later results must match the query (or be another edition)
                # and come from the same author
                if good and (newt == s or ("ediz. integrale" in newt and s in newt) or ("edizione" in newt and s in newt)) and Aut == AF:
                    max_price = jk
                    session['TitoloLIB'] = newT
                    good = False

        if PX == 10 or kount == 150:
            break

    sock.close()
    parser.close()
    return max_price
Code Example #5
def getDataFromRefDB2PubDBs(br_filename):
    """
    Look for published data in RefDB+PubDBs
    and write a file with block:location entries.
    """
    ## get the content of the RefDB2PubDBs map page
    PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
    RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'

    primaryUrl = PubDBCentralUrl_ + RefDBPubDBsmapPhp_
    logging.debug("MapListUrl=" + primaryUrl)

    try:
        sock = urllib.urlopen(primaryUrl)
    except IOError:
        raise RefDBError(primaryUrl)

    parser = urllister.URLLister()
    parser.feed(sock.read())
    sock.close()
    parser.close()

    ##  HREF dataset-discovery.php  (for a dataset/owner)
    ##  HREF PubDBURL for collections.php
    ##  HREF maintainer

    ## parsing the RefDB2PubDBs map to find the published dataset/owner and their PubDB URL

    List_blockreplica = []
    br_file = open(br_filename, 'w')
    ## regexp to get the dataset/owner
    reDsOw = re.compile(r'DSPattern=(\w+)&OwPattern=(\w+)')
    reDsOw2 = re.compile(r'DSPattern=(\w+\-\w+)&OwPattern=(\w+)')
    reDsOw3 = re.compile(r'DSPattern=(\w+\.\w+\.\w+)&OwPattern=(\w+)')
    ## mu04_mu_pt10._eta0.0
    recollid = re.compile(r'collid=(\d+)')
    count = 0
    for url in parser.linksList:
        count = count + 1
        if string.find(url, 'dataset-discovery.php') != -1:
            #print "trying with url  "+url
            try:
                dsow = reDsOw.search(string.strip(url))
                ds = dsow.group(1)
                ow = dsow.group(2)
            except:
                try:
                    dsow = reDsOw2.search(string.strip(url))
                    ds = dsow.group(1)
                    ow = dsow.group(2)
                    pass
                except:
                    dsow = reDsOw3.search(string.strip(url))
                    ds = dsow.group(1)
                    ow = dsow.group(2)
                    pass
            ## PubDB URL is the next URL
            puburl = parser.linksList[count]
            #print "Dataset/Owner is "+ds+"/"+ow+" PubDB URL is "+puburl
            logging.debug("Dataset/Owner is " + ds + "/" + ow +
                          " PubDB URL is " + puburl)
            end = string.rfind(puburl, '/')
            collid = recollid.search(string.strip(puburl)).group(1)

            ## get location from PubDB
            if (checkPubDBversion(puburl[:end + 1])):
                ## for PubDB V4
                locationList = getPubDBInfo(puburl[:end + 1], collid)
            else:
                ## for PubDB V3
                collid = recollid.search(string.strip(puburl)).group(1)
                #print ' collid '+collid+' Do nothing for PubDB V3....'
                PubDBError(puburl, ' still PubDB V3')
                locationList = ['Null']

            ## save block-replica to a file
            block = ow + '/' + ds
            for location in locationList:
                blockreplica = block + ':' + location
                List_blockreplica.append(blockreplica)
                br_file.write(blockreplica + '\n')

    br_file.close()
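
The three DSPattern/OwPattern regular expressions in example #5 are fallbacks for dataset names made of plain word characters, of hyphenated words, or of dot-separated words. A small standalone demonstration of that fallback chain (the query string below is made up for illustration and is not real RefDB output):

# Demonstration of the DSPattern/OwPattern regex fallbacks from example #5.
# The URL below is a hypothetical example, not real RefDB output.
import re

reDsOw = re.compile(r'DSPattern=(\w+)&OwPattern=(\w+)')
reDsOw2 = re.compile(r'DSPattern=(\w+\-\w+)&OwPattern=(\w+)')
reDsOw3 = re.compile(r'DSPattern=(\w+\.\w+\.\w+)&OwPattern=(\w+)')

url = 'dataset-discovery.php?DSPattern=mu04_mu_pt10.eta0.0&OwPattern=DST871_2x1033PU'

# try the plain pattern first, then the hyphenated and dotted variants,
# mirroring the nested try/except blocks in getDataFromRefDB2PubDBs
for pattern in (reDsOw, reDsOw2, reDsOw3):
    dsow = pattern.search(url)
    if dsow:
        print 'dataset =', dsow.group(1)   # mu04_mu_pt10.eta0.0
        print 'owner   =', dsow.group(2)   # DST871_2x1033PU
        break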