def get_tieba_page(page):
    # Fetch a Tieba page and return the parser holding the links found in it.
    req = urllib2.Request(page)
    web = urllib2.urlopen(req)
    parser = urllister.URLLister()
    parser.feed(web.read())
    web.close()
    parser.close()
    return parser
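All of the snippets here lean on a local `urllister` module that is not shown. A minimal sketch of such a module, assuming the classic sgmllib-based link collector (Python 2 only); the `tags` and `linksList` attributes used in later snippets would be project-specific extensions on top of this:

# urllister.py -- minimal sketch, not the project's actual module.
from sgmllib import SGMLParser

class URLLister(SGMLParser):
    def reset(self):
        # Called from SGMLParser.__init__; start with an empty URL list.
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # Collect the href attribute of every <a> tag that is fed in.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)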
def getURL(self, url):
    # getURL(url) collects the URLs found in the page's HTML into the urls list.
    try:
        try:
            usock = urllib.urlopen(url)
        except:
            print 'get url exception'
            return []
        parser = urllister.URLLister()
        parser.feed(usock.read())
        usock.close()
        parser.close()
        urls = parser.urls
        return urls
    except Exception, e:
        print u"getURL", e
        return 0
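Note that this method signals failure in two different ways: an empty list when the fetch itself fails and 0 for any other exception, so a caller has to normalize the result before iterating. A minimal sketch of such a caller (the `spider` object is hypothetical):

# Hypothetical caller; `spider` stands for an instance of whatever class
# defines getURL(). Treat both failure values ([] and 0) as "no links".
links = spider.getURL('http://example.com/') or []
for link in links:
    print link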
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import urllib, urllister

usock = urllib.urlopen(
    "http://www.lefigaro.fr/sante/2011/06/15/01004-20110615ARTFIG00354-vers-une-vache-capable-de-produire-du-lait-maternel.php"
)
parser = urllister.URLLister()
parser.feed(usock.read())
usock.close()
parser.close()
for url in parser.urls:
    print url
print parser.tags

# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
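sgmllib and the print statement are gone in Python 3, so this script would not run there. A rough Python 3 sketch of the same link-extraction idea, built only on the standard library (html.parser and urllib.request); only the `urls` attribute is reproduced, not the `tags` attribute used above:

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Sketch of a Python 3 equivalent, not part of the original code.
from html.parser import HTMLParser
from urllib.request import urlopen


class URLLister(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        # Collect the href attribute of every <a> tag.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.urls.append(value)


usock = urlopen(
    "http://www.lefigaro.fr/sante/2011/06/15/01004-20110615ARTFIG00354-vers-une-vache-capable-de-produire-du-lait-maternel.php"
)
parser = URLLister()
parser.feed(usock.read().decode('utf-8', 'replace'))
usock.close()
parser.close()
for url in parser.urls:
    print(url)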
def prezzoLib(s):
    # Search libreriauniversitaria.it for the title in `s` and return the
    # lowest price found among results matching the search terms.
    one = False
    ris = -1
    good = False
    max = 999.99  # sentinel: "no price found yet"
    x = s.replace(' ', '+')
    x = x.encode("utf-8")
    url = "http://www.libreriauniversitaria.it/ricerca/query/" + x
    try:
        sock = urllib.urlopen(url)
    except IOError:
        return -1
    parser = urllister.URLLister()
    # Drop search tokens shorter than two characters.
    z = [w for w in s.split(' ') if len(w) >= 2]
    l = len(z)
    parser.feed(sock.read())
    if l == 0:
        return -1
    if l == 1:
        # Single search term: take the cheapest of the first four matching results.
        eee = 0
        for i in parser.urls:
            if z[0] in i and "query" not in i and "libri" not in i:
                eee = eee + 1
                ris = cercaprezzo(i)
                Tp = cercaTitolo(i)
                if float(ris) < max and ris != -1:
                    session['TitoloLIB'] = Tp
                    max = float(ris)
                if eee == 4:
                    break
                ris = -1
        return max
    # Several search terms: scan the result links and keep the cheapest result
    # whose title contains every term, preferring results by the same author.
    z.sort(key=len)
    second = False
    PX = 0
    count = 0
    ki = 0
    kount = 0
    for i in parser.urls:
        kount += 1
        if l == 2:
            if z[0] in i and z[1] in i and "query" not in i and "libri" not in i:
                ris = cercaprezzo(i)
                PX = PX + 1
        else:
            for j in range(l - 1, 0, -1):
                if z[j] in i and "query" not in i and "libri" not in i:
                    count = count + 1
                    if count == l - 2:
                        ris = cercaprezzo(i)
                        PX = PX + 1
                        ki = PX
                        count = 0
                        break
            count = 0
        jk = float(ris)
        ris = -1
        if PX >= ki - 1 and jk < max and jk != 0.0 and jk != -1:
            newT = cercaTitolo(i)
            newt = newT.lower()
            print(newT)
            potto = 0
            for y in range(len(z)):
                if z[y].lower() in newt:
                    potto += 1
            if potto == len(z):
                good = True
            potto = 0
            Aut = cercaAutore(i)
            if one == False:
                # First acceptable result: remember its price and author.
                if good:
                    max = jk
                    session['TitoloLIB'] = newT
                    good = False
                    one = True
                    AF = Aut
                if PX == 10 or kount == 150:
                    break
                continue
            elif one == True and second == False and Aut == AF:
                # Second result by the same author: accept a lower price.
                if good:
                    max = jk
                    session['TitoloLIB'] = newT
                    good = False
                    second = True
                if PX == 10 or kount == 150:
                    break
                continue
            else:
                # Later results: only accept exact titles or explicit editions
                # by the same author.
                if good and (newt == s or ("ediz. integrale" in newt and s in newt)
                             or ("edizione" in newt and s in newt)) and Aut == AF:
                    max = jk
                    session['TitoloLIB'] = newT
                    good = False
                if PX == 10 or kount == 150:
                    break
    sock.close()
    parser.close()
    return max
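prezzoLib cannot run on its own: cercaprezzo, cercaTitolo, cercaAutore and the session object (presumably a web framework's request session) are defined elsewhere in the project. A hedged sketch of how it appears to be meant to be called, under those assumptions:

# Hypothetical call site; the helpers and `session` come from the surrounding
# project. prezzoLib() returns -1 on errors and leaves the 999.99 sentinel
# untouched when no result matched, so both values mean "no price found".
prezzo = prezzoLib(u'divina commedia')
if prezzo in (-1, 999.99):
    print 'no price found'
else:
    print session['TitoloLIB'], prezzo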
def getDataFromRefDB2PubDBs(br_filename):
    """
    Look for published data in RefDB+PubDBs and write a file with block:location.
    """
    ## get the content of the RefDB2PubDBs map page
    PubDBCentralUrl_ = 'http://cmsdoc.cern.ch/cms/production/www/PubDB/'
    RefDBPubDBsmapPhp_ = 'GetPublishedCollectionInfoFromRefDB.php'
    primaryUrl = PubDBCentralUrl_ + RefDBPubDBsmapPhp_
    logging.debug("MapListUrl=" + primaryUrl)
    try:
        sock = urllib.urlopen(primaryUrl)
    except IOError:
        raise RefDBError(primaryUrl)
    parser = urllister.URLLister()
    parser.feed(sock.read())
    sock.close()
    parser.close()

    ## The map page contains, in order:
    ## HREF dataset-discovery.php (for a dataset/owner)
    ## HREF PubDBURL for collections.php
    ## HREF maintainer
    ## parse the RefDB2PubDBs map to find the published dataset/owner and their PubDB URL
    List_blockreplica = []
    br_file = open(br_filename, 'w')

    ## regexps to get the dataset/owner
    reDsOw = re.compile(r'DSPattern=(\w+)&OwPattern=(\w+)')
    reDsOw2 = re.compile(r'DSPattern=(\w+\-\w+)&OwPattern=(\w+)')
    reDsOw3 = re.compile(r'DSPattern=(\w+\.\w+\.\w+)&OwPattern=(\w+)')  ## mu04_mu_pt10._eta0.0
    recollid = re.compile(r'collid=(\d+)')

    count = 0
    for url in parser.linksList:
        count = count + 1
        if string.find(url, 'dataset-discovery.php') != -1:
            #print "trying with url "+url
            try:
                dsow = reDsOw.search(string.strip(url))
                ds = dsow.group(1)
                ow = dsow.group(2)
            except:
                try:
                    dsow = reDsOw2.search(string.strip(url))
                    ds = dsow.group(1)
                    ow = dsow.group(2)
                except:
                    dsow = reDsOw3.search(string.strip(url))
                    ds = dsow.group(1)
                    ow = dsow.group(2)
            ## the PubDB URL is the next URL in the list
            puburl = parser.linksList[count]
            #print "Dataset/Owner is "+ds+"/"+ow+" PubDB URL is "+puburl
            logging.debug("Dataset/Owner is " + ds + "/" + ow + " PubDB URL is " + puburl)
            end = string.rfind(puburl, '/')
            collid = recollid.search(string.strip(puburl)).group(1)
            ## get location from PubDB
            if checkPubDBversion(puburl[:end + 1]):
                ## for PubDB V4
                locationList = getPubDBInfo(puburl[:end + 1], collid)
            else:
                ## for PubDB V3
                collid = recollid.search(string.strip(puburl)).group(1)
                #print ' collid '+collid+' Do nothing for PubDB V3....'
                PubDBError(puburl, ' still PubDB V3')
                locationList = ['Null']
            ## save block-replica to the file
            block = ow + '/' + ds
            for location in locationList:
                blockreplica = block + ':' + location
                List_blockreplica.append(blockreplica)
                br_file.write(blockreplica + '\n')
    br_file.close()
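The file produced above holds one block:location pair per line. A small sketch of reading it back into a dictionary keyed by block (the filename is only an example):

# Rebuild {block: [locations]} from the block:location file written above.
# 'blockreplica.txt' is just an example filename.
replicas = {}
for line in open('blockreplica.txt'):
    line = line.strip()
    if not line:
        continue
    block, location = line.split(':', 1)
    replicas.setdefault(block, []).append(location)
print replicas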