def _read_ib_html(fname, table_ref):
    """
    Read one table out of an IB-generated .html report file.

    fname     -- path to the .html report file
    table_ref -- index of the wanted table within the file's table stack

    Returns a pandas dataframe built from that table. Raises Exception
    when the file parses to an empty document.
    """
    with open(fname, 'r') as fh:
        soup = BeautifulSoup(fh.read())

    if not len(soup):
        raise Exception("Empty or non existent html file %s" % fname)

    # Pick out the requested <table> and pull its rows.
    rows = soup.findAll('table')[table_ref].findAll('tr')

    # html rows -> header plus list-of-lists of cell data.
    (headerrow, table_data) = _parse_html_table(rows)
    soup.close()

    # lists -> pandas dataframe
    return _html_table_to_pddataframe(headerrow, table_data)
def _read_ib_html(fname, table_ref):
    """
    Load table number table_ref from the IB report html file fname and
    return it as a pandas dataframe.

    Raises Exception if the parsed document is empty.
    """
    with open(fname, 'r') as file_handle:
        soup = BeautifulSoup(file_handle.read())

    if len(soup) == 0:
        raise Exception("Empty or non existent html file %s" % fname)

    # Locate the requested table within the document's table stack.
    wanted_table = soup.findAll('table')[table_ref]
    table_rows = wanted_table.findAll('tr')

    # Break the <tr> rows down into a header row and the body data.
    headerrow, table_data = _parse_html_table(table_rows)
    soup.close()

    # Assemble the final dataframe.
    main_table = _html_table_to_pddataframe(headerrow, table_data)

    return main_table
def processCaptcha(self, kParam):
    # Drive Google's reCAPTCHA v2 flow for site key kParam: fetch the API
    # question page, fetch the anchor page, scrape out the recaptcha token,
    # then (currently dead code -- see the bare "return" below) post the
    # verification back to linkcaptcha.com.
    vers, language, jsh, questionfile = self._collect_api_info()
    responseQFile = self.openPage(questionfile, buildHref=False)
    if responseQFile is None or responseQFile.getcode() != 200:
        return
    # ___ https://www.google.com/recaptcha/api2/anchor?k=
    gurl = 'https://www.google.com/recaptcha/api2/anchor?k=' + kParam + '&co=aHR0cDovL2xpbmtjYXB0Y2hhLmNvbTo4MA..&hl=' + language + '&v=' + vers + '&size=normal&cb=z6ubp1ln1ecg'
    responseAnchor = self.openPage(gurl, buildHref=False)
    if responseAnchor is None or responseAnchor.getcode() != 200:
        return
    contentAnchor = responseAnchor.read()
    # ___ Initialize BeautifulSoup
    soupAnchor = BeautifulSoup(contentAnchor)
    # NOTE(review): close() is called BEFORE soupAnchor.find() below, which
    # suggests this project's soup.close() does not invalidate the parse
    # tree -- confirm against the local BeautifulSoup wrapper.
    soupAnchor.close()
    # ___ Get the captcha token
    token = soupAnchor.find('input', {'id': 'recaptcha-token'})
    if token is None:
        return
    recaptchaToken = token['value']
    print contentAnchor
    # NOTE(review): everything after this return is unreachable dead code --
    # presumably a debugging short-circuit left in; confirm before removing.
    return
    userverifyUrl = 'https://www.google.com/recaptcha/api2/userverify?k=' + kParam
    # Hard-coded userverify payload; 'bg' is an opaque browser-fingerprint
    # blob captured from a real session.
    data = {
        'v': 'r20171212152908',
        'c': recaptchaToken,
        'response': 'eyJyZXNwb25zZSI6IiIsInMiOiI2ZTVhIn0',
        't': 771,
        'ct': 771,
        'bg': '!QUegR2ZHqovVLlrzLJxBBzBhg5mRh1oHAAAAY1cAAAB-nAOGhu4fgbzGrh0Of91RdQt0tdbJYOeKtFu1_y_v_N4nuCILtj4Cm8f605XqMDHhXJE-k6u9R1qmmgMUX3WJ8wmU439ZzqnyxvMFYewQW02z_gCxsxAUPkMzGaqOat8zy1a48JSgdMe1Bd5QXdJxABqcjfltK52rYsJ8ZePokzTjOZSiNRSm8fI_cU9FOJnjiBjZfLtE8y-a5rUnpAkimAPDXbcwybbu_4J5nvfStYXv4jfI2mFOBKkuNfSNrhrtoXUX83Qv5JbxDNdytuSfzRmq126a46jP-jZIsarKP-Mh9lJTIX4rgE6hxvXWzXBQKF0gwbafsM7yWRfn8_-f_Fv9Y60KV5y7JQK8PfrU546MAGpOmgKhs5nMOPkwwfCKk31IL6rc3deFBG_0TtVG4rGP7ZcDM3w6C0Dd3wIdeEJYpA4yEpgCswaotgnkMpvnoCNgVABLasxkFIOSEzxRpOvNBdo-2z6vLwkNZNOAoMTdL8VWfW5fSbr9qMUJihl5cVHyJm8km3b8HCmI-wkIQXPOVVnFu_tIdHqvKux-InVPGsLgg8eCm8mG7ZphYfsOtezccmYnne9kpduSHRLTUm0tgZqmvEIT0QpyhyYB7RpYzPE2MY43tPoDl0Ap3VsDH_gEvFPpLR0ZLoKZGwmsywLW4NOZxbIPYjWB13i8xdSI3uGrmcYp16jt_-au8TWCkz7eDSc6O77uVw6moB_KbmGmFxNcO-Ob6UhgUHZP3g8dOFcz30SXIeVTiVl8fgsxb6LPuhdf5_x9gE0oYv-91q75TFlB3lC4NIq802g2SouWK72J8S5taQwLY2eVy-Qc4Q5P2q_dV3WraHFBvUeL8kiFwAzOypOfzD3-AEZnKxF3i2MTh7kaTO2cYdmjhhpWMO74YcJYB5ZCQKX8t8JawFZ0pdjFgfTPRK8de9TRM0bI8azbIUNnBtzrKrxrPeP6U8-6NsALgifqhP1PetFc40K0CSPqb-jRDZKT_MrQ66ugTKHyknTRrI0Zqr3uv59g5U7-LBQAh5Yx0iJGdwJU9p0_j_LOilV4GOH0-GDyhTyDzOgMiK9Lml4ErJitZ99bZfZHrjeJXaahhfoTBilxZmf6xMvwCZg_MlXwoGupo4GkEhd9s33SBJ653oRdCpvbQMRIkj77rYBE_ANGIPGtgnnxrl_RIV4ZSCz2MiWIKkF5YjQIHWm_uTM'
    }
    responseVerifyUser = self.postPage(userverifyUrl, data, buildHref=False)
    print responseVerifyUser.read()
    print '------'
    # Hand the token to the target site's verification endpoint.
    post_captcha_href = 'http://linkcaptcha.com/verifyCaptchaNew.php'
    data = {'g-recaptcha-response': token['value']}
    responseToken = self.postPage(post_captcha_href, data, buildHref=False)
    print responseToken
    if responseToken and responseToken.getcode() == 200:
        print responseToken.read()
        responseToken.close()
def run(self): while True: chunk = self.outcoming.get() soup = BeautifulSoup(chunk) print '=====================================' print soup.findAll('title')[0].renderContents() soup.close() self.outcoming.task_done()
def get_all_form(content):
    """
    Return every <form> element found in content (anything whose str()
    yields html), or an empty list when parsing fails for any reason.
    """
    try:
        soup = BeautifulSoup(str(content))
        all_form = soup.findAll('form')
        soup.close()
        return all_form
    except Exception:
        # Deliberate best-effort: a parse failure simply means "no forms".
        # (Fixed: the old "except Exception, e" bound e without ever using it.)
        return []
def run(self):
    """Convert one archive item's html in place, signalling progress via the callback."""
    if self._callback:
        self._callback.update_state("start")

    # Pull the item out of the archive and parse it.
    itemhtml = self._file.open(str(self._item))
    soup = BeautifulSoup(itemhtml)

    # Convert and write the result back under the same item name.
    self._file.writestr(self._item, self._processor.get_converted_html(soup))

    soup.close()
    itemhtml.close()

    if self._callback:
        self._callback.update_state("finish")
def getChildPageScoreAndNum(url):
    """
    Fetch url and scrape its gold score and golder count.

    Returns (score, num) as (float, int), or None when either value is
    missing from the page.
    """
    req = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)

    tag = soup.find('span', {'class': 'Goldnum'})
    if tag is None:  # idiomatic identity test (was "== None")
        return None
    score = float(tag.string)

    # NOTE(review): assumes the Goldnum span always has a sibling node;
    # tag.nextSibling being None would raise AttributeError here -- confirm
    # the page structure guarantees it.
    tag = tag.nextSibling.find('span', {'class': 'Golder'})
    if tag is None:
        return None
    num = int(tag.string)

    soup.close()
    return (score, num)
def getChildPageScoreAndNum(url):
    """
    Download url and pull out the gold score (float) and golder count (int).

    Returns the pair (score, num), or None when either element cannot be
    located on the page.
    """
    req = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)

    tag = soup.find('span', {'class': 'Goldnum'})
    if tag is None:  # identity test replaces "== None"
        return None
    score = float(tag.string)

    # NOTE(review): presumes tag.nextSibling is a Tag; if the span has no
    # following sibling this raises AttributeError -- verify the layout.
    tag = tag.nextSibling.find('span', {'class': 'Golder'})
    if tag is None:
        return None
    num = int(tag.string)

    soup.close()
    return (score, num)
def parse_page(self, url): f = urllib.urlopen(url) if f.getcode() != 200: print "Could not fetch participants page %s!" % url raise FetchParticipantException(url) soup = BeautifulSoup(f) links = map(lambda tag: tag.a.get("href"), soup.findAll("div", "news")); participants = [] for link in links: try: parsed_p = self.parse_participant(link) participants.append(parsed_p) except FetchParticipantException: continue soup.close() return participants
def parse_participant(self, url):
    """
    Fetch a participant page and build a Participant from it.

    Raises FetchParticipantException when the page lacks the expected
    participant block.
    """
    raw = urllib.urlopen(url).read()
    soup = BeautifulSoup(raw.decode('utf-8'))

    block = soup.find("div", "u4asn-1")
    if block is None:
        raise FetchParticipantException(url)

    # Name / style / description live in h3 / span / p of the desc div;
    # any of them may be absent, so collect only the ones present.
    desc = block.find("div", "u4a-desc")
    texts = [tag.text for tag in (desc.h3, desc.span, desc.p) if tag is not None]

    vote_count = int(soup.find("div", "u4asn-2").find("div", "total").text)
    soup.close()

    return Participant(name=texts[0],
                       link=url,
                       style=texts[1],
                       description=texts[2] if len(texts) > 2 else u"",
                       votes=vote_count)
def getMainPageScoreAndUrl(url):
    """
    Scrape the listing page at url.

    Returns a list of (score, href) tuples, one per playable entry that
    carries a rating; unrated entries are skipped.
    """
    #print url
    req = urllib2.Request( url = url, headers = headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)
    #print soup.originalEncoding
    #print soup
    datas=[]
    for tag in soup.findAll('span', {'class':'play-icon'}):
        # The play icon sits inside the <a> carrying the relative link.
        ptag=tag.parent
        href='http://www.senanb.com/'+ptag['href']
        # Climb one level up to find the rating within the same row.
        ptag=ptag.parent
        ptag=ptag.find('strong', {'class':'ratbar-num'})
        if ptag is None:  # idiomatic identity test (was "== None")
            continue
        score=float(ptag.string)
        #score=ptag.contents[0]
        datas.append((score,href))
    soup.close()
    return datas
def getMainPageScoreAndUrl(url):
    """
    Parse the listing page at url and collect (score, absolute_url) pairs
    for every rated entry; entries without a rating are skipped.
    """
    #print url
    req = urllib2.Request(url=url, headers=headers)
    page = urllib2.urlopen(req).read()
    soup = BeautifulSoup(page)
    #print soup.originalEncoding
    #print soup
    datas = []
    for tag in soup.findAll('span', {'class': 'play-icon'}):
        # Parent <a> holds the relative link to the item page.
        ptag = tag.parent
        href = 'http://www.senanb.com/' + ptag['href']
        # The rating lives in a sibling subtree one level up.
        ptag = ptag.parent
        ptag = ptag.find('strong', {'class': 'ratbar-num'})
        if ptag is None:  # identity test replaces "== None"
            continue
        score = float(ptag.string)
        #score=ptag.contents[0]
        datas.append((score, href))
    soup.close()
    return datas
def am_I_logged_in_is_form_here(self, page1, page2):
    """
    Return False when page2 still contains a password-type <input>
    (i.e. a login form is present), True otherwise.

    NOTE(review): page1 was parsed but its result (flag1) was never used
    in the original; the scan is kept for parity but does not affect the
    return value -- confirm whether page1 should participate.
    """
    soup1 = BeautifulSoup(str(page1))
    soup2 = BeautifulSoup(str(page2))
    flag1 = True
    flag2 = True
    for name in self.Form_tags_password_type:
        if soup1.find("input", attrs={"type": re.compile(name)}):
            flag1 = False
            break  # fixed: break was missing; result is unused so no behavior change
    soup1.close()
    for name in self.Form_tags_password_type:
        if soup2.find("input", attrs={"type": re.compile(name)}):
            flag2 = False
            break
    soup2.close()
    # Simplified from "if not flag2: return False else: return True".
    return flag2
def am_I_logged_in_is_form_here(self, page1, page2):
    """
    Decide whether we appear logged in on page2: returns False if any
    password-type <input> is still present there, True otherwise.

    NOTE(review): the page1 scan (flag1) never influenced the result in
    the original code; it is retained but dead -- verify intent.
    """
    soup1 = BeautifulSoup(str(page1))
    soup2 = BeautifulSoup(str(page2))
    flag1 = True
    flag2 = True
    for name in self.Form_tags_password_type:
        res_p = soup1.find("input", attrs={"type": re.compile(name)})
        if res_p:
            flag1 = False
            break  # fixed asymmetry with the loop below; flag1 is unused anyway
    soup1.close()
    for name in self.Form_tags_password_type:
        res_p = soup2.find("input", attrs={"type": re.compile(name)})
        if res_p:
            flag2 = False
            break
    soup2.close()
    # Collapsed the redundant if/else; the answer is just flag2.
    return flag2
def get_html_files_ref(self):
    """
    Return the archive paths of all (x)html content files listed in the
    epub's OPF manifest, rooted at the OPF file's directory.
    """
    htmlfiles = []
    with ZipFile(self._filepath, 'r') as f:
        # container.xml points at the OPF package file.
        foo = f.open('META-INF/container.xml')
        soup = BeautifulSoup(foo)
        foo.close()
        contentfile = dict(soup.find('rootfile').attrs)['full-path']
        soup.close()
        # Strip "<name>.opf" to get the manifest's base directory.
        # (Fixed: the old r'[^/]*(.opf)' left the dot unescaped, matching
        # any character before "opf".)
        root = re.sub(r'[^/]*(\.opf)', '', contentfile)
        foo = f.open(contentfile)
        soup = BeautifulSoup(foo)
        for item in soup.findAll('item'):
            itemdict = dict(item.attrs)
            if itemdict['href'].endswith('html'):
                htmlfiles.append(root + itemdict['href'])
        foo.close()
        soup.close()
        # Removed redundant f.close(): the with-statement closes the archive.
    return htmlfiles
# Kick off the background download scheduler thread.
downloadScheduler = DownloadScheduler()
downloadScheduler.start()
# The triple-quoted block below is commented-out code: the original
# category-crawling driver, disabled in favour of the wait loop that follows.
"""
mainUrl = 'http://mabilo.com/ringtones.htm'
urlHeader = 'http://mabilo.com'
timeout = 40
socket.setdefaulttimeout(timeout)
# get category_list
u = urllib.urlopen(mainUrl)
category_list = []
soup = BeautifulSoup(u.read())
u.close()
for elem in soup.findAll('ul')[-1]:
    elem = str(elem)
    start = elem.find('"',0)
    stop = elem.find('"',start+1)
    category_list.append(elem[start+1:stop])
soup.close()
# process category_list
for elem in category_list[15:16]:
    solveEachCategory(urlHeader+elem)
"""
# Poll until every queued download has been consumed.
while not downloadQueue.empty():
    time.sleep(20)
# NOTE(review): the trailing triple-quote below opens another commented-out
# block that appears truncated in this chunk -- confirm it is closed later
# in the file.
"""
def solveEachCategory(origurl):
    # Crawl every pagination page of one category (origurl, then
    # "<base>-2-tr.htm", "-3-tr.htm", ... up to 1500 pages), spawning a
    # RingThread per ring row found. Returns 0 when the site redirects
    # (last page reached); otherwise runs the page cap out.
    global recordIndex, urlHeader
    cnt = 1
    while True:
        # Build the URL for this pagination step.
        if cnt == 1:
            url = origurl
        else :
            url = origurl[0:origurl.index('.htm')]+'-'+str(cnt)+'-tr.htm'
        cnt = cnt + 1
        if cnt > 1501:
            break
        # Up to three attempts to open the page and read its body.
        numTry = 3
        while numTry > 0:
            try:
                u = urllib.urlopen(url)
            except:
                numTry = numTry - 1
                print 'category '+url+' open err '+str(3-numTry)+ ' try'
                #time.sleep(1)
            else:
                try:
                    data = u.read()
                except:
                    u.close()
                    numTry = numTry - 1
                    print 'category '+url+' get data err '+str(3-numTry)+' try'
                    #time.sleep(1)
                else :
                    break;
        if numTry <= 0:
            # Page unreachable: skip it but keep record numbering roughly aligned.
            recordIndex += 10 # assume there are 10 rings per page
            continue
        if u.url != url: # judge if reach the last page
            # A redirect means we paged past the end of the category.
            u.close()
            return 0
        soup = BeautifulSoup(data)
        u.close()
        for divElem in soup.findAll('div',attrs={'class':'row2'}): # get TAG with attribute
            # process each ring
            print 'processing '+str(recordIndex)
            # process a ring
            ringurl = divElem.find('a')['href'];
            if ringurl == None:
                # NOTE(review): "self" is undefined in this free function --
                # this branch would raise NameError if ever taken; confirm
                # and replace with the module-level error logger.
                self.errorLog('ring')
                continue
            ringurl = urlHeader+ringurl
            ringThread = RingThread(ringurl, recordIndex)
            ringThread.start()
            queue.put(1)
            recordIndex = recordIndex + 1
            # for test
            #if recordIndex > 1080:
            #    return;
        soup.close()
def run(self):
    # Scrape one ring's detail page (self.url): extract title, image, info
    # attributes, star rating and the ring file, spawn DownloadThreads for
    # the image and the audio file, and store the assembled XML-ish record.
    # Returns 0 on success, -1 on any fetch/parse failure.
    #
    # Up to three attempts to open the page and read its body.
    numTry = 3
    while numTry > 0:
        try:
            #print self.url
            u = urllib.urlopen(self.url)
        except:
            numTry = numTry - 1
            self.logError('url can not open '+str(3-numTry)+ ' try')
            #time.sleep(1)
        else:
            try:
                data = u.read()
            except:
                numTry = numTry - 1
                self.logError('url data not get '+str(3-numTry)+ ' try')
                u.close()
                #time.sleep(1)
            else :
                break;
    if numTry <= 0:
        self.finish()
        return -1
    # ring url open success
    soup = BeautifulSoup(data)
    u.close()
    record = ['<Record>']
    # attri Title
    # Title is sliced out of the <h4> text between "Free " and " Ringtone".
    temp = soup.find('h4')
    if temp !=None:
        temp = str(temp.next)
        record.append('<Title>'+temp[temp.index('Free')+5:temp.index('Ringtone')-1]+'</Title>')
    else:
        self.logError('title not found')
        self.finish()
        return -1
    # attri Image
    temp = soup.find('div', attrs={'class':'image'})
    if temp != None:
        imageurl = str(temp.next['src'])
        # Last path component of the image URL is used as the local filename.
        splitPath = imageurl.split('/')
        imagefile = splitPath.pop().strip()
        record.append('<Image>'+imagefile+'</Image>')
        # download image
        #print 'imageurl '+ imageurl
        imageThread = DownloadThread(imageurl, imagefile, self.index)
        imageThread.start()
        downloadQueue.put(1)
        """imageThread = DownloadThread(imageurl, imagefile)
        imageThread.start()"""
        """if imageThread.getResult() < 0:
            self.logError('image download error')
            return -1"""
    else:
        self.logError('image not found')
        self.finish()
        return -1
    # attrs including Artist,Downloads,Size etc
    for infoElem in soup.findAll('div',attrs={'class':'info'}):
        for specInfoElem in infoElem.findAll('span', attrs={'class':'grey'}):
            # Label text like "Artist:" -- strip the trailing colon.
            item = specInfoElem.next
            itemname = str(item).strip()[:-1]
            if itemname=='Artist' or itemname=='Category':
                record.append('<'+itemname+'>'+str(item.nextSibling.next).strip()+'</'+itemname+'>')
            elif itemname=='Date Added':
                record.append('<Date>'+str(specInfoElem.nextSibling).strip()+'</Date>')
            else:
                record.append('<'+itemname+'>'+str(specInfoElem.nextSibling).strip()+'</'+itemname+'>')
    # start-rating
    # NOTE(review): no None check here -- a page without li#rsli would raise
    # TypeError on the subscript; confirm all ring pages carry the rating bar.
    infoElem = soup.find('li',attrs={'id':'rsli'})
    itemname = str(infoElem['style'])
    # Rating percentage is embedded in the style attribute ("...:NN%...").
    record.append('<Mark>'+itemname[itemname.index(':')+1:itemname.index('%')]+'</Mark>')
    # attri Ring
    divElem = soup.find('div',attrs={'class':'det2'})
    if divElem == None:
        self.logError('ring not found')
        self.finish()
        return -1
    ringurl = str(divElem.find('a')['href'])
    if ringurl == None:
        self.logError('ring not found')
        self.finish()
        return -1
    # Locate the audio extension; find() returning -1 makes pos == 3.
    pos = ringurl.find('.mp3')+4
    if pos == 3:
        pos = ringurl.find('.wav')+4
        if pos == 3:
            self.logError('not mp3 or wav format')
            self.finish()
            return -1
    ringfile = ringurl[ringurl.index('file=')+5:pos].strip()
    record.append('<Ring>'+ringfile+'</Ring>')
    ringurl = 'http://music.mabilo.com/dl'+ringurl[ringurl.index('.php'):pos]
    #download ring
    #print 'ringurl '+ringurl
    downloadThread = DownloadThread(ringurl, ringfile, self.index)
    downloadThread.start()
    downloadQueue.put(1)
    """ringThread = DownloadThread(ringurl, ringfile)
    ringThread.start()"""
    """if ringThread.getResult()< 0:
        self.logError('ring download error')
        return -1"""
    soup.close()
    record.append('</Record>')
    self.storeRecord(record)
    self.finish()
    return 0