def write(self, s):
    self.set_current_page(0)
    if "\(" in s or "\[" in s or "$$" in s:
        f = open("/home/fuzzy/Projects/helloworld/html/template.html")
        bs = bsoup(f.read(), XML_PARSER)
        f.close()
        bs("div")[0].append(bsoup(s, XML_PARSER))
        f = open("/home/fuzzy/Projects/helloworld/temp/output.html", 'w')
        f.write(str(bs))
        f.close()
        self.current_view().open("http://localhost/helloworld/temp/output.html")
    else:
        self.current_view().get_dom_document().get_element_by_id("body").set_inner_html(s)
def login():
    global m_session
    # NOTE: the credential prompts and the login request are redacted ("******") in the
    # source; only the response check and the status messages survive.
    username = raw_input("Mixi Username: "******"Mixi Password: "******"check.pl" in res.text:
        print "Login Failed. Abort."
        sys.exit(1)
    print "Login Complete"
def fetch_saved_stories(uname, session, max_page=-1):
    all_stories = {}
    blank_url = 'https://news.ycombinator.com/saved?id={0:s}&p={1:d}'
    page = 1
    while True:
        url = blank_url.format(uname, page)
        response = session.get(url)
        print("fetching from page {0:d}: {1:s}.".format(page, url))
        assert response.status_code == 200, \
            "Unexpected status code: {0:d}".format(response.status_code)
        print("success.".format(page))
        soup = bsoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
        stories = soup.find_all("td", class_="title")
        has_next_page = has_more(stories)
        if has_next_page:
            stories.pop()
        ranks = stories[0:-1:2]
        entries = stories[1:-1:2]
        entries.append(stories[-1])
        for idx, entry in enumerate(entries):
            rank = get_rank(str(ranks[idx]))
            title = get_title(str(entry))
            link = get_link(str(entry))
            all_stories[rank] = [title, link]
        if max_page > -1 and page == max_page:
            print("finish fetching {0:d} pages.".format(page))
            break
        if not has_next_page:
            print("finish fetching {0:d} pages.".format(page))
            break
        page = page + 1
        time.sleep(5)
    session.close()
    return all_stories
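# A minimal usage sketch for fetch_saved_stories, not part of the original source.
# Assumptions: the module already imports requests, the username is a placeholder,
# and HN only serves /saved?id=<user> to that logged-in account, so the session
# below would first need valid login cookies.
def _example_fetch_saved_stories():
    import requests
    with requests.Session() as session:
        saved = fetch_saved_stories('some_user', session, max_page=1)
        for rank in sorted(saved):
            title, link = saved[rank]
            print('{0}: {1} ({2})'.format(rank, title, link))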
def cars_call(zip, page_num):
    ## this is the original page
    # page = requests.get('https://www.cars.com/for-sale/searchresults.action/?page='+str(page_num)+'&perPage=100&rd=20&searchSource=PAGINATION&sort=relevance&zc='+str(zip))
    # This has all the cars sorted by their distance from a given zip code
    page = requests.get(
        'https://www.cars.com/for-sale/searchresults.action/?page=' + str(page_num) +
        '&perPage=100&rd=99999&searchSource=GN_BREADCRUMB&sort=distance-nearest&zc=' + str(zip))
    # convert to a beautiful soup object:
    soup = bsoup(page.content, features="lxml")
    # show contents
    soup.prettify()
    # auto tempest isn't working.. did they somehow make everything private? D:
    # start scraping with find and find_all
    body = soup.find_all('script')
    # body = soup.find('class')
    # body = soup.find('div')
    # yes or no answer if it is certified pre-owned
    # spec = body.find_all('span')
    return str(body)
def relstock():
    r = requests.get(
        "https://in.finance.yahoo.com/quote/RELIANCE.NS?p=RELIANCE.NS")
    soup = bsoup(r.text, "lxml")
    content = soup.find(
        'div', class_="My(6px) Pos(r) smartphone_Mt(6px)").find('span').text
    return content
def get_response(url, site_tag):
    aggr_urls = []
    agent = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    site_req_url = requests.get(url, headers=agent)
    site_req_html = bsoup(site_req_url.content, 'html.parser')
    if site_tag == "N7P":
        links = site_req_html.find_all('a', href=True, text=True, rel=True)
        for lk in links:
            for i in lk['rel']:
                if i == 'bookmark':
                    aggr_urls.append(lk.string)
                    aggr_urls.append(lk['href'])
    if site_tag == "W10":
        for i in site_req_html.select('article[class*="mspuloop"]'):
            lk = i.find_all('a', href=True, title=True)[0]
            aggr_urls.append(lk['title'])
            aggr_urls.append(lk['href'])
    aggr_urls.reverse()
    return (aggr_urls)
def get_links(self):
    """Assigns all vid links to self.links attribute and returns list of links"""
    r = requests.get("%s?page=last" % (self.base)).text
    soup = bsoup(r)
    lks = soup.findAll('a')
    self.links = lks
    return self.links
def getCaptcha(URL, ref):
    # print "\tcaptcha@{%s}" % (URL)
    path = re.split("^http://(?:[\w\-_0-9].*?)\.(?:[\w\-_0-9].*?)\.(?:[\w\-_0-9]+)", URL)[1]
    # ''.join(re.split('^/sorry/image\?id=', path)).rstrip('&hl=en')
    image_id = 1
    host = URL[:len(URL) - len(path)]
    # print "[%s][%s]" % (host, path)
    # print host.lstrip('http://')
    conn = httplib.HTTPConnection(ref)
    conn.putrequest('GET', path)
    conn.putheader('Referer', ref)
    conn.putheader('Accept', 'text/html')
    conn.putheader('User-agent', 'Internet Explorer 6.0')
    conn.endheaders()
    resp = conn.getresponse()
    html = resp.read()
    # print html
    # strip image files
    soup = bsoup(html)
    # form = soup.find('form', {'method': 'get'})
    # print "---------------"
    # print form
    # print "---------------"
    imgs = soup.find('img')
    # print "---------------"
    print imgs
    # print "---------------"
    return
def get_indicators(id_=None):
    tIndicator_dict = dict()
    r = requests.get('{}/{}'.format(BASE_URL, 'indicator'))
    soup = bsoup(r.text, 'lxml')
    topics = soup.findAll('h3', id=re.compile('topic-*'))
    main_div = soup.find('div', {'class': 'content-wrapper clear-block'})
    indicators = main_div.findAll('table', {'class': 'views-view-grid grid-2'})
    ## print(len(topics), len(indicators))
    for i in range(len(indicators)):
        indList = []
        tIndicators = indicators[i].findAll('a')
        for ind in tIndicators:
            indList.append(ind.text)
        tIndicator_dict[topics[i].text] = indList
    if type(id_) == type(1):
        try:
            tid_list = list(tIndicator_dict.items())
            return {tid_list[id_ - 1][0]: tid_list[id_ - 1][1]}
        except:
            return None
    if type(id_) == type('string'):
        try:
            return tIndicator_dict[id_]
        except:
            return None
    return tIndicator_dict
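# A usage sketch for get_indicators, not part of the original source. Assumptions:
# BASE_URL points at the indicator index page the function scrapes, and 'Education'
# is a hypothetical topic title used only for illustration.
def _example_get_indicators():
    everything = get_indicators()          # {topic title: [indicator names]}
    first_topic = get_indicators(1)        # 1-based positional lookup, or None
    by_name = get_indicators('Education')  # lookup by exact topic title, or None
    return everything, first_topic, by_name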
def parse_data(page):
    soup = bsoup(page, 'html.parser')
    relationdict = {}
    # This following code will give us the Synonyms, Antonyms and Related
    # words.
    wndata = soup.find('section', attrs={'data-src': 'wn'})
    postype = ''
    for row in wndata('tr'):
        for i, col in enumerate(row('td')):
            data = col(text=True)
            if i == 0:
                temptype = data
            if i == 1 and data == [u'1.']:
                postype = temptype[0]
                if postype not in relationdict:
                    relationdict[postype] = {}
                    relationdict[postype]['Syn'] = []
                    relationdict[postype]['Ant'] = []
                    relationdict[postype]['Rel'] = []
            relationdict[postype]['Syn'] += \
                _extract_rel_words(col('div', attrs={'class': 'Syn'}))
            relationdict[postype]['Ant'] += \
                _extract_rel_words(col('div', attrs={'class': 'Ant'}))
            relationdict[postype]['Rel'] += \
                _extract_rel_words(col('div', attrs={'class': 'Rel'}))
    wndata = soup.find('section', attrs={'data-src': 'hm'})
    _extract_pseg_lemma(wndata('div', attrs={'class': 'pseg'}))
    _extract_pseg_lemma(wndata('div', attrs={'class': 'runseg'}))
    return relationdict
def readXmlData(self, fnXML, isCleanEmptyGroups=True, isDebug=False):
    with open(fnXML, 'r') as f:
        # (1) read data
        dataXML = bsoup(f.read(), 'lxml-xml')
        if isDebug:
            print(dataXML.prettify())
        dictGroup = {}
        # (2) parse groups
        for ii in dataXML.findAll('Group'):
            tmp = dict(ii.attrs)
            dictGroup[tmp['Name']] = []
        if isDebug:
            print dictGroup
        # (3) iterate coords
        for i, ii in enumerate(dataXML.findAll('Annotation')):
            tmp = dict(ii.attrs)
            tIdGroup = tmp['PartOfGroup']
            lstCoords = []
            for j, jj in enumerate(ii.findAll('Coordinate')):
                tx = np.float(jj.attrs['X'])
                ty = np.float(jj.attrs['Y'])
                tp = np.int(jj.attrs['Order'])
                lstCoords.append((tp, tx, ty))
                if isDebug and ((j % 2000) == 0):
                    print tIdGroup, '(', i, ')[', j, '] : {', tp, tx, ty, '}'
            arrCoords = np.array(lstCoords)
            sidx = np.argsort(arrCoords[:, 0])
            arrCoords = arrCoords[sidx, 1:]
            arrCoords = np.append(arrCoords, [arrCoords[0]], axis=0)
            dictGroup[tIdGroup].append(arrCoords)
        # (4) Clean empty Groups:
        if isCleanEmptyGroups:
            tlstKeyEmpty = [kk for kk, vv in dictGroup.items() if len(vv) < 1]
            if isDebug:
                print 'Empty keys: ', tlstKeyEmpty
            for kk in tlstKeyEmpty:
                del dictGroup[kk]
        # (5) Create plain MplPaths:
        self.lstGroupId = []
        self.lstPaths = []
        for kk, gg in enumerate(dictGroup.values()):
            for ii in gg:
                self.lstGroupId.append(kk)
                self.lstPaths.append(mplPath.Path(ii))
            print kk, ' : ', len(gg)
        #
        self.lstSubPaths = []
        for ii in self.lstPaths:
            tmp = []
            for j0, jj in enumerate(self.lstPaths):
                if ii != jj:
                    if ii.contains_path(jj):
                        tmp.append(j0)
            self.lstSubPaths.append(tmp)
        #
        print '------------------'
        for i, ii in enumerate(self.lstSubPaths):
            print i, ' : ', len(ii), ' : ', dictGroup.keys()[self.lstGroupId[i]]
        #
        self.dataXML = dictGroup
def _handle_doc(self, html_doc):
    soup = bsoup(html_doc, "lxml")
    for ele in soup.find(id="plist").ul.children:
        eleClass = ele['class'][0]
        # JD product item
        if eleClass == "gl-item":
            aDom = ele.a
            reobj = re.compile(r'href="(.*?)".*ata-lazy-img="(.*?)"', re.IGNORECASE)
            result = reobj.findall(str(aDom))
            # product detail page URL
            skuUrl = result[0][0]
            # product image
            skuPic = result[0][1]
            if skuUrl is not None:
                reid = re.compile(r'/(\d+).html', re.IGNORECASE)
                result = reid.findall(str(skuUrl))
                # print result
                skuId = result[0]
            else:
                skuId = ''
            # product price
            skuPrice = self.get_price(skuId)
            skuName = ele.find_all("div", class_="p-name")
            skuName = skuName[0]
            reName = re.compile(r'<em>(.*?)</em>(.*)')
            nameResult = reName.findall(str(skuName))
            skuName = nameResult[0][0]
            self._insert_database(skuId, skuUrl, skuName, skuPrice, skuPic)
def parse_tdt_by_topic(src_dir, doc_type, limit=0, lang=None):
    """
    Iterate over the complete list of topics from the given file and parse as a
    dictionary of Pythia clusters, mapped to an array of relevant docs.
    """
    logger.info("parse_tdt_topics(%s, %s)", src_dir, doc_type)
    topic_file = os.path.join(src_dir, REL_TOPIC_DIR, TOPIC_REL_FILE)
    clusters = dict()
    count = 0
    with open(topic_file) as fin:
        for line in fin:
            count += 1
            if (limit > 0) and (count >= limit):
                logger.info("Limit of %s documents reached.", limit)
                break
            ontopic = bsoup(line, "lxml").ontopic
            logger.debug(ontopic)
            if ontopic is not None:
                tdt_level = ontopic["level"]
                # Not considering stories with only BRIEF topic references
                if "BRIEF" == tdt_level:
                    continue
                post_id = ontopic["docno"]
                tdt_topicid = ontopic["topicid"]
                tdt_fileid = ontopic["fileid"]
                doc_date = tdt_fileid.split("_")[0]
                doc_src = "_".join(tdt_fileid.split("_")[-2:])
                # If a language was specified, limit to sources in the given language
                if lang is not None:
                    if "ENG" == lang and (doc_src not in ENGLISH_SRCS):
                        logger.debug("Skipping non-English source document.")
                        continue
                    if "MAN" == lang and (doc_src not in MANDARIN_SRCS):
                        logger.debug("Skipping non-Mandarin source document.")
                        continue
                cluster_id = "{topic}_{date}".format(topic=tdt_topicid, date=doc_date)
                cluster = clusters.get(cluster_id, dict())
                post = cluster.get(post_id, dict({"post_id": post_id}))
                post["tdt_level"] = tdt_level
                post["novelty"] = False
                if len(cluster) == 0:
                    post["novelty"] = True
                # if 'BRIEF' == tdt_level:
                #     post['novelty'] = False
                # FIXME - determine a realistic way to assign novelty
                # post['novelty'] = 'TBD'
                post["cluster_id"] = cluster_id
                post["tdt_topicid"] = tdt_topicid
                # TODO: look at alternatives for setting order, e.g. timestamp
                post["order"] = len(cluster)
                post["tdt_fileid"] = tdt_fileid
                post["tdt_src"] = doc_src
                # TODO - get text from source file and add as 'body_text'
                post["body_text"] = extract_doc_text(src_dir, tdt_fileid, post_id)
                cluster[post_id] = post
                clusters[cluster_id] = cluster
                # logger.debug("Doc:")
                # logger.debug(cluster[post_id])
    return clusters
def parse_espn(name):
    url = "http://espn.go.com/nba/team/schedule/_/name/" + name
    r = rq.get(url)
    soup = bsoup(r.content, 'lxml')
    ## Find all the rows that have classes. Remove the first one -- it's irrelevant.
    trs = soup.find_all("tr", class_=True)[1:]
    ## Main procedure.
    with open("nba_2016_csv/" + name + "_2016_schedule.csv", "wb") as ofile:
        f = csv.writer(ofile)
        ## Write the headers.
        f.writerow(["team", "date", "a/h", "opponent", "w/l", "s", "os"])
        for tr in trs:
            team = name
            tds = tr.find_all("td")
            date = tds[0].get_text().encode("utf-8")
            opp = tds[1].find_all("li", {"class": "team-name"})
            for teams in opp:
                other_team = teams.get_text()
            opponent = tds[1].find_all('li', {'class': 'game-status'})
            for o in opponent:
                h_a = o.get_text()
            try:
                win_loss = tds[2].find_all(
                    'li', {'class': 'game-status win'}) or tds[2].find_all(
                        'li', {'class': 'game-status loss'})
                for a in win_loss:
                    w_l = a.get_text()
                score = tds[2].find_all('a')
                for s in score:
                    if w_l == 'W':
                        gs = s.get_text().split("-")[0]
                        gs = int(gs)
                        ogs = s.get_text().split("-")[1].split(" ")[0]
                        ogs = int(ogs)
                    elif w_l == 'L':
                        gs = s.get_text().split("-")[1].split(" ")[0]
                        gs = int(gs)
                        ogs = s.get_text().split("-")[0]
                        ogs = int(ogs)
            except:
                h_a = ''
                w_l = None
                other_team = ''
                gs = None
                ogs = None
            ## write the result to the CSV file.
            finally:
                f.writerow([team, date, h_a, other_team, w_l, gs, ogs])
                g = dict(date=date, team=team, h_a=h_a, other_team=other_team,
                         w_l=w_l, s=gs, os=ogs)
                connect().save(g)
def get_euribor(dias, valores):
    # Current date
    now = datetime.datetime.now()
    anyo = int(now.strftime("%Y"))
    # Set the final year to 2006, which is as far back as the price data goes
    anyo_final = 2006
    mes = int(now.strftime("%m"))
    controlmesactual = True
    # For each year
    for i_anyo in range(anyo, anyo_final - 1, -1):
        # For each month
        for i_mes in range(mes, 0, -1):
            if controlmesactual:
                url = "https://www.idealista.com/news/euribor/mensual/mes-actual/"
                controlmesactual = False
            else:
                url = "https://www.idealista.com/news/euribor/mensual/%s-%d/" % (
                    calendar.month_name[int(i_mes)], i_anyo)
            # Pick a user agent at random
            userAgent = random.choice(userAgents)
            # Load the default headers
            headers = requests.utils.default_headers()
            # Update the headers with the random User-Agent
            headers.update({'User-Agent': userAgent})
            # Spacing between requests (2 or 3 seconds)
            sleep_secs = random.randrange(2, 4)
            time.sleep(sleep_secs)
            # Download the page of interest
            html = requests.get(url, headers=headers)
            soup = bsoup(html.content)
            contador = 0
            for dato in soup.body.tbody.find_all('td'):
                contador = contador + 1
                if contador % 2 == 1:
                    # date
                    fecha = "%d%s%s" % (i_anyo, '{:02d}'.format(i_mes),
                                        '{:02d}'.format(int(dato.string)))
                    # Append the date to the list
                    dias.append(fecha)
                    print(fecha)
                else:
                    # euribor rate
                    euribor = dato.string[:-1].replace(",", ".")
                    # Append the euribor value to the list
                    valores.append(float(euribor))
                    print(float(euribor))
        mes = 12
    return euribor
def filtersoup(html):
    data = []
    page = bsoup(html)
    # collect link targets and form actions
    for link in page.findAll('a'):
        data.append(link.get('href'))
    for link in page.findAll('form'):
        data.append(link.get('action'))
    return data
def num_acct_pages(self):
    """This should return an integer that is the number of pages of vids in the account"""
    try:
        sp = bsoup(requests.get(self.base).text)
        num = sp.find('a', href='?page=last').text
        return int(num)
    except:
        return 2  # if there's only one page, there's no 'last' -- should only go through 1 (range(1,1))
def scrap_preceeding(base_url):
    homepage_html_content = web.download(base_url)
    homepage_soup = bsoup(homepage_html_content)
    ul_content = homepage_soup.find_all('ul')
    a_content = bsoup(str(ul_content)).find_all('a')
    volume_page_links = []
    for raw_link in a_content:
        volume_page_links.append(join(base_url, raw_link.get('href')) + '/')
    os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')
    for base_link in volume_page_links[32:]:
        folder_name = base_link.split('/')[-2]
        address = os.path.join(os.getcwd(), folder_name)
        if not os.path.exists(address):
            os.mkdir(folder_name)
        else:
            index = 1
            while os.path.exists(address):
                folder_name = base_link.split('/')[-2] + '-' + str(index)
                print folder_name
                address = os.path.join(os.getcwd(), folder_name)
                index += 1
            os.mkdir(folder_name)
        os.chdir(address)
        print '--------------'
        print 'downloading from ' + base_link
        volume_content_soup = bsoup(web.download(base_link)).find_all('div', {'id': 'content'})
        a_content = bsoup(str(volume_content_soup)).find_all('a')
        # print a_content
        pdf_links = [join(base_link, link.get('href')) for link in a_content
                     if str(link.get('href')).endswith('pdf')]
        for download_link in pdf_links:
            if not download_link.endswith('supp.pdf'):
                try:
                    content = web.download(download_link)
                except:
                    print 'link : %s is obsolete' % download_link
                    continue
                f = open(download_link.split('/')[-1], 'wb')
                f.write(content)
                f.close()
        os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')
def getStockPrice(self):
    '''Parses the webpage for the stock price'''
    return bsoup(self.getPage(), "html.parser").find(
        'div', {'class': 'My(6px) Pos(r) smartphone_Mt(6px)'}).find('span').get_text()
def exploit(url, payload):
    res = requests.post(url, data=payload, timeout=10)
    if res.status_code == 200:
        soup = bsoup(res.text, 'html.parser')
        print soup.p.string
        print "[+] Exploit Finished!"
    else:
        print "\n[!] Exploit Failed!"
def cook_categories(content):
    psoup = bsoup(content, 'lxml')
    urls = psoup.find('div', {'id': 'filterJob'}).find_all('li')
    # skip the "all" (全部) and "job fair" (招聘会) entries
    urls = {
        x.text: x.find('a')['href']
        for x in urls
        if '全部' not in x.text and '招聘会' not in x.text
    }
    return urls
def SCRAPE_URL(
        self,
        SOURCE_URL="",    # URL to scrape
        URL_PARAMS=None,  # Additional params to pass with GET request
        PARA_MINLEN=15    # Paragraph elements < PARA_MINLEN characters will be ignored
):
    '''[BRIEF] : Function requests the source of the specified URL and scrapes the
                 title, headers and paragraph elements of the HTML page
       [NOTE]  : All paragraphs < PARA_MINLEN characters in length are ignored
                 ( default = 15 )
    '''
    # [STEP 1]: RAW CONTENT PROCUREMENT
    rawContent = self.MAKE_REQUEST(SOURCE_URL, URL_PARAMS)
    # [STEP 2]: USING PARSER TO GET HEADERS AND PARAGRAPHS
    # FIXME: For now, bs4 is the only supported parser backend
    if (self.PARSER_BACKEND == "bs4"):
        soup = bsoup(rawContent.content, 'html.parser')
        # STORE HEADERS AT EACH LEVEL
        for iHeaderLvl in range(self.HEADER_MAXDEPTH):
            headerKey = 'h' + str(iHeaderLvl)
            for item in soup.find_all(headerKey):
                try:
                    item.get_text()
                    self.HEADERS[headerKey] = item.get_text()
                except:
                    print('Unable to extract text from bs4 HEADER object')
                    logging.error('Unable to extract text from bs4 HEADER object')
        # STORE PARAGRAPHS
        for iPara in soup.find_all('p'):
            if (iPara.get_text()):
                if (len(iPara.get_text().strip()) >= PARA_MINLEN):
                    self.PARAGRAPHS.append(iPara.get_text().strip())
        # STORE IMAGE LINKS
        if (self.PARSE_IMAGES):
            soup = bsoup(rawContent.text, 'lxml')
            images = soup.find_all('img')
            self.IMAGE_LINKS = [link.get('src') for link in images]
            self.IMAGE_LINKS = [
                valid_link for valid_link in self.IMAGE_LINKS if valid_link
            ]
    return
def filterSoup(html):
    data_ = []
    page = bsoup(html)
    for link in page.findAll('a'):
        data_.append(link.get('href'))
    for link in page.findAll('form'):
        data_.append(link.get('action'))
    return data_
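# A small sketch (not in the original source) showing what filterSoup collects:
# link hrefs followed by form actions. The HTML snippet is made up for illustration.
def _example_filterSoup():
    sample = '<a href="/page1">one</a><form action="/submit"></form>'
    return filterSoup(sample)  # -> ['/page1', '/submit']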
def translate(self, to, query):
    query = query.replace(" ", "+")
    url = "{}&sl=auto&tl={}&ie=UTF-8&prev=_m&q={}".format(self.host, to, query)
    r = requests.get(url)
    data = bsoup(r.content, 'lxml')
    result = data.findAll('div', {'class': 't0'})[0]
    return result.text
def recrawl_duplicate_page(self):
    # Re-crawl pages that were duplicated because of timeouts.
    asc_index = 0
    dup_key = 'icd_code'
    db = pymongo.MongoClient('mongodb://localhost:27017')['icd11']
    info = db['info']
    html = db['html']
    total_num = info.find().count()
    icd_code_dict = {}
    dup_idx_queue = queue.Queue(maxsize=0)
    for cursor in info.find({}, {dup_key: 1, 'foundation_id': 1}):
        if cursor[dup_key] == 'None':
            cursor[dup_key] += str(asc_index)
            asc_index += 1
        if cursor[dup_key] not in icd_code_dict.keys():
            icd_code_dict[cursor[dup_key]] = [cursor['foundation_id']]
        else:
            icd_code_dict[cursor[dup_key]].append(cursor['foundation_id'])
    for k in icd_code_dict:
        if len(icd_code_dict[k]) > 1:
            print('[INFO] %s with size %s, members: %s'
                  % (k, len(icd_code_dict[k]), ', '.join(icd_code_dict[k])))
            for e in icd_code_dict[k]:
                dup_idx_queue.put(e)
    self.browser = webdriver.Safari()
    self.browser.implicitly_wait(5)
    self.browser.set_page_load_timeout(40)
    part_cnt = 1
    while dup_idx_queue.qsize() > 0:
        self.total_cnt += 1
        curr_url_idx = dup_idx_queue.get()
        try:
            part_cnt += 1
            curr_url = self.url_page_format + curr_url_idx
            self.browser.get(curr_url)
            time.sleep(5)
            page = self.browser.page_source
            page_sup = bsoup(page, features='html.parser')
            page_div = page_sup.find_all('div', {'id': 'firstright'})
            if len(page_div) > 0:
                self.update_mongo_rawpage(curr_url_idx, str(page_div[0]), 0, 0)
            else:
                self.update_mongo_rawpage(curr_url_idx, page, 0, 0)
        except TimeoutException as time_exp:
            print('[ERROR] time out error with code : %s' % time_exp)
            dup_idx_queue.put(curr_url_idx)
        except Exception as e:
            dup_idx_queue.put(curr_url_idx)
            print('[ERROR] unexpected error occurs with code : %s' % e)
            traceback.print_exc()
    self.browser.close()
def generate_response(self) -> Tuple[Any, int]:
    """Generates a response for the user's query

    Returns:
        Tuple[Any, int]: A tuple in the format (response, # of elements).
            For example, in the case of a "feeling lucky" search, the response is
            a result URL, with no encrypted elements to account for. Otherwise,
            the response is a BeautifulSoup response body, with N encrypted
            elements to track before key regen.
    """
    mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent
    content_filter = Filter(self.session['fernet_keys'],
                            mobile=mobile,
                            config=self.config)
    full_query = gen_query(self.query,
                           self.request_params,
                           self.config,
                           content_filter.near)
    get_body = g.user_request.send(query=full_query)

    # Produce cleanable html soup from response
    html_soup = bsoup(content_filter.reskin(get_body.text), 'html.parser')
    html_soup.insert(
        0,
        bsoup(TOR_BANNER, 'html.parser')
        if g.user_request.tor_valid else bsoup('', 'html.parser'))

    if self.feeling_lucky:
        return get_first_link(html_soup), 0
    else:
        formatted_results = content_filter.clean(html_soup)

        # Append user config to all search links, if available
        param_str = ''.join(
            '&{}={}'.format(k, v)
            for k, v in self.request_params.to_dict(flat=True).items()
            if self.config.is_safe_key(k))
        for link in formatted_results.find_all('a', href=True):
            if 'search?' not in link['href'] or link['href'].index('search?') > 1:
                continue
            link['href'] += param_str

        return formatted_results, content_filter.elements
def VeriverBabus(self):
    if self.adresVer.text() == "":
        self.adresVer.setPlaceholderText("Meslek dalı yazmalısınız..")
    else:
        self.adres = self.adresVer.text()
        if len(self.adres.split()) > 1:
            self.adres = "-".join(self.adres.split())
        print("Aranan : ", self.adres)
        link = "https://www.kariyer.net/is-ilanlari/" + self.adres
        header = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        }
        ilan_listesi = []
        kaynak_isle = requests.get(link, headers=header).text
        kod = bsoup(kaynak_isle, "lxml")
        for a in kod.find_all("a"):
            try:
                a["href"]
            except KeyError:
                pass
            else:
                if a["href"].startswith("/is-ilani/"):
                    # self.gelenIlanlar.append("İş ilanı bulundu: " + "https://kariyer.net" + a["href"])
                    print("https://www.kariyer.net" + a["href"])
                    ilan_listesi.append("https://www.kariyer.net" + a["href"])
        if ilan_listesi:
            for link in ilan_listesi:
                self.gelenIlanlar.append(link)
                self.gelenIlanlar.append("")
                kod = bsoup(requests.get(link, headers=header).text, "lxml")
                for konu in kod.find_all("div", attrs={"class": "genel-nitelikler"}):
                    for i in konu.find_all("li"):
                        self.gelenIlanlar.append(i.text)
                    self.gelenIlanlar.append("")
                self.gelenIlanlar.append(
                    "*******************************************************************************************************************************************************************************************\n\n"
                )
                self.gelenIlanlar.repaint()
        else:
            self.gelenIlanlar.append("Bilgi Bulunamadı :(")
def get_appstatus(self):
    appstatus = {}
    for program in self.programs:
        r = self.s.get(program["Link"])
        appstatuspage = bsoup(r.content, "lxml")
        status = appstatuspage.find("a", {"name": "STATUS$0"}).get_text()
        appstatus[program["Program"]] = status
    return appstatus
def get_game_html(home, away, date):
    doc = requests.get(GAME_URL.format(
        year=date.year,
        month=str(date.month).zfill(2),
        day=str(date.day).zfill(2),
        away=away,
        home=home,
    ))
    return bsoup(doc.text)
async def __get_ids(self, url: str) -> list:
    r = self.__ses.get(url=url, allow_redirects=True)
    logg.debug(f'Requesting IDs page resulted in code: {r.status_code}')
    if r.status_code != 200:
        await Util().raise_error(
            r.status_code,
            str(r.reason) + ' - ' + str(bsoup(r.text, 'lxml').get_text()))
    soup = bsoup(r.content, 'lxml')
    for s in soup.findAll('script'):
        s.decompose()
    table = soup.find('table', attrs={'rules': 'rows'})
    ids = []
    for tr in table.findAll('tr')[1:]:
        ids.append(tr.td.get_text(strip=True))
    return ids
def get_song(index):
    song_list = []
    for x in index:
        url = get_url(x)
        html = get_html(url)
        soup = bsoup(html, 'html.parser')
        song = soup.select('.wrap_song_info')
        song_list += [x.text for x in song]
    return song_list
def _proc_request(self):
    try:
        req = requests.get(self.url, headers=self.header)
    except Exception as errmsg:
        print("Err: %s" % errmsg)
        return False
    else:
        soup = bsoup(req.text, 'html5lib')
        return soup
def parse_docs():
    for file in filenames:
        if not file.startswith('la'):
            continue
        print(file)
        try:
            soup = bsoup(open(os.path.join(data_root, file)), 'html.parser')
        except UnicodeDecodeError:
            soup = bsoup(open(os.path.join(data_root, file), encoding='ISO-8859-15'),
                         'html.parser')
        docs = soup.find_all('doc')
        for doc in docs:
            # op_dict = {
            #     "index": {
            #         "_index": "trec",
            #         "_type": TYPE,
            #         "_id": doc.docno.get_text().strip()
            #     }
            # }
            # data_dict = {
            #     "text": doc.find('text').get_text().strip()
            # }
            # data.append(op_dict)
            # data.append(data_dict)
            try:
                id = doc.docno.extract().get_text().strip()
                try:
                    doc.docid.extract()
                except AttributeError:
                    pass
                try:
                    doc.length.extract()
                except AttributeError:
                    pass
                yield {
                    "_index": index,
                    "_op_type": "index",
                    "_type": TYPE,
                    "_id": id,
                    "text": re.sub('[\s\n]+', ' ', doc.get_text().strip())
                }
            except AttributeError:
                continue
def crawl_page(self):
    self.browser = webdriver.Safari()
    self.browser.implicitly_wait(5)
    self.browser.set_page_load_timeout(40)
    self.init_html_url_queue()
    part_cnt = 1
    while self.url_queue.qsize() > 0:
        self.total_cnt += 1
        if part_cnt % 500 == 0:
            print('[INFO] refreshing browser to collapse tree...')
            self.browser.refresh()
            time.sleep(10)
        curr_url_ele = self.url_queue.get()
        curr_url_idx = curr_url_ele['idx']
        curr_url_bth = curr_url_ele['breath']
        curr_url_lef = curr_url_ele['leaf']
        if self.col.find({'idx': curr_url_idx}).count() != 0:
            print('[INFO] %sth\t%s already exists in mongodb' % (self.total_cnt, curr_url_idx))
            continue
        try:
            part_cnt += 1
            curr_url = self.url_page_format + curr_url_idx
            # print('[INFO] --- start crawling with url %s ------->' % curr_url)
            self.browser.get(curr_url)
            time.sleep(3)
            page = self.browser.page_source
            # curr_idx = self.crawled_idx_pattern.findall(page)[0]
            # if curr_idx in self.crawled_idx:
            #     print('[ERROR] %s occurs again, probably due to page frozen, sleep for a while' % curr_idx)
            #     self.url_queue.put(curr_url_ele)
            #     self.browser.refresh()
            #     time.sleep(10)
            #     continue
            # self.crawled_idx.add(curr_idx)
            page_sup = bsoup(page, features='html.parser')
            page_div = page_sup.find_all('div', {'id': 'firstright'})
            if len(page_div) > 0:
                self.save_mongo_rawpage(curr_url_idx, str(page_div[0]), curr_url_bth, curr_url_lef)
            else:
                self.save_mongo_rawpage(curr_url_idx, page, curr_url_bth, curr_url_lef)
        except TimeoutException as time_exp:
            print('[ERROR] time out error with code : %s' % time_exp)
            self.url_queue.put(curr_url_ele)
        except Exception as e:
            self.url_queue.put(curr_url_ele)
            print('[ERROR] unexpected error occurs with code : %s' % e)
            traceback.print_exc()
            # break
    self.browser.close()
async def topics(self):
    async with aiohttp.ClientSession() as ses:
        async with ses.get(self.base_url) as res:
            s = bsoup(await res.text(), 'lxml')
            return [{
                "name": a.find("span", class_='_1WMnM xLon9').text,
                "url": f"{self.base_url}{a.find('a', class_='qvEaq')['href']}"
            } for a in s.find_all('li', class_='_1hkdt')]
def clean(html_str):
    souped = bsoup(html_str.strip(), 'html.parser')
    temp = []
    for element in souped.find_all(True, recursive=False):
        temp.append(flatten(element))
    result = ' '.join(temp)
    result = whitespaces_matcher.sub(' ', result)
    return result
def scrape(path: str):
    if bool(re.match(r'((http|https):\/\/)?(www.)?maid.my.id/(.*)', path)):
        url = path
    elif not path.startswith('/'):
        url = f'{BASE}/{path}'
    else:
        url = BASE + path
    response = requests.get(url)
    soup = bsoup(response.text, 'lxml')
    return soup
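# Usage sketch for scrape, not part of the original source. Assumptions: BASE is the
# module-level site root the function falls back to, and 'some-page' is a hypothetical
# slug; an absolute URL and a relative path should resolve to the same page.
def _example_scrape():
    soup_abs = scrape('https://maid.my.id/some-page/')  # full URL passes the regex check
    soup_rel = scrape('/some-page/')                    # gets prefixed with BASE
    return soup_abs.title, soup_rel.title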
def get_topics():
    tList = []
    r = requests.get('{}/{}'.format(BASE_URL, 'topic'))
    soup = bsoup(r.text, 'lxml')
    nav_footer = soup.find(id='block-views-topics-block_2')
    table = nav_footer.find('table', {'class': 'views-view-grid grid-2'})
    topics = table.findAll('a')
    for i in topics:
        tList.append(i.text)
    return tList
def xml_parse(domain):
    try:
        sitemap_xml = bsoup(get_sitemap(domain), 'lxml')
        urls = sitemap_xml.find_all('loc')
        for url in urls:
            yield url.string
    except Exception:
        pass
    except KeyboardInterrupt:
        exit('Bye!')
def handle_readmore(text_body, text_body_attrs):
    body_html = bsoup(text_body)
    readmores = body_html.findAll('a', class_='read_more')
    try:
        for flag in readmores:
            user_host_name, pid = re.search(r'\://([^/]+)/post/(\d+)', flag['href']).groups()
            print user_host_name, pid
            try:
                post = get_post(user_host_name, pid)
            except:
                continue
            replacement = bsoup(post[text_body_attrs[post['type']]])
            replacement.find('html').unwrap()
            replacement.find('body').unwrap()
            flag.replace_with(replacement)
        return str(body_html)
    except:
        errors.exception('Readmore fail:\n%s', text_body)
        return text_body
def _read_text(self, path):
    try:
        page = bsoup(open(path, 'rb').read(), 'html.parser')
        text = [
            div.text for div in page.find('div', class_='main').find_all('div')[1:-2]
        ]
        return re.sub(' [0-9]+\\xa0', '', ''.join(text))
    except:
        pass
def parse(self):
    r = requests.get(self._proxy_url)
    soup = bsoup(r.content, u'html.parser')
    proxies_tag = soup.find(
        u'table', class_=u'htable proxylist').find(u'tbody').find_all(u'tr')
    for proxy_tag in proxies_tag:
        self.proxies_list.append(proxy_tag.find_all(u'td')[0].text.strip())
def soupify(self, response):
    """
    Uses BeautifulSoup4 to return the HTML content of the specified HTTP response,
    as obtained using the requests module. "response" is not a web address, but an
    already pinged response object generated from an address.
    """
    print('souping')
    return bsoup(response.content, 'html.parser')
def CheckSequel(self, url):
    previous_score = ''
    previous_pop = ''
    isSequel = 'False'
    search_url = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
        }).content
    new_html = bsoup(search_url, "lxml")
    old_start = GetText(self.html, 'Aired:')
    old_start = old_start.split(' to ')[0]
    new_start = GetText(new_html, 'Aired:')
    new_start = new_start.split(' to ')[0]
    try:
        if date.strptime(old_start, '%b %d, %Y') > date.strptime(new_start, '%b %d, %Y'):
            isSequel = 'True'
            # Get previous show popularity and rating
            initial = new_html.find('span', attrs={'itemprop': 'ratingValue'})
            previous_score = initial.get_text()
            previous_pop = GetText(new_html, 'Members:')
    except ValueError:
        try:
            if date.strptime(old_start, '%b %d, %Y') > date.strptime(new_start, '%Y'):
                isSequel = 'True'
                # Get previous show popularity and rating
                initial = new_html.find('span', attrs={'itemprop': 'ratingValue'})
                previous_score = initial.get_text()
                previous_pop = GetText(new_html, 'Members:')
        except ValueError:
            if date.strptime(old_start, '%Y') > date.strptime(new_start, '%b %d, %Y'):
                isSequel = 'True'
                # Get previous show popularity and rating
                initial = new_html.find('span', attrs={'itemprop': 'ratingValue'})
                previous_score = initial.get_text()
                previous_pop = GetText(new_html, 'Members:')
    return [isSequel, previous_score, previous_pop]
def _getPostmodernSentences(url='http://dev.null.org/postmodern/'):
    soup = bsoup(urllib2.urlopen(url, 'html5lib').read())
    sentences = []
    # read all <p> tags up to the first <hr>
    for tag in soup.body.children:
        if tag.name == 'p':
            sentences.extend([_parsePostmodern(s)
                              for s in splitSentences(tag.get_text())
                              if len(s) > 1])
        if tag.name == 'hr':
            break
    return sentences
def get_todays_games(date):
    games = []
    doc = bsoup(requests.get(GAMELINE_URL.format(
        year=date.year,
        month=str(date.month).zfill(2),
        day=str(date.day).zfill(2),
    )).text)
    divs = doc.find(id="nbaSSOuter").find_all("div", class_="nbaModTopScore")
    return [get_game(g) for g in divs]
def isFlood(res):
    soup = bsoup(res.text)
    sel = soup.select(".messageArea h3")
    if sel:
        sel = sel[0].decode_contents()
        # heading reads "content confirmation" and the body does not say "user ID does not exist"
        if sel == u'内容確認' and u'存在しないユーザIDです' not in res.text:
            return True
        else:
            return False
    else:
        return False
def add_row(self, local_pn, item):
    row = self.__outsoup.new_tag("tr")
    # Local PN
    col = self.__outsoup.new_tag("td")
    col.string = local_pn
    row.append(col)
    # Digikey PN
    col = self.__outsoup.new_tag("td")
    search_link = self.__outsoup.new_tag("a", href=item['search_url'], target="_blank")
    search_link.string = item['digikey_pn']
    col.append(search_link)
    row.append(col)
    # Pricing information
    col = self.__outsoup.new_tag("td")
    table = ("<table class=\"table table-hover\"border=\"1\"><tbody><tr><th>Price Break</th>"
             "<th>Unit Price</th></tr></tbody></table>")
    table = bsoup(table, 'html.parser')
    table.tbody.append(self.__row_for(item['pricing']['min']))
    table.tbody.append(self.__row_for(item['pricing']['max']))
    col.append(table)
    row.append(col)
    # Image
    col = self.__outsoup.new_tag("td")
    if (item.has_image_url()):
        img_title = self.__outsoup.new_tag("div")
        img_title.string = "NAME: " + self.__get_basename_from_url(item['image_url'])
        col.append(img_title)
        img_div = self.__outsoup.new_tag("div")
        img_link = self.__outsoup.new_tag("a", href=item['image_url'], target="_blank")
        img_tag = self.__outsoup.new_tag("img", border="0", width="100", src=item['image_url'])
        img_link.append(img_tag)
        img_div.append(img_link)
        col.append(img_div)
    else:
        col.string = "N/A"
    row.append(col)
    # Datasheets
    col = self.__outsoup.new_tag("td")
    if (item.has_datasheet_urls()):
        for url in item['datasheet_urls']:
            link = self.__outsoup.new_tag("a", href=url, target="_blank")
            link.string = self.__get_basename_from_url(url)
            col.append(link)
    else:
        col.string = "N/A"
    row.append(col)
    self.__outsoup.find(class_="table").append(row)
def pull_title_and_images(self):
    filter_query = Q(url_title=None) | Q(url_title='') | Q(url_image=None) | Q(url_image='')
    url_without_title = News.objects.filter(filter_query).values_list('url', flat=True)
    for url in url_without_title:
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'}
            response = requests.get(url, headers=headers)
        except requests.exceptions.ConnectionError as e:
            response = None
        if response:
            soup = bsoup(response.text, "html.parser")
            url_title = None
            if soup.title:
                url_title = soup.title.string
            url_description = None
            twitter_description = soup.find('meta', attrs={'property': 'twitter:description', 'content': True})
            og_description = soup.find('meta', attrs={'property': 'og:description', 'content': True})
            meta_description = soup.find('meta', attrs={'name': 'description', 'content': True})
            if twitter_description:
                url_description = twitter_description['content']
            elif og_description:
                url_description = og_description['content']
            elif meta_description:
                url_description = meta_description['content']
            url_image = None
            twitter_image = soup.find('meta', attrs={'property': 'twitter:image:src', 'content': True})
            og_image = soup.find('meta', attrs={'property': 'og:image', 'content': True})
            if twitter_image:
                url_image = twitter_image['content']
            elif og_image:
                url_image = og_image['content']
            else:
                images = soup.find_all('img')
                url_image = get_biggest_images(images)
            news_is_exist = News.objects.filter(url=url).exists()
            if news_is_exist:
                news = News.objects.get(url=url)
                if url_title and url_description:
                    if news.channel:
                        url_title = url_title.replace(' - %s' % news.channel.name, '')
                        url_title = url_title.replace(' | %s' % news.channel.name, '')
                    news.url_title = url_title
                    news.url_description = url_description
                    news.save()
                    News.objects.filter(url=url, url_image__isnull=True).update(url_image=url_image)
                else:
                    news.delete()
def isRestricted(res):
    soup = bsoup(res.text)
    sel = soup.select(".messageArea h3")
    if sel:
        sel = sel[0].decode_contents()
        # heading reads "access denied"
        if sel == u'アクセスできません':
            return True
        else:
            print "Restricted?: %s" % (sel,)
            return False
    else:
        return False
def search_emojipedia(query):
    html = cache.get(query)
    if html is None:
        url = u'http://emojipedia.org/search/?q={}'.format(query)
        r = requests.get(url)
        html = r.text
        cache.set(query, html)
    soup = bsoup(html, 'lxml')
    results = soup.find('ol', class_='search-results')
    results = results.find_all('a')
    results = [result.text.replace(' ', ' ', 1)
               for result in results if result is not None]
    return results
def get_indicator_code(ind):
    if not ind:
        return None
    iCodes = dict()
    r = requests.get('{}/{}'.format(BASE_URL, 'indicator'))
    soup = bsoup(r.text, 'lxml')
    tables = soup.findAll('tbody')
    for table in tables:
        links = table.findAll('a')
        for link in links:
            if link.text in ind:
                iCodes[link.text] = link.get('href').split('/')[-1]
    return iCodes
def _getFromWeb(self, s_date, s_filter='DI1'):
    '''
    Return a dataframe of the data for a specific date
    '''
    # retrieve data from the website
    url = self.url + s_date
    resp = requests.get(url)
    ofile = resp.text
    soup = bsoup(ofile, 'lxml')
    soup.prettify()
    tables = soup.find_all('table')
    storeValueRows = tables[6].find_all('tr')
    # check whether any values were returned
    if len(storeValueRows) == 2:
        return None
    # extract the information of interest
    storeMatrix = []
    s_ass = ''
    for row in storeValueRows:
        storeMatrixRow = []
        for cell in row.find_all('td'):
            s = cell.get_text().strip()
            if s != '':
                storeMatrixRow.append(s)
        if len(storeMatrixRow) == 6:
            s_ass = storeMatrixRow[0].split()[0]
            storeMatrixRow = [s_ass] + storeMatrixRow[1:]
        elif len(storeMatrixRow) == 5:
            storeMatrixRow = [s_ass] + storeMatrixRow
        storeMatrix.append(storeMatrixRow)
    # build a dataframe with the filtered data
    df = pd.DataFrame(storeMatrix[1:], columns=storeMatrix[0])
    if s_filter:
        df = df[df.Mercadoria == s_filter].reset_index(drop=True)
    df = df.ix[:, :-2]
    df.index = [list(df.Mercadoria + df.Vct), [s_date] * df.shape[0]]
    df.drop([u'Mercadoria', u'Vct'], axis=1, inplace=True)
    # convert the data into a dictionary
    d_rtn = {}
    d_aux = df.T.to_dict()
    for x in d_aux:
        if x[0] not in d_rtn:
            d_rtn[x[0]] = {}
        s_atual = d_aux[x][u'Pre\xe7o de Ajuste Atual']
        s_anterior = d_aux[x][u'Pre\xe7o de Ajuste Anterior']
        s_atual = s_atual.replace('.', '').replace(',', '.')
        s_anterior = s_anterior.replace('.', '').replace(',', '.')
        d_rtn[x[0]][x[1]] = {'PU_Anterior': float(s_anterior),
                             'PU_Atual': float(s_atual)}
    return d_rtn
def parseHTML(html):
    # parse a google page into result objects
    # netlib calls this before returning its results
    results = []  # list of result objects
    if html == None:
        # then work with `self.gooHTML'
        return []
    else:
        # use `html'
        # okay off to the lab, gotta whip up a small script to get this going
        # need to check that the ires div is available on the page
        resDiv = bsoup(html).find('div', {'id': 'ires'})
        resultTags = ""
        try:
            if resDiv.ol:
                # we know that the ol tag is available
                resultTags = resDiv.ol.findAll('li', {'class': 'g'})
                if len(resultTags) > 0:
                    for result in resultTags:
                        resultObj = goo_result(None, None, None, None, None)
                        # a couple of things could be in each result tag
                        # just realized GooDork would make an awesome browser extension!
                        if result.blockquote:
                            # sometimes results are grouped together using this tag
                            result = result.blockquote
                        if result.h3:
                            # the link to the actual page sits in this tag, with the title of the page
                            h3 = result.h3
                            if h3.a:
                                href = h3.a.get('href')
                                href = str(href[7:].split("&sa=")[0])
                                title = ''.join([string for string in h3.a.strings])
                                resultObj.setURL(href)
                                resultObj.setTitle(title)
                        if result.div:
                            # some extras, possibly containing the cache link, image or summary
                            summary = ''.join([string for string in result.div.strings])
                            resultObj.setSummary(summary)
                            if result.div.div:
                                # contains the cache link
                                cached = result.div.div
                                if cached.cite:
                                    citeURL = ''.join([string for string in cached.cite.strings])
                                    resultObj.setCiteURL(citeURL)
                                if cached.span:
                                    if cached.span.a:
                                        # here lies the cached link
                                        cacheLink = cached.span.a.get('href')
                                        resultObj.setCacheLink(cacheLink[2:])
                        results.append(resultObj)
                    return results
                else:
                    return []  # no results could be found
        except Exception, e:
            raise Exception("\n\t\t[goo_result] Problem parsing Google Search page:\n\t\t%s" % (e))
def do_crawl(job_id, depth, url):
    """
    Perform a new crawl task for the given `job_id` and `url`.

    If `depth > 0` then enqueue additional crawl tasks for each valid
    `<a href=""></a>` tag encountered at this URL.

    NOTE: This works purely on static HTML. No Javascript gets run!
    """
    log("Starting crawl (job_id='%s' depth='%s' url='%s')" % (job_id, depth, url))
    # Increment inprogress
    r.incr("JOB_%s_INPROGRESS" % job_id)
    try:
        # Get image urls
        page = requests.get(url).content
        html = bsoup(page)
        # Push all img srcs to database
        for img_tag in html.find_all('img'):
            if not img_tag.get("src"):
                # Skip images with empty src attrs
                continue
            r.sadd("JOB_%s_RESULTS" % job_id, img_tag["src"])
        # If we should go deeper, enqueue more crawls
        if depth > 0:
            for a_tag in html.find_all("a"):
                href = a_tag.get("href", "")
                if not href or href.startswith("javascript"):
                    continue
                # Build full url
                full_url = urljoin(url, href)
                # Enqueue a crawl for this job for this url, decrementing depth counter
                r.rpush('CRAWL_QUEUE', "%s$%s$%s" % (job_id, depth - 1, full_url))
    except requests.exceptions.SSLError:
        warn("SSL Error: Skipping url '%s'" % url)
    finally:
        # Always decrement inprogress
        r.decr("JOB_%s_INPROGRESS" % job_id)
        # Increment completed
        r.incr("JOB_%s_COMPLETED" % job_id)
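# Sketch of how a worker might consume do_crawl's queue entries, not part of the
# original source. Assumptions: `r` is the same module-level Redis client do_crawl
# uses, and the "job_id$depth$url" message format mirrors what it pushes onto
# CRAWL_QUEUE.
def _example_crawl_worker():
    while True:
        _, message = r.blpop('CRAWL_QUEUE')
        job_id, depth, url = message.decode().split('$', 2)
        do_crawl(job_id, int(depth), url)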
def loadImage(self, parFnImg):
    if os.path.isfile(parFnImg):
        self.dataImg = ops.OpenSlide(parFnImg)
        xmlInfo = bsoup(self.dataImg.properties['tiff.ImageDescription'], 'lxml-xml')
        lstXMLObj = xmlInfo.find_all("DataObject", ObjectType="PixelDataRepresentation")
        arrSizesMm = np.zeros(len(lstXMLObj), np.float)
        for i, ii in enumerate(lstXMLObj):
            tmp = ii.find_all("Attribute", Name="DICOM_PIXEL_SPACING")[0]
            tsiz = float(tmp.getText().split(" ")[0].replace('"', ''))
            tidx = int(ii.find_all("Attribute", Name="PIIM_PIXEL_DATA_REPRESENTATION_NUMBER")[0].getText())
            print i, " : ", tidx, " : ", tsiz
            arrSizesMm[tidx] = tsiz
        self.realScales = np.round(arrSizesMm / arrSizesMm[0])
        arrLayerSizes = np.array(self.dataImg.level_dimensions)
        self.layerSizes = np.array(
            [(arrLayerSizes[0][0] / ss, arrLayerSizes[0][1] / ss) for ss in self.realScales],
            np.int)
        self.numLevels = self.dataImg.level_count
def manage_remainder_links(self):
    for ak in self.acctobjs:
        for i in range(1, ak.num_acct_pages()):
            burl = ak.base + "?page={}".format(i)
            rb = requests.get(burl).text
            sp = bsoup(rb)
            lks = sp.findAll('a')
            for l in lks:
                tst = re.search(self.vid_patt, l['href'])
                if tst is not None:
                    ed = tst.groups()[0]
                    if ed not in self.ids:
                        self.ids[ed] = 1
                    else:
                        continue
                else:
                    continue
def results(html):
    res_wrap = None
    res_wrap = bsoup(html).find('div', {'id': 'ires'})
    if not (res_wrap):
        raise Exception('Could not parse file')
    if len(res_wrap) == 1:
        if res_wrap.ol:
            oltag = res_wrap.ol
            gresults = oltag.findAll('li', {'class': 'g'})
            # print type(gresults)  # ResultSet
            if len(gresults) >= 1:
                return gresults
            else:
                raise Exception('No results found!')
        else:
            raise Exception('No results found!')
    return