def define(word):
    url = ('http://www.google.com/dictionary/json?callback=a&sl=en&tl=en&q='
           + word + '&restrict=pr%2Cde&client=te')
    headers = {'User-Agent': 'Mozilla 5.10'}
    request = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(request)
    jsonstring = '[' + response.read()[2:-1] + ']'
    # Replace hex escape sequences with ASCII characters
    response = re.compile(r'\\x(\w{2})')
    ascii_string = response.sub(asciirepl, jsonstring)
    data = json.loads(ascii_string)
    try:
        defs = data[0]['primaries'][0]['entries']
    except KeyError:
        try:
            defs = data[0]['webDefinitions'][0]['entries']
        except KeyError:
            raise ValueError('no definition')
    # Get the first definition of the word
    for entry in defs:
        if entry['type'] == 'meaning':
            # Clean the definition of any HTML
            html = lxml.html.fragment_fromstring(
                '<p>' + entry['terms'][0]['text'] + '</p>')
            definition = html.text_content()
            break
    return definition.encode('utf8')
def links_grab(self, url):
    if self.incorrect_url_ending(url):
        return []
    if self.basic_auth_required:
        r = requests.get(url, auth=HTTPBasicAuth(self.username, self.password))
    else:
        r = requests.get(url)
    html = lxml.html.fromstring(unidecode(r.text))
    content = str(unidecode(html.text_content()))
    content = "".join([elem for elem in content if elem not in ["\t", "\n", "\r"]])
    content = " ".join([elem for elem in content.split(" ") if elem != ''])
    # if self.save_to_database:
    #     self.db.save(url, content)  # saves to the database, database.db
    datum = {}
    try:
        datum["title"] = html.xpath("//title")[0].text_content()
    except:
        print r.url
    if r.url.startswith("/"):
        datum["url"] = self.domain_name + r.url
    else:
        datum["url"] = r.url
    self.data.append(datum)
    url_list = html.xpath("//a/@href")
    uri_list = []
    for uri in url_list:
        if uri.startswith("/"):
            uri_list.append(self.domain_name + uri)
        else:
            uri_list.append(uri)
    return uri_list + [url]  # ensures the url is stored in the final list
def parse(url):
    data = urllib2.urlopen(url)
    d = feedparser.parse(data.read())
    feed = {
        'title': d.feed.title,
        'subtitle': d.feed.subtitle,
        'link': d.feed.link,
        'articles': []
    }
    for entry in d.entries:
        html = lxml.html.fromstring(entry.summary)
        img = html.xpath("//img")
        text = html.text_content().strip()
        if img is not None and len(img) > 0:
            img = img[0].attrib['src']
        else:
            img = ''
        article = {
            'title': entry.title,
            'content': text,
            'author': entry.author if 'author' in entry else '',
            'link': entry.link,
            'photo': img,
            'date': datetime.datetime.fromtimestamp(time.mktime(entry.updated_parsed))
        }
        feed['articles'].append(article)
    return feed
def _get_uks_link_mp3_cambridge(self, org_word, word, item):
    BASE_URL = 'http://dictionary.cambridge.org/dictionary/english/'
    url = BASE_URL + word
    print(url)
    html = requests.get(url).content
    tree = lxml.html.fromstring(html)
    uks = tree.xpath("//span[@class='sound audio_play_button pron-icon uk']/@data-src-mp3")
    # pos_header = tree.xpath("//div[@class='pos-header']")[0]
    # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[2]
    # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[2]/span
    # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[@class='uk']/span[@class='pron']/span[@class='ipa']/text()
    # uks_pron = tree.xpath("//span[@class='uk']/span[@class='pron']/span[@class='ipa']/text()")
    uks_pron_html = tree.xpath(
        "//*[@id='dataset-british']/div[1]/div[2]/div/div/div[1]"
        "/span[@class='uk']/span[@class='pron']/span[@class='ipa']")
    sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
    # import xml.etree.ElementTree as ET
    uks_pron = [html.text_content() for html in uks_pron_html]
    prons = u'/' + u'/,/'.join(uks_pron) + u'/'
    # if uks_pron:
    #     prons = u'/' + uks_pron[0] + u'/'
    self.tree.set(item, '#2', prons)
    if len(uks_pron) > 0:
        sqlVocab.update_uk_pron(org_word, prons)
    return uks
def parse_full_10K(cls, doc):
    text = ""
    for child in doc.getchildren():
        if child.tag == 'sec-header':
            continue
        html, properties = TXTML.get_HTML_from_document(child)
        if re.search('10[-]*[KQ]', properties['type']):
            text = text + html.text_content()
    return text
def getTodos(projects, objects):
    """ Get todos for each project """
    tags_dict = getTags(objects)
    for project in projects:
        for ref_id in project['ref_ids'].split():
            for object in objects:
                if object.attributes['id'].value == ref_id:
                    attribute_nodes = object.getElementsByTagName("attribute")
                    title = ""
                    content = ""
                    datemodified = ""
                    datecreated = ""
                    datecompleted = ""
                    tags = ""
                    for attribute_node in attribute_nodes:
                        if attribute_node.attributes['name'].value == 'title':
                            if attribute_node.childNodes:
                                title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                            break
                    # Check if todo has a note attached
                    if title:
                        for attribute_node in attribute_nodes:
                            # <attribute name="datemodified" >309306984.40529602766036987305
                            if attribute_node.attributes['name'].value == 'datemodified':
                                datemodified = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            # <attribute name="datecreated" >306520491.00000000000000000000
                            if attribute_node.attributes['name'].value == 'datecreated':
                                datecreated = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            # <attribute name="datecompleted" type="date">292880221.18648099899291992188
                            if attribute_node.attributes['name'].value == 'datecompleted':
                                datecompleted = convertCocoaEpoch(
                                    attribute_node.childNodes[0].nodeValue.encode("utf-8"))
                            if attribute_node.attributes['name'].value == 'content':
                                content = attribute_node.childNodes[0].nodeValue
                                # .encode("utf-8") -- let's encode in writeOutline
                                # I think we need to translate all these things
                                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>')
                                html = html.replace('\u2600', '&')
                                html = lxml.html.fromstring(html)
                                content = html.text_content().split('\n')
                                for l in html.iterlinks():
                                    content += [l[2]]
                        relationship_nodes = object.getElementsByTagName("relationship")
                        for relationship_node in relationship_nodes:
                            if relationship_node.attributes['name'].value == 'tags':
                                try:
                                    tags_id = relationship_node.attributes['idrefs'].value
                                    tags = [tags_dict[t_id] for t_id in tags_id.split()]
                                except:
                                    tags = ""
                        project['todos'].append([title, content, datecreated,
                                                 datemodified, datecompleted, tags])
    return projects
def index():
    if request.method == 'GET':
        return render_template('index.html')
    if request.method == 'POST':
        url = request.form.get('url')
        data = urllib.request.urlopen(url).read()
        # soup = BeautifulSoup(data, "lxml")
        html = lxml.html.fromstring(data)
        # text = soup.get_text()
        text = html.text_content()
        return render_template('output.html', result=analyze_text(text))
def links_grab(self, url):
    if self.incorrect_url_ending(url):
        return []
    if self.basic_auth_required:
        r = requests.get(url, auth=HTTPBasicAuth(self.username, self.password))
    else:
        r = requests.get(url)
    html = lxml.html.fromstring(unidecode(r.text))
    # checks if VEC urls are in page
    # remove this after VEC goes live
    VEC = [
        "/veterans-employment-center/",
        "https://www.vets.gov/veterans-employment-center/",
        "https://www.vets.gov/employment/",
        "/employment/"
    ]
    if any([uri in html.xpath("//a/@href") for uri in VEC]):
        print "found"
        return []
    content = str(unidecode(html.text_content()))
    if "Coming Soon." in content:
        print "found"
        return []
    content = "".join([elem for elem in content if elem not in ["\t", "\n", "\r"]])
    content = " ".join([elem for elem in content.split(" ") if elem != ''])
    if self.save_to_database:
        self.db.save(url, content)  # saves to the database, database.db
    datum = {}
    try:
        datum["title"] = html.xpath("//title")[0].text_content()
    except:
        print r.url
    datum["path"] = r.url
    datum["created"] = self.created
    datum["content"] = content
    if self.testing:
        datum["description"] = 'none'
    else:
        try:
            datum["description"] = str(self.df.Description[self.df.Page_Address == r.url].tolist()[0])
        except:
            datum["description"] = "none"
    datum["promote"] = "false"
    datum["language"] = "en"
    # remove after VEC goes live
    if "veterans-employment-center" not in url:
        self.data.append(datum)
    url_list = html.xpath("//a/@href")
    uri_list = []
    for uri in url_list:
        if uri.startswith("/"):
            uri_list.append(self.domain_name + uri)
        else:
            uri_list.append(uri)
    return uri_list + [url]  # ensures the url is stored in the final list
def get_info(file):
    with open(os.getcwd() + "/WEBPAGES_RAW/" + file, encoding='utf-8') as file:
        textcont = file.read()
    # gets the title of the page and a little bit of the content
    html = lxml.html.fromstring(textcont)
    title = ""
    for tag in html.xpath("//title"):
        title += tag.text_content()
        title += ""
    # body = html.xpath("//body").text_content()
    intro = " ".join([w for w in re.split(r"[^a-zA-Z0-9]", html.text_content()[:400]) if len(w) > 0])
    return (title, intro.lower())
def links_grab(self, url):
    if self.incorrect_url_ending(url):
        return []
    if self.basic_auth_required:
        r = requests.get(url, auth=HTTPBasicAuth(self.username, self.password))
    else:
        r = requests.get(url)
    html = lxml.html.fromstring(unidecode(r.text))
    # checks if VEC urls are in page
    # remove this after VEC goes live
    VEC = [
        "/veterans-employment-center/",
        "/employment/"
    ]
    if any([uri in html.xpath("//a/@href") for uri in VEC]):
        print "found"
        return []
    content = str(unidecode(html.text_content()))
    content = "".join([elem for elem in content if elem not in ["\t", "\n", "\r"]])
    content = " ".join([elem for elem in content.split(" ") if elem != ''])
    if self.save_to_database:
        self.db.save(url, content)  # saves to the database, database.db
    datum = {}
    try:
        datum["title"] = html.xpath("//title")[0].text_content()
    except:
        print r.url
    datum["path"] = r.url
    datum["created"] = self.created
    datum["content"] = content
    if self.testing:
        datum["description"] = 'none'
    else:
        try:
            datum["description"] = str(self.df.Description[self.df.Page_Address == r.url].tolist()[0])
        except:
            datum["description"] = "none"
    datum["promote"] = "false"
    datum["language"] = "en"
    # remove after VEC goes live
    if "veterans-employment-center" not in url:
        self.data.append(datum)
    url_list = html.xpath("//a/@href")
    uri_list = []
    for uri in url_list:
        if uri.startswith("/"):
            uri_list.append(self.domain_name + uri)
        else:
            uri_list.append(uri)
    return uri_list + [url]  # ensures the url is stored in the final list
def links_grab(self, url):
    r = requests.get(url)
    domain_name = self.get_domain_name(url)
    html = lxml.html.fromstring(unidecode(r.text))
    content = str(unidecode(html.text_content()))
    if self.save_to_database:
        self.db.save(url, content)  # saves to the database, database.db
    self.data.append(content)
    url_list = html.xpath("//a/@href")
    uri_list = []
    for uri in url_list:
        if uri.startswith("/"):
            uri_list.append(domain_name + uri)
        else:
            uri_list.append(uri)
    return uri_list + [url]  # ensures the url is stored in the final list
def scrape():
    data_items = []
    rss_xml = lxml.html.fromstring(scraperwiki.scrape(rss_url))
    for entry_el in rss_xml.findall('entry'):
        alert_id = entry_el.findtext('id').partition('.post-')[2]
        location_name = entry_el.findtext('title').replace('Rower Notification - ', '')
        html = lxml.html.fromstring(
            saxutils.unescape(entry_el.findtext('content'))
            .replace('<BR>', '\n').replace('<br />', '\n')
            .partition('___')[0].partition('Regards')[0])
        note_text = html.text_content().strip()
        (subject, sep, detail) = note_text.partition('\n')
        location_desc = subject.replace('Rower notification from Thames Water:', '').strip()  # Remove prefix
        date_time = entry_el.findtext('published').partition('.')[0].replace('T', ' ')
        alert_detail = detail.strip()
        data_items.append({
            'alert_id': alert_id,
            'loc_name': location_name,
            'loc_desc': location_desc,
            'published': date_time,
            'detail': alert_detail
        })
    # Save the data
    scraperwiki.sqlite.save(unique_keys=['alert_id'], data=data_items,
                            table_name='alerts', verbose=data_verbose)
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()
    return lxml.html.tostring(html)
def get_string(html_text):
    html = lxml.html.fromstring(html_text)
    remove_tags = ('.//style', './/script', './/noscript')
    for remove_tag in remove_tags:
        for tag in html.findall(remove_tag):
            tag.drop_tree()  # removal here is reflected in the original parsed tree
    codeframe_list = []
    lang_list = []
    # remove code frames
    for tag in html.findall(".//div[@class='code-frame']"):
        codeframe_list.append(tag.text_content())
        lang_list.append(tag.attrib["data-lang"])
        tag.drop_tree()
    atext_list = []
    ahref_list = []
    # remove href links
    for tag in html.cssselect('a'):
        if tag.text is not None:
            atext_list.append(tag.text)
        if tag.get('href') is not None:
            ahref_list.append(tag.get('href'))
        tag.drop_tree()
    code_list = []
    # remove inline code
    for cc in html.cssselect('code'):
        if cc.text is not None:
            code_list.append(cc.text)
        cc.drop_tree()
    text = html.text_content().strip('\n')
    return pd.Series(
        [
            "".join(text.split('\n')),
            ",".join(codeframe_list),
            ",".join(lang_list),
            ",".join(code_list),
            ",".join(atext_list),
            ",".join(ahref_list)
        ],
        index=['text', 'code-frame', 'lang', 'code', 'a-text', 'a-href'])
def getParcel(streetnum, street):
    streetname = street.rsplit(" ", 1)[0]
    pageURL = "http://www.padctnwebpro.com/WebproNashville/searchResults.asp?cboSearchType=Street&SearchVal1=" + urllib.quote(streetname) + "&SearchVal2=" + urllib.quote(streetnum)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = lxml.html.parse(opener.open(pageURL)).getroot()
    rows = html.cssselect('#T1 tbody tr')
    parcelID = ""
    if len(rows) > 1:
        i = 0
        while i <= len(rows) - 1:
            parcelID += list(rows[i])[0].text_content().replace(" ", "-")
            if i < len(rows) - 1:
                parcelID += "|"
            i += 1
    else:
        if len(rows) == 1:
            parcelID = list(rows[0])[0].text_content().replace(" ", "-")
        else:
            print "pageURL: " + pageURL
            print "html: " + html.text_content()
    print "parcelID = " + parcelID
    return parcelID
def get_clean_text(file):
    # strings of important html tags
    title = ""
    h1 = ""
    h2 = ""
    h3 = ""
    bold = ""
    try:
        with open(file, 'r', encoding="utf-8") as f:
            textcont = f.read()
        html = lxml.html.fromstring(textcont)
    except:
        return ("", "", "", "", "", "")
    words = html.text_content()
    # get the content in important tags
    for tag in html.xpath("//title"):
        title += tag.text_content()
        title += ""
    for tag in html.xpath("//h1"):
        h1 += tag.text_content()
        h1 += ""
    for tag in html.xpath("//h2"):
        h2 += tag.text_content()
        h2 += ""
    for tag in html.xpath("//h3"):
        h3 += tag.text_content()
        h3 += ""
    for tag in html.xpath("//bold"):
        bold += tag.text_content()
        bold += ""
    # return a tuple
    return (words.lower(), title.lower(), h1.lower(), h2.lower(), h3.lower(), bold.lower())
list_dir = sorted(list_dir)
id = 0
files = sorted(os.listdir("developer/DEV/" + list_dir[0]))
f = open("idToIndex.txt", "w")
for each_sub in list_dir:
    for each_json in sorted(os.listdir("developer/DEV/" + each_sub)):
        with open("developer/DEV/" + each_sub + "/" + each_json, encoding="utf-8") as read_file:
            stemProcesser = PorterStemmer()
            data = json.load(read_file)
            data_content = data["content"]
            data_content = data_content.encode("utf-8", "ignore")
            try:
                html = lxml.html.fromstring(data_content)
                parse_html_content = html.text_content()
                token_list = []
                parse_html_content = parse_html_content.translate(table)
                parse_html_split = parse_html_content.rsplit()
                # list of tokens
                new_list = [token for token in parse_html_split if token.isascii()]
                after_stem = []
                for word in new_list:
                    # stem processing
                    after_stem.append(stemProcesser.stem(word))
                # print(after_stem)
def _findTextInNode(bs_node_or_text):
    if isinstance(bs_node_or_text, basestring):
        return bs_node_or_text
    else:
        html = lxml.html.fromstring(unicode(bs_node_or_text))
        return html.text_content()
# scrape the San Luis Obispo Police Department Summary Report,
# parse the responses, then add them to the CSV.
from lxml import html
import requests
import csv

def oreo(s, start, end):
    return ((s.split(start))[1].split(end)[0]).strip()

page = requests.get('http://pdreport.slocity.org/policelog/rpcdsum.txt')
html = html.fromstring(page.content)
text = html.text_content()
cut_footer = text.split('--------------------------------------------------------------------------------')[0]
els = cut_footer.split('===============================================================================')
header = els.pop(0)
new_log_day = oreo(header, '\n', 'Summary Report').strip()
print 'DOWNLOADING SLO POLICE DATA...'
i = 0
items = []
for el in els:
    el = el.strip()
    if i % 2 == 0:
        time_data = el.split(' ')
        item = {}
        item['date'] = time_data[1]
def get_text_from_html(html_str):
    html = lxml.html.fromstring(html_str)
    contents = html.text_content().strip()
    return contents
def get_clean_text(html):
    """Removes extra blank spaces and nbsp from html text."""
    return " ".join(html.text_content().split())
def _get_descr(event):
    event = _fetch(_URL, {'sch.tv_event_id': event['id']})['tv_event']
    html = lxml.html.fragment_fromstring(event['descr'], create_parent=True)
    return html.text_content()
def get_canonical_dom(text_id, html):
    """
    Gets a list of first-level dom elements that are part of the law project (titre,
    chapitre, articles) by removing unwanted parts (exposé des motifs, décret).
    (Laws are usually made of a few div containing lots of p elements.)
    :param text_id:
    :param html:
    :return:
    """
    if 'Projet de loi de finances' in html.text_content()[0:1000]:
        # Projet de loi finance (ex: PRJLANR5L15B2272)
        return None
    # law parts are in div with class assnatSection, but not all of them are meaningful
    dom_elements = html.xpath("/html/body/div[starts-with(@class,'assnatSection')]")
    if dom_elements:
        dom_element = dom_elements[-1]
        if search(r'Annexe \w', dom_element.getchildren()[1].text_content()):
            # If there is one or more annexe (with title formatted as Annexe X), each one is in
            # a different div. Thus, more than one div should be aggregated to make the final draft
            # ex: PRJLANR5L15B2296, PRJLANR5L15B2416
            # Also, this should be in addition to an empty div (next case).
            # This part of the method should probably be recursive...
            children = []
            pass
        elif 'PRJ' in text_id and len(dom_element.text_content()) < 10:
            # if div content is less than 10 chars, go for the previous block
            children = dom_elements[-2]
        elif 'PRJ' in text_id and 'RAPPORT ANNEXÉ' in dom_element.text_content():
            # if there is one "rapport annexé", take both the previous div and this one
            children = dom_elements[-2].getchildren()
            children.extend(dom_elements[-1].getchildren())
        else:
            # otherwise, take all tags of the last div
            children = dom_element.getchildren()
        if not len(children):
            return None
        count_tags = reduce(count_by_element_type, children, {})
        # max_tag = (None, 0)
        # for count_tag in count_tags.items():
        #     if count_tag[1] > max_tag[1]:
        #         max_tag = count_tag
        max_tag = reduce(lambda acc, x: x if x[1] > acc[1] else acc, count_tags.items(), (None, 0))
        if max_tag[0] == 'div':
            # some laws are made of div containing lots of div (not the same format) => PRJLANR5L15BTC3995
            return None
        # print(count_tags)
        count_class = reduce(lambda acc, x: (acc + 1) if x.get('class') else acc, children, 0)
        # print(count_class)
        if count_class < 0.1 * len(children):
            # less than 10% of tags have a class, considered unparsable (projet de loi finance / budget)
            return None
        text_content = dom_element.text_content()
        res = []
        if 'PION' in text_id:
            # print('EXPOSÉ DES MOTIFS') if 'EXPOSÉ DES MOTIFS' in text_content else print('...')
            # print('PROPOSITION DE LOI') if 'proposition de loi' in text_content else print('...')
            if 'EXPOSÉ DES MOTIFS' in text_content and 'PROPOSITION DE LOI' in text_content:
                # in some law texts there is a first section, not meaningful, that needs to be removed
                for index in range(len(children)):
                    if 'PROPOSITION DE LOI' in children[index].text_content():
                        res = children[index + 1:]
                        break
                else:
                    # FIXME weird case
                    res = []
            else:
                res = children
        elif 'PRJ' in text_id:
            if 'table' not in count_tags:
                # if there is a table, it might be for: a signed pre-text or a table inside the law
                # but if there isn't, all the tags can be taken
                res = children
            else:
                for index in range(len(children)):
                    if 'Articleliminaire' in compile(r'\s+').sub('', children[index].text_content()):
                        res = children[index:]
                        break
                    if 'Article unique' in children[index].text_content():
                        res = children[index:]
                        break
                    if 'Article 1' in children[index].text_content():
                        # if article 1 is found, go backwards to search for chapter and part titles
                        for jndex in range(index, 0, -1):
                            if children[jndex].get('class') and 'assnatLoiTexte' in children[jndex].get('class'):
                                # print(jndex)
                                res = children[jndex + 1:]
                                break
                        else:
                            # if not found, we suspect that there is nothing before,
                            # hence every child is taken
                            res = children
                        break
        return res
def crawl(self, url):
    try:
        global num_of_visited_links, pq, total_downloaded_size
        # Checking if the specified page limit is reached or not
        if len(pq) > total_page_limit:
            return
        try:
            # Extracting home link of current page to get URL/robots.txt
            rpurl = url
            if '/' in url[url.index('.'):]:
                rpurl = url[:url.index('/', url.index('.'))]
            rpurl = rpurl + '/robots.txt'
            # Checking if current url can be visited
            rp = robotparser.RobotFileParser()
            rp.set_url(rpurl)
            rp.read()
            if not rp.can_fetch("*", url):
                return
        except Exception as e:
            if debug_mode:
                print 'Error in Robot Parser : ' + str(e)
            pass
        # Opening the url and reading its content
        page = urllib.urlopen(url)
        content = page.read()
        # content = content[content.index("<body"):content.index("</body")]
        # Using lxml library to extract html from the content
        html = lxml.html.fromstring(content)
        # Modifying all relative urls present in html file and making
        # them absolute using the specified url passed as parameter
        html.make_links_absolute(url)
        # Incrementing the total downloaded size of all pages
        total_downloaded_size = total_downloaded_size + sys.getsizeof(content)
        # Incrementing no. of visited links for each page visited
        num_of_visited_links = num_of_visited_links + 1
        # Printing out <Links Visited : __ | Relevant Links : __>
        stdout.write("\rLinks Visited : %i | Relevant Links : %i" % (num_of_visited_links, len(pq)))
        stdout.flush()
        # Extracting text content
        text_content = html.text_content()
        # Calculating word count for given query
        word_count = 0
        query_words = self.query.split()
        for w in query_words:
            word_count += text_content.count(w)
        # If word count is less than 1, the page is irrelevant
        if word_count < 1:
            return
        # Checking if the present url is an anchor jump url.
        # If yes, then extract the parent link from the anchor jump url
        # and use that as the current url if not already present in the
        # priority queue
        anchor_jump_link = ''
        if '#' in url:
            anchor_jump_link = url[:url.index('#')]
            # if self.pagevisited(anchor_jump_link):
            #     return
            # else:
            url = anchor_jump_link
        # Check to see if the page already exists or not.
        # If yes, then increase the page priority by one and return.
        # If not, then acquire lock and write url details onto links.txt
        # and push [priority, url] into pq (priority queue)
        existing_page = self.pagevisited(url)
        if not existing_page:
            self.lock.acquire()
            self.file.write('Link : ' + url + ' | Word Count(Priority) : ' + str(word_count) + '\n')
            self.lock.release()
            heappush(pq, [-word_count, url])
        else:
            existing_page[0] = existing_page[0] + (-1)
            return
        # Extracting all anchor tag elements
        urls = html.xpath('//a')
        # Iterating through all anchor tags that were extracted
        for u in urls:
            # Extracting the href attribute of each anchor tag to get a url
            link = u.get('href')
            # If extracted url not already visited, converting it to
            # lower case and passing it to crawl method recursively
            if not self.pagevisited(link):
                if issubclass(type(link), unicode) or issubclass(type(link), str):
                    self.crawl(link.lower())
    except Exception as e:
        if debug_mode:
            print 'Error in parser : url<' + url + '> : ' + str(e)
        pass
def run(self):
    try:
        global num_of_visited_links, pq, total_downloaded_size
        # Checking if the specified page limit is reached or not
        if len(pq) > total_page_limit:
            return
        # Checking if the type of url is correct
        if not issubclass(type(self.url), unicode) and not issubclass(type(self.url), str):
            return
        # Opening the url and reading its content
        page = urllib.urlopen(self.url)
        content = page.read()
        # content = content[content.index("<body"):content.index("</body")]
        # Using lxml library to extract html from the content
        html = lxml.html.fromstring(content)
        # Modifying all relative urls present in html file and making
        # them absolute using the specified url passed as parameter
        html.make_links_absolute(self.url)
        # Incrementing the total downloaded size of all pages
        total_downloaded_size = total_downloaded_size + sys.getsizeof(content)
        # Incrementing no. of visited links for each page visited
        num_of_visited_links = num_of_visited_links + 1
        # Printing out <Links Visited : __ | Relevant Links : __>
        # stdout.write("\rLinks Visited : %i | Relevant Links : %i" % (num_of_visited_links, len(pq)))
        # stdout.flush()
        # Extracting text content
        text_content = html.text_content()
        # Calculating word count for given query
        word_count = 0
        query_words = self.query.split()
        for w in query_words:
            word_count += text_content.count(w)
        # If word count is less than 1, the page is irrelevant
        if word_count < 1:
            return
        # Extracting all anchor tag elements
        urls = html.xpath('//a')
        # Iterating through all anchor tags that were extracted
        for u in urls:
            # Extracting the href attribute of each anchor tag to get a url
            link = u.get('href')
            # If extracted url not already visited, converting it to
            # lower case and passing it to crawl method
            if not self.pagevisited(link):
                if issubclass(type(link), unicode) or issubclass(type(link), str):
                    self.crawl(link.lower())
    except Exception as e:
        if debug_mode:
            print "Exception in Run : " + str(e)
        pass
def test_character(response, excerpt):
    html = lxml.html.fromstring(response.content)
    assert excerpt in subparsers.character(html.text_content())
def test_linear_feet(response, result):
    html = lxml.html.fromstring(response.content)
    assert subparsers.linear_feet(html.text_content()) == result