def crawl_one_keyword(keyword):
    """Scrape the search-engine results page (SERP) for one keyword.

    Args:
        keyword: the search term to scrape.

    Returns:
        list of links; each link is a dict with keys:
        link, rank, snippet, title, visible_link, date, keyword.

    Raises:
        SERPError: on a network failure, or when the scraper detects
            that it has been blocked by the search engine.
    """
    url = get_keyword_url(keyword)
    # lazy %-style args: the URL is only formatted when DEBUG is enabled
    logging.debug("trying to download SERP %s", url)
    try:
        rawhtml, _ = urlrequest.get_raw_html(url)
    except requests.exceptions.RequestException as e:
        # chain the cause so the original network error stays visible
        raise SERPError(e) from e
    date = _date()
    if is_blocked(rawhtml):
        raise SERPError()
    links = parse(rawhtml)
    # annotate each link with when and for which keyword it was scraped
    for link in links:
        link['date'] = date
        link['keyword'] = keyword
        link['link'] = encode(link['link'])
    return links
def _generate(self):
    """Yield words endlessly, taken from the titles of random Wikipedia pages.

    Each iteration fetches https://en.wikipedia.org/wiki/Special:Random,
    reads the HTML page title, strips the trailing site suffix and yields
    the remaining whitespace-separated words one at a time.

    Yields:
        str: one word of a random article title per iteration.
    """
    suffix = " - Wikipedia, the free encyclopedia"
    while True:
        html, _ = urlrequest.get_raw_html('https://en.wikipedia.org/wiki/Special:Random')
        pagetitle = BeautifulSoup(html, "lxml").html.head.title.string
        # Strip the suffix only when it is actually present. The previous
        # unconditional slice chopped the last len(suffix) characters off
        # every title, mangling it whenever Wikipedia changes the suffix
        # (it is now " - Wikipedia").
        if pagetitle.endswith(suffix):
            title = pagetitle[:-len(suffix)]
        else:
            title = pagetitle
        for word in title.split():
            yield word
def _generate(self):
    """Yield words endlessly, taken from the body text of random Wikipedia articles.

    Each iteration fetches https://en.wikipedia.org/wiki/Special:Random,
    locates the main content div and yields its whitespace-separated
    words one at a time; empty articles are skipped.

    Yields:
        str: one word of a random article's text per iteration.
    """
    random_url = "https://en.wikipedia.org/wiki/Special:Random"
    while True:
        page, _ = urlrequest.get_raw_html(random_url)
        root = BeautifulSoup(page, "lxml").html
        content = root('div', {"id": "mw-content-text"})
        if not content:
            # article is probably empty
            continue
        for token in content[0].text.split():
            yield token