def crawl_one_keyword(keyword):
    """Scrape the search-engine results page (SERP) for one keyword.

    Args:
        keyword: the search phrase to query.

    Returns:
        list of links; each link is a dict with keys:
        link, rank, snippet, title, visible_link, date, keyword.

    Raises:
        SERPError: when the download fails or the scraper is blocked.
    """
    url = get_keyword_url(keyword)
    # Lazy %-style args: the message is only formatted if DEBUG is enabled.
    logging.debug("trying to download SERP %s", url)
    try:
        # headers are not used here; leading underscore marks the discard.
        rawhtml, _headers = urlrequest.get_raw_html(url)
    except requests.exceptions.RequestException as e:
        # Wrap transport-level failures in the domain exception, chaining
        # the original cause so the full traceback is preserved.
        raise SERPError(e) from e
    date = _date()
    if is_blocked(rawhtml):
        # NOTE(review): raised without a message in the original — kept as-is
        # in case callers rely on the empty args.
        raise SERPError()
    links = parse(rawhtml)
    # Stamp every parsed link with scraping metadata before returning.
    for link in links:
        link['date'] = date
        link['keyword'] = keyword
        link['link'] = encode(link['link'])
    return links
def test_parsing(self):
    """Fetch a live Bing SERP and verify the parser yields plausible links.

    NOTE: integration test — requires network access to bing.com.
    """
    url = "http://www.bing.com/search?q=ahoj&qs=ds&form=QBLH&scope=web"
    page = requests.get(url).text
    results = parse(page)
    # The parser must find at least one result on a real SERP.
    self.assertTrue(len(results) > 0)
    first = results[0]
    # Every result dict carries the core fields, and the link looks like a URL.
    self.assertTrue('link' in first and 'snippet' in first and 'title' in first)
    self.assertTrue('http' in first['link'])