def fetch_pages_helper(self, urls, start, step, cache, results):
    """ Helper function for parallel fetching: fetches urls[start::step] and
    puts the list of fetched pages on the results queue """
    max_size = 5000000  # cap streamed responses at ~5 MB
    pages = []
    for i in range(start, len(urls), step):
        url = urls[i]
        if (i + 1) % 500 == 0:
            print "Fetched ", i, " urls"
        # Create the page up front so it can be cached even if the request fails
        page = Page(url)
        try:
            text = ''
            size = 0
            res = requests.get(url, headers=self.header, verify=False,
                               timeout=5, stream=True)
            # Stream the body in 10 KB chunks and abort oversized responses
            for chunk in res.iter_content(10000):
                text += chunk
                size += len(chunk)
                if size > max_size:
                    print "Size exceeds ", size
                    raise ValueError('response too large')
            if res.status_code == 200:
                if len(text) < self.max_html_size:
                    page.add_html(text)
            else:
                print "Failed to fetch ", url, res.status_code, start
        except:
            print "Failed to fetch ", url
        # Save to cache. Note that we always cache the page, even when the
        # request failed, so that it is not re-fetched in the future
        if self.caching:
            cache.add(url, page.get_json_obj())
        else:
            page.get_json_obj()  # hack (called for its side effects only)
        # Keep only pages with a non-trivial body
        if page.body and (len(page.get_text('body')) > 100):
            pages.append(page)
    results.put(pages)
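
# A minimal sketch of how fetch_pages_helper could be driven in parallel.
# Nothing below is part of the original module: the method name fetch_parallel,
# the default worker count, and the use of threading + Queue are illustrative
# assumptions only.
import threading  # assumed available; would normally sit at the top of the module
import Queue      # Python 2 stdlib queue (named `queue` in Python 3)

def fetch_parallel(self, urls, cache, num_workers=4):
    """ Hypothetical driver: fan urls out over num_workers worker threads """
    results = Queue.Queue()
    workers = []
    for start in range(num_workers):
        # Worker `start` handles urls[start::num_workers], matching the
        # (start, step) convention of fetch_pages_helper above
        t = threading.Thread(target=self.fetch_pages_helper,
                             args=(urls, start, num_workers, cache, results))
        t.start()
        workers.append(t)
    for t in workers:
        t.join()
    # Each worker puts exactly one list of Page objects on the queue
    pages = []
    for _ in range(num_workers):
        pages.extend(results.get())
    return pages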
def fetch(self, urls, out_file, extraction=True):
    """ Fetch each url and append the result to out_file as JSON lines

    Parameters:
    -----------
    urls: list of urls, each pointing to a website
    out_file: path of the output file; urls already recorded there are skipped
    extraction: if True, store the extracted page, otherwise store the raw html

    Returns:
    --------
    None. Fetched pages are written to out_file, one JSON object per line
    """
    # Resume support: skip urls that are already present in the output file
    if os.path.exists(out_file):
        fetched_urls = set()
        with open(out_file) as lines:
            for line in lines:
                try:
                    jsobj = json.loads(line)
                    fetched_urls.add(jsobj['url'])
                except:
                    traceback.print_exc()
        urls = [url for url in urls if url not in fetched_urls]
    print "Number of urls to fetch: ", len(urls)
    out = open(out_file, 'a+')
    for i, url in enumerate(urls):
        if (i + 1) % 20 == 0:
            print "Fetched ", i, " urls"
        try:
            res = requests.get(url, headers=self.header, verify=False, timeout=10)
            if res.status_code == 200:
                page = Page(url)
                if len(res.text) < self.max_html_size:
                    page.add_html(res.text)
                if extraction:
                    jspage = page.get_json_obj()
                else:
                    jspage = {'url': url, 'html': res.text}
                out.write(json.dumps(jspage) + '\n')
            else:
                print res.status_code, url
        except:
            print "Failed to fetch ", url
            traceback.print_exc()
    out.close()
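
# A minimal usage sketch for fetch(). The class name Fetcher and its
# zero-argument constructor are assumptions for illustration only; the original
# section shows just the two methods above.
if __name__ == '__main__':
    fetcher = Fetcher()
    seed_urls = ['http://example.com', 'http://example.org']
    # Results are appended to pages.json, one JSON object per line; on a
    # re-run, urls already recorded in that file are skipped
    fetcher.fetch(seed_urls, 'pages.json', extraction=True)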