def scrape(self, url, parent):
    """Fetch the page body for url, retrying with common scheme prefixes on failure."""
    Logger.debug('Starting url scrape for {}'.format(url))
    config.last_url_and_parent = url + ', {}'.format('' if parent is None else parent)

    new_url = base_util.unreplace_dot_url(url)
    response = self.http.get(new_url)
    Logger.debug('Got URL')

    # If the bare www. url returned no data, walk the prefix ladder:
    # www.foo.com -> http://foo.com -> http://www.foo.com
    if not hasattr(response, 'data') and new_url.startswith('www.'):
        new_url = new_url.replace('www.', 'http://')
        response = self.http.get(new_url)

        if not hasattr(response, 'data'):
            new_url = new_url.replace('http://', 'http://www.')
            response = self.http.get(new_url)

    if hasattr(response, 'data'):
        body = base_util.utf_8_safe_decode(response.data)
    else:
        Logger.error('No data associated with ' + new_url)
        raise AttributeError(new_url + ':::No data')

    return body, new_url
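
# A minimal standalone sketch of the same prefix-retry ladder, using only the
# standard library. fetch_with_prefix_retry is a hypothetical helper for
# illustration; the scraper itself goes through self.http and base_util.
import urllib.request
import urllib.error

def fetch_with_prefix_retry(url, timeout=10):
    candidates = [url]
    if url.startswith('www.'):
        candidates += ['http://' + url[len('www.'):], 'http://' + url]

    for candidate in candidates:
        try:
            with urllib.request.urlopen(candidate, timeout=timeout) as response:
                body = response.read()
            if body:
                return body, candidate
        except (urllib.error.URLError, ValueError):
            # ValueError covers scheme-less urls that urlopen rejects outright.
            continue

    raise AttributeError(url + ':::No data')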
def grab_urls_and_genres():
    """Dump every original URL and its genres to url_list.txt as url:::genre1:::genre2 lines."""
    file_name = "url_list.txt"
    line_template = "{}:::{}\n"

    with open(file_name, encoding="latin-1", mode="w") as url_file_handle:
        for url_obj in URLToGenre.objects(original=True):
            genres_list = [g.genre for g in url_obj.genre]
            url_file_handle.write(line_template.format(unreplace_dot_url(url_obj.url),
                                                       ":::".join(genres_list)))
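
# The inverse operation, shown as a sketch: parse url_list.txt back into a dict.
# read_urls_and_genres is a hypothetical helper, assuming the url:::genre1:::genre2
# line format written above.
def read_urls_and_genres(file_name="url_list.txt"):
    url_to_genres = {}
    with open(file_name, encoding="latin-1") as url_file_handle:
        for line in url_file_handle:
            url, *genres = line.rstrip("\n").split(":::")
            url_to_genres[url] = genres
    return url_to_genres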
def scrape_link_and_child(self, parent_url):
    parent_url = base_util.replace_dot_url(parent_url)

    # scrape() raises AttributeError when no data comes back, so treat that as failure.
    try:
        webpage_body, parent_url = self.scrape(base_util.unreplace_dot_url(parent_url), None)
    except AttributeError:
        webpage_body = None

    # Exit if we failed to scrape the website.
    if webpage_body is None:
        return

    Logger.debug('Saving Parent')
    MongoDB.save_page(url=parent_url, page=webpage_body)
    Logger.info('Completed page: ' + parent_url)

    # Now grab the children of this webpage.
    all_ahref = [base_util.combine_parent_rel_link(parent_url, a.attrs['href'])
                 for a in BeautifulSoup(webpage_body, 'html.parser', from_encoding="utf-8").find_all('a')
                 if 'href' in a.attrs]

    child_urls = random.sample(all_ahref, settings.GET_X_CHILD) if len(all_ahref) >= settings.GET_X_CHILD else all_ahref

    # Get rid of bad normalization.
    if not re.match('^www[.].*$', parent_url):
        Logger.info('Updating bad url for {}'.format(parent_url))
        MongoDB.update_url(base_util.normalize_url(parent_url), parent_url)

    # Scrape the children; child_urls is a subset of all_ahref.
    for child_url in child_urls:
        Logger.debug('Get Child {}'.format(child_url))

        try:
            child_page, child_url = self.scrape(child_url, parent_url)
        except AttributeError:
            child_page = None

        if child_page is None:
            # Fall back to links we have not tried yet, up to MAX_RETRIES of them.
            explored_set = set()
            tries = 0

            for url in set(all_ahref) - explored_set:
                if tries == settings.MAX_RETRIES:
                    Logger.info('Max retry number exceeded')
                    break

                Logger.info("trying new url: " + url)

                try:
                    child_page, child_url = self.scrape(url, parent_url)
                except AttributeError:
                    child_page = None

                if child_page is not None:
                    break

                explored_set.add(url)
                tries += 1

        if child_page is not None:
            Logger.debug('Saving Child {}'.format(child_url))
            MongoDB.save_modify_url(url=base_util.replace_dot_url(child_url),
                                    parent=[MongoDB.get_url_object(parent_url)],
                                    genre=[], page=child_page)
            Logger.info('Completed page: ' + child_url)
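
# A self-contained sketch of the child-link extraction step. urljoin stands in
# for base_util.combine_parent_rel_link here, which is an assumption about its
# behavior; sample_child_links is a hypothetical helper.
import random
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def sample_child_links(parent_url, html, sample_size):
    anchors = BeautifulSoup(html, 'html.parser').find_all('a')
    all_ahref = [urljoin(parent_url, a.attrs['href']) for a in anchors if 'href' in a.attrs]
    return random.sample(all_ahref, sample_size) if len(all_ahref) >= sample_size else all_ahref

# e.g. sample_child_links('http://example.com', '<a href="/about">about</a>', 5)
# -> ['http://example.com/about']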
def query_url(self, url):
    """
    Query the url in Alexa; the dot-replaced url is automatically unreplaced.

    Returns normalized genre strings, i.e. any leading / or trailing / is removed.

    :raise AssertionError: if the page is None or empty
    :param url: url to be queried
    :return: list of genre strings
    """
    # Throttle: after scraper_threshold queries, pause so we don't hammer Alexa.
    self.scraper_count += 1
    if self.scraper_count > self.scraper_threshold:
        alexa_logger.debug("Alexa query, waiting for 30 seconds due to hitting scrape count")
        self.scraper_count = 0
        time.sleep(30)

    url = unreplace_dot_url(url)
    page = self.get_page(AlexaScraper.alexa_template.format(url))
    all_genre_strings = []

    if page is None or page.strip() == "":
        raise AssertionError("The page is either empty or none")

    page_soup = BeautifulSoup(page, "html.parser")

    # Walk the category link table, ignoring the world links.
    link_table = page_soup.find(id="category_link_table").find("tbody")
    all_genre_tr = link_table.find_all("tr")

    for tr in all_genre_tr:
        span = tr.find("span")
        all_genre_strings.append('/'.join(genre_component_link.string
                                          for genre_component_link in span.find_all("a")))

    return all_genre_strings
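
# The inline counter-and-sleep logic above, factored into a reusable sketch.
# CountingThrottle is a hypothetical helper, not part of the scraper; the
# threshold and cooldown values are whatever the caller configures.
import time

class CountingThrottle:
    def __init__(self, threshold, cooldown_seconds):
        self.threshold = threshold
        self.cooldown_seconds = cooldown_seconds
        self.count = 0

    def tick(self):
        # Call once per outgoing query; sleeps after every `threshold` calls.
        self.count += 1
        if self.count > self.threshold:
            self.count = 0
            time.sleep(self.cooldown_seconds)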
def collect_bad_url():
    """
    Rebuild bag-of-words (BOW) representations for the websites in the bad url list.
    """
    queue = DBQueue_old("genre_bow")

    # Don't trust anything already cached; recompute from scratch.
    summarizer = Summarizer()
    bow = BagOfWords()
    short_genre_to_genre = coll.ShortGenre()
    url_to_bow = coll.URLBow()

    start_pos = queue.get()

    for c, line in enumerate(open("bad_url_summarize_bow.txt")):
        if c < start_pos:
            continue

        url = line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url, c))
            url_obj = coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj, "original") or not url_obj["original"]:
                print("Not original")
                continue

            # Request the page anyway; most of the bad entries are due to bad pages.
            data = Request().get_data(base_util.unreplace_dot_url(url_obj["url"]))

            if data is None:
                raise Exception('url {} has no page'.format(url))

            # Save the page if the new page is bigger than the stored one.
            if not hasattr(url_obj, "page") or len(data) > len(url_obj["page"]):
                print("updating data")
                data = base_util.utf_8_safe_decode(data)

                if not hasattr(url_obj, "page"):
                    url_obj.save(page=data)
                else:
                    url_obj.update(page=data)

                url_obj.reload()

                if len(data) > len(url_obj.page):
                    raise Exception("Inconsistency b/w data and page data")

            #url_obj=repair.genre_to_genre_data(url_obj.document)

            # Register each genre with its short genre for faster retrieval.
            genre_string_list = []
            for g in url_obj.genre:
                normalized_string = base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True, add_to_set__genres=g)

            Logger.info("Getting bow rep")

            # Get the BOW representation of the summarized page.
            bow_dict = bow.get_word_count(summarizer.summarize(
                url_obj.page if isinstance(url_obj.page, str) else base_util.utf_8_safe_decode(url_obj.page)))

            if len(bow_dict) < 20:
                raise Exception("Words less than 20")

            Logger.info("Update count:" + str(bow_dict))

            # Store the url bow in the URLBow table.
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"], bow=bow_dict, short_genres=genre_string_list)
            else:
                print('Exists bow url number {}'.format(url))

            queue.increment()

        except Exception as ex:
            Logger.error(url + ":::" + str(ex),
                         "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
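
# A minimal bag-of-words sketch, assuming BagOfWords.get_word_count returns a
# word -> count mapping over the summarized page text; the tokenizer below is
# an illustration, not the repo's actual one.
import re
from collections import Counter

def get_word_count(text):
    return Counter(re.findall(r"[a-z0-9']+", text.lower()))

# e.g. get_word_count("the cat and the hat")
# -> Counter({'the': 2, 'cat': 1, 'and': 1, 'hat': 1})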
def scrape_pipeline(self, webpageinfo_iterable, output_collection_cls):
    """
    Iterate over a WebPageInfo named tuple iterable, fetch each url, and grab its genres.
    """
    webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(
        str(webpageinfo_iterable), output_collection_cls))

    for rank, webpageinfo_obj in itertools.islice(enumerate(webpageinfo_iterable), self.queue.get_location(), None):
        assert isinstance(webpageinfo_obj, WebPageInfo)

        webscraper_logger.debug("Currently on rank number {}".format(rank))
        url = unreplace_dot_url(webpageinfo_obj.url)

        try:
            # First get the webpage.
            page = self.get_page(url)

            if page is None:
                raise AssertionError("Skipping rank {} due to empty page".format(rank))

            webscraper_logger.debug("Found page of length {}".format(len(page)))

            dot_replaced_url = replace_dot_url(url)

            # Get the url's genres from alexa and dmoz; dmoz genres that exactly
            # match alexa's are dropped.
            alexa_genre_strings = self.alexa_scraper.query_url(url)

            try:
                dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))
            except DoesNotExist:
                # Sleep for 200 seconds and then try again.
                time.sleep(200)
                dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))

            if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                raise AssertionError("Skipping rank {} due to no genres".format(rank))

            webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
            webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

            # Convert from string -> genre collection objects.
            alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
            dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

            # Convert from genres -> embedded genres for more info and storage in genre_metadata.
            alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa") for g_ref in alexa_genre_refs)
            dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz") for g_ref in dmoz_genre_refs)

            # Create the genre metadata.
            genre_metadata = GenreMetaData.create_genremetadata(
                [eg for eg in itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)], dot_replaced_url)

            # Finally, put the page in the output collection.
            output_collection_cls(genres_data=genre_metadata, url=dot_replaced_url, original=True,
                                  page=page, ranking=rank).save()

            webscraper_logger.debug("Done, committed to URlToGenreAlexa300k, there are now {} objects"
                                    .format(output_collection_cls.objects.count()))

        except Exception as ex:
            webscraper_logger.info("Exception occurred: {}".format(str(ex)))

        # Update the queue position so we don't go over the same url again.
        self.queue.increment_location()
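
# The pipeline resumes from self.queue.get_location() and advances it after every
# url, so a crash restarts at the first unprocessed rank. A minimal file-backed
# sketch of that contract (FileCheckpointQueue is a hypothetical stand-in for
# the repo's queue class):
class FileCheckpointQueue:
    def __init__(self, path):
        self.path = path

    def get_location(self):
        try:
            with open(self.path) as f:
                return int(f.read().strip() or 0)
        except FileNotFoundError:
            return 0

    def increment_location(self):
        with open(self.path, "w") as f:
            f.write(str(self.get_location() + 1))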
def scrape_pipeline(self, webpageinfo_iterable, input_collection_cls, start=0):
    """
    Iterate over a WebPageInfo named tuple iterable, look up each stored url,
    and refresh its genres.
    """
    webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(
        str(webpageinfo_iterable), input_collection_cls))

    for count, webpageinfo_obj in enumerate(webpageinfo_iterable, start=start):
        assert isinstance(webpageinfo_obj, WebPageInfo)

        url = unreplace_dot_url(webpageinfo_obj.url)

        try:
            dot_replaced_url = replace_dot_url(url)
            url_obj = input_collection_cls.objects.get(url=dot_replaced_url)

            if not hasattr(url_obj, "original") or not url_obj.original:
                self.queue.increment_location()
                continue

            webscraper_logger.debug("Currently on count number {}".format(count))

            # Get the url's genres from alexa and dmoz; dmoz genres that exactly
            # match alexa's are dropped.
            alexa_genre_strings = self.alexa_scraper.query_url(url)
            dmoz_genre_strings = list(set(self.dmoz_scraper.query_url(url)) - set(alexa_genre_strings))

            if len(alexa_genre_strings) + len(dmoz_genre_strings) == 0:
                raise AssertionError("Skipping count {} due to no genres".format(count))

            webscraper_logger.debug("Found {} alexa genres".format(len(alexa_genre_strings)))
            webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

            # Convert from string -> genre collection objects.
            alexa_genre_refs = Genres.create_genres(alexa_genre_strings, dot_replaced_url)
            dmoz_genre_refs = Genres.create_genres(dmoz_genre_strings, dot_replaced_url)

            # Convert from genres -> embedded genres for more info and storage in genre_metadata.
            alexa_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="alexa") for g_ref in alexa_genre_refs)
            dmoz_embedded_ref_list = (EmbeddedGenre(type="url", genre=g_ref, result_type="dmoz") for g_ref in dmoz_genre_refs)

            # Create the genre metadata, replacing any stale metadata for this url.
            models.GenreMetaData.objects(url=url).delete()
            genre_metadata = GenreMetaData.create_genremetadata(
                [eg for eg in itertools.chain(alexa_embedded_ref_list, dmoz_embedded_ref_list)], dot_replaced_url)

            # Finally, attach the metadata to the stored url object.
            url_obj.update(genres_data=genre_metadata)

            # Re-fetch to make sure the document still resolves after the update.
            input_collection_cls.objects.get(url=dot_replaced_url)
            webscraper_logger.debug("Done, validating")

            # Something is very wrong with mongoengine, references do not work any
            # longer, so validate by fetching the metadata directly.
            fetched_genre_data = models.GenreMetaData.objects.get(url=dot_replaced_url).genres

        except (AssertionError, DoesNotExist) as ex:
            webscraper_logger.info("AssertException occurred: {}".format(str(ex)))

        # Update the queue position so we don't go over the same url again.
        self.queue.increment_location()
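
# A hedged sketch of the update-then-verify round trip, using mongoengine
# directly. Page and the database name are hypothetical; the real pipeline uses
# input_collection_cls and GenreMetaData references.
from mongoengine import Document, StringField, DictField, connect

class Page(Document):
    url = StringField(required=True, unique=True)
    genres_data = DictField()

def update_and_verify(url, genre_metadata):
    connect("webscraper_demo")  # assumed database name
    page = Page.objects.get(url=url)
    # .update() writes straight to the database without touching the in-memory
    # document, hence the re-fetch to confirm the write landed.
    page.update(genres_data=genre_metadata)
    return Page.objects.get(url=url).genres_data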