Example #1
    def scrape(self,url,parent):
        Logger.debug('Starting url scrape for {}'.format(url))
        config.last_url_and_parent=url+', {}'.format('' if parent is None else parent)

        new_url=base_util.unreplace_dot_url(url)

        response=self.http.get(new_url)
        Logger.debug('Got URL')
        #fall back to alternate prefixes when the response carries no data
        if not hasattr(response,'data') and new_url.startswith('www.'):
            #first try swapping the leading www. for an explicit scheme
            new_url=new_url.replace('www.','http://')

            response=self.http.get(new_url)

            if not hasattr(response,'data'):
                #still nothing; retry with both the scheme and www.
                new_url=new_url.replace('http://','http://www.')
                response=self.http.get(new_url)


        if hasattr(response,'data'):
            body=base_util.utf_8_safe_decode(response.data)

        else:
            Logger.error('No data associated with '+new_url)
            raise AttributeError(new_url+':::No data')

        return body,new_url
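
A minimal, self-contained sketch of the same www./http:// fallback, assuming the requests library in place of the self.http client and base_util helpers used above (fetch_with_fallback is an illustrative name, not part of the original code):

import requests

def fetch_with_fallback(url, timeout=10):
    """Try the url as given, then with alternate www./http:// prefixes."""
    candidates = [url]
    if url.startswith('www.'):
        candidates.append('http://' + url[len('www.'):])  #drop www., add a scheme
        candidates.append('http://' + url)                #keep www., add a scheme
    for candidate in candidates:
        try:
            response = requests.get(candidate, timeout=timeout)
            if response.ok and response.text:
                return response.text, candidate
        except requests.RequestException:
            continue
    raise AttributeError(url + ':::No data')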
Example #2
def grab_urls_and_genres():
    file_name="url_list.txt"

    line_template="{}:::{}\n"
    with open(file_name,encoding="latin-1",mode="w") as url_file_handle:


        for url_obj in URLToGenre.objects(original=True):
            genres_list=[g.genre for g in url_obj.genre]

            #one line per url: url:::genre1:::genre2:::...
            url_file_handle.write(line_template.format(unreplace_dot_url(url_obj.url),":::".join(genres_list)))
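
A minimal sketch of the same ':::'-delimited line format, assuming plain (url, genres) tuples in place of the URLToGenre collection and the unreplace_dot_url helper:

def write_url_genre_lines(records, file_name="url_list.txt"):
    #one line per url: url:::genre1:::genre2:::...
    line_template = "{}:::{}\n"
    with open(file_name, encoding="latin-1", mode="w") as url_file_handle:
        for url, genres in records:
            url_file_handle.write(line_template.format(url, ":::".join(genres)))

def read_url_genre_lines(file_name="url_list.txt"):
    #parse the lines back into (url, [genres]) pairs
    with open(file_name, encoding="latin-1") as url_file_handle:
        for line in url_file_handle:
            url, *genres = line.rstrip("\n").split(":::")
            yield url, genres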
Example #3
    def scrape_link_and_child(self,parent_url):
        parent_url=base_util.replace_dot_url(parent_url)
        webpage_body,parent_url=self.scrape(base_util.unreplace_dot_url(parent_url),None)

        #exit if we failed to scrape the website
        if webpage_body is None:
            return

        Logger.debug('Saving Parent')
        MongoDB.save_page(url=parent_url,page=webpage_body)
        Logger.info('Completed page: '+parent_url)

        #Now, grab the children of this webpage
        soup=BeautifulSoup(webpage_body,'html.parser',from_encoding="utf-8")
        all_ahref=[base_util.combine_parent_rel_link(parent_url,a.attrs['href'])
                   for a in soup.find_all('a') if 'href' in a.attrs]

        child_urls=random.sample(all_ahref,settings.GET_X_CHILD) if len(all_ahref)>=settings.GET_X_CHILD else all_ahref

        #get rid of bad normalization
        if not re.match('^www[.].*$',parent_url):
            Logger.info('Updating bad url for {}'.format(parent_url))
            MongoDB.update_url(base_util.normalize_url(parent_url),parent_url)

        if len(child_urls) > 0:

            #get the children; child_urls is a subset of all the urls
            for child_url in child_urls:
                Logger.debug('Get Child {}'.format(child_url))
                child_page=self.scrape(child_url,parent_url)

                if child_page is None:
                    #retry with links we have not tried yet, excluding the url that just failed
                    exploredset={child_url}
                    tries=0
                    for url in set(all_ahref)-exploredset:
                        if tries==settings.MAX_RETRIES:
                            Logger.info('Max retry count exceeded')
                            break

                        Logger.info("trying new url: "+url)

                        child_page=self.scrape(url,parent_url)

                        if child_page is not None:
                            break
                        exploredset.add(url)

                        tries+=1

                if child_page is not None:
                    Logger.debug('Saving Child {}'.format(child_url))
                    MongoDB.save_modify_url(url=base_util.replace_dot_url(child_url),parent=[MongoDB.get_url_object(parent_url)],genre=[],page=child_page)
                    Logger.info('Completed page: '+child_url)
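
The child-link extraction and sampling step can be sketched on its own. This assumes BeautifulSoup plus the standard library, with urllib.parse.urljoin standing in for base_util.combine_parent_rel_link and sample_size for settings.GET_X_CHILD:

import random
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def sample_child_links(parent_url, webpage_body, sample_size=5):
    #collect absolute child links from every <a href=...> on the page
    soup = BeautifulSoup(webpage_body, 'html.parser')
    all_ahref = [urljoin(parent_url, a.attrs['href'])
                 for a in soup.find_all('a') if 'href' in a.attrs]
    #take a random subset, or everything if the page has few links
    if len(all_ahref) <= sample_size:
        return all_ahref
    return random.sample(all_ahref, sample_size)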
Example #4
    def query_url(self,url):
        """
        Query the url in alexa, will automatically unreplace the ˙

        Returns normalized genre, aka any leading / or trailing / is removed

        :raise AssertionError: Assertion error(None page or empty page)
        :param url: url to be
        :return genrestring: genre string
        """

        self.scraper_count+=1

        if self.scraper_count>self.scraper_threshold:
            alexa_logger.debug("Alexa query, waiting for 30 seconds due to hitting scrape count")
            self.scraper_count=0
            time.sleep(30)

        url=unreplace_dot_url(url)

        page=self.get_page(AlexaScraper.alexa_template.format(url))

        all_genre_strings=[]
        if page is None or page.strip() == "":
            raise AssertionError("The page is either empty or none")
        else:
            page_soup=BeautifulSoup(page,"html.parser")
            #ignore world
            link_table=page_soup.find(id="category_link_table").find("tbody")

            all_genre_tr=link_table.find_all("tr")

            for tr in all_genre_tr:
                span=tr.find("span")

                all_genre_strings.append('/'.join([genre_component_link.string for genre_component_link in span.find_all("a")]))

        return all_genre_strings
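
The counter-based throttling in query_url is independent of Alexa. A minimal sketch, assuming a generic fetch callable rather than the AlexaScraper internals (which are not shown above):

import time

class ThrottledFetcher:
    def __init__(self, fetch, threshold=25, pause_seconds=30):
        self.fetch = fetch                  #callable taking a url and returning a page
        self.threshold = threshold          #requests allowed before pausing
        self.pause_seconds = pause_seconds
        self.count = 0

    def get(self, url):
        #pause once the request count crosses the threshold, then reset it
        self.count += 1
        if self.count > self.threshold:
            time.sleep(self.pause_seconds)
            self.count = 0
        return self.fetch(url)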
Example #5
def collect_bad_url():
    """
    Make bag-of-words (BOW) representations of the websites in the bad url list

    :return:
    """

    queue=DBQueue_old("genre_bow")

    #don't trust anything
    summarizer=Summarizer()
    bow=BagOfWords()
    short_genre_to_genre=coll.ShortGenre()
    url_to_bow=coll.URLBow()
    start_pos=queue.get()

    for c,line in enumerate(open("bad_url_summarize_bow.txt")):
        if c<start_pos:
            continue

        url=line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url,c))

            url_obj=coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj,"original") or not url_obj["original"]:
                print("Not original")
                continue

            #request the page anyway; most of the bad urls are due to bad pages
            data=Request().get_data(base_util.unreplace_dot_url(url_obj["url"]))

            if data is None:
                raise Exception('url {} has no page'.format(url))
            else:
                if not hasattr(url_obj,"page") or len(data)>len(url_obj["page"]):
                    print("updating data")
                    data=base_util.utf_8_safe_decode(data)

                    if not hasattr(url_obj,"page"):
                        #save page if the new page is significantly bigger than the old one
                        url_obj.save(page=data)

                    else:
                        url_obj.update(page=data)
                    url_obj.reload()

            if len(data) > len(url_obj.page):
                raise Exception("Inconsistency b/w data and page data")



            #url_obj=repair.genre_to_genre_data(url_obj.document)

            #get genre strings
            #register the genre with the short genres for faster retrieval
            genre_string_list=[]
            for g in url_obj.genre:
                normalized_string=base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True,add_to_set__genres=g)

            Logger.info("Getting bow rep")
            #get BOW representation
            page_text=url_obj.page if isinstance(url_obj.page,str) else base_util.utf_8_safe_decode(url_obj.page)
            bow_dict=bow.get_word_count(summarizer.summarize(page_text))

            if len(bow_dict)<20:
                raise Exception("Words less than 20")

            Logger.info("Update count:"+str(bow_dict))


            #store the url bow in urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"],bow=bow_dict,short_genres=genre_string_list)

            else:
                print('Bow already exists for url {}'.format(url))

            queue.increment()
        except Exception as ex:
            Logger.error(url_obj['url']+":::"+str(ex),"C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
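
The resume-from-queue pattern above (read a start position, skip already-processed lines, increment after each success) can be sketched with a plain file standing in for DBQueue_old; FileQueue is an illustrative stand-in, not the original class:

import os

class FileQueue:
    def __init__(self, path="queue_position.txt"):
        self.path = path

    def get(self):
        #return the last saved position, defaulting to 0
        if not os.path.exists(self.path):
            return 0
        with open(self.path) as position_file:
            return int(position_file.read().strip() or 0)

    def increment(self):
        #persist the next position so a restart resumes where we left off
        position = self.get() + 1
        with open(self.path, "w") as position_file:
            position_file.write(str(position))

#usage, mirroring the loop above:
#queue = FileQueue()
#start_pos = queue.get()
#for c, line in enumerate(open("bad_url_summarize_bow.txt")):
#    if c < start_pos:
#        continue
#    #...process the line, then...
#    queue.increment()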
Example #6
    def scrape_pipeline(self,webpageinfo_iterable,output_collection_cls):
        """
        Iterate over a WebPageInfo named tuple iterable. Get each url and grab its genres.

        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(str(webpageinfo_iterable)
                                                                                                  , output_collection_cls))

        for rank,webpageinfo_obj in itertools.islice(enumerate(webpageinfo_iterable),self.queue.get_location(),None):
            assert isinstance(webpageinfo_obj,WebPageInfo)
            webscraper_logger.debug("Current on rank number {}".format(rank))

            url=unreplace_dot_url(webpageinfo_obj.url)

            try:
                #first get the webpage
                page=self.get_page(url)

                if page is None:
                    raise AssertionError("Skippin rank {} due to empty page".format(rank))

                webscraper_logger.debug("Found page of length {}".format(len(page)))

                dot_replaced_url=replace_dot_url(url)

                alexa_genre_strings=[]
                try:
                    alexa_genre_strings=self.alexa_scraper.query_url(url)
                    dmoz_genre_strings=list(set(self.dmoz_scraper.query_url(url))-set(alexa_genre_strings))
                except DoesNotExist:
                    #sleep for 200 seconds and then retry the dmoz query
                    time.sleep(200)
                    dmoz_genre_strings=list(set(self.dmoz_scraper.query_url(url))-set(alexa_genre_strings))


                if len(alexa_genre_strings)+len(dmoz_genre_strings)==0:
                    raise AssertionError("Skippin rank {} due to no genres".format(rank))

                webscraper_logger.debug("Found {} alexa genres ".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                #then get the url's genres from alexa and dmoz that are EXACT matches and convert from string -> genre coll objects
                alexa_genre_refs=Genres.create_genres(alexa_genre_strings,dot_replaced_url)
                dmoz_genre_refs=Genres.create_genres(dmoz_genre_strings,dot_replaced_url)

                #convert from genres -> embedded genres for more info and storage in genre_metadata
                alexa_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="alexa") for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="dmoz") for g_ref in dmoz_genre_refs)

                #Create the genre metadata
                genre_metadata=GenreMetaData.create_genremetadata([eg for eg in itertools.chain(alexa_embedded_ref_list,dmoz_embedded_ref_list)],dot_replaced_url)


                #finally put page in collection
                output_collection_cls(genres_data=genre_metadata,url=dot_replaced_url,original=True,page=page,ranking=rank).save()
                webscraper_logger.debug("Done, commited to URlToGenreAlexa300k, there are now {} objects"
                                       .format(output_collection_cls.objects.count()))

            except Exception as ex:
                webscraper_logger.info("Exception occured: {}".format(str(ex)))

            #update reference so we don't go over the same again
            self.queue.increment_location()
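
A minimal sketch of how the Alexa and DMOZ genre strings are combined above: DMOZ strings already reported by Alexa are dropped with a set difference, and leading/trailing slashes are stripped as query_url promises. The helper name is illustrative only:

def combine_genre_strings(alexa_genre_strings, dmoz_genre_strings):
    #normalize both lists, then keep only the DMOZ genres Alexa did not report
    alexa = [g.strip("/") for g in alexa_genre_strings]
    dmoz = [g.strip("/") for g in dmoz_genre_strings]
    dmoz_only = list(set(dmoz) - set(alexa))
    return alexa, dmoz_only

#combine_genre_strings(["Arts/Music"], ["/Arts/Music/", "Shopping/Music"])
#-> (["Arts/Music"], ["Shopping/Music"])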
Example #7
    def scrape_pipeline(self,webpageinfo_iterable,input_collection_cls,start=0):
        """
        Iterate over a WebPageInfo named tuple iterable. Get each url and grab its genres.

        """
        webscraper_logger.debug("Starting webscraper, input from iterable {}, output to {}".format(str(webpageinfo_iterable)
                                                                                                  , input_collection_cls))

        for count,webpageinfo_obj in enumerate(webpageinfo_iterable,start=start):
            assert isinstance(webpageinfo_obj,WebPageInfo)

            url=unreplace_dot_url(webpageinfo_obj.url)

            try:

                dot_replaced_url=replace_dot_url(url)

                url_obj=input_collection_cls.objects.get(url=dot_replaced_url)

                if not hasattr(url_obj,"original") or not url_obj.original:
                    self.queue.increment_location()
                    continue


                webscraper_logger.debug("Current on count number {}".format(count))

                alexa_genre_strings=self.alexa_scraper.query_url(url)
                dmoz_genre_strings=list(set(self.dmoz_scraper.query_url(url))-set(alexa_genre_strings))

                if len(alexa_genre_strings)+len(dmoz_genre_strings)==0:
                    raise AssertionError("Skippin count {} due to no genres".format(count))

                webscraper_logger.debug("Found {} alexa genres ".format(len(alexa_genre_strings)))
                webscraper_logger.debug("Found {} dmoz genres".format(len(dmoz_genre_strings)))

                #then get the url's genres from alexa and dmoz that are EXACT matches and convert from string -> genre coll objects
                alexa_genre_refs=Genres.create_genres(alexa_genre_strings,dot_replaced_url)
                dmoz_genre_refs=Genres.create_genres(dmoz_genre_strings,dot_replaced_url)

                #convert from genres -> embedded genres for more info and storage in genre_metadata
                alexa_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="alexa") for g_ref in alexa_genre_refs)
                dmoz_embedded_ref_list=(EmbeddedGenre(type="url",genre=g_ref,result_type="dmoz") for g_ref in dmoz_genre_refs)

                #Create the genre metadata, deleting any stale metadata for this url first
                models.GenreMetaData.objects(url=dot_replaced_url).delete()

                genre_metadata=GenreMetaData.create_genremetadata([eg for eg in itertools.chain(alexa_embedded_ref_list,dmoz_embedded_ref_list)],dot_replaced_url)


                #finally attach the genre metadata to the existing url document
                url_obj.update(genres_data=genre_metadata)
                #re-fetch to confirm the updated document still loads
                input_collection_cls.objects.get(url=dot_replaced_url)
                webscraper_logger.debug("Done, validating")

                #something is very wrong with mongoengine, references do not work any longer
                fetched_genre_data=models.GenreMetaData.objects.get(url=dot_replaced_url).genres


            except (AssertionError,DoesNotExist) as ex:
                webscraper_logger.info("AssertException occured: {}".format(str(ex)))

            #update reference so we don't go over the same again
            self.queue.increment_location()
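
Examples #6 and #7 resume their loops differently: #6 skips already-processed items with itertools.islice, while this version only offsets the count with enumerate(start=...) and still consumes the whole iterable. A small standard-library sketch of the two styles:

import itertools

def resume_with_islice(iterable, start):
    #yields (rank, item) pairs, actually skipping the first `start` items
    return itertools.islice(enumerate(iterable), start, None)

def resume_with_enumerate(iterable, start):
    #yields (count, item) pairs; counting starts at `start` but nothing is skipped
    return enumerate(iterable, start=start)

#list(resume_with_islice("abcd", 2))    -> [(2, 'c'), (3, 'd')]
#list(resume_with_enumerate("abcd", 2)) -> [(2, 'a'), (3, 'b'), (4, 'c'), (5, 'd')]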