Example #1
0
    def scrape_links(self,pos):
        """Scrape queued URLs one after another, starting at queue slot ``pos``.

        For each queue document found, its ``'url'`` is passed to
        ``scrape_link_and_child`` and the next slot number is taken from
        ``MongoDB.increment_url_counter()``. The loop ends at the first slot
        for which no document is returned.

        :param pos: queue number of the first document to process.
        """
        while True:
            queued_doc = MongoDB.get(URLQueue, 'document', number=pos)
            if queued_doc is None:
                # no document at this slot: stop
                break

            self.scrape_link_and_child(queued_doc['url'])

            # the counter decides which slot is fetched next
            pos = MongoDB.increment_url_counter()
Example #2
0
    def save_top_urls_to_mongo(cls):
        """Save every line of ``settings.OUTPUT_FILE`` to MongoDB.

        The file is read as ISO-8859-1 text. Each line is turned into a
        dictionary of webpage properties, which is then passed as keyword
        arguments to ``MongoDB.save_modify_url``.

        :return: ``cls``, so calls can be chained.
        """
        with open(settings.OUTPUT_FILE, encoding='ISO-8859-1') as url_file:
            for line in url_file:
                # dictionary of webpage properties for this line
                properties = cls.__get_url_properties_and_sanitize(line)
                MongoDB.save_modify_url(**properties)

        return cls
Example #3
0
    def scrape_urls(cls):
        """Start scraping from the stored queue position.

        Reads the ``'position'`` value of the ``type='queue'`` metadata entry
        and hands it to a new ``WebScraper``'s ``scrape_links``.

        :return: ``cls``, so calls can be chained.
        """
        start_at = MongoDB.get(MetaData, 'position', type='queue')

        scraper = WebScraper()
        scraper.scrape_links(start_at)

        return cls
Example #4
0
    def scrape(self):
        """Scrape randomly chosen links of the ``dmoz_home`` page into MongoDB.

        Fetches ``dmoz_home``, extracts its links, then repeatedly picks one
        link at random and scrapes it. Every result whose URL is not stored
        yet is counted; its page is downloaded, decoded and saved. The loop
        stops once ``settings.NUM_RANDOM_WEBPAGE`` such results were handled.
        A download or save failure is logged and still counts.

        :return: None. Returns early (after logging) when the home page
            yields no links.
        """
        home=self.http.get(dmoz_home)

        home_page_links=self._scrapeHomeAndGetLinks(home.data)

        # Picking a random element from an empty sequence raises, so bail out
        # with a clear log message instead.
        if not home_page_links:
            Logger.error("No links found on home page; nothing to scrape")
            return

        # NOTE(review): the counter only advances for results not yet stored,
        # so this loop may never end if too few new pages are reachable —
        # confirm whether an attempt limit is wanted.
        i=0
        while i<settings.NUM_RANDOM_WEBPAGE:
            result=self._scrapPage(random.choice(home_page_links))

            if result is not None and MongoDB.get_url_object(result['url']) is None:
                # counted before the download, so failed downloads count too
                i+=1
                try:
                    page=utf_8_safe_decode(self.http.get(result['url']).data)

                    MongoDB.save_modify_url(page=page,**result)

                    Logger.info("Completed: "+result['url'])
                except Exception as ex:
                    # best effort: log and move on to the next random link
                    Logger.error(ex)
Example #5
0
    def scrape_link_and_child(self,parent_url):
        """Scrape ``parent_url`` and a random sample of the pages it links to.

        The parent page is scraped and saved. Then up to
        ``settings.GET_X_CHILD`` of its ``<a href>`` targets are scraped and
        saved with the parent's genres and a reference to the parent. When a
        sampled child cannot be scraped, other links of the same page are
        tried (at most ``settings.MAX_RETRIES`` attempts); the first one that
        works is saved under its own URL.

        :param parent_url: URL of the page to scrape.
        :return: None. Returns early when the parent page cannot be scraped.
        """
        # NOTE(review): self.scrape is unpacked into two values here but its
        # result is compared to None as a single value below — confirm its
        # return shape for both call forms.
        parent_url=base_util.replace_dot_url(parent_url)
        webpage_body,parent_url=self.scrape(base_util.unreplace_dot_url(parent_url),None)

        #exit if failed to scrape website
        if webpage_body is None:
            return

        MongoDB.save_page(url=parent_url,page=webpage_body)
        Logger.info('Completed page: '+parent_url)

        #Now, we grab the children of this webpage
        all_ahref=[base_util.combine_parent_rel_link(parent_url,a.attrs['href']) for a in BeautifulSoup(webpage_body,'html.parser', from_encoding="utf-8").find_all('a') if 'href' in a.attrs]

        # sample only when there are enough links; otherwise take them all
        child_urls=random.sample(all_ahref,settings.GET_X_CHILD) if len(all_ahref)>=settings.GET_X_CHILD else all_ahref

        #get rid of bad normalization
        if not re.match('^www[.].*$',parent_url):
            Logger.info('Updating bad url for {}'.format(parent_url))
            MongoDB.update_url(base_util.normalize_url(parent_url),parent_url)

        if len(child_urls) > 0:
            parent_genres=MongoDB.get_genre(parent_url)

            #get the children
            for child_url in child_urls:
                # URL whose content ends up in child_page; a fallback link
                # replaces it below when the sampled child fails
                scraped_url=child_url
                child_page=self.scrape(child_url,parent_url)

                if child_page is None:
                    tries=0
                    # skip the link that just failed so no retry is wasted on it
                    for url in set(all_ahref)-{child_url}:
                        if tries==settings.MAX_RETRIES:
                            Logger.info('Max retrie number exceeded')
                            break

                        Logger.info("trying new url: "+url)

                        child_page=self.scrape(url,parent_url)

                        if child_page is not None:
                            scraped_url=url
                            break

                        tries+=1

                if child_page is not None:
                    # store the page under the URL it was actually fetched from
                    MongoDB.save_modify_url(url=base_util.replace_dot_url(scraped_url),parent=[MongoDB.get_url_object(parent_url)],genre=parent_genres,page=child_page)
                    Logger.info('Completed page: '+scraped_url)
        edit_distance = sys.maxsize

        alexa_genre_length += len(alexa_genre_name)
        for dmoz_genre_name, dmoz_genre_info in dmoz_dict.items():
            if edit_distance is 0:
                break

            if alexa_genre_name == dmoz_genre_name:
                exact_match += 1
                edit_distance = 0

            else:
                edit_distance = min(edit_distance, levenshtein(dmoz_genre_name, alexa_genre_name))

        if edit_distance is not sys.maxsize:
            total_edit_distance += edit_distance
            edit_distance_count += 1

    return {
        "alexa_total": len(alexa_dict),
        "edit_distance_count": edit_distance_count,
        "total_edit_distance": total_edit_distance,
        "alexa_match": exact_match,
        "alexa_genre_length": alexa_genre_length,
    }

    # accumulate stats and update the collection


# Module-level call: passes the host name and port from settings to MongoDB.connect.
MongoDB.connect(settings.HOST_NAME, settings.PORT)
Example #7
0
    def start(cls):
        """Call ``MongoDB.connect`` with the host name and port from settings.

        :return: ``cls``, so calls can be chained.
        """
        host, port = settings.HOST_NAME, settings.PORT
        MongoDB.connect(host, port)

        return cls
Example #8
0
    def create_url_queue(cls):
        """Push every ``URLToGenre`` document onto the queue.

        Documents are numbered from 0 in the order ``URLToGenre.objects``
        yields them; each number/document pair is passed to
        ``MongoDB.push_to_queue``.

        :return: ``cls``, so calls can be chained.
        """
        numbered_documents = enumerate(URLToGenre.objects)
        for queue_number, url_document in numbered_documents:
            MongoDB.push_to_queue(queue_number, url_document)

        return cls