Python MongoDB Examples

Programming Language: Python

Namespace/Package Name: db.database

Class/Type: MongoDB

Examples at hotexamples.com: 8

Python MongoDB - 8 examples found. These are the top-rated real-world Python examples of db.database.MongoDB extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

save_modify_url(3)

connect(2)

get(2)

get_url_object(2)

get_genre(1)

increment_url_counter(1)

push_to_queue(1)

save_page(1)

update_url(1)

Example #1

Show file

File: web_scraper.py Project: wangk1/research

    def scrape_links(self,pos):
        """Scrape queued URLs one after another, starting at queue number ``pos``.

        Each iteration looks up the ``URLQueue`` document for the current
        number, scrapes its ``'url'`` (together with child links) and then
        takes the next number from ``MongoDB.increment_url_counter()``.
        The loop ends as soon as a lookup returns ``None``.
        """
        while True:
            queued=MongoDB.get(URLQueue,'document',number=pos)
            if queued is None:
                break

            self.scrape_link_and_child(queued['url'])
            pos=MongoDB.increment_url_counter()

Example #2

Show file

File: web_scrape_pipeline.py Project: wangk1/research

    def save_top_urls_to_mongo(cls):
        """Save every line of ``settings.OUTPUT_FILE`` through ``MongoDB.save_modify_url``.

        The file is opened with ISO-8859-1 encoding.  Each line is converted
        into a dictionary by ``__get_url_properties_and_sanitize`` and that
        dictionary is passed on as keyword arguments.

        Returns:
            ``cls``.
        """
        with open(settings.OUTPUT_FILE,encoding='ISO-8859-1') as url_file:
            for line in url_file:
                # keyword arguments describing this webpage
                properties=cls.__get_url_properties_and_sanitize(line)
                MongoDB.save_modify_url(**properties)

        return cls

Example #3

Show file

File: web_scrape_pipeline.py Project: wangk1/research

    def scrape_urls(cls):
        """Read the stored queue position and let a new ``WebScraper`` scrape from there.

        Returns:
            ``cls``.
        """
        # Look up the position first, then build the scraper
        queue_position=MongoDB.get(MetaData,'position',type='queue')
        scraper=WebScraper()
        scraper.scrape_links(queue_position)

        return cls

Example #4

Show file

File: dmoz.py Project: wangk1/research

    def scrape(self):
        """Scrape randomly chosen pages linked from the dmoz home page into MongoDB.

        Fetches ``dmoz_home``, extracts its links, then repeatedly picks one
        link at random and scrapes it.  A result is accepted when it is not
        ``None`` and ``MongoDB.get_url_object`` has no entry for its URL; the
        loop stops after ``settings.NUM_RANDOM_WEBPAGE`` accepted results.

        Returns:
            None.  Errors while downloading or saving an accepted page are
            logged through ``Logger.error`` instead of being raised.  If the
            home page yields no links, an error is logged and nothing is scraped.
        """
        home=self.http.get(dmoz_home)

        home_page_links=self._scrapeHomeAndGetLinks(home.data)

        # Random selection from an empty list would raise, so stop here with
        # an explicit message instead.
        if not home_page_links:
            Logger.error("No links found on the dmoz home page")
            return

        accepted=0
        while accepted<settings.NUM_RANDOM_WEBPAGE:
            result=self._scrapPage(random.choice(home_page_links))

            # Skip failed scrapes and URLs that are already stored
            if result is None or MongoDB.get_url_object(result['url']) is not None:
                continue

            # Counted before the download, so a page that fails below still
            # uses up one of the NUM_RANDOM_WEBPAGE slots.
            accepted+=1
            try:
                page=utf_8_safe_decode(self.http.get(result['url']).data)

                MongoDB.save_modify_url(page=page,**result)

                Logger.info("Completed: "+result['url'])
            except Exception as ex:
                # Best effort per page: log and move on to the next pick
                Logger.error(ex)

Example #5

Show file

File: web_scraper.py Project: wangk1/research

    def scrape_link_and_child(self,parent_url):
        """Scrape ``parent_url``, save it, then scrape and save some of its child links.

        The parent page is saved with ``MongoDB.save_page``.  Up to
        ``settings.GET_X_CHILD`` of its ``<a href>`` targets are then scraped
        (a random sample when there are at least that many, otherwise all of
        them) and saved with the parent's genres.  When a chosen child cannot
        be scraped, other links from the page are tried until one succeeds or
        ``settings.MAX_RETRIES`` attempts have failed.

        Args:
            parent_url: URL of the page whose content and children are scraped.

        Returns:
            None.  Returns early when the parent page itself cannot be scraped.
        """
        parent_url=base_util.replace_dot_url(parent_url)
        webpage_body,parent_url=self.scrape(base_util.unreplace_dot_url(parent_url),None)

        # exit if we failed to scrape the website
        if webpage_body is None:
            return

        MongoDB.save_page(url=parent_url,page=webpage_body)
        Logger.info('Completed page: '+parent_url)

        # Now, we grab the children of this webpage
        all_ahref=[base_util.combine_parent_rel_link(parent_url,a.attrs['href']) for a in BeautifulSoup(webpage_body,'html.parser', from_encoding="utf-8").find_all('a') if 'href' in a.attrs]

        child_urls=random.sample(all_ahref,settings.GET_X_CHILD) if len(all_ahref)>=settings.GET_X_CHILD else all_ahref

        # get rid of bad normalization
        if not re.match('^www[.].*$',parent_url):
            Logger.info('Updating bad url for {}'.format(parent_url))
            MongoDB.update_url(base_util.normalize_url(parent_url),parent_url)

        if not child_urls:
            return

        parent_genres=MongoDB.get_genre(parent_url)

        # get the children
        for child_url in child_urls:
            # URL whose content ends up in child_page; replaced when a
            # fallback link is scraped instead of the chosen child.
            scraped_url=child_url
            # NOTE(review): scrape() is unpacked as a pair for the parent above
            # but compared against None here - confirm its return shape.
            child_page=self.scrape(child_url,parent_url)

            if child_page is None:
                tries=0
                for url in set(all_ahref):
                    if tries==settings.MAX_RETRIES:
                        Logger.info('Max retrie number exceeded')
                        break

                    Logger.info("trying new url: "+url)

                    child_page=self.scrape(url,parent_url)

                    if child_page is not None:
                        scraped_url=url
                        break

                    tries+=1

            if child_page is not None:
                # Store the page under the URL it was actually fetched from,
                # not under the child URL that failed.
                MongoDB.save_modify_url(url=base_util.replace_dot_url(scraped_url),parent=[MongoDB.get_url_object(parent_url)],genre=parent_genres,page=child_page)
                Logger.info('Completed page: '+scraped_url)

Example #6

Show file

File: dmoz_alexa_similarity_bak.py Project: wangk1/research

        edit_distance = sys.maxsize

        alexa_genre_length += len(alexa_genre_name)
        for dmoz_genre_name, dmoz_genre_info in dmoz_dict.items():
            if edit_distance is 0:
                break

            if alexa_genre_name == dmoz_genre_name:
                exact_match += 1
                edit_distance = 0

            else:
                edit_distance = min(edit_distance, levenshtein(dmoz_genre_name, alexa_genre_name))

        if edit_distance is not sys.maxsize:
            total_edit_distance += edit_distance
            edit_distance_count += 1

    return {
        "alexa_total": len(alexa_dict),
        "edit_distance_count": edit_distance_count,
        "total_edit_distance": total_edit_distance,
        "alexa_match": exact_match,
        "alexa_genre_length": alexa_genre_length,
    }

    # accumulate stats and update the collection


# Connect MongoDB using the host name and port taken from settings.
MongoDB.connect(settings.HOST_NAME, settings.PORT)

Example #7

Show file

File: web_scrape_pipeline.py Project: wangk1/research

    def start(cls):
        """Connect ``MongoDB`` with the host name and port from ``settings``.

        Returns:
            ``cls``.
        """
        host,port=settings.HOST_NAME,settings.PORT
        MongoDB.connect(host,port)

        return cls

Example #8

Show file

File: web_scrape_pipeline.py Project: wangk1/research

    def create_url_queue(cls):
        """Push every document of ``URLToGenre.objects`` to the queue with its 0-based index.

        Returns:
            ``cls``.
        """
        position=0
        for url_doc in URLToGenre.objects:
            MongoDB.push_to_queue(position,url_doc)
            position+=1

        return cls