def index_single_document(self, uuid):
    '''
    Use this method to crawl a single document. Provide the item's
    UUID as an argument. For example:

    crawler = Crawler()
    crawler.index_single_document('9E474B89-E36B-4B9D-2D38-7C7CCBDBB030')
    '''
    print('\nAttempting to index document ' + uuid + '...\n')
    start_time = time.time()
    try:
        sd_obj = SolrDocument(uuid)
        sd_obj.process_item()
        solrdocument = sd_obj.fields
        if crawlutil().is_valid_document(solrdocument):
            # Commit the document and keep the response so we can read
            # both the status and any error message from a single request.
            # Note: solr.update() expects a list.
            response = self.solr.update([solrdocument], 'json', commit=True)
            if response.status == 200:
                print('Successfully indexed ' + uuid + ' in ' +
                      crawlutil().get_elapsed_time_in_seconds(start_time) +
                      ' seconds.')
            else:
                print('Error: ' +
                      str(response.raw_content['error']['msg']))
        else:
            print('Error: Unable to index ' + uuid + ' due to '
                  'a datatype mismatch.')
    except TypeError as error:
        print("Type Error: {0}".format(error) +
              " Unable to process document " + uuid + '.')
    except Exception as error:
        print("Error: {0}".format(error) + " -----> " + uuid)
def index_single_document(self, uuid):
    '''
    Use this method to crawl a single document. Provide the item's
    UUID as an argument. For example:

    crawler = Crawler()
    crawler.index_single_document('9E474B89-E36B-4B9D-2D38-7C7CCBDBB030')
    '''
    print('\nAttempting to index document ' + uuid + '...\n')
    start_time = time.time()
    try:
        solrdocument = SolrDocument(uuid).fields
        if crawlutil().is_valid_document(solrdocument):
            # Commit the document and keep the response so we can read
            # both the status and any error message from a single request.
            # Note: solr.update() expects a list.
            response = self.solr.update([solrdocument], 'json', commit=True)
            if response.status == 200:
                print('Successfully indexed ' + uuid + ' in ' +
                      crawlutil().get_elapsed_time_in_seconds(start_time) +
                      ' seconds.')
            else:
                print('Error: ' +
                      str(response.raw_content['error']['msg']))
        else:
            print('Error: Unable to index ' + uuid + ' due to '
                  'a datatype mismatch.')
    except TypeError:
        print("Error: Unable to process document " + uuid + '.')
    except Exception as error:
        print("Error: {0}".format(error) + " -----> " + uuid)
def index_document_list(self, uuid_list, chunksize=20, stop_at_invalid=True):
    """
    Indexes a list of uuids. The list is generated elsewhere.
    """
    if isinstance(uuid_list, list):
        document_count = 0
        documents = []
        total_count = len(uuid_list)
        i = 0
        for uuid in uuid_list:
            try:
                manifest = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                print('Where is ' + uuid + ' in the manifest?')
                manifest = False
            if manifest is not False:
                try:
                    sd_obj = SolrDocument(uuid)
                    if isinstance(self.max_geo_zoom, int):
                        if self.max_geo_zoom > 5:
                            # only apply zoom values above the default
                            sd_obj.max_geo_zoom = self.max_geo_zoom
                    sd_obj.process_item()
                    if isinstance(self.human_remains, int):
                        if self.human_remains > 0:
                            sd_obj.fields['human_remains'] = self.human_remains
                    solrdocument = sd_obj.fields
                    if crawlutil().is_valid_document(solrdocument):
                        if solrdocument is not None:
                            i += 1
                            print('OK to index: ' + uuid)
                            documents.append(solrdocument)
                            # saves the time this was indexed
                            manifest.indexed_save()
                        else:
                            print('Something wrong with: ' + uuid)
                            if stop_at_invalid:
                                break
                    else:
                        print('Not valid: ' + uuid)
                        if stop_at_invalid:
                            break
                except Exception as error:
                    print("Error: {0}".format(error) + " -----> " + uuid)
                    if stop_at_invalid:
                        break
            if len(documents) >= chunksize:
                ok = self.commit_documents(documents, i, total_count)
                if ok is False and stop_at_invalid:
                    # a problem in committing the documents
                    break
                documents = []
        # now finish off the remaining documents
        if len(documents) > 0:
            ok = self.commit_documents(documents, i, total_count)
def index_document_list(self, uuid_list, chunksize=20, stop_at_invalid=True):
    """
    Indexes a list of uuids. The list is generated elsewhere.
    """
    if isinstance(uuid_list, list):
        document_count = 0
        documents = []
        total_count = len(uuid_list)
        i = 0
        for uuid in uuid_list:
            try:
                manifest = Manifest.objects.get(uuid=uuid)
            except Manifest.DoesNotExist:
                print('Where is ' + uuid + ' in the manifest?')
                manifest = False
            if manifest is not False:
                try:
                    solrdocument = SolrDocument(uuid).fields
                    if crawlutil().is_valid_document(solrdocument):
                        i += 1
                        print('OK to index: ' + uuid)
                        documents.append(solrdocument)
                        # saves the time this was indexed
                        manifest.indexed_save()
                    else:
                        print('Not valid: ' + uuid)
                        if stop_at_invalid:
                            break
                except Exception as error:
                    print("Error: {0}".format(error) + " -----> " + uuid)
                    if stop_at_invalid:
                        break
            if len(documents) >= chunksize:
                ok = self.commit_documents(documents, i, total_count)
                if ok is False and stop_at_invalid:
                    # a problem in committing the documents
                    break
                documents = []
        # now finish off the remaining documents
        if len(documents) > 0:
            ok = self.commit_documents(documents, i, total_count)
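# A minimal usage sketch for index_document_list(), assuming a configured
# Crawler instance. The UUID is the same example value shown in the
# index_single_document() docstring; max_geo_zoom and human_remains are the
# optional attributes read by the fuller variant of this method above
# (zoom values are applied only if greater than 5, human_remains only if
# greater than 0).
#
#   crawler = Crawler()
#   crawler.max_geo_zoom = 10   # optional: limit geospatial zoom precision
#   crawler.human_remains = 1   # optional: set a human_remains field on each document
#   crawler.index_document_list(
#       ['9E474B89-E36B-4B9D-2D38-7C7CCBDBB030'],
#       chunksize=20,
#       stop_at_invalid=False,
#   )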
def crawl(self, chunksize=100):
    '''
    For efficiency, this method processes documents in "chunks." The
    default chunk size is 100, but one can specify other values. For
    example, to specify a chunksize of 500, use this method as follows:

    crawler = Crawler()
    crawler.crawl(500)
    '''
    # Get a logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    print('\n\nStarting crawl...\n')
    print("(#)\tUUID")
    print('--------------------------------------------')
    document_count = 0
    while self.uuidlist is not None:
        documents = []
        ok_manifests = []
        # Process the UUID list in chunks
        for uuid in islice(self.uuidlist, 0, chunksize):
            try:
                sd_obj = SolrDocument(uuid)
                if isinstance(self.max_geo_zoom, int):
                    if self.max_geo_zoom > 5:
                        # only apply zoom values above the default
                        sd_obj.max_geo_zoom = self.max_geo_zoom
                sd_obj.process_item()
                solrdocument = sd_obj.fields
                if crawlutil().is_valid_document(solrdocument):
                    try:
                        manifest = Manifest.objects.get(uuid=uuid)
                        ok_manifests.append(manifest)
                        # indexed_save() is deferred until after a
                        # successful Solr commit (see below)
                    except Manifest.DoesNotExist:
                        print('Error: Missing manifest record -----> ' + uuid)
                        logger.error('[' + datetime.now().strftime('%x %X ') +
                                     settings.TIME_ZONE +
                                     '] Error: Missing manifest record for => ' +
                                     uuid)
                    documents.append(solrdocument)
                    document_count += 1
                    print("(" + str(document_count) + ")\t" + uuid)
                else:
                    print('Error: Skipping document due to a datatype '
                          'mismatch -----> ' + uuid)
            except Exception as error:
                print("Error: {0}".format(error) + " -----> " + uuid)
                logger.error('[' + datetime.now().strftime('%x %X ') +
                             settings.TIME_ZONE + '] Error: ' + str(error) +
                             ' => ' + uuid)
        # Send the documents to Solr, keeping the response so we can read
        # both the status code (e.g., 200, 400) and any error message.
        response = self.solr.update(documents, 'json', commit=False)
        if response.status == 200:
            self.solr.commit()
            print('--------------------------------------------')
            print('Crawl Rate: ' +
                  crawlutil().get_crawl_rate_in_seconds(document_count,
                                                        start_time) +
                  ' documents per second')
            print('Updating indexed time for ' + str(len(ok_manifests)) +
                  ' manifest objects.')
            for manifest in ok_manifests:
                # saves the time this was indexed
                manifest.indexed_save()
            print('--------------------------------------------')
        else:
            print('Error: ' + str(response.raw_content['error']['msg']))
    # Once the crawl has completed...
    self.solr.optimize()
    print('\n--------------------------------------------')
    print('Crawl completed')
    print('--------------------------------------------\n')
def crawl(self, chunksize=100):
    '''
    For efficiency, this method processes documents in "chunks." The
    default chunk size is 100, but one can specify other values. For
    example, to specify a chunksize of 500, use this method as follows:

    crawler = Crawler()
    crawler.crawl(500)
    '''
    # Get a logger
    logger = logging.getLogger(__name__)
    start_time = time.time()
    print('\n\nStarting crawl...\n')
    print("(#)\tUUID")
    print('--------------------------------------------')
    document_count = 0
    while self.uuidlist is not None:
        documents = []
        # Process the UUID list in chunks
        for uuid in islice(self.uuidlist, 0, chunksize):
            try:
                solrdocument = SolrDocument(uuid).fields
                if crawlutil().is_valid_document(solrdocument):
                    try:
                        manifest = Manifest.objects.get(uuid=uuid)
                        # saves the time this was indexed
                        manifest.indexed_save()
                    except Manifest.DoesNotExist:
                        print('Error: Missing manifest record -----> ' + uuid)
                        logger.error('[' + datetime.now().strftime('%x %X ') +
                                     settings.TIME_ZONE +
                                     '] Error: Missing manifest record for => ' +
                                     uuid)
                    documents.append(solrdocument)
                    document_count += 1
                    print("(" + str(document_count) + ")\t" + uuid)
                else:
                    print('Error: Skipping document due to a datatype '
                          'mismatch -----> ' + uuid)
            except Exception as error:
                print("Error: {0}".format(error) + " -----> " + uuid)
                logger.error('[' + datetime.now().strftime('%x %X ') +
                             settings.TIME_ZONE + '] Error: ' + str(error) +
                             ' => ' + uuid)
        # Send the documents to Solr, keeping the response so we can read
        # both the status code (e.g., 200, 400) and any error message.
        response = self.solr.update(documents, 'json', commit=False)
        if response.status == 200:
            self.solr.commit()
            print('--------------------------------------------')
            print('Crawl Rate: ' +
                  crawlutil().get_crawl_rate_in_seconds(document_count,
                                                        start_time) +
                  ' documents per second')
            print('--------------------------------------------')
        else:
            print('Error: ' + str(response.raw_content['error']['msg']))
    # Once the crawl has completed...
    self.solr.optimize()
    print('\n--------------------------------------------')
    print('Crawl completed')
    print('--------------------------------------------\n')
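# commit_documents() is called by index_document_list() above but is defined
# elsewhere in this class. Based only on how it is called here (a chunk of
# documents, the running index, and the total count, returning True on
# success), a hypothetical sketch might look like this; it is not the
# project's actual implementation:
#
#   def commit_documents(self, documents, last_index, total_count):
#       response = self.solr.update(documents, 'json', commit=True)
#       if response.status == 200:
#           print('Indexed ' + str(last_index) + ' of ' +
#                 str(total_count) + ' documents.')
#           return True
#       print('Error: ' + str(response.raw_content['error']['msg']))
#       return False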