Python Crawler Beispiele, opencontext_py.apps.indexer.crawler.Crawler Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: reindex.py Projekt: portableant/open-context-py

 def reindex_uuids(self, uuids):
     """ Reindex the documents identified by a list of uuids.

     A new Crawler is created and handed the whole list in one call
     to index_document_list().

     :param uuids: list of uuids to reindex
     :return: the number of uuids passed in, or False when ``uuids``
         is not a list (nothing is indexed in that case)
     """
     if not isinstance(uuids, list):
         # anything other than a list is rejected without indexing
         return False
     Crawler().index_document_list(uuids)
     return len(uuids)

Beispiel #2

0

Datei anzeigen

Datei: reindex.py Projekt: ekansa/open-context-py

 def reindex_uuids(self, uuids):
     """ Reindex the documents identified by a list of uuids.

     Caches are cleared first (even when ``uuids`` turns out not to
     be a list). A new Crawler is then configured from this object's
     ``max_geo_zoom`` and ``human_remains`` settings and asked to
     index the list, passing ``self.list_size`` along.

     :param uuids: list of uuids to reindex
     :return: the number of uuids passed in, or False when ``uuids``
         is not a list (nothing is indexed in that case)
     """
     self.clear_caches()
     if not isinstance(uuids, list):
         return False
     indexer = Crawler()
     # the crawler's own zoom setting is only overridden by an
     # integer greater than 5
     zoom_is_int = isinstance(self.max_geo_zoom, int)
     if zoom_is_int and self.max_geo_zoom > 5:
         indexer.max_geo_zoom = self.max_geo_zoom
     # a positive integer means we're reindexing sensitive
     # human remains
     remains_is_int = isinstance(self.human_remains, int)
     if remains_is_int and self.human_remains > 0:
         indexer.human_remains = self.human_remains
     indexer.index_document_list(uuids, self.list_size)
     return len(uuids)

Beispiel #3

0

Datei anzeigen

Datei: reindex.py Projekt: rdhyee/open-context-py

 def reindex_uuids(self, uuids):
     """ Reindex a list of uuids with a freshly configured Crawler.

     The caches are always cleared, whatever ``uuids`` is. When
     ``uuids`` is a list, the Crawler receives this object's
     ``max_geo_zoom`` (if an int above 5) and ``human_remains``
     (if an int above 0) before indexing the list together with
     ``self.list_size``.

     :param uuids: list of uuids to reindex
     :return: ``len(uuids)`` after indexing, or False for a
         non-list argument
     """
     self.clear_caches()
     if isinstance(uuids, list) is False:
         # not a list, so there is nothing we can index
         return False
     doc_crawler = Crawler()
     if isinstance(self.max_geo_zoom, int) and self.max_geo_zoom > 5:
         # only integers greater than 5 replace the crawler's value
         doc_crawler.max_geo_zoom = self.max_geo_zoom
     if isinstance(self.human_remains, int) and self.human_remains > 0:
         # we're reindexing sensitive human remains
         doc_crawler.human_remains = self.human_remains
     doc_crawler.index_document_list(uuids, self.list_size)
     indexed_count = len(uuids)
     return indexed_count

Beispiel #4

0

Datei anzeigen

Datei: reindex.py Projekt: rdhyee/open-context-py

 def reindex(self):
     """ Reindexes items in Solr,
         with item UUIDs coming from a given source

     The source of uuids is the first of these that applies:

     1. ``solr_direct_url`` is not False: uuids are fetched with
        get_uuids_solr_direct().
     2. ``oc_url`` is not False: uuids are fetched with
        get_uuids_oc_url(), but only if the url contains both
        'response=uuid' and '.json'; otherwise the list stays empty.
     3. ``project_uuids`` is a list and ``annotated_after`` is False:
        all Manifest uuids of those projects, minus (when
        ``skip_indexed_after`` is not False) the items whose
        ``indexed`` value is at or after ``skip_indexed_after``.
     4. ``annotated_after`` is not False: subjects of LinkAnnotation
        records updated at or after that value, optionally limited
        to ``project_uuids``.

     After indexing, the method calls itself again; the recursion
     ends once ``self.iteration`` exceeds ``self.max_iterations``.
     Nothing is returned.
     """
     self.clear_caches()
     self.iteration += 1
     print('Iteration: ' + str(self.iteration))
     if self.iteration <= self.max_iterations:
         uuids = []
         if self.solr_direct_url is not False:
             print('Get uuids from solr: ' + str(self.solr_direct_url))
             uuids = self.get_uuids_solr_direct(self.solr_direct_url)
         elif self.oc_url is not False:
             # now validate to make sure we're asking for uuids
             if 'response=uuid' in self.oc_url \
                and '.json' in self.oc_url:
                 print('Get uuids from OC-API: ' + str(self.oc_url))
                 uuids = self.get_uuids_oc_url(self.oc_url)
         elif isinstance(self.project_uuids, list) \
             and self.annotated_after is False \
             and self.skip_indexed_after is False:
             # index every manifest item of the requested projects
             print('Getting uuids for: ' + str(len(self.project_uuids)) + ' projects')
             raw_uuids = Manifest.objects\
                                 .filter(project_uuid__in=self.project_uuids)\
                                 .values_list('uuid', flat=True)
             uuids = [str(raw_uuid) for raw_uuid in raw_uuids]
         elif isinstance(self.project_uuids, list)\
              and self.annotated_after is False\
              and self.skip_indexed_after is not False:
             # index items from projects, but not items indexed after a certain
             # datetime
             raw_uuids = Manifest.objects\
                                 .filter(project_uuid__in=self.project_uuids)\
                                 .exclude(indexed__gte=self.skip_indexed_after)\
                                 .values_list('uuid', flat=True)
             uuids = [str(raw_uuid) for raw_uuid in raw_uuids]
         elif self.annotated_after is not False:
             # capping max_iterations at 1 makes the recursive call at
             # the end of this method stop the run
             self.max_iterations = 1
             uuids = []
             if self.project_uuids is not False:
                 if not isinstance(self.project_uuids, list):
                     project_uuids = [self.project_uuids]
                 else:
                     project_uuids = self.project_uuids
                 anno_list = LinkAnnotation.objects\
                                           .filter(project_uuid__in=project_uuids,
                                                   updated__gte=self.annotated_after)
             else:
                 anno_list = LinkAnnotation.objects\
                                           .filter(updated__gte=self.annotated_after)
             # first element of each ITEM_TYPES entry, extracted once
             # instead of once per annotation
             indexable_types = [item[0] for item in settings.ITEM_TYPES]
             for anno in anno_list:
                 print('Index annotation: ' + anno.subject + ' :: ' + anno.predicate_uri + ' :: ' + anno.object_uri)
                 if anno.subject_type in indexable_types:
                     # make sure it's an Open Context item that can get indexed
                     if anno.subject not in uuids:
                         uuids.append(anno.subject)
                 if anno.subject_type == 'types' and self.related_annotations:
                     # get the items that use this type; that needs a
                     # lookup on the assertions table. The original code
                     # read the undefined name `geo_anno` here, which
                     # raised NameError; the annotation in hand is `anno`.
                     assertions = Assertion.objects\
                                           .filter(object_uuid=anno.subject)
                     for assertion in assertions:
                         if assertion.uuid not in uuids:
                             uuids.append(assertion.uuid)
         if isinstance(uuids, list):
             print('Ready to index ' + str(len(uuids)) + ' items')
             crawler = Crawler()
             if isinstance(self.max_geo_zoom, int):
                 if self.max_geo_zoom > 5:
                     # only integers greater than 5 override the crawler
                     crawler.max_geo_zoom = self.max_geo_zoom
             if isinstance(self.human_remains, int):
                 if self.human_remains > 0:
                     # we're reindexing sensitive human remains
                     crawler.human_remains = self.human_remains
             crawler.index_document_list(uuids, self.list_size)
             # next iteration; stops once iteration > max_iterations
             self.reindex()
         else:
             # the uuid fetch helpers returned something other than a list
             print('Problem with: ' + str(uuids))

Beispiel #5

0

Datei anzeigen

Datei: reindex.py Projekt: portableant/open-context-py

 def reindex(self):
     """ Reindexes items in Solr,
         with item UUIDs coming from a given source

     The source of uuids is the first of these that applies:

     1. ``solr_direct_url`` is not False: uuids are fetched with
        get_uuids_solr_direct().
     2. ``oc_url`` is not False: uuids are fetched with
        get_uuids_oc_url(), but only if the url contains both
        'response=uuid' and '.json'; otherwise the list stays empty.
     3. ``project_uuids`` is a list and ``annotated_after`` is False:
        all Manifest uuids of those projects, minus (when
        ``skip_indexed_after`` is not False) the items whose
        ``indexed`` value is at or after ``skip_indexed_after``.
     4. ``annotated_after`` is not False: subjects of LinkAnnotation
        records updated at or after that value, optionally limited
        to ``project_uuids``.

     After indexing, the method calls itself again; the recursion
     ends once ``self.iteration`` exceeds ``self.max_iterations``.
     Nothing is returned.
     """
     self.iteration += 1
     print('Iteration: ' + str(self.iteration))
     if self.iteration <= self.max_iterations:
         uuids = []
         if self.solr_direct_url is not False:
             print('Get uuids from solr: ' + str(self.solr_direct_url))
             uuids = self.get_uuids_solr_direct(self.solr_direct_url)
         elif self.oc_url is not False:
             # now validate to make sure we're asking for uuids
             if 'response=uuid' in self.oc_url \
                and '.json' in self.oc_url:
                 print('Get uuids from OC-API: ' + str(self.oc_url))
                 uuids = self.get_uuids_oc_url(self.oc_url)
         elif isinstance(self.project_uuids, list) \
             and self.annotated_after is False \
             and self.skip_indexed_after is False:
             # index every manifest item of the requested projects
             raw_uuids = Manifest.objects\
                                 .filter(project_uuid__in=self.project_uuids)\
                                 .values_list('uuid', flat=True)
             uuids = [str(raw_uuid) for raw_uuid in raw_uuids]
         elif isinstance(self.project_uuids, list)\
              and self.annotated_after is False\
              and self.skip_indexed_after is not False:
             # index items from projects, but not items indexed after a certain
             # datetime
             raw_uuids = Manifest.objects\
                                 .filter(project_uuid__in=self.project_uuids)\
                                 .exclude(indexed__gte=self.skip_indexed_after)\
                                 .values_list('uuid', flat=True)
             uuids = [str(raw_uuid) for raw_uuid in raw_uuids]
         elif self.annotated_after is not False:
             # capping max_iterations at 1 makes the recursive call at
             # the end of this method stop the run
             self.max_iterations = 1
             uuids = []
             if self.project_uuids is not False:
                 if not isinstance(self.project_uuids, list):
                     project_uuids = [self.project_uuids]
                 else:
                     project_uuids = self.project_uuids
                 anno_list = LinkAnnotation.objects\
                                           .filter(project_uuid__in=project_uuids,
                                                   updated__gte=self.annotated_after)
             else:
                 anno_list = LinkAnnotation.objects\
                                           .filter(updated__gte=self.annotated_after)
             # first element of each ITEM_TYPES entry, extracted once
             # instead of once per annotation
             indexable_types = [item[0] for item in settings.ITEM_TYPES]
             for anno in anno_list:
                 print('Index annotation: ' + anno.subject + ' :: ' + anno.predicate_uri + ' :: ' + anno.object_uri)
                 if anno.subject_type in indexable_types:
                     # make sure it's an Open Context item that can get indexed
                     if anno.subject not in uuids:
                         uuids.append(anno.subject)
                 if anno.subject_type == 'types' and self.related_annotations:
                     # get the items that use this type; that needs a
                     # lookup on the assertions table. The original code
                     # read the undefined name `geo_anno` here, which
                     # raised NameError; the annotation in hand is `anno`.
                     assertions = Assertion.objects\
                                           .filter(object_uuid=anno.subject)
                     for assertion in assertions:
                         if assertion.uuid not in uuids:
                             uuids.append(assertion.uuid)
         if isinstance(uuids, list):
             print('Ready to index ' + str(len(uuids)) + ' items')
             crawler = Crawler()
             crawler.index_document_list(uuids)
             # next iteration; stops once iteration > max_iterations
             self.reindex()
         else:
             # the uuid fetch helpers returned something other than a list
             print('Problem with: ' + str(uuids))