コード例 #1
0
ファイル: tasks.py プロジェクト: surfedushare/search-portal
def sync_sharekit_metadata():
    """Sync Sharekit metadata changes into the current dataset version.

    For every COMPLETE Sharekit harvest of the latest active dataset this
    fetches the metadata that changed since ``latest_update_at``, updates the
    matching collection of the current dataset version and then moves
    ``latest_update_at`` forward.

    The ``is_syncing`` flag on the harvests acts as a lock that outlives the
    transaction which sets it. It is released in a ``finally`` clause, so an
    unexpected error during the sync can not leave the harvests flagged as
    syncing (which would make later runs find nothing to lock).
    """
    # Select which data to sync this run
    latest_active_dataset = Dataset.objects.filter(is_active=True).last()
    if not latest_active_dataset:
        return
    dataset_version = DatasetVersion.objects.get_current_version()
    harvest_queryset = Harvest.objects.filter(
        dataset=latest_active_dataset,
        source__repository=Repositories.SHAREKIT,
        stage=HarvestStages.COMPLETE  # prevents syncing materials half way a full harvest
    )
    # First we acquire a permanent lock on Harvests,
    # because if latest_update_at is a while ago this command will run a long time.
    # We don't want to keep all those syncing changes waiting in that one transaction.
    try:
        with atomic():
            harvest_queryset.filter(is_syncing=False).select_for_update(
                nowait=True).update(is_syncing=True)
    except DatabaseError:
        logger.warning(
            "Did not acquire lock on Harvester when syncing Sharekit metadata")
        return
    # Now that we're the only ones starting the sync we execute it
    try:
        for harvest in harvest_queryset.filter(is_syncing=True):
            # Check that a non-valid harvest source didn't slip through the lock
            if harvest.stage != HarvestStages.COMPLETE:
                logger.warning(
                    "Encountered a non-complete harvest source during sync")
                continue
            # Recording which time will become latest_update_at
            current_time = make_aware(datetime.now())
            # Getting metadata from Sharekit and stop immediately if anything went wrong
            send_config = create_config("http_resource", {
                "resource": harvest.source.repository,
                "continuation_limit": 10000,
            })
            set_specification = harvest.source.spec
            scc, err = send(set_specification,
                            f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}",
                            config=send_config,
                            method="get")
            if len(err) or not len(scc):
                continue
            # Now parse the metadata and update current Collection for this Harvest
            seeds = get_harvest_seeds(Repositories.SHAREKIT,
                                      set_specification,
                                      harvest.latest_update_at,
                                      include_no_url=True)
            collection = dataset_version.collection_set.filter(
                name=set_specification).last()
            if collection is None:
                # Without a target collection there is nothing to update.
                # latest_update_at is left untouched so a later run retries this delta.
                logger.warning(
                    "No collection named '%s' in the current dataset version; skipping sync",
                    set_specification)
                continue
            for seeds_batch in ibatch(seeds, batch_size=32):
                collection.update(seeds_batch, "external_id")
            # Last but not least we update the harvest update time to get a different delta later
            harvest.latest_update_at = current_time
            harvest.save()
    finally:
        # And we release the syncing lock, also when the sync raised an error
        with atomic():
            harvest_queryset.filter(is_syncing=True).select_for_update().update(
                is_syncing=False)
コード例 #2
0
 def test_send(self):
     """Run ``send`` for several queries in order and check the success and error counts."""
     # Each entry is (query, expected number of successes, expected number of errors).
     # The order is significant and equals the order of the separate calls it replaces.
     expectations = (
         # Test makes equivalent call of HttpResourceProcessor.fetch.delay("test")
         ("test", 1, 0),
         # Similar but with a cached result
         ("success", 1, 0),
         # And with an error response
         ("404", 0, 1),
         ("500", 0, 1),
     )
     for query, expected_successes, expected_errors in expectations:
         successes, errors = send(
             query=query,
             method=self.method,
             config=self.config,
             session=self.session,
         )
         self.check_results(successes, expected_successes)
         self.check_results(errors, expected_errors)
コード例 #3
0
    def harvest_seeds(self, harvest, current_time):
        """Fetch the seeds for ``harvest`` and record when that happened.

        Calls ``send`` with the spec of the harvest source and the formatted
        ``latest_update_at`` of the harvest.

        :param harvest: the harvest whose source gets fetched
        :param current_time: value stored in ``harvest.harvested_at`` on success
        :return: tuple with the number of successes and the number of errors
        :raises CommandError: when ``send`` reports any errors; the message
            contains a count per status of the failed resources
        """
        source = harvest.source
        http_config = create_config("http_resource", {
            "resource": source.repository,
            "continuation_limit": 10000,
        })
        since = f"{harvest.latest_update_at:%Y-%m-%dT%H:%M:%SZ}"
        scc, err = send(source.spec, since, config=http_config, method="get")

        # Success path first: store the harvest time and report the counts
        if not len(err):
            harvest.harvested_at = current_time
            harvest.save()
            return len(scc), len(err)

        # Some resources failed, so we summarize their statuses in the error
        Resource = apps.get_model(harvest.source.repository)
        failed_resources = Resource.objects.filter(id__in=err)
        error_counter = Counter(error.status for error in failed_resources)
        raise CommandError(
            f"Failed to harvest seeds from {harvest.source.name}: {error_counter}"
        )
コード例 #4
0
    def handle(self, *args, **options):
        """Fetch Edurep OAI-PMH metadata for all NEW harvests of a freeze.

        Reads the ``freeze`` and ``dummy`` options. Unless ``dummy`` is set the
        harvest gets prepared first and ``harvested_at`` is stored on every
        processed harvest.

        :return: a summary string with successful calls versus total calls
        :raises EdurepHarvest.DoesNotExist: when the freeze has no NEW harvests
        :raises CommandError: as soon as ``send`` reports an error for a harvest
        """
        freeze_name = options["freeze"]
        is_dummy_run = options["dummy"]

        if not is_dummy_run:
            self.prepare_harvest(freeze_name)

        new_harvests = EdurepHarvest.objects.filter(
            freeze__name=freeze_name,
            stage=HarvestStages.NEW,
        )
        if not new_harvests.exists():
            raise EdurepHarvest.DoesNotExist(
                f"There are no NEW EdurepHarvest objects for '{freeze_name}'")

        self.header("EDUREP SEEDS HARVEST", options)

        # Calling the Edurep OAI-PMH interface and get the Edurep meta data about learning materials
        self.info("Fetching metadata for sources ...")
        oai_pmh_config = create_config("http_resource", {
            "resource": "edurep.EdurepOAIPMH",
            "continuation_limit": 1000,
        })
        current_time = now()
        successes = defaultdict(int)
        fails = defaultdict(int)
        harvest_total = new_harvests.count()
        for harvest in self.progress(new_harvests, total=harvest_total):
            set_spec = harvest.source.collection_name
            since = f"{harvest.latest_update_at:%Y-%m-%d}"
            success_ids, error_ids = send(
                set_spec,
                since,
                config=oai_pmh_config,
                method="get",
            )
            # A single failing call aborts the entire command
            if len(error_ids):
                raise CommandError(
                    "Failed to harvest seeds from Edurep OAI-PMH")
            successes[set_spec] += len(success_ids)
            fails[set_spec] += len(error_ids)
            if is_dummy_run:
                continue
            harvest.harvested_at = current_time
            harvest.save()

        self.info('Failed OAI-PMH calls: ', fails)
        self.info('Successful OAI-PMH calls: ', successes)
        success_count = sum(successes.values())
        fail_count = sum(fails.values())
        return f'OAI-PMH: {success_count}/{success_count+fail_count}'
コード例 #5
0
ファイル: build_corpus.py プロジェクト: SURFpol/pol-harvester
    def handle(self, *args, **options):
        """Build a corpus from Wikipedia category members and dump a fitted vectorizer.

        Reads the ``language`` and ``categories`` options. For every category the
        category members are fetched with ``send`` and page data is extracted from
        the resulting resources. Results with wikitext become the articles of a
        Corpus named after the sorted category names. A CountVectorizer is fitted
        on the cleaned article texts and written with joblib to
        ``<DATAGROWTH_DATA_DIR>/custom_vocabulary/<language>/<corpus name>.pkl``.
        """
        language = options["language"]
        category_namespace = self.CATEGORY_NAMESPACES[language]
        categories = options["categories"]

        # The corpus name consists of all category names without their namespace
        corpus_name = "-".join(sorted(
            category.replace(category_namespace, "") for category in categories
        ))

        results = []
        for category in categories:
            category_name = category.replace(category_namespace, "")

            members_config = create_config("http_resource", {
                "resource": "pol_harvester.wikipediacategorymembers",
                "wiki_country": language,
                "continuation_limit": 100
            })
            scc, err = send(category, config=members_config, method="get")
            print(f"Send {category_name}:", scc, err)
            member_resources = WikipediaCategoryMembers.objects.filter(id__in=scc)

            page_extractor = ExtractProcessor(config={
                "objective": {
                    "@": "$.query.pages",
                    "pageid": "$.pageid",
                    "title": "$.title",
                    "categories": "$.categories",
                    "wikidata": "$.pageprops.wikibase_item",
                    "wikitext": "$.revisions.0.slots.main.*"
                }
            })
            for resource in member_resources:
                results.extend(page_extractor.extract_from_resource(resource))

        corpus, created = Corpus.objects.get_or_create(
            name=corpus_name,
            identifier="pageid",
            schema={},
        )

        # Only results that carry wikitext become articles of the corpus
        articles = []
        for result in results:
            wikitext = result["wikitext"]
            if not wikitext:
                continue
            plain_text = mwparserfromhell.parse(wikitext).strip_code()
            result["text"] = self.clean_text(plain_text, category_namespace)
            articles.append(
                Article(properties=result, collection=corpus, schema={}))
        corpus.add(articles, reset=True)

        # Fit the vocabulary on the stored documents and persist the vectorizer
        documents_text = [
            self.clean_text(doc.properties["text"], category_namespace)
            for doc in corpus.documents.all()
        ]
        vectorizer = CountVectorizer()
        vectorizer.fit_transform(documents_text)
        dst = os.path.join(datagrowth_settings.DATAGROWTH_DATA_DIR,
                           "custom_vocabulary", language)
        os.makedirs(dst, exist_ok=True)
        vectorizer_file = os.path.join(dst, corpus_name + ".pkl")
        joblib.dump(vectorizer, vectorizer_file)
コード例 #6
0
 def dispatch_resource(self, config, *args, **kwargs):
     """Forward the arguments to ``send`` together with ``config`` and ``config.method``.

     :param config: passed on as ``config``; its ``method`` attribute is passed as ``method``
     :return: whatever ``send`` returns
     """
     return send(
         *args,
         **kwargs,
         config=config,
         method=config.method,
     )
コード例 #7
0
 def test_send_inserted_session_provider(self, get_resource_link_mock):
     """Pass the session as the string "ProcessorMock" and inspect what the mock received."""
     send(
         "test",
         method=self.method,
         config=self.config,
         session="ProcessorMock",
     )
     # The mock is expected to be called with exactly two positional arguments,
     # of which the second is the session
     positional, _keywords = get_resource_link_mock.call_args
     _config, session = positional
     self.assertTrue(session.from_provider)
コード例 #8
0
 def test_send_inserted_session(self):
     """Send with MockRequestsWithAgent as session and check the head of the stored resource."""
     successes, errors = send(
         query="test",
         method=self.method,
         config=self.config,
         session=MockRequestsWithAgent,
     )
     self.check_results(successes, 1)
     self.check_results(errors, 0)
     # The first success id is used to load the stored resource
     first_success_id = successes[0]
     link = HttpResourceMock.objects.get(id=first_success_id)
     self.assertIn("user-agent", link.head)
コード例 #9
0
 def test_send_continuation(self):
     """Set ``continuation_limit`` to 10 and expect two successes for the "next" query."""
     self.config.continuation_limit = 10
     successes, errors = send(
         query="next",
         method=self.method,
         config=self.config,
         session=self.session,
     )
     self.check_results(successes, 2)
     self.check_results(errors, 0)
コード例 #10
0
 def test_send_continuation_prohibited(self):
     """Send the "next" query with the config as is and expect a single success."""
     successes, errors = send(
         query="next",
         method=self.method,
         config=self.config,
         session=self.session,
     )
     self.check_results(successes, 1)
     self.check_results(errors, 0)