Example #1
 def get_pmh_record(self, record_id):
     # fetch one record from the OAI-PMH endpoint and wrap it in a PmhRecord
     my_sickle = _get_my_sickle(self.pmh_url)
     pmh_input_record = my_sickle.GetRecord(identifier=record_id, metadataPrefix=self.metadata_prefix)
     my_pmh_record = pmh_record.PmhRecord()
     my_pmh_record.populate(self.id, pmh_input_record, metadata_prefix=self.metadata_prefix)
     my_pmh_record.repo_id = self.id_old  # delete once endpoint_id is populated
     return my_pmh_record
Example #2
 def get_pmh_record(self, record_id):
     # same flow as above, but hard-coded to the oai_dc metadata format
     my_sickle = self.get_my_sickle(self.pmh_url)
     pmh_input_record = my_sickle.GetRecord(identifier=record_id,
                                            metadataPrefix="oai_dc")
     my_pmh_record = pmh_record.PmhRecord()
     my_pmh_record.populate(pmh_input_record)
     my_pmh_record.repo_id = self.id
     return my_pmh_record
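For context, a caller might invoke either version roughly like this. This is a hypothetical sketch: endpoint stands in for an instance of the class that defines get_pmh_record, and the OAI identifier is made up.

    # hypothetical usage: returns a populated PmhRecord for one identifier
    record = endpoint.get_pmh_record("oai:example.org:12345")
    print(record.repo_id)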
Example #3
    def call_pmh_endpoint(self,
                          first=None,
                          last=None,
                          chunk_size=10,
                          scrape=False):

        args = {}
        args['metadataPrefix'] = 'oai_dc'

        if "citeseerx" in self.pmh_url:
            proxy_url = os.getenv("STATIC_IP_PROXY")
            proxies = {"https": proxy_url, "http": proxy_url}
        else:
            proxies = {}

        my_sickle = MySickle(self.pmh_url, proxies=proxies, timeout=120)
        logger.info(u"connected to sickle with {} {}".format(
            self.pmh_url, proxies))

        if first:
            args['from'] = first
        if last:
            args['until'] = last

        records_to_save = []

        logger.info(u"calling ListRecords with {} {}".format(
            self.pmh_url, args))
        try:
            pmh_records = my_sickle.ListRecords(ignore_deleted=True, **args)
            logger.info(u"got pmh_records with {} {}".format(
                self.pmh_url, args))
            pmh_input_record = safe_get_next_record(pmh_records)
        except Exception:
            logger.info(u"no records with {} {}".format(self.pmh_url, args))
            pmh_input_record = None

        while pmh_input_record:

            my_pmh_record = pmh_record.PmhRecord()

            my_pmh_record.id = pmh_input_record.header.identifier
            my_pmh_record.api_raw = pmh_input_record.raw
            my_pmh_record.record_timestamp = pmh_input_record.header.datestamp
            my_pmh_record.title = oai_tag_match("title", pmh_input_record)
            my_pmh_record.authors = oai_tag_match("creator",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.oa = oai_tag_match("oa", pmh_input_record)
            my_pmh_record.urls = oai_tag_match("identifier",
                                               pmh_input_record,
                                               return_list=True)
            for fulltext_url in my_pmh_record.urls:
                if fulltext_url and (is_doi_url(fulltext_url)
                                     or fulltext_url.startswith(u"doi:")
                                     or re.search(r"10\.", fulltext_url)):
                    try:
                        my_pmh_record.doi = clean_doi(fulltext_url)
                    except NoDoiException:
                        pass

            my_pmh_record.license = oai_tag_match("rights", pmh_input_record)
            my_pmh_record.relations = oai_tag_match("relation",
                                                    pmh_input_record,
                                                    return_list=True)
            my_pmh_record.sources = oai_tag_match("collname",
                                                  pmh_input_record,
                                                  return_list=True)
            my_pmh_record.source = self.id

            if is_complete(my_pmh_record):
                db.session.merge(my_pmh_record)
                my_pages = my_pmh_record.mint_pages()
                logger.info(u"made {} pages for id {}".format(
                    len(my_pages), my_pmh_record.id))
                for my_page in my_pages:
                    if scrape:
                        logger.info(u"scraping pages")
                        my_page.scrape()
                    db.session.merge(my_page)
                records_to_save.append(my_pmh_record)
                logger.info(u"my_pmh_record {}".format(
                    my_pmh_record.get_good_urls()))
            else:
                logger.info(u"not complete")

            if len(records_to_save) >= chunk_size:
                last_record = records_to_save[-1]
                logger.info(u"last record saved: {} for {}".format(
                    last_record.id, self.id))
                safe_commit(db)
                records_to_save = []

            pmh_input_record = safe_get_next_record(pmh_records)

        # make sure to get the last ones
        if records_to_save:
            last_record = records_to_save[-1]
            logger.info(
                u"saving {} last ones, last record saved: {} for {}".format(
                    len(records_to_save), last_record.id, self.id))
            safe_commit(db)
        logger.info(u"done everything for {}".format(self.id))
Example #4
    def call_pmh_endpoint(self,
                          first=None,
                          last=None,
                          chunk_size=50,
                          scrape=False):

        start_time = time()
        records_to_save = []
        num_records_updated = 0
        loop_counter = 0
        self.error = None

        (pmh_input_record, pmh_records,
         error) = self.get_pmh_input_record(first, last)

        if error:
            self.error = u"error in get_pmh_input_record: {}".format(error)
            return

        while pmh_input_record:
            loop_counter += 1
            # create the record
            my_pmh_record = pmh_record.PmhRecord()

            # set its vars
            my_pmh_record.repo_id = self.id_old  # delete once endpoint_ids are all populated
            my_pmh_record.endpoint_id = self.id
            my_pmh_record.rand = random()
            my_pmh_record.populate(pmh_input_record)

            if is_complete(my_pmh_record):
                my_pages = my_pmh_record.mint_pages()
                my_pmh_record.pages = my_pages
                if scrape:
                    for my_page in my_pages:
                        my_page.scrape_if_matches_pub()
                records_to_save.append(my_pmh_record)
                db.session.merge(my_pmh_record)
            else:
                logger.info(u"pmh record is not complete")

            if len(records_to_save) >= chunk_size:
                num_records_updated += len(records_to_save)
                safe_commit(db)
                records_to_save = []

            if loop_counter % 100 == 0:
                logger.info(
                    u"iterated through 100 more items, loop_counter={} for {}".
                    format(loop_counter, self.id))

            pmh_input_record = self.safe_get_next_record(pmh_records)

        # make sure to get the last ones
        if records_to_save:
            num_records_updated += len(records_to_save)
            last_record = records_to_save[-1]
            logger.info(
                u"saving {} last ones, last record saved: {} for {}, loop_counter={}"
                .format(len(records_to_save), last_record.id, self.id,
                        loop_counter))
            safe_commit(db)
        else:
            logger.info(
                u"finished loop, but no records to save, loop_counter={}".
                format(loop_counter))

        logger.info(
            u"updated {} PMH records for endpoint_id={}, took {} seconds".
            format(num_records_updated, self.id, elapsed(start_time, 2)))
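The chunk_size handling in both call_pmh_endpoint variants is a standard batched-commit pattern: buffer records, flush every chunk_size of them, then flush whatever remains after the loop. Stripped of the project's helpers, the shape is roughly this (save_all and harvested_records are placeholders):

    buffer = []
    for record in harvested_records:     # placeholder iterable
        buffer.append(record)
        if len(buffer) >= chunk_size:
            save_all(buffer)             # e.g. safe_commit(db) above
            buffer = []
    if buffer:                           # don't drop the final partial chunk
        save_all(buffer)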