Example #1
0
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in the record `recid`.

    The identifiers are read from the 037 fields of the record whose
    first $9 subfield is exactly "arXiv", for example:
    037__ $9arXiv $aarXiv:1010.1111

    For each such field the first $a value is used.  When it starts with
    "arXiv", only the part following the first ":" is kept (up to the
    next ":" if there is one).  When ARXIV_VERSION_PATTERN matches the
    result, its last two characters are dropped.

    Fields are skipped when they have no $9 or no $a subfield, when the
    source is not "arXiv", or when the $a value starts with "arXiv" but
    contains no ":" separator.
    """
    record = get_record(recid)
    for report_number_field in record.get("037", []):
        try:
            source = report_number_field.get_subfield_values("9")[0]
        except IndexError:
            # No $9 subfield: the source of this report number is unknown
            continue
        if source != "arXiv":
            continue

        try:
            report_number = report_number_field.get_subfield_values("a")[0]
        except IndexError:
            # No $a subfield: nothing to extract from this field
            continue

        # Extract arxiv id
        if report_number.startswith("arXiv"):
            parts = report_number.split(":")
            if len(parts) < 2:
                # Malformed value such as "arXiv1010.1111": there is no
                # identifier after a ":" to extract, so skip the field
                # instead of failing with an IndexError mid-iteration.
                continue
            report_number = parts[1]
        # NOTE(review): the two-character cut assumes the pattern matches
        # a suffix such as "v1" -- confirm against the definition of
        # ARXIV_VERSION_PATTERN (a suffix like "v10" would need a longer cut).
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]
        yield report_number
Example #2
0
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in the record `recid`.

    The identifiers are read from the 037 fields of the record whose
    first $9 subfield is exactly 'arXiv', for example:
    037__ $9arXiv $aarXiv:1010.1111

    For each such field the first $a value is used.  When it starts with
    'arXiv', only the part following the first ':' is kept (up to the
    next ':' if there is one).  When ARXIV_VERSION_PATTERN matches the
    result, its last two characters are dropped.

    Fields are skipped when they have no $9 or no $a subfield, when the
    source is not 'arXiv', or when the $a value starts with 'arXiv' but
    contains no ':' separator.
    """
    record = get_record(recid)
    for report_number_field in record.get('037', []):
        try:
            source = report_number_field.get_subfield_values('9')[0]
        except IndexError:
            # No $9 subfield: the source of this report number is unknown
            continue
        if source != 'arXiv':
            continue

        try:
            report_number = report_number_field.get_subfield_values('a')[0]
        except IndexError:
            # No $a subfield: nothing to extract from this field
            continue

        # Extract arxiv id
        if report_number.startswith('arXiv'):
            parts = report_number.split(':')
            if len(parts) < 2:
                # Malformed value such as 'arXiv1010.1111': there is no
                # identifier after a ':' to extract, so skip the field
                # instead of failing with an IndexError mid-iteration.
                continue
            report_number = parts[1]
        # NOTE(review): the two-character cut assumes the pattern matches
        # a suffix such as 'v1' -- confirm against the definition of
        # ARXIV_VERSION_PATTERN (a suffix like 'v10' would need a longer cut).
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]
        yield report_number
Example #3
0
def get_citation_informations(recid_list,
                              tags,
                              config,
                              fetch_catchup_info=True):
    """Collect what each record in recid_list cites and how it is identified.

    Two dictionaries are built.  Each maps an identifier kind
    ('report-numbers', 'journals', 'doi', 'hdl', 'isbn', 'record_id') to a
    {recid: [values]} dictionary:

    * references_info holds the values read from the reference subfields
      named by the tags['refs_*'] entries;
    * records_info holds the identifiers of the record itself.  Its
      'record_id' entry is set for every recid; the other kinds are only
      filled in when fetch_catchup_info is true.

    A kind is only collected when its entry in tags is truthy.  When
    collections are configured for the rank method, records outside them
    are skipped; otherwise records listed in deleted_recids_cache() are
    skipped.

    Returns the tuple (records_info, references_info).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    started = os.times()[4]

    records_info = dict(
        (kind, {})
        for kind in ('report-numbers', 'journals', 'doi', 'hdl', 'isbn',
                     'record_id')
    )
    references_info = dict(
        (kind, {})
        for kind in ('report-numbers', 'journals', 'doi', 'record_id',
                     'isbn', 'hdl')
    )

    for position, recid in enumerate(recid_list):
        # Give the task scheduler a chance to pause us every 10 records
        if not position % 10:
            task_sleep_now_if_required()

        # Report progress every 50 records
        if not position % 50:
            progress = "get cit.inf done %s of %s" \
                                        % (position, len(recid_list))
            write_message(progress)
            task_update_progress(progress)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        method = config.get("rank_method", "function")
        if config.get(method, 'collections'):
            wanted_recids = recids_cache(config.get(method, 'collections'))
            if recid not in wanted_recids:
                # outside the collections we want to process
                continue
        elif recid in deleted_recids_cache():
            # The record was deleted.  It may only be soft-deleted, e.g.
            # via bibedit (collection tag 980 is DELETED while report
            # number or journal publication info is still present), so
            # reading its fields below would return stale values.
            continue

        # --- identifiers found in the references of the record ---

        if tags['refs_report_number']:
            cited_reports = [
                sub.value
                for sub in record.find_subfields(tags['refs_report_number'])
            ]
            references_info['report-numbers'][recid] = cited_reports
            write_message("references_info['report-numbers'][%s] = %r"
                          % (recid, cited_reports), verbose=9)

        if tags['refs_journal']:
            cited_journals = []
            references_info['journals'][recid] = cited_journals
            for sub in record.find_subfields(tags['refs_journal']):
                # Inspire specific parsing: "journal,volume,page"
                pieces = sub.value.split(',')
                if len(pieces) == 3:
                    journal, volume, page = pieces
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        # also record the alternative volume spelling
                        cited_journals.append(
                            ','.join([journal, alt_volume, page]))
                cited_journals.append(sub.value)
            write_message("references_info['journals'][%s] = %r"
                          % (recid, cited_journals), verbose=9)

        if tags['refs_doi']:
            raw_values = [
                sub.value for sub in record.find_subfields(tags['refs_doi'])
            ]
            cited_dois = []
            cited_hdls = []
            for raw in raw_values:
                if raw.startswith("hdl:"):
                    cited_hdls.append(raw[4:])
                else:
                    # values without a prefix are treated as DOIs
                    cited_dois.append(
                        raw[4:] if raw.startswith("doi:") else raw)
            references_info['doi'][recid] = cited_dois
            references_info['hdl'][recid] = cited_hdls

            write_message("references_info['doi'][%s] = %r"
                          % (recid, cited_dois), verbose=9)
            write_message("references_info['hdl'][%s] = %r"
                          % (recid, cited_hdls), verbose=9)

        if tags['refs_record_id']:
            cited_recids = [
                sub.value
                for sub in record.find_subfields(tags['refs_record_id'])
            ]
            references_info['record_id'][recid] = cited_recids
            write_message("references_info['record_id'][%s] = %r"
                          % (recid, cited_recids), verbose=9)

        if tags['refs_isbn']:
            cited_isbns = [
                sub.value for sub in record.find_subfields(tags['refs_isbn'])
            ]
            references_info['isbn'][recid] = cited_isbns
            write_message("references_info['isbn'][%s] = %r"
                          % (recid, cited_isbns), verbose=9)

        if not fetch_catchup_info:
            # The identifiers of the record itself are not needed
            continue

        # --- identifiers of the record itself ---

        if tags['record_pri_number'] or tags['record_add_number']:
            own_reports = []
            records_info['report-numbers'][recid] = own_reports
            # primary report numbers first, then the additional ones
            for tag_key in ('record_pri_number', 'record_add_number'):
                if tags[tag_key]:
                    own_reports.extend([
                        sub.value
                        for sub in record.find_subfields(tags[tag_key])
                    ])
            write_message("records_info[%s]['report-numbers'] = %r"
                          % (recid, own_reports), verbose=9)

        if tags['doi']:
            own_dois = []
            own_hdls = []
            records_info['doi'][recid] = own_dois
            records_info['hdl'][recid] = own_hdls
            for doi_tag in tags['doi']:
                for field in record.find_fields(doi_tag[:5]):
                    # subfield $2 tells whether $a holds DOIs or handles
                    if 'DOI' in field.get_subfield_values('2'):
                        bucket = own_dois
                    elif 'HDL' in field.get_subfield_values('2'):
                        bucket = own_hdls
                    else:
                        continue
                    bucket.extend(field.get_subfield_values('a'))

            write_message("records_info[%s]['doi'] = %r"
                          % (recid, own_dois), verbose=9)
            write_message("records_info[%s]['hdl'] = %r"
                          % (recid, own_hdls), verbose=9)

        if tags['isbn']:
            own_isbns = []
            records_info['isbn'][recid] = own_isbns
            for isbn_tag in tags['isbn']:
                own_isbns.extend(
                    [sub.value for sub in record.find_subfields(isbn_tag)])
            write_message("records_info[%s]['isbn'] = %r"
                          % (recid, own_isbns), verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            own_journals = get_journal_info(record, tags)
            records_info['journals'][recid] = own_journals
            write_message("records_info[%s]['journals'] = %r"
                          % (recid, own_journals), verbose=9)

    closing_note = "get cit.inf done fully"
    write_message(closing_note)
    task_update_progress(closing_note)

    elapsed = os.times()[4] - started
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % elapsed)

    return records_info, references_info
Example #4
0
def get_citation_informations(recid_list, tags, config, fetch_catchup_info=True):
    """Collect what each record in recid_list cites and how it is identified.

    Two dictionaries are built.  Each maps an identifier kind
    ("report-numbers", "journals", "doi", "hdl", "isbn", "record_id") to a
    {recid: [values]} dictionary:

    * references_info holds the values read from the reference subfields
      named by the tags["refs_*"] entries;
    * records_info holds the identifiers of the record itself.  Its
      "record_id" entry is set for every recid; the other kinds are only
      filled in when fetch_catchup_info is true.

    A kind is only collected when its entry in tags is truthy.  When
    collections are configured for the rank method, records outside them
    are skipped; otherwise records listed in deleted_recids_cache() are
    skipped.

    Returns the tuple (records_info, references_info).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    started = os.times()[4]

    records_info = dict(
        (kind, {}) for kind in ("report-numbers", "journals", "doi", "hdl", "isbn", "record_id")
    )
    references_info = dict(
        (kind, {}) for kind in ("report-numbers", "journals", "doi", "record_id", "isbn", "hdl")
    )

    for position, recid in enumerate(recid_list):
        # Give the task scheduler a chance to pause us every 10 records
        if not position % 10:
            task_sleep_now_if_required()

        # Report progress every 50 records
        if not position % 50:
            progress = "get cit.inf done %s of %s" % (position, len(recid_list))
            write_message(progress)
            task_update_progress(progress)

        record = get_record(recid)
        records_info["record_id"][recid] = [unicode(recid)]

        method = config.get("rank_method", "function")
        if config.get(method, "collections"):
            wanted_recids = recids_cache(config.get(method, "collections"))
            if recid not in wanted_recids:
                # outside the collections we want to process
                continue
        elif recid in deleted_recids_cache():
            # The record was deleted.  It may only be soft-deleted, e.g.
            # via bibedit (collection tag 980 is DELETED while report
            # number or journal publication info is still present), so
            # reading its fields below would return stale values.
            continue

        # --- identifiers found in the references of the record ---

        if tags["refs_report_number"]:
            cited_reports = [sub.value for sub in record.find_subfields(tags["refs_report_number"])]
            references_info["report-numbers"][recid] = cited_reports
            write_message("references_info['report-numbers'][%s] = %r" % (recid, cited_reports), verbose=9)

        if tags["refs_journal"]:
            cited_journals = []
            references_info["journals"][recid] = cited_journals
            for sub in record.find_subfields(tags["refs_journal"]):
                # Inspire specific parsing: "journal,volume,page"
                pieces = sub.value.split(",")
                if len(pieces) == 3:
                    journal, volume, page = pieces
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        # also record the alternative volume spelling
                        cited_journals.append(",".join([journal, alt_volume, page]))
                cited_journals.append(sub.value)
            write_message("references_info['journals'][%s] = %r" % (recid, cited_journals), verbose=9)

        if tags["refs_doi"]:
            raw_values = [sub.value for sub in record.find_subfields(tags["refs_doi"])]
            cited_dois = []
            cited_hdls = []
            for raw in raw_values:
                if raw.startswith("hdl:"):
                    cited_hdls.append(raw[4:])
                else:
                    # values without a prefix are treated as DOIs
                    cited_dois.append(raw[4:] if raw.startswith("doi:") else raw)
            references_info["doi"][recid] = cited_dois
            references_info["hdl"][recid] = cited_hdls

            write_message("references_info['doi'][%s] = %r" % (recid, cited_dois), verbose=9)
            write_message("references_info['hdl'][%s] = %r" % (recid, cited_hdls), verbose=9)

        if tags["refs_record_id"]:
            cited_recids = [sub.value for sub in record.find_subfields(tags["refs_record_id"])]
            references_info["record_id"][recid] = cited_recids
            write_message("references_info['record_id'][%s] = %r" % (recid, cited_recids), verbose=9)

        if tags["refs_isbn"]:
            cited_isbns = [sub.value for sub in record.find_subfields(tags["refs_isbn"])]
            references_info["isbn"][recid] = cited_isbns
            write_message("references_info['isbn'][%s] = %r" % (recid, cited_isbns), verbose=9)

        if not fetch_catchup_info:
            # The identifiers of the record itself are not needed
            continue

        # --- identifiers of the record itself ---

        if tags["record_pri_number"] or tags["record_add_number"]:
            own_reports = []
            records_info["report-numbers"][recid] = own_reports
            # primary report numbers first, then the additional ones
            for tag_key in ("record_pri_number", "record_add_number"):
                if tags[tag_key]:
                    own_reports.extend([sub.value for sub in record.find_subfields(tags[tag_key])])
            write_message("records_info[%s]['report-numbers'] = %r" % (recid, own_reports), verbose=9)

        if tags["doi"]:
            own_dois = []
            own_hdls = []
            records_info["doi"][recid] = own_dois
            records_info["hdl"][recid] = own_hdls
            for doi_tag in tags["doi"]:
                for field in record.find_fields(doi_tag[:5]):
                    # subfield $2 tells whether $a holds DOIs or handles
                    if "DOI" in field.get_subfield_values("2"):
                        bucket = own_dois
                    elif "HDL" in field.get_subfield_values("2"):
                        bucket = own_hdls
                    else:
                        continue
                    bucket.extend(field.get_subfield_values("a"))

            write_message("records_info[%s]['doi'] = %r" % (recid, own_dois), verbose=9)
            write_message("records_info[%s]['hdl'] = %r" % (recid, own_hdls), verbose=9)

        if tags["isbn"]:
            own_isbns = []
            records_info["isbn"][recid] = own_isbns
            for isbn_tag in tags["isbn"]:
                own_isbns.extend([sub.value for sub in record.find_subfields(isbn_tag)])
            write_message("records_info[%s]['isbn'] = %r" % (recid, own_isbns), verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags["publication"]:
            own_journals = get_journal_info(record, tags)
            records_info["journals"][recid] = own_journals
            write_message("records_info[%s]['journals'] = %r" % (recid, own_journals), verbose=9)

    closing_note = "get cit.inf done fully"
    write_message(closing_note)
    task_update_progress(closing_note)

    elapsed = os.times()[4] - started
    write_message("Execution time for generating citation info from record: %.2f sec" % elapsed)

    return records_info, references_info