def _tokenize_from_country_name_tag(self, instID):
    """Get country name and country code tokens for record instID.

    Reads the country_name_tag fields of the record, skipping those
    marked as secondary, and maps each country name to its country
    code via the country-codes knowledge base.

    :param instID: id of the record whose address fields are read
    :return: flat list [name, code, name, code, ...]
             (empty if nothing could be resolved)
    """
    tokens = []
    record = get_record(instID)

    # Read the country name tags that are not marked as secondary
    country_name_list = []
    for field in record[self.address_field]:
        if "secondary" not in field.get_subfield_values(
                self.secondary_address_subfield):
            country_name_list += field.get_subfield_values(
                self.country_name_subfield)
    country_name_list = [s.encode('utf-8') for s in country_name_list]

    for country_name in country_name_list:
        # Reset for every name: previously a failed KB lookup raised
        # NameError on the first iteration and silently reused the
        # previous country's code on later iterations.
        country_code = None
        # Find the country code using KB
        kb_country_code = get_kb_mapping(kb_name=self.kb_country_codes,
                                         value=country_name)
        if kb_country_code:
            country_code = kb_country_code["key"]
        if country_name and country_code:
            tokens += [country_name, country_code]

    return tokens
def extract_arxiv_ids_from_recid(recid):
    """Yield the arXiv identifiers stored in record ``recid``.

    They are read from 037 fields shaped like:
        037__ $9arXiv$arXiv:1010.1111
    """
    for field in get_record(recid).get('037', []):
        sources = field.get_subfield_values('9')
        if not sources or sources[0] != 'arXiv':
            continue
        numbers = field.get_subfield_values('a')
        if not numbers:
            continue
        report_number = numbers[0]
        # Strip the "arXiv:" prefix when present
        if report_number.startswith('arXiv'):
            report_number = report_number.split(':')[1]
        # Drop a trailing version suffix such as "v1"
        if ARXIV_VERSION_PATTERN.search(report_number):
            report_number = report_number[:-2]
        yield report_number
def create_our_record(recid):
    """Build the correction XML that renames 'Lisbon, LIFEP'
    affiliations to 'LIP, Lisbon' in the 100/700 fields of ``recid``.

    :return: MARCXML string containing the (possibly updated) 100 and
        700 instances of the record.
    """
    old_record = get_record(recid)

    # The same substitution applies to first-author (100) and
    # additional-author (700) fields; one loop replaces the two
    # copy-pasted blocks of the original.
    for tag in ('100', '700'):
        for subfield in old_record.find_subfields(tag + '__u'):
            if subfield.value.lower() == 'lisbon, lifep':
                subfield.value = 'LIP, Lisbon'

    record = BibRecord(recid=recid)
    for tag in ('100', '700'):
        try:
            record[tag] = old_record[tag]
        except KeyError:
            record[tag] = []
    return record.to_xml()
def cb_process_one(recid): print 'processing', recid try: rec = get_record(recid) except UnicodeDecodeError: pass else: if len(rec.find_fields('773__')) > 1: bibrank.add(recid)
def append_to_record(rec_id, doi, published_date):
    """Attempt to add a DOI to a record.

    Also adds a 980 'Published' tag if not already there and adds the
    extrapolated PubNote data to 773.  (The code adds 980, not 930 as
    an earlier version of this docstring claimed.)

    :return: MARCXML of the correction record, or None when there is
        nothing to upload.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)
    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # create new record with only 0247 field, that we will append
        # to the existing record with bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)
    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    append_773 = False
    field_773 = record.find_fields('773__')
    # Pubnote extrapolated from the DOI and publication date
    new_field_773 = create_pubnote(doi, published_date)
    if len(field_773) == 0:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, " +
               "differs from DOI extract", 3)
    else:
        _print("Field 773 already exists, does not " +
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    field_260 = record.find_subfields("260__c")
    if len(field_260) == 0:
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    # More than just the recid present -> something to upload
    if len(new_record.record) > 1:
        return new_record.to_xml()
    else:
        return None
def append_to_record(rec_id, doi, published_date):
    """Attempt to add a DOI to a record.

    Also adds a 980 'Published' tag if not already there and adds the
    extrapolated PubNote data to 773.  (The code adds 980, not 930 as
    an earlier version of this docstring claimed.)

    :return: MARCXML of the correction record, or None when there is
        nothing to upload.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)
    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # create new record with only 0247 field, that we will append
        # to the existing record with bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))
        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)
    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    append_773 = False
    field_773 = record.find_fields('773__')
    # Pubnote extrapolated from the DOI and publication date
    new_field_773 = create_pubnote(doi, published_date)
    if len(field_773) == 0:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, " +
               "differs from DOI extract", 3)
    else:
        _print("Field 773 already exists, does not " +
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    field_260 = record.find_subfields("260__c")
    if len(field_260) == 0:
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    # More than just the recid present -> something to upload
    if len(new_record.record) > 1:
        return new_record.to_xml()
    else:
        return None
def test_equality(self):
    """Each cached record equals a freshly fetched copy of itself
    (both as objects and as XML) and differs from every other cached
    record.

    The original fetched ``self.xml_cache[recid2]`` into an unused
    local in the inequality branch; that dead lookup is removed.
    """
    for recid, record in self.records_cache.iteritems():
        xml = self.xml_cache[recid]
        for recid2 in self.records_cache.iterkeys():
            if recid == recid2:
                # Re-fetch and compare object and serialized forms
                record2 = get_record(recid)
                self.assertEqual(record, record2)
                self.assertXmlEqual(xml, record2.to_xml())
            else:
                self.assertNotEqual(record, self.records_cache[recid2])
def process_chunk(recids):
    """Compute updated affiliation info for the authors of ``recids``.

    :param recids: iterable of record ids to scan
    :return: dict personid -> {'aff': ..., 'last_recid': ...,
        'last_occurence': ...}; newer occurrences win.

    Side effect: aidAFFILIATIONS rows of authors that disappeared from
    a paper are recomputed from their remaining papers or deleted.
    """
    # Map for PID -> Affiliation info
    aff = {}
    for recid in recids:
        pids = get_personids(recid)
        record = get_record(recid)
        # Check for authors removed from papers
        missing_pids = intbitset(run_sql("SELECT personid FROM aidAFFILIATIONS WHERE last_recid = %s", [recid]))
        if pids:
            # Anyone still attached to the paper is not missing
            missing_pids -= intbitset(pids.values())
        for pid in missing_pids:
            try:
                recomputed_aff_info = recompute_affiliation(pid)
            except RecomputeException:
                continue
            if recomputed_aff_info:
                # Keep only the most recent affiliation per person
                if pid not in aff or aff[pid]['last_occurence'] <= recomputed_aff_info['last_occurence']:
                    aff[pid] = recomputed_aff_info
            else:
                # No remaining paper carries an affiliation: drop the row
                run_sql("DELETE FROM aidAFFILIATIONS WHERE personid = %s", [pid])
        if not pids:
            continue
        # Check for new affiliations
        for field in chain(record['100'], record['700']):
            if not field['a']:
                continue
            field_author = field['a'][0]
            field_aff = field['u']
            if field_aff:
                try:
                    pid = pids[field_author]
                except KeyError:
                    # This happens when bibupload runs while
                    # bibauthorid (rabbit) is sleeping
                    continue
                record_date = get_creation_date(recid)
                if pid not in aff or aff[pid]['last_occurence'] <= record_date:
                    aff[pid] = {'aff': field_aff,
                                'last_recid': recid,
                                'last_occurence': record_date}
    return aff
def setUp(self):
    """Cache every MARC record and its XML for the equality tests."""
    self.maxDiff = None

    def order_by_tag(field1, field2):
        """Function used to order the fields according to their tag"""
        return cmp(field1[0], field2[0])
    bibrecord._order_by_ord = order_by_tag  # pylint: disable-msg=W0212

    self.records_cache = {}
    self.xml_cache = {}
    for recid in perform_request_search(p=""):
        rows = run_sql("SELECT master_format FROM bibrec WHERE id=%s",
                       [recid])
        self.assertTrue(rows, msg="bibrec row for %s missing" % recid)
        # Only MARC master-format records are cached
        if rows[0][0] != 'marc':
            continue
        rec = get_record(recid)
        self.records_cache[recid] = rec
        self.xml_cache[recid] = rec.to_xml()
def get_all_authority_record_for_field(field, value):
    """Return the authority records matching ``value`` for ``field``.

    Looks in the configuration for a link between the field and an
    authority category; when one exists, queries the database for (at
    most 20) AUTHORITY records whose indexed subfields contain
    ``value``.

    :param field: field tag used for searching
    :param value: substring the authority subfield must contain
    :return: list of records (empty when the field is not linked)

    Security: the original interpolated ``value`` and the tag directly
    into the SQL string (injection risk); they are now bound as query
    parameters.  Table names cannot be bound, but they derive from the
    configuration, not from user input.
    """
    if field not in CFG_BIBAUTHORITY_RECORD_AUTHOR_CONTROL_NUMBER_FIELDS_REVERSED:
        return []

    authority_type = CFG_BIBAUTHORITY_RECORD_AUTHOR_CONTROL_NUMBER_FIELDS_REVERSED[
        field]
    list_index_fields = CFG_BIBAUTHORITY_AUTHORITY_SUBFIELDS_TO_INDEX[
        authority_type]

    list_request = []
    params = []
    for index_field in list_index_fields:
        first_two_char = index_field[:2]
        list_request.append(
            'SELECT DISTINCT bibrec_bib{0}x.id_bibrec '
            'FROM bibrec_bib{0}x, bib{0}x, bibrec_bib98x, bib98x '
            'WHERE bib{0}x.tag like %s '
            'AND bib{0}x.value like %s '
            'AND bibrec_bib{0}x.id_bibxxx=bib{0}x.id '
            'AND bibrec_bib{0}x.id_bibrec=bibrec_bib98x.id_bibrec '
            'AND bibrec_bib98x.id_bibxxx=bib98x.id '
            'AND bib98x.value<>"DELETED" '
            'AND bib98x.value="AUTHORITY"'.format(first_two_char))
        params += [index_field + '%', '%' + value + '%']

    sql_request = " UNION ".join(list_request)
    authority_records_matching = run_sql(sql_request, tuple(params))
    # Deduplicate and cap the result size
    authority_records_matching = list(set(authority_records_matching))[:20]

    authority_records = []
    for authority_record in authority_records_matching:
        authority_records.append(get_record(authority_record[0]).record)
    return authority_records
def create_our_record(recid, bibupload, bibupload2):
    """Move PACS classification from 650 fields ($2 PACS) into 084.

    Queues one correction record (the merged 084 instances) on
    ``bibupload`` and, when PACS 650 instances were found, a second
    record (the 650 fields to remove) on ``bibupload2``.

    :return: None when nothing changed; otherwise falls off the end
        (also None) after queueing the uploads.
    """
    old_record = get_record(recid)
    try:
        instances_084 = old_record['084']
    except KeyError:
        instances_084 = []

    to_remove_instances_650 = []
    modified = False
    for field in old_record['650']:
        if 'PACS' in field.get_subfield_values('2'):
            # NOTE(review): asserts are stripped under ``python -O``;
            # they only sanity-check the expected field shape here.
            assert len(field.subfields) >= 2
            assert len(field.subfields) - 1 == len(field.get_subfield_values('a'))
            to_remove_instances_650.append(field)
            # One new 084 instance per $a value, tagged as PACS
            for value in field.get_subfield_values('a'):
                sub_2 = BibRecordSubField(code='2', value='PACS')
                sub_a = BibRecordSubField(code='a', value=value)
                f = BibRecordField(subfields=[sub_2, sub_a])
                instances_084.append(f)
            modified = True

    if not modified:
        return None

    # Remove wrong indicator
    for field in instances_084[:]:
        if field.ind1 == '1' and field.ind2 == '7' \
                and 'PACS' in field.get_subfield_values('2'):
            field.ind1 = ' '
            field.ind2 = ' '

    record = BibRecord(recid=recid)
    # set() deduplicates identical 084 instances before upload
    record['084'] = set(instances_084)
    bibupload.add(record.to_xml())

    if to_remove_instances_650:
        record = BibRecord(recid=recid)
        record['650'] = to_remove_instances_650
        bibupload2.add(record.to_xml())
def recompute_affiliation(pid):
    """Return the affiliation info for person ``pid`` from their papers.

    Scans the papers attached to the person through 100 and 700
    bibrefs; returns {'aff': ..., 'last_recid': ...,
    'last_occurence': ...} for the first author field whose $a matches
    the stored name and which carries a $u affiliation.  Returns None
    implicitly when no such field exists.

    :raises RecomputeException: wraps the IndexError raised by a
        malformed field (e.g. empty $a subfield list).
    """
    pid_100_rows = run_sql("""SELECT bibrec, bib10x.value as name FROM aidPERSONIDPAPERS INNER JOIN bib10x ON aidPERSONIDPAPERS.bibref_value = bib10x.id WHERE personid = %s and bibref_table = '100'""", [pid])
    pid_700_rows = run_sql("""SELECT bibrec, bib70x.value as name FROM aidPERSONIDPAPERS INNER JOIN bib70x ON aidPERSONIDPAPERS.bibref_value = bib70x.id WHERE personid = %s and bibref_table = '700'""", [pid])

    for recid, name in chain(pid_100_rows, pid_700_rows):
        record = get_record(recid)
        for field in chain(record['100'], record['700']):
            try:
                # Exact match on the stored name, and $u must be set
                if field['a'][0] == name and field['u']:
                    return {'aff': field['u'],
                            'last_recid': recid,
                            'last_occurence': get_creation_date(recid)}
            except IndexError, e:
                print 'WARNING: problem in recomputing affiliations for pid ', pid
                raise RecomputeException(str(e))
def get_all_record_for_field(field, value):
    """Return (at most 20) records whose ``field`` subfields contain
    ``value``.

    :param field: field tag used for searching (first two characters
        select the bibXXx table)
    :param value: substring the subfield value must contain
    :return: list of records

    Security: the original interpolated ``value`` and the tag directly
    into the SQL string (injection risk); they are now bound as query
    parameters.  The table suffix comes from the field tag itself and
    cannot be bound.
    """
    sql_request = (
        'SELECT DISTINCT bibrec_bib{0}x.id_bibrec '
        'from bibrec_bib{0}x, bib{0}x '
        'where bib{0}x.tag like %s '
        'and bib{0}x.value like %s '
        'and bibrec_bib{0}x.id_bibxxx = bib{0}x.id'.format(field[:2]))
    authority_records_matching = run_sql(sql_request,
                                         (field + '%', '%' + value + '%'))
    # Cap the result size
    authority_records_matching = authority_records_matching[:20]
    authority_records = []
    for authority_record in authority_records_matching:
        authority_records.append(get_record(authority_record[0]).record)
    return authority_records
def append_doi(recID, doi):
    """Build MARCXML appending ``doi`` (0247_ $2 DOI $a) to record recID.

    :return: XML string of the correction record, or None (implicitly)
        when the record already has a DOI or an error occurred.

    Side effect: status text is appended to the module-level
    ``messages`` and ``errors`` lists.
    """
    record = get_record(recid=recID)
    try:
        # make sure that there is no DOI for this record
        if record.find_subfields('0247_a'):
            messages.append('Record %s already has a doi' % recID)
            if record.find_subfields('0247_a')[0].value != doi:
                errors.append('DOI of %s record is different than the new doi (%s)!'
                              % (recID, doi))
        else:
            # create new record with only 0247 field, that we will append
            # to the existing record with bibupload function
            new_record = BibRecord(recID)
            new_field = new_record.add_field('0247_')
            new_field.add_subfield('a', doi.decode('utf-8'))
            new_field.add_subfield('2', 'DOI')
            messages.append('Successfully inserted the doi: ' + doi +
                            ' to the record ' + str(recID))
            return new_record.to_xml()
    except Exception, e:
        # Broad catch is deliberate: any failure is reported in
        # ``errors`` instead of aborting the batch.
        traceback.print_exc()
        errors.append('Unknown error: ' + repr(e))
def get_citation_informations(recid_list, tags, config,
                              fetch_catchup_info=True):
    """Scan records for citation-related metadata.

    For every recid in ``recid_list`` collects the identifiers found in
    its reference fields (999C5x) and, unless ``fetch_catchup_info`` is
    False, the record's own identifiers.

    :param recid_list: iterable of record ids to process
    :param tags: dict mapping logical names ('refs_doi', 'isbn', ...)
        to MARC tags
    :param config: task configuration; may restrict processing to the
        configured collections
    :param fetch_catchup_info: when False, skip the record's own
        metadata (only the references are gathered)
    :return: tuple ``(records_info, references_info)``; each is a dict
        keyed by 'report-numbers', 'journals', 'doi', 'hdl', 'isbn'
        and 'record_id', mapping recid -> list of extracted values,
        e.g. records_info['report-numbers'][5] == ['SUT-DP-92-70-5']
        (an earlier docstring claimed "a 4 list of dictionaries",
        which does not match the return statement).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function,
                                                    'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_report_number'])
            ]
            msg = "references_info['report-numbers'][%s] = %r" \
                % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    # Also record the alternative volume form, if any
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)

        if tags['refs_doi']:
            references = [
                t.value for t in
                record.find_subfields(tags['refs_doi'])
            ]
            dois = []
            hdls = []
            # A bare value with no prefix is treated as a DOI
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_record_id'])
            ]
            msg = "references_info['record_id'][%s] = %r" \
                % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)

        if tags['refs_isbn']:
            references_info['isbn'][recid] = [
                t.value for t in
                record.find_subfields(tags['refs_isbn'])
            ]
            msg = "references_info['isbn'][%s] = %r" \
                % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += [
                    t.value for t in
                    record.find_subfields(tags['record_pri_number'])
                ]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += [
                    t.value for t in
                    record.find_subfields(tags['record_add_number'])
                ]

            msg = "records_info[%s]['report-numbers'] = %r" \
                % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                # $2 distinguishes DOI from HDL identifiers
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def get_citation_informations(recid_list, tags, config,
                              fetch_catchup_info=True):
    """Scan records for citation-related metadata.

    For every recid in ``recid_list`` collects the identifiers found in
    its reference fields (999C5x) and, unless ``fetch_catchup_info`` is
    False, the record's own identifiers.

    :param recid_list: iterable of record ids to process
    :param tags: dict mapping logical names ('refs_doi', 'isbn', ...)
        to MARC tags
    :param config: task configuration; may restrict processing to the
        configured collections
    :param fetch_catchup_info: when False, skip the record's own
        metadata (only the references are gathered)
    :return: tuple ``(records_info, references_info)``; each is a dict
        keyed by 'report-numbers', 'journals', 'doi', 'hdl', 'isbn'
        and 'record_id', mapping recid -> list of extracted values,
        e.g. records_info['report-numbers'][5] == ['SUT-DP-92-70-5']
        (an earlier docstring claimed "a 4 list of dictionaries",
        which does not match the return statement).

    NB: stuff here is for analysing new or changed records.
    see "ref_analyzer" for more.
    """
    begin_time = os.times()[4]

    records_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'hdl': {},
        'isbn': {},
        'record_id': {},
    }

    references_info = {
        'report-numbers': {},
        'journals': {},
        'doi': {},
        'record_id': {},
        'isbn': {},
        'hdl': {},
    }

    # perform quick check to see if there are some records with
    # reference tags, because otherwise get.cit.inf would be slow even
    # if there is nothing to index:
    for done, recid in enumerate(recid_list):
        if done % 10 == 0:
            task_sleep_now_if_required()

        if done % 50 == 0:
            mesg = "get cit.inf done %s of %s" % (done, len(recid_list))
            write_message(mesg)
            task_update_progress(mesg)

        record = get_record(recid)
        records_info['record_id'][recid] = [unicode(recid)]

        function = config.get("rank_method", "function")
        if config.get(function, 'collections'):
            if recid not in recids_cache(config.get(function,
                                                    'collections')):
                # do not treat this record since it is not in the collections
                # we want to process
                continue
        elif recid in deleted_recids_cache():
            # do not treat this record since it was deleted; we
            # skip it like this in case it was only soft-deleted
            # e.g. via bibedit (i.e. when collection tag 980 is
            # DELETED but other tags like report number or journal
            # publication info remained the same, so the calls to
            # get_fieldvalues() below would return old values)
            continue

        if tags['refs_report_number']:
            references_info['report-numbers'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_report_number'])]
            msg = "references_info['report-numbers'][%s] = %r" \
                % (recid, references_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['refs_journal']:
            references_info['journals'][recid] = []
            for ref in record.find_subfields(tags['refs_journal']):
                try:
                    # Inspire specific parsing
                    journal, volume, page = ref.value.split(',')
                except ValueError:
                    pass
                else:
                    # Also record the alternative volume form, if any
                    alt_volume = get_alt_volume(volume)
                    if alt_volume:
                        alt_ref = ','.join([journal, alt_volume, page])
                        references_info['journals'][recid] += [alt_ref]
                references_info['journals'][recid] += [ref.value]
            msg = "references_info['journals'][%s] = %r" \
                % (recid, references_info['journals'][recid])
            write_message(msg, verbose=9)

        if tags['refs_doi']:
            references = [t.value for t in
                          record.find_subfields(tags['refs_doi'])]
            dois = []
            hdls = []
            # A bare value with no prefix is treated as a DOI
            for ref in references:
                if ref.startswith("hdl:"):
                    hdls.append(ref[4:])
                elif ref.startswith("doi:"):
                    dois.append(ref[4:])
                else:
                    dois.append(ref)
            references_info['doi'][recid] = dois
            references_info['hdl'][recid] = hdls

            msg = "references_info['doi'][%s] = %r" % (recid, dois)
            write_message(msg, verbose=9)
            msg = "references_info['hdl'][%s] = %r" % (recid, hdls)
            write_message(msg, verbose=9)

        if tags['refs_record_id']:
            references_info['record_id'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_record_id'])]
            msg = "references_info['record_id'][%s] = %r" \
                % (recid, references_info['record_id'][recid])
            write_message(msg, verbose=9)

        if tags['refs_isbn']:
            references_info['isbn'][recid] = \
                [t.value for t in
                 record.find_subfields(tags['refs_isbn'])]
            msg = "references_info['isbn'][%s] = %r" \
                % (recid, references_info['isbn'][recid])
            write_message(msg, verbose=9)

        if not fetch_catchup_info:
            # We do not need the extra info
            continue

        if tags['record_pri_number'] or tags['record_add_number']:
            records_info['report-numbers'][recid] = []

            if tags['record_pri_number']:
                records_info['report-numbers'][recid] += \
                    [t.value for t in
                     record.find_subfields(tags['record_pri_number'])]

            if tags['record_add_number']:
                records_info['report-numbers'][recid] += \
                    [t.value for t in
                     record.find_subfields(tags['record_add_number'])]

            msg = "records_info[%s]['report-numbers'] = %r" \
                % (recid, records_info['report-numbers'][recid])
            write_message(msg, verbose=9)

        if tags['doi']:
            records_info['doi'][recid] = []
            records_info['hdl'][recid] = []
            for tag in tags['doi']:
                # $2 distinguishes DOI from HDL identifiers
                for field in record.find_fields(tag[:5]):
                    if 'DOI' in field.get_subfield_values('2'):
                        dois = field.get_subfield_values('a')
                        records_info['doi'][recid].extend(dois)
                    elif 'HDL' in field.get_subfield_values('2'):
                        hdls = field.get_subfield_values('a')
                        records_info['hdl'][recid].extend(hdls)

            msg = "records_info[%s]['doi'] = %r" \
                % (recid, records_info['doi'][recid])
            write_message(msg, verbose=9)
            msg = "records_info[%s]['hdl'] = %r" \
                % (recid, records_info['hdl'][recid])
            write_message(msg, verbose=9)

        if tags['isbn']:
            records_info['isbn'][recid] = []
            for tag in tags['isbn']:
                values = [t.value for t in record.find_subfields(tag)]
                records_info['isbn'][recid] += values

            msg = "records_info[%s]['isbn'] = %r" \
                % (recid, records_info['isbn'][recid])
            write_message(msg, verbose=9)

        # get a combination of
        # journal vol (year) pages
        if tags['publication']:
            records_info['journals'][recid] = get_journal_info(record, tags)
            msg = "records_info[%s]['journals'] = %r" \
                % (recid, records_info['journals'][recid])
            write_message(msg, verbose=9)

    mesg = "get cit.inf done fully"
    write_message(mesg)
    task_update_progress(mesg)

    end_time = os.times()[4]
    write_message("Execution time for generating citation info "
                  "from record: %.2f sec" % (end_time - begin_time))

    return records_info, references_info
def cb_process_one(recid):
    """Queue ``recid`` for reference extraction when it has fulltext
    but no reference fields (999C5/999C6) yet."""
    record = get_record(recid)
    has_references = (record.find_fields('999C5') or
                      record.find_fields('999C6'))
    if not has_references and record_has_fulltext(recid):
        refextract.add(recid)