Exemple #1
0
def build_record(counts, fields, recid=None, status_code=0):
    """Assemble a BibRecord holding the given fields plus extraction stats.

       The given fields are stored under tag '999'. One extra field (tag
       CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS) is then added with three
       subfields: the statistics string, the time of the call and
       CFG_REFEXTRACT_VERSION.

       The statistics string is laid out as:
        status-reportnum-title-author-url-doi-misc

       @param counts: (mapping) counters; the keys 'reportnum', 'title',
        'auth_group', 'url', 'doi' and 'misc' are read from it.
       @param fields: value assigned to the record under tag '999'
        (presumably a list of reference fields - confirm with callers).
       @param recid: the record-id passed on to the BibRecord constructor.
       @param status_code: value placed first in the statistics string.
       @return: the populated BibRecord object (nothing is printed here).
    """
    record = BibRecord(recid=recid)
    record['999'] = fields
    stats_field = record.add_field(CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS)

    # Placeholder name in the format string -> key looked up in `counts`.
    # Only 'author' is stored under a different key ('auth_group').
    placeholder_to_key = (
        ('reportnum', 'reportnum'),
        ('title', 'title'),
        ('author', 'auth_group'),
        ('url', 'url'),
        ('doi', 'doi'),
        ('misc', 'misc'),
    )
    stat_values = {'status': status_code}
    for placeholder, key in placeholder_to_key:
        stat_values[placeholder] = counts[key]

    stats_template = "%(status)s-%(reportnum)s-%(title)s-%(author)s-%(url)s-%(doi)s-%(misc)s"
    stats_str = stats_template % stat_values

    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
                             stats_str)
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME,
                             timestamp)
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
                             CFG_REFEXTRACT_VERSION)

    return record
def build_record(counts, fields, recid=None, status_code=0):
    """Create a BibRecord from reference fields and attach extraction stats.

       `fields` is assigned to tag '999' of a new record. A further field,
       tagged CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS, receives three
       subfields: the statistics string, a "%Y-%m-%d %H:%M:%S" timestamp
       taken at call time, and CFG_REFEXTRACT_VERSION.

       Statistics string layout:
        status-reportnum-title-author-url-doi-misc

       @param counts: (mapping) must provide the keys 'reportnum', 'title',
        'auth_group', 'url', 'doi' and 'misc'.
       @param fields: value stored under tag '999' of the record
        (presumably a list of reference fields - confirm with callers).
       @param recid: the record-id given to the BibRecord constructor.
       @param status_code: value that leads the statistics string.
       @return: the BibRecord that was built.
    """
    record = BibRecord(recid=recid)
    record['999'] = fields
    extraction_field = record.add_field(CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS)

    # Collect the values in the same order the format string consumes them.
    values = {}
    values['status'] = status_code
    values['reportnum'] = counts['reportnum']
    values['title'] = counts['title']
    values['author'] = counts['auth_group']
    values['url'] = counts['url']
    values['doi'] = counts['doi']
    values['misc'] = counts['misc']
    stats_str = "%(status)s-%(reportnum)s-%(title)s-%(author)s-%(url)s-%(doi)s-%(misc)s" % values

    extraction_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    for code, content in (
            (CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS, stats_str),
            (CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME, extraction_time),
            (CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
             CFG_REFEXTRACT_VERSION)):
        extraction_field.add_subfield(code, content)

    return record
def create_our_record(recid):
    """Return XML for a record holding the 100/700 fields of `recid`.

    Before copying, every 100__u and 700__u subfield whose value equals
    'lisbon, lifep' (compared case-insensitively) is rewritten to
    'LIP, Lisbon'.

    @param recid: id of the record loaded through get_record().
    @return: whatever to_xml() yields for the newly built BibRecord.
    """
    source = get_record(recid)

    for marc_path in ('100__u', '700__u'):
        for sub in source.find_subfields(marc_path):
            if sub.value.lower() == 'lisbon, lifep':
                sub.value = 'LIP, Lisbon'

    # A tag that is absent from the source record raises KeyError; treat
    # it as "no instances" so the new record always carries both tags.
    copied = {}
    for tag in ('100', '700'):
        try:
            copied[tag] = source[tag]
        except KeyError:
            copied[tag] = []

    result = BibRecord(recid=recid)
    result['100'] = copied['100']
    result['700'] = copied['700']
    return result.to_xml()
Exemple #4
0
 def test_add_subfield(self):
     """A subfield added via the '100__a' path equals the parsed XML record."""
     marcxml = """<record>
         <datafield tag="100" ind1=" " ind2=" ">
             <subfield code="a">our title</subfield>
         </datafield>
     </record>"""
     parsed = create_record(marcxml)
     built = BibRecord()
     built.add_subfield('100__a', 'our title')
     self.assertEqual(built, parsed)
 def test_add_subfield(self):
     """Check add_subfield('100__a', ...) against a record parsed from XML."""
     reference_xml = """<record>
         <datafield tag="100" ind1=" " ind2=" ">
             <subfield code="a">our title</subfield>
         </datafield>
     </record>"""
     from_xml = create_record(reference_xml)
     from_api = BibRecord()
     from_api.add_subfield('100__a', 'our title')
     self.assertEqual(from_api, from_xml)
    def test_del_field(self):
        """After deleting '100__' the record must equal one with only 101__b."""
        actual = create_record(self.xml)
        actual.add_subfield('101__b', 'not title')
        del actual['100__']

        wanted = BibRecord()
        wanted.add_subfield('101__b', 'not title')

        self.assertEqual(actual, wanted)
    def test_del_field(self):
        """Deleting '100__' should leave a record equal to a lone 101__b one."""
        # Record under test: parsed fixture plus one extra subfield.
        parsed = create_record(self.xml)
        parsed.add_subfield('101__b', 'not title')

        del parsed['100__']

        # Reference record built from scratch with just the extra subfield.
        reference = BibRecord()
        reference.add_subfield('101__b', 'not title')

        self.assertEqual(parsed, reference)
Exemple #8
0
def append_to_record(rec_id, doi, published_date):
    """Build the XML of the fields that should be appended to a record.

    Depending on the state of the existing record, the result may contain:
      - a 0247_ field with the DOI, if record_has_doi() reports none;
      - a 980__ field with $a 'Published', if the record is not yet
        marked as published;
      - a 773__ field with the pubnote derived from the DOI, if the record
        has no 773__ field or its 773__ differs from the derived one;
      - a 260__ field with $c set to published_date, if the record has no
        260__c subfield.

    @param rec_id: id of the record to load and extend.
    @param doi: the DOI, as a byte string (it is decoded as UTF-8).
    @param published_date: publication date, used for the pubnote and 260__c.
    @return: XML of the new record, or None if the new record is empty.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)

    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # the 0247 field will be appended to the existing record
        # with the bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))

        _print('DOI to be added: ' + doi + ' to the record ' + str(rec_id), 3)

    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    field_773 = record.find_fields('773__')
    new_field_773 = create_pubnote(doi, published_date)
    if not field_773:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print(
            "Field 773 already exists for record, " +
            "differs from DOI extract", 3)
    else:
        append_773 = False
        _print(
            "Field 773 already exists, does not " +
            "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    if not record.find_subfields("260__c"):
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    if new_record.record:
        return new_record.to_xml()
    return None
 def test_add_subfield2(self):
     """A subfield added to a field already attached to the record counts."""
     parsed = create_record(self.xml)
     built = BibRecord()
     attached_field = BibRecordField()
     built['100'] = [attached_field]
     # The subfield is added only after the field was put into the record.
     attached_field.add_subfield('a', 'our title')
     self.assertEqual(built, parsed)
Exemple #10
0
class APSRecord(object):
    """
    Holder for one record to harvest: ids, dates and its BibRecord.
    """
    def __init__(self, recid, doi=None, date=None, last_modified=None):
        """
        Store the given values; look the DOI up from recid when none
        (or a falsy one) is passed, and start with a fresh BibRecord.
        """
        self.recid = recid
        if not doi:
            doi = get_doi_from_record(self.recid)
        self.doi = doi
        self.date = date
        self.record = BibRecord(recid or None)
        self.last_modified = last_modified

    def add_metadata(self, marcxml_file):
        """
        Replaces the record with the result of create_records_from_file
        and, when a recid is set, overwrites its 001 control field.
        Does nothing for a falsy argument.

        NOTE(review): earlier docs mention DTD removal and a BibConvert
        translation; none of that is visible here - presumably it
        happens inside create_records_from_file. Confirm.
        """
        if not marcxml_file:
            return
        self.record = create_records_from_file(marcxml_file)
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_metadata_by_string(self, marcxml_text):
        """
        Same as add_metadata, but the record is built from the given
        text through create_records_from_string.
        """
        if not marcxml_text:
            return
        self.record = create_records_from_string(marcxml_text)
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_fft(self, fulltext_file, hidden=True):
        """
        Adds an FFT__ field for the given fulltext: $a is the file and $t
        the doctype; hidden files additionally get $o "HIDDEN".
        """
        fft_field = self.record.add_field("FFT__")
        fft_field.add_subfield('a', fulltext_file)

        if not hidden:
            fft_field.add_subfield('t', "INSPIRE-PUBLIC")
            return

        fft_field.add_subfield('t', CFG_APSHARVEST_FFT_DOCTYPE)
        fft_field.add_subfield('o', "HIDDEN")

    def to_xml(self):
        """
        Returns the XML produced by the underlying record.
        """
        return self.record.to_xml()
Exemple #11
0
class APSRecord(object):
    """
    One record to harvest, bundling its ids, dates and BibRecord.
    """
    def __init__(self, recid=None, doi=None, date=None, last_modified=None):
        """
        Keep the given values. A missing (falsy) doi is looked up via
        get_doi_from_record; the record starts out as a new BibRecord.
        """
        self.recid = recid
        if doi:
            self.doi = doi
        else:
            self.doi = get_doi_from_record(self.recid)
        self.date = date
        self.record = BibRecord(recid or None)
        self.last_modified = last_modified

    def add_metadata(self, marcxml_file):
        """
        Loads the record from the given file (create_records_from_file)
        and stamps the recid, if any, into the 001 control field.
        A falsy argument leaves the record untouched.

        NOTE(review): the previous docstring spoke of stripping DTD
        definitions and converting with BibConvert; that is not visible
        in this method - confirm where it happens.
        """
        if not marcxml_file:
            return
        loaded = create_records_from_file(marcxml_file)
        self.record = loaded
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_metadata_by_string(self, marcxml_text):
        """
        Loads the record from the given text (create_records_from_string)
        and stamps the recid, if any, into the 001 control field.
        """
        if not marcxml_text:
            return
        loaded = create_records_from_string(marcxml_text)
        self.record = loaded
        if self.recid:
            self.record['001'] = [BibRecordControlField(str(self.recid))]

    def add_fft(self, fulltext_file, hidden=True):
        """
        Appends an FFT__ field describing the fulltext file. Hidden files
        get the configured doctype plus $o "HIDDEN"; others get the
        "INSPIRE-PUBLIC" doctype.
        """
        fft_field = self.record.add_field("FFT__")
        fft_field.add_subfield('a', fulltext_file)

        if not hidden:
            fft_field.add_subfield('t', "INSPIRE-PUBLIC")
            return

        fft_field.add_subfield('t', CFG_APSHARVEST_FFT_DOCTYPE)
        fft_field.add_subfield('o', "HIDDEN")

    def to_xml(self):
        """
        Serializes the underlying record with its own to_xml().
        """
        return self.record.to_xml()
Exemple #12
0
    def test_hash(self):
        """Fields pushed through set() must still compare equal afterwards."""
        for _unused_key, cached in self.records_cache.iteritems():
            # Rebuild each cached record tag by tag from de-duplicated fields.
            rebuilt = BibRecord()

            for tag, fields in cached.record.iteritems():
                unique_fields = set(fields)
                rebuilt[tag] = list(unique_fields)
                self.assertEqual(set(rebuilt[tag]), set(cached[tag]))

            self.assertEqual(rebuilt, cached)
def create_our_record(recid, bibupload, bibupload2):
    """Move PACS values from the 650 fields of a record into 084 fields.

    Every 650 field whose $2 contains 'PACS' is scheduled for removal, and
    each of its $a values becomes a new 084 field ($2 'PACS', $a value).
    Existing 084 fields carrying 'PACS' with indicators '1'/'7' get blank
    indicators. If nothing was converted, nothing is queued.

    @param recid: id of the record loaded through get_record().
    @param bibupload: object whose add() receives the XML of the record
     with the resulting 084 fields.
    @param bibupload2: object whose add() receives the XML of the record
     with the 650 fields to remove.
    @return: None; results are handed over through the two add() calls.
    """
    old_record = get_record(recid)

    try:
        instances_084 = old_record['084']
    except KeyError:
        instances_084 = []

    # An absent tag raises KeyError (see the 084 lookup above). A record
    # without 650 fields has nothing to convert, so bail out like the
    # "not modified" case instead of crashing.
    try:
        instances_650 = old_record['650']
    except KeyError:
        return None

    to_remove_instances_650 = []
    modified = False
    for field in instances_650:
        if 'PACS' not in field.get_subfield_values('2'):
            continue
        # Expect exactly one $2 plus one or more $a and nothing else.
        assert len(field.subfields) >= 2
        values_a = field.get_subfield_values('a')
        assert len(field.subfields) - 1 == len(values_a)
        to_remove_instances_650.append(field)
        for value in values_a:
            sub_2 = BibRecordSubField(code='2', value='PACS')
            sub_a = BibRecordSubField(code='a', value=value)
            instances_084.append(BibRecordField(subfields=[sub_2, sub_a]))
            modified = True

    if not modified:
        return None

    # Remove wrong indicator
    for field in instances_084:
        if field.ind1 == '1' and field.ind2 == '7' \
                and 'PACS' in field.get_subfield_values('2'):
            field.ind1 = ' '
            field.ind2 = ' '

    record = BibRecord(recid=recid)
    # set() drops 084 fields that compare equal after the conversion.
    record['084'] = set(instances_084)
    bibupload.add(record.to_xml())

    if to_remove_instances_650:
        record = BibRecord(recid=recid)
        record['650'] = to_remove_instances_650
        bibupload2.add(record.to_xml())
    return None
def append_to_record(rec_id, doi, published_date):
    """Build the XML of the fields that should be appended to a record.

    Depending on the state of the existing record, the result may contain:
      - a 0247_ field with the DOI, if record_has_doi() reports none;
      - a 980__ field with $a 'Published', if the record is not yet
        marked as published;
      - a 773__ field with the pubnote derived from the DOI, if the record
        has no 773__ field or its 773__ differs from the derived one;
      - a 260__ field with $c set to published_date, if the record has no
        260__c subfield.

    @param rec_id: id of the record to load and extend.
    @param doi: the DOI, as a byte string (it is decoded as UTF-8).
    @param published_date: publication date, used for the pubnote and 260__c.
    @return: XML of the new record, or None if the new record is empty.
    """
    record = get_record(recid=rec_id)
    new_record = BibRecord(rec_id)

    # make sure that there is no DOI for this record
    if not record_has_doi(record, rec_id, doi):
        # the 0247 field will be appended to the existing record
        # with the bibupload function
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('2', 'DOI')
        new_field.add_subfield('a', doi.decode('utf-8'))

        _print('DOI to be added: ' + doi +
               ' to the record ' + str(rec_id), 3)

    if not is_marked_published(record):
        new_field_980 = new_record.add_field('980__')
        new_field_980.add_subfield('a', 'Published')

    field_773 = record.find_fields('773__')
    new_field_773 = create_pubnote(doi, published_date)
    if not field_773:
        append_773 = True
        _print("No pubnote, adding field 773 to record...", 7)
    elif not is_pubnote_identical(field_773, new_field_773):
        append_773 = True
        _print("Field 773 already exists for record, " +
               "differs from DOI extract", 3)
    else:
        append_773 = False
        _print("Field 773 already exists, does not " +
               "contradict DOI extract.", 6)

    if append_773:
        new_field = new_record.add_field('773__')
        for code, value in new_field_773.iteritems():
            new_field.add_subfield(code, value)

    if not record.find_subfields("260__c"):
        # We add 260__c publication date
        new_field = new_record.add_field('260__')
        new_field.add_subfield("c", published_date)

    if new_record.record:
        return new_record.to_xml()
    return None
Exemple #15
0
def append_doi(recID, doi):
    """Return XML that adds the DOI to a record, unless it already has one.

    When the record already has a 0247_a subfield, a note is appended to
    the module-level `messages` list, and an entry is appended to `errors`
    if the first existing value differs from `doi`. Otherwise a new record
    containing only the 0247_ field ($a doi, $2 'DOI') is built.

    @param recID: id of the record to load through get_record().
    @param doi: the DOI, as a byte string (it is decoded as UTF-8).
    @return: XML of the new record, or None when the record already has a
     DOI or an error occurred.
    """
    record = get_record(recid=recID)
    try:
        # make sure that there is no DOI for this record
        existing_dois = record.find_subfields('0247_a')
        if existing_dois:
            messages.append('Record %s already has a doi' % recID)
            if existing_dois[0].value != doi:
                errors.append('DOI of %s record is different than the new doi (%s)!'
                              % (recID, doi))
            return None

        # create new record with only 0247 field, that we will append
        # to the existing record with bibupload function
        new_record = BibRecord(recID)
        new_field = new_record.add_field('0247_')
        new_field.add_subfield('a', doi.decode('utf-8'))
        new_field.add_subfield('2', 'DOI')

        messages.append('Successfully inserted the doi: ' + doi +
                        ' to the record ' + str(recID))

        return new_record.to_xml()
    except Exception as e:
        # Deliberately broad: any failure is recorded in `errors` (with a
        # traceback printed) rather than propagated to the caller.
        traceback.print_exc()
        errors.append('Unknown error: ' + repr(e))
    return None
 def test_set_record(self):
     """Assigning a one-field list to tag '100' gives a record of length 1."""
     rec = BibRecord()
     rec['100'] = [BibRecordField()]
     self.assertEqual(len(rec), 1)
Exemple #17
0
 def __init__(self, recid=None, doi=None, date=None, last_modified=None):
     """Store the given values and start with a fresh BibRecord.

     A missing (falsy) doi is looked up through get_doi_from_record.
     """
     self.recid = recid
     if not doi:
         doi = get_doi_from_record(self.recid)
     self.doi = doi
     self.date = date
     self.record = BibRecord(recid or None)
     self.last_modified = last_modified
    def test_simple(self):
        """convert_journals should rewrite 773__p and 999C5s, not 100__a."""
        source = BibRecord()
        for path, value in (('100__a', 'Test Journal Name'),
                            ('773__p', 'Test Journal Name'),
                            ('999C5s', 'Test Journal Name,100,10')):
            source.add_subfield(path, value)
        converted = convert_journals(self.kb, source)

        wanted = BibRecord()
        for path, value in (('100__a', 'Test Journal Name'),
                            ('773__p', 'Converted'),
                            ('999C5s', 'Converted,100,10')):
            wanted.add_subfield(path, value)

        self.assertEqual(wanted, converted)
 def test_add_field(self):
     """A field added by tag can be fetched back and filled with a subfield."""
     parsed = create_record(self.xml)
     built = BibRecord()
     built.add_field('100__')
     # Fetch the field through the record rather than using the return value.
     first_field = built['100__'][0]
     first_field.add_subfield('a', 'our title')
     self.assertEqual(built, parsed)
Exemple #20
0
 def __init__(self, recid, doi=None, date=None, last_modified=None):
     """Keep the given values and create a new BibRecord for recid.

     When doi is falsy it is fetched with get_doi_from_record instead.
     """
     self.recid = recid
     if doi:
         self.doi = doi
     else:
         self.doi = get_doi_from_record(self.recid)
     self.date = date
     self.record = BibRecord(recid or None)
     self.last_modified = last_modified
 def test_add_subfield(self):
     """add_subfield('100__a', ...) must reproduce the record in self.xml."""
     parsed = create_record(self.xml)
     built = BibRecord()
     built.add_subfield('100__a', 'our title')
     self.assertEqual(built, parsed)
Exemple #22
0
    def test_simple(self):
        """Journal names in 773__p / 999C5s get converted; 100__a stays."""
        journal = 'Test Journal Name'

        original = BibRecord()
        original.add_subfield('100__a', journal)
        original.add_subfield('773__p', journal)
        original.add_subfield('999C5s', 'Test Journal Name,100,10')
        result = convert_journals(self.kb, original)

        reference = BibRecord()
        reference.add_subfield('100__a', journal)
        reference.add_subfield('773__p', 'Converted')
        reference.add_subfield('999C5s', 'Converted,100,10')

        self.assertEqual(reference, result)