def build_record(counts, fields, recid=None, status_code=0):
    """Assemble the record produced by a reference-extraction job.

    A new BibRecord is created for ``recid``, the extracted reference
    fields are stored under tag '999', and one additional field (tag
    CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS) is appended carrying three
    subfields: the extraction statistics, the extraction time and the
    refextract version.

    The statistics subfield is a dash-separated string laid out as:

        status-reportnum-title-author-url-doi-misc

    NOTE(review): the serialised form is presumably a 001 controlfield
    followed by 999C5 reference datafields and a 999C6 stats datafield;
    confirm against the CFG_REFEXTRACT_* constants and BibRecord.

    @param counts: (dict-like) recognition counters; the keys read here
        are 'reportnum', 'title', 'auth_group', 'url', 'doi' and 'misc'.
    @param fields: the reference fields to store under tag '999' of the
        record (presumably a list of BibRecord fields - verify against
        the caller).
    @param recid: the record-id passed to the BibRecord constructor;
        defaults to None.
    @param status_code: (integer) the status of reference-extraction for
        the given record, written as the first component of the stats
        string. 0 = no error; 1 = error.
    @return: the populated BibRecord.
    """
    record = BibRecord(recid=recid)
    record['999'] = fields
    stats_field = record.add_field(CFG_REFEXTRACT_TAG_ID_EXTRACTION_STATS)

    # (placeholder in the stats template, key looked up in ``counts``);
    # the author placeholder is fed from the 'auth_group' counter.
    stat_sources = (
        ('reportnum', 'reportnum'),
        ('title', 'title'),
        ('author', 'auth_group'),
        ('url', 'url'),
        ('doi', 'doi'),
        ('misc', 'misc'),
    )
    stat_values = {'status': status_code}
    for placeholder, count_key in stat_sources:
        stat_values[placeholder] = counts[count_key]

    stats_template = "%(status)s-%(reportnum)s-%(title)s-%(author)s-%(url)s-%(doi)s-%(misc)s"
    stats_str = stats_template % stat_values
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_STATS,
                             stats_str)

    # Local wall-clock time at which the record was built.
    extraction_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_TIME,
                             extraction_time)

    stats_field.add_subfield(CFG_REFEXTRACT_SUBFIELD_EXTRACTION_VERSION,
                             CFG_REFEXTRACT_VERSION)
    return record
def test_simple(self):
    # Checks that convert_journals() rewrites the journal name held in
    # 773__p and in the 999C5s reference (keeping its ",100,10" suffix),
    # while the identical string stored in 100__a is left untouched.
    # Presumably self.kb maps 'Test Journal Name' to 'Converted' - it is
    # set up outside this method.
    from invenio.legacy.docextract.record import BibRecord
    from invenio.legacy.docextract.convert_journals import convert_journals

    input_subfields = (
        ('100__a', 'Test Journal Name'),
        ('773__p', 'Test Journal Name'),
        ('999C5s', 'Test Journal Name,100,10'),
    )
    expected_subfields = (
        ('100__a', 'Test Journal Name'),
        ('773__p', 'Converted'),
        ('999C5s', 'Converted,100,10'),
    )

    source = BibRecord()
    for tag, value in input_subfields:
        source.add_subfield(tag, value)

    converted = convert_journals(self.kb, source)

    expected = BibRecord()
    for tag, value in expected_subfields:
        expected.add_subfield(tag, value)

    self.assertEqual(expected, converted)