def decode_record(self, record):
    r"""Decode a pseudo-MARC text dump (one field per line) into a pymarc Record.

    The input is a screen-scraped textual record: the first line starts with
    ``LEADER``, each following line is ``TTT II data`` (tag, indicators, data)
    and subfield delimiters are pipes (``|``). Returns a ``Record`` or ``None``
    when there is no leader or no 245 (title) field.

    NOTE(review): recovered from a whitespace-mangled source — the exact blank
    runs inside the doctest's raw MARC string could not be reconstructed and
    should be verified against the original file. The ``.decode('latin1')`` /
    ``.encode('utf8')`` round-trip and the ``print record.title`` doctest show
    this is Python 2 code.

    >>> reader = Reader('http://opac.uthsc.edu', 2)
    >>> raw = "\nLEADER 00000cas 2200517 a 4500 \n001 1481253 \n003 OCoLC \n005 19951109120000.0 \n008 750727c19589999fr qrzp b 0 b0fre d \n010 sn 86012727 \n022 0003-3995 \n030 AGTQAH \n035 0062827|bMULS|aPITT NO. 0639600000|asa64872000|bFULS \n040 MUL|cMUL|dFUL|dOCL|dCOO|dNYG|dHUL|dSER|dAIP|dNST|dAGL|dDLC\n |dTUM \n041 0 engfre|bgeritaspa \n042 nsdp \n049 TUMS \n069 1 A32025000 \n210 0 Ann. genet. \n222 0 Annales de genetique \n229 00 Annales de genetique \n229 Ann Genet \n242 00 Annals on genetics \n245 00 Annales de genetique. \n260 Paris :|bExpansion scientifique,|c1958-2004. \n300 v. :|bill. ;|c28 cm. \n310 Quarterly \n321 Two no. a year \n362 0 1,1958-47,2004. \n510 1 Excerpta medica \n510 1 Index medicus|x0019-3879 \n510 2 Biological abstracts|x0006-3169 \n510 2 Chemical abstracts|x0009-2258 \n510 2 Life sciences collection \n510 0 Bulletin signaletique \n510 0 Current contents \n546 French and English, with summaries in German, Italian, and\n Spanish. \n550 Journal of the Societe francaise de genetique. \n650 2 Genetics|vPeriodicals. \n710 2 Societ\xe9 fran\xe7aise de genetique. \n785 00 |tEuropean journal of medical genetics. \n856 41 |uhttp://library.uthsc.edu/ems/eresource/3581|zFull text \n at ScienceDirect: 43(1) Jan 2000 - 47(4) Dec 2004 \n936 Unknown|ajuin 1977 \n"
    >>> record = reader.decode_record(raw)
    >>> print record.title
    Annales de genetique
    """
    pseudo_marc = record.strip().split('\n')
    raw_fields = []
    # The first line must be the leader; otherwise this is not a record.
    if pseudo_marc[0][0:6] == 'LEADER':
        record = Record()
        record.leader = pseudo_marc[0][7:].strip()
    else:
        return None
    for field in pseudo_marc[1:]:
        tag = field[:3]
        # Collapse HTML entities; latin1 -> utf8 transcode (Python 2 str).
        data = unescape_entities(field[6:].decode('latin1')).encode('utf8')
        if tag.startswith(' '):
            # A blank tag marks a continuation of the previous field's data.
            # Additional field data needs to be prepended with an extra space
            # for certain fields ...
            #for special_tag in ('55','260'):
            #    data = " %s" % (data,) if tag.startswith(special_tag) else data
            data = " %s" % (data.strip(),)
            raw_fields[-1]['value'] = "%s%s" % (raw_fields[-1]['value'], data)
            raw_fields[-1]['raw'] = "%s%s" % (raw_fields[-1]['raw'], field.strip())
        else:
            # Non-control fields get an implicit leading 'a' subfield code so
            # the pipe-splitting below yields ('a', first-chunk).
            data = data if (tag < '010' and tag.isdigit()) else "a%s" % (data,)
            raw_fields.append({
                'tag': tag,
                'indicator1': field[3],
                'indicator2': field[4],
                'value': data.strip(),
                'raw': field.strip()
            })
    for raw in raw_fields:
        tag = raw['tag']
        data = raw['value'].strip()
        field = Field(tag=tag,
                      indicators=[raw['indicator1'], raw['indicator2']],
                      data=data)
        if not field.is_control_field():
            # Each pipe-delimited chunk is <code><content>.
            for sub in data.split('|'):
                try:
                    field.add_subfield(sub[0].strip(), sub[1:].strip())
                except Exception:
                    # Skip blank/empty subfields
                    continue
        record.add_field(field)
    record.parse_leader()
    # Disregard record if no title present
    if not record.get_fields('245'):
        return None
    else:
        return record
def processURLs(self, marc_record, proxy_location,
                public_note='View online - Access limited to subscribers',
                note_prefix='Available via Internet'):
    """Rewrite each 856 link through a proxy and regenerate the 538 notes.

    Parameters:
    :param marc_record: - MARC Record
    :param proxy_location: - proxy prefix prepended to the URL extracted
                            from each 856 field
    :param public_note: - subfield z value, default is for CC
    :param note_prefix: - prefix for original URL in 538 note field,
                          default is for CC.
    """
    # The 538 notes are rebuilt from the 856 fields below, so drop the
    # existing ones first.
    for stale538 in marc_record.get_fields('538'):
        marc_record.remove_field(stale538)
    for source856 in marc_record.get_fields('856'):
        # Parse the raw URL out of $u; the proxied URL keeps the original
        # path and query but swaps the network location for the proxy.
        original = urlparse.urlparse(source856.get_subfields('u')[0])
        scheme = '' if re.match(r'http://', proxy_location) else 'http://'
        proxied = urlparse.urlparse('%s%s%s?%s' % (scheme, proxy_location,
                                                   original.path, original.query))
        # New 538 note preserves the un-proxied URL for reference.
        marc_record.add_field(Field(tag='538',
                                    indicators=[' ', ' '],
                                    subfields=['a', '%s, %s' % (note_prefix, original.geturl())]))
        replacement = Field(tag='856',
                            indicators=['4', '0'],
                            subfields=['u', proxied.geturl()])
        # Fold any $3 qualifiers from the original 856 into the public
        # note that goes into $z (the last $3 wins, as before).
        note = public_note
        for qualifier in source856.get_subfields('3'):
            note = '%s - %s' % (public_note, qualifier)
        replacement.add_subfield('z', note)
        marc_record.remove_field(source856)
        marc_record.add_field(replacement)
    return marc_record
def sort_6_subs(rec):
    """Return a copy of ``rec`` where every data field that carries a $6
    linkage subfield has its $6 subfield(s) moved to the front.

    Fields whose $6 is already first, control fields, and fields with no
    $6 at all are carried over unchanged. Relative order of the remaining
    subfields is preserved.
    """
    fixed_rec = Record(to_unicode=True, force_utf8=True)
    collected = []
    for field in rec.get_fields():
        linkage_field = (not field.is_control_field()
                         and len(field.get_subfields('6')) > 0)
        if linkage_field:
            pairs = [(sub[0], sub[1]) for sub in field]  # (code, content) in order
            if pairs and pairs[0][0] == '6':
                # $6 already leads — nothing to do for this field.
                collected.append(field)
            else:
                # Rebuild: every $6 first (in encounter order), then the
                # rest of the subfields in their original order.
                rebuilt = Field(tag=field.tag,
                                indicators=[field.indicator1, field.indicator2],
                                subfields=[])
                for code, content in pairs:
                    if code == '6':
                        rebuilt.add_subfield(code, content)
                for code, content in pairs:
                    if code != '6':
                        rebuilt.add_subfield(code, content)
                collected.append(rebuilt)
        else:
            collected.append(field)
    for out_field in collected:
        fixed_rec.add_field(out_field)
    return fixed_rec
def processURLs(
    self,
    marc_record,
    proxy_location,
    public_note="View online - Access limited to subscribers",
    note_prefix="Available via Internet",
):
    """Rewrite each 856 link through a proxy and regenerate the 538 notes.

    Parameters:
    :param marc_record: - MARC Record
    :param proxy_location: - proxy prefix prepended to the URL extracted
                            from each 856 field
    :param public_note: - subfield z value, default is for CC
    :param note_prefix: - prefix for original URL in 538 note field,
                          default is for CC.
    """
    # All 538 notes are rebuilt from the 856 fields below.
    for stale538 in marc_record.get_fields("538"):
        marc_record.remove_field(stale538)
    for source856 in marc_record.get_fields("856"):
        # Parse $u; the proxied URL keeps the original path and query but
        # swaps the network location for the proxy prefix.
        original = urllib.parse.urlparse(source856.get_subfields("u")[0])
        scheme = "" if re.match(r"http://", proxy_location) else "http://"
        proxied = urllib.parse.urlparse(
            "{}{}{}?{}".format(scheme, proxy_location, original.path, original.query)
        )
        # New 538 note preserves the un-proxied URL for reference.
        marc_record.add_field(
            Field(
                tag="538",
                indicators=[" ", " "],
                subfields=["a", "%s, %s" % (note_prefix, original.geturl())],
            )
        )
        replacement = Field(
            tag="856", indicators=["4", "0"], subfields=["u", proxied.geturl()]
        )
        # Fold any $3 qualifiers into the public note placed in $z
        # (the last $3 wins, matching previous behavior).
        note = public_note
        for qualifier in source856.get_subfields("3"):
            note = "%s - %s" % (public_note, qualifier)
        replacement.add_subfield("z", note)
        marc_record.remove_field(source856)
        marc_record.add_field(replacement)
    return marc_record
class JsonHandler:
    """Build pymarc records from parsed MARC-in-JSON dictionaries."""

    def __init__(self):
        """Set up empty parse state."""
        self.records = []
        self._record = None
        self._field = None
        self._text = []

    def element(self, element_dict, name=None):
        """Converts a JSON `element_dict` to pymarc fields."""
        if not name:
            # Entry point: start a fresh record, then descend via the leader.
            self._record = Record()
            self.element(element_dict, "leader")
        elif name == "leader":
            self._record.leader = element_dict[name]
            self.element(element_dict, "fields")
        elif name == "fields":
            for field_dict in element_dict[name]:
                tag, remaining = field_dict.popitem()
                current = Field(tag)
                self._field = current
                if current.is_control_field():
                    current.data = remaining
                else:
                    # Subfields are attached first, then the indicators,
                    # mirroring the MARC-in-JSON field layout.
                    self.element(remaining, "subfields")
                    current.indicators.extend(
                        [remaining["ind1"], remaining["ind2"]])
                self._record.add_field(current)
            self.process_record(self._record)
        elif name == "subfields":
            for subfield_dict in element_dict[name]:
                code, text = subfield_dict.popitem()
                self._field.add_subfield(code, text)

    def elements(self, dict_list):
        """Sends `dict_list` to `element`."""
        if type(dict_list) is not list:
            dict_list = [dict_list]
        for rec in dict_list:
            self.element(rec)
        return self.records

    def process_record(self, record):
        """Append `record` to `self.records`."""
        self.records.append(record)
def generate538(self, marc_record):
    """Method creates a 538 field following a standard pattern.

    Args:
        marc_record(pymarc.Record): MARC21 record

    Returns:
        pymarc.Record: the same record with a 538 note added when an
        856 $u is present; returned unchanged otherwise.
    """
    field856 = marc_record['856']
    # Bug fix: a record without an 856 yields None here, and the original
    # code then raised TypeError on the subscript below. Likewise an 856
    # without $u yields None. Pass such records through untouched.
    if field856 is None:
        return marc_record
    original_url = field856['u']
    if original_url is None:
        return marc_record
    new538 = Field(tag='538', indicators=[' ', ' '])
    new538.add_subfield(
        'a', 'Available via Internet, {}'.format(original_url))
    marc_record.add_field(new538)
    return marc_record
def create_999_field(rec, oclc_nums):
    """Add a 999 field carrying the record's 001 ($i) and OCLC numbers ($o).

    Args:
        rec: pymarc.Record to modify in place.
        oclc_nums: iterable of OCLC number strings, one $o each.

    Returns:
        str: newline-terminated log/error message text.
    """
    # Bug fix: msg was used with += before ever being assigned (NameError).
    msg = ''
    rec_001 = rec.get_fields('001')[0].value()
    rec_999s = rec.get_fields('999')
    if len(rec_999s) == 0:
        new_999 = Field(tag='999',
                        indicators=[' ', ' '],
                        subfields=['i', rec_001])
        for oclc_num in oclc_nums:
            new_999.add_subfield('o', oclc_num)
        # Bug fix: the original also called rec_orig.add_ordered_field(),
        # but rec_orig is not defined anywhere in this function (NameError).
        rec.add_ordered_field(new_999)
        msg += 'Record 999: ' + new_999.value() + '\n'
    else:
        msg += 'ERROR-MISC: Record contains at least one 999 field\n'
        for rec_999 in rec_999s:
            # Bug fix: concatenating a pymarc Field to str raised TypeError;
            # stringify the field explicitly.
            msg += '    ' + str(rec_999) + '\n'
    # Bug fix: the message text was built but never returned.
    return msg
def create_999_field(rec, oclc_nums):
    """Add a 999 field carrying the record's 001 ($i) and OCLC numbers ($o).

    Args:
        rec: pymarc.Record to modify in place.
        oclc_nums: iterable of OCLC number strings, one $o each.

    Returns:
        str: newline-terminated log/error message text.
    """
    # Bug fix: msg was used with += before ever being assigned (NameError).
    msg = ''
    rec_001 = rec.get_fields('001')[0].value()
    rec_999s = rec.get_fields('999')
    if len(rec_999s) == 0:
        new_999 = Field(tag='999',
                        indicators=[' ', ' '],
                        subfields=['i', rec_001])
        for oclc_num in oclc_nums:
            new_999.add_subfield('o', oclc_num)
        # Bug fix: the original also called rec_orig.add_ordered_field(),
        # but rec_orig is not defined anywhere in this function (NameError).
        rec.add_ordered_field(new_999)
        msg += 'Record 999: ' + new_999.value() + '\n'
    else:
        msg += 'ERROR-MISC: Record contains at least one 999 field\n'
        for rec_999 in rec_999s:
            # Bug fix: concatenating a pymarc Field to str raised TypeError;
            # stringify the field explicitly.
            msg += '    ' + str(rec_999) + '\n'
    # Bug fix: the message text was built but never returned.
    return msg
class JsonHandler:
    """Build pymarc records from parsed MARC-in-JSON dictionaries."""

    def __init__(self):
        """Initialize empty parse state."""
        self.records = []        # completed pymarc Records
        self._record = None      # record currently being built
        self._field = None       # field currently being built
        self._text = []          # unused scratch buffer (kept as-is)

    def element(self, element_dict, name=None):
        """Convert one JSON `element_dict` into pymarc fields.

        Dispatches on `name`: None starts a new record, then recurses
        through 'leader' -> 'fields' -> 'subfields'.
        """
        if not name:
            self._record = Record()
            self.element(element_dict, 'leader')
        elif name == 'leader':
            self._record.leader = element_dict[name]
            self.element(element_dict, 'fields')
        elif name == 'fields':
            fields = iter(element_dict[name])
            for field in fields:
                # Each field dict holds a single tag -> content entry.
                tag, remaining = field.popitem()
                self._field = Field(tag)
                if self._field.is_control_field():
                    self._field.data = remaining
                else:
                    # Subfields are attached first, then the indicators,
                    # mirroring the MARC-in-JSON field layout.
                    self.element(remaining, 'subfields')
                    self._field.indicators.extend(
                        [remaining['ind1'], remaining['ind2']])
                self._record.add_field(self._field)
            self.process_record(self._record)
        elif name == 'subfields':
            subfields = iter(element_dict[name])
            for subfield in subfields:
                # Each subfield dict holds a single code -> text entry.
                code, text = subfield.popitem()
                self._field.add_subfield(code, text)

    def elements(self, dict_list):
        """Feed `dict_list` (a dict or list of dicts) to `element`;
        return the accumulated records."""
        if type(dict_list) is not list:
            dict_list = [dict_list]
        for rec in dict_list:
            self.element(rec)
        return self.records

    def process_record(self, record):
        """Append `record` to `self.records`."""
        self.records.append(record)
def validate245(self, marc_record):
    """Add a $h '[electronic resource]' GMD to the record's 245 field.

    The first 245 is rebuilt as $a $h $b $c with the trailing '/' (if any)
    removed from $a and ISBD punctuation appended to $h depending on which
    of $b/$c are present.

    Parameters:
    `marc_record`: Required, MARC record
    """
    gmd = '[electronic resource]'
    originals = marc_record.get_fields('245')
    if not originals:
        return marc_record
    old245 = originals[0]
    marc_record.remove_field(old245)
    title = ''
    a_values = old245.get_subfields('a')
    if a_values:
        title = a_values[0]
        # Drop a single trailing '/' left over from ISBD punctuation.
        if title and title[-1] == '/':
            title = title[:-1].strip()
    ind1, ind2 = old245.indicators
    new245 = Field(tag='245',
                   indicators=[ind1, ind2],
                   subfields=['a', '%s ' % title])
    b_values = old245.get_subfields('b')
    c_values = old245.get_subfields('c')
    # Choose the GMD's trailing punctuation based on what follows it.
    if c_values and not b_values:
        new245.add_subfield('h', '%s / ' % gmd)
    elif b_values:
        new245.add_subfield('h', '%s : ' % gmd)
    else:
        new245.add_subfield('h', gmd)
    for value in b_values:
        new245.add_subfield('b', value)
    for value in c_values:
        new245.add_subfield('c', value)
    marc_record.add_field(new245)
    return marc_record
def __format245__(self, field245):
    """Method takes a 245 field from a MARC record and returns properly
    formatted subfields. By not copying subfield 'h', performs the first
    conversion PCC recommendation.

    Args:
        field245(pymarc.Field): 245 field

    Returns:
        pymarc.Field: reordered copy ($a $n $p $b $c), or None when the
        input field's tag is not '245'.
    """
    if field245.tag != '245':
        return
    # subfield_c is only ever reassigned as a loop variable below; kept as-is.
    subfield_a, subfield_c = '', ''
    a_subfields = field245.get_subfields('a')
    indicator1, indicator2 = field245.indicators
    if len(a_subfields) > 0:
        subfield_a = a_subfields[0]
        if len(subfield_a) > 0:
            # Strip a single trailing '.' or '\' before re-punctuating $a.
            if ['.', '\\'].count(subfield_a[-1]) > 0:
                subfield_a = subfield_a[:-1].strip()
    new245 = Field(tag='245',
                   indicators=[indicator1, indicator2],
                   subfields=['a', u'{0} '.format(subfield_a)])
    b_subfields = field245.get_subfields('b')
    c_subfields = field245.get_subfields('c')
    n_subfields = field245.get_subfields('n')
    p_subfields = field245.get_subfields('p')
    # Order for 245 subfields are:
    # $a $n $p $b $c
    if len(n_subfields) > 0:
        for subfield_n in n_subfields:
            new245.add_subfield('n', subfield_n)
    if len(p_subfields) > 0:
        for subfield_p in p_subfields:
            new245.add_subfield('p', subfield_p)
    if len(c_subfields) > 0 and len(b_subfields) < 1:
        # NOTE(review): this membership test runs against the flat
        # [code, value, code, value, ...] subfields list of older pymarc
        # releases, so a *value* equal to 'a' would also match — confirm the
        # installed pymarc version and intent.
        if 'a' in new245.subfields:
            new245['a'] = u'{0} /'.format(new245['a'].strip())
    elif len(b_subfields) > 0:
        if 'a' in new245.subfields:
            new245['a'] = u'{0} :'.format(new245['a'].strip())
    if len(b_subfields) > 0:
        for subfield_b in b_subfields:
            new245.add_subfield('b', subfield_b)
    if len(c_subfields) > 0:
        for subfield_c in c_subfields:
            new245.add_subfield('c', subfield_c)
    return new245
def create_subordinate_records(parent_record, subordinate_data_list):
    '''If a journal record includes a list of individual issues or volumes,
    this function creates separate marc files for each of those issues or
    volumes. The journal title and url are taken from the parent record (the
    journal record) and kept in the subordinate records.

    NOTE(review): parent_record is used both mapping-style
    (``'languages' in parent_record``) and pymarc-Record-style
    (``parent_record['246']['a']``) — confirm its actual type with the
    caller; a missing 246/856 would make the Record-style subscripts fail.
    '''
    result_list = []
    for subordinate_resource in subordinate_data_list:
        sub_record = Record(force_utf8=True)
        # add fields 006, 007 and 008 with minimal physical information to
        # every marc file
        if 'title_full' in subordinate_resource:
            sub_record.add_field(Field(tag='006', data="m"))
            sub_record.add_field(Field(tag='007', data="cr"))
            # the value of field 008 is taken from the parent record and put
            # into the subordinate one.
            # NOTE(review): the blank runs in this 008 template were lost in a
            # whitespace-mangled copy; the slicing below requires the language
            # code to occupy positions 21-23 — verify against the original.
            field008val = " o 0eng d"  # DEFAULT ENG
            if 'languages' in parent_record and parent_record[
                    'languages'] is not None:
                # Splice the mapped language code into bytes 21-23.
                field008val = field008val[0:21] + lang_map.get(
                    parent_record['languages'][0], " ") + field008val[24:]
            sub_record.add_field(Field(tag='008', data=field008val))
            sub_record.add_field(
                Field(
                    tag='245',
                    indicators=['0', '0'],
                    subfields=['a', subordinate_resource['title_full'][:9000]]))
            sub_record.add_field(
                Field(tag='506',
                      indicators=['0', '#'],
                      subfields=["a", "Open access"]))
            # Variant journal title from the parent becomes the series ($490).
            if parent_record['246']['a']:
                sub_record.add_field(
                    Field(tag='490',
                          indicators=['0', '0'],
                          subfields=['a', parent_record['246']['a']]))
        # put together the issue/volume url, the journal url and the domain in
        # field 856; domain and journal url taken from the parent record,
        # issue/volume url taken from the subordinate record
        if 'url' in subordinate_resource:
            current_field = Field(tag='856', indicators=['0', '0'])
            current_field.add_subfield('u', subordinate_resource['url'])
            if parent_record['856']['a']:
                current_field.add_subfield('a', parent_record['856']['a'])
            if parent_record['856']['u']:
                current_field.add_subfield('d', parent_record['856']['u'])
            sub_record.add_field(current_field)
        result_list.append(sub_record)
    return result_list
def validate300(self, marc_record):
    """Method modifies existing 300 field to the following RDA format:
    300 1 online resource (xxvi, 368 pages) : $b illustrations

    Args:
        marc_record(pymarc.Record): Input MARC21

    Returns:
        pymarc.Record: Modified MARC21
    """
    pagination_pattern = re.compile(r"(\w+), (\w+) p+")
    illus_pattern = re.compile(r"illus")
    for old300 in marc_record.get_fields('300'):
        text = "1 online resource"
        rda300 = Field(tag='300', indicators=[' ', ' '])
        a_values = old300.get_subfields('a')
        # No $a at all: emit the bare default.
        if not a_values:
            rda300.add_subfield('a', text)
        for value in a_values:
            has_illus = illus_pattern.search(value) is not None
            pagination = pagination_pattern.search(value)
            if pagination:
                # e.g. "xxvi, 368 p." -> "(xxvi, 368 pages)"
                preface_part, page_count = pagination.groups()
                text = "{} ({}, {} pages)".format(
                    text, preface_part.lower(), page_count)
            if has_illus:
                text += " :"
            rda300.add_subfield('a', text)
            if has_illus:
                rda300.add_subfield('b', 'illustrations')
        marc_record.remove_field(old300)
        marc_record.add_field(rda300)
    return marc_record
class CSV2MARC (object):
    """
    Converts CSV to MARC records.

    Input rows are expected as 8 columns:
    sysno, fieldTag, fieldTagOccurrence, indicator1, indicator2,
    subfieldLabel, subfieldLabelOccurrence, value.
    Rows sharing a sysno form one record; rows sharing a
    (fieldTag, fieldTagOccurrence) within a record form one field.

    NOTE(review): Python 2 code (``print >>sys.stderr``); reconstructed from
    a whitespace-mangled source — verify formatting against the original.
    """
    def __init__(self):
        """
        Load the CSV file (path taken from sys.argv[1]) and open the
        output .mrc file alongside it.
        """
        if len(sys.argv) > 1:
            filepath = sys.argv[1]
        else:
            raise Exception(
                "You need to provide a file path to the CSV file as an argument."
            )
        try:
            self.reader = csv.reader(
                open(filepath, "r"),
                delimiter = ","
            )
        except IOError:
            print >>sys.stderr, "Cannot open {0}".format(filepath)
            raise SystemExit
        output = "{0}.mrc".format(os.path.splitext(filepath)[0])
        self.file = open(output, "w")
        # State variables (False doubles as "not yet set")
        self.sysno = False
        self.record = False
        self.field = False
        self.fieldTag = False
        self.fieldTagOccurrence = False
        self.subfieldLabel = False
        self.subfieldLabelOccurrence = False
        self.line = False

    def checkFieldChange(self, fieldTag, fieldTagOccurrence):
        """Return True when the incoming row starts a new field (tag or
        occurrence differs from the field currently being built)."""
        if (self.fieldTag != fieldTag) or ((self.fieldTag == fieldTag) and (self.fieldTagOccurrence != fieldTagOccurrence)):
            return True
        else:
            return False

    def checkRecordChange(self, sysno):
        """Return True when the incoming row belongs to a new record."""
        if not (sysno == self.sysno):
            return True
        else:
            return False

    def writeMARCRecord(self, record):
        """Serialize `record` to the output file."""
        writer = MARCWriter(self.file)
        writer.write(record)

    def getNewRecord(self, sysno):
        """Start a fresh record for `sysno`."""
        self.sysno = sysno
        self.record = Record()

    def getNewField(self, line):
        """Start a fresh field from the parsed `line` dict; a row without a
        subfieldLabel is treated as a control field carrying raw data."""
        self.fieldTag = line["fieldTag"]
        self.fieldTagOccurrence = line["fieldTagOccurrence"]
        if line["subfieldLabel"]:
            # Normal (data) field with indicators
            self.field = Field(
                tag = line["fieldTag"],
                indicators = [
                    line["indicator1"],
                    line["indicator2"]
                ]
            )
        else:
            # Control field: value goes straight into data
            self.field = Field(
                tag = line["fieldTag"],
                data = line["value"]
            )

    def main(self):
        """Drive the row-by-row state machine over the CSV input."""
        for line in self.reader:
            # Parse the line into a named dict
            line = {
                "sysno" : line[0],
                "fieldTag" : line[1],
                "fieldTagOccurrence" : line[2],
                "indicator1" : line[3],
                "indicator2" : line[4],
                "subfieldLabel" : line[5],
                "subfieldLabelOccurrence" : line[6],
                "value" : line[7],
            }
            if not self.sysno:
                # Very first row: open the first record
                self.getNewRecord(line["sysno"])
            if self.checkRecordChange(line["sysno"]):
                # Flush the finished record before starting the next one
                self.record.add_field(self.field)  # Add the last field of the previous record
                self.field = False  # Remove the last field of the previous record
                self.fieldTag = False
                self.writeMARCRecord(self.record)
                self.getNewRecord(line["sysno"])
            if not self.fieldTag:
                self.getNewField(line)
            if self.checkFieldChange(line["fieldTag"], line["fieldTagOccurrence"]):
                self.record.add_field(self.field)
                self.getNewField(line)
            if line["subfieldLabel"]:
                # If we have a subfield
                self.field.add_subfield(
                    line["subfieldLabel"],
                    line["value"]
                )
        self.record.add_field(self.field)  # Write the last field
        # Write the last record after the iteration has ended
        self.writeMARCRecord(self.record)
        self.file.close()
def validate245(self, marc_record):
    """Add a $h '[electronic resource]' GMD to the record's 245 field.

    The first 245 is rebuilt in the order $a $n $p $h $b $c, with a
    trailing '.' or '\\' stripped from $a and ISBD punctuation appended
    to $h depending on which of $b/$c are present.

    Parameters:
    `marc_record`: Required, MARC record
    """
    gmd = '[electronic resource]'
    originals = marc_record.get_fields('245')
    if not originals:
        return marc_record
    old245 = originals[0]
    marc_record.remove_field(old245)
    title = ''
    a_values = old245.get_subfields('a')
    if a_values:
        title = a_values[0]
        # Drop a single trailing '.' or '\' left from ISBD punctuation.
        if title and title[-1] in ('.', '\\'):
            title = title[:-1].strip()
    ind1, ind2 = old245.indicators
    new245 = Field(tag='245',
                   indicators=[ind1, ind2],
                   subfields=['a', u'{0} '.format(title)])
    b_values = old245.get_subfields('b')
    c_values = old245.get_subfields('c')
    # Order for 245 subfields: $a $n $p $h $b $c
    for value in old245.get_subfields('n'):
        new245.add_subfield('n', value)
    for value in old245.get_subfields('p'):
        new245.add_subfield('p', value)
    # GMD punctuation depends on what follows it.
    if c_values and not b_values:
        new245.add_subfield('h', '{0} / '.format(gmd))
    elif b_values:
        new245.add_subfield('h', '{0} : '.format(gmd))
    else:
        new245.add_subfield('h', gmd)
    for value in b_values:
        new245.add_subfield('b', value)
    for value in c_values:
        new245.add_subfield('c', value)
    marc_record.add_field(new245)
    return marc_record
def convert_2_eres_rec(rec, rda_rec):
    """Convert a partner's print-version MARC record into an e-resource
    record for the Arabic Collections Online (ACO) project.

    Args:
        rec: pymarc.Record for the print version; mutated in place.
        rda_rec: truthy when the record is RDA — adds 336/337/338 and an
            '$e rda' 040 instead of inserting a 245 $h GMD.

    Returns:
        tuple: (rec, msg) — the converted record and a newline-separated
        log/error message string.

    NOTE(review): reconstructed from a whitespace-mangled source. Blank runs
    inside the 006/007 data strings and in message indentation, and the exact
    indentation of a few statements (flagged below), could not be recovered
    with certainty — verify against the original file.
    """
    msg = ''
    rec_003_value = rec.get_fields('003')[0].value()  # the partner's institutional code from the 003
    rec_001_value = rec.get_fields('001')[0].value()  # the partner's local record number (BSN) from the 001
    # Map the 003 code to the partner's display name and 710 heading parts.
    if rec_003_value == 'NNU':
        inst_name = 'New York Univeristy Libraries'  # [sic] runtime string preserved
        inst_710a = 'New York University.'
        inst_710b = 'Libraries.'
    elif rec_003_value == 'NIC':
        inst_name = 'Cornell University Libraries'
        inst_710a = 'Cornell University.'
        inst_710b = 'Libraries.'
    elif rec_003_value == 'NNC':
        inst_name = 'Columbia University Libraries'
        inst_710a = 'Columbia University.'
        inst_710b = 'Libraries.'
    elif rec_003_value == 'NjP':
        inst_name = 'Princeton University Libraries'
        inst_710a = 'Princeton University.'
        inst_710b = 'Library.'
    elif rec_003_value == 'LeBAU':
        inst_name = "American University of Beirut's Jafet Memorial Library"
        inst_710a = 'Jafet Memorial Library.'
        inst_710b = ''
    elif rec_003_value == 'UaCaAUL':
        inst_name = "American University in Cairo Library"
        inst_710a = 'American University in Cairo.'
        inst_710b = 'Library.'
    else:
        inst_name = ''
        inst_710a = ''
        inst_710b = ''
        msg += 'ERROR-MISC: 003 code - '+rec_003_value+' - did not match any of the partner institutions.\n'
    if rec_001_value.startswith('o'):
        # this OCLC record did not get processed in step 4
        msg += 'ERROR-MISC: 003/001 field values did not change to institutional code and BSN\n'
        msg += ' Record 003/001: '+rec_003_value+'_'+rec_001_value+'\n'
        for rec_035 in rec.get_fields('035'):
            msg += ' '+str(rec_035)+'\n'
    # delete the 005 field
    for rec_005 in rec.get_fields('005'):
        rec.remove_field(rec_005)
    # change the cataloging date in bytes 00-05 of the 008 to the current date
    curr_date = datetime.date.today()
    yy = str(curr_date.year)[2:].zfill(2)
    mm = str(curr_date.month).zfill(2)
    dd = str(curr_date.day).zfill(2)
    rec_008_value = rec.get_fields('008')[0].value()
    new_008_data = yy+mm+dd+rec_008_value[6:]
    new_008 = Field(tag='008', data=new_008_data)
    rec.remove_field(rec.get_fields('008')[0])
    rec.add_ordered_field(new_008)
    # change byte 23 in the 008 field to code 'o' for 'online'
    rec_008_value = rec['008'].data
    rec['008'].data = rec_008_value[0:23] + 'o' + rec_008_value[24:]
    # add the 006/007 format fields for electronic resource characteristics;
    # any pre-existing 006/007 is logged as an error and removed first
    if len(rec.get_fields('006')) > 0:
        for rec_006 in rec.get_fields('006'):
            rec_006_value = rec_006.value()
            msg += 'ERROR-MISC: 006 '+rec_006_value+'\n'
            rec.remove_field(rec_006)
    # NOTE(review): blank runs inside this 006 literal were collapsed in the
    # mangled source — confirm the original fixed-width value.
    new_006 = Field(tag='006', data='m d ')
    rec.add_ordered_field(new_006)
    if len(rec.get_fields('007')) > 0:
        for rec_007 in rec.get_fields('007'):
            rec_007_value = rec_007.value()
            msg += 'ERROR-MISC: 007 '+rec_007_value+'\n'
            rec.remove_field(rec_007)
    new_007 = Field(tag='007', data='cr cn |||m|||a')
    rec.add_ordered_field(new_007)
    # delete fields that relate to the print version
    if len(rec.get_fields('016')) > 0:
        for rec_016 in rec.get_fields('016'):
            rec.remove_field(rec_016)
    if len(rec.get_fields('019')) > 0:
        for rec_019 in rec.get_fields('019'):
            rec.remove_field(rec_019)
    if len(rec.get_fields('025')) > 0:
        for rec_025 in rec.get_fields('025'):
            rec.remove_field(rec_025)
    if len(rec.get_fields('029')) > 0:
        for rec_029 in rec.get_fields('029'):
            rec.remove_field(rec_029)
    if len(rec.get_fields('042')) > 0:
        for rec_042 in rec.get_fields('042'):
            rec.remove_field(rec_042)
    if len(rec.get_fields('049')) > 0:
        for rec_049 in rec.get_fields('049'):
            rec.remove_field(rec_049)
    # create new 040 field for NNU
    for rec_040 in rec.get_fields('040'):
        rec.remove_field(rec_040)  # delete the existing 040 field(s)
    # Beirut catalogs in Arabic; everyone else in English.
    if rec_003_value == 'LeBAU':
        cat_lang = 'ara'
    else:
        cat_lang = 'eng'
    if rda_rec:
        new_040 = Field(tag='040', indicators=[' ',' '], subfields=['a','NNU','b',cat_lang,'e','rda','c','NNU'])
    else:
        new_040 = Field(tag='040', indicators=[' ',' '], subfields=['a','NNU','b',cat_lang,'c','NNU'])
    rec.add_ordered_field(new_040)
    # correct the 041 language code field when multiple codes exist in the same subfield
    if len(rec.get_fields('041')) > 0:
        for rec_041 in rec.get_fields('041'):
            for rec_041_sub in rec_041:
                # NOTE(review): the indentation of these initialisations (and
                # of the `if mult_langs:` block) was ambiguous in the mangled
                # source; as written they reset per subfield, so a multi-code
                # subfield followed by other subfields may lose them — confirm.
                mult_langs = False
                new_041_subs = []
                # Note: sub[0] is the subfield code and sub[1] is the subfield
                # content for this subfield
                if len(rec_041_sub[1]) > 3:
                    # there are multiple language codes in this 041 subfield
                    mult_langs = True
                    rec_041_sub_langs = re.findall('...', rec_041_sub[1])
                    for rec_041_sub_lang in rec_041_sub_langs:
                        new_041_subs.append([rec_041_sub[0], rec_041_sub_lang])
                else:
                    new_041_subs.append([rec_041_sub[0], rec_041_sub[1]])
                if mult_langs:
                    rec_041_ind1 = rec_041.indicator1
                    rec_041_ind2 = rec_041.indicator2
                    new_rec_041 = Field(tag='041', indicators=[rec_041_ind1, rec_041_ind2], subfields=[])
                    for new_041_sub in new_041_subs:
                        new_rec_041.add_subfield(new_041_sub[0], new_041_sub[1])
                    rec.remove_field(rec_041)
                    rec.add_ordered_field(new_rec_041)
    # correct the 050 indicator 2
    rec_050s = rec.get_fields('050')
    for rec_050 in rec_050s:
        this_index = rec_050s.index(rec_050)
        # check indicator 2 value and fix if needed
        if rec_050.indicator2 == ' ':
            rec.get_fields('050')[this_index].indicator2 = '4'
    # correct the 082 indicator 1
    rec_082s = rec.get_fields('082')
    for rec_082 in rec_082s:
        this_index = rec_082s.index(rec_082)
        # check indicator 1 value and fix if needed
        if rec_082.indicator1 == ' ':
            rec.get_fields('082')[this_index].indicator1 = '0'
    if not rda_rec:
        # add GMD to 245$h for "[electronic resource]"
        rec_245s = rec.get_fields('245')
        gmd_added = False
        if len(rec_245s) == 0:
            msg += 'ERROR-MISC: Record is missing a 245 field\n'
        elif len(rec_245s) > 1:
            msg += 'ERROR-MISC: Record has multiple 245 fields\n'
        else:
            for rec_245 in rec_245s:
                rec_245_ind1 = rec_245.indicator1
                rec_245_ind2 = rec_245.indicator2
                new_rec_245 = Field(tag='245', indicators=[rec_245_ind1, rec_245_ind2], subfields=[])
                # delete any existing 245 $h GMD subfields
                if len(rec_245.get_subfields('h')) > 0:
                    for rec_245h in rec_245.get_subfields('h'):
                        msg += 'ERROR-MISC: Original record for the print contains a 245$h GMD: '+rec_245h+'\n'
                        rec_245.delete_subfield('h')
                # Flatten the 245 into a '|$<code><content>' string so the
                # GMD insertion point can be found with regexes.
                rec_245_str = ''
                for rec_245_sub in rec_245:
                    rec_245_str += '|$'+rec_245_sub[0]+rec_245_sub[1]  # sub[0]=the subfield code; sub[1]=the subfield content
                rec_245_list = rec_245_str.split('|')
                rec_245_re1 = re.compile('\$a[^\$]*$')  # matches subfield pattern $a not followed by any other subfield
                if rec_245_re1.search(rec_245_str) and not gmd_added:
                    for sub in rec_245_list:
                        post_gmd_sub_code = ''
                        if sub.startswith('$a'):
                            sub_index = rec_245_list.index(sub)
                            if len(rec_245_list) > sub_index+1:
                                post_gmd_sub = rec_245_list[sub_index+1]
                                post_gmd_sub_code = post_gmd_sub[0:2]
                            new_rec_245, gmd_added = add_ordered_gmd(sub, '$a', post_gmd_sub_code, new_rec_245, gmd_added)
                rec_245_re2 = re.compile('\$a[^\$]*\$[^np]')  # matches subfield pattern $a not followed by $n or $p
                if rec_245_re2.search(rec_245_str) and not gmd_added:
                    for sub in rec_245_list:
                        post_gmd_sub_code = ''
                        if sub.startswith('$a'):
                            sub_index = rec_245_list.index(sub)
                            if len(rec_245_list) > sub_index+1:
                                post_gmd_sub = rec_245_list[sub_index+1]
                                post_gmd_sub_code = post_gmd_sub[0:2]
                            new_rec_245, gmd_added = add_ordered_gmd(sub, '$a', post_gmd_sub_code, new_rec_245, gmd_added)
                rec_245_re3 = re.compile('\$a[^\$]*\$n[^\$]*\$[^np]')  # matches subfield pattern $a $n not followed by $n or $p
                if rec_245_re3.search(rec_245_str) and not gmd_added:
                    for sub in rec_245_list:
                        post_gmd_sub_code = ''
                        if sub.startswith('$n'):
                            sub_index = rec_245_list.index(sub)
                            if len(rec_245_list) > sub_index+1:
                                post_gmd_sub = rec_245_list[sub_index+1]
                                post_gmd_sub_code = post_gmd_sub[0:2]
                            new_rec_245, gmd_added = add_ordered_gmd(sub, '$n', post_gmd_sub_code, new_rec_245, gmd_added)
                rec_245_re4 = re.compile('\$a[^\$]*\$p[^\$]*\$[^np]')  # matches subfield pattern $a $p not followed by $n or $p
                if rec_245_re4.search(rec_245_str) and not gmd_added:
                    for sub in rec_245_list:
                        post_gmd_sub_code = ''
                        if sub.startswith('$p'):
                            sub_index = rec_245_list.index(sub)
                            if len(rec_245_list) > sub_index+1:
                                post_gmd_sub = rec_245_list[sub_index+1]
                                post_gmd_sub_code = post_gmd_sub[0:2]
                            new_rec_245, gmd_added = add_ordered_gmd(sub, '$p', post_gmd_sub_code, new_rec_245, gmd_added)
                rec_245_re5 = re.compile('\$a[^\$]*\$n[^\$]*\$p[^\$]*\$[^np]')  # matches subfield pattern $a $n $p not followed by $n or $p
                if rec_245_re5.search(rec_245_str) and not gmd_added:
                    for sub in rec_245_list:
                        post_gmd_sub_code = ''
                        if sub.startswith('$p'):
                            sub_index = rec_245_list.index(sub)
                            if len(rec_245_list) > sub_index+1:
                                post_gmd_sub = rec_245_list[sub_index+1]
                                post_gmd_sub_code = post_gmd_sub[0:2]
                            new_rec_245, gmd_added = add_ordered_gmd(sub, '$p', post_gmd_sub_code, new_rec_245, gmd_added)
                rec.remove_field(rec_245)
                rec.add_ordered_field(new_rec_245)
            if not gmd_added:
                msg += 'ERROR-MISC: GMD did not get added to non-RDA record\n'
            # NEED TO FIGURE OUT HOW TO ADD GMD to corresponding 880 field if it exists
    # delete subfield $c from 300 fields, modify punctuation in subfields $a
    # and $b, and add 'online resource' to subfield $a
    for rec_300 in rec.get_fields('300'):
        if not rec_300.get_subfields('a')[0].startswith('online'):
            rec_300.delete_subfield('c')
            rec_300a = rec_300.get_subfields('a')[0]
            rec_300a = rec_300a.strip(' ;')
            rec_300a_pgs = rec_300a.split(' :')
            rec_300.delete_subfield('a')
            try:
                rec_300b = rec_300.get_subfields('b')[0]
                rec_300b = rec_300b.strip(' ;')
                rec_300.delete_subfield('b')
                rec_300a_mod = 'online resource ('+rec_300a_pgs[0]+') :'
                rec_300.add_subfield('a', rec_300a_mod)
                rec_300.add_subfield('b', rec_300b)
            except:
                # there is no subfield $b in the 300
                rec_300a_mod = 'online resource ('+rec_300a_pgs[0]+')'
                rec_300.add_subfield('a', rec_300a_mod)
    if rda_rec:
        # Delete any existing 336, 337, and 338 fields for the print characteristics
        if len(rec.get_fields('336')) > 0:
            for rec_336 in rec.get_fields('336'):
                rec.remove_field(rec_336)
        if len(rec.get_fields('337')) > 0:
            for rec_337 in rec.get_fields('337'):
                rec.remove_field(rec_337)
        if len(rec.get_fields('338')) > 0:
            for rec_338 in rec.get_fields('338'):
                rec.remove_field(rec_338)
        # Add 336, 337, and 338 fields for the e-resource characteristics for
        # content, media, and carrier
        new_rec_336 = Field(tag='336', indicators=[' ',' '], subfields=['a','text','2','rdacontent'])
        new_rec_337 = Field(tag='337', indicators=[' ',' '], subfields=['a','computer','2','rdamedia'])
        new_rec_338 = Field(tag='338', indicators=[' ',' '], subfields=['a','online resource','2','rdacarrier'])
        rec.add_ordered_field(new_rec_336)
        rec.add_ordered_field(new_rec_337)
        rec.add_ordered_field(new_rec_338)
    # add ACO note field
    new_500_aco = Field(tag='500', indicators=[' ',' '], subfields=['a','Part of the Arabic Collections Online (ACO) project, contributed by '+inst_name+'.'])
    rec.add_ordered_field(new_500_aco)
    # delete any print record's reference to other formats
    if len(rec.get_fields('530')) > 0:
        for rec_530 in rec.get_fields('530'):
            rec.remove_field(rec_530)
    # delete any existing 533 fields (e.g. for microform)
    for rec_533 in rec.get_fields('533'):
        rec.remove_field(rec_533)
    # add 533 field related to electronic reproduction
    curr_year = datetime.date.today().year
    new_533 = Field(tag='533', indicators=[' ',' '], subfields=['a', 'Electronic reproduction.', 'b', 'New York, N.Y. :', 'c', 'New York University,', 'd', str(curr_year)+'.', '5', 'NNU'])
    rec.add_ordered_field(new_533)
    # delete any existing 539 fields (e.g. for microform)
    for rec_539 in rec.get_fields('539'):
        rec.remove_field(rec_539)
    # new_539 = Field(tag='539', indicators=[' ',' '], subfields=['a', 's', 'b', str(curr_year), 'd', 'nyu', 'e', 'n', 'g', 'o'])
    # rec.add_ordered_field(new_539)
    # add headings referring to the ACO project and partners
    if not inst_710b == '':
        new_710 = Field(tag='710', indicators=['2',' '], subfields=['a', inst_710a, 'b', inst_710b])
    else:
        new_710 = Field(tag='710', indicators=['2',' '], subfields=['a', inst_710a])
    rec.add_ordered_field(new_710)
    new_730 = Field(tag='730', indicators=['0',' '], subfields=['a','Arabic Collections Online.'])
    rec.add_ordered_field(new_730)
    # add a new 776 field referencing the relationship to the print version
    new_776 = Field(tag='776', indicators=['0','8'], subfields=['i', 'Print version:'])
    # capture name entry from 100 or 110 if they exist and insert into new 776
    # subfield $a to reference print version
    if len(rec.get_fields('100', '110')) > 0:
        new_776a = rec.get_fields('100', '110')[0].value()
        if new_776a.startswith('8'):
            new_776a = new_776a[7:]
        new_776.add_subfield('a', new_776a)
    # capture title entry from 245 and insert into new 776 subfield $t to
    # reference print version
    new_776t = rec.get_fields('245')[0].get_subfields('a')[0]
    new_776t = new_776t.rstrip(' /:.,')
    new_776.add_subfield('t', new_776t)
    # capture institutional ID entry from 003/001 and insert into new 776
    # subfield $w to reference print version
    new_776.add_subfield('w', '('+rec_003_value+')'+rec_001_value)
    if len(rec.get_fields('010')) > 0:
        if len(rec.get_fields('010')[0].get_subfields('a')) > 0:
            new_776w_010 = rec.get_fields('010')[0].get_subfields('a')[0]
            new_776.add_subfield('w', '(DLC)'+new_776w_010)
        rec.remove_field(rec.get_fields('010')[0])
    if len(rec.get_fields('035')) > 0:
        for rec_035 in rec.get_fields('035'):
            rec_035a = rec_035.get_subfields('a')[0]
            if rec_035a.startswith('(OCoLC)'):
                new_776w_oclc = rec_035a
                new_776.add_subfield('w', new_776w_oclc)
            rec.remove_field(rec_035)
    new_020z_fields = []  # variable to collect the 020 fields as "invalid" subfield z's instead of subfield a's
    new_020z_subfields = []  # variable to collect the print ISBNs to add to the 776 field
    if len(rec.get_fields('020')) > 0:
        # record contains 020 ISBN fields
        for rec_020 in rec.get_fields('020'):
            # iterate through each of the 020 fields
            msg += '020s: YES\n'
            if len(rec_020.get_subfields('a')) > 0:
                # the 020 field has a subfield a
                for rec_020a in rec_020.get_subfields('a'):
                    # iterate through the subfield a's
                    msg += '020a: '+str(rec_020a)+'\n'
                    new_020z_field = Field(tag='020', indicators=[' ',' '], subfields=['z', rec_020a])
                    new_020z_fields.append(new_020z_field)
                    new_020z_subfields.append(rec_020a)
            rec.remove_field(rec_020)
        for new_020z_field in new_020z_fields:
            rec.add_ordered_field(new_020z_field)
        for new_776z in new_020z_subfields:
            new_776.add_subfield('z', new_776z)
    rec.add_ordered_field(new_776)
    # delete any 090 $h/$i fields
    if len(rec.get_fields('090')) > 0:
        for rec_090 in rec.get_fields('090'):
            if len(rec_090.get_subfields('h')) > 0:
                for rec_090h in rec_090.get_subfields('h'):
                    rec_090.delete_subfield('h')
            if len(rec_090.get_subfields('i')) > 0:
                for rec_090i in rec_090.get_subfields('i'):
                    rec_090.delete_subfield('i')
            # Drop the 090 entirely if nothing is left in it.
            if rec_090.format_field() == '':
                rec.remove_field(rec_090)
    # delete any local fields (9XXs, OWN, AVA)
    rec_9XXs = rec.get_fields('852','903','907','910','938','945','950','955','981','987','994','998','OWN','AVA')
    if len(rec_9XXs) > 0:
        for rec_9XX in rec_9XXs:
            rec.remove_field(rec_9XX)
    return (rec, msg)
def validate245(self, marc_record):
    """Add a subfield $h "[electronic resource]" GMD to the record's 245 field.

    The first 245 field is removed and rebuilt so that subfields appear in
    the required order $a $n $p $h $b $c, with ISBD punctuation on $h chosen
    according to which subfield follows it. Records with no 245 field are
    returned unchanged.

    Parameters:
        `marc_record`: Required, MARC record

    Returns:
        The (possibly modified) MARC record.
    """
    all245s = marc_record.get_fields("245")
    subfield_h_val = "[electronic resource]"
    if not all245s:
        # No title field to modify; hand the record back untouched.
        return marc_record
    field245 = all245s[0]
    marc_record.remove_field(field245)
    indicator1, indicator2 = field245.indicators
    subfield_a = ""
    a_subfields = field245.get_subfields("a")
    if a_subfields:
        subfield_a = a_subfields[0]
        # Strip a single trailing period or backslash before re-adding $a.
        if subfield_a and subfield_a[-1] in (".", "\\"):
            subfield_a = subfield_a[:-1].strip()
    new245 = Field(tag="245", indicators=[indicator1, indicator2],
                   subfields=["a", u"{0} ".format(subfield_a)])
    b_subfields = field245.get_subfields("b")
    c_subfields = field245.get_subfields("c")
    # Order for 245 subfields are:
    # $a $n $p $h $b $c
    for subfield_n in field245.get_subfields("n"):
        new245.add_subfield("n", subfield_n)
    for subfield_p in field245.get_subfields("p"):
        new245.add_subfield("p", subfield_p)
    # Trailing punctuation on $h depends on what follows: " / " before a
    # lone $c, " : " before $b, bare value otherwise.
    if c_subfields and not b_subfields:
        new245.add_subfield("h", "{0} / ".format(subfield_h_val))
    elif b_subfields:
        new245.add_subfield("h", "{0} : ".format(subfield_h_val))
    else:
        new245.add_subfield("h", subfield_h_val)
    for subfield_b in b_subfields:
        new245.add_subfield("b", subfield_b)
    for subfield_c in c_subfields:
        new245.add_subfield("c", subfield_c)
    marc_record.add_field(new245)
    return marc_record
#-------------------------------------------- # Create 506 field for the Rights statement rights = fields[6].strip() if not rights == '': rec_506 = Field(tag='506', indicators=[' ', ' '], subfields=['a', rights]) new_marc_rec.add_ordered_field(rec_506) #-------------------------------------------- # Create 260 field for the Publisher and Date Issued fields date = fields[7].strip() pub = fields[8].strip() rec_260 = Field(tag='260', indicators=[' ', ' ']) add_260 = False if not pub == '': rec_260.add_subfield('b', pub) add_260 = True if not date == '': rec_260.add_subfield('c', date) add_260 = True if add_260: new_marc_rec.add_ordered_field(rec_260) #-------------------------------------------- # Create 008 field with Date Issued as bytes 07-10 (Date1) and Language as bytes 35-37 # Descriptions of the 008 fields are at: http://www.oclc.org/bibformats/en/fixedfield.html # For breakdown of 008 byte positions, see: http://www.oclc.org/bibformats/en/fixedfield/008summary.html curr_date = datetime.date.today() yy = str(curr_date.year)[2:].zfill(2) mm = str(curr_date.month).zfill(2) dd = str(curr_date.day).zfill(2) entered = yy + mm + dd
def json_to_marc(infilename, outfilename):
    """Convert one JSON resource description into a binary MARC21 record file.

    Reads *infilename* (JSON), builds a pymarc Record with fixed 006/007/008
    fields plus ISSN, title, date, access, description, URL and holdings
    fields taken from the JSON, and writes it to *outfilename*. If the JSON
    lists subordinate resources, one additional "-subN" file is written per
    subordinate record.

    Parameters:
        `infilename`: path of the JSON input file
        `outfilename`: path of the ".marc" output file
    """
    print('Processing: ' + infilename)  # progress message
    # 'with' guarantees the input handle is closed (the original
    # json.load(open(...)) leaked it).
    with open(infilename, "r") as infile:
        data = json.load(infile)
    record = Record(force_utf8=True)  # create MARC record, enforce Unicode
    # add fields 006, 007 and 008 with minimal physical information to every marc file
    record.add_field(Field(tag='006', data="m"))
    record.add_field(Field(tag='007', data="cr"))
    # the iana language code from the json file is taken, checked against the
    # list of language codes, substituted with its iso639-2 equivalent and put
    # in position 21-24 of the field 008 content
    field008val = " o 0eng d"  # DEFAULT ENG
    try:
        if 'languages' in data and data['languages'][0] is not None:
            field008val = field008val[0:21] + lang_map.get(
                data['languages'][0], " ") + field008val[24:]
    except IndexError:
        # 'languages' was present but empty; keep the language bytes blank.
        field008val = field008val[0:21] + " " + field008val[24:]
    record.add_field(Field(tag='008', data=field008val))
    # extract issn, in json 'generic' and/or 'electronic', and put into
    # separate subfields of 022
    if "identifiers" in data and "issn" in data["identifiers"]:
        field_issn = Field(tag='022', indicators=['0', '#'])
        if "generic" in data["identifiers"]["issn"]:
            field_issn.add_subfield('a',
                                    data["identifiers"]["issn"]["generic"][0])
        if "electronic" in data["identifiers"]["issn"]:
            field_issn.add_subfield(
                'l', data["identifiers"]["issn"]["electronic"][0])
        record.add_field(field_issn)
    # title of the series or journal
    if data["is_part_of"] is not None and data["is_part_of"]['title_full']:
        record.add_field(
            Field(tag='245', indicators=['0', '0'],
                  subfields=["a", data["is_part_of"]["title_full"][:9000]]))
    if data["title"]:
        record.add_field(
            Field(tag='246', indicators=['0', '0'],
                  subfields=["a", data["title"][:9000]]))
    if data["year"]:
        record.add_field(
            Field(tag="260", indicators=["#", "#"],
                  subfields=["c", data["year"]]))
    # add field 506 to all records, as not present in all json files
    record.add_field(
        Field(tag='506', indicators=['0', '#'],
              subfields=["a", "Open access"]))
    # some json files contain a very long description; the maximum length of
    # data in a variable field in MARC21 is 9,999 bytes, so here only a
    # certain amount of content is put into the 520 field
    if data["description"]:
        record.add_field(
            Field(tag='520', indicators=['2', '#'],
                  subfields=["a", data["description"][:9000]]))
    # keep together the journal url, host and domain as different subfields of
    # field 856; check if either exists, before initializing a new field instance
    if data['url'] or (data['is_part_of'] is not None
                       and data['is_part_of']['url']):
        field = Field(tag='856', indicators=['0', '0'])
        if data['domain']:
            field.add_subfield('a', data['domain'])
        if data['is_part_of'] is not None and data['is_part_of']['url']:
            field.add_subfield('d', data['is_part_of']['url'])
        if data['url']:
            field.add_subfield('u', data['url'])
        record.add_field(field)
    if data["volume"]:
        record.add_field(
            Field(tag='866', indicators=['0', '0'],
                  subfields=["a", data["volume"]]))
    # output marc file with same filename in Output directory
    with open(outfilename, 'wb') as out:
        out.write(record.as_marc())
    # execute function for creating separate records for subordinate resources
    if data['subordinate_resources'] is not None:
        subordinate_records = create_subordinate_records(
            record, data['subordinate_resources'])
        # add counter and "-sub" to filenames of subordinate records
        for counter, subordinate_record in enumerate(subordinate_records):
            subname = outfilename.replace(".marc",
                                          "-sub" + str(counter) + ".marc")
            with open(subname, 'wb') as out:
                out.write(subordinate_record.as_marc())