Example #1
0
    def test_xml(self):
        """Round-trip a record through MARCXML and verify it is unchanged."""
        # parse the first record from the sample batch
        original = pymarc.parse_xml_to_array("test/batch.xml")[0]
        # serialize it back to XML, then re-parse the generated bytes
        roundtrip = pymarc.parse_xml_to_array(
            BytesIO(pymarc.record_to_xml(original))
        )[0]

        # leaders must survive the round trip
        self.assertEqual(original.leader, roundtrip.leader)

        fields_a = original.get_fields()
        fields_b = roundtrip.get_fields()
        self.assertEqual(len(fields_a), len(fields_b))

        # compare every field pairwise
        for fa, fb in zip(fields_a, fields_b):
            self.assertEqual(fa.tag, fb.tag)
            if fa.is_control_field():
                self.assertEqual(fa.data, fb.data)
            else:
                self.assertEqual(fa.get_subfields(), fb.get_subfields())
                self.assertEqual(fa.indicators, fb.indicators)
Example #2
0
    def test_xml(self):
        """Round-trip a gzipped MARCXML batch through record_to_xml and
        verify the record survives unchanged."""
        # read in xml to a record; close the gzip handle deterministically
        # (previously the handle was opened and never closed — resource leak)
        with gzip.open('test/batch.xml.gz', 'rb') as fh:
            record1 = pymarc.parse_xml_to_array(fh)[0]
        # generate xml
        xml = pymarc.record_to_xml(record1)
        # parse generated xml
        record2 = pymarc.parse_xml_to_array(six.BytesIO(xml))[0]

        # compare original and resulting record
        self.assertEqual(record1.leader, record2.leader)

        field1 = record1.get_fields()
        field2 = record2.get_fields()
        self.assertEqual(len(field1), len(field2))

        pos = 0
        while pos < len(field1):
            self.assertEqual(field1[pos].tag, field2[pos].tag)
            if field1[pos].is_control_field():
                self.assertEqual(field1[pos].data, field2[pos].data)
            else:
                self.assertEqual(field1[pos].get_subfields(), field2[pos].get_subfields())
                self.assertEqual(field1[pos].indicators, field2[pos].indicators)
            pos += 1
Example #3
0
    def test_xml_sort(self):
        """Serializing and re-parsing an out-of-order record should equal
        the pre-sorted reference record."""
        # round-trip the unsorted record through XML serialization
        unsorted = pymarc.parse_xml_to_array('test/order.xml')[0]
        record1 = pymarc.parse_xml_to_array(
            StringIO(pymarc.record_to_xml(unsorted)))[0]
        # reference record whose fields are already in order
        record2 = pymarc.parse_xml_to_array('test/order_ok.xml')[0]

        # leaders must match
        self.assertEqual(record1.leader, record2.leader)

        fields_a = record1.get_fields()
        fields_b = record2.get_fields()
        self.assertEqual(len(fields_a), len(fields_b))

        # compare every field pairwise
        for fa, fb in zip(fields_a, fields_b):
            self.assertEqual(fa.tag, fb.tag)
            if fa.is_control_field():
                self.assertEqual(fa.data, fb.data)
            else:
                self.assertEqual(fa.get_subfields(), fb.get_subfields())
                self.assertEqual(fa.indicators, fb.indicators)
 def test_load_title_missing_language(self):
     """Loading a record whose language code is unknown should raise
     core.models.Language.DoesNotExist from _extract_languages."""
     filename = abs_filename('./test-data/title-missing-language.xml')
     # first record of the MARCXML fixture
     marc = pymarc.parse_xml_to_array(filename)[0]
     loader = TitleLoader()
     title = Title.objects.get(lccn='sn83030846')
     self.assertRaises(core.models.Language.DoesNotExist,
                       loader._extract_languages, marc, title)
Example #5
0
def raw(marcfile, edition):
    """Parse the first MARC record from *marcfile* and strip control/local fields.

    Removes 001/003/005/006/007/856 and any field whose tag is above 900,
    then runs add_stuff() on the record and returns it.  *edition* is kept
    for interface compatibility but is not used here.
    """
    record = pymarc.parse_xml_to_array(marcfile)[0]
    # Fixed: iterate over a snapshot — removing fields while iterating the
    # record directly mutates the underlying field list and skips entries.
    for field in list(record):
        if field.tag in ('001', '003', '005', '006', '007', '856') or int(field.tag) > 900:
            record.remove_field(field)
    add_stuff(record)
    return record
Example #6
0
def xml_to_binary(rec, writer):
    """Parse MARCXML from *rec*, clean the first record and write it out.

    Returns True when a record was written, False when it was rejected
    (no subjects, or fewer than 9 fields).  NOTE(review): the returns
    inside the loop mean only the first parsed record is ever examined —
    behavior preserved, but confirm that is intended.
    """
    # parse records
    records = pymarc.parse_xml_to_array(rec)
    for r in records:
        # strip the vendor tag
        r.remove_fields('938')

        # records without subjects are rejected
        # (a dead `continue` that followed this return was removed)
        if not r.subjects():
            return False

        # require a minimum number of fields
        fields = r.get_fields()
        if len(fields) < 9:
            return False

        # determine the format and add the GMD to 245$h
        format = get_format(r)
        if format:
            gmd = '[' + format + ']'
            # NOTE(review): r['245'] is None when no 245 field exists,
            # which would raise TypeError here — assumes inputs always
            # carry a 245; confirm.
            if r['245']['h']:
                r['245']['h'] = gmd
            else:
                r['245'].add_subfield('h', gmd)
        # write the cleaned record
        writer.write(r)
        return True
def generate_marcfiles(reverse_order = False):
  """Yield (path, parsed records) pairs for every 19C_0* file in DATAROOT."""
  names = [name for name in os.listdir(DATAROOT) if name.startswith("19C_0")]
  # filenames are unique, so a reverse sort equals sort-then-reverse
  names.sort(reverse=reverse_order)
  for name in names:
    path = os.path.join(DATAROOT, name)
    yield (path, pymarc.parse_xml_to_array(path))
Example #8
0
    def search(self, query, page=1, size=20):
        """Run a paged search against the CDS-style API and parse the
        MARCXML response.

        Returns a dict with "total_num_hits" and "results" (parsed
        records).  Raises ServiceUnavailable when the request fails.
        """
        try:
            # The "sc" parameter (split by collection) is used to provide
            # search results consistent with the ones from the CDS website
            req = requests.get(
                self.baseURL + "/search",
                params={
                    "p": query,
                    "of": "xm",
                    "rg": size,
                    "jrec": int(size) * (int(page) - 1) + 1,
                },
            )
        except Exception:
            raise ServiceUnavailable("Cannot perform search")

        if not req.ok:
            raise ServiceUnavailable(f"Search failed with error code {req.status_code}")

        # Parse MARC XML
        records = pymarc.parse_xml_to_array(io.BytesIO(req.content))
        results = [self.parse_record(record) for record in records]

        total_num_hits = 0
        if records:
            # Total hit count is embedded in an HTML comment of the page.
            pattern = "<!-- Search-Engine-Total-Number-Of-Results:(.*?)-->"
            match = re.search(pattern, req.text)
            # Fixed: a missing marker previously crashed with
            # AttributeError on .group(1); fall back to 0 instead.
            if match:
                total_num_hits = int(match.group(1))

        return {"total_num_hits": total_num_hits, "results": results}
Example #9
0
def readMARCfromURL(url):
    """Print subfields $d and $p of every 031 field of the first record.

    NOTE(review): *url* is currently ignored — the commented-out requests
    call was replaced by reading the local file 'sources.xml'.  Confirm
    whether fetching from the URL should be restored.
    """
    #r = requests.get(url)

    #print(r.text[0:100])
    reader = parse_xml_to_array('sources.xml')
    record = reader[0]
    for field in record.get_fields('031'):
        print(field['d'], field['p'])
def generate_marcfiles(reverse_order=False):
    """Yield (path, parsed records) for each 19C_0* file under DATAROOT."""
    candidates = sorted(
        entry for entry in os.listdir(DATAROOT) if entry.startswith("19C_0"))
    if reverse_order:
        candidates = candidates[::-1]
    for entry in candidates:
        full_path = os.path.join(DATAROOT, entry)
        yield (full_path, pymarc.parse_xml_to_array(full_path))
Example #11
0
File: tests.py Project: edsu/id
 def test_marc_to_concept(self):
     """A MARC authority record should map onto a Concept's fields."""
     # first record of the MARCXML fixture
     r = parse_xml_to_array('test_data/record.xml')[0]
     c = create_concept(r)
     self.assertEqual(c.lccn, 'sh00000011')
     self.assertEqual(c.pref_label, 'ActionScript (Computer program language)')
     self.assertEqual(c.modified, datetime(2007, 10, 12, 7, 53, 10))
     self.assertEqual(c.created, date(2000, 9, 27))
     self.assertEqual(c.heading_tag, '150')
Example #12
0
def seekBookbyISBN(isbn, library="helmet"):
    """Look up a book by ISBN in the given library's catalogue.

    Returns a tuple (title, author, isbn, publisher, pubyear).  When a
    record carries several ISBNs, the longer of the first two is
    preferred (ISBN-13 over ISBN-10).
    """
    library = builcodes[library]

    result = urllib.request.urlopen(url + isbn + full + FLTR + library).read()
    result = json.loads(result)
    result = result.get('records')

    xmlmarc = result[0]['fullRecord']

    # Persist the MARCXML so pymarc can parse it from disk.
    # Fixed: the previous code wrote encoded bytes into a text-mode file,
    # which raises TypeError on Python 3 — write the string with an
    # explicit encoding instead.
    with open("kirja.xml", "w", encoding="UTF-8") as text_file:
        text_file.write(xmlmarc)

    from pymarc import parse_xml_to_array
    reader = parse_xml_to_array("kirja.xml")

    isbn = ""
    isbns = []

    for record in reader:
        # collect every 020$a (ISBN) with hyphens stripped
        for f in record.get_fields('020'):
            isbns.append(f['a'].replace("-", ""))

        try:
            isbn = record['020']['a']
        except TypeError:
            # record has no 020 field
            pass

    # prefer the longer of the first two collected ISBNs
    if len(isbns) > 1:
        if len(isbns[0]) > len(isbns[1]):
            isbn = isbns[0]

        if len(isbns[1]) > len(isbns[0]):
            isbn = isbns[1]

    title = getMarcValue(record, 'title')
    author = getMarcValue(record, 'author')
    publisher = getMarcValue(record, 'publisher').replace(",", "")
    pubyear = getMarcValue(record, 'pubyear').replace(".", "")

    return (title, author, isbn, publisher, pubyear)
Example #13
0
 def marc(self):
     """Return the first pymarc record parsed from self.marc_path.

     Prints a warning (and implicitly returns None) when the file is
     missing.  Python 2 code (print statement).
     """
     # use pymarc to read the marcxml to make fields available
     if os.path.exists(self.marc_path):
         # with codecs.open(self.marc_path, 'r', "utf-8") as marcdata:
         with open(self.marc_path, 'r') as marcdata:
             # reader = MARCReader(marcdata, utf8_handling='ignore')
             return pymarc.parse_xml_to_array(marcdata)[0]
     else:
         print "Check if file %s exists or your mount connection" % self.marc_path
Example #14
0
 def load(self, mrcfile: str = '', filetype: str = 'mrc') -> None:
     """Read records from *mrcfile* into self.records.

     filetype 'mrc' streams binary MARC via MARCReader; 'xml' parses a
     MARCXML file.  Any other filetype leaves self.records untouched.
     """
     if filetype == "xml":
         self.records = parse_xml_to_array(mrcfile)
     elif filetype == "mrc":
         with open(mrcfile, 'rb') as fh:
             # consume the reader, appending every record
             self.records.extend(MARCReader(fh))
     return None
Example #15
0
 def _record(self):
     if self._the_record:
         the_record = self._the_record
     else:
         the_record = pymarc.parse_xml_to_array(StringIO(self.guts))[0]
     for field in the_record.get_fields('856'):
         the_record.remove_field(field)
     self._the_record = the_record
     return the_record
Example #16
0
    def get_marc(self, url):
        """Fetch *url*, stash the MARCXML in a temp file and parse it.

        Returns the list of pymarc records.  The temporary file is
        always removed, even when parsing raises (previously it leaked
        on error).
        """
        xml = self.get_page(url)
        f = tempfile.NamedTemporaryFile(delete=False)
        try:
            f.write(xml)
            f.close()
            records = pymarc.parse_xml_to_array(f.name)
        finally:
            f.close()
            os.unlink(f.name)
        return records
Example #17
0
def marcxml2array(marcxml):
    """Serialize a MARCXML element tree and parse it into pymarc records.

    args:
        marcxml: an XML element (tree) of MARC data
    returns:
        list of pymarc records
    """
    serialized = ET.tostring(marcxml, encoding="utf-8")
    return parse_xml_to_array(BytesIO(serialized))
Example #18
0
 def __init__(self, record_element, strip_ns=True):
     """Wrap an OAI-PMH record element; for non-deleted records, parse the
     embedded MARCXML metadata into a dict on self.metadata."""
     super(SickleMARCRecord, self).__init__(record_element,
                                            strip_ns=strip_ns)
     if not self.deleted:
         # round-trip the <metadata> child through a temp file so that
         # pymarc's SAX-based parser can consume it
         marc_file = tempfile.TemporaryFile()
         metadata = self.xml.find(".//" + self._oai_namespace + "metadata/")
         marc_file.write(etree.tostring(metadata, encoding='utf-8'))
         marc_file.seek(0)
         records = parse_xml_to_array(marc_file)
         # keep only the first record, as a plain dict
         self.metadata = records[0].as_dict()
Example #19
0
 def _record(self):
     if self._the_record:
         the_record = self._the_record
     else:
         the_record = pymarc.parse_xml_to_array(
             BytesIO(bytes(self.guts, 'utf-8')))[0]
     for field in the_record.get_fields('856'):
         the_record.remove_field(field)
     self._the_record = the_record
     return the_record
 def set_search_strings(cls, f, *args):
     '''
     Function: set_search_strings

     Purpose: reads a file and stores a list of search strings on
     cls.search_strings.

     Parameters:
     f = path of the file containing the search strings.  May be a 'txt',
     'csv' or marcxml 'xml' file; the file extension selects the parser.

     t_a (optional) = type of search string to build from marcxml files.
         Appropriate values: 'title' (default) and 'title_author'.

     Example:
     f='<path/to/file>'
     set_search_strings(f,'title_author')
     '''
     import pymarc
     import marcx
     import pandas as pd
     import io
     t_a = args[0] if len(args) > 0 else None
     ext = f[-3:]
     if ext in ('csv', 'txt'):
         # one search string per line
         cls.search_strings = open(f).read().splitlines()
     if ext == 'xml':
         records = pymarc.parse_xml_to_array(io.open(f, mode='r', encoding='utf-8'))
         rows = []
         for rec in records:
             rec = marcx.FatRecord.from_record(rec)
             d = {}
             try:
                 d['author'] = rec['100']['a']
             except Exception:
                 # record has no 100$a
                 d['author'] = ''
             d['title'] = rec.title()
             d['mmsid'] = rec['001'].data
             # NOTE(review): title_author currently carries only the title
             # (the author concatenation was commented out upstream) —
             # behavior preserved; confirm intent.
             d['title_author'] = rec.title()
             rows.append(d)
         # Fixed: DataFrame.append was removed in pandas 2.0 — build the
         # frame from the collected rows instead.
         df_search = pd.DataFrame(rows)
         if t_a is None:
             t_a = 'title'
         if t_a == 'title_author':
             cls.search_strings = df_search['title_author']
         else:
             cls.search_strings = df_search['title']
Example #21
0
 def handle(self, **options):
     """Purge electronic-resource titles from the solr index and database.

     With --pretend, only prints the titles that would be deleted.
     Python 2 code (print statement).
     """
     for title in Title.objects.filter(urls__value__icontains='chroniclingamerica'):
         # first record of the stored MARCXML for this title
         record = pymarc.parse_xml_to_array(StringIO(title.marc.xml))[0]
         if record['245']['h'] == '[electronic resource].':
             if options['pretend']:
                 print title
             else:
                 # NOTE(review): the "%s" placeholders on the next line are
                 # never filled in, so the literal string is logged —
                 # compare the purge message below; confirm intended args.
                 LOGGER.info("deleting %s [%s] from solr index")
                 index.delete_title(title)
                 LOGGER.info("purging %s [%s]" % (title, title.lccn))
                 title.delete()
     if not options['pretend']:
         index.commit()
Example #22
0
def convert_xml_to_marc(hostenv):
    """Convert every MARC XML file in the configured marc_dir to a MARC
    formatted .mrc file alongside it."""
    marc_dir = app_configs[hostenv]['marc_dir']
    for marcfilename in os.listdir(marc_dir):
        if marcfilename[-3:] == 'xml':
            newfilename = re.sub("-orig.xml", "-marc.mrc", marcfilename)
            logging.info("Converting to MARC %s", marcfilename)
            marc_recs_out = pymarc.MARCWriter(
                open(os.path.join(marc_dir, newfilename), 'wb'))
            try:
                # Fixed: the source path was previously built without a
                # separator (marc_dir + marcfilename), inconsistent with
                # the output path — use os.path.join for both.
                marc_xml_array = pymarc.parse_xml_to_array(
                    os.path.join(marc_dir, marcfilename))
                for rec in marc_xml_array:
                    marc_recs_out.write(rec)
            finally:
                # always release the output file, even on parse errors
                marc_recs_out.close()
 def handle(self, **options):
     """Purge electronic-resource titles from the solr index and database.

     With --pretend, only lists the titles that would be deleted.
     """
     for title in Title.objects.filter(urls__value__icontains='chroniclingamerica'):
         # first record of the stored MARCXML for this title
         record = pymarc.parse_xml_to_array(StringIO(title.marc.xml))[0]
         if record['245']['h'] == '[electronic resource].':
             if options['pretend']:
                 # stdout.write expects a string, not a Title instance
                 self.stdout.write(str(title))
             else:
                 # Fixed: the placeholders were previously left unfilled,
                 # writing the literal "%s [%s]" text.
                 self.stdout.write("deleting %s [%s] from solr index" % (title, title.lccn))
                 index.delete_title(title)
                 self.stdout.write("purging %s [%s]" % (title, title.lccn))
                 title.delete()
     if not options['pretend']:
         index.commit()
Example #24
0
def read_marc(filename):
    """Read MARC record from filename.

    Takes just the first record if there are multiple ones.
    """
    logging.info("Reading %s" % (filename))
    records = parse_xml_to_array(filename)
    count = len(records)
    if count == 0:
        # nothing parseable — treat as fatal
        msg = "No records in %s, aborting" % (filename)
        logging.error(msg)
        raise Exception(msg)
    if count > 1:
        logging.info("Have taken first of %d records from %s" % (count, filename))
    return records[0]
Example #25
0
    def __get_records(self, acnr):
        """Get all records containing acnr and return a list of pymarc.Record objects.

        Pages through the Alma SRU endpoint 50 records at a time; returns
        None when the query matches nothing.
        """

        # Namespaces for the responses from Alma
        ns = {'marc': 'http://www.loc.gov/MARC21/slim',
              'srw': 'http://www.loc.gov/zing/srw/'}
        # Template for the MARC-XML collection the records accumulate into
        marc_template = """<marc:collection xmlns:marc="http://www.loc.gov/MARC21/slim" 
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
        xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd"/>"""
        xml_records = ET.fromstring(marc_template)

        # get the records from Alma
        offset = 1
        sru_request = "https://obv-at-obvsg.alma.exlibrisgroup.com/view/sru/43ACC_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query=other_system_number={acnr}&startRecord={offset}&maximumRecords=50"

        # get the first 50 records
        res = requests.get(sru_request.format(acnr=acnr, offset=offset))

        # check how many records there are. If there are none, return None
        res_xml = ET.fromstring(res.text)
        numberOfRecords = int(res_xml.find("srw:numberOfRecords", ns).text)
        if numberOfRecords == 0:
            return None

        # add the records to the record list
        for record in res_xml.findall('.//marc:record', ns):
            xml_records.append(record)

        # repeat the request with increasing offset to get all records.
        # Fixed off-by-one: the previous condition
        # (offset < numberOfRecords - 50) skipped the final page whenever
        # the total was exactly offset + 50 (e.g. missed record 51 of 51).
        while offset + 50 <= numberOfRecords:
            offset += 50
            res = requests.get(sru_request.format(acnr=acnr, offset=offset))
            res_xml = ET.fromstring(res.text)
            # add the records to the record list
            for record in res_xml.findall('.//marc:record', ns):
                xml_records.append(record)

        # convert xml element to file-like-object, so pymarc can parse it
        marcfile = io.StringIO(ET.tostring(xml_records, encoding="unicode"))

        # parse the xml and collect the pymarc.Record objects
        with marcfile:
            pymarc_records = list(pymarc.parse_xml_to_array(marcfile))

        return pymarc_records
Example #26
0
 def default_map(self, file_name, xpath):
     """Map a test MARCXML file through the bib mapper and also return the
     raw source data selected by *xpath* for comparison.

     Returns [result, data] where result is the mapped instance (schema
     validated when configured) and data is the concatenated
     pretty-printed XML of the matching source elements.
     """
     ns = {'marc': 'http://www.loc.gov/MARC21/slim',
           'oai': 'http://www.openarchives.org/OAI/2.0/'}
     file_path = r'./tests/test_data/default/{}'.format(file_name)
     # first record of the test fixture
     record = pymarc.parse_xml_to_array(file_path)[0]
     result = self.mapper.parse_bib(record, "source")
     if self.config.validate_json_schema:
         validate(result, self.instance_schema)
     # re-read the raw XML to extract the spans the test compares against
     root = etree.parse(file_path)
     data = str('')
     for element in root.xpath(xpath, namespaces=ns):
         data = ' '.join(
             [data, str(etree.tostring(element, pretty_print=True), 'utf-8')])
     return [result, data]
def addLBD(config, oclcnumber, note):
    """Create a Local Bibliographic Data record for *oclcnumber* carrying
    *note*, POST it to the metadata service, and return a pandas Series of
    (oclcnumber, accessionNumber, status)."""
    oauth_session = config.get('oauth-session')
    # create the LBD
    record = Record(leader='00000n   a2200000   4500')
    record.add_field(Field(tag='004', data=oclcnumber))
    record.add_field(
        Field(indicators=[' ', ' '], tag='500', subfields=['a', note]),
        Field(indicators=[' ', ' '],
              tag='935',
              subfields=['a', str(time.time())]),
        Field(indicators=[' ', ' '],
              tag='940',
              subfields=['a', config.get('oclcSymbol')]))
    input = pymarc.record_to_xml(record).decode("utf-8")

    # Fixed: accessionNumber was previously unbound when the POST raised
    # HTTPError, producing a NameError at the return below.
    accessionNumber = ""
    try:
        r = oauth_session.post(
            config.get('metadata_service_url') + "/lbd/data",
            data=input,
            headers={
                "Accept":
                'application/atom+xml;content="application/vnd.oclc.marc21+xml"',
                "Content-Type": "application/vnd.oclc.marc21+xml"
            })
        # Fixed: raise_for_status was referenced but never called (missing
        # parentheses made it a no-op attribute access).
        r.raise_for_status()
        try:
            result = ElementTree.fromstring(r.content)
            ns = {
                'atom': 'http://www.w3.org/2005/Atom',
                'wc': 'http://worldcat.org/rb'
            }
            marcNode = result.findall('atom:content/wc:response',
                                      ns)[0].getchildren()[0]
            # need to get this XML section out as a string and into a
            # file like object so pymarc can parse it
            marcData = StringIO(
                ElementTree.tostring(marcNode,
                                     encoding='unicode',
                                     method='xml'))
            marcRecords = pymarc.parse_xml_to_array(marcData)
            print(marcRecords)
            # pull out the LBD accession number
            accessionNumber = marcRecords[0]['001'].value()
            status = "success"
        except xml.etree.ElementTree.ParseError as err:
            accessionNumber = ""
            status = "failed XML parsing issue"
            print(err)
    except requests.exceptions.HTTPError as err:
        status = "failed"
    return pd.Series([oclcnumber, accessionNumber, status])
Example #28
0
def batch_to_list(infile):
    """Take a filename of a marc-file (binary or xml)and return a list of pymarc.Record objects."""
    with open(infile, "rb") as fh:
        # sniff the first line to decide between MARCXML and binary MARC
        is_xml = b"<?xml version" in fh.readline()
        # rewind so the chosen reader sees the whole file
        fh.seek(0)
        if is_xml:
            reader = pymarc.parse_xml_to_array(fh)
        else:
            # default: utf8_handling="strict"
            reader = pymarc.MARCReader(fh)
        record_list = list(reader)
    return record_list
def download_infoscience_labs():
    """Download the Lab collection from Infoscience and return a dict
    mapping lab code -> {'uid', 'recid', 'manager', 'liaison'}."""
    INFOSCIENCE_API_KEY = 'Token ' + read_api_key()

    labs = dict()
    search_url = "https://infoscience.epfl.ch/api/v1/search?p=&cc=Lab&c=Lab&format=files"
    #search_url = "https://infoscience.epfl.ch/api/v1/search?p=Shchutska&cc=People&c=People&format=files"
    headers = {'User-Agent': 'Custom FORCE_SCRIPT_NAME = None', 'Authorization': INFOSCIENCE_API_KEY}
    r = requests.get(search_url, headers=headers, stream=True)
    data = r.content
    # keep a raw copy of the downloaded zip archive for debugging
    dump = open('dump.dat', 'wb')
    dump.write(data)
    dump.close()
    z = zipfile.ZipFile(io.BytesIO(data))
    for x in z.infolist():
        # each lab's MARCXML lives in a metadata.xml member of the archive
        if x.filename.find('metadata.xml') > 0:
            recid = ""
            lab_code = ""
            lab_uid = ""
            liaison_librarian = "Unknown"
            infoscience_manager = ""
            metadata = z.read(x.filename)
            pseudofile = io.StringIO(metadata.decode('utf-8'))
            records = pymarc.parse_xml_to_array(pseudofile)
            for field in records[0].fields:
                if field.tag == '001':
                    recid = field.data
                if field.tag > '010':
                    # subfields is treated as a flat [code, value, ...]
                    # list — assumes pre-3.0 pymarc layout; confirm
                    codes = field.subfields[0:len(field.subfields):2]
                    values = field.subfields[1:len(field.subfields):2]
                    field_subfields = dict(zip(codes, values))
                    if field.tag == '195':
                        lab_code = field_subfields['a']
                    if field.tag == '371':
                        lab_uid = field_subfields['g']
                    if field.tag == '270':
                        try:
                            infoscience_manager = field_subfields['m']
                        except KeyError:
                            print('manager', field)
                    if field.tag == '271':
                        try:
                            liaison_librarian = field_subfields['p']
                        except KeyError:
                            print('liaison', field)
            labs[lab_code] = {'uid': lab_uid, 'recid': recid, 'manager': infoscience_manager, 'liaison': liaison_librarian}

    return labs
def extract_lines(record_xml):
    """Parse every record of a MARCXML file and return one concatenated
    DataFrame of rows produced by parse_marc()."""
    sourcename = record_xml.split('/')[-1].replace('.xml', '')
    records = pymarc.parse_xml_to_array(record_xml)

    parsed_all = []
    index = 0
    for rec in records:
        index += 1
        parsed_all.append(parse_marc(rec, source=sourcename, index=index))
        # progress marker every 1000 records
        if index % 1000 == 0:
            print(f"# of extracted records: {index} ...")
            # logging.info(f"# of extracted records: {index} ...")

    print(f"Extracted {index} records from {sourcename}.xml")
    logging.info(f"Extracted {index} records from {sourcename}.xml")
    return pd.concat(parsed_all)
Example #31
0
def main(argv):
    """Convert a MARCXML file to MARC21 binary, skipping records that
    lack a 583 field.  Output goes to wau.alma.archived.<YYYYMMDD>.mrc.
    Python 2 script (uses unicode()).
    """
    if len(argv) != 2:
        usage(sys.stderr)
        sys.exit(1)

    # inputs
    inFile = argv[1]

    # filecheck inputs
    fileCheck(inFile)

    # output file
    outFile = 'wau.alma.archived.' + time.strftime("%Y%m%d") + '.mrc'


    # file streams
    writer = codecs.open(outFile, 'wb', 'utf-8')

    #------------------------------------------------------------------#
    # Read an MMS ID, make a REST Call, Store Both IDs
    #------------------------------------------------------------------#
    print('Reading MARCXML file...')
    records = pymarc.parse_xml_to_array(inFile)

    count = 0
    for rec in records:
        # force utf-8
        rec.force_utf8 = True

        # skip over any entries that have an empty 583 field
        if len(rec.get_fields('583')) == 0:
            print('Blank 583 field: skipping ' +  rec['001'].value() + ' / ' + rec['004'].value())
            continue;

        count = count + 1

        # get string representation of marc
        marc = rec.as_marc()
        # decode character set
        marc = marc.decode('utf-8')

        # output
        writer.write(marc)
    # end for loop

    print('Finished. ' + unicode(count) + ' MARCXML records converted to MARC21 binary.')
def download_infoscience_authors():
    """Download the People collection from Infoscience and return a dict
    mapping author name -> list of (sciper, recid, labs) tuples."""
    INFOSCIENCE_API_KEY = 'Token ' + read_api_key()

    authors = dict()
    search_url = "https://infoscience.epfl.ch/api/v1/search?p=&cc=People&c=People&format=files"
    #search_url = "https://infoscience.epfl.ch/api/v1/search?p=Shchutska&cc=People&c=People&format=files"
    headers = {'User-Agent': 'Custom FORCE_SCRIPT_NAME = None', 'Authorization': INFOSCIENCE_API_KEY}
    r = requests.get(search_url, headers=headers, stream=True)
    data = r.content
    # keep a raw copy of the downloaded zip archive for debugging
    dump = open('dump.dat', 'wb')
    dump.write(data)
    dump.close()
    z = zipfile.ZipFile(io.BytesIO(data))
    for x in z.infolist():
        names = []
        labs = []
        if x.filename.find('metadata.xml') > 0:
            # Fixed: recid previously stayed unbound when a record had no
            # 001 field, crashing the tuple construction below.
            recid = ""
            metadata = z.read(x.filename)
            pseudofile = io.StringIO(metadata.decode('utf-8'))
            records = pymarc.parse_xml_to_array(pseudofile)
            try:
                names.append(records[0]['100']['a'])
                sciper = records[0]['935']['a']
            except Exception:
                # Fixed: the handler previously referenced the undefined
                # name `inspire_recid`, raising NameError instead of
                # reporting the record; report the archive member instead.
                print('Extracting names:', x.filename, records[0].as_json(indent=2))
                sciper = ''
            for field in records[0].fields:
                if field.tag == '001':
                    recid = field.data
                if field.tag == '400':
                    names.append(field.subfields[1])
                if field.tag == '790':
                    # subfields is a flat [code, value, ...] list
                    codes = field.subfields[0:len(field.subfields):2]
                    values = field.subfields[1:len(field.subfields):2]
                    labs_positions = getIndexPositions(codes, 'a')
                    labs = [values[k] for k in labs_positions]
                    # print(labs)
            for name in names:
                if name not in authors:
                    authors[name] = [(sciper, recid, labs)]
                else:
                    if (sciper, recid) != authors[name][0:2]:
                        print(recid, name, 'already in database:', authors[name])
                        authors[name].append((sciper, recid, labs))
    return authors
    def test_encoding(self):
        """Round-trip a record through record_to_xml(encoding='utf-8') and
        verify both plain-ASCII and diacritic fields survive."""
        # Create a record
        record1 = pymarc.Record()
        # Add a field containing no diacritics
        record1.add_field(
            pymarc.Field(
                tag='245',
                indicators=[' ', ' '],
                subfields=[
                    'a',
                    'Report of the Committee on the Peaceful Uses of Outer Space'
                ]))
        # And a field containing diacritics
        record1.add_field(
            pymarc.Field(
                tag='246',
                indicators=[' ', ' '],
                subfields=[
                    'a',
                    "Rapport du Comité des utilisations pacifiques de l'espace extra-atmosphérique"
                ]))
        # Serialize with an explicit encoding, then parse the result back
        record_xml = pymarc.marcxml.record_to_xml(record1, encoding='utf-8')
        record2 = pymarc.parse_xml_to_array(six.BytesIO(record_xml))[0]

        # If this passes alongside the round-trip tests above, the encoding
        # parameter of marcxml.record_to_xml did not break basic behavior.
        self.assertEqual(record1.leader, record2.leader)

        original_fields = record1.get_fields()
        parsed_fields = record2.get_fields()
        self.assertEqual(len(original_fields), len(parsed_fields))

        # compare every field pairwise
        for left, right in zip(original_fields, parsed_fields):
            self.assertEqual(left.tag, right.tag)
            if left.is_control_field():
                self.assertEqual(left.data, right.data)
            else:
                self.assertEqual(left.get_subfields(), right.get_subfields())
                self.assertEqual(left.indicators, right.indicators)
Example #34
0
def splitMarcFile(marcFilename, saveLocation):
    """For each record in a MARCXML batch, download its 856$u PDF (when
    the URL ends in .pdf) and save the record as XML under saveLocation.

    Returns the number of records in the file.  Python 2 script (print
    statements, urllib.urlretrieve).
    """
    records = parse_xml_to_array(marcFilename)
    for record in records:
        print "Saving record for " + record.title()
        if '856' in record:
            url = record['856']['u']
            if url.split('.')[-1] == "pdf":
                print "Downloading PDF"
                # output name comes from the record's 001 control number
                outputFilename = os.path.join(saveLocation,record['001'].format_field())
                try:
                    urllib.urlretrieve(record['856']['u'], outputFilename+'.pdf')
                except:
                    print "Error: Cannot download pdf file " + url
                try:
                    writeRecordToFile(record, outputFilename+".xml")
                except:
                    print "Error: Cannot write record to "+outputFilename+".xml"
    return len(records)
Example #35
0
    def search_by_id(self, recid):
        """Fetch a single record by id and return {"result": [...]}.

        The list is empty when the response is not parseable MARCXML
        (e.g. an authentication page was returned instead).
        """
        result = []

        try:
            # The "sc" parameter (split by collection) is used to provide
            # search results consistent with the ones from the CDS website
            req = requests.get(self.get_record_url(recid), params={"of": "xm"})
        except Exception:
            raise ServiceUnavailable("Cannot perform search")

        if req.ok:
            try:
                parsed = pymarc.parse_xml_to_array(io.BytesIO(req.content))
                result.append(self.parse_record(parsed[0]))
            except SAXParseException:
                # If authentication failed page is returned
                result = []

        return {"result": result}
def main():
    """Build ebooks.mrc from the Open Library URL list (Python 2 script).

    Reads openlib_url_list.tsv, resolves each Open Library URL to its
    Internet Archive identifier via the .json endpoint, fetches the IA
    MARCXML for the electronic edition, updates it, and appends it to a
    single UTF-8 MARC output file.
    """
#    with pymarc.MARCWriter(file(DATA_DIR+'ebooks.mrc','wb')) as writer:
    writer = pymarc.MARCWriter(codecs.open(DATA_DIR+'ebooks.mrc','w','utf-8'))
    count = written = 0
    for line in codecs.open(DATA_DIR+'openlib_url_list.tsv', encoding='utf-8'):
        count += 1
        if count < 2:
            continue # skip header line
        #url = line.rstrip('\n').split('\t')[6].replace('https:','http:')
        url = line.rstrip('\n').replace('https:','http:')
        print '  ',url
        # Truncate to the first 5 path components and append .json to hit
        # the Open Library JSON endpoint for the work/edition.
        jsonurl = '/'.join(url.split('/')[0:5])+'.json'
        json = get_json(jsonurl)
        if 'ocaid' in json:
            ia = json['ocaid']
            # This is the Internet Archive version of the MARC record for the electronic version e.g.
            #   https://archive.org/download/myantonia00cathrich/myantonia00cathrich_archive_marc.xml
            # not the original libraries MARC record for the paper book e.g.
            #   https://archive.org/download/myantonia00cathrich/myantonia00cathrich_marc.xml
            marcurl = 'http://archive.org/download/%s/%s_archive_marc.xml' % (ia,ia)
            # Retry up to 3 times; IA sometimes serves a transient non-XML
            # error page, which surfaces as a SAXParseException.
            # NOTE(review): '== None' should be 'is None'; and if all 3
            # attempts fail, records stays None and records[0] below raises
            # TypeError — confirm crash-on-failure is intended.
            records = None
            retries = 0
            while retries < 3 and records == None:
                retries += 1
                try:
                    records = pymarc.parse_xml_to_array(marcurl)
                except SAXParseException:
                    records = None
            record = update_marc_record(records[0],ia,url)
#            record.force_utf8 = True
            # NOTE(review): bare except below also swallows KeyboardInterrupt;
            # consider 'except Exception'.
            try:
                writer.write(record)
                written += 1
            except:
                print '** failed to write MARC record for ',marcurl
                print traceback.format_exc()
                #for field in record.fields:
                #    print str(field)
        else:
            print '** Unexpectedly missing ocaid for ',jsonurl
    writer.close()
    print 'Wrote %d of %d MARC records' % (written, count-1)
Example #37
0
 def next(self):
     '''Return MARC records in sets to controller.
     Break when last record position == num_records
     '''
     # End of result set reached -> end iteration (Python 2 iterator protocol).
     if self.current_record >= self.num_records:
         raise StopIteration
     # get chunk from self.current_record to self.current_record + page_size
     tree = self.get_current_xml_tree()
     recs_xml = tree.findall('.//zs:record', self.ns)
     # advance current record to end of set
     # NOTE(review): assumes recs_xml is non-empty; an empty page would
     # raise IndexError here — confirm the server never returns one.
     self.current_record = int(recs_xml[-1].find(
                             './/zs:recordPosition', self.ns).text)
     self.current_record += 1
     # translate to pymarc records & return
     # pymarc wants a real file object, so round-trip the tree through a
     # temp file before parsing.
     marc_xml_file = tempfile.TemporaryFile()
     marc_xml_file.write(ET.tostring(tree))
     marc_xml_file.seek(0)
     recs = [rec.as_dict() for rec in
             pymarc.parse_xml_to_array(marc_xml_file) if rec is not None]
     return recs
def main():
    #    with pymarc.MARCWriter(file(DATA_DIR+'SCCLclassics.mrc','wb')) as writer:
    writer = pymarc.MARCWriter(codecs.open(DATA_DIR + "SCCLclassics.mrc", "w", "utf-8"))
    count = written = 0
    for line in codecs.open(DATA_DIR + "SCCL classics candidates - v3 selected.tsv", encoding="utf-8"):
        count += 1
        if count < 2:
            continue  # skip header line
        url = line.rstrip("\n").split("\t")[6].replace("https:", "http:")
        print "  ", url
        jsonurl = "/".join(url.split("/")[0:5]) + ".json"
        json = get_json(jsonurl)
        if "ocaid" in json:
            ia = json["ocaid"]
            # This is the Internet Archive version of the MARC record for the electronic version e.g.
            #   https://archive.org/download/myantonia00cathrich/myantonia00cathrich_archive_marc.xml
            # not the original libraries MARC record for the paper book e.g.
            #   https://archive.org/download/myantonia00cathrich/myantonia00cathrich_marc.xml
            marcurl = "http://archive.org/download/%s/%s_archive_marc.xml" % (ia, ia)
            records = None
            retries = 0
            while retries < 3 and records == None:
                retries += 1
                try:
                    records = pymarc.parse_xml_to_array(marcurl)
                except SAXParseException:
                    records = None
            record = update_marc_record(records[0], ia, url)
            #            record.force_utf8 = True
            try:
                writer.write(record)
                written += 1
            except:
                print "** failed to write MARC record for ", marcurl
                print traceback.format_exc()
                # for field in record.fields:
                #    print str(field)
        else:
            print "** Unexpectedly missing ocaid for ", jsonurl
    writer.close()
    print "Wrote %d of %d MARC records" % (written, count - 1)
def hathi_record_yielder(
        filenames,
        sample_files=100,
        sample_records=100
        ):
    """Return a generator that cycles, one at a time, through all the records.

    Each yielded object is a pymarc record (rebranded as BRecord).

    pymarc has a native method for parsing multirecord MARCXML, but that
    relies on reading the entire giant files into the DOM.  That's a waste
    of time, so records are chunked out by hand between the '<record>' and
    '</record>' lines.  This method may not work on non-Hathi MARC files if
    they use different patterns of newlines.

    :param filenames: iterable of paths; only those ending in ".xml" are read.
    :param sample_files: percentage (0-100) of files to sample.
    :param sample_records: percentage (0-100) of records within each file.
    """
    for filename in filenames:
        # 'filename' and 'record_xml' renamed from 'file' and 'buffer',
        # which shadowed builtins.
        if filename.endswith(".xml") and random.random() <= (sample_files / float(100)):
            logging.info("Parsing new XML file " + filename)
            record_xml = ""
            in_record = False
            for line in open(filename, "r"):
                if "<record>" in line:
                    # Decide per record whether it makes the sample.
                    if random.random() <= (sample_records / float(100)):
                        in_record = True
                if in_record:
                    record_xml += line
                if "</record>" in line and in_record:
                    in_record = False
                    records = pymarc.parse_xml_to_array(cStringIO.StringIO(record_xml))
                    record_xml = ""
                    for record in records:
                        # Rebrand the parsed record as the project's BRecord
                        # subclass without copying it.
                        record.__class__ = BRecord
                        yield record
Example #40
0
 def next(self):
     '''Return MARC records in sets to controller.
     Break when last record position == num_records
     '''
     # Everything consumed -> signal end of iteration.
     if self.current_record >= self.num_records:
         raise StopIteration
     # Fetch the next chunk of results as an XML tree.
     tree = self.get_current_xml_tree()
     chunk = tree.findall('.//zs:record', self.ns)
     # Move the cursor to one past the last position in this chunk.
     last_pos = chunk[-1].find('.//zs:recordPosition', self.ns)
     self.current_record = int(last_pos.text) + 1
     # pymarc needs a real file object, so serialize the tree to a
     # temporary file and parse it back.
     xml_fh = tempfile.TemporaryFile()
     xml_fh.write(ET.tostring(tree))
     xml_fh.seek(0)
     parsed = pymarc.parse_xml_to_array(xml_fh)
     return [rec.as_dict() for rec in parsed if rec is not None]
    def metadata(self):
        """
        Fetch additional information about a volume from the HathITrust Bibliographic API.

        See: https://www.hathitrust.org/bib_api

        :return: A `pymarc` record. See pymarc's documentation for details on using it.
        """
        # Cached from a previous call — no need to hit the API again.
        if self._metadata:
            return self._metadata

        logging.debug("Looking up full metadata for {0}".format(self.id))
        data = requests.get(self.ht_bib_url).json()

        # Follow the record that the requested item points at.
        record_id = data['items'][0]['fromRecord']
        marc = data['records'][record_id]['marc-xml']

        # pymarc only reads file objects, so wrap the XML text in a stream.
        xml_stream = StringIO(marc)
        record = pymarc.parse_xml_to_array(xml_stream)[0]
        xml_stream.close()

        self._metadata = record
        return self._metadata
Example #42
0
    def test_parse_to_array(self):
        """Parsing test/batch.xml yields two Records with known contents."""
        records = pymarc.parse_xml_to_array('test/batch.xml')

        # exactly two records, both pymarc.Record instances
        self.assertEqual(len(records), 2)
        self.assertEqual(type(records[0]), pymarc.Record)
        self.assertEqual(type(records[1]), pymarc.Record)

        # the first record carries 18 fields
        first = records[0]
        self.assertEqual(len(first.get_fields()), 18)

        # control field 008 comes through verbatim
        self.assertEqual(first['008'].data,
                         u'910926s1957    nyuuun              eng  ')

        # data field 245: indicators and subfields survive the round trip
        title = first['245']
        self.assertEqual(title.indicator1, '0')
        self.assertEqual(title.indicator2, '4')
        self.assertEqual(title['a'], u'The Great Ray Charles')
        self.assertEqual(title['h'], u'[sound recording].')
Example #43
0
    def test_parse_to_array(self):
        """Parsing test/batch.xml yields two Records with the expected contents."""
        records = pymarc.parse_xml_to_array('test/batch.xml')
        self.assertEqual(len(records), 2)

        # should've got two records
        self.assertEqual(type(records[0]), pymarc.Record)
        self.assertEqual(type(records[1]), pymarc.Record)

        # first record should have 18 fields
        record = records[0]
        self.assertEqual(len(record.get_fields()), 18)

        # check the content of a control field
        self.assertEqual(record['008'].data,
                         u'910926s1957    nyuuun              eng  ')

        # check a data field with subfields
        field = record['245']
        self.assertEqual(field.indicator1, '0')
        self.assertEqual(field.indicator2, '4')
        self.assertEqual(field['a'], u'The Great Ray Charles')
        self.assertEqual(field['h'], u'[sound recording].')
Example #44
0
    def metadata(self):
        """
        Fetch additional information about a volume from the HathITrust Bibliographic API.

        See: https://www.hathitrust.org/bib_api

        :return: A `pymarc` record. See pymarc's documentation for details on using it.
        """
        # Lazily cached so the HTTP lookup happens at most once per volume.
        if not self._extra_metadata:
            logging.debug("Looking up full metadata for {0}".format(self.id))
            data = requests.get(self.ht_bib_url).json()

            # Follow the record that the requested item points at.
            record_id = data['items'][0]['fromRecord']
            marc = data['records'][record_id]['marc-xml']

            # Pymarc only reads a file, so stream the text as if it was one
            xml_stream = StringIO(marc)
            xml_record = pymarc.parse_xml_to_array(xml_stream)[0]
            xml_stream.close()

            self._extra_metadata = xml_record
        return self._extra_metadata
Example #45
0
            continue
        cleaned_subfields.append(code)
        cleaned_subfields.append(value)
    return cleaned_subfields


inputfilename = "30_input.xml"
outputfilename = "30_output.mrc"

if len(sys.argv) >= 3:
    inputfilename, outputfilename = sys.argv[1:3]

inputfile = io.open(inputfilename, "rb")
outputfile = io.open(outputfilename, "wb")

reader = pymarc.parse_xml_to_array(inputfile)

for oldrecord in reader:

    newrecord = marcx.Record()
    newrecord.strict = False

    # prüfen, ob Titel vorhanden ist
    if not oldrecord["245"]:
        continue

    # leader
    newrecord.leader = "     " + oldrecord.leader[5:]
    if len(newrecord.leader) < 9:
        logging.debug("too short %s: %s", len(newrecord.leader), newrecord.leader)
        continue
		if os.path.exists(src_folder+src):
			shutil.move(src_folder+src, dest_folder+src)


# (Re)Process the records

# Convert the individual marcxml_in files to raw marc and write them all to a single .mrc file
# OUTPUT FILE
marcRecsOut_orig_recs = pymarc.MARCWriter(file(aco_globals.batch_folder+'/'+batch_name+'_0_orig_recs.mrc', 'w'))

marcxml_dir = aco_globals.batch_folder+'/marcxml_in'
for filename in os.listdir(marcxml_dir):
	file_path = os.path.join(marcxml_dir,filename)
	if os.path.isfile(file_path):
		if file_path[-3:]=='xml':
			marc_xml_array = pymarc.parse_xml_to_array(file_path)
			for rec in marc_xml_array:
				rec = aco_functions.pad_008(rec)
				rec_001 = rec.get_fields('001')[0]
				print rec_001
				marcRecsOut_orig_recs.write(rec)
marcRecsOut_orig_recs.close()

# Extract the OCLC numbers from each record and write records to .mrc and .txt files depending if record contains OCLC number or not
# INPUT FILE
marcRecsIn_orig_recs = pymarc.MARCReader(file(aco_globals.batch_folder+'/'+batch_name+'_0_orig_recs.mrc'), to_unicode=True, force_utf8=True)

# OUTPUT FILES
try:
	os.makedirs(aco_globals.batch_folder+'/'+batch_name+'_1/')
except OSError as exception:
Example #47
0
import pymarc
import time

# Harvest MARCXML records from a list of URLs (one per line in
# marc_urls.txt) into a single binary MARC file, logging any URL that
# fails to fetch/parse/write.  (Python 2 script: uses the 'file' builtin.)
writer = pymarc.MARCWriter(file('free_ebooks.marc', 'w'))
error = open('errors.txt', 'w')

count = 0

for line in open('marc_urls.txt'):
    try:
        marcrec = pymarc.parse_xml_to_array(line)
        writer.write(marcrec[0])
    except Exception:
        # Narrowed from a bare except so Ctrl-C still stops the crawl;
        # the failing URL is recorded for a later retry.
        error.write(line)

    count += 1
    time.sleep(0.3) #try to be nice to IA servers
    if (count % 1000) == 0:
        time.sleep(600) #try to be nice to IA servers

writer.close()

            
            
            
# Extract title / year / Dewey / author columns from a large MARCXML dump
# into parallel lists.  (out_file is presumably written further down —
# not visible in this chunk.)
import pymarc
import os
import pandas as pd

data_dir = '/Volumes/Data/LibraryVis'
out_file = 'TOC_800s_subject.csv'

# For our 5.8 Gb MARCXML file, this takes a long time (but < 1.5 hrs) and 32 Gb RAM...
records = pymarc.parse_xml_to_array(os.path.join(data_dir, 'TOC_2000bis2016.mrc.xml'))

titles = [r.title() for r in records]

# Not going with int() because some have things like 'c 2005'
years = [r.pubyear() for r in records]

# Only really need part of the dewey field
deweys = [r.get_fields('082')[0].get_subfields('a')[0] if '082' in r else '' for r in records]

# Getting rid of extra author info don't need right now
authors = ["|".join(r['100'].get_subfields('a','d')) if '100' in r else '' for r in records]

# MARC records
# tag 082 - Dewey Decimal Classification Number
# tag 856 - Electronic Location and Access

# TOC URLs
# e.g.
# {'856': {'ind1': u'4',
#     'ind2': u'2',
#     'subfields': [{u'm': u'V:DE-605'},
#      {u'q': u'application/pdf'},
 def setUp(self):
   """Parse both MARCXML test fixtures and collate the first record of each."""
   self.tf1 = pymarc.parse_xml_to_array(TF1)
   self.tf2 = pymarc.parse_xml_to_array(TF2)
   self.collated_tf1 = collate(self.tf1[0])
   self.collated_tf2 = collate(self.tf2[0])
Example #50
0
 def test_strict(self):
     """strict=True parsing of the gzipped batch should still yield 2 records."""
     # Use a context manager so the gzip handle is closed deterministically
     # instead of being leaked until garbage collection.
     with gzip.open('test/batch.xml.gz', 'rb') as fh:
         a = pymarc.parse_xml_to_array(fh, strict=True)
     self.assertEqual(len(a), 2)
Example #51
0
import marcx
import pymarc

copytags = ("100", "105", "120", "130", "150", "174", "200", "245", "246", "250", "260", "300", "335", "336", "337",
            "338", "351", "361", "400", "500", "520", "650", "689", "700", "710", "800")

inputfilename = "156_input.xml"
outputfilename = "156_output.mrc"

if len(sys.argv) == 3:
    inputfilename, outputfilename = sys.argv[1:]

inputfile = open(inputfilename, "rb")
outputfile = open(outputfilename, "wb")
oldrecords = pymarc.parse_xml_to_array(inputfile)

for i, oldrecord in enumerate(oldrecords, start=1):

    try:
        f245a = oldrecord["245"]["a"]
    except:
        continue

    newrecord = marcx.Record(force_utf8=True)

    # leader
    leader = "     " + oldrecord.leader[5:]
    newrecord.leader = leader

    # 001
Example #52
0
 def load(self, file_name):
     '''Parse a MARCXML file and append each record, converted to JSON, to self.'''
     import pymarc
     from es_config import marc2json
     for rec in pymarc.parse_xml_to_array(file_name):
         self.append(marc2json(rec.as_dict().get("fields")))
Example #53
0
# <codecell>

import xml.etree.ElementTree as ET
from collections import defaultdict
import networkx as nx
import matplotlib.pyplot as plt
import pyddc
import pymarc
import operator
import math
import json


# <codecell>
print 'loading xml'
completedb = pymarc.parse_xml_to_array(open('CLASS_23eng_marc_webdewey_20131020.xml'))
print 'loaded xml'
# <markdowncell>

# Our rules:
# 
# * only if leader byte 8 is 'a'
# * exclude spans,  recrods that have eithe \$c or \$y in 153
# * 253 ind 0  \$a "see reference"
# * 253 ind 2  \$a "class elsewhere"
#      * may be multiple 253s
# * 153 \$a to $e is "notational hiearchy"
# 
# 
# Later on tables:
# 
Example #54
0
 def test_strict(self):
     """strict=True on a well-formed file should parse both records."""
     # Close the file deterministically instead of leaking the handle.
     with open('test/batch.xml') as fh:
         a = pymarc.parse_xml_to_array(fh, strict=True)
     self.assertEqual(len(a), 2)
Example #55
0
 def test_bad_tag(self):
     """A record containing a bad tag should still parse (leniently) to 1 record."""
     # Close the file deterministically instead of leaking the handle.
     with open('test/bad_tag.xml') as fh:
         a = pymarc.parse_xml_to_array(fh)
     self.assertEqual(len(a), 1)
Example #56
0
def parse_xml_string_to_record(xmlstring):
    """ Parse an xml string and return a pymarc.Record object. """
    # Wrap the string in a file-like object, since the parser reads streams.
    return parse_xml_to_array(cStringIO.StringIO(xmlstring))[0]