Example #1
def rero_get_record(id, verbose=False, debug=False):
    """Get a record from RERO data repo.

    RERO documentation:
    http://data.rero.ch/
    http://data.rero.ch/02-A000069866/marcxml
    """
    base_url = 'http://data.rero.ch/02-'
    query_id = '{id}'.format(id=id)
    format = '/marcxml'
    url = '{base_url}{query_id}{format}'.format(base_url=base_url,
                                                query_id=query_id,
                                                format=format)
    trans_record = None
    response = requests.get(url)
    if response.status_code == requests.codes.ok:
        try:
            records = parse_xml_to_array(BytesIO(response.content))
            if records:
                trans_record = Transformation(records[0]).json
                if verbose:
                    click.echo('API-rero get: {id}'.format(id=id))
        except Exception as err:
            if verbose:
                click.echo('ERROR get rero record: {err}'.format(err=err))
            if debug:
                raise Exception(err)
    return trans_record
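For context, here is a minimal self-contained sketch of the same pattern (fetch MARCXML over HTTP and parse it with pymarc), with the imports the excerpt assumes; the project-specific Transformation class is left out and the function name below is illustrative only.

from io import BytesIO

import requests
from pymarc import marcxml


def fetch_rero_marc(record_id):
    """Fetch one RERO MARCXML record and return it as a pymarc Record (or None)."""
    url = f'http://data.rero.ch/02-{record_id}/marcxml'
    response = requests.get(url)
    if response.status_code != requests.codes.ok:
        return None
    records = marcxml.parse_xml_to_array(BytesIO(response.content))
    return records[0] if records else None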
Example #2
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(file_name,
                                         strict=False,
                                         normalize_form=None)
    logger = Logger()
    trans = {
        'gnd':
        Transformation_gnd(marc=records[0],
                           logger=logger,
                           verbose=True,
                           transform=False),
        'idref':
        Transformation_idref(marc=records[0],
                             logger=logger,
                             verbose=True,
                             transform=False),
        'rero':
        Transformation_rero(marc=records[0],
                            logger=logger,
                            verbose=True,
                            transform=False)
    }
    return trans.get(source)
Example #3
def _fetch_metadata(url):
    """
    @args: url, UNDL url given by user
    raises: 500 error if url can not be retrieved
    parse xml of response and return array of pymarc records
    """
    try:
        r = req.Request(
            url,
            data=None,
            headers={
                'User-Agent':
                "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"
            })
        resp = req.urlopen(r, context=ssl._create_unverified_context())
    except (HTTPError, URLError) as e:
        logger.error("Error: {}".format(e))
        abort(500)

    if resp.status != 200:
        logger.error("Could not get {}, status: {}".format(url, resp.status))
        abort(500)
    else:
        raw_xml = resp.read()
        #print(raw_xml)
        xml_doc = BytesIO(raw_xml)
        logger.info(xml_doc)
        collection = marcxml.parse_xml_to_array(xml_doc, False, 'NFC')
        return collection
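Note that the second and third positional arguments passed to parse_xml_to_array here are pymarc's strict and normalize_form parameters; written with keywords, the same call reads:

collection = marcxml.parse_xml_to_array(xml_doc, strict=False, normalize_form='NFC')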
Example #4
def getRawFilesLocs(metadata_filename):
    """
    Given a MARC21 metadata file,
    return an array of "Files" objects, containing:
    `filename`
    `uri`
    `remote`
    `hash`
    `size`
    """

    # Parse the XML as MARC21
    #  and get the first record (result should be one record anyway)
    record = marcxml.parse_xml_to_array(metadata_filename)[0]
    # Look for 856 entries
    #  MARC21: 856 - Electronic Location and Access (R)
    files = []
    for f in record.get_fields("856"):
        obj = {}

        # Unknown size fallback
        obj["size"] = 0

        if f["u"]:
            obj["url"] = f["u"]
            obj["remote"] = "HTTP"
        elif f["d"]:
            obj["url"] = f["d"]
            obj["remote"] = "EOS"
        else:
            logging.debug(f'Skipped 856 entry "{f}", no u or d field.')
            continue

        # File checksum
        if f["w"]:
            p = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
            m = p.match(f["w"])
            alg = m.groups()[0].lower()
            checksum = m.groups()[1]
            obj["checksum"] = f"{alg}:{checksum}"

        # File size
        if f["s"]:
            obj["size"] = int(f["s"])

        # Get basename
        if obj["url"]:
            obj["filename"] = ntpath.basename(obj["url"])
            obj["path"] = obj["filename"]

        obj["metadata"] = False
        obj["downloaded"] = False

        if obj["filename"]:
            files.append(obj)
        else:
            logging.warning(f'Skipped entry "{f}", no basename found (probably an URL?)')

    return files
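As a small illustration of what the checksum regex above extracts, assuming a subfield $w value shaped like "(SITE:alg;checksum)"; the exact layout is repository-specific and the sample value below is made up.

import re

w_value = "(CDS:md5;0a1b2c3d4e5f)"  # hypothetical $w value
p = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
m = p.match(w_value)
if m:
    alg, checksum = m.groups()
    print(f"{alg.lower()}:{checksum}")  # prints: md5:0a1b2c3d4e5f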
Example #5
    def MarcEditXmlToMarc(self, x):
        mrcFileName = re.sub(r'\.xml$', '.mrc', x)  # swap the .xml extension for .mrc
        print('\n<Converting from XML to MARC>\n')
        #subprocess.call([MonoBin,MarcEditBin,"-s", x, "-d",mrcFileName,"-xmlmarc","-marc8", "-mxslt","/opt/marcedit/xslt/MARC21XML2Mnemonic_plugin.xsl"])
        marcStr = ''
        with open(x, 'rb') as fh:
            recs = marcxml.parse_xml_to_array(fh)
            for rec in recs:
                marcStr += str(rec)

        return marcStr
Example #6
    def __init__(self, url):
        resp = req.urlopen(url, context=ssl._create_unverified_context())
        if resp.status != 200:
            raise PageNotFoundException(
                "Could not get data from {}".format(url))
        self.xml_doc = BytesIO(resp.read())
        #print(self.xml_doc)
        r = marcxml.parse_xml_to_array(self.xml_doc, False, 'NFC')
        #print(r)
        if len(r) > 0:
            self.record = r[0]
        else:
            self.record = None
Example #7
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(
        current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(
        file_name, strict=False, normalize_form=None)
    trans = {
        'concepts': Transformation_rero_concepts(
            marc=records[0], logger=None, verbose=False, transform=False)
    }
    return trans.get(source)
Example #8
def get_resource(parameters, stats):
    stats['search_rows'] = 0
    for filenum in parameters['filenums']:
        filepath = parameters['files-path-template'].format(filenum=filenum)
        search_id = 'neaman{}'.format(filenum)
        with open(filepath) as f:
            for record_num, record in enumerate(parse_xml_to_array(f)):
                row = parse_record(record)
                migdar_id = '{}-{}'.format(search_id, record_num)
                row.update(migdar_id=migdar_id, first_ccl_query='neaman{}.xml'.format(filenum),
                           last_query_datetime=datetime.datetime.now(),
                           json=json.loads(row['json']))
                stats['search_rows'] += 1
                yield row
Example #9
def main(filename, out_filename=""):

    # check stdin
    if type(filename) is str:
        out_path = emx.get_out_filename(
            filename) if not out_filename else out_filename
    else:
        return 1

    with open(out_path, 'w', newline='') as fh:
        # open output file
        csv.register_dialect('marcxmltotsv',
                             delimiter='\t',
                             quoting=csv.QUOTE_NONE,
                             quotechar='',
                             doublequote=False,
                             escapechar=None)
        csv_writer = csv.DictWriter(fh,
                                    fieldnames=COLUMNS,
                                    dialect='marcxmltotsv')
        csv_writer.writeheader()

        # parse xml
        collection = marcxml.parse_xml_to_array(filename, strict=True)

        def get_curr_row(record):

            curr_row = {}
            for col in COLUMNS:
                # for each record create row in tsv
                curr_row[col] = emx.get_value(col, record)
            return curr_row

        for record in collection:

            if len(DATE_RANGE) < 2:

                curr_row = get_curr_row(record)
                csv_writer.writerow(curr_row)
                continue

            else:

                curr_date = emx.get_pymarc_field_value('008',
                                                       record)[7:11].strip()

                if len(curr_date) == 4 and int(curr_date) >= DATE_RANGE[0] \
                and int(curr_date) <= DATE_RANGE[1]:

                    curr_row = get_curr_row(record)
                    csv_writer.writerow(curr_row)

    return 0
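MARC control field 008 stores "Date 1" (usually the publication year) in character positions 07-10, which is what the [7:11] slice pulls out. A standalone sketch of the same filter, assuming records parsed with pymarc; the helper name is made up for illustration.

def in_date_range(record, date_range):
    """Return True if 008/07-10 is a four-digit year inside date_range."""
    field_008 = record['008']
    if field_008 is None:
        return False
    year = field_008.data[7:11].strip()
    return len(year) == 4 and year.isdigit() \
        and date_range[0] <= int(year) <= date_range[1]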
Example #10
def worldcat(num, num_type, url, key):
    # e.g., /oclc/34473395  /oclc/34474496
    url = url % (num, key)
    try:
        records = marcxml.parse_xml_to_array(urlopen(url))
        if not records:
            return None
        record = records[0]
    except:
        return None
    bib = {}
    bib[num_type.upper()] = num
    bib['TITLE'] = record.uniformtitle()
    if not bib['TITLE']:
        bib['TITLE'] = record.title() if record.title() else ''
    bib['TITLE_ALL'] = bib['TITLE']
    bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
    bib['AUTHOR'] = record.author() if record.author() else ''
    if bib['AUTHOR']:
        bib['AUTHORS'].insert(0, bib['AUTHOR'])
    bib['PUBLISHER'] = record.publisher() if record.publisher() else ''
    try:
        bib['PUBLISH_DATE'] = record.pubyear()
    except:
        bib['PUBLISH_DATE'] = ''
    bib['IMPRINT'] = '%s %s' % (bib['PUBLISHER'], bib['PUBLISH_DATE'])
    bib['BIB_FORMAT'] = 'as' if num_type == 'issn' else 'am'
    bib['ISBN'] = record.isbn()
    try:
        physical = [entry.format_field() for entry in record.physicaldescription()]
        bib['DESC'] = physical[0]
    except:
        bib['DESC'] = ''
    try:
        notes = [entry.format_field() for entry in record.notes()]
        bib['NOTES'] = notes[0]
    except:
        bib['NOTES'] = ''
    try:
        subjects = [entry.format_field() for entry in record.subjects()]
        bib['SUBJECTS'] = subjects[0]
    except:
        bib['SUBJECTS'] = ''
    # identify worldcat response for the item.html block
    bib['WORLDCAT_RESPONSE'] = num
    bib['WORLDCAT_MESSAGE'] = 'This page contains information from the OCLC WorldCat catalog.'
    bib['WORLDCAT_SEARCH'] = ('<a href=http://www.worldcat.org/search?q={{ bib.TITLE }}'
                              '>Search OCLC Worldcat</a>')
    return bib
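The repeated try/except blocks above can be collapsed into a small helper; a sketch, assuming the same method-style pymarc accessors (physicaldescription(), notes(), subjects()) used in the excerpt. The helper name is hypothetical.

def first_formatted(fields):
    """Return the first field formatted as text, or '' if there is none."""
    try:
        return [entry.format_field() for entry in fields][0]
    except (IndexError, TypeError):
        return ''

# Usage against a parsed record:
# bib['DESC'] = first_formatted(record.physicaldescription())
# bib['NOTES'] = first_formatted(record.notes())
# bib['SUBJECTS'] = first_formatted(record.subjects())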
Example #11
def rero_get_record(id, verbose=False, debug=False):
    """Get a record from RERO data repo.

    RERO documentation:
    http://data.rero.ch/
    http://data.rero.ch/02-A000069866/marcxml
    """
    url = f'http://data.rero.ch/02-{id}/marcxml'
    trans_record = None
    response = requests.get(url)
    if response.status_code == requests.codes.ok:
        try:
            records = parse_xml_to_array(BytesIO(response.content))
            if records:
                trans_record = Transformation(records[0]).json
                if verbose:
                    click.echo(f'API-rero get: {id}')
        except Exception as err:
            if verbose:
                click.echo(f'ERROR get RERO record: {err}')
            if debug:
                raise Exception(err)
    return trans_record
Example #12
def oai_get_record(id,
                   name,
                   transformation,
                   record_cls,
                   access_token=None,
                   identifier=None,
                   dbcommit=False,
                   reindex=False,
                   test_md5=False,
                   verbose=False,
                   debug=False,
                   **kwargs):
    """Get record from an OAI repo.

    :param identifier: identifier of record.
    """
    url, metadata_prefix, lastrun, setspecs = get_info_by_oai_name(name)

    request = Sickle(url)

    params = {}
    if access_token:
        params['accessToken'] = access_token

    params['metadataPrefix'] = metadata_prefix
    params['identifier'] = f'{identifier}{id}'
    try:
        record = request.GetRecord(**params)
    except Exception as err:
        if debug:
            raise Exception(err)
        return None
    records = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(records[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
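A self-contained sketch of the same OAI fetch-and-parse step, assuming the sickle and pymarc packages; the endpoint, metadata prefix, and identifier below are placeholders, not the values resolved by get_info_by_oai_name.

from io import StringIO

from pymarc.marcxml import parse_xml_to_array
from sickle import Sickle

request = Sickle('https://example.org/oai')                       # placeholder endpoint
oai_record = request.GetRecord(metadataPrefix='marcxml',
                               identifier='oai:example.org:123')  # placeholder identifier
records = parse_xml_to_array(StringIO(oai_record.raw))
print(len(records), 'record(s) parsed')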
Example #13
def trans_prep(source, xml_part_to_add):
    """Prepare transformation."""
    build_xml_record_file(xml_part_to_add)
    current_dir = os.path.dirname(__file__)
    file_name = os.path.join(current_dir, 'examples/xml_minimal_record.xml')
    records = marcxml.parse_xml_to_array(file_name,
                                         strict=False,
                                         normalize_form=None)
    trans = None
    if source == 'bnf':
        trans = Transformation_bnf(marc=records[0],
                                   logger=None,
                                   verbose=False,
                                   transform=False)
    elif source == 'gnd':
        trans = Transformation_gnd(marc=records[0],
                                   logger=None,
                                   verbose=False,
                                   transform=False)
    elif source == 'rero':
        trans = Transformation_rero(marc=records[0],
                                    logger=None,
                                    verbose=False,
                                    transform=False)
    return trans
Example #14
def worldcat(num, num_type, url, key):
    url = url % (num_type, num, key)
    try:
        records = marcxml.parse_xml_to_array(urlopen(url))
        if not records:
            return None
        record = records[0]
    except:
        return None
    bib = {}
    bib[num_type.upper()] = num
    bib['TITLE'] = record.uniformtitle()
    if not bib['TITLE']:
        bib['TITLE'] = record.title() if record.title() else ''
    bib['TITLE_ALL'] = bib['TITLE']
    bib['AUTHORS'] = [entry.format_field() for entry in record.addedentries()]
    bib['AUTHOR'] = record.author() if record.author() else ''
    if bib['AUTHOR']:
        bib['AUTHORS'].insert(0, bib['AUTHOR'])
    bib['PUBLISHER'] = record.publisher() if record.publisher() else ''
    bib['PUBLISHER_DATE'] = record.pubyear() if record.pubyear() else ''
    bib['IMPRINT'] = '%s %s' % (bib['PUBLISHER'], bib['PUBLISHER_DATE'])
    bib['BIB_FORMAT'] = 'as' if num_type == 'issn' else 'am'
    return bib
Example #15
def oai_process_records_from_dates(name, sickle, oai_item_iterator,
                                   transformation, record_cls, max_retries=0,
                                   access_token=None, days_spann=30,
                                   from_date=None, until_date=None,
                                   ignore_deleted=False, dbcommit=True,
                                   reindex=True, test_md5=True, online=False,
                                   verbose=False, debug=False, **kwargs):
    """Harvest multiple records from an OAI repo.

    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param from_date: The lower bound date for the harvesting (optional).
    :param until_date: The upper bound date for the harvesting (optional).
    """
    # data on IDREF Servers starts on 2000-10-01
    if kwargs.get('kwargs', {}).get('online'):
        online = kwargs.get('kwargs', {}).get('online')
    name = name
    days_spann = days_spann
    last_run = None
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator, max_retries=max_retries)

    dates_inital = {
        'from': from_date or last_run,
        'until': until_date
    }
    update_last_run = from_date is None and until_date is None
    # Sanity check
    if dates_inital['until'] is not None \
            and dates_inital['from'] > dates_inital['until']:
        raise WrongDateCombination("'from' date is later than 'until' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches the setspecs will be
    # set to a list with None to go into the retrieval loop without
    # a set definition (line 177)
    setspecs = setspecs.split() or [None]
    count = 0
    action_count = {}
    mef_action_count = {}
    viaf_online_count = 0
    for spec in setspecs:
        dates = dates_inital
        params = {
            'metadataPrefix': metadata_prefix,
            'ignore_deleted': ignore_deleted
        }
        if access_token:
            params['accessToken'] = access_token
        params.update(dates)
        if spec:
            params['set'] = spec

        my_from_date = parser.parse(dates['from'])
        my_until_date = last_run_date
        if dates['until']:
            my_until_date = parser.parse(dates['until'])
        while my_from_date <= my_until_date:
            until_date = my_from_date + timedelta(days=days_spann)
            if until_date > my_until_date:
                until_date = my_until_date
            dates = {
                'from': my_from_date.strftime("%Y-%m-%d"),
                'until': until_date.strftime("%Y-%m-%d")
            }
            params.update(dates)

            try:
                for record in request.ListRecords(**params):
                    count += 1
                    records = parse_xml_to_array(StringIO(record.raw))
                    rec = None
                    try:
                        try:
                            updated = datetime.strptime(
                                records[0]['005'].data,
                                '%Y%m%d%H%M%S.%f'
                            )
                        except:
                            updated = '????'
                        rec = transformation(records[0]).json
                        pid = rec.get('pid')
                        rec, action, m_record, m_action, v_record, v_online = \
                            record_cls.create_or_update_agent_mef_viaf(
                                data=rec,
                                dbcommit=True,
                                reindex=True,
                                online=online,
                                verbose=verbose
                            )
                        action_count.setdefault(action.name, 0)
                        action_count[action.name] += 1
                        mef_action_count.setdefault(m_action.name, 0)
                        mef_action_count[m_action.name] += 1
                        if v_online:
                            viaf_online_count += 1

                        if verbose:
                            m_pid = 'Non'
                            if m_record:
                                m_pid = m_record.pid
                            v_pid = 'Non'
                            if v_record:
                                v_pid = v_record.pid
                            click.echo(
                                (
                                    'OAI {name} spec({spec}): {pid}'
                                    ' updated: {updated} {action}'
                                    ' | mef: {m_pid} {m_action}'
                                    ' | viaf: {v_pid} online: {online}'
                                ).format(
                                    name=name,
                                    spec=spec,
                                    pid=pid,
                                    action=action.value,
                                    m_pid=m_pid,
                                    m_action=m_action.value,
                                    v_pid=v_pid,
                                    online=v_online,
                                    updated=updated
                                )
                            )
                    except Exception as err:
                        msg = 'ERROR creating {name} {count}: {err}'
                        msg = msg.format(
                            name=name,
                            count=count,
                            err=err
                        )
                        if rec:
                            msg += '\n{rec}'.format(rec=rec)

                        current_app.logger.error(msg)
                        if debug:
                            traceback.print_exc()
            except NoRecordsMatch:
                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                continue
            except Exception as err:
                current_app.logger.error(err)
                if debug:
                    traceback.print_exc()
                count = -1

            my_from_date = my_from_date + timedelta(days=days_spann + 1)
            if verbose:
                click.echo(
                    ('OAI {name} {spec}: {from_d} .. +{days_spann}').format(
                        name=name,
                        spec=spec,
                        from_d=my_from_date.strftime("%Y-%m-%d"),
                        days_spann=days_spann
                    )
                )

    if update_last_run:
        if verbose:
            click.echo(
                ('OAI {name}: update last run: {last_run}').format(
                    name=name,
                    last_run=last_run_date
                )
            )
        oai_source = get_oaiharvest_object(name)
        oai_source.update_lastrun(last_run_date)
        oai_source.save()
        db.session.commit()
    return count, action_count, mef_action_count
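The harvesting loop above walks the requested interval in days_spann-sized windows; in isolation, that windowing logic looks roughly like this sketch (the helper name is made up, and it uses the same dateutil parser as the excerpt).

from datetime import timedelta

from dateutil import parser


def date_windows(from_date, until_date, days_spann=30):
    """Yield (from, until) pairs of 'YYYY-MM-DD' strings covering the interval."""
    current = parser.parse(from_date)
    end = parser.parse(until_date)
    while current <= end:
        window_end = min(current + timedelta(days=days_spann), end)
        yield current.strftime('%Y-%m-%d'), window_end.strftime('%Y-%m-%d')
        current += timedelta(days=days_spann + 1)

# e.g. list(date_windows('2021-01-01', '2021-03-15'))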
Example #16
    def parse_metadata(self, metadata_filename):
        """
        Given a MARC21 metadata file,
        return an array of "Files" objects, containing:
        `filename`
        `uri`
        `remote`
        `hash`
        `size`
        """

        # Parse the XML as MARC21
        #  and get the first record (result should be one record anyway)
        logging.debug("Parsing metadata..")
        try:
            record = marcxml.parse_xml_to_array(metadata_filename)[0]
        except:
            raise Exception(
                "Malformed metadata. Check if the record is public.")
        # Look for 856 entries
        #  MARC21: 856 - Electronic Location and Access (R)
        files = []
        for f in record.get_fields("856"):
            obj = {}

            # Unknown size fallback
            obj["size"] = 0

            if f["u"]:
                obj["url"] = f["u"]
                obj["remote"] = "HTTP"
            elif f["d"]:
                obj["url"] = f["d"]
                obj["remote"] = "EOS"
            else:
                logging.debug(f'Skipped 856 entry "{f}". No `u` or `d` field.')
                continue

            # File checksum
            if f["w"]:
                p = re.compile(r"\([A-Za-z]*:([A-Za-z0-9]*).*;([A-Za-z0-9]*)")
                m = p.match(f["w"])
                alg = m.groups()[0].lower()
                checksum = m.groups()[1]
                obj["checksum"] = f"{alg}:{checksum}"

            # File size
            if f["s"]:
                obj["size"] = int(f["s"])

            # Get basename
            if obj["url"]:
                obj["filename"] = ntpath.basename(obj["url"])
                # We suppose no folder structure
                obj["path"] = obj["filename"]
                obj["localpath"] = f"data/content/{obj['path']}"

            obj["metadata"] = False
            obj["downloaded"] = False

            if obj["filename"]:
                files.append(obj)
            else:
                logging.warning(
                    f'Skipped entry "{f}". No basename found (probably an URL?)'
                )
        logging.debug(f"Got {len(files)} files")

        meta_file_entry = {
            "filename": "metadata.xml",
            "path": "metadata.xml",
            "metadata": True,
            "downloaded": True,
            "localpath": "data/meta/metadata.xml",
            "url": self.metadata_url,
            "size": self.metadata_size,
        }
        files.append(meta_file_entry)

        return files
Example #17
def oai_save_records_from_dates(name, file_name, sickle, oai_item_iterator,
                                max_retries=0,
                                access_token=None, days_spann=30,
                                from_date=None, until_date=None,
                                verbose=False, **kwargs):
    """Harvest and save multiple records from an OAI repo.

    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param from_date: The lower bound date for the harvesting (optional).
    :param until_date: The upper bound date for the harvesting (optional).
    """
    # data on IDREF Servers starts on 2000-10-01
    name = name
    days_spann = days_spann
    last_run = None
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator, max_retries=max_retries)

    dates_inital = {
        'from': from_date or last_run,
        'until': until_date
    }
    # Sanity check
    if dates_inital['until'] is not None \
            and dates_inital['from'] > dates_inital['until']:
        raise WrongDateCombination("'from' date is later than 'until' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches the setspecs will be
    # set to a list with None to go into the retrieval loop without
    # a set definition (line 177)
    setspecs = setspecs.split() or [None]
    count = 0
    with open(file_name, 'bw') as output_file:
        for spec in setspecs:
            dates = dates_inital
            params = {
                'metadataPrefix': metadata_prefix,
                'ignore_deleted': False
            }
            if access_token:
                params['accessToken'] = access_token
            params.update(dates)
            if spec:
                params['set'] = spec

            my_from_date = parser.parse(dates['from'])
            my_until_date = last_run_date
            if dates['until']:
                my_until_date = parser.parse(dates['until'])
            while my_from_date <= my_until_date:
                until_date = my_from_date + timedelta(days=days_spann)
                if until_date > my_until_date:
                    until_date = my_until_date
                dates = {
                    'from': my_from_date.strftime("%Y-%m-%d"),
                    'until': until_date.strftime("%Y-%m-%d")
                }
                params.update(dates)

                try:
                    for record in request.ListRecords(**params):
                        count += 1
                        records = parse_xml_to_array(StringIO(record.raw))
                        record_id = '???'
                        field_001 = records[0]['001']
                        if field_001:
                            record_id = field_001.data
                        if verbose:
                            click.echo(
                                'OAI {name} spec({spec}): {from_d} '
                                'count:{count:>10} = {id}'.format(
                                    name=name,
                                    spec=spec,
                                    from_d=my_from_date.strftime("%Y-%m-%d"),
                                    days_spann=days_spann,
                                    count=count,
                                    id=record_id
                                )
                            )
                        rec = records[0]
                        rec.leader = rec.leader[0:9] + 'a' + rec.leader[10:]
                        output_file.write(rec.as_marc())
                except NoRecordsMatch:
                    my_from_date = my_from_date + timedelta(
                        days=days_spann + 1)
                    continue
                except Exception as err:
                    current_app.logger.error(err)

                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                if verbose:
                    click.echo(
                        'OAI {name} spec({spec}): '
                        '{from_d} .. +{days_spann}'.format(
                            name=name,
                            spec=spec,
                            from_d=my_from_date.strftime("%Y-%m-%d"),
                            days_spann=days_spann
                        )
                    )
    if verbose:
        click.echo('OAI {name}: {count}'.format(
            name=name,
            count=count
        ))
    return count
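Position 09 of the MARC leader is the character-coding flag, so rec.leader[0:9] + 'a' + rec.leader[10:] marks each record as UTF-8 before as_marc() serialises it. A reduced sketch of that save step, mirroring the loop above; the input filename is a placeholder.

from pymarc import marcxml

records = marcxml.parse_xml_to_array('harvested.xml')  # placeholder file
with open('records.mrc', 'ab') as output_file:
    for rec in records:
        # flag the record as UTF-8 (leader/09 = 'a') and append binary MARC
        rec.leader = rec.leader[0:9] + 'a' + rec.leader[10:]
        output_file.write(rec.as_marc())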
Example #18
def main(filename, out_filename=""):
    def get_field_vol_values(field, codes, row):
        """return dictionary of values based on current volume level field"""
        def get_subfield_values_list(field, code):

            values = []
            for subfield in field.get_subfields(code):
                values.append(subfield.replace('\\', '').strip())

            return values if values else ""

        for code in codes:
            row[code] = get_subfield_values_list(field, code)

        return row

    def field_to_tsv(field, collection):

        out_path = emx.get_out_filename(filename, field['desc'])

        with open(out_path, 'w', newline='') as fh:
            # open output file

            columns = emx.COLUMNS_WORK + field['codes']

            csv.register_dialect('marcxmltotsv',
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE,
                                 quotechar='',
                                 doublequote=False,
                                 escapechar=None)
            csv_writer = csv.DictWriter(fh,
                                        fieldnames=columns,
                                        dialect='marcxmltotsv')

            headers = {}
            headers_list = []
            headers_list += emx.COLUMNS_WORK
            if 'headers' in field: headers_list += field['headers']
            else: headers_list += field['codes']

            i = 0
            for column in columns:
                headers[column] = headers_list[i]
                i += 1

            index = 0

            csv_writer.writerow(headers)

            def get_curr_row(curr_fields, curr_work, tag):

                curr_row = {}
                if curr_fields and tag == '989':
                    curr_row = emx.get_field_989_values(
                        curr_fields, field['codes'], curr_work)
                else:
                    for curr_field in curr_fields:

                        curr_row = curr_work
                        # curr_row = emx.get_field_vol_values(curr_field,field['codes'],curr_row)
                        curr_row = get_field_vol_values(
                            curr_field, field['codes'], curr_row)

                return curr_row

            for record in collection:

                curr_work = emx.get_work_metadata(record)
                curr_fields = record.get_fields(field['tag'])
                '''
                The commented-out block below is an alternate way of returning the
                current row, including a relator conditional; it was created for the
                epigraph-author data subset and is retained for demonstration.
                '''

                # if len(DATE_RANGE) < 2:

                #     epi_authors = []

                #     for curr_field in curr_fields:

                #         if curr_field['4'] and 'author (epigraph)' in curr_field['4'].strip().lower():

                #             epi_authors.append(curr_field)
                #             curr_row = get_curr_row(epi_authors,curr_work,field['tag'])
                #             if curr_row: csv_writer.writerow(curr_row)
                #         continue

                if len(DATE_RANGE) < 2:

                    curr_row = get_curr_row(curr_fields, curr_work,
                                            field['tag'])
                    if curr_row: csv_writer.writerow(curr_row)
                    continue

                else:

                    curr_date = emx.get_pymarc_field_value(
                        '008', record)[7:11].strip()

                    if len(curr_date) == 4 and int(curr_date) >= DATE_RANGE[0] \
                    and int(curr_date) <= DATE_RANGE[1]:

                        curr_row = get_curr_row(curr_fields, curr_work,
                                                field['tag'])
                        # if curr_row: csv_writer.writerow(curr_row)
        return 0

    # parse xml
    collection = marcxml.parse_xml_to_array(filename, strict=True)

    for datafield in DATAFIELDS:

        field_to_tsv(datafield, collection)
Example #19
def import_marc21xml(url, can_display_pending_publications):
    result = []

    o = urlparse(url)
    if '*' not in settings.ALLOWED_HOSTS and \
            o.netloc not in settings.ALLOWED_HOSTS:
        result.append(get_message('danger', _('The domain is not allowed')))
        return result

    if url.find(settings.SITE_DOMAIN + settings.SITE_PATH) != -1:  # self referencing url is bad
        result.append(get_message('danger', _('The domain is not allowed')))
        return result

    try:
        url = ''.join(c for c in unicodedata.normalize('NFD', url) if unicodedata.category(c) != 'Mn')
        reader = marcxml.parse_xml_to_array(urlopen(url))
    except IOError as e:
        result.append(get_message('danger', str(e)))
    except Exception as e:
        result.append(get_message('danger', str(e)))
    if result:
        return result

    for record in reader:
        dict_result = {}
        dict_record = parse_dict(record.as_dict())

        ##
        # only parse the good ones
        if 'publication_status' in dict_record and dict_record['publication_status'] == 'DELETED':
            continue
        if 'control_number' not in dict_record:
            continue

        dict_result['Id'] = dict_record['control_number']
        dict_result['Infoscience_URL'] = "{}/record/{}".format(
            'https://infoscience.epfl.ch', dict_record['control_number'])
        dict_result['ELA_Icon'] = dict_record['electronic_location_access']['icon']
        dict_result['ELA_URL'] = dict_record['electronic_location_access']['fulltexts']
        dict_result['DOI'] = dict_record['other_standard_identification_doi']
        dict_result['Title'] = dict_record['title']
        dict_result['Authors'] = dict_record['added_entry_personal_name']
        dict_result['Authors_1'] = dict_record['added_entry_uncontrolled_name_person_1']
        dict_result['Authors_3'] = dict_record['added_entry_uncontrolled_name_person_3']
        dict_result['Directors'] = dict_record['added_entry_uncontrolled_name_person_2']
        dict_result['Patents'] = dict_record['patent_control_information']
        dict_result['Publication_Location'] = dict_record['publication_distribution'].get('a', '')
        dict_result['Publication_Institution'] = dict_record['publication_distribution'].get('b', '')
        dict_result['Publication_Date'] = dict_record['date_of_publication']
        dict_result['Publication_Year'] = set_year(dict_result['Publication_Date'])
        dict_result['Publication_Pages'] = dict_record['physical_description_extent']
        dict_result['Publisher'] = dict_record['host_item_entry'].get('t', '')
        dict_result['Publisher_Volume'] = dict_record['host_item_entry'].get('j', '')
        dict_result['Publisher_Volume_Number'] = dict_record['host_item_entry'].get('k', '')
        dict_result['Publisher_Volume_Pages'] = dict_record['host_item_entry'].get('q', '')
        dict_result['Local_Url_Link'] = dict_record['local_added_entry_url_link']
        dict_result['Conference_Meeting_Name'] = dict_record['added_entry_meeting'].get('a', '')
        dict_result['Conference_Meeting_Location'] = dict_record['added_entry_meeting'].get('c', '')
        dict_result['Conference_Meeting_Date'] = dict_record['added_entry_meeting'].get('d', '')
        dict_result['Corporate_Name'] = dict_record['added_entry_corporate_name']
        dict_result['Company_Name'] = dict_record['added_entry_uncontrolled_name_company']
        dict_result['Approved_Publications'] = dict_record['approved_publications']
        dict_result['Pending_Publications'] = dict_record['pending_publications']
        dict_result['Doc_Type'] = dict_record['source_of_acquisition']
        dict_result['ISBN'] = dict_record['isbn']
        dict_result['Summary'] = dict_record['summary']

        dict_result['Description'] = [entry.format_field() for entry in record.physicaldescription()]
        dict_result['Subjects'] = [entry.format_field() for entry in record.subjects()]

        is_pending = dict_result['Pending_Publications'] and not dict_result['Approved_Publications']
        if not is_pending or can_display_pending_publications:
            result.append(dict_result)

    if len(result) == 0 and len(reader) > 0:
        result.append(get_message('info', _('There are only pending publications')))

    return result
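The parse_dict() mapping above works on pymarc's Record.as_dict() output; a quick, hedged look at that raw structure (the export URL below is a placeholder and would normally be validated against ALLOWED_HOSTS first, as the view does).

import json
from urllib.request import urlopen

from pymarc import marcxml

# placeholder export URL
reader = marcxml.parse_xml_to_array(urlopen('https://example.org/record/123/export/xm'))
for record in reader[:1]:
    print(json.dumps(record.as_dict(), indent=2)[:400])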
Example #20
                                                  prefix + 'subfield')
                    data_subfield.set('code', subfield[0])
                    data_subfield.text = translate(subfield[1])

        return root


# open marcxml file

filename = 'full-041417.xml'
fileout = filename[:-4] + '-appended' + '.xml'
# fileout = filename[:-4] + '.tsv'
# fileout = filename[:-4] + '.json'

xmlhandle = NSMarcXml()
records = marcxml.parse_xml_to_array(filename, strict=True)
""" sample code for writing out to json """
# jsonwriter = JSONWriter(open(fileout,'wt'))
# for record in records: jsonwriter.write(record)
# jsonwriter.close()
""" reads in tsv and generates dictionary of cleaned values with id keys """

cleaned_dict = {}
cleaned_keys = []

with open('cleaned.tsv', 'r') as fh:

    cleaned_list = csv.reader(fh, delimiter="\t")
    for row in cleaned_list:
        # cleaned_dict[row[0].strip()] = [row[2],row[4]]
        cleaned_dict[row[0]] = [row[2].strip(), row[4].strip()]
Example #21
def main(filename, out_filename=""):
    def field_to_tsv(field, collection):

        out_path = emx.get_out_filename(filename, field['desc'])

        with open(out_path, 'w', newline='') as fh:
            # open output file

            columns = emx.COLUMNS_WORK + field['codes']

            csv.register_dialect('marcxmltotsv',
                                 delimiter='\t',
                                 quoting=csv.QUOTE_NONE,
                                 quotechar='',
                                 doublequote=False,
                                 escapechar=None)
            csv_writer = csv.DictWriter(fh,
                                        fieldnames=columns,
                                        dialect='marcxmltotsv')

            headers = {}
            headers_list = []
            headers_list += emx.COLUMNS_WORK
            if 'headers' in field: headers_list += field['headers']
            else: headers_list += field['codes']

            i = 0
            for column in columns:
                headers[column] = headers_list[i]
                i += 1

            index = 0

            csv_writer.writerow(headers)

            def get_curr_row(curr_fields, curr_work, tag):

                curr_row = {}
                if curr_fields and tag == '989':
                    curr_row = emx.get_field_989_values(
                        curr_fields, field['codes'], curr_work)
                else:
                    for curr_field in curr_fields:

                        curr_row = curr_work
                        curr_row = emx.get_field_vol_values(
                            curr_field, field['codes'], curr_row)

                return curr_row

            for record in collection:

                curr_work = emx.get_work_metadata(record)
                curr_fields = record.get_fields(field['tag'])

                if len(DATE_RANGE) < 2:

                    curr_row = get_curr_row(curr_fields, curr_work,
                                            field['tag'])
                    if curr_row: csv_writer.writerow(curr_row)
                    continue

                else:

                    curr_date = emx.get_pymarc_field_value(
                        '008', record)[7:11].strip()

                    if len(curr_date) == 4 and int(curr_date) >= DATE_RANGE[0] \
                    and int(curr_date) <= DATE_RANGE[1]:

                        curr_row = get_curr_row(curr_fields, curr_work,
                                                field['tag'])
                        if curr_row: csv_writer.writerow(curr_row)
        return 0

    # parse xml
    collection = marcxml.parse_xml_to_array(filename, strict=True)

    for datafield in DATAFIELDS:

        field_to_tsv(datafield, collection)