Ejemplo n.º 1
0
    :return: csv_data
    """
    csv_data = {}
    row = 0
    col_names = set(y for x in bib_data.entries for y in x.keys())

    for x in bib_data.entries:
        row += 1
        csv_data[row] = {}

        for col_name in col_names:
            v = x.get(col_name, '')
            v = v.replace('\n', ' ')
            v = v.replace('\r', ' ')
            v = v.replace('\t', ' ')
            csv_data[row][col_name] = v.encode('utf-8').strip()
    return csv_data


# Read the whole input document from stdin in a single call; this replaces
# the line-by-line string concatenation and yields the same text.
bib_str = sys.stdin.read()

# `loads` parses the text into an object exposing `.entries`; bib2csv turns
# those entries into a dict of rows keyed by a 1-based row number.
bib_data = loads(bib_str)
csv_data = bib2csv(bib_data)

# @TODO: fail if the number of entries does not match
# Until then, report both counts on stderr so a mismatch can be spotted.
print_err("Entries: {}".format(len(bib_data.entries)))
print_err("Rows: {}".format(len(csv_data)))
write_csv_fp(sys.stdout, csv_data)
Ejemplo n.º 2
0
    (to be assigned during update)
"""

__author__ = "Michael Conlon"
__copyright__ = "Copyright 2015 (c) Michael Conlon"
__license__ = "New BSD License"
__version__ = "0.01"

import sys

from disambiguate.utils import print_err
from pump.vivopump import read_csv_fp, write_csv_fp, get_vivo_journals, get_parms

# Filter script: reads rows from stdin and blanks the 'uri' column of every
# row whose ISSN is not already known to VIVO.
# `parms` is only used here as the argument to get_vivo_journals().
parms = get_parms()
# data_in is used below as a mapping of row key -> row dict.
data_in = read_csv_fp(sys.stdin)
print_err("Input data length: {}".format(len(data_in)))

# Output rows, stored under the same row keys as the input.
data_out = {}

# get dictionary of journal uri keyed by
# International Standard Serial Numbers (ISSN)
vivo_journals = get_vivo_journals(parms)
print_err('There are {} journals in VIVO'.format(len(vivo_journals)))
# print_err(vivo_journals)

for row, data in data_in.items():
    # NOTE: this stores a reference, not a copy, so the 'uri' assignment
    # below also changes the corresponding row of data_in.
    data_out[row] = data

    if data['issn'] not in vivo_journals:
        # ISSN is not in VIVO.  These are the ones to add
        data_out[row]['uri'] = ''
Ejemplo n.º 3
0
    SELECT
    ?uri ?doi
    WHERE {
        ?uri a vivo:InformationResource .
        ?uri bibo:doi ?doi .
    }
    """
    results = vivo_query(query, parms)
    bindings = results['results']['bindings']
    doi_list = [b['doi']['value'] for b in bindings]
    uri_list = [b['uri']['value'] for b in bindings]
    return dict(zip(doi_list, uri_list))

# Filter script: reads rows from stdin and compares each row's DOI against
# the publications already in VIVO.
# `parms` is only used here as the argument to get_vivo_academic_articles().
parms = get_parms()
# data_in is used below as a mapping of row key -> row dict.
data_in = read_csv_fp(sys.stdin)
print_err("{} rows in the input".format(len(data_in)))

# Output rows, stored under the same row keys as the input.
data_out = {}
# get dictionary of pub uri keyed by doi
vivo_pubs = get_vivo_academic_articles(parms)

print_err('{} publications found in VIVO'.format(len(vivo_pubs)))
# print >>sys.stderr, vivo_pubs

for row, data in data_in.items():
    # NOTE: this stores a reference, not a copy, so the 'uri' assignment
    # below also changes the corresponding row of data_in.
    data_out[row] = data

    # DOI is not in VIVO.  These are the ones to add
    if data['doi'] not in vivo_pubs:
        data_out[row]['uri'] = ''
    else:
import sys

from disambiguate.utils import print_err
from pump.vivopump import read_csv_fp, write_csv_fp, improve_org_name

# Filter script: reads rows from stdin and derives the publisher columns
# ('remove', 'uri', 'name', 'type') for each input row.
data_in = read_csv_fp(sys.stdin)

# The first filter should fail if there is no data to process.  Raise
# explicitly instead of using `assert`, which is stripped when Python runs
# with -O and would let the script continue with no rows.
if len(data_in) == 0:
    raise ValueError("No input data to process")

# import pprint; pprint.pprint(data_in)

# Take the column names from the row stored under key 1
column_names = data_in[1].keys()
print_err("==> {} columns in the input: {} ".format(len(column_names),
                                                    column_names))

data_out = {}
keep_names = {'remove', 'uri', 'name', 'type'}

for row, data in data_in.items():
    if 'publisher' not in data:
        raise Exception(
            "The input row {} is missing the 'publisher' value".format(row))
    # Work on a shallow copy so the input row itself is left untouched
    new_data = dict(data)

    # Add these columns
    new_data['remove'] = ''
    new_data['uri'] = ''
    # The name is the cleaned-up publisher value, upper-cased
    new_data['name'] = improve_org_name(new_data['publisher']).upper()
    new_data['type'] = 'publisher'
Ejemplo n.º 5
0
def parse_author_data(author_data, affiliation_data, max_list_length=50):
    """
    Parse the author string from TR bibtex into a list of author dicts.

    The author string holds the author names separated by ' and '.  Each
    name is split into last, first and middle parts (periods removed) and
    an optional 'Jr.' / 'III' suffix.  The affiliation string is then used
    to flag the corresponding author (the name preceding
    '(Reprint Author)') and the UF authors (names listed in an affiliation
    group that contains 'Univ Florida').

    A single-author list is returned immediately with that author flagged
    as both corresponding and UF; the affiliation string is not consulted.

    :param author_data: author names, 'Last, First M.' separated by ' and '
    :param affiliation_data: affiliation text; groups of authors sharing an
        affiliation are separated by periods
    :param max_list_length: Author list maximum length.  To prevent Physics
        papers from swamping the process.  Authors beyond this count are
        dropped; a non-positive value yields an empty list
    :return: author_list.  A list of authors. Each author is a dict with
        seven elements: display_name, suffix, corresponding, uf, last,
        first, middle.  'corresponding' and 'uf' hold the strings 'true'
        or 'false'.
    """
    author_list = []

    # Slicing enforces the cap; max() keeps a negative cap from slicing from
    # the end of the list.
    author_names = author_data.split(' and ')[:max(max_list_length, 0)]

    for display_name in author_names:
        # occasional leading '-' before some initials
        display_name = display_name.replace(' -', ' ')
        author_dict = {
            'display_name': display_name,
            'suffix': '',
            'corresponding': 'false',
            'uf': 'false'
        }

        # Record a suffix and remove it from the working copy of the name
        # (the stored display_name keeps it).
        for suffix in ('Jr.', 'III'):
            marker = ' ' + suffix + ','
            if marker in display_name:
                author_dict['suffix'] = suffix
                display_name = display_name.replace(marker, '')

        if ',' in display_name:
            last, _, remainder = display_name.partition(',')
            # skip the single character (a space) that follows the comma
            remainder = remainder[1:]
            # everything after the first space is treated as the middle name
            first, _, middle = remainder.partition(' ')
            author_dict['last'] = last
            author_dict['first'] = first.replace('.', '')
            author_dict['middle'] = middle.replace('.', '')
        else:
            # No comma: the whole string is the last name (e.g. a corporate
            # author)
            author_dict['last'] = display_name
            author_dict['first'] = ''
            author_dict['middle'] = ''
        author_list.append(author_dict)

    # If there is only one author, they must be UF and Corresponding
    if len(author_list) == 1:
        author_list[0]['corresponding'] = 'true'
        author_list[0]['uf'] = 'true'
        return author_list

    # Now find the Corresponding Author
    k = affiliation_data.find('(Reprint Author)')

    if k > 0:
        # Text before the marker, minus the character separating them
        reprint_name = affiliation_data[0:k - 1]
        # Split at the comma, exactly as the author names are split above,
        # so that last names containing spaces still match.
        reprint_last, _, reprint_rest = reprint_name.partition(',')
        reprint_last = reprint_last.strip()
        reprint_fi = reprint_rest.strip()[:1]
        for author_dict in author_list:
            # [:1] rather than [0]: 'first' may be empty
            if author_dict['last'] == reprint_last \
                    and author_dict['first'][:1] == reprint_fi:
                author_dict['corresponding'] = 'true'

    # Now find the UF authors.  Could there be a more arcane format for the
    # affiliations (bunched, etc, etc), So first thing we do is build a
    # structure that can identify who is a UF author

    # Imported at the point of use: only the multi-author path needs it.
    from pump.vivopump import replace_initials

    # periods are used for ending initials in names. Remove these
    affiliation_data = replace_initials(affiliation_data)

    # Now periods demarc the groups of authors with like affiliation.
    # Every group is recorded, not just the last one.
    affiliations = []
    for affiliation_string in affiliation_data.split('.'):
        is_uf = 'Univ Florida' in affiliation_string
        affiliations.append({
            'affiliation_string': affiliation_string,
            'uf': 'true' if is_uf else 'false'
        })
    print_err(affiliations)

    # Now we are ready to look for affiliations by name.  Messy business.
    for author_dict in author_list:
        if author_dict['first'] == '':
            # corporate authors can not be UF authors
            continue
        find_string = author_dict['last'] + ', ' + author_dict['first'][0]

        for affiliation in affiliations:  # look in each affiliation group
            if find_string in affiliation['affiliation_string']:
                # if you find the author, use the affiliation of the group
                # and don't look further.  If you don't find the author
                # the default affiliation is uf false
                author_dict['uf'] = affiliation['uf']
                break

    print_err("{} Authors in list: {}".format(len(author_list), author_list))
    return author_list
Ejemplo n.º 6
0
    3. publisher not in VIVO, is in source => Add to VIVO
"""
__author__ = "Michael Conlon"
__copyright__ = "Copyright 2015 (c) Michael Conlon"
__license__ = "New BSD License"
__version__ = "0.02"

import sys

from disambiguate.utils import print_err
from pump.vivopump import read_csv_fp, write_csv_fp, get_vivo_publishers, get_parms,\
    key_string

# Filter script: reads publisher rows from stdin and compares them with the
# publishers already in VIVO.
# `parms` is only used here as the argument to get_vivo_publishers().
parms = get_parms()
# data_in is used below as a mapping of row key -> row dict.
data_in = read_csv_fp(sys.stdin)
print_err('total input publishers: {}'.format(len(data_in)))

data_out = {}

# get dictionary of publisher uri keyed by simplified  name
vivo_publishers = get_vivo_publishers(parms)
print_err('total VIVO publishers: {}'.format(len(vivo_publishers)))
# print_err(vivo_publishers)

# Key of the next output row; output rows are numbered independently of the
# input row keys.
row_out = 0

for row, data in data_in.items():
    if 0 == row_out:
        # copy the header line so we don't end up with an empty file
        # NOTE(review): this stores the first row yielded by data_in under
        # key 0; confirm that write_csv_fp takes its header from that row.
        data_out[row_out] = data
        row_out += 1
Ejemplo n.º 7
0
__license__ = "New BSD License"
__version__ = "0.01"

import sys

from pump.vivopump import read_csv_fp, write_csv_fp, get_parms
from disambiguate import utils

parms = get_parms()
data_in = read_csv_fp(sys.stdin)
# Report the number of input rows on stderr.  An explicit write replaces the
# Python-2-only `print >> sys.stderr, ...` statement and produces the same
# output (the value followed by a newline) on Python 2 and 3.
sys.stderr.write("{}\n".format(len(data_in)))

# file_name = '/Users/asura/git/vivo-pump/author_list.csv'
# @TODO: pass file name path as a command line parameter
# The path is relative, so it is resolved against the current working
# directory.
file_name = 'author_list.csv'
utils.print_err("Using static disambiguation file: {}".format(file_name))

# get dictionaries of authors keyed by name parts
vivo_auth_disambig_data = utils.get_vivo_disambiguation_data_from_csv(
    file_name)

utils.print_err("Finished loading {} entries from: {}".format(
    len(vivo_auth_disambig_data), file_name))
# Output rows and the key of the next output row
data_out = {}
row_out = 0
"""
Example of rows in data_in:

display_name        | suffix  | first   | uri | remove| middle| corresponding | uf  | last
Stienmetz, Jason L. |         | Jason   |     |       | L     |  true         | true| Stienmetz
"""