Exemple #1
0
    def proc(record):
        """Recompute the country of every affiliation of ``record``.

        For each affiliation in ``record.json['authors']`` the country is
        recomputed with ``find_country`` from the affiliation ``value``; when
        it differs from the stored one, the stored value is replaced and the
        change is logged and counted.

        ``counts`` and ``dry_run`` are taken from the enclosing scope (not
        visible here). The record is flagged as modified only when at least
        one country changed and ``dry_run`` is false.

        Any exception is logged through ``error`` and stops processing of
        this record; nothing is re-raised.
        """
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            modified = False
            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    # An affiliation without a stored country must not abort
                    # the whole record with a KeyError; treat it as "unset".
                    old_country = aff_data.get('country')
                    new_country = find_country(aff_data['value'])
                    if old_country != new_country:
                        counts['changed'] += 1

                        info('Changed country for record with id %s from %s to %s' % (record.json['control_number'],
                                                                                      old_country, new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country
                        modified = True

            # Only mark the JSON column dirty when something actually changed.
            if modified and not dry_run:
                flag_modified(record, 'json')
        except Exception as e:
            error(str(e))
Exemple #2
0
def test_cache():
    """Check that find_country returns the country stored for a cached key.

    A CountryCache row is added and committed through ``db.session`` before
    the lookup is made.
    """
    cached_key = "Some cached value2"
    cached_country = "Noland"

    entry = CountryCache()
    entry.key = cached_key
    entry.country = cached_country

    session = db.session
    session.add(entry)
    session.commit()

    resolved = find_country(cached_key)
    assert resolved == cached_country
Exemple #3
0
def test_cache():
    """Check that a committed CountryCache entry is honoured by find_country.

    The entry is persisted via ``db.session`` and then looked up by its key.
    """
    lookup_key = "some cached value2"
    expected_country = "Noland"

    cache_row = CountryCache()
    cache_row.key = lookup_key
    cache_row.country = expected_country

    session = db.session
    session.add(cache_row)
    session.commit()

    found = find_country(lookup_key)
    assert found == expected_country
Exemple #4
0
def parse_inspire_records(size, query, jrec=1):
    """Fetch records with ``fetch_url`` and convert them to a hits dict.

    Args:
        size: forwarded to ``fetch_url`` as its second argument.
        query: forwarded to ``fetch_url`` as its third argument.
        jrec: forwarded to ``fetch_url`` as its first argument.

    Returns:
        A dict of the form ``{'hits': {'hits': [...], 'total': N}}`` where
        ``N`` is the first value returned by ``fetch_url`` and each hit is
        ``{'_source': {...}}`` holding ``authors``, ``control_number``,
        ``dois``, ``record_creation_date`` and ``publication_info``.

    Records whose control number, DOI, creation date or journal information
    is missing or malformed are skipped.
    """
    articles = {'hits': {'hits': [], 'total': 0}}
    articles['hits']['total'], records = fetch_url(jrec, size, query)

    for r in records:
        json_record = {'_source': {'authors': [], 'publication_info': []}}

        # Authors are collected from datafields 100 and 700, in that order.
        authors = r.findall('./a:datafield[@tag="100"]', inspire_namespace)
        authors.extend(
            r.findall('./a:datafield[@tag="700"]', inspire_namespace))
        for author in authors:
            json_author = {
                'full_name':
                author.find('./a:subfield[@code="a"]',
                            inspire_namespace).text.encode('utf-8'),
                'affiliations': []
            }
            affs = author.findall('./a:subfield[@code="v"]', inspire_namespace)
            for aff in affs:
                # Encode once and reuse for both the lookup and the stored value.
                aff_value = aff.text.encode('utf-8')
                json_author['affiliations'].append({
                    'value': aff_value,
                    'country': find_country(aff_value),
                })
            json_record['_source']['authors'].append(json_author)

        try:
            json_record['_source']['control_number'] = int(
                r.find('./a:controlfield[@tag="001"]', inspire_namespace).text)
            json_record['_source']['dois'] = [{
                'value':
                r.find(
                    './a:datafield[@tag="024"][@ind1="7"]/a:subfield[@code="a"]',
                    inspire_namespace).text
            }]
            json_record['_source']['record_creation_date'] = r.find(
                './a:datafield[@tag="260"]/a:subfield[@code="c"]',
                inspire_namespace).text
            json_record['_source']['publication_info'].append({
                'journal_title':
                r.find('./a:datafield[@tag="773"]/a:subfield[@code="p"]',
                       inspire_namespace).text +
                r.find('./a:datafield[@tag="773"]/a:subfield[@code="v"]',
                       inspire_namespace).text
            })
        except (AttributeError, TypeError, ValueError):
            # AttributeError: find() returned None (element missing).
            # TypeError: an element has no text (int(None) / str + None).
            # ValueError: the control number is not numeric.
            # Such incomplete records are skipped; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            # TODO: report skipped records instead of dropping them silently.
            continue

        articles['hits']['hits'].append(json_record)

    return articles
Exemple #5
0
def test_countries():
    """Check find_country against a table of (affiliation, country) pairs."""
    expected_by_affiliation = (
        ('CMS CERN Switzerland', 'CERN'),
        ('ETH Switzerland', 'Switzerland'),
        ('CMS CERN KEK', 'CERN'),
        ('KEK Japan', 'Japan'),
        ('Hungary University of Magic and Witchcraft', 'Hungary'),
        ('Rome', 'Italy'),
        ('Ankara', 'Turkey'),
    )

    for affiliation, expected_country in expected_by_affiliation:
        assert find_country(affiliation) == expected_country
Exemple #6
0
def test_countries():
    """Verify the country that find_country derives for sample affiliations."""
    cases = (
        ('CMS CERN Switzerland', 'CERN'),
        ('ETH Switzerland', 'Switzerland'),
        ('CMS CERN KEK', 'CERN'),
        ('KEK Japan', 'Japan'),
        ('Hungary University of Magic and Witchcraft', 'Hungary'),
        ('Rome', 'Italy'),
        ('Ankara', 'Turkey'),
    )

    for aff_text, country in cases:
        detected = find_country(aff_text)
        assert detected == country
Exemple #7
0
    def proc(record):
        """Re-derive affiliation countries of ``record`` and store changes.

        Records without JSON or without an ``authors`` key are left alone.
        Each affiliation whose stored country differs from the result of
        ``find_country`` on its ``value`` is updated, logged with ``rinfo``
        and the record is flagged as modified.
        """
        if not record.json or 'authors' not in record.json:
            return

        for author_pos, author in enumerate(record.json['authors']):
            affiliations = author.get('affiliations', ())
            for aff_pos, affiliation in enumerate(affiliations):
                stored = affiliation.get('country')
                computed = find_country(affiliation['value'])
                if stored != computed:
                    message = '%s -> %s (%s)' % (stored, computed, affiliation['value'])
                    rinfo(message, record)
                    record.json['authors'][author_pos]['affiliations'][aff_pos]['country'] = computed
                    flag_modified(record, 'json')
Exemple #8
0
def add_nations(obj, eng):
    """Set the ``country`` of every author affiliation in ``obj.data``.

    The country is computed by ``find_country`` from the affiliation
    ``value``. ``__halt_and_notify`` is called when the article has no
    authors or an author has no affiliations.
    NOTE(review): presumably ``__halt_and_notify`` stops execution; if it
    returns, the lookups below raise KeyError. Confirm against its definition.
    """
    has_authors = 'authors' in obj.data
    if not has_authors:
        __halt_and_notify('No authors for article.', obj, eng)

    for a_idx, author in enumerate(obj.data['authors']):
        if 'affiliations' not in author:
            message = 'No affiliations for author: %s.' % author
            __halt_and_notify(message, obj, eng)

        for aff_idx, affiliation in enumerate(author['affiliations']):
            country = find_country(affiliation['value'])
            obj.data['authors'][a_idx]['affiliations'][aff_idx]['country'] = country
Exemple #9
0
    def proc(record):
        """Fix country mappings of the affiliations stored in ``record``.

        Records without JSON or without an ``authors`` key are left alone.
        For each affiliation the country is recomputed with ``find_country``
        from the affiliation ``value``; when it differs from the stored one
        the change is logged with ``rinfo``, written back and the record is
        flagged as modified.
        """
        if record.json and 'authors' in record.json:
            for i, a in enumerate(record.json['authors']):
                for i2, aff in enumerate(a.get('affiliations', ())):

                    # Affiliations that never got a country must not raise
                    # KeyError; a missing key is treated as None and fixed.
                    c = aff.get('country')
                    new_c = find_country(aff['value'])
                    if c != new_c:
                        rinfo('%s -> %s (%s)' % (c, new_c, aff['value']),
                              record)
                        record.json['authors'][i]['affiliations'][i2][
                            'country'] = new_c
                        flag_modified(record, 'json')
Exemple #10
0
def parse_inspire_records(size, query, jrec=1):
    """Fetch records with ``fetch_url`` and convert them to a hits dict.

    Args:
        size: forwarded to ``fetch_url`` as its second argument.
        query: forwarded to ``fetch_url`` as its third argument.
        jrec: forwarded to ``fetch_url`` as its first argument.

    Returns:
        A dict of the form ``{'hits': {'hits': [...], 'total': N}}`` where
        ``N`` is the first value returned by ``fetch_url`` and each hit is
        ``{'_source': {...}}`` holding ``authors``, ``control_number``,
        ``dois``, ``record_creation_date`` and ``publication_info``.

    Records whose control number, DOI, creation date or journal information
    is missing or malformed are skipped.
    """
    articles = {'hits': {'hits': [], 'total': 0}}
    articles['hits']['total'], records = fetch_url(jrec, size, query)

    for r in records:
        json_record = {'_source': {'authors': [], 'publication_info': []}}

        # Authors are collected from datafields 100 and 700, in that order.
        authors = r.findall('./a:datafield[@tag="100"]', inspire_namespace)
        authors.extend(r.findall('./a:datafield[@tag="700"]',
                                 inspire_namespace))
        for author in authors:
            json_author = {
                'full_name': author.find('./a:subfield[@code="a"]',
                                         inspire_namespace).text.encode('utf-8'),
                'affiliations': []
            }
            affs = author.findall('./a:subfield[@code="v"]',
                                  inspire_namespace)
            for aff in affs:
                # Encode once and reuse for both the lookup and the stored value.
                aff_value = aff.text.encode('utf-8')
                json_aff = {
                    'value': aff_value,
                    'country': find_country(aff_value)
                }
                json_author['affiliations'].append(json_aff)
            json_record['_source']['authors'].append(json_author)

        try:
            json_record['_source']['control_number'] = int(
                r.find('./a:controlfield[@tag="001"]', inspire_namespace).text)
            json_record['_source']['dois'] = [
                {'value': r.find('./a:datafield[@tag="024"][@ind1="7"]/a:subfield[@code="a"]', inspire_namespace).text}]
            json_record['_source']['record_creation_date'] = r.find('./a:datafield[@tag="260"]/a:subfield[@code="c"]',
                                                                    inspire_namespace).text
            json_record['_source']['publication_info'].append({'journal_title': r.find(
                './a:datafield[@tag="773"]/a:subfield[@code="p"]', inspire_namespace).text + r.find(
                './a:datafield[@tag="773"]/a:subfield[@code="v"]', inspire_namespace).text})
        except (AttributeError, TypeError, ValueError):
            # AttributeError: find() returned None (element missing).
            # TypeError: an element has no text (int(None) / str + None).
            # ValueError: the control number is not numeric.
            # Such incomplete records are skipped; anything else (including
            # KeyboardInterrupt) is allowed to propagate.
            # TODO: report skipped records instead of dropping them silently.
            continue

        articles['hits']['hits'].append(json_record)

    return articles
Exemple #11
0
def add_nations(obj, eng):
    """Store a ``country`` on each affiliation of each author in ``obj.data``.

    The country comes from ``find_country`` applied to the affiliation
    ``value``. ``__halt_and_notify`` is called when there are no authors or
    when an author lacks affiliations.
    NOTE(review): presumably ``__halt_and_notify`` stops execution; if it
    returns, the lookups below raise KeyError. Confirm against its definition.
    """
    if not ('authors' in obj.data):
        __halt_and_notify('No authors for article.', eng)

    for a_idx, author in enumerate(obj.data['authors']):
        lacks_affiliations = 'affiliations' not in author
        if lacks_affiliations:
            __halt_and_notify('No affiliations for author: %s.' % author, eng)

        for aff_idx, affiliation in enumerate(author['affiliations']):
            nation = find_country(affiliation['value'])
            obj.data['authors'][a_idx]['affiliations'][aff_idx]['country'] = nation
Exemple #12
0
    def proc(record):
        """Repair the authors/affiliations of ``record`` from its XML file.

        The first entry of ``record.json['_files']`` with ``filetype == 'xml'``
        is loaded through ``ObjectVersion`` and parsed. Depending on the
        number of ``ce:author-group`` elements in it:

        * more than one group: all authors are re-extracted from the deepest
          author groups (with affiliations resolved per author through
          ``ce:cross-ref`` or, failing that, the affiliations of the group)
          and replace ``record.json['authors']``;
        * exactly one group: if no existing author has an ``affiliations``
          key, the affiliations found in the XML are assigned to every
          author; other situations are only logged.

        Progress and problems are reported through ``rinfo`` / ``rerror``.
        Returns None in all cases; early returns skip the final 'OK' log.
        """

        def first_text(node, tag):
            """Return the value of the first child of the first ``tag`` element under ``node``."""
            return node.getElementsByTagName(tag)[0].childNodes[0].nodeValue

        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        # Build a real list: a lazy ``filter`` object (Python 3) is always
        # truthy and cannot be indexed.
        xml_files = [f for f in record.json['_files'] if f['filetype'] == 'xml']
        if not xml_files:
            rerror('Skipping. No xml in _files', record)
            return

        file_version = ObjectVersion.get(xml_files[0]['bucket'], xml_files[0]['key'])
        uri = file_version.file.uri
        # Close the file handle as soon as the document has been parsed.
        with open(uri, 'rt') as xml_file:
            xml = parse(xml_file)
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rinfo('Reparse all authors.', record)
            authors = []

            for x_author_group in x_author_groups:
                # skip if not deepest author-group
                if x_author_group.getElementsByTagName('ce:author-group'):
                    continue

                # extract affiliations of the whole group (fallback for
                # authors without their own affiliation references)
                affs = []
                for a in x_author_group.getElementsByTagName('ce:affiliation'):
                    value = first_text(a, 'ce:textfn')
                    affs.append({
                        u'country': find_country(value),
                        u'value': value
                    })

                # extract authors, add affiliations
                for x_author in x_author_group.getElementsByTagName('ce:author'):
                    given_name = first_text(x_author, 'ce:given-name')
                    surname = first_text(x_author, 'ce:surname')
                    full_name = '%s, %s' % (surname, given_name)

                    author_affs = []
                    for ref in x_author.getElementsByTagName('ce:cross-ref'):
                        affid = ref.attributes.get('refid').value
                        if 'aff' in affid:
                            aff_value = get_aff_by_id(x_author_group, affid)
                            aff_country = find_country(aff_value)
                            author_affs.append({
                                u'country': aff_country,
                                u'value': aff_value
                            })

                    if not (author_affs or affs):
                        rerror('no affs for author: %s. Skip this record.' % surname, record)
                        return

                    authors.append({
                        'full_name': full_name,
                        'given_name': given_name,
                        'surname': surname,
                        'affiliations': author_affs or affs
                    })

            if authors:
                record.json['authors'] = authors
                flag_modified(record, 'json')
                rinfo('updated', record)
            else:
                rerror('No authors found', record)

        else:
            # Exactly one author group at this point; the loop shape is kept
            # in case multiple author groups get supported here later.
            for x_author_group in x_author_groups:
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')

                if 'authors' not in record.json:
                    # Type 1 and 3: has no authors at all.
                    # FIXME: the intended fix is to add the collaborations
                    # (ce:collaboration / ce:text) as authors when the XML has
                    # affiliations; that step was never enabled because it
                    # would cause a lot more problems now, so skip the record.
                    rerror('No authors... SKIPPING', record)
                    return

                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(1 for author in authors if 'affiliations' in author)
                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = []
                    for a in x_affiliations:
                        value = first_text(a, 'ce:textfn')
                        new_affs.append({
                            u'country': find_country(value),
                            u'value': value
                        })

                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)
                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # Every author receives the same affiliation list object.
                        for i, _ in enumerate(record.json.get('authors')):
                            record.json['authors'][i]['affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)

                elif aff_count == len(authors):
                    empty_aff_count = sum(1 for author in authors if len(author['affiliations']) == 0)
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                else:
                    rerror('Only SOME authors have affiliations. What now?', record)

        rinfo('OK', record)