def proc(record):
    """Recompute the country of every affiliation stored in ``record.json``.

    For each affiliation the country is derived again from its ``value`` with
    ``find_country``. When it differs from the stored one the change is logged,
    counted in ``counts['changed']`` and written back into ``record.json``.
    ``counts['all']`` is incremented for every affiliation visited.

    Records without ``authors`` and authors without ``affiliations`` are
    reported through ``error`` and skipped. Any exception raised while
    processing is reported through ``error`` instead of propagating.

    :param record: object exposing the record metadata as ``record.json``.
    """
    try:
        if 'authors' not in record.json:
            error('no authors for record %s' % record.json['control_number'])
            return

        for author_index, author_data in enumerate(record.json['authors']):
            if 'affiliations' not in author_data:
                error('no affiliations for record %s' % record.json['control_number'])
                continue

            for aff_index, aff_data in enumerate(author_data['affiliations']):
                counts['all'] += 1

                # An affiliation may not have a country yet; treat that as
                # "no country" rather than raising KeyError, which would make
                # the handler below abandon the rest of the record.
                old_country = aff_data.get('country')
                new_country = find_country(aff_data['value'])
                if old_country != new_country:
                    counts['changed'] += 1

                    info('Changed country for record with id %s from %s to %s' % (
                        record.json['control_number'], old_country, new_country))
                    record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country

        # NOTE(review): placement reconstructed from whitespace-collapsed
        # source; assumed to run once per record after all authors - confirm.
        if not dry_run:
            flag_modified(record, 'json')
    except Exception as e:
        error(str(e))
def test_cache():
    """Store a ``CountryCache`` row and check ``find_country`` returns its country."""
    cached_key = "Some cached value2"
    cached_country = "Noland"

    # Persist the cache entry before querying.
    entry = CountryCache()
    entry.key = cached_key
    entry.country = cached_country
    db.session.add(entry)
    db.session.commit()

    resolved = find_country(cached_key)
    assert resolved == cached_country
def test_cache():
    """Check that a committed ``CountryCache`` entry is what ``find_country`` returns."""
    lookup_key = "some cached value2"
    expected_country = "Noland"

    # Create and commit the cache row the lookup is expected to return.
    cache_row = CountryCache()
    cache_row.key = lookup_key
    cache_row.country = expected_country
    db.session.add(cache_row)
    db.session.commit()

    found = find_country(lookup_key)
    assert found == expected_country
def parse_inspire_records(size, query, jrec=1):
    """Convert the records returned by ``fetch_url`` into a hits dictionary.

    :param size: forwarded to ``fetch_url`` as its second argument.
    :param query: forwarded to ``fetch_url`` as its third argument.
    :param jrec: forwarded to ``fetch_url`` as its first argument.
    :returns: ``{'hits': {'hits': [...], 'total': total}}`` where ``total`` is
        the first value returned by ``fetch_url`` and each hit is
        ``{'_source': {...}}`` with ``authors``, ``publication_info``,
        ``control_number``, ``dois`` and ``record_creation_date``.

    Records lacking the control number, DOI, creation date or journal
    title/volume fields are left out of the result.
    """
    articles = {'hits': {'hits': [], 'total': 0}}
    articles['hits']['total'], records = fetch_url(jrec, size, query)

    for r in records:
        # Tag 100 and tag 700 datafields are both read as authors.
        author_nodes = r.findall('./a:datafield[@tag="100"]', inspire_namespace)
        author_nodes.extend(
            r.findall('./a:datafield[@tag="700"]', inspire_namespace))

        json_authors = []
        for author in author_nodes:
            json_author = {
                'full_name': author.find('./a:subfield[@code="a"]',
                                         inspire_namespace).text.encode('utf-8'),
                'affiliations': []
            }
            for aff in author.findall('./a:subfield[@code="v"]',
                                      inspire_namespace):
                # Encode once and reuse for both the lookup and the stored value.
                aff_value = aff.text.encode('utf-8')
                json_author['affiliations'].append({
                    'value': aff_value,
                    'country': find_country(aff_value)
                })
            json_authors.append(json_author)

        try:
            control_number = int(
                r.find('./a:controlfield[@tag="001"]', inspire_namespace).text)
            doi = r.find(
                './a:datafield[@tag="024"][@ind1="7"]/a:subfield[@code="a"]',
                inspire_namespace).text
            creation_date = r.find(
                './a:datafield[@tag="260"]/a:subfield[@code="c"]',
                inspire_namespace).text
            journal_title = (
                r.find('./a:datafield[@tag="773"]/a:subfield[@code="p"]',
                       inspire_namespace).text +
                r.find('./a:datafield[@tag="773"]/a:subfield[@code="v"]',
                       inspire_namespace).text)
        except (AttributeError, TypeError, ValueError):
            # A missing element (``find`` returned None), an empty element
            # (``text`` is None) or a non-numeric control number: skip the
            # record. Narrower than a bare ``except`` so KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            # todo: implement proper error handling
            continue

        articles['hits']['hits'].append({
            '_source': {
                'authors': json_authors,
                'publication_info': [{'journal_title': journal_title}],
                'control_number': control_number,
                'dois': [{'value': doi}],
                'record_creation_date': creation_date,
            }
        })

    return articles
def test_countries():
    """Check ``find_country`` against a fixed set of affiliation strings."""
    # (affiliation text, expected country) pairs, checked in this order.
    expectations = (
        ('CMS CERN Switzerland', 'CERN'),
        ('ETH Switzerland', 'Switzerland'),
        ('CMS CERN KEK', 'CERN'),
        ('KEK Japan', 'Japan'),
        ('Hungary University of Magic and Witchcraft', 'Hungary'),
        ('Rome', 'Italy'),
        ('Ankara', 'Turkey'),
    )

    for affiliation, expected_country in expectations:
        assert find_country(affiliation) == expected_country
def test_countries():
    """Verify the country ``find_country`` resolves for known affiliations."""
    # Each case pairs an affiliation string with the country it must map to.
    cases = (
        ('CMS CERN Switzerland', 'CERN'),
        ('ETH Switzerland', 'Switzerland'),
        ('CMS CERN KEK', 'CERN'),
        ('KEK Japan', 'Japan'),
        ('Hungary University of Magic and Witchcraft', 'Hungary'),
        ('Rome', 'Italy'),
        ('Ankara', 'Turkey'),
    )

    for case in cases:
        text, country = case
        resolved = find_country(text)
        assert resolved == country
def proc(record):
    """Re-derive each affiliation's country and store the ones that differ.

    Every affiliation of every author in ``record.json`` is passed through
    ``find_country``; when the result differs from the stored ``country`` the
    change is logged with ``rinfo``, written back and the ``json`` attribute is
    flagged as modified. Records with no json or no ``authors`` are left alone.
    """
    if not record.json or 'authors' not in record.json:
        return

    for author in record.json['authors']:
        for affiliation in author.get('affiliations', ()):
            stored = affiliation.get('country')
            detected = find_country(affiliation['value'])
            if stored == detected:
                continue

            rinfo('%s -> %s (%s)' % (stored, detected, affiliation['value']), record)
            # ``affiliation`` is the dict held inside record.json, so this
            # updates the record in place.
            affiliation['country'] = detected
            flag_modified(record, 'json')
def add_nations(obj, eng):
    """Set the ``country`` of every affiliation in ``obj.data['authors']``.

    The country is computed by ``find_country`` from the affiliation's
    ``value``. ``__halt_and_notify`` is called when the article has no
    ``authors`` key or an author has no ``affiliations`` key.
    NOTE(review): presumably ``__halt_and_notify`` interrupts execution; if it
    returns, the lookups that follow raise KeyError - confirm.
    """
    if 'authors' not in obj.data:
        __halt_and_notify('No authors for article.', obj, eng)

    for author in obj.data['authors']:
        if 'affiliations' not in author:
            __halt_and_notify('No affiliations for author: %s.' % author,
                              obj, eng)

        for affiliation in author['affiliations']:
            # ``affiliation`` is the dict stored in obj.data; update it in place.
            affiliation['country'] = find_country(affiliation['value'])
def proc(record):
    """Fix country mappings of all affiliations in ``record.json``.

    For every affiliation the country is recomputed from its ``value`` with
    ``find_country``. If it differs from the stored one, the change is logged
    with ``rinfo``, written back into ``record.json`` and the ``json``
    attribute is flagged as modified. Records with no json or no ``authors``
    are left untouched; authors without ``affiliations`` are skipped.

    :param record: object exposing the record metadata as ``record.json``.
    """
    if record.json and 'authors' in record.json:
        for i, a in enumerate(record.json['authors']):
            for i2, aff in enumerate(a.get('affiliations', ())):
                # Affiliations without a stored country must not abort the
                # run with KeyError; a missing country compares as None and
                # is filled in below.
                c = aff.get('country')
                new_c = find_country(aff['value'])
                if c != new_c:
                    rinfo('%s -> %s (%s)' % (c, new_c, aff['value']), record)
                    record.json['authors'][i]['affiliations'][i2]['country'] = new_c
                    flag_modified(record, 'json')
def parse_inspire_records(size, query, jrec=1):
    """Build a hits dictionary from the records returned by ``fetch_url``.

    :param size: passed through to ``fetch_url``.
    :param query: passed through to ``fetch_url``.
    :param jrec: passed through to ``fetch_url`` as its first argument.
    :returns: ``{'hits': {'hits': [...], 'total': total}}``; ``total`` is the
        first value returned by ``fetch_url`` and every hit has the form
        ``{'_source': {...}}``.

    A record is dropped when its control number, DOI, creation date or
    journal title/volume cannot be read.
    """
    articles = {'hits': {'hits': [], 'total': 0}}
    articles['hits']['total'], records = fetch_url(jrec, size, query)

    for r in records:
        json_record = {'_source': {'authors': [], 'publication_info': []}}
        source = json_record['_source']

        # Authors are read from both the tag 100 and tag 700 datafields.
        authors = r.findall('./a:datafield[@tag="100"]', inspire_namespace)
        authors.extend(r.findall('./a:datafield[@tag="700"]', inspire_namespace))

        for author in authors:
            full_name = author.find('./a:subfield[@code="a"]',
                                    inspire_namespace).text.encode('utf-8')
            affiliations = []
            for aff in author.findall('./a:subfield[@code="v"]', inspire_namespace):
                # Encode once; the same value feeds the lookup and the result.
                value = aff.text.encode('utf-8')
                affiliations.append({'value': value,
                                     'country': find_country(value)})
            source['authors'].append({'full_name': full_name,
                                      'affiliations': affiliations})

        try:
            source['control_number'] = int(
                r.find('./a:controlfield[@tag="001"]', inspire_namespace).text)
            source['dois'] = [{
                'value': r.find(
                    './a:datafield[@tag="024"][@ind1="7"]/a:subfield[@code="a"]',
                    inspire_namespace).text
            }]
            source['record_creation_date'] = r.find(
                './a:datafield[@tag="260"]/a:subfield[@code="c"]',
                inspire_namespace).text
            title = r.find('./a:datafield[@tag="773"]/a:subfield[@code="p"]',
                           inspire_namespace).text
            volume = r.find('./a:datafield[@tag="773"]/a:subfield[@code="v"]',
                            inspire_namespace).text
            source['publication_info'].append({'journal_title': title + volume})
        except (AttributeError, TypeError, ValueError):
            # ``find`` returned None, an element had no text, or the control
            # number was not an integer. Catching only these (instead of a
            # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
            # todo: implement proper error handling
            continue

        articles['hits']['hits'].append(json_record)

    return articles
def add_nations(obj, eng):
    """Fill in the ``country`` of each author affiliation in ``obj.data``.

    Countries come from ``find_country`` applied to the affiliation ``value``.
    ``__halt_and_notify`` is invoked when ``obj.data`` has no ``authors`` or an
    author has no ``affiliations``.
    NOTE(review): presumably ``__halt_and_notify`` stops processing; otherwise
    the following key lookups raise KeyError - confirm.
    """
    if 'authors' not in obj.data:
        __halt_and_notify('No authors for article.', eng)

    for author_pos, author in enumerate(obj.data['authors']):
        if 'affiliations' not in author:
            __halt_and_notify('No affiliations for author: %s.' % author, eng)

        for aff_pos, affiliation in enumerate(author['affiliations']):
            country = find_country(affiliation['value'])
            target = obj.data['authors'][author_pos]['affiliations'][aff_pos]
            target['country'] = country
def proc(record):
    """Rebuild or complete ``record.json['authors']`` from the record's XML file.

    The first entry of ``record.json['_files']`` whose ``filetype`` is ``xml``
    is loaded through ``ObjectVersion.get`` and parsed.

    * More than one ``ce:author-group``: all authors are re-parsed from the
      innermost groups. Each author gets the affiliations referenced by its
      ``ce:cross-ref`` elements, or the group's affiliations when it has none.
      The result replaces ``record.json['authors']``.
    * Exactly one ``ce:author-group``: if none of the existing authors has an
      ``affiliations`` key, the group's affiliations are assigned to every
      author. Other combinations are only reported.

    Problems are reported through ``rerror`` and progress through ``rinfo``;
    several of them end processing of the record early.

    :param record: object exposing the record metadata as ``record.json``.
    """

    def text_of(node, tag):
        # Text of the first child node of the first ``tag`` element below ``node``.
        return node.getElementsByTagName(tag)[0].childNodes[0].nodeValue

    rinfo('start...', record)

    if '_files' not in record.json:
        rerror('Skipping. No _files', record)
        return

    # A list comprehension always yields a list, so the emptiness check and
    # the ``[0]`` lookups work on Python 3 as well (``filter`` returns an
    # iterator there).
    xml_files = [f for f in record.json['_files'] if f['filetype'] == 'xml']
    if not xml_files:
        rerror('Skipping. No xml in _files', record)
        return

    file_version = ObjectVersion.get(xml_files[0]['bucket'], xml_files[0]['key'])
    uri = file_version.file.uri
    # Close the file handle once parsing is done instead of leaking it.
    with open(uri, 'rt') as xml_file:
        xml = parse(xml_file)
    x_author_groups = xml.getElementsByTagName('ce:author-group')

    if not x_author_groups:
        rerror('Skipping. No author groups.', record)
        return

    if len(x_author_groups) > 1:
        rinfo('Reparse all authors.', record)
        authors = []

        for x_author_group in x_author_groups:
            # skip if not deepest author-group
            if x_author_group.getElementsByTagName('ce:author-group'):
                continue

            # extract affiliations
            x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
            affs = []
            for a in x_affiliations:
                value = text_of(a, 'ce:textfn')
                affs.append({
                    u'country': find_country(value),
                    u'value': value
                })

            # extract authors, add affiliations
            x_authors = x_author_group.getElementsByTagName('ce:author')
            for x_author in x_authors:
                given_name = text_of(x_author, 'ce:given-name')
                surname = text_of(x_author, 'ce:surname')
                full_name = '%s, %s' % (surname, given_name)

                author_affs = []
                for ref in x_author.getElementsByTagName('ce:cross-ref'):
                    affid = ref.attributes.get('refid').value
                    if 'aff' in affid:
                        aff_value = get_aff_by_id(x_author_group, affid)
                        aff_country = find_country(aff_value)
                        author_affs.append({
                            u'country': aff_country,
                            u'value': aff_value
                        })

                if not (author_affs or affs):
                    rerror('no affs for author: %s. Skip this record.' % surname, record)
                    return

                authors.append({
                    'full_name': full_name,
                    'given_name': given_name,
                    'surname': surname,
                    # Fall back to the group's affiliations when the author
                    # references none of its own.
                    'affiliations': author_affs or affs
                })

        if authors:
            record.json['authors'] = authors
            flag_modified(record, 'json')
            rinfo('updated', record)
        else:
            rerror('No authors found', record)

    else:
        for x_author_group in x_author_groups:
            x_collaborations = x_author_group.getElementsByTagName('ce:collaboration')
            x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')

            # needed for supporting multiple author groups with author matching, but author matching is not rly possible.
            # authors_in_group = [
            #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
            #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
            #     for c in x_author_group.getElementsByTagName('ce:author')
            # ]

            if 'authors' not in record.json:
                # Type 1 and 3: has no authors at all. Fix: add collaborations if there are affiliations in xml.
                rerror('No authors... SKIPPING', record)
                return

                # NOTE(review): everything below in this branch is unreachable
                # because of the ``return`` above. Indentation was reconstructed
                # from whitespace-collapsed source; it is assumed this fix was
                # disabled on purpose - confirm.
                # extract collaborations, find countries later
                # FIXME we should always extract collaborations, but that would cause a lot more problems now.
                authors = [{'full_name': text_of(c, 'ce:text')} for c in x_collaborations]
                if authors:
                    rinfo('Collaborations found: %s' % authors, record)
                    record.json['authors'] = authors
                else:
                    rerror('No collaborations. Not fixable.', record)

            # possibly we added authors in the previous step.
            if 'authors' in record.json:
                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(map(lambda x: 'affiliations' in x, authors))
                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = []
                    for a in x_affiliations:
                        value = text_of(a, 'ce:textfn')
                        new_affs.append({
                            u'country': find_country(value),
                            u'value': value
                        })

                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)
                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # update_authors(record, authors_in_group, new_affs)
                        for i, a in enumerate(record.json.get('authors')):
                            record.json['authors'][i]['affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)
                elif aff_count == len(authors):
                    empty_aff_count = sum(map(lambda x: len(x['affiliations']) == 0, authors))
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                else:
                    rerror('Only SOME authors have affiliations. What now?', record)

    rinfo('OK', record)