def proc(record, parser):
    """Run the HTML-entity unescape fixes (abstract + titles) on one record.

    Skips records whose ``json`` payload is missing.
    """
    if record.json is None:
        rerror('record.json is None', record)
        return
    # Apply each unescape pass in turn; both mutate record.json in place.
    for fix in (unescape_abstract, unescape_titles):
        fix(record, parser)
def proc_find(record):
    """Record the id of any record that has an author entry with all-empty fields.

    Appends to the module-level ``missing_authors`` list and stops at the
    first such author found.
    """
    if not record.json or 'authors' not in record.json:
        return
    for author in record.json['authors']:
        # An author whose every field is falsy (empty strings/lists/None).
        if not any(bool(value) for value in author.values()):
            rerror('error', record)
            missing_authors.append(record.id)
            return
def proc(record):
    """Backfill 'record_creation_year' from 'record_creation_date'.

    Does nothing when the record has no json payload or already carries a
    'record_creation_year'. Marks the json column modified only when the
    year was actually written.
    """
    if not record.json:
        rerror('no json.', record)
        return
    if 'record_creation_year' not in record.json:
        date = parse_date(record.json['record_creation_date'])
        if not date:
            rerror("Date couldn't be parsed: %s" % record.json['record_creation_date'], record)
            # BUG FIX: previously fell through and crashed on date.year (None).
            return
        record.json['record_creation_year'] = date.year
        flag_modified(record, 'json')
def proc(record):
    """Backfill 'record_creation_year' from 'record_creation_date'.

    Does nothing when the record has no json payload or already carries a
    'record_creation_year'. Marks the json column modified only when the
    year was actually written.
    """
    if not record.json:
        rerror('no json.', record)
        return
    if 'record_creation_year' not in record.json:
        date = parse_date(record.json['record_creation_date'])
        if not date:
            rerror(
                "Date couldn't be parsed: %s" % record.json['record_creation_date'],
                record)
            # BUG FIX: previously fell through and crashed on date.year (None).
            return
        record.json['record_creation_year'] = date.year
        flag_modified(record, 'json')
def unescape_abstract(record, parser):
    """Unescape HTML entities in the record's single abstract value.

    Skips records with no abstract or with multiple abstracts. Flags the
    json column modified only when the text actually changed.
    Assumes record.json is present — callers guard for None.
    """
    if 'abstracts' not in record.json or len(record.json['abstracts']) == 0:
        rerror('Record has no abstracts.', record)
        return
    if len(record.json['abstracts']) > 1:
        # Message grammar fixed ("more then one abstracts" -> "more than one abstract").
        rerror('Record has more than one abstract (%d). Skipping.'
               % len(record.json['abstracts']), record)
        return
    original = record.json['abstracts'][0]['value']
    unescaped = parser.unescape(original)
    if unescaped != original:
        rinfo('Abstract changed.', record)
        record.json['abstracts'][0]['value'] = unescaped
        flag_modified(record, 'json')
def unescape_abstract(record, parser):
    """Unescape HTML entities in the record's single abstract value.

    Skips records with no abstract or with multiple abstracts. Flags the
    json column modified only when the text actually changed.
    Assumes record.json is present — callers guard for None.
    """
    if 'abstracts' not in record.json or len(record.json['abstracts']) == 0:
        rerror('Record has no abstracts.', record)
        return
    if len(record.json['abstracts']) > 1:
        # Message grammar fixed ("more then one abstracts" -> "more than one abstract").
        rerror(
            'Record has more than one abstract (%d). Skipping.'
            % len(record.json['abstracts']), record)
        return
    original = record.json['abstracts'][0]['value']
    unescaped = parser.unescape(original)
    if unescaped != original:
        rinfo('Abstract changed.', record)
        record.json['abstracts'][0]['value'] = unescaped
        flag_modified(record, 'json')
def unescape_titles(record, parser):
    """Unescape HTML entities in every title of the record.

    Fixes over the original:
    - a title dict without a 'title' key is now skipped (previously the
      rerror fell through and the next line raised KeyError);
    - change detection compares each value before/after unescaping (the old
      `unescaped != original` compared a list against the same in-place
      mutated dicts, so it was always False and flag_modified never ran);
    - log message corrected ('Authors changed.' -> 'Titles changed.').
    """
    if 'titles' not in record.json or len(record.json['titles']) == 0:
        rerror('Record has no titles.', record)
        return
    changed = False
    for title in record.json['titles']:
        if 'title' not in title:
            rerror('title key not in title', record)
            continue
        unescaped = parser.unescape(title['title'])
        if unescaped != title['title']:
            title['title'] = unescaped
            changed = True
    if changed:
        rinfo('Titles changed.', record)
        flag_modified(record, 'json')
def map_old_record(record, dry_run):
    """
    Maps the given record if needed to comply with the new schema.

    Following fields will be mapped:
     - page_nr will be a list of integers instead of list of strings
     - arxiv id will be put to the arxiv_eprints field
     - arxiv categories will be added if not yet present
     - "arxiv:" prefix will be removed from arxiv id
     - record_creation_date will be converted to iso format

    Following fields will be deleted at the end of the process:
     - _collections
     - report_numbers
     - files
     - local_files
     - free_keywords
     - additional_files
     - file_urls
     - earliest_date

    The result won't be saved and None will be returned in the following cases:
     - the record doesn't contain a json
     - a record fails the validation after mapping
     - both report_numbers and arxiv_eprints fields are present
       (shouldn't happen in the existing records)
     - there is more than one value in report_numbers field
       (shouldn't happen in the existing records)
     - report_numbers field is present, but there is no source subfield
     - no record_creation_date is present
    """
    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if "report_numbers" in record.json and "arxiv_eprints" in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return

    if "report_numbers" in record.json:
        if len(record.json["report_numbers"]) > 1:
            rerror('report_numbers has more then one element. Skip record.', record)
            return

        # Find the (single) report number whose source marks it as an arXiv id.
        arxiv_id = None
        for element in record.json.get("report_numbers", ()):
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return
            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')
                break

        if arxiv_id:
            # Normalize and migrate: strip the 'arxiv:' prefix, move to arxiv_eprints.
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return

    # add arxiv category if not yet present
    if "arxiv_eprints" in record.json:
        for element in record.json.get("arxiv_eprints", ()):
            if 'value' not in element:
                rerror('arxiv_eprints value missing', record)
                continue
            arxiv_id = element['value']
            # remove arxiv prefix if present
            if arxiv_id.lower().startswith('arxiv:'):
                rinfo('removing "arxiv:" prefix', record)
                arxiv_id = arxiv_id[len('arxiv:'):]
            if 'categories' not in element:
                # NOTE(review): presumably an external lookup by arXiv id — confirm
                # get_arxiv_categories behavior on unknown ids.
                categories = get_arxiv_categories(arxiv_id)
                element['categories'] = categories

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return
    new_date = parse_date(record_creation_date).isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        # pop() logs only fields that were actually present.
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record against the JSON schema referenced by the record itself
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        # Fetch the schema over HTTP with retries, then run jsonschema validation.
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)
        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
def proc(record):
    """Re-extract authors and affiliations for a record from its attached XML.

    Reads the first 'xml' entry in record.json['_files'], parses it with the
    DOM parser, and walks the ce:* author-group markup (presumably Elsevier
    CE tagging — confirm). With multiple author groups, all authors are
    re-parsed from scratch; with a single group, only missing affiliations /
    collaborations are patched in.
    """
    rinfo('start...', record)
    if '_files' not in record.json:
        rerror('Skipping. No _files', record)
        return
    # NOTE(review): indexing the filter() result relies on Python 2 semantics
    # (a list); under Python 3 `xml[0]` would fail on a filter object.
    xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
    if not xml:
        rerror('Skipping. No xml in _files', record)
        return
    # NOTE(review): `object` shadows the builtin; left as-is in this doc pass.
    object = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
    uri = object.file.uri
    xml = parse(open(uri, 'rt'))
    x_author_groups = xml.getElementsByTagName('ce:author-group')
    if not x_author_groups:
        rerror('Skipping. No author groups.', record)
        return
    if len(x_author_groups) > 1:
        # Multiple author groups: rebuild the full author list from the XML.
        rinfo('Reparse all authors.', record)
        authors = []
        for x_author_group in x_author_groups:
            # skip if not deepest author-group
            if x_author_group.getElementsByTagName('ce:author-group'):
                continue
            # extract affiliations shared by the whole group
            x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
            affs = []
            for a in x_affiliations:
                value = a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                affs.append({
                    u'country': find_country(value),
                    u'value': value
                })
            # extract authors, add affiliations
            x_authors = x_author_group.getElementsByTagName('ce:author')
            for x_author in x_authors:
                given_name = x_author.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue
                surname = x_author.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue
                full_name = '%s, %s' % (surname, given_name)
                # per-author affiliations referenced via ce:cross-ref ids
                author_affs = []
                for ref in x_author.getElementsByTagName('ce:cross-ref'):
                    affid = ref.attributes.get('refid').value
                    if 'aff' in affid:
                        aff_value = get_aff_by_id(x_author_group, affid)
                        aff_country = find_country(aff_value)
                        author_affs.append({
                            u'country': aff_country,
                            u'value': aff_value
                        })
                # an author must end up with either personal or group affiliations
                if not (author_affs or affs):
                    rerror('no affs for author: %s. Skip this record.' % surname, record)
                    return
                authors.append({
                    'full_name': full_name,
                    'given_name': given_name,
                    'surname': surname,
                    'affiliations': author_affs or affs
                })
        if authors:
            record.json['authors'] = authors
            flag_modified(record, 'json')
            rinfo('updated', record)
        else:
            rerror('No authors found', record)
    else:
        # Single author group: patch missing collaborations / affiliations only.
        for x_author_group in x_author_groups:
            x_collaborations = x_author_group.getElementsByTagName('ce:collaboration')
            x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
            # needed for supporting multiple author groups with author matching,
            # but author matching is not rly possible.
            # authors_in_group = [
            #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
            #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
            #     for c in x_author_group.getElementsByTagName('ce:author')
            # ]
            if 'authors' not in record.json:
                # Type 1 and 3: has no authors at all. Fix: add collaborations
                # if there are affiliations in xml.
                rerror('No authors... SKIPPING', record)
                return
                # NOTE(review): everything below this return is unreachable —
                # looks like the collaboration fix path was disabled by the
                # early return above; confirm intent.
                # extract collaborations, find countries later
                # FIXME we should always extract collaborations, but that would
                # cause a lot more problems now.
                authors = [{'full_name': c.getElementsByTagName('ce:text')[0].childNodes[0].nodeValue}
                           for c in x_collaborations]
                if authors:
                    rinfo('Collaborations found: %s' % authors, record)
                    record.json['authors'] = authors
                else:
                    rerror('No collaborations. Not fixable.', record)
            # possibly we added authors in the previous step.
            if 'authors' in record.json:
                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(map(lambda x: 'affiliations' in x, authors))
                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = [
                        {u'country': find_country(a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue),
                         u'value': a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                         } for a in x_affiliations]
                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)
                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # update_authors(record, authors_in_group, new_affs)
                        for i, a in enumerate(record.json.get('authors')):
                            record.json['authors'][i]['affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)
                elif aff_count == len(authors):
                    # All authors carry an 'affiliations' key — check if they are all empty.
                    empty_aff_count = sum(map(lambda x: len(x['affiliations']) == 0, authors))
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                else:
                    rerror('Only SOME authors have affiliations. What now?', record)
    rinfo('OK', record)
def proc(record):
    """Re-encode the record's json payload via utf8rec and mark it modified."""
    if record.json is not None:
        record.json = utf8rec(record.json)
        flag_modified(record, 'json')
    else:
        rerror('record.json is None', record)
def proc(record):
    """Patch missing affiliations/collaborations for a record from its XML.

    Single-author-group variant: loads the first 'xml' file attached to the
    record, parses the ce:* markup (presumably Elsevier CE tagging — confirm)
    and fills in affiliations when the existing author data has none.
    Records with more than one author group are skipped.
    """
    rinfo('start...', record)
    if '_files' not in record.json:
        rerror('Skipping. No _files', record)
        return
    # NOTE(review): indexing the filter() result relies on Python 2 semantics
    # (a list); under Python 3 `xml[0]` would fail on a filter object.
    xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
    if not xml:
        rerror('Skipping. No xml in _files', record)
        return
    # NOTE(review): `object` shadows the builtin; left as-is in this doc pass.
    object = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
    uri = object.file.uri
    xml = parse(open(uri, 'rt'))
    x_author_groups = xml.getElementsByTagName('ce:author-group')
    if not x_author_groups:
        rerror('Skipping. No author groups.', record)
        return
    if len(x_author_groups) > 1:
        rerror('Skipping. MORE THEN ONE author group. Not supported.', record)
        return
    for x_author_group in x_author_groups:
        x_collaborations = x_author_group.getElementsByTagName(
            'ce:collaboration')
        x_affiliations = x_author_group.getElementsByTagName(
            'ce:affiliation')
        # needed for supporting multiple author groups with author matching,
        # but author matching is not rly possible.
        # authors_in_group = [
        #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
        #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
        #     for c in x_author_group.getElementsByTagName('ce:author')
        # ]
        if 'authors' not in record.json:
            # Type 1 and 3: has no authors at all. Fix: add collaborations
            # if there are affiliations in xml.
            rerror('No authors... SKIPPING', record)
            return
            # NOTE(review): everything below this return is unreachable —
            # looks like the collaboration fix path was disabled by the
            # early return above; confirm intent.
            # extract collaborations, find countries later
            # FIXME we should always extract collaborations, but that would
            # cause a lot more problems now.
            authors = [{
                'full_name': c.getElementsByTagName('ce:text')
                [0].childNodes[0].nodeValue
            } for c in x_collaborations]
            if authors:
                rinfo('Collaborations found: %s' % authors, record)
                record.json['authors'] = authors
            else:
                rerror('No collaborations. Not fixable.', record)
        # possibly we added authors in the previous step.
        if 'authors' in record.json:
            # Type 2 and 4: has authors, but no affiliations.
            authors = record.json['authors']
            aff_count = sum(map(lambda x: 'affiliations' in x, authors))
            if aff_count == 0:
                # Type 4: No affiliations in data.
                new_affs = [{
                    u'country': get_country_for_aff(a),
                    u'value': a.getElementsByTagName('ce:textfn')
                    [0].childNodes[0].nodeValue
                } for a in x_affiliations]
                if new_affs:
                    rinfo('New affiliations: %s' % new_affs, record)
                    # FIXME modify this, if multiple author groups should be supported
                    # FIXME (not all authors should be updated)!!!
                    # update_authors(record, authors_in_group, new_affs)
                    for i, a in enumerate(record.json.get('authors')):
                        record.json['authors'][i][
                            'affiliations'] = new_affs
                    flag_modified(record, 'json')
                else:
                    rerror('No affiliations at all. Not fixable.', record)
            elif aff_count == len(authors):
                # All authors carry an 'affiliations' key — check if all are empty.
                empty_aff_count = sum(
                    map(lambda x: len(x['affiliations']) == 0, authors))
                if empty_aff_count == len(authors):
                    # Type 2: Only empty affiliations.
                    rinfo('Type 2. Not fixable.', record)
                else:
                    rerror(
                        'Only SOME authors have EMPTY affiliations. What now?',
                        record)
            else:
                rerror('Only SOME authors have affiliations. What now?', record)
    rinfo('OK', record)