def _files(record, extra_data):
    """Check that the record has exactly the files required for its journal.

    The required file types (e.g. xml, pdf, pdfa) are looked up in the
    ``COMPLIANCE_JOURNAL_FILES`` application config under the record's first
    journal and compared with the ``filetype`` of every entry in the
    record's ``_files``.

    :param record: record metadata to check.
    :param extra_data: not used by this check.
    :return: tuple ``(check_accepted, details, None)``. ``details`` names the
        missing and/or extra file types when the check is not accepted.
    """
    journal = get_first_journal(record)
    required_files = current_app.config.get(
        'COMPLIANCE_JOURNAL_FILES', {}).get(journal)

    if not required_files:
        return True, ('No required files defined!', ), None

    # Normalise to a set so the comparison and the differences below also
    # work when the configuration holds a list or a tuple.
    required_files = set(required_files)
    available_files = {f.get('filetype') for f in record.get('_files', ())}

    check_accepted = required_files == available_files
    details = []

    if not check_accepted:
        # Each type is rendered as text (a file without 'filetype' yields
        # None, which str.join would reject) and sorted so that the message
        # does not depend on set iteration order.
        missing_files = ', '.join(
            sorted('%s' % (t, ) for t in required_files - available_files))
        if missing_files:
            details.append('Missing files: %s' % missing_files)

        extra_files = ', '.join(
            sorted('%s' % (t, ) for t in available_files - required_files))
        if extra_files:
            details.append('Extra files: %s' % extra_files)

    return check_accepted, details, None
def _arxiv(record, extra_data):
    """Check that the record's primary arXiv category is an accepted one.

    Journals not listed in the ``ARTICLE_CHECK_HAS_TO_BE_HEP`` config pass
    unconditionally. For the listed journals the record passes only if it
    has a primary arXiv category contained in ``ARXIV_HEP_CATEGORIES``.

    :param record: record metadata to check.
    :param extra_data: not used by this check.
    :return: tuple ``(check_accepted, details, None)``.
    """
    journal = get_first_journal(record)

    # `or ()` covers both a missing setting and an explicit None, which
    # would otherwise make the `in` tests below raise TypeError.
    mandatory_journals = current_app.config.get(
        'ARTICLE_CHECK_HAS_TO_BE_HEP') or ()
    if journal not in mandatory_journals:
        return True, ("Doesn't have to be hep", ), None

    # get the primary category
    primary = get_arxiv_primary_category(record)
    if primary:
        hep_categories = current_app.config.get('ARXIV_HEP_CATEGORIES') or ()
        check_accepted = primary in hep_categories
        return check_accepted, ('Primary category: %s' % primary, ), None

    # without an arXiv category a journal with a mandatory check fails
    return False, ('No arXiv id', ), None
def affiliations_export(country=None, year=None):
    """
    Creates affiliation data filtered by country and year.

    Pages through the search index and, for every matching article, counts
    how often each (affiliation value, country) pair occurs among its
    authors.

    :param country: only affiliations for this country will be included.
        If None (or an empty string), all countries are included.
    :param year: only articles *published* in this year will be included.
        If None, all articles are included.
    :return: dict with a 'header' list of column names and a 'data' list
        holding one row per article and (affiliation, country) pair.
    """
    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    result_headers = [
        'year', 'journal', 'doi', 'arxiv number', 'primary arxiv category',
        'country', 'affiliation', 'authors with affiliation',
        'total number of authors'
    ]
    result_data = []
    index = 0
    include_all_countries = country in (None, '')

    query = get_query_string(country=country, year=year)
    logger.info(
        'Searching for affiliations of country: {} and year: {}'.format(
            country if country else 'ALL', year if year else 'ALL'))

    # The total is read from the first page, so no separate query is needed
    # just to learn the number of hits.
    total_hits = None
    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        logger.warning('INDEX NUMBER {}'.format(index))
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)

        if total_hits is None:
            # Elasticsearch 7+ reports the total as {'value': ..., ...},
            # older versions as a plain integer.
            total = search_results['hits']['total']
            total_hits = total['value'] if isinstance(total, dict) else total
            logger.info('Total results from query: {}'.format(total_hits))

        hits = search_results['hits']['hits']
        if not hits:
            # An empty page would never advance `index`; stop instead of
            # looping forever.
            if index < total_hits:
                logger.warning(
                    'Expected {} results but only {} were returned'.format(
                        total_hits, index))
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # named differently from the `year` parameter to avoid shadowing
            publication_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)
            missing_author_affiliations = 0

            extracted_affiliations = Counter()
            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the
                # schema)
                if 'affiliations' not in author:
                    missing_author_affiliations += 1
                    continue

                # aggregate affiliations
                for aff in author['affiliations']:
                    aff_country = aff.get('country', 'UNKNOWN')
                    if include_all_countries or aff_country == country:
                        extracted_affiliations[
                            (aff['value'], aff_country)] += 1

            if not extracted_affiliations:
                logger.warning(
                    'Article with DOI: {} had no extracted affiliations'.
                    format(doi))

            if missing_author_affiliations:
                logger.warning(
                    'Article with DOI: {} had missing affiliations in {} / {} authors'
                    .format(doi, missing_author_affiliations, total_authors))

            # add extracted information to result list
            for (aff_value, aff_country), count in \
                    extracted_affiliations.items():
                result_data.append([
                    publication_year, journal, doi, arxiv, arxiv_category,
                    aff_country, aff_value, count, total_authors
                ])

    return {'header': result_headers, 'data': result_data}
def authors_export(country=None, year=None):
    """
    Creates author and affiliation data filtered by country and year.

    Pages through the search index and emits one row per author and
    affiliation of every matching article.

    :param country: passed to the search query so that only articles
        matching this country are returned. If None, all countries are
        included.
    :param year: only articles *published* in this year will be included.
        If None, all articles are included.
    :return: dict with a 'header' list of column names and a 'data' list
        holding one row per (article, author, affiliation).
    """
    size = current_app.config.get('TOOL_ELASTICSEARCH_PAGE_SIZE', 100)
    search_index = current_app.config.get('SEARCH_UI_SEARCH_INDEX')
    source_fields = [
        'publication_info.year',
        'publication_info.journal_title',
        'arxiv_eprints',
        'dois',
        'authors',
        'control_number',
    ]
    query = get_query_string(country=country, year=year)

    result_data = []
    index = 0
    total_hits = None
    while total_hits is None or index < total_hits:
        # query ElasticSearch for result
        search_results = current_search_client.search(q=query,
                                                      index=search_index,
                                                      _source=source_fields,
                                                      size=size,
                                                      from_=index)

        # Elasticsearch 7+ reports the total as {'value': ..., ...}, older
        # versions as a plain integer; comparing `index` against the raw
        # dict would fail, so normalise to an int first.
        total = search_results['hits']['total']
        total_hits = total['value'] if isinstance(total, dict) else total

        hits = search_results['hits']['hits']
        if not hits:
            # An empty page would never advance `index`; stop instead of
            # looping forever.
            if index < total_hits:
                logger.warning(
                    'Expected {} results but only {} were returned'.format(
                        total_hits, index))
            break
        index += len(hits)

        # extract and add data to result list
        for hit in hits:
            record = hit['_source']

            # named differently from the `year` parameter to avoid shadowing
            publication_year = record['publication_info'][0]['year']
            journal = get_first_journal(record)
            doi = get_first_doi(record)
            arxiv = get_clean_arXiv_id(record)
            arxiv_category = get_arxiv_primary_category(record)

            authors = record.get('authors', ())
            total_authors = len(authors)

            for author in authors:
                # if there are no affiliations, we cannot add this author
                # (this also means the record is not valid according to the
                # schema)
                if 'affiliations' not in author:
                    logger.warning(
                        'No affiliations for author. doi=%s' % doi)
                    continue

                author_name = author.get('full_name', 'UNKNOWN')
                # add extracted information to result list
                for affiliation in author['affiliations']:
                    aff_country = affiliation.get('country', 'UNKNOWN')
                    aff_value = affiliation['value']
                    result_data.append([
                        publication_year, journal, doi, arxiv,
                        arxiv_category, author_name, aff_country, aff_value,
                        total_authors
                    ])

    return {
        'header': [
            'year', 'journal', 'doi', 'arxiv number',
            'primary arxiv category', 'author', 'country', 'affiliation',
            'total number of authors'
        ],
        'data': result_data
    }