def run(self):
    # database setup
    database = 'dev' if self.test else 'production'
    logging.warning(f"Using {database} database")
    self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
    Base.metadata.create_all(self.engine)

    with db_session(self.engine) as session:
        field_mapping = {'paper': 'mag_id',
                         'paperTitle': 'title',
                         'fieldsOfStudy': 'fields_of_study',
                         'citationCount': 'citation_count'}

        logging.info("Querying database for articles without MAG authors or "
                     "fields of study, and with a doi")
        articles_to_process = [
            dict(id=a.id, doi=a.doi, title=a.title)
            for a in (session
                      .query(Article)
                      .filter((Article.mag_authors.is_(None)
                               | ~Article.fields_of_study.any())
                              & Article.doi.isnot(None))
                      .all())
        ]
        total_arxiv_ids_to_process = len(articles_to_process)
        logging.info(f"{total_arxiv_ids_to_process} articles to process")

        all_articles_to_update = BatchWriter(self.insert_batch_size,
                                             update_existing_articles,
                                             self.engine)

        for count, row in enumerate(query_articles_by_doi(articles_to_process),
                                    start=1):
            # rename fields from the MAG response to the database schema
            for code, description in field_mapping.items():
                try:
                    row[description] = row.pop(code)
                except KeyError:
                    pass

            if row.get('citation_count') is not None:
                row['citation_count_updated'] = date.today()

            # reformat fos_ids out of entity urls
            try:
                fos = row.pop('fields_of_study')
                row['fields_of_study'] = {extract_entity_id(f)
                                          for f in fos.split(',')}
            except KeyError:
                # missing fields of study; use an empty set so the set
                # arithmetic against found_fos_ids below still works
                row['fields_of_study'] = set()
            except (AttributeError, TypeError):
                # either of these could occur when the same doi is present in 2
                # articles in the same batch, i.e. fos is already a set of ids
                logging.debug("Already processed")
                row['fields_of_study'] = fos

            # reformat mag_id out of entity url
            try:
                row['mag_id'] = extract_entity_id(row['mag_id'])
            except TypeError:
                # id has already been extracted
                pass

            # query for author and affiliation details
            try:
                author_ids = {extract_entity_id(a)
                              for a in row.pop('authors').split(',')}
                row['mag_authors'] = list(query_authors(author_ids))
            except KeyError:
                pass

            # drop unnecessary fields
            for f in ['score', 'title']:
                try:
                    del row[f]
                except KeyError:
                    pass

            # check fields of study exist in the database
            logging.debug('Checking fields of study exist in db')
            found_fos_ids = {fos.id
                             for fos in (session
                                         .query(FieldOfStudy)
                                         .filter(FieldOfStudy.id
                                                 .in_(row['fields_of_study']))
                                         .all())}

            missing_fos_ids = row['fields_of_study'] - found_fos_ids
            if missing_fos_ids:
                logging.info(f"Missing field of study ids: {missing_fos_ids}")
                fos_not_found = update_field_of_study_ids_sparql(self.engine,
                                                                 missing_fos_ids)
                # any fos not found in mag are removed to prevent foreign key
                # constraint errors when building the link table
                for fos in fos_not_found:
                    row['fields_of_study'].remove(fos)

            # add this row to the queue
            logging.debug(row)
            all_articles_to_update.append(row)

            if not count % 1000:
                logging.info(f"{count} done. "
                             f"{total_arxiv_ids_to_process - count} "
                             "articles left to process")

            if self.test and count == 150:
                logging.warning("Exiting after 150 rows in test mode")
                break

        # pick up any left over in the batch
        if all_articles_to_update:
            all_articles_to_update.write()

    # mark as done
    logging.warning("Task complete")
    self.output().touch()
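
# Illustration only -- a minimal BatchWriter sketch consistent with how run()
# uses it above (constructor arguments, append(), write(), and truthiness when
# rows are pending). This is an assumption about the interface, not the
# project's actual implementation.
class BatchWriter:
    """Buffers rows and flushes them in batches via a supplied function."""

    def __init__(self, batch_size, write_func, *args):
        self.batch_size = batch_size
        self.write_func = write_func
        self.args = args
        self.batch = []

    def __len__(self):
        # makes `if all_articles_to_update:` true only while rows are pending
        return len(self.batch)

    def append(self, row):
        # buffer the row, flushing automatically once the batch is full
        self.batch.append(row)
        if len(self.batch) >= self.batch_size:
            self.write()

    def write(self):
        # flush any buffered rows, e.g. update_existing_articles(rows, engine)
        if self.batch:
            self.write_func(self.batch, *self.args)
            self.batch = []
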
def test_extract_entity_id_returns_string_id(self):
    assert extract_entity_id('http://ma-graph.org/entity/test_id') == 'test_id'
    assert extract_entity_id('http://ma-graph.org/entity/another_id') == 'another_id'
    assert extract_entity_id('http://ma-graph.org/entity/grid.011.5') == 'grid.011.5'

def test_extract_entity_id_raises_value_error_when_not_found(self):
    with pytest.raises(ValueError):
        extract_entity_id('bad_url')

def test_extract_entity_id_returns_integer_id(self):
    assert extract_entity_id('http://ma-graph.org/entity/109214941') == 109214941
    assert extract_entity_id('http://ma-graph.org/entity/19694890') == 19694890
    assert extract_entity_id('http://ma-graph.org/entity/13203339') == 13203339
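
# Illustration only -- a reference implementation of extract_entity_id implied
# by the tests above: it takes the trailing segment of a ma-graph.org entity
# url, returns it as an int when numeric (otherwise as a string), and raises
# ValueError when the url does not match. An assumption, not the project's code.
import re

def extract_entity_id(entity_url):
    match = re.match(r'.+/entity/(.+)$', entity_url)
    if match is None:
        raise ValueError(f"Unable to extract id from {entity_url}")
    entity_id = match.group(1)
    try:
        # numeric ids, e.g. 109214941, come back as ints
        return int(entity_id)
    except ValueError:
        # non-numeric ids, e.g. grid.011.5, come back unchanged as strings
        return entity_id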