Code example #1
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        Base.metadata.create_all(self.engine)

        with db_session(self.engine) as session:
            field_mapping = {
                'paper': 'mag_id',
                'paperTitle': 'title',
                'fieldsOfStudy': 'fields_of_study',
                'citationCount': 'citation_count'
            }

            logging.info(
                "Querying database for articles with a doi and missing "
                "authors or fields of study"
            )
            articles_to_process = [
                dict(id=a.id, doi=a.doi, title=a.title)
                for a in (session.query(Article).filter(
                    (Article.mag_authors.is_(None)
                     | ~Article.fields_of_study.any())
                    & Article.doi.isnot(None)).all())
            ]
            total_articles_to_process = len(articles_to_process)
            logging.info(f"{total_articles_to_process} articles to process")

            all_articles_to_update = BatchWriter(self.insert_batch_size,
                                                 update_existing_articles,
                                                 self.engine)

            for count, row in enumerate(
                    query_articles_by_doi(articles_to_process), start=1):
                # rename fields to match the database schema
                for old_key, new_key in field_mapping.items():
                    try:
                        row[new_key] = row.pop(old_key)
                    except KeyError:
                        pass

                if row.get('citation_count') is not None:
                    row['citation_count_updated'] = date.today()

                # reformat fos_ids out of entity urls
                try:
                    fos = row.pop('fields_of_study')
                    row['fields_of_study'] = {
                        extract_entity_id(f)
                        for f in fos.split(',')
                    }
                except KeyError:
                    # missing fields of study; use an empty set so the set
                    # operations further down still work
                    row['fields_of_study'] = set()
                except (AttributeError, TypeError):
                    # either of these could occur when the same doi is present in 2
                    # articles in the same batch
                    logging.debug("Already processed")
                    row['fields_of_study'] = fos

                # reformat mag_id out of entity url
                try:
                    row['mag_id'] = extract_entity_id(row['mag_id'])
                except TypeError:
                    # id has already been extracted
                    pass

                # query for author and affiliation details
                try:
                    author_ids = {
                        extract_entity_id(a)
                        for a in row.pop('authors').split(',')
                    }
                    row['mag_authors'] = list(query_authors(author_ids))
                except KeyError:
                    pass

                # drop unnecessary fields
                for f in ['score', 'title']:
                    try:
                        del row[f]
                    except KeyError:
                        pass

                # check fields of study exist in the database
                logging.debug('Checking fields of study exist in db')
                found_fos_ids = {
                    fos.id
                    for fos in (session.query(FieldOfStudy).filter(
                        FieldOfStudy.id.in_(row['fields_of_study'])).all())
                }

                missing_fos_ids = row['fields_of_study'] - found_fos_ids
                if missing_fos_ids:
                    logging.info(
                        f"Missing field of study ids: {missing_fos_ids}")
                    fos_not_found = update_field_of_study_ids_sparql(
                        self.engine, missing_fos_ids)
                    # any fos not found in mag are removed to prevent foreign key
                    # constraint errors when building the link table
                    for fos in fos_not_found:
                        row['fields_of_study'].remove(fos)

                # add this row to the queue
                logging.debug(row)
                all_articles_to_update.append(row)

                if not count % 1000:
                    logging.info(
                        f"{count} done. {total_articles_to_process - count} articles left to process"
                    )
                if self.test and count == 150:
                    logging.warning("Exiting after 150 rows in test mode")
                    break

            # pick up any left over in the batch
            if all_articles_to_update:
                all_articles_to_update.write()

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
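
The `BatchWriter` used above buffers rows and flushes them to the database in chunks of `insert_batch_size`, with the final `write()` call picking up any partial batch. Its real implementation lives elsewhere in the nesta project; the sketch below only illustrates the append/flush behaviour that `run()` relies on, and it assumes the writer function receives the batch followed by the extra constructor arguments (here `update_existing_articles(batch, engine)`).

class BatchWriter(list):
    """Sketch of a batched writer: collect rows, flush every batch_size."""

    def __init__(self, batch_size, write_func, *args):
        super().__init__()
        self.batch_size = batch_size  # flush threshold
        self.write_func = write_func  # e.g. update_existing_articles
        self.args = args              # extra arguments, e.g. the engine

    def append(self, row):
        # buffer the row and flush automatically once the batch is full
        super().append(row)
        if len(self) >= self.batch_size:
            self.write()

    def write(self):
        # hand a copy of the buffered rows to the writer, then reset
        self.write_func(list(self), *self.args)
        self.clear()

With this shape, the truthiness check `if all_articles_to_update:` in `run()` is just asking whether any rows are still buffered after the loop ends.
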
Code example #2
File: test_query_mag.py  Project: hmessafi/nesta
 def test_extract_entity_id_returns_string_id(self):
     assert extract_entity_id('http://ma-graph.org/entity/test_id') == 'test_id'
     assert extract_entity_id('http://ma-graph.org/entity/another_id') == 'another_id'
     assert extract_entity_id('http://ma-graph.org/entity/grid.011.5') == 'grid.011.5'
Code example #3
File: test_query_mag.py  Project: hmessafi/nesta
 def test_extract_entity_id_raises_value_error_when_not_found(self):
     with pytest.raises(ValueError):
         extract_entity_id('bad_url')
Code example #4
File: test_query_mag.py  Project: hmessafi/nesta
 def test_extract_entity_id_returns_integer_id(self):
     assert extract_entity_id('http://ma-graph.org/entity/109214941') == 109214941
     assert extract_entity_id('http://ma-graph.org/entity/19694890') == 19694890
     assert extract_entity_id('http://ma-graph.org/entity/13203339') == 13203339
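
Taken together, code examples #2-#4 pin down the contract of `extract_entity_id`: it pulls the trailing identifier out of a `http://ma-graph.org/entity/...` URL, returns it as an int when it is purely numeric and as a string otherwise, and raises `ValueError` when no id can be found. A minimal sketch that satisfies these tests follows; the regex and the `isdigit` check are assumptions, and the actual implementation in the nesta project may differ.

import re

def extract_entity_id(entity_url):
    """Return the id from the end of a ma-graph.org entity url.

    Numeric ids (e.g. MAG paper ids) come back as ints; anything else
    (e.g. 'grid.011.5') comes back as a string. Raises ValueError if
    the url does not contain an id after '/entity/'.
    """
    match = re.match(r'.+/entity/(.+)$', entity_url)
    if match is None:
        raise ValueError(f"Unable to extract id from {entity_url}")
    entity_id = match.group(1)
    return int(entity_id) if entity_id.isdigit() else entity_id

Note that passing a non-string (such as an already-extracted integer id) makes `re.match` raise `TypeError`, which is the behaviour the `except TypeError` around the `mag_id` extraction in code example #1 relies on.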