Example 1
    def test_query_articles_by_doi_returns_no_results_when_doi_not_found(self,
                                                                         mocked_lev_distance,
                                                                         mocked_batch):
        missing_articles = [{'id': 1, 'doi': '1.1/1234', 'title': 'title_a'},
                            {'id': 2, 'doi': 'bad_doi', 'title': 'title_b'}]

        results_batch = [{'paperTitle': 'title_aaa', 'doi': '1.1/1234'},
                         {'paperTitle': 'title_aa', 'doi': '1.1/1234'}]

        mocked_batch.return_value = [(missing_articles, results_batch)]
        mocked_lev_distance.side_effect = [2, 1]
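        # i.e. one (articles, results) batch, and one Levenshtein distance per
        # candidate title, consumed in results_batch order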

        result = list(query_articles_by_doi(missing_articles))
        assert result == [{'paperTitle': 'title_aa', 'score': 1, 'id': 1, 'doi': '1.1/1234'}]
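
Both tests rely on two mock.patch decorators injecting mocked_lev_distance and mocked_batch, but the decorators themselves are not shown. A minimal sketch of the assumed scaffolding, with hypothetical patch targets rather than paths taken from the source:

    from unittest import mock

    # Decorators apply bottom-up, so the bottom patch supplies the first mock
    # argument after self; both targets below are hypothetical placeholders.
    @mock.patch('mag_module.batched_doi_query')      # -> mocked_batch
    @mock.patch('mag_module.levenshtein_distance')   # -> mocked_lev_distance
    class TestQueryArticlesByDoi:
        ...  # the test methods in these examples live here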
Example 2
    def test_query_articles_by_doi_returns_closest_match(self,
                                                         mocked_lev_distance,
                                                         mocked_batch):
        missing_articles = [{'id': 1, 'doi': '1.1/1234', 'title': 'title_a'},
                            {'id': 2, 'doi': '2.2/4321', 'title': 'title_b'}]

        results_batch = [{'paperTitle': 'title_aaa', 'doi': '1.1/1234'},
                         {'paperTitle': 'title_aa', 'doi': '1.1/1234'},
                         {'paperTitle': 'title_b', 'doi': '2.2/4321'},
                         {'paperTitle': 'title_c', 'doi': '2.2/4321'}]

        mocked_batch.return_value = [(missing_articles, results_batch)]
        mocked_lev_distance.side_effect = [2, 1, 0, 1]
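        # four candidate titles -> four distances; the closest title per doi
        # should win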

        result = list(query_articles_by_doi(missing_articles))
        assert result == [{'paperTitle': 'title_aa', 'score': 1, 'id': 1, 'doi': '1.1/1234'},
                          {'paperTitle': 'title_b', 'score': 0, 'id': 2, 'doi': '2.2/4321'}]
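
Read together, the two tests pin down what query_articles_by_doi must do: fetch MAG results in (articles, results) batches, score each candidate paperTitle against the article title with a Levenshtein distance, and yield only the closest match per article. A rough sketch consistent with both tests; batched_doi_query and levenshtein_distance are assumed names standing in for the helpers the tests mock:

    def query_articles_by_doi(articles):
        """Yield, for each article, the MAG result whose paperTitle is the
        closest Levenshtein match among results sharing the article's doi."""
        for article_batch, results in batched_doi_query(articles):
            for article in article_batch:
                best = None
                for result in results:
                    if result['doi'] != article['doi']:
                        continue  # candidate belongs to a different article
                    score = levenshtein_distance(article['title'],
                                                 result['paperTitle'])
                    if best is None or score < best['score']:
                        best = dict(result, score=score, id=article['id'])
                if best is not None:  # a doi with no candidates yields nothing
                    yield best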
Example 3
    def run(self):
        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        Base.metadata.create_all(self.engine)

        with db_session(self.engine) as session:
            field_mapping = {
                'paper': 'mag_id',
                'paperTitle': 'title',
                'fieldsOfStudy': 'fields_of_study',
                'citationCount': 'citation_count'
            }

            logging.info(
                "Querying database for articles with a doi but missing mag "
                "authors or fields of study"
            )
            articles_to_process = [
                dict(id=a.id, doi=a.doi, title=a.title)
                for a in (session.query(Article).filter(
                    (Article.mag_authors.is_(None)
                     | ~Article.fields_of_study.any())
                    & Article.doi.isnot(None)).all())
            ]
            total_articles_to_process = len(articles_to_process)
            logging.info(f"{total_articles_to_process} articles to process")

            all_articles_to_update = BatchWriter(self.insert_batch_size,
                                                 update_existing_articles,
                                                 self.engine)
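            # (assumed behaviour: BatchWriter buffers rows and flushes them via
            # update_existing_articles each time insert_batch_size is reached;
            # any remainder is written explicitly after the loop)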

            for count, row in enumerate(
                    query_articles_by_doi(articles_to_process), start=1):
                # renaming and reformatting
                for api_field, db_field in field_mapping.items():
                    try:
                        row[db_field] = row.pop(api_field)
                    except KeyError:
                        pass

                if row.get('citation_count') is not None:
                    row['citation_count_updated'] = date.today()

                # reformat fos_ids out of entity urls
                try:
                    fos = row.pop('fields_of_study')
                    row['fields_of_study'] = {
                        extract_entity_id(f)
                        for f in fos.split(',')
                    }
                except KeyError:
                    # missing fields of study
                    row['fields_of_study'] = []
                except (AttributeError, TypeError):
                    # either can occur when the same doi appears on two
                    # articles in the same batch, as fields_of_study has
                    # already been converted to a set of ids
                    logging.debug("Already processed")
                    row['fields_of_study'] = fos

                # reformat mag_id out of entity url
                try:
                    row['mag_id'] = extract_entity_id(row['mag_id'])
                except TypeError:
                    # id has already been extracted
                    pass

                # query for author and affiliation details
                try:
                    author_ids = {
                        extract_entity_id(a)
                        for a in row.pop('authors').split(',')
                    }
                    row['mag_authors'] = list(query_authors(author_ids))
                except KeyError:
                    pass

                # drop unnecessary fields
                for f in ['score', 'title']:
                    try:
                        del row[f]
                    except KeyError:
                        pass

                # check fields of study exist in the database
                logging.debug('Checking fields of study exist in db')
                found_fos_ids = {
                    fos.id
                    for fos in (session.query(FieldOfStudy).filter(
                        FieldOfStudy.id.in_(row['fields_of_study'])).all())
                }

                missing_fos_ids = row['fields_of_study'] - found_fos_ids
                if missing_fos_ids:
                    logging.info(
                        f"Missing field of study ids: {missing_fos_ids}")
                    fos_not_found = update_field_of_study_ids_sparql(
                        self.engine, missing_fos_ids)
                    # any fos not found in mag are removed to prevent foreign key
                    # constraint errors when building the link table
                    for fos in fos_not_found:
                        row['fields_of_study'].remove(fos)

                # add this row to the queue
                logging.debug(row)
                all_articles_to_update.append(row)

                if not count % 1000:
                    logging.info(
                        f"{count} done. {total_arxiv_ids_to_process - count} articles left to process"
                    )
                if self.test and count == 150:
                    logging.warning("Exiting after 150 rows in test mode")
                    break

            # write any rows left over in the final partial batch
            if all_articles_to_update:
                all_articles_to_update.write()

        # mark as done
        logging.warning("Task complete")
        self.output().touch()
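
run() also leans on extract_entity_id, which is not shown here; the TypeError handling above ("id has already been extracted") suggests it only accepts strings. One plausible shape, offered as an assumption rather than the repo's actual implementation:

    import re

    def extract_entity_id(entity_url):
        """Pull the numeric id off the tail of a MAG entity url, e.g.
        '.../detail/2100278116' -> 2100278116. An already-extracted int
        makes re.search raise TypeError, which the caller catches."""
        return int(re.search(r'(\d+)$', entity_url).group(1))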