Exemple #1
0
    def test_batched_titles_generates_title_id_lookup(
            self, mocked_split_batches, mocked_prepare_title,
            mocked_articles):
        """Indexing the batcher by a prepared title returns the matching ids.

        Six articles are served from the mocked session in a single batch
        and the stubbed title preparation collapses them onto three distinct
        titles. After the batcher has been fully iterated, each prepared
        title must index to the ids of the articles that produced it.
        """
        # stubbed clean titles, in the order articles 1..6 are processed
        mocked_prepare_title.side_effect = ('clean title A', 'clean title B',
                                            'clean title B', 'clean title C',
                                            'clean title A', 'clean title B')

        # one batch containing every id
        mocked_split_batches.return_value = iter([[1, 2, 3, 4, 5, 6]])

        article_rows = [{'id': article_id, 'title': 'dummy_title'}
                        for article_id in range(1, 7)]
        query_results = [mocked_articles(article_rows)]

        mocked_session = mock.Mock()
        mocked_session.query().filter().all.side_effect = query_results

        expected_lookup = defaultdict(list, {
            'clean title A': [1, 5],
            'clean title B': [2, 3, 6],
            'clean title C': [4],
        })

        batcher = BatchedTitles([1, 2, 3, 4, 5, 6],
                                batch_size=3,
                                session=mocked_session)
        # the lookup is only inspected after the batcher has been exhausted
        list(batcher)
        for prepared_title, article_ids in expected_lookup.items():
            assert article_ids == batcher[prepared_title]
Exemple #2
0
    def test_batched_titles_returns_all_prepared_titles(
            self, mocked_split_batches, mocked_prepare_title,
            mocked_articles):
        """Iterating the batcher yields every prepared title across batches.

        Two batches of three articles each are served by the mocked session;
        the sorted output of the batcher must contain all six stubbed
        prepared titles.
        """
        prepared_titles = ('prepared title A',
                           'prepared title B',
                           'prepared title C',
                           'prepared title D',
                           'prepared title E',
                           'prepared title F')
        mocked_prepare_title.side_effect = prepared_titles

        # an iterator stands in for the generator the real splitter returns
        mocked_split_batches.return_value = iter([[1, 2, 3], [4, 5, 6]])

        first_batch = mocked_articles([
            {'id': 1, 'title': 'title A'},
            {'id': 2, 'title': 'title B'},
            {'id': 3, 'title': 'title C'},
        ])
        second_batch = mocked_articles([
            {'id': 4, 'title': 'title D'},
            {'id': 5, 'title': 'title E'},
            {'id': 6, 'title': 'title F'},
        ])

        mocked_session = mock.Mock()
        # one query result per batch, consumed in order
        mocked_session.query().filter().all.side_effect = [first_batch,
                                                           second_batch]

        batcher = BatchedTitles([1, 2, 3, 4, 5, 6],
                                batch_size=3,
                                session=mocked_session)
        result = list(batcher)
        result.sort()
        assert result == list(prepared_titles)
Exemple #3
0
    def test_batched_titles_calls_split_batches_correctly(
            self, mocked_split_batches, mocked_prepare_title,
            mocked_articles):
        """The batcher hands its ids and batch size to split_batches once.

        The content returned by the mocks is irrelevant here; only the
        calls recorded on the patched splitter are checked.
        """
        mocked_prepare_title.return_value = 'clean title A'
        mocked_split_batches.return_value = iter([[1, 2, 3, 4, 5, 6]])

        dummy_articles = mocked_articles([{'id': 1, 'title': 'dummy_title'}])
        mocked_session = mock.Mock()
        mocked_session.query().filter().all.return_value = dummy_articles

        batcher = BatchedTitles([1, 2, 3, 4],
                                batch_size=2,
                                session=mocked_session)
        list(batcher)

        # ids first, batch size second, both positional
        expected_calls = [mock.call([1, 2, 3, 4], 2)]
        assert mocked_split_batches.mock_calls == expected_calls
Exemple #4
0
    def run(self):
        """Collect MAG data for articles that have no fields of study.

        Reads the MAG subscription key from the config at
        ``self.mag_config_path``, selects the ids of every ``Article`` with
        no fields of study, and queries the MAG API by prepared title in
        batches. Returned entities are deduplicated, their fields renamed,
        and one update row per matching article id is queued in a
        ``BatchWriter``. Field of study ids referenced by a batch are
        looked up in the ``FieldOfStudy`` table and any that are absent are
        passed to ``update_field_of_study_ids``.

        When ``self.test`` is true the 'dev' database is used and the loop
        stops after two batches. The task output is touched at the end.
        """
        pp = pprint.PrettyPrinter(indent=4, width=100)
        mag_config = misctools.get_config(self.mag_config_path, 'mag')
        mag_subscription_key = mag_config['subscription_key']

        # database setup
        database = 'dev' if self.test else 'production'
        logging.warning(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        Base.metadata.create_all(self.engine)

        with db_session(self.engine) as session:
            # attributes requested from MAG for each paper
            paper_fields = [
                "Id", "Ti", "F.FId", "CC", "AA.AuN", "AA.AuId", "AA.AfN",
                "AA.AfId", "AA.S"
            ]

            # MAG author attribute code -> key used in the update rows
            author_mapping = {
                'AuN': 'author_name',
                'AuId': 'author_id',
                'AfN': 'author_affiliation',
                'AfId': 'author_affiliation_id',
                'S': 'author_order'
            }

            # MAG entity attribute code -> key used in the update rows
            field_mapping = {
                'Id': 'mag_id',
                'Ti': 'title',
                'F': 'fields_of_study',
                'AA': 'mag_authors',
                'CC': 'citation_count',
                'logprob': 'mag_match_prob'
            }

            logging.info(
                "Querying database for articles without fields of study")
            arxiv_ids_to_process = {
                a.id
                for a in (session.query(Article).filter(
                    ~Article.fields_of_study.any()).all())
            }
            total_arxiv_ids_to_process = len(arxiv_ids_to_process)
            logging.info(f"{total_arxiv_ids_to_process} articles to process")

            all_articles_to_update = BatchWriter(self.insert_batch_size,
                                                 update_existing_articles,
                                                 self.engine)

            batched_titles = BatchedTitles(arxiv_ids_to_process, 10000,
                                           session)

            for count, expr in enumerate(build_expr(batched_titles, 'Ti'), 1):
                logging.debug(pp.pformat(expr))
                # the number of comma separated parts of the expression is
                # used as the number of titles in this query
                expr_length = len(expr.split(','))
                logging.info(f"Querying MAG for {expr_length} titles")
                # NOTE(review): this subtracts a count of titles from a count
                # of article ids, so the "left to process" figure is only
                # approximate when several articles share a title - confirm
                total_arxiv_ids_to_process -= expr_length
                batch_data = query_mag_api(expr, paper_fields,
                                           mag_subscription_key)
                logging.debug(pp.pformat(batch_data))

                returned_entities = batch_data['entities']
                logging.info(
                    f"{len(returned_entities)} entities returned from MAG (potentially including duplicates)"
                )

                # dedupe response keeping the entity with the highest logprob
                deduped_mag_ids = dedupe_entities(returned_entities)
                logging.info(
                    f"{len(deduped_mag_ids)} entities after deduplication")

                missing_articles = expr_length - len(deduped_mag_ids)
                if missing_articles != 0:
                    logging.info(f"{missing_articles} titles not found in MAG")

                batch_article_data = []

                for row in returned_entities:
                    # exclude duplicate titles
                    if row['Id'] not in deduped_mag_ids:
                        continue

                    # renaming and reformatting; attributes absent from the
                    # entity are simply left out
                    for code, description in field_mapping.items():
                        try:
                            row[description] = row.pop(code)
                        except KeyError:
                            pass

                    for author in row.get('mag_authors', []):
                        for code, description in author_mapping.items():
                            try:
                                author[description] = author.pop(code)
                            except KeyError:
                                pass

                    if row.get('citation_count', None) is not None:
                        row['citation_count_updated'] = date.today()

                    # reformat fos_ids out of dictionaries; a KeyError means
                    # the entity had no fields of study or an entry had no
                    # 'FId', and the row then gets an empty collection
                    try:
                        row['fields_of_study'] = {
                            f['FId']
                            for f in row.pop('fields_of_study')
                        }
                    except KeyError:
                        row['fields_of_study'] = []

                    # get list of ids which share the same title
                    try:
                        matching_articles = batched_titles[row['title']]
                    except KeyError:
                        logging.warning(
                            f"Returned title not found in original data: {row['title']}"
                        )
                        continue

                    # drop unnecessary fields
                    for f in ['prob', 'title']:
                        del row[f]

                    # add each matching article for this title to the batch
                    for article_id in matching_articles:
                        batch_article_data.append({**row, 'id': article_id})

                # check fields of study are in database; only the ids
                # referenced by rows queued in this batch are considered
                batch_field_of_study_ids = {
                    fos_id
                    for article in batch_article_data
                    for fos_id in article['fields_of_study']
                }
                logging.debug('Checking fields of study exist in db')
                found_fos_ids = {
                    fos.id
                    for fos in (session.query(FieldOfStudy).filter(
                        FieldOfStudy.id.in_(batch_field_of_study_ids)).all())
                }

                missing_fos_ids = batch_field_of_study_ids - found_fos_ids
                if missing_fos_ids:
                    #  query mag for details if not found
                    update_field_of_study_ids(mag_subscription_key, session,
                                              missing_fos_ids)

                # add this batch to the queue
                all_articles_to_update.extend(batch_article_data)

                logging.info(
                    f"Batch {count} done. {total_arxiv_ids_to_process} articles left to process"
                )
                if self.test and count == 2:
                    logging.warning("Exiting after 2 batches in test mode")
                    break

            # pick up any left over in the batch
            if all_articles_to_update:
                all_articles_to_update.write()

        # mark as done
        logging.warning("Task complete")
        self.output().touch()