Esempio n. 1
0
    def test_multinational_flag_is_set_correctly(self):
        """Check is_multinational for one institute id versus three."""
        matching_score = np.float64(1.21111)

        # a single institute id: the flag on the first link must be False
        single = create_article_institute_links(2, ['a'], matching_score)
        assert single[0]['is_multinational'] is False

        # three institute ids: the flag on the first link must be True
        several = create_article_institute_links(2, ['a', 'b', 'c'],
                                                 matching_score)
        assert several[0]['is_multinational'] is True
Esempio n. 2
0
    def test_multiple_results_are_returned_for_multinationals(self):
        """Three institute ids should produce three link records."""
        institute_ids = ['a', 'b', 'c']

        links = create_article_institute_links(2, institute_ids,
                                               np.float64(1.31111))

        # one link is expected per institute id supplied
        assert len(links) == len(institute_ids)
Esempio n. 3
0
    def test_data_is_returned_in_correct_format(self):
        """Check the exact keys and values of a single generated link."""
        links = create_article_institute_links(1, ['a'], np.float64(1.11111))

        assert links == [
            {
                'article_id': 1,
                'institute_id': 'a',
                'is_multinational': False,
                'matching_score': 1.11111,
            },
        ]
Esempio n. 4
0
    def test_create_article_institute_links_converts_score_to_float(self):
        """The numpy score passed in must come back as a builtin float."""
        links = create_article_institute_links(1, ['a'], np.float64(1.11111))
        returned_score = links[0]['matching_score']

        # Exact type comparison on purpose: np.float64 is a subclass of float,
        # so an isinstance check would pass even without any conversion.
        assert type(returned_score) is float
Esempio n. 5
0
    def run(self):
        """Link articles to GRID institutes using their MAG author data.

        Articles are selected when no match has been attempted yet, they have
        no institute links, and ``mag_authors`` is not null. For every author
        of every such article the following strategies are tried in order,
        stopping at the first that succeeds:

        1. the author's ``affiliation_grid_id``, if it is a known institute id
           that has not already been linked to this article (score 1);
        2. an exact lookup of ``author_affiliation`` in the institute name
           lookup (score 1);
        3. a fuzzy match of ``author_affiliation`` against the lookup's names
           (score as returned by the fuzzer).

        Links are queued on one batch writer and an
        ``institute_match_attempted=True`` update for the article is queued on
        another. When ``self.test`` is truthy the 'dev' database is used and
        processing stops after 50 articles. Finally the task output is
        touched to mark completion.
        """
        # database setup
        database = 'dev' if self.test else 'production'
        logging.info(f"Using {database} database")
        self.engine = get_mysql_engine(self.db_config_env, 'mysqldb', database)
        Base.metadata.create_all(self.engine)

        # one batcher for new article/institute links, one for flagging the
        # articles as having had a match attempted
        article_institute_batcher = BatchWriter(self.insert_batch_size,
                                                add_article_institutes,
                                                self.engine)
        match_attempted_batcher = BatchWriter(self.insert_batch_size,
                                              update_existing_articles,
                                              self.engine)

        # store_history=True: the success/failure collections are read for
        # the summary log lines at the end of this method
        fuzzer = ComboFuzzer([fuzz.token_sort_ratio, fuzz.partial_ratio],
                             store_history=True)

        # extract lookup of GRID institute names to ids - seems to be OK to hold in memory
        institute_name_id_lookup = grid_name_lookup(self.engine)

        with db_session(self.engine) as session:
            # used to check GRID ids from MAG are valid (they are not all...)
            all_grid_ids = {i.id for i in session.query(Institute.id).all()}
            logging.info(f"{len(all_grid_ids)} institutes in GRID")

            # only articles with authors, no links and no previous attempt
            article_query = (session.query(
                Article.id, Article.mag_authors).filter(
                    Article.institute_match_attempted.is_(False)
                    & ~Article.institutes.any()
                    & Article.mag_authors.isnot(None)))
            total = article_query.count()
            logging.info(
                f"Total articles with authors and no institutes links: {total}"
            )

            logging.debug("Starting the matching process")
            # materialise the rows before the session block ends
            articles = article_query.all()

        for count, article in enumerate(articles, start=1):
            article_institute_links = []
            for author in article.mag_authors:
                # prevent duplicates when a mixture of institute aliases are used in the same article
                existing_article_institute_ids = {
                    link['institute_id']
                    for link in article_institute_links
                }

                # extract and validate grid_id
                try:
                    extracted_grid_id = author['affiliation_grid_id']
                except KeyError:
                    # no grid id for this author: fall through to name matching
                    pass
                else:
                    # check grid id is valid
                    if (extracted_grid_id in all_grid_ids and extracted_grid_id
                            not in existing_article_institute_ids):
                        links = create_article_institute_links(
                            article_id=article.id,
                            institute_ids=[extracted_grid_id],
                            score=1)
                        article_institute_links.extend(links)
                        logging.debug(f"Used grid_id: {extracted_grid_id}")
                        continue

                # extract author affiliation
                try:
                    affiliation = author['author_affiliation']
                except KeyError:
                    # no grid id or affiliation for this author
                    logging.debug(f"No affiliation found in: {author}")
                    continue

                # look for an exact match on affiliation name
                try:
                    institute_ids = institute_name_id_lookup[affiliation]
                except KeyError:
                    # no exact name match: fall through to fuzzy matching
                    pass
                else:
                    # drop institutes already linked to this article
                    institute_ids = set(
                        institute_ids) - existing_article_institute_ids
                    links = create_article_institute_links(
                        article_id=article.id,
                        institute_ids=institute_ids,
                        score=1)
                    article_institute_links.extend(links)
                    logging.debug(f"Found an exact match for: {affiliation}")
                    continue

                # fuzzy matching
                try:
                    match, score = fuzzer.fuzzy_match_one(
                        affiliation, institute_name_id_lookup.keys())
                except KeyError:
                    # failed fuzzy match
                    logging.debug(f"Failed fuzzy match: {affiliation}")
                else:
                    institute_ids = institute_name_id_lookup[match]
                    # drop institutes already linked to this article
                    institute_ids = set(
                        institute_ids) - existing_article_institute_ids
                    links = create_article_institute_links(
                        article_id=article.id,
                        institute_ids=institute_ids,
                        score=score)
                    article_institute_links.extend(links)
                    logging.debug(
                        f"Found a fuzzy match: {affiliation}  {score}  {match}"
                    )

            # add links for this article to the batch queue
            article_institute_batcher.extend(article_institute_links)
            # mark that matching has been attempted for this article
            match_attempted_batcher.append(
                dict(id=article.id, institute_match_attempted=True))

            # progress report every 100 articles
            if not count % 100:
                logging.info(
                    f"{count} processed articles from {total} : {(count / total) * 100:.1f}%"
                )

            if self.test and count == 50:
                logging.warning("Exiting after 50 articles in test mode")
                logging.debug(article_institute_batcher)
                break

        # pick up any left over in the batches
        if article_institute_batcher:
            article_institute_batcher.write()
        if match_attempted_batcher:
            match_attempted_batcher.write()

        logging.info("All articles processed")
        logging.info(
            f"Total successful fuzzy matches for institute names: {len(fuzzer.successful_fuzzy_matches)}"
        )
        # the ": " separator belongs in the literal text; inside the braces it
        # was parsed as a format spec and the colon never reached the log
        logging.info(
            f"Total failed fuzzy matches for institute names: {len(fuzzer.failed_fuzzy_matches)}"
        )

        # mark as done
        logging.info("Task complete")
        self.output().touch()