Example #1
def test_sources_afids_in_cache_empty():
    # Start from a fresh, empty cache file
    create_cache(drop=True, file=test_cache)
    df = pd.DataFrame(list(product([22900], [2010, 2005])),
                      columns=["source_id", "year"], dtype="int64")
    sa_incache, sa_search = sources_in_cache(df, file=test_cache, afid=True)
    # Nothing is cached yet: every pair needs searching, none are in the cache
    assert_frame_equal(sa_search, df)
    assert_true(sa_incache.empty)
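These test snippets assume the usual imports of the sosia test suite. A minimal sketch of them, for reference; the exact module paths inside sosia are an assumption and may differ between versions:

from itertools import product

import pandas as pd
from nose.tools import assert_equal, assert_true
from pandas.testing import assert_frame_equal

# Internal helpers exercised below; the module paths are assumptions:
# from sosia.cache import cache_insert, create_cache, sources_in_cache
# from sosia.processing import query_year

test_cache = "./sosia_test.sqlite"  # hypothetical path to a scratch cache file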
Example #2
def test_sources_in_cache_full():
    create_cache(drop=True, file=test_cache)
    # Variables
    expected_sources = [22900]
    expected_years = [2010, 2005]
    cols = ["source_id", "year"]
    df = pd.DataFrame(list(product(expected_sources, expected_years)),
                      columns=cols, dtype="int64")
    # Populate cache
    res = query_year(expected_years[0], expected_sources, False, False)
    cache_insert(res, table="sources", file=test_cache)
    sources_ys_incache, sources_ys_search = sources_in_cache(df, file=test_cache)
    # Retrieve from cache
    sources_ys = sources_ys_incache[cols]
    sources_ys_incache, sources_ys_search = sources_in_cache(sources_ys,
                                                             file=test_cache)
    expected_sources = [int(s) for s in expected_sources]
    assert_equal(sources_ys_incache.source_id.tolist(), expected_sources)
    assert_equal(sources_ys_incache.year.tolist(), [expected_years[0]])
    assert_true(sources_ys_search.empty)
Example #3
def test_sources_afids_in_cache_partial():
    create_cache(drop=True, file=test_cache)
    # Variables
    expected_sources = [22900]
    expected_years = [2010, 2005]
    df = pd.DataFrame(list(product(expected_sources, expected_years)),
                      columns=["source_id", "year"], dtype="int64")
    # Before populating the cache, all pairs end up in the search set
    sa_incache, sa_search = sources_in_cache(df, file=test_cache, afid=True)
    # Populate cache
    res = query_year(expected_years[0], expected_sources, False, False, afid=True)
    cache_insert(res, table="sources_afids", file=test_cache)
    # Retrieve from cache
    sa_incache, sa_search = sources_in_cache(df, file=test_cache, afid=True)
    expected_sources = {int(s) for s in expected_sources}
    assert_equal(set(sa_incache.source_id.tolist()), expected_sources)
    assert_equal(set(sa_incache.year.tolist()), {expected_years[0]})
    assert_equal(set(sa_search.source_id.tolist()), expected_sources)
    assert_equal(set(sa_search.year.tolist()), {expected_years[1]})
    # Roughly 182 affiliation-specific rows are expected; allow some tolerance
    expected = range(182-5, 182+5)
    assert_true(len(sa_incache) in expected)
    assert_true(len(sa_incache.afid.drop_duplicates()) in expected)
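Examples #2 and #3 together illustrate the populate-then-retrieve pattern that the library code in Example #4 builds on: split the requested (source_id, year) pairs into cached and missing ones, query Scopus only for the missing pairs, insert the results, and re-read everything from the cache. A condensed sketch of that loop, reusing the helpers above (test_cache is a placeholder path):

incache, search = sources_in_cache(df, file=test_cache)
for year in set(search.year.tolist()):
    # Query Scopus only for the pairs that are not cached yet
    sources = search[search.year == year].source_id.tolist()
    res = query_year(year, sources, False, False)
    cache_insert(res, table="sources", file=test_cache)
# After inserting, every requested pair is served from the cache
incache, search = sources_in_cache(df, file=test_cache)
assert search.empty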
Example #4
def search_group_from_sources(self, stacked, verbose, refresh=False):
    """Define groups of authors based on publications from a set of sources.

    Parameters
    ----------
    self : sosia.Original
        The object of the Scientist to search information for.

    stacked : bool
        Whether to use the cache-backed stacked search (True) or to
        query each source individually (False).

    verbose : bool
        Whether to report on the progress of the process.

    refresh : bool (optional, default=False)
        Whether to refresh cached search files.

    Returns
    -------
    today, then, negative : set
        Set of authors publishing in three periods: During the year of
        treatment, during years to match on, and during years before the
        first publication.
    """
    # Filtering variables
    min_year = self.first_year - self.year_margin
    max_year = self.first_year + self.year_margin
    if self.period:
        _margin_setter = self.publications_period
    else:
        _margin_setter = self.publications
    max_pubs = max(margin_range(len(_margin_setter), self.pub_margin))
    years = list(range(min_year, max_year + 1))
    search_years = [min_year - 1]
    if not self._ignore_first_id:
        search_years.extend(range(min_year, max_year + 1))
    search_sources, _ = zip(*self.search_sources)

    # Verbose variables
    n = len(search_sources)
    text = "Searching authors for search_group in {} sources...".format(n)
    custom_print(text, verbose)
    today = set()
    then = set()
    negative = set()

    if stacked:  # Make use of SQL cache
        # Year provided (select also based on location)
        # Get already cached sources from cache
        sources_ay = DataFrame(list(product(search_sources,
                                            [self.active_year])),
                               columns=["source_id", "year"])
        _, _search = sources_in_cache(sources_ay, refresh=refresh, afid=True)
        res = query_year(self.active_year,
                         _search.source_id.tolist(),
                         refresh,
                         verbose,
                         afid=True)
        cache_insert(res, table="sources_afids")
        sources_ay, _ = sources_in_cache(sources_ay,
                                         refresh=refresh,
                                         afid=True)
        # Authors publishing in provided year and locations
        mask = None
        if self.search_affiliations:
            mask = sources_ay.afid.isin(self.search_affiliations)
        today = flat_set_from_df(sources_ay, "auids", mask)
        # Years before active year
        # Get already cached sources from cache
        sources_ys = DataFrame(list(product(search_sources, search_years)),
                               columns=["source_id", "year"])
        _, sources_ys_search = sources_in_cache(sources_ys, refresh=refresh)
        missing_years = set(sources_ys_search.year.tolist())
        # Add information for any missing years to the cache
        for y in missing_years:
            mask = sources_ys_search.year == y
            _sources_search = sources_ys_search[mask].source_id.tolist()
            res = query_year(y, _sources_search, refresh, verbose)
            cache_insert(res, table="sources")
        # Get full cache
        sources_ys, _ = sources_in_cache(sources_ys, refresh=False)
        # Authors publishing in year(s) of first publication
        if not self._ignore_first_id:
            # Note: pandas >= 1.3 deprecates inclusive=True; newer pandas expects inclusive="both"
            mask = sources_ys.year.between(min_year, max_year, inclusive=True)
            then = flat_set_from_df(sources_ys, "auids", mask)
        # Authors with publications before
        mask = sources_ys.year < min_year
        negative = flat_set_from_df(sources_ys, "auids", mask)
    else:
        auth_count = []
        print_progress(0, n, verbose)
        for i, source_id in enumerate(search_sources):
            info = query_journal(source_id, [self.active_year] + years,
                                 refresh)
            today.update(info[str(self.active_year)])
            if not self._ignore_first_id:
                for y in years:
                    then.update(info[str(y)])
            for y in range(int(min(info.keys())), min_year):
                negative.update(info[str(y)])
            for y in info:
                if int(y) <= self.active_year:
                    auth_count.extend(info[str(y)])
            print_progress(i + 1, n, verbose)
        c = Counter(auth_count)
        negative.update({a for a, npub in c.items() if npub > max_pubs})

    return today, then, negative
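For context, a hedged sketch of how this helper might be driven. The author ID and treatment year follow the example in the sosia documentation; calling the module-level function directly with a sosia.Original instance is an assumption about the internal API:

import sosia

scientist = sosia.Original(55208373700, 2017)  # Scopus author ID, treatment year
scientist.define_search_sources()  # populates self.search_sources
today, then, negative = search_group_from_sources(scientist, stacked=True,
                                                  verbose=True)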