Example 1
    def __init__(self,
                 identifier,
                 year,
                 refresh=False,
                 period=None,
                 eids=None,
                 sql_fname=None):
        """Class to represent a scientist.

        Parameters
        ----------
        identifier : list of int
            List of Scopus Author IDs of the scientist.

        year : str or numeric
            Year for which characteristics should be defined.

        refresh : boolean or int (optional, default=False)
            Whether to refresh cached results (if they exist) or not. If int
            is passed, results will be refreshed if they are older than
            that value in number of days.

        eids : list (optional, default=None)
            A list of Scopus EIDs of the publications of the scientist.  If
            it is provided, the scientist's properties are set based on these
            publications, instead of the list of publications obtained from
            the Scopus Author ID(s).

        period : int (optional, default=None)
            The number of years, ending in and including the treatment year,
            over which characteristics should additionally be computed.

        sql_fname : str (optional, default=None)
            The path of the SQLite database to connect to.  If None, will use
            the path specified in config.ini.

        Raises
        ------
        Exception
            When there are no publications for the author until the
            provided year.
        """
        self.identifier = identifier
        self.year = int(year)
        if not sql_fname:
            sql_fname = config.get('Filepaths', 'Database')
        self.sql_conn = connect_database(sql_fname)

        # Read mapping of fields to sources
        df, names = read_fields_sources_list()
        self.field_source = df
        self.source_names = names.set_index("source_id")["title"].to_dict()

        # Load list of publications
        if eids:
            q = f"EID({' OR '.join(eids)})"
        else:
            q = f"AU-ID({') OR AU-ID('.join([str(i) for i in identifier])})"
        integrity_fields = ["eid", "author_ids", "coverDate", "source_id"]
        res = base_query("docs", q, refresh, fields=integrity_fields)
        self._publications = [p for p in res if int(p.coverDate[:4]) <= year]
        if not self._publications:
            text = "No publications found for author "\
                   f"{'-'.join([str(i) for i in identifier])} until {year}"
            raise Exception(text)
        self._eids = eids or [p.eid for p in self._publications]

        # First year of publication
        pub_years = [p.coverDate[:4] for p in self._publications]
        self._first_year = int(min(pub_years))
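        # Start year of the comparison period; becomes 0 (i.e., no period)
        # when period is None or when the period would begin before the
        # scientist's first publication year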
        self._period_year = self.year - (period or (self.year + 1)) + 1
        if self._period_year < self._first_year:
            self._period_year = 0

        # Count of citations
        search_ids = eids or identifier
        self._citations = count_citations(search_ids, self.year + 1,
                                          identifier)

        # Coauthors
        self._coauthors = set(extract_authors(
            self._publications)) - set(identifier)

        # Period counts are set to None if no valid period is given
        if self._period_year:
            pubs = [
                p for p in self._publications
                if self._period_year <= int(p.coverDate[:4]) <= year
            ]
            self._publications_period = pubs
            if not self._publications_period:
                text = "No publications found for author "\
                       f"{'-'.join([str(i) for i in identifier])} between "\
                       f"{self._period_year} and {year}"
                raise Exception(text)
            eids_period = [p.eid for p in self._publications_period]
            n_cits = count_citations(eids_period, self.year + 1, identifier)
            self._citations_period = n_cits
            self._coauthors_period = set(
                extract_authors(self._publications_period))
            self._coauthors_period -= set(identifier)
        else:
            self._coauthors_period = None
            self._publications_period = None
            self._citations_period = None

        # Author search information
        source_ids = {int(p.source_id) for p in self._publications
                      if p.source_id}
        self._sources = add_source_names(source_ids, self.source_names)
        self._active_year = int(max(pub_years))
        mask = df["source_id"].isin(source_ids)
        self._fields = df[mask]["asjc"].astype(int).tolist()
        self._main_field = get_main_field(self._fields)
        if not self._main_field[0]:
            text = "Not possible to determine research field(s) of "\
                   "researcher.  Functionality is reduced."
            warn(text, UserWarning)

        # Most recent geolocation
        afid = find_main_affiliation(identifier, self._publications, year)
        self._affiliation_id = afid
        try:
            aff = AffiliationRetrieval(afid, refresh=refresh)
            self._affiliation_country = aff.country
            self._affiliation_name = aff.affiliation_name
            self._affiliation_type = aff.org_type
        except (Scopus404Error, ValueError):
            self._affiliation_country = None
            self._affiliation_name = None
            self._affiliation_type = None
        self._language = None

        # Author name from profile with most documents
        df = get_authors(self.identifier,
                         self.sql_conn,
                         refresh=refresh,
                         verbose=False)
        au = df.sort_values("documents", ascending=False).iloc[0]
        self._subjects = [a.split(" ")[0] for a in au.areas.split("; ")]
        self._surname = au.surname or None
        self._first_name = au.givenname or None
        name = ", ".join([self._surname or "", au.givenname or ""])
        if name == ", ":
            name = None
        self._name = name
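
A minimal usage sketch for the class above, assuming it is the Scientist
class of the surrounding package (Example 3 below instantiates Scientist
directly); the import path and the sample inputs are illustrative only:

from sosia import Scientist  # import path assumed

# Characterize the author as of 2017; period=3 additionally computes
# characteristics over the three years ending in the treatment year
sci = Scientist([53164702100], 2017, period=3)
print(sci._first_year)         # first year with a publication
print(len(sci._publications))  # publications up to and including 2017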
Example 2
def test_base_query_author():
    auth_id = 53164702100
    query = "AU-ID({})".format(auth_id)
    size = base_query("author", query, size_only=True)
    assert_equal(size, 1)
Example 3
    def find_matches(self,
                     stacked=False,
                     verbose=False,
                     stop_words=STOPWORDS,
                     information=True,
                     refresh=False,
                     **tfidf_kwds):
        """Find matches within search_group based on four criteria:
        1. Started publishing in about the same year
        2. Has about the same number of publications in the year of treatment
        3. Has about the same number of coauthors in the year of treatment
        4. Has about the same number of citations in the year of treatment
        5. Works in the same field as the scientist's main field

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or you want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        stop_words : list (optional, default=STOPWORDS)
            A list of words that should be filtered out in the analysis of
            abstracts.  The default list is the list of English stopwords
            from nltk, augmented with numbers and punctuation.

        information : bool or iterable (optional, default=True)
            Whether to return additional information on the matches that may
            help in the selection process.  If an iterable of keywords is
            provided, only return information for these keywords.  Allowed
            values are "first_name", "surname", "first_year",
            "num_coauthors", "num_publications", "num_citations",
            "num_coauthors_period", "num_publications_period",
            "num_citations_period", "subjects", "country", "affiliation_id",
            "affiliation", "language", "reference_sim", "abstract_sim".

        refresh : bool (optional, default=False)
            Whether to refresh cached search files.

        tfidf_kwds : keywords
            Parameters to pass to TfidfVectorizer from the sklearn package
            for abstract vectorization.  Not used when `information=False`
            or when "abstract_sim" is not in `information`.  See
            https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            for possible values.

        Returns
        -------
        matches : list
            A list of Scopus IDs of scientists matching all the criteria (if
            information is False) or a list of namedtuples with the Scopus ID
            and additional information (if information is True).

        Raises
        ------
        ValueError
            If information is not bool and contains invalid keywords.
        """
        # Checks
        info_keys = [
            "first_name", "surname", "first_year", "num_coauthors",
            "num_publications", "num_citations", "num_coauthors_period",
            "num_publications_period", "num_citations_period", "subjects",
            "country", "affiliation_id", "affiliation", "language",
            "reference_sim", "abstract_sim"
        ]
        if isinstance(information, bool):
            if information:
                keywords = info_keys
            elif self.search_affiliations:
                information = True
                keywords = ["affiliation_id"]
            else:
                keywords = None
        else:
            keywords = information
            invalid = [x for x in keywords if x not in info_keys]
            if invalid:
                text = ("Parameter information contains invalid keywords: ",
                        ", ".join(invalid))
                raise ValueError(text)
            if self.search_affiliations and "affiliation_id" not in keywords:
                keywords.append("affiliation_id")
        # Variables
        _years = range(self.first_year - self.year_margin,
                       self.first_year + self.year_margin + 1)
        if self.period:
            _npapers = margin_range(len(self.publications_period),
                                    self.pub_margin)
            _ncits = margin_range(self.citations_period, self.cits_margin)
            _ncoauth = margin_range(len(self.coauthors_period),
                                    self.coauth_margin)
            _npapers_full = margin_range(len(self.publications),
                                         self.pub_margin)
            _ncits_full = margin_range(self.citations, self.cits_margin)
            _ncoauth_full = margin_range(len(self.coauthors),
                                         self.coauth_margin)
        else:
            _npapers = margin_range(len(self.publications), self.pub_margin)
            _ncits = margin_range(self.citations, self.cits_margin)
            _ncoauth = margin_range(len(self.coauthors), self.coauth_margin)
        n = len(self.search_group)
        text = "Searching through characteristics of {:,} authors".format(n)
        custom_print(text, verbose)

        # First round of filtering: minimum publications and main field
        authors = query_author_data(self.search_group, verbose=verbose)
        same_field = (authors.areas.str.startswith(self.main_field[1]))
        enough_pubs = (authors.documents.astype(int) >= int(min(_npapers)))
        group = authors[same_field & enough_pubs]["auth_id"].tolist()
        group.sort()
        n = len(group)
        text = "Left with {} authors\nFiltering based on provided "\
               "conditions...".format(n)
        custom_print(text, verbose)

        # Second round of filtering: drop authors with publications before
        # the minimum year, and check that the publication count in the
        # relevant period is within range
        params = {
            "group": group,
            "ybefore": min(_years) - 1,
            "yupto": self.year,
            "npapers": _npapers,
            "yfrom": self.year_period,
            "verbose": verbose
        }
        group, _, _ = filter_pub_counts(**params)
        # Also screen out ids with too many publications over the full period
        if self.period:
            params.update({
                "npapers": [1, max(_npapers_full)],
                "yfrom": None,
                "group": group
            })
            group, _, _ = filter_pub_counts(**params)

        # Third round of filtering: citations (in the FULL period).
        authors = pd.DataFrame({"auth_id": group, "year": self.year})
        _, authors_cits_search = author_cits_in_cache(authors)
        text = "Search and filter based on count of citations\n{} to search "\
               "out of {}\n".format(len(authors_cits_search), len(group))
        custom_print(text, verbose)
        if not authors_cits_search.empty:
            authors_cits_search['n_cits'] = 0
            print_progress(0, len(authors_cits_search), verbose)
            for i, au in authors_cits_search.iterrows():
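                # Count documents that cite the author's works and were
                # published before the treatment year + 1, excluding the
                # author's own documents (i.e., self-citations)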
                q = "REF({}) AND PUBYEAR BEF {} AND NOT AU-ID({})".format(
                    au['auth_id'], self.year + 1, au['auth_id'])
                n = base_query("docs", q, size_only=True)
                authors_cits_search.at[i, 'n_cits'] = n
                print_progress(i + 1, len(authors_cits_search), verbose)
            cache_insert(authors_cits_search, table="author_cits_size")
        auth_cits_incache, _ = author_cits_in_cache(
            authors[["auth_id", "year"]])
        # Keep authors whose citation count is within range
        if self.period:
            mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                    (auth_cits_incache.n_cits <= max(_ncits_full)))
        else:
            mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                    (auth_cits_incache.n_cits <= max(_ncits)))
        group = (auth_cits_incache[mask]['auth_id'].tolist())

        # Fourth round of filtering: Download publications, verify coauthors
        # (in the FULL period) and first year.
        n = len(group)
        text = "Left with {} authors\nFiltering based on coauthors "\
               "number...".format(n)
        custom_print(text, verbose)
        authors = pd.DataFrame({"auth_id": group, "year": self.year},
                               dtype="uint64")
        _, author_year_search = author_year_in_cache(authors)
        matches = []
        if stacked:  # Combine searches
            if not author_year_search.empty:
                q = Template(
                    "AU-ID($fill) AND PUBYEAR BEF {}".format(self.year + 1))
                auth_year_group = author_year_search.auth_id.tolist()
                params = {
                    "group": auth_year_group,
                    "res": [],
                    "template": q,
                    "refresh": refresh,
                    "joiner": ") OR AU-ID(",
                    "q_type": "docs"
                }
                if verbose:
                    params.update({"total": len(auth_year_group)})
                res, _ = stacked_query(**params)
                res = build_dict(res, auth_year_group)
                if res:
                    # res can become empty after build_dict if an author ID
                    # is outdated
                    res = pd.DataFrame.from_dict(res, orient="index")
                    res["year"] = self.year
                    res = res[["year", "first_year", "n_pubs", "n_coauth"]]
                    res.index.name = "auth_id"
                    res = res.reset_index()
                    cache_insert(res, table="author_year")
            author_year_cache, _ = author_year_in_cache(authors)
            if self._ignore_first_id:
                # Only the number of coauthors needs to be within range
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = enough & notoomany
            elif self.period:
                # The number of coauthors must be within range and the
                # first year must lie in the window
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = same_start & enough & notoomany
            else:
                # all restrictions apply
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                same_coauths = (author_year_cache.n_coauth.between(
                    min(_ncoauth), max(_ncoauth)))
                mask = same_start & same_coauths
            matches = author_year_cache[mask]["auth_id"].tolist()
        else:  # Query each author individually
            for i, au in enumerate(group):
                print_progress(i + 1, len(group), verbose)
                res = base_query("docs",
                                 "AU-ID({})".format(au),
                                 refresh=refresh)
                res = [
                    p for p in res
                    if p.coverDate and int(p.coverDate[:4]) <= self.year
                ]
                # Filter
                min_year = int(min([p.coverDate[:4] for p in res]))
                authids = [p.author_ids for p in res if p.author_ids]
                authors = set([a for p in authids for a in p.split(";")])
                n_coauth = len(authors) - 1  # Subtract 1 for focal author
                if self._ignore_first_id and (n_coauth < max(_ncoauth)):
                    # Only the coauthor count matters; skip if too small
                    continue
                elif (self.period and ((n_coauth < max(_ncoauth)) or
                                       (min_year not in _years))):
                    # Skip if the coauthor count is too small or the first
                    # year lies outside the window
                    continue
                elif ((len(res) not in _npapers) or (min_year not in _years)
                      or (n_coauth not in _ncoauth)):
                    continue
                matches.append(au)

        if self.period:
            text = "Left with {} authors\nFiltering based on exact period "\
                   "citations and coauthors...".format(len(matches))
            custom_print(text, verbose)
            # Further screen matches based on period cits and coauths
            to_loop = list(matches)  # Copy, as matches is modified in loop
            for m in to_loop:
                q = "AU-ID({})".format(m)
                res = base_query("docs",
                                 "AU-ID({})".format(m),
                                 refresh=refresh,
                                 fields=["eid", "author_ids", "coverDate"])
                pubs = [
                    p for p in res if int(p.coverDate[:4]) <= self.year
                    and int(p.coverDate[:4]) >= self.year_period
                ]
                coauths = set(get_authors(pubs)) - {str(m)}
                if not (min(_ncoauth) <= len(coauths) <= max(_ncoauth)):
                    matches.remove(m)
                    continue
                eids_period = [p.eid for p in pubs]
                cits = count_citations(search_ids=eids_period,
                                       pubyear=self.year + 1,
                                       exclusion_key="AU-ID",
                                       exclusion_ids=[str(m)])
                if not (min(_ncits) <= cits <= max(_ncits)):
                    matches.remove(m)
        text = "Found {:,} author(s) matching all criteria".format(
            len(matches))
        custom_print(text, verbose)

        # Possibly add information to matches
        if keywords and len(matches) > 0:
            custom_print("Providing additional information...", verbose)
            profiles = [
                Scientist([str(a)],
                          self.year,
                          period=self.period,
                          refresh=refresh) for a in matches
            ]
            matches = inform_matches(profiles, self, keywords, stop_words,
                                     verbose, refresh, **tfidf_kwds)
        if self.search_affiliations:
            aff_ids = {str(a) for a in self.search_affiliations}
            matches = [m for m in matches
                       if set(m.affiliation_id.replace(" ", "").split(";"))
                       & aff_ids]
        return matches
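
A short usage sketch grounded in the docstring above; `orig` stands for an
instance of the enclosing class (hypothetical here) whose search_group has
already been defined:

matches = orig.find_matches(stacked=True, verbose=True,
                            information=["first_year", "num_publications",
                                         "abstract_sim"])
for m in matches:
    print(m)  # namedtuples: Scopus ID plus the requested information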
Example 4
def test_base_query():
    auth_id = 53164702100
    q = "AU-ID({}) AND PUBYEAR BEF {}".format(auth_id, 2017)
    size = base_query("docs", q, size_only=True)
    assert_equal(size, 5)
Example 5
    def __init__(self,
                 identifier,
                 year,
                 refresh=False,
                 period=None,
                 eids=None):
        """Class to represent a scientist.

        Parameters
        ----------
        identifier : list of str
            List of Scopus Author IDs of the scientist.

        year : str or numeric
            Year for which characteristics should be defined.

        refresh : boolean (optional, default=False)
            Whether to refresh all cached files or not.

        eids : list (optional, default=None)
            A list of Scopus EIDs of the publications of the scientist.  If
            it is provided, the scientist's properties are set based on these
            publications, instead of the list of publications obtained from
            the Scopus Author ID(s).

        period : int (optional, default=None)
            The number of years, ending in the treatment year, in which to
            consider publications.  If not provided, all publications are
            considered.

        Raises
        ------
        Exception
            When there are no publications for the author until the
            given year.
        """
        self.identifier = identifier
        self.year = int(year)
        self.period = period
        self.year_period = None

        # Read mapping of fields to sources
        df, names = read_fields_sources_list()
        self.field_source = df
        self.source_names = names.set_index("source_id")["title"].to_dict()

        # Load list of publications
        if not eids:
            q = "AU-ID({})".format(") OR AU-ID(".join(identifier))
        else:
            q = "EID({})".format(" OR ".join(eids))
        res = base_query("docs",
                         q,
                         refresh,
                         fields=["eid", "author_ids", "coverDate"])
        self._publications = [p for p in res if int(p.coverDate[:4]) <= year]
        if not self._publications:
            text = "No publications for author {} until year {}".format(
                "-".join(identifier), year)
            raise Exception(text)
        self._eids = eids or [p.eid for p in self._publications]

        # First year of publication; if a period is provided, drop it when
        # it would reach back to or beyond that first year
        pub_years = [p.coverDate[:4] for p in self._publications]
        self._first_year = int(min(pub_years))
        if period and year - period + 1 <= self._first_year:
            self.period = None

        # Count of citations
        search_ids = eids or identifier
        self._citations = count_citations(search_ids=search_ids,
                                          pubyear=self.year + 1,
                                          exclusion_key="AU-ID",
                                          exclusion_ids=identifier)

        # Coauthors
        self._coauthors = set(get_authors(
            self._publications)) - set(identifier)

        # Period counts simply equal the totals if period is None or was
        # reset to None
        if self.period:
            self.year_period = year - period + 1
            pubs = [p for p in self._publications if self.year_period <=
                    int(p.coverDate[:4]) <= year]
            self._publications_period = pubs
            if not self._publications_period:
                text = "No publications for author {} between {} and "\
                       "{}".format("-".join(identifier), self.year_period,
                                   year)
                raise Exception(text)
            eids_period = [p.eid for p in self._publications_period]
            self._citations_period = count_citations(search_ids=eids_period,
                                                     pubyear=self.year + 1,
                                                     exclusion_key="AU-ID",
                                                     exclusion_ids=identifier)
            self._coauthors_period = set(get_authors(
                self._publications_period))
            self._coauthors_period -= set(identifier)
        else:
            self._coauthors_period = self._coauthors
            self._publications_period = self._publications
            self._citations_period = self._citations

        # Author search information
        source_ids = {int(p.source_id) for p in self._publications
                      if p.source_id}
        self._sources = add_source_names(source_ids, self.source_names)
        self._active_year = int(max(pub_years))
        self._fields = df[df["source_id"].isin(source_ids)]["asjc"].tolist()
        self._main_field = get_main_field(self._fields)

        # Most recent geolocation
        ctry, afid, org = find_location(identifier,
                                        self._publications,
                                        year,
                                        refresh=refresh)
        self._country = ctry
        self._affiliation_id = afid
        self._organization = org
        self._language = None

        # Author name from profile with most documents
        au = query_author_data(self.identifier, refresh=refresh, verbose=False)
        au = au.sort_values("documents", ascending=False).iloc[0]
        self._subjects = [a.split(" ")[0] for a in au.areas.split("; ")]
        self._surname = au.surname or None
        if au.givenname:
            self._first_name = au.givenname.replace(".", " ").split(" ")[0]
        else:
            self._first_name = None
        if self._surname and au.givenname:
            self._name = ", ".join([self._surname, au.givenname])
        else:
            self._name = None