Example #1
def inform_match(profile, keywords):
    """Create namedtuple adding information to matches.

    Parameters
    ----------
    profile : sosia.Scientist()
        A Scientist() object representing a match.

    keywords : iterable of strings
        Which information to add to the match.

    Returns
    -------
    match_info : dict
        Information corresponding to provided keywords.
    """
    from pybliometrics.scopus.exception import Scopus404Error
    from sosia.classes import Scientist

    info = {
        "ID": profile.identifier[0],
        "name": profile.name,
        "first_name": profile.first_name,
        "surname": profile.surname,
        "first_year": profile.first_year,
        "num_coauthors": len(profile.coauthors),
        "num_publications": len(profile.publications),
        "num_citations": profile.citations,
        "num_coauthors_period": len(profile.coauthors_period or "") or None,
        "num_publications_period": len(profile.publications_period or "")
        or None,
        "num_citations_period": profile.citations_period,
        "subjects": profile.subjects,
        "country": profile.country,
        "affiliation_id": profile.affiliation_id,
        "affiliation": profile.organization
    }
    keep = set(keywords) | {"ID", "name"}
    match_info = {k: v for k, v in info.items() if k in keep}
    if "language" in keywords:
        try:
            match_info["language"] = profile.get_publication_languages(
            ).language
        except Scopus404Error:  # Refresh profile
            profile = Scientist(profile.identifier, profile.year, refresh=True)
            match_info["language"] = profile.get_publication_languages(
            ).language
    return match_info
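
A hedged usage sketch (the Author ID is borrowed from the tests below; configured Scopus access via pybliometrics is assumed):

from sosia.classes import Scientist

profile = Scientist(["55208373700"], 2017)
info = inform_match(profile, keywords=["first_year", "num_publications"])
# info now maps "ID", "name", "first_year" and "num_publications" to values
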
Example #2
def test_add_source_names():
    s = Scientist(["55208373700"], 2017)
    expected = [(14351, "Brain Research Reviews"),
                (18632, "Progress in Brain Research")]
    ids, _ = zip(*expected)
    received = add_source_names(ids, s.source_names)
    assert_equal(received, expected)
Example #3
def same_affiliation(original, new, refresh=False):
    """Whether a new scientist shares affiliation(s) with the
    original scientist.
    """
    from sosia.classes import Scientist

    period = original.year + 1 - original._period_year
    m = Scientist([new],
                  original.year,
                  period=period,
                  refresh=refresh,
                  sql_fname=original.sql_fname)
    return any(
        str(a) in m.affiliation_id for a in original.search_affiliations)
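
A minimal usage sketch (`original` is a hypothetical object exposing year, _period_year, sql_fname and search_affiliations; Scopus access is assumed):

shares = same_affiliation(original, 55208373700)  # True if any affiliation matches
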
Example #4
# -*- coding: utf-8 -*-
"""Tests for class `Scientist`."""

from collections import namedtuple
from nose.tools import assert_equal, assert_true, assert_false

from sosia.classes import Scientist

refresh = 30
scientist1 = Scientist(["6701809842"], 2001, refresh=refresh)
scientist2 = Scientist(["55208373700", "55208373700"], 2017, refresh=refresh)
eids = ["2-s2.0-84959420483", "2-s2.0-84949113230"]
scientist3 = Scientist(["55208373700"], 2017, eids=eids, refresh=refresh)
scientist4 = Scientist(["55208373700"], 2015, refresh=refresh)
scientist5 = Scientist(["55208373700"], 2018, period=2, refresh=refresh)


def test_affiliation():
    org = 'University of Munich'
    assert_equal(scientist1.organization, org)
    org = 'École Polytechnique Fédérale de Lausanne (EPFL)'
    assert_equal(scientist2.organization, org)
    org = 'Department of Economics and Management of Innovation, '\
          'École Polytechnique Fédérale de Lausanne'
    assert_equal(scientist3.organization, org)
    org = 'École Polytechnique Fédérale de Lausanne, CEMI, CDM MTEI-GE'
    assert_equal(scientist4.organization, org)
    org = 'Max Planck Institute for Innovation and Competition'
    assert_equal(scientist5.organization, org)

Example #5
    def __init__(self, scientist, treatment_year, first_year_margin=2,
                 pub_margin=0.2, cits_margin=0.2, coauth_margin=0.2,
                 affiliations=None, period=None, first_year_search="ID",
                 eids=None, refresh=False, sql_fname=None):
        """Representation of a scientist for whom to find a control scientist.

        Parameters
        ----------
        scientist : str, int or list of str or int
            Scopus Author ID, or list of Scopus Author IDs, of the scientist
            to find a control scientist for.

        treatment_year : str or numeric
            Year of the event.  The control scientist is matched on trends
            and characteristics of the original scientist up to this year.

        first_year_margin : numeric (optional, default=2)
            Number of years by which to extend, in both directions, the
            search for authors publishing around the original scientist's
            year of first publication.

        pub_margin : numeric (optional, default=0.2)
            The left and right margin for the number of publications used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            publications, and the resulting value is rounded up.  If the
            value is an integer, it is interpreted as a fixed number of
            publications.

        cits_margin : numeric (optional, default=0.2)
            The left and right margin for the number of citations used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            citations, and the resulting value is rounded up.  If the value
            is an integer, it is interpreted as a fixed number of citations.

        coauth_margin : numeric (optional, default=0.2)
            The left and right margin for the number of coauthors used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            coauthors, and the resulting value is rounded up.  If the value
            is an integer, it is interpreted as a fixed number of coauthors.

        affiliations : list (optional, default=None)
            A list of Scopus affiliation IDs.  If provided, sosia conditions
            the match procedure on affiliation with these IDs in the
            treatment year.

        period : int (optional, default=None)
            An additional period prior to the treatment year on which to
            match scientists.
            Note: If the value is larger than the publication range, period
            is reset to None.

        first_year_search: str (optional, default="ID")
            How to determine characteristics of possible control scientists
            in the first year of publication.  Mode "ID" uses Scopus Author
            IDs only.  Mode "name" will select relevant profiles based on
            their surname and first name but only when "period" is not None.
            Select this mode to counter potential incompleteness of
            author profiles.

        eids : list (optional, default=None)
            A list of Scopus EIDs of the publications of the scientist to
            find a control for.  If provided, the scientist's properties and
            the control group are based on this list of publications instead
            of the list obtained via the Scopus Author ID.

        refresh : boolean (optional, default=False)
            Whether to refresh cached results (if they exist) or not.  If an
            int is passed, results are refreshed if they are older than that
            number of days.

        sql_fname : str (optional, default=None)
            The path of the SQLite database to connect to.  If None, will use
            the path specified in config.ini.
        """
        # Internal checks
        if not isinstance(first_year_margin, (int, float)):
            raise TypeError("Argument first_year_margin must be float or integer.")
        if not isinstance(pub_margin, (int, float)):
            raise TypeError("Argument pub_margin must be float or integer.")
        if not isinstance(coauth_margin, (int, float)):
            raise TypeError("Argument coauth_margin must be float or integer.")
        if first_year_search not in ("ID", "name"):
            raise ValueError("Argument first_year_search must be either "
                             "'ID' or 'name'.")
        if first_year_search == "name" and not period:
            first_year_search = "ID"
            text = "Argument first_year_search set to ID: Argument period "\
                   "must not be None"
            warn(text)

        # Variables
        if not isinstance(scientist, list):
            scientist = [scientist]
        self.identifier = [int(auth_id) for auth_id in scientist]
        self.treatment_year = int(treatment_year)
        self.first_year_margin = first_year_margin
        self.pub_margin = pub_margin
        self.cits_margin = cits_margin
        self.coauth_margin = coauth_margin
        self.period = period
        self.first_year_name_search = first_year_search == "name"
        self.eids = eids
        if isinstance(affiliations, (int, str)):
            affiliations = [affiliations]
        if affiliations:
            affiliations = [int(a) for a in affiliations]
        self.search_affiliations = affiliations
        self.refresh = refresh
        self.sql_fname = sql_fname

        # Instantiate superclass to load private variables
        Scientist.__init__(self, self.identifier, treatment_year, refresh=refresh,
                           period=period, sql_fname=self.sql_fname)
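
A hedged usage sketch of this constructor (the Author ID is taken from the tests above; configured Scopus access is assumed); the comment illustrates the documented float-margin semantics:

from sosia import Original

original = Original(55208373700, 2017, first_year_margin=2,
                    pub_margin=0.2, cits_margin=0.2, coauth_margin=0.2)
# With pub_margin=0.2 and, say, 10 publications, the margin is
# ceil(0.2 * 10) = 2, so candidates with 8 to 12 publications qualify.
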
Example #6
def inform_matches(self, keywords, verbose, refresh):
    """Add match-specific information to all matches.

    Parameters
    ----------
    self : sosia.Original()
        Object whose matches should receive additional information.

    keywords : iterable of strings
        Which information to add to matches.

    verbose : bool
        Whether to report on the progress of the process and the completeness
        of document information.

    refresh : bool
        Whether to refresh all cached files or not.

    Returns
    -------
    out : list of namedtuples
        A list of namedtuples representing matches.  The information
        provided depends on the requested keywords.
    """
    from sosia.classes import Scientist

    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)

    # Preparation
    doc_parse = "num_cited_refs" in keywords
    if doc_parse:
        focal_docs = parse_docs([d.eid for d in self.publications], refresh)
        focal_refs, focal_refs_n = focal_docs

    # Add selected information match-by-match
    out = []
    completeness = {}
    total = len(self.matches)
    print_progress(0, total, verbose)
    for idx, auth_id in enumerate(self.matches):
        period = self.year + 1 - self._period_year
        p = Scientist([auth_id],
                      self.year,
                      period=period,
                      refresh=refresh,
                      sql_fname=self.sql_fname)
        match_info = inform_match(p, keywords, refresh=refresh)
        # Abstract and reference similarity is performed jointly
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n = parse_docs(eids, refresh)
            completeness[auth_id] = (refs_n, len(eids))
            if "num_cited_refs" in keywords:
                ref_cos = compute_overlap(refs, focal_refs)
                match_info["num_cited_refs"] = ref_cos
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)

    # Finally, report on missing document information
    if verbose and doc_parse:
        for auth_id, doc_meta in completeness.items():
            _print_missing_docs([auth_id], doc_meta[0], doc_meta[1])
        focal_pubs_n = len(self.publications)
        _print_missing_docs(self.identifier,
                            focal_refs_n,
                            focal_pubs_n,
                            res_type="Original")
    return out
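
compute_overlap is sosia-internal; a plausible minimal sketch of such an overlap count (an assumption about the technique, not necessarily sosia's implementation; each argument is treated as a collection of cited-reference IDs):

def compute_overlap(refs, focal_refs):
    """Count references cited by both the match and the focal scientist."""
    if refs is None or focal_refs is None:
        return None
    return len(set(refs) & set(focal_refs))
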
Example #7
    def __init__(self,
                 scientist,
                 year,
                 year_margin=1,
                 pub_margin=0.1,
                 cits_margin=0.1,
                 coauth_margin=0.1,
                 period=None,
                 refresh=False,
                 eids=None,
                 search_affiliations=None):
        """Class to represent a scientist for which we want to find a control
        group.

        Parameters
        ----------
        scientist : str, int, or list of str or int
            Scopus Author ID, or list of Scopus Author IDs, of the scientist
            you want to find control groups for.

        year : str or numeric
            Year of the event.  Control groups will be matched on trends and
            characteristics of the scientist up to this year.

        year_margin : numeric (optional, default=1)
            Number of years by which to extend, in both directions, the
            search for authors publishing around the focal scientist's year
            of first publication.

        pub_margin : numeric (optional, default=0.1)
            The left and right margin for the number of publications used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            publications, and the resulting value is rounded up.  If the
            value is an integer, it is interpreted as a fixed number of
            publications.

        cits_margin : numeric (optional, default=0.1)
            The left and right margin for the number of citations used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            citations, and the resulting value is rounded up.  If the value
            is an integer, it is interpreted as a fixed number of citations.

        coauth_margin : numeric (optional, default=0.1)
            The left and right margin for the number of coauthors used to
            match candidates to the scientist.  If the value is a float, it
            is interpreted as a share of the scientist's number of
            coauthors, and the resulting value is rounded up.  If the value
            is an integer, it is interpreted as a fixed number of coauthors.

        period : int (optional, default=None)
            The period in which to consider publications.  If not provided,
            all publications are considered.

        refresh : boolean (optional, default=False)
            Whether to refresh all cached files or not.

        eids : list (optional, default=None)
            A list of Scopus EIDs of the publications of the scientist to
            find a control for.  If provided, the scientist's properties and
            the control group are based on this list of publications instead
            of the list obtained via the Scopus Author ID.

        search_affiliations : list (optional, default=None)
            A list of Scopus affiliation IDs.  If provided, sosia searches
            for matches within these affiliations in the year provided.
        """
        # Internal checks
        if not isinstance(year_margin, (int, float)):
            raise TypeError("Argument year_margin must be float or integer.")
        if not isinstance(pub_margin, (int, float)):
            raise TypeError("Argument pub_margin must be float or integer.")
        if not isinstance(coauth_margin, (int, float)):
            raise TypeError("Argument coauth_margin must be float or integer.")

        # Variables
        if not isinstance(scientist, list):
            scientist = [scientist]
        self.identifier = [str(auth_id) for auth_id in scientist]
        self.year = int(year)
        self.year_margin = year_margin
        self.pub_margin = pub_margin
        self.cits_margin = cits_margin
        self.coauth_margin = coauth_margin
        self.period = period
        self.eids = eids
        if isinstance(search_affiliations, (int, str)):
            search_affiliations = [search_affiliations]
        if search_affiliations:
            search_affiliations = [int(a) for a in search_affiliations]
        self.search_affiliations = search_affiliations
        self.refresh = refresh

        # Instantiate superclass to load private variables
        Scientist.__init__(self, self.identifier, year, refresh, period)
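
A minimal sketch of this older constructor (assuming it belongs to sosia's Original class), restricting the profile to a subset of publications via eids; the EIDs are borrowed from the tests above:

eids = ["2-s2.0-84959420483", "2-s2.0-84949113230"]
original = Original("55208373700", 2017, eids=eids)
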
Example #8
    def find_matches(self,
                     stacked=False,
                     verbose=False,
                     stop_words=STOPWORDS,
                     information=True,
                     refresh=False,
                     **tfidf_kwds):
        """Find matches within search_group based on four criteria:
        1. Started publishing in about the same year
        2. Has about the same number of publications in the year of treatment
        3. Has about the same number of coauthors in the year of treatment
        4. Has about the same number of citations in the year of treatment
        5. Works in the same field as the scientist's main field

        Parameters
        ----------
        stacked : bool (optional, default=False)
            Whether to combine searches in few queries or not.  Cached
            files will most likely not be reusable.  Set to True if you
            query in distinct fields or want to minimize API key usage.

        verbose : bool (optional, default=False)
            Whether to report on the progress of the process.

        stop_words : list (optional, default=STOPWORDS)
            A list of words to filter out in the analysis of abstracts.
            The default is nltk's list of English stopwords, augmented
            with numbers and punctuation.

        information : bool or iterable (optional, default=True)
            Whether to return additional information on the matches that may
            help in the selection process.  If an iterable of keywords is
            provided, only return information for these keywords.  Allowed
            values are "first_year", "num_coauthors", "num_publications",
            "num_citations", "country", "language",
            "reference_sim", "abstract_sim".

        refresh : bool (optional, default=False)
            Whether to refresh cached search files.

        tfidf_kwds : keywords
            Parameters to pass to TfidfVectorizer from the sklearn package
            for abstract vectorization.  Not used when `information=False`
            or when "abstract_sim" is not in `information`.  See
            https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            for possible values.

        Returns
        -------
        matches : list
            A list of Scopus IDs of scientists matching all the criteria (if
            information is False) or a list of namedtuples with the Scopus ID
            and additional information (if information is True).

        Raises
        ------
        ValueError
            If information is not bool and contains invalid keywords.
        """
        # Checks
        info_keys = [
            "first_name", "surname", "first_year", "num_coauthors",
            "num_publications", "num_citations", "num_coauthors_period",
            "num_publications_period", "num_citations_period", "subjects",
            "country", "affiliation_id", "affiliation", "language",
            "reference_sim", "abstract_sim"
        ]
        if isinstance(information, bool):
            if information:
                keywords = info_keys
            elif self.search_affiliations:
                information = True
                keywords = ["affiliation_id"]
            else:
                keywords = None
        else:
            keywords = information
            invalid = [x for x in keywords if x not in info_keys]
            if invalid:
                text = ("Parameter information contains invalid keywords: ",
                        ", ".join(invalid))
                raise ValueError(text)
            if self.search_affiliations and "affiliation_id" not in keywords:
                keywords.append("affiliation_id")
        # Variables
        _years = range(self.first_year - self.year_margin,
                       self.first_year + self.year_margin + 1)
        if self.period:
            _npapers = margin_range(len(self.publications_period),
                                    self.pub_margin)
            _ncits = margin_range(self.citations_period, self.cits_margin)
            _ncoauth = margin_range(len(self.coauthors_period),
                                    self.coauth_margin)
            _npapers_full = margin_range(len(self.publications),
                                         self.pub_margin)
            _ncits_full = margin_range(self.citations, self.cits_margin)
            _ncoauth_full = margin_range(len(self.coauthors),
                                         self.coauth_margin)
        else:
            _npapers = margin_range(len(self.publications), self.pub_margin)
            _ncits = margin_range(self.citations, self.cits_margin)
            _ncoauth = margin_range(len(self.coauthors), self.coauth_margin)
        n = len(self.search_group)
        text = "Searching through characteristics of {:,} authors".format(n)
        custom_print(text, verbose)

        # First round of filtering: minimum publications and main field
        # create df of authors
        authors = query_author_data(self.search_group, verbose=verbose)
        same_field = (authors.areas.str.startswith(self.main_field[1]))
        enough_pubs = (authors.documents.astype(int) >= int(min(_npapers)))
        group = authors[same_field & enough_pubs]["auth_id"].tolist()
        group.sort()
        n = len(group)
        text = "Left with {} authors\nFiltering based on provided "\
               "conditions...".format(n)
        custom_print(text, verbose)

        # Second round of filtering:
        # Drop authors with publications before the minimum year, then check
        # the number of publications in the relevant period.
        params = {
            "group": group,
            "ybefore": min(_years) - 1,
            "yupto": self.year,
            "npapers": _npapers,
            "yfrom": self.year_period,
            "verbose": verbose
        }
        group, _, _ = filter_pub_counts(**params)
        # Also screen out ids with too many publications over the full period
        if self.period:
            params.update({
                "npapers": [1, max(_npapers_full)],
                "yfrom": None,
                "group": group
            })
            group, _, _ = filter_pub_counts(**params)

        # Third round of filtering: citations (in the FULL period).
        authors = pd.DataFrame({"auth_id": group, "year": self.year})
        _, authors_cits_search = author_cits_in_cache(authors)
        text = "Search and filter based on count of citations\n{} to search "\
               "out of {}\n".format(len(authors_cits_search), len(group))
        custom_print(text, verbose)
        if not authors_cits_search.empty:
            authors_cits_search['n_cits'] = 0
            print_progress(0, len(authors_cits_search), verbose)
            for i, au in authors_cits_search.iterrows():
                q = "REF({}) AND PUBYEAR BEF {} AND NOT AU-ID({})".format(
                    au['auth_id'], self.year + 1, au['auth_id'])
                n = base_query("docs", q, size_only=True)
                authors_cits_search.at[i, 'n_cits'] = n
                print_progress(i + 1, len(authors_cits_search), verbose)
            cache_insert(authors_cits_search, table="author_cits_size")
        auth_cits_incache, _ = author_cits_in_cache(
            authors[["auth_id", "year"]])
        # Keep authors whose citation count lies within the margins
        if self.period:
            mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                    (auth_cits_incache.n_cits <= max(_ncits_full)))
        else:
            mask = ((auth_cits_incache.n_cits >= min(_ncits)) &
                    (auth_cits_incache.n_cits <= max(_ncits)))
        group = auth_cits_incache[mask]["auth_id"].tolist()

        # Fourth round of filtering: Download publications, verify coauthors
        # (in the FULL period) and first year.
        n = len(group)
        text = "Left with {} authors\nFiltering based on coauthors "\
               "number...".format(n)
        custom_print(text, verbose)
        authors = pd.DataFrame({"auth_id": group, "year": self.year},
                               dtype="uint64")
        _, author_year_search = author_year_in_cache(authors)
        matches = []
        if stacked:  # Combine searches
            if not author_year_search.empty:
                q = Template(
                    "AU-ID($fill) AND PUBYEAR BEF {}".format(self.year + 1))
                auth_year_group = author_year_search.auth_id.tolist()
                params = {
                    "group": auth_year_group,
                    "res": [],
                    "template": q,
                    "refresh": refresh,
                    "joiner": ") OR AU-ID(",
                    "q_type": "docs"
                }
                if verbose:
                    params.update({"total": len(auth_year_group)})
                res, _ = stacked_query(**params)
                res = build_dict(res, auth_year_group)
                if res:
                    # res can become empty after build_dict if an auth_id is old
                    res = pd.DataFrame.from_dict(res, orient="index")
                    res["year"] = self.year
                    res = res[["year", "first_year", "n_pubs", "n_coauth"]]
                    res.index.name = "auth_id"
                    res = res.reset_index()
                    cache_insert(res, table="author_year")
            author_year_cache, _ = author_year_in_cache(authors)
            if self._ignore_first_id:
                # only number of coauthors should be big enough
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = enough & notoomany
            elif self.period:
                # number of coauthors should be "big enough" and first year in
                # window
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                enough = (author_year_cache.n_coauth >= min(_ncoauth))
                notoomany = (author_year_cache.n_coauth <= max(_ncoauth_full))
                mask = same_start & enough & notoomany
            else:
                # all restrictions apply
                same_start = (author_year_cache.first_year.between(
                    min(_years), max(_years)))
                same_coauths = (author_year_cache.n_coauth.between(
                    min(_ncoauth), max(_ncoauth)))
                mask = same_start & same_coauths
            matches = author_year_cache[mask]["auth_id"].tolist()
        else:  # Query each author individually
            for i, au in enumerate(group):
                print_progress(i + 1, len(group), verbose)
                res = base_query("docs",
                                 "AU-ID({})".format(au),
                                 refresh=refresh)
                res = [
                    p for p in res
                    if p.coverDate and int(p.coverDate[:4]) <= self.year
                ]
                # Filter
                min_year = int(min([p.coverDate[:4] for p in res]))
                authids = [p.author_ids for p in res if p.author_ids]
                authors = set([a for p in authids for a in p.split(";")])
                n_coauth = len(authors) - 1  # Subtract 1 for focal author
                if self._ignore_first_id and (n_coauth < max(_ncoauth)):
                    # only number of coauthors should be big enough
                    continue
                elif (self.period and ((n_coauth < max(_ncoauth)) or
                                       (min_year not in _years))):
                    # number of coauthors should be "big enough" and first year
                    # in window
                    continue
                elif ((len(res) not in _npapers) or (min_year not in _years)
                      or (n_coauth not in _ncoauth)):
                    continue
                matches.append(au)

        if self.period:
            text = "Left with {} authors\nFiltering based on exact period "\
                   "citations and coauthors...".format(len(matches))
            custom_print(text, verbose)
            # Further screen matches based on period cits and coauths
            to_loop = list(matches)  # copy, since matches is mutated below
            for m in to_loop:
                q = "AU-ID({})".format(m)
                res = base_query("docs",
                                 "AU-ID({})".format(m),
                                 refresh=refresh,
                                 fields=["eid", "author_ids", "coverDate"])
                pubs = [
                    p for p in res if int(p.coverDate[:4]) <= self.year
                    and int(p.coverDate[:4]) >= self.year_period
                ]
                coauths = set(get_authors(pubs)) - {str(m)}
                if not (min(_ncoauth) <= len(coauths) <= max(_ncoauth)):
                    matches.remove(m)
                    continue
                eids_period = [p.eid for p in pubs]
                cits = count_citations(search_ids=eids_period,
                                       pubyear=self.year + 1,
                                       exclusion_key="AU-ID",
                                       exclusion_ids=[str(m)])
                if not (min(_ncits) <= cits <= max(_ncits)):
                    matches.remove(m)
        text = "Found {:,} author(s) matching all criteria".format(
            len(matches))
        custom_print(text, verbose)

        # Possibly add information to matches
        if keywords and len(matches) > 0:
            custom_print("Providing additional information...", verbose)
            profiles = [
                Scientist([str(a)],
                          self.year,
                          period=self.period,
                          refresh=refresh) for a in matches
            ]
            matches = inform_matches(profiles, self, keywords, stop_words,
                                     verbose, refresh, **tfidf_kwds)
        if self.search_affiliations:
            matches = [
                m for m in matches if len(
                    set(m.affiliation_id.replace(" ", "").split(";")).
                    intersection([str(a) for a in self.search_affiliations]))
            ]
        return matches
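
A hedged usage sketch of this method (assumes an Original instance whose search group was already defined, e.g. via a prior define_search_group() call, and configured Scopus access):

matches = original.find_matches(stacked=True, verbose=True,
                                information=["first_year", "num_publications",
                                             "abstract_sim"])
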
Example #9
def inform_matches(self, keywords, stop_words, verbose, refresh, **kwds):
    """Add match-specific information to all matches.

    Parameters
    ----------
    self : sosia.Original()
        Object whose matches should receive additional information.

    keywords : iterable of strings
        Which information to add to matches.

    stop_words : list
        A list of words that should be filtered in the analysis of abstracts.

    verbose : bool
        Whether to report on the progress of the process and the completeness
        of document information.

    refresh : bool
        Whether to refresh all cached files or not.

    kwds : keywords
        Parameters to pass to sklearn.feature_extraction.text.TfidfVectorizer
        for abstract and reference vectorization.

    Returns
    -------
    out : list of namedtuples
        A list of namedtuples representing matches.  The information
        provided depends on the requested keywords.
    """
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
    from string import digits, punctuation

    from sosia.classes import Scientist

    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)

    # Preparation
    doc_parse = "reference_sim" in keywords or "abstract_sim" in keywords
    if doc_parse:
        focal_docs = parse_docs([d.eid for d in self.publications], refresh)
        focal_refs, focal_refs_n, focal_abs, focal_abs_n = focal_docs
        if not stop_words:
            stop_words = list(ENGLISH_STOP_WORDS) + list(punctuation + digits)

    # Add selected information match-by-match
    out = []
    completeness = {}
    total = len(self.matches)
    print_progress(0, total, verbose)
    meta = namedtuple("Meta", "refs absts total")
    for idx, auth_id in enumerate(self.matches):
        period = self.year + 1 - self._period_year
        p = Scientist([auth_id],
                      self.year,
                      period=period,
                      refresh=refresh,
                      sql_fname=self.sql_fname)
        match_info = inform_match(p, keywords)
        # Abstract and reference similarity is performed jointly
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n, absts, absts_n = parse_docs(eids, refresh)
            completeness[auth_id] = meta(refs=refs_n,
                                         absts=absts_n,
                                         total=len(eids))
            if "reference_sim" in keywords:
                ref_cos = compute_similarity(refs, focal_refs, **kwds)
                match_info["reference_sim"] = ref_cos
            if "abstract_sim" in keywords:
                kwds.update({"stop_words": stop_words})
                abs_cos = compute_similarity(absts,
                                             focal_abs,
                                             tokenize=True,
                                             **kwds)
                match_info["abstract_sim"] = abs_cos
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)

    # Finally, report on missing document information
    if verbose and doc_parse:
        for auth_id, doc_meta in completeness.items():
            _print_missing_docs([auth_id], doc_meta.absts,
                                doc_meta.refs, doc_meta.total)
        focal_pubs_n = len(self.publications)
        _print_missing_docs(self.identifier,
                            focal_abs_n,
                            focal_refs_n,
                            focal_pubs_n,
                            res_type="Original")
    return out
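
compute_similarity is sosia-internal; a plausible minimal sketch of a TF-IDF cosine similarity between a match's document string and the focal one (an assumption, not sosia's exact code; the tokenize flag used above is omitted here):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compute_similarity(doc, focal_doc, **kwds):
    """Cosine similarity between the TF-IDF vectors of two documents."""
    matrix = TfidfVectorizer(**kwds).fit_transform([doc, focal_doc])
    return cosine_similarity(matrix[0], matrix[1])[0][0]
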
Example #10
# -*- coding: utf-8 -*-
"""Tests for class `Scientist`."""

from collections import namedtuple
from nose.tools import assert_equal, assert_true, assert_false

from sosia.classes import Scientist

refresh = 30
scientist1 = Scientist([6701809842], 2001, refresh=refresh)
scientist2 = Scientist([55208373700, 55208373700], 2017, refresh=refresh)
eids = ["2-s2.0-84959420483", "2-s2.0-84949113230"]
scientist3 = Scientist([55208373700], 2017, eids=eids, refresh=refresh)
scientist4 = Scientist([55208373700], 2015, refresh=refresh)
scientist5 = Scientist([55208373700], 2018, period=2, refresh=refresh)


def test_active_year():
    assert_equal(scientist1.active_year, 2001)
    assert_equal(scientist2.active_year, 2017)
    assert_equal(scientist3.active_year, 2016)
    assert_equal(scientist4.active_year, 2012)
    assert_equal(scientist5.active_year, 2018)


def test_affiliation_country():
    assert_equal(scientist1.affiliation_country, "Germany")
    assert_equal(scientist2.affiliation_country, "Switzerland")
    assert_equal(scientist3.affiliation_country, "Switzerland")
    assert_equal(scientist4.affiliation_country, "Switzerland")
    assert_equal(scientist5.affiliation_country, "Germany")
Example #11
def inform_matches(profiles, focal, keywords, stop_words, verbose, refresh,
                   **kwds):
    """Create namedtuple adding information to matches.

    Parameters
    ----------
    profiles : list of Scientist()
        A list of Scientist objects representing matches.

    focal : Scientist
        Object of class Scientist representing the focal scientist.

    keywords : iterable of strings
        Which information to add to matches.

    stop_words : list
        A list of words that should be filtered in the analysis of abstracts.

    verbose : bool
        Whether to report on the progress of the process and the completeness
        of document information.

    refresh : bool
        Whether to refresh all cached files or not.

    kwds : keywords
        Parameters to pass to sklearn.feature_extraction.text.TfidfVectorizer
        for abstract and reference vectorization.

    Returns
    -------
    out : list of namedtuples
        A list of namedtuples representing matches.  The information
        provided depends on the requested keywords.
    """
    from pybliometrics.scopus.exception import Scopus404Error
    from sklearn.feature_extraction.text import TfidfVectorizer

    from sosia.classes import Scientist
    # Create Match object
    fields = "ID name " + " ".join(keywords)
    m = namedtuple("Match", fields)
    # Preparation
    doc_parse = "reference_sim" in keywords or "abstract_sim" in keywords
    total = len(profiles)
    print_progress(0, total, verbose)
    if doc_parse:
        focal_eids = [d.eid for d in focal.publications]
        # Keep `focal` bound to the Scientist: it is needed again below
        focal_docs = parse_docs(focal_eids, refresh)
        focal_refs, focal_refs_n, focal_abs, focal_abs_n = focal_docs
    # Add selective information
    out = []
    info = {}  # collects counts of missing document information per match
    Meta = namedtuple("Meta", "refs absts total")
    for idx, p in enumerate(profiles):
        # Add characteristics
        match_info = {"ID": p.identifier[0], "name": p.name}
        if "language" in keywords:
            try:
                match_info["language"] = p.get_publication_languages().language
            except Scopus404Error:  # Refresh profile
                p = Scientist(p.identifier, p.year, refresh=True)
                match_info["language"] = p.get_publication_languages().language
        if "first_name" in keywords:
            match_info["first_name"] = p.first_name
        if "surname" in keywords:
            match_info["surname"] = p.surname
        if "first_year" in keywords:
            match_info["first_year"] = p.first_year
        if "num_coauthors" in keywords:
            match_info["num_coauthors"] = len(p.coauthors)
        if "num_publications" in keywords:
            match_info["num_publications"] = len(p.publications)
        if "num_citations" in keywords:
            match_info["num_citations"] = p.citations
        if "num_coauthors_period" in keywords:
            match_info["num_coauthors_period"] = len(p.coauthors_period)
        if "num_publications_period" in keywords:
            match_info["num_publications_period"] = len(p.publications_period)
        if "num_citations_period" in keywords:
            match_info["num_citations_period"] = p.citations_period
        if "subjects" in keywords:
            match_info["subjects"] = p.subjects
        if "country" in keywords:
            match_info["country"] = p.country
        if "city" in keywords:
            match_info["city"] = p.city
        if "affiliation_id" in keywords:
            match_info["affiliation_id"] = p.affiliation_id
        if "affiliation" in keywords:
            match_info["affiliation"] = p.organization
        # Abstract and reference similarity is performed jointly
        if doc_parse:
            eids = [d.eid for d in p.publications]
            refs, refs_n, absts, absts_n = parse_docs(eids, refresh)
            vec = TfidfVectorizer(**kwds)
            ref_cos = compute_cos(vec.fit_transform([refs, focal_refs]))
            vec = TfidfVectorizer(stop_words=stop_words,
                                  tokenizer=tokenize_and_stem,
                                  **kwds)
            abs_cos = compute_cos(vec.fit_transform([absts, focal_abs]))
            # Save counts for the completeness report below
            key = "; ".join(p.identifier)
            info[key] = Meta(refs=refs_n, absts=absts_n, total=len(eids))
        if "reference_sim" in keywords:
            match_info["reference_sim"] = ref_cos
        if "abstract_sim" in keywords:
            match_info["abstract_sim"] = abs_cos
        # Finalize
        out.append(m(**match_info))
        print_progress(idx + 1, total, verbose)
    # Finally, report missing document information
    if verbose and doc_parse:
        for auth_id, meta in info.items():
            _print_missing_docs(auth_id, meta.refs, meta.absts, meta.total)
        label = ";".join(focal.identifier) + " (focal)"
        focal_pubs_n = len(focal.publications)
        _print_missing_docs(label, focal_refs_n, focal_abs_n, focal_pubs_n)
    return out
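
compute_cos receives the stacked two-row TF-IDF matrix built above; a minimal sketch of what such a helper plausibly computes (an assumption, not sosia's exact code):

from sklearn.metrics.pairwise import cosine_similarity

def compute_cos(matrix, digits=4):
    """Cosine similarity between the two rows of a 2-row TF-IDF matrix."""
    return round(cosine_similarity(matrix[0], matrix[1])[0][0], digits)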