Example #1
    def test_is_null(self):

        import numpy as np
        import pandas as pd
        from pewtils import is_null, is_not_null

        for val in [None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"]:
            self.assertTrue(is_null(val))
        self.assertTrue(is_null(np.nan))
        self.assertTrue(is_not_null(0.0))
        self.assertTrue(is_null("-9", custom_nulls=["-9"]))
        self.assertTrue(is_null([], empty_lists_are_null=True))
        self.assertFalse(is_null([], empty_lists_are_null=False))
        self.assertTrue(is_null(pd.Series(), empty_lists_are_null=True))
        self.assertFalse(is_null(pd.Series(), empty_lists_are_null=False))
        self.assertTrue(is_null(pd.DataFrame(), empty_lists_are_null=True))
        self.assertFalse(is_null(pd.DataFrame(), empty_lists_are_null=False))
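
As a quick illustration beyond the test assertions above, here is a minimal sketch of filtering a pandas Series with these helpers; the values are made up, and only pewtils and pandas are assumed to be installed.

import pandas as pd
from pewtils import is_not_null

# Keep only the entries pewtils does not consider null ("none", "N/A", " ", and None are dropped)
values = pd.Series(["apple", "none", "N/A", " ", None, "banana"])
cleaned = values[values.map(is_not_null)]
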
Example #2
    def __init__(self,
                 path,
                 use_s3=None,
                 aws_access=None,
                 aws_secret=None,
                 bucket=None):

        if aws_access is None:
            aws_access = os.environ.get("AWS_ACCESS_KEY_ID", None)
        if aws_secret is None:
            aws_secret = os.environ.get("AWS_SECRET_ACCESS_KEY", None)
        if bucket is None:
            bucket = os.environ.get("S3_BUCKET", None)

        self.path = path

        self.use_s3 = use_s3 if is_not_null(bucket) else False
        if self.use_s3:

            s3_params = {}

            if aws_access is not None:
                s3_params["aws_access_key_id"] = aws_access
                s3_params["aws_secret_access_key"] = aws_secret

            if "." in bucket:
                s3_params["calling_format"] = OrdinaryCallingFormat()

            self.s3 = S3Connection(**s3_params).get_bucket(bucket)

        else:
            self.path = os.path.join(self.path)
            if not os.path.exists(self.path):
                try:
                    os.makedirs(self.path)
                except Exception as e:
                    print("Warning: couldn't make directory '{}'".format(
                        self.path))
                    print(e)
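
For context, here is a sketch of how this constructor might be invoked. The class name and import path (FileHandler in pewtils.io) are assumptions, since only the __init__ body appears above, and the path and bucket values are placeholders.

from pewtils.io import FileHandler  # import path and class name are assumptions

# Local mode: files are written under ./cache, creating the directory if needed
h = FileHandler("cache", use_s3=False)

# S3 mode: credentials and bucket fall back to the AWS_ACCESS_KEY_ID,
# AWS_SECRET_ACCESS_KEY, and S3_BUCKET environment variables when not passed
h = FileHandler("cache", use_s3=True, bucket="my-bucket")
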
Example #3
    def fit(self, df=None, **kwargs):
        """
        Fits a model using the method specified when initializing the ``TopicModel``. Details on model-specific \
        parameters are below:

        **sklearn_lda**

        Fits a model using :py:class:`sklearn.decomposition.LatentDirichletAllocation`. For more information on \
        available parameters, please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be composed of more \
        topics; when values are lower, documents will be composed primarily of only a few topics. This parameter is \
        used instead of the ``doc_topic_prior`` sklearn parameter, and will be passed along to sklearn using the formula: \
        ``doc_topic_prior = alpha / num_topics``
        :param beta: Represents topic-word density. When values are higher, topics will be composed of more words; \
        when values are lower, only a few words will be loaded onto each topic. This parameter is used instead of the \
        ``topic_word_prior`` sklearn parameter, and will be passed along to sklearn using the formula: \
        ``topic_word_prior = beta / num_topics``.
        :param learning_decay: See sklearn documentation.
        :param learning_offset: See sklearn documentation.
        :param learning_method: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param batch_size: See sklearn documentation.
        :param verbose: See sklearn documentation.

        **sklearn_nmf**

        Fits a model using :py:class:`sklearn.decomposition.NMF`. For more information on available parameters, \
        please refer to the official documentation: \
        https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: See sklearn documentation.
        :param l1_ratio: See sklearn documentation.
        :param tol: See sklearn documentation.
        :param max_iter: See sklearn documentation.
        :param shuffle: See sklearn documentation.

        **gensim_lda**

        Fits an LDA model using :py:class:`gensim.models.LdaModel` or \
        :py:class:`gensim.models.ldamulticore.LdaMulticore`. \
        When ``use_multicore`` is set to True, the multicore implementation is used; otherwise the standard \
        LDA implementation is used. \
        For more information on available parameters, please refer to the official documentation below:

            - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
            - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param alpha: Represents document-topic density. When values are higher, documents will be composed of \
        more topics; when values are lower, documents will be composed primarily of only a few topics. Gensim's \
        accepted values differ somewhat from sklearn's; refer to the gensim documentation for details.
        :param beta: Represents topic-word density. When values are higher, topics will be composed of more words; \
        when values are lower, only a few words will be loaded onto each topic. Gensim's accepted values differ \
        somewhat from sklearn's; refer to the gensim documentation for details. Gensim calls this parameter \
        ``eta``; it has been renamed here to be consistent with the sklearn implementations.
        :param chunksize: See gensim documentation.
        :param passes: See gensim documentation.
        :param decay: See gensim documentation.
        :param offset: See gensim documentation.
        :param workers: Number of cores to use (if using multicore)
        :param use_multicore: Whether or not to use multicore

        **gensim_hdp**

        Fits an HDP model using the gensim implementation. Unlike LDA and NMF, HDP attempts to auto-detect the \
        correct number of topics. In practice it actually fits ``T`` topics (default is 150), but many are extremely \
        rare or occur in only a handful of documents. To identify the topics that are actually useful, this function \
        passes the original :py:class:`pandas.DataFrame` through the trained model after fitting and identifies \
        topics that account for at least 1% of a document in at least 1% of all documents in the corpus. In other \
        words, a topic is discarded if the documents in which it appears at a rate of at least 1% make up fewer than \
        1% of the total number of documents. Subsequent use of the model will only make use of topics that meet this \
        threshold. For more information on available parameters, please refer to the official documentation: \
        https://radimrehurek.com/gensim/models/hdpmodel.html

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param max_chunks: See gensim documentation.
        :param max_time: See gensim documentation.
        :param chunksize: See gensim documentation.
        :param kappa: See gensim documentation.
        :param tau: See gensim documentation.
        :param T: See gensim documentation.
        :param K: See gensim documentation.
        :param alpha: See gensim documentation.
        :param beta: See gensim documentation.
        :param gamma: See gensim documentation.
        :param scale: See gensim documentation.
        :param var_converge: See gensim documentation.

        **corex**

        Fits a CorEx topic model. Anchors can be provided in the form of a list of lists, with each item
        corresponding to a set of words to be used to seed a topic. For example:

        .. code-block:: python

            anchors=[
                ['cat', 'kitten'],
                ['dog', 'puppy']
            ]

        The list of anchors cannot be longer than the specified number of topics, and all of the words must \
        exist in the vocabulary. The ``anchor_strength`` parameter determines the degree to which the model is able to \
        override the suggested words based on the data; higher values are a way of "insisting" more strongly \
        that the model keep the provided words together in a single topic. For more information on available \
        parameters, please refer to the official documentation: https://github.com/gregversteeg/corex_topic

        :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
        :param anchors: A list of lists that contain words that the model should try to group together into topics
        :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data

        """

        fit_params = self.get_fit_params(**kwargs)

        if self.method in ["sklearn_lda", "sklearn_nmf"]:

            if self.method == "sklearn_lda":
                self.model = LatentDirichletAllocation(
                    n_components=self.num_topics, **fit_params)
            if self.method == "sklearn_nmf":
                self.model = NMF(n_components=self.num_topics, **fit_params)

            if is_not_null(df):
                features = self.get_features(df)
            else:
                features = self.train_features
            self.model.fit(features)

        elif self.method in ["gensim_lda", "gensim_hdp"]:

            vocab_dict = dict([(i, s) for i, s in enumerate(self.ngrams)])
            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.train_features
            matrix = gensim.matutils.Sparse2Corpus(features,
                                                   documents_columns=False)

            if self.method == "gensim_lda":
                fit_params["num_topics"] = self.num_topics
                fit_params["id2word"] = vocab_dict
                if fit_params["use_multicore"]:
                    model_class = gensim.models.ldamulticore.LdaMulticore
                else:
                    model_class = gensim.models.LdaModel
                    del fit_params["workers"]
                del fit_params["use_multicore"]
                self.model = model_class(**fit_params)
                self.model.update(matrix)
            elif self.method == "gensim_hdp":
                model_class = gensim.models.hdpmodel.HdpModel
                self.model = model_class(matrix, vocab_dict, **fit_params)
                doc_topics = self.get_document_topics(self.df)
                topics = ((doc_topics >= 0.01).astype(int).mean() >=
                          0.01).astype(int)
                self.topic_ids = [
                    int(col.split("_")[-1])
                    for col in topics[topics == 1].index
                    if col.startswith("topic_")
                ]
                self.num_topics = len(self.topic_ids)

        elif self.method == "corex":

            if is_not_null(df):
                features = self.get_features(df, keep_sparse=True)
            else:
                features = self.get_features(self.train_df, keep_sparse=True)
            self.model = corextopic.Corex(n_hidden=self.num_topics)
            self.model.fit(features, words=self.ngrams, **fit_params)
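
To tie the parameter documentation together, here is a brief sketch of passing method-specific keyword arguments to fit(). The import path is an assumption and the helpers are purely illustrative; they expect an already-constructed TopicModel, since the constructor is not shown in this snippet.

from pewanalytics.text.topics import TopicModel  # import path is an assumption


def fit_corex_with_anchors(model: TopicModel):
    # corex: seed two topics and insist fairly strongly that the anchor words stay together
    model.fit(anchors=[["cat", "kitten"], ["dog", "puppy"]], anchor_strength=5)


def fit_multicore_lda(model: TopicModel):
    # gensim_lda: use the multicore implementation with four workers and ten passes
    model.fit(use_multicore=True, workers=4, passes=10)
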
Example #4
def compute_mutual_info(y,
                        x,
                        weights=None,
                        col_names=None,
                        l=0,
                        normalize=True):
    """
    Computes pointwise mutual information for a set of observations partitioned into two groups.

    :param y: An array or, preferably, a :py:class:`pandas.Series`
    :param x: A matrix, :py:class:`pandas.DataFrame`, or preferably a :py:class:`scipy.sparse.csr_matrix`
    :param weights: (Optional) An array of weights corresponding to each observation
    :param col_names: The feature names associated with the columns in matrix 'x'
    :type col_names: list
    :param l: An optional Laplace smoothing parameter
    :type l: int or float
    :param normalize: Toggle normalization on or off (to control for feature prevalence), on by default
    :type normalize: bool
    :return: A :py:class:`pandas.DataFrame` of features with a variety of computed metrics including mutual information.

    The function expects ``y`` to correspond to a list or series of values indicating which partition an observation \
    belongs to. ``y`` must be a binary flag. ``x`` is a set of features (either a :py:class:`pandas.DataFrame` or \
    sparse matrix) where the rows correspond to observations and the columns represent the presence of features (you \
    can technically run this using non-binary features, but the results will not be as readily interpretable). The \
    function returns a :py:class:`pandas.DataFrame` of metrics computed for each feature, including the following \
    columns:

    - ``MI1``: The feature's mutual information for the positive class
    - ``MI0``: The feature's mutual information for the negative class
    - ``total``: The total number of times a feature appeared
    - ``total_pos_with_term``: The total number of times a feature appeared in positive cases
    - ``total_neg_with_term``: The total number of times a feature appeared in negative cases
    - ``total_pos_neg_with_term_diff``: The raw difference in the number of times a feature appeared in positive cases \
    relative to negative cases
    - ``pct_pos_with_term``: The proportion of positive cases that had the feature
    - ``pct_neg_with_term``: The proportion of negative cases that had the feature
    - ``pct_pos_neg_with_term_ratio``: A likelihood ratio indicating the degree to which a positive case was more likely \
    to have the feature than a negative case
    - ``pct_term_pos``: Of the cases that had a feature, the proportion that were in the positive class
    - ``pct_term_neg``: Of the cases that had a feature, the proportion that were in the negative class
    - ``pct_term_pos_neg_diff``: The percentage point difference between the proportion of cases with the feature that \
    were positive vs. negative
    - ``pct_term_pos_neg_ratio``: A likelihood ratio indicating the degree to which a feature was more likely to appear \
    in a positive case relative to a negative one (may not be meaningful when classes are imbalanced)

    .. note:: Note that ``pct_term_pos`` and ``pct_term_neg`` may not be directly comparable if classes are imbalanced, \
        and in such cases a ``pct_term_pos_neg_diff`` above zero or ``pct_term_pos_neg_ratio`` above 1 may not indicate a \
        true association with the positive class if positive cases outnumber negative ones.

    .. note:: Mutual information can be a difficult metric to explain to others. We've found that the \
        ``pct_pos_neg_with_term_ratio`` can serve as a more interpretable alternative method for identifying \
        meaningful differences between groups.

    Usage::

        from pewanalytics.stats.mutual_info import compute_mutual_info
        import nltk
        import pandas as pd
        from sklearn.metrics.pairwise import linear_kernel
        from sklearn.feature_extraction.text import TfidfVectorizer

        nltk.download("inaugural")
        df = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)} for fileid in nltk.corpus.inaugural.fileids()
        ])
        df['year'] = df['speech'].map(lambda x: int(x.split("-")[0]))
        df['21st_century'] = df['year'].map(lambda x: 1 if x >= 2000 else 0)

        vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df['text'])
        tfidf = vec.transform(df['text'])

        # Here are the terms most distinctive of inaugural addresses in the 21st century vs. years prior

        >>> results = compute_mutual_info(df['21st_century'], tfidf, col_names=vec.get_feature_names())

        >>> results.sort_values("MI1", ascending=False).index[:25]
        Index(['america', 'thank', 'bless', 'schools', 'ideals', 'americans',
               'meaning', 'you', 'move', 'across', 'courage', 'child', 'birth',
               'generation', 'families', 'build', 'hard', 'promise', 'choice', 'women',
               'guided', 'words', 'blood', 'dignity', 'because'],
              dtype='object')

    """

    if is_not_null(weights):
        weights = weights.fillna(0)
        y0 = sum(weights[y == 0])
        y1 = sum(weights[y == 1])
        total = sum(weights)
    else:
        y0 = len(y[y == 0])
        y1 = len(y[y == 1])
        total = y1 + y0

    if type(x).__name__ == "csr_matrix":

        if is_not_null(weights):
            x = x.transpose().multiply(csr_matrix(weights)).transpose()
        x1 = pd.Series(x.sum(axis=0).tolist()[0])
        x0 = total - x1
        x1y0 = pd.Series(
            x[np.ravel(np.array(y[y == 0].index)), :].sum(axis=0).tolist()[0])
        x1y1 = pd.Series(
            x[np.ravel(np.array(y[y == 1].index)), :].sum(axis=0).tolist()[0])

    else:

        if type(x).__name__ != "DataFrame":
            x = pd.DataFrame(x, columns=col_names)

        if is_not_null(weights):
            x = x.multiply(weights, axis="index")
            x1 = x.multiply(weights, axis="index").sum()
            x0 = ((x * -1) + 1).multiply(weights, axis="index").sum()
        else:
            x1 = x.sum()
            x0 = ((x * -1) + 1).sum()
        x1y0 = x[y == 0].sum()
        x1y1 = x[y == 1].sum()

    px1y0 = x1y0 / total
    px1y1 = x1y1 / total
    px0y0 = (y0 - x1y0) / total
    px0y1 = (y1 - x1y1) / total

    px1 = x1 / total
    px0 = x0 / total
    py1 = float(y1) / float(total)
    py0 = float(y0) / float(total)

    MI1 = (px1y1 / (px1 * py1) +
           l).map(lambda v: math.log(v, 2) if v > 0 else 0)
    if normalize:
        MI1 = MI1 / (-1 * px1y1.map(lambda v: math.log(v, 2) if v > 0 else 0))

    MI0 = (px1y0 / (px1 * py0) +
           l).map(lambda v: math.log(v, 2) if v > 0 else 0)
    if normalize:
        MI0 = MI0 / (-1 * px1y0.map(lambda v: math.log(v, 2) if v > 0 else 0))

    df = pd.DataFrame()

    df["MI1"] = MI1
    df["MI0"] = MI0

    df["total"] = x1
    df["total_pos_with_term"] = x1y1  # total_pos_mention
    df["total_neg_with_term"] = x1y0  # total_neg_mention
    df["total_pos_neg_with_term_diff"] = (df["total_pos_with_term"] -
                                          df["total_neg_with_term"])
    df["pct_with_term"] = x1 / (x1 + x0)
    df["pct_pos_with_term"] = x1y1 / y1  # pct_pos_mention
    df["pct_neg_with_term"] = x1y0 / y0  # pct_neg_mention
    df["pct_pos_neg_with_term_diff"] = (df["pct_pos_with_term"] -
                                        df["pct_neg_with_term"]
                                        )  # pct_pos_neg_mention_diff
    df["pct_pos_neg_with_term_ratio"] = df["pct_pos_with_term"] / (
        df["pct_neg_with_term"])  # pct_pos_neg_mention_ratio

    df["pct_term_pos"] = x1y1 / x1  # pct_mention_pos
    df["pct_term_neg"] = x1y0 / x1  # pct_mention_neg
    df["pct_term_pos_neg_diff"] = (df["pct_term_pos"] - df["pct_term_neg"]
                                   )  # pct_mention_pos_neg_diff
    df["pct_term_pos_neg_ratio"] = df["pct_term_pos"] / df["pct_term_neg"]

    if col_names:
        df.index = col_names

    return df
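
As a sanity check on the MI1 arithmetic above, here is a worked toy calculation of the pointwise mutual information and the normalize=True branch, using made-up counts and no Laplace smoothing (l=0).

import math

total = 100.0   # total observations
y1 = 20.0       # observations in the positive class
x1 = 10.0       # observations with the feature
x1y1 = 8.0      # positive observations with the feature

px1, py1, px1y1 = x1 / total, y1 / total, x1y1 / total
mi1 = math.log(px1y1 / (px1 * py1), 2)        # log2(0.08 / (0.1 * 0.2)) = log2(4) = 2.0
mi1_normalized = mi1 / (-math.log(px1y1, 2))  # 2.0 / 3.64... ≈ 0.55
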
Example #5
def compute_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column=None,
    pos_label=None,
):
    """
    Computes a variety of inter-rater reliability scores, including Cohen's kappa, Krippendorff's alpha, precision,
    and recall. The input data must consist of a :py:class:`pandas.DataFrame` with the following columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value
        - (Optional) A column with document weights

    This function will return a :py:class:`pandas.DataFrame` with agreement scores between the two specified coders.

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param coder1: The value in ``coder_column`` for rows corresponding to the first coder
    :type coder1: str or int
    :param coder2: The value in ``coder_column`` for rows corresponding to the second coder
    :type coder2: str or int
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :param weight_column: The column that contains sampling weights
    :type weight_column: str
    :param pos_label: The value indicating a positive label (optional)
    :type pos_label: str or int
    :return: A dictionary of scores
    :rtype: dict

    .. note:: If using a multi-class (non-binary) code, some scores may come back null or not compute as expected. \
        We recommend running the function separately for each specific code value as a binary flag by providing \
        each unique value to the ``pos_label`` argument. If ``pos_label`` is not provided for multi-class codes, \
        this function will attempt to compute scores based on support-weighted averages.

    Usage::

        from pewanalytics.stats.irr import compute_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': None,
         'coder1_mean_unweighted': 1.0,
         'coder1_std_unweighted': 0.5773502691896257,
         'coder2_mean_unweighted': 1.3333333333333333,
         'coder2_std_unweighted': 0.6666666666666666,
         'alpha_unweighted': 0.5454545454545454,
         'accuracy': 0.6666666666666666,
         'f1': 0.5555555555555555,
         'precision': 0.5,
         'recall': 0.6666666666666666,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.6123724356957946,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="0")
         {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '0',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.3333333333333333,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 1.0,
         'cohens_kappa': 1.0,
         'accuracy': 1.0,
         'f1': 1.0,
         'precision': 1.0,
         'recall': 1.0,
         'precision_recall_min': 1.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': 1.0,
         'pct_agree_unweighted': 1.0}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="1")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '1',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.0,
         'coder2_std_unweighted': 0.0,
         'alpha_unweighted': 0.0,
         'cohens_kappa': 0.0,
         'accuracy': 0.6666666666666666,
         'f1': 0.0,
         'precision': 0.0,
         'recall': 0.0,
         'precision_recall_min': 0.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="2")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '2',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.6666666666666666,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 0.4444444444444444,
         'cohens_kappa': 0.3999999999999999,
         'accuracy': 0.6666666666666666,
         'f1': 0.6666666666666666,
         'precision': 0.5,
         'recall': 1.0,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.5,
         'roc_auc': 0.75,
         'pct_agree_unweighted': 0.6666666666666666}


    """

    old_np_settings = np.seterr(all="raise")

    coder_df = copy.deepcopy(coder_df)
    if pos_label:
        coder_df[outcome_column] = (
            coder_df[outcome_column] == pos_label).astype(int)
    coder1_df = coder_df[coder_df[coder_column] == coder1]
    coder1_df.index = coder1_df[document_column]
    coder2_df = coder_df[coder_df[coder_column] == coder2]
    coder2_df.index = coder2_df[document_column]
    coder1_df = coder1_df[coder1_df.index.isin(coder2_df.index)]
    coder2_df = coder2_df[coder2_df.index.isin(
        coder1_df.index)].loc[coder1_df.index]

    row = {
        "coder1": coder1,
        "coder2": coder2,
        "n": len(coder1_df),
        "outcome_column": outcome_column,
        "pos_label": pos_label,
    }

    for labelsetname, labelset in [
        ("coder1", coder1_df[outcome_column]),
        ("coder2", coder2_df[outcome_column]),
    ]:

        if weight_column:
            try:
                weighted_stats = DescrStatsW(labelset,
                                             weights=coder1_df[weight_column])
                if weighted_stats:
                    row["{}_mean".format(labelsetname)] = weighted_stats.mean
                    row["{}_std".format(
                        labelsetname)] = weighted_stats.std_mean
            except (TypeError, ValueError):
                try:
                    weighted_stats = DescrStatsW(
                        labelset.astype(int), weights=coder1_df[weight_column])
                    if weighted_stats:
                        row["{}_mean".format(
                            labelsetname)] = weighted_stats.mean
                        row["{}_std".format(
                            labelsetname)] = weighted_stats.std_mean
                except (TypeError, ValueError):
                    pass

        try:
            unweighted_stats = DescrStatsW(labelset,
                                           weights=[1.0 for x in labelset])
            if unweighted_stats:
                row["{}_mean_unweighted".format(
                    labelsetname)] = unweighted_stats.mean
                row["{}_std_unweighted".format(
                    labelsetname)] = unweighted_stats.std_mean
        except (TypeError, ValueError):
            try:
                unweighted_stats = DescrStatsW(labelset.astype(int),
                                               weights=[1.0 for x in labelset])
                if unweighted_stats:
                    row["{}_mean_unweighted".format(
                        labelsetname)] = unweighted_stats.mean
                    row["{}_std_unweighted".format(
                        labelsetname)] = unweighted_stats.std_mean
            except (TypeError, ValueError):
                pass

    alpha = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values)
    try:
        alpha = alpha.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None
    row["alpha_unweighted"] = alpha

    labels = np.unique(coder_df[outcome_column])
    if len(labels) <= 2:

        try:
            row["cohens_kappa"] = cohen_kappa_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                labels=labels,
            )
        except FloatingPointError:
            row["cohens_kappa"] = 1.0

    try:
        row["accuracy"] = accuracy_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["accuracy"] = None

    try:
        row["f1"] = f1_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["f1"] = None

    try:
        row["precision"] = precision_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["precision"] = None

    try:
        row["recall"] = recall_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else "binary",
        )
    except ValueError:
        row["recall"] = None

    if is_not_null(row["precision"]) and is_not_null(row["recall"]):
        row["precision_recall_min"] = min([row["precision"], row["recall"]])
    else:
        row["precision_recall_min"] = None

    try:
        row["matthews_corrcoef"] = matthews_corrcoef(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
        )
    except ValueError:
        row["matthews_corrcoef"] = None
    except FloatingPointError:
        row["matthews_corrcoef"] = 1.0

    try:

        row["roc_auc"] = (roc_auc_score(
            coder1_df[outcome_column],
            coder2_df[outcome_column],
            sample_weight=coder1_df[weight_column] if weight_column else None,
            labels=labels,
            average="weighted" if not pos_label else None,
        ) if len(np.unique(coder1_df[outcome_column])) > 1
                          and len(np.unique(coder2_df[outcome_column])) > 1
                          else None)
    except TypeError:
        try:
            row["roc_auc"] = (roc_auc_score(
                coder1_df[outcome_column],
                coder2_df[outcome_column],
                sample_weight=coder1_df[weight_column]
                if weight_column else None,
                average="weighted" if not pos_label else None,
            ) if len(np.unique(coder1_df[outcome_column])) > 1
                              and len(np.unique(coder2_df[outcome_column])) > 1
                              else None)
        except (ValueError, TypeError):
            row["roc_auc"] = None
    except (ValueError, TypeError):
        row["roc_auc"] = None

    row["pct_agree_unweighted"] = np.average([
        1 if c[0] == c[1] else 0
        for c in zip(coder1_df[outcome_column], coder2_df[outcome_column])
    ])

    for k, v in row.items():
        if type(v) == tuple:
            row[k] = v[0]
            # For some weird reason, some of the sklearn scorers return 1-tuples sometimes

    np.seterr(**old_np_settings)

    return row
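
Here is a short sketch of the weighted path, continuing from the df built in the usage example above; the weight values are made up purely for illustration.

# Hypothetical sampling weights appended to the six-row df from the usage example
df["weight"] = [1.0, 1.0, 0.5, 0.5, 2.0, 2.0]
scores = compute_scores(
    df, "coder1", "coder2", "code", "document", "coder",
    weight_column="weight", pos_label="2",
)
# In addition to the *_unweighted keys shown above, the result now includes
# weighted coder means and standard errors (coder1_mean, coder1_std, ...)
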
Example #6
def trim_get_parameters(url, session=None, timeout=30, user_agent=None):
    """
    Takes a URL (presumed to be the final end point) and iterates over GET parameters, attempting to find optional
    ones that can be removed without generating any redirects.

    :param url: The URL to trim
    :type url: str
    :param session: (Optional) A persistent session to reuse (useful if you're processing many \
    links at once)
    :type session: :py:class:`requests.Session` object
    :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests Session \
    is not provided
    :type user_agent: str
    :param timeout: Timeout for requests
    :type timeout: int or float
    :return: The original URL with optional GET parameters removed
    :rtype: str

    Usage::

        from pewtils.http import trim_get_parameters

        >>> trim_get_parameters("https://httpbin.org/status/200?param=1")
        "https://httpbin.org/status/200"

    """

    close_session = False
    if not session:
        close_session = True
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent})

    # Often there's extra information about social sharing and referral sources that can be removed
    ditch_params = []
    parsed = urlparse.urlparse(url)
    if parsed.query:
        params = urlparse.parse_qs(parsed.query)
        for k, v in params.items():
            # We iterate over all of the GET parameters and try holding each one out
            check = True
            for skipper in ["document", "article", "id", "qs"]:
                # If the parameter is named something that's probably a unique ID, we'll keep it
                if skipper in k.lower():
                    check = False
            for skipper in ["html", "http"]:
                # Same goes for parameters that contain URL information
                if skipper in v[0].lower():
                    check = False
            if check:
                new_params = {
                    k2: v2[0]
                    for k2, v2 in params.items() if k2 != k and len(v2) == 1
                }
                new_params = urlparse.urlencode(new_params)
                new_parsed = parsed._replace(query=new_params)
                new_url = urlparse.urlunparse(new_parsed)
                try:
                    resp = session.head(new_url,
                                        allow_redirects=True,
                                        timeout=timeout)
                except ReadTimeout:
                    resp = None
                if is_not_null(resp):
                    new_parsed = urlparse.urlparse(resp.url)
                    if new_parsed.query != "" or new_parsed.path not in [
                            "", "/"
                    ]:
                        # If removing a parameter didn't redirect to a root domain...
                        new_url = resp.url
                        compare_new = (new_url.split("?")[0]
                                       if "?" in new_url else new_url)
                        compare_old = url.split("?")[0] if "?" in url else url
                        if compare_new == compare_old:
                            # And the domain is the same as it was before, then the parameter was probably unnecessary
                            ditch_params.append(k)

    if len(ditch_params) > 0:
        # Now we remove all of the unnecessary get parameters and finalize the URL
        new_params = {
            k: v[0]
            for k, v in params.items() if len(v) == 1 and k not in ditch_params
        }
        new_params = urlparse.urlencode(new_params)
        parsed = parsed._replace(query=new_params)
        url = urlparse.urlunparse(parsed)

    if close_session:
        session.close()

    return url
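
Following the docstring's suggestion to reuse a persistent session when processing many links at once, here is a sketch with placeholder URLs and a placeholder user agent.

import requests
from pewtils.http import trim_get_parameters

urls = [
    "https://httpbin.org/status/200?param=1",
    "https://httpbin.org/get?utm_source=newsletter&id=42",
]
with requests.Session() as session:
    session.headers.update({"User-Agent": "my-crawler"})  # placeholder agent string
    trimmed = [trim_get_parameters(u, session=session, timeout=10) for u in urls]
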
Example #7
    def read(self, key, format="pkl", hash_key=False, **io_kwargs):
        """
        Reads a file from the directory or S3 path, returning its contents.

        :param key: The name of the file to read (without a suffix!)
        :type key: str
        :param format: The format of the file (pkl/json/csv/dta/xls/xlsx/tab); expects the file extension to match
        :type format: str
        :param hash_key: Whether the key should be hashed prior to looking for and retrieving the file.
        :type hash_key: bool
        :param io_kwargs: Optional arguments to be passed to the specific load function (dependent on file format)
        :return: The file contents, in the requested format

        .. note:: You can pass optional ``io_kwargs`` that will be forwarded to the function below that corresponds to \
            the format of the file you're trying to read in

            - `dta`: :py:func:`pandas.read_stata`
            - `csv`: :py:func:`pandas.read_csv`
            - `tab`: :py:func:`pandas.read_csv`
            - `xlsx`: :py:func:`pandas.read_excel`
            - `xls`: :py:func:`pandas.read_excel`
        """

        format = format.strip(".")

        if hash_key:
            key = self.get_key_hash(key)

        data = None
        filepath = "/".join([self.path, "{}.{}".format(key, format)])

        if self.use_s3:

            k = self.s3.get_key(filepath)
            if k:
                try:
                    data = k.get_contents_as_string()
                except ValueError:
                    pass
        else:

            if os.path.exists(filepath):
                try:
                    with closing(open(filepath, "r")) as infile:
                        data = infile.read()
                except:
                    # TODO: handle this exception more explicitly
                    with closing(open(filepath, "rb")) as infile:
                        data = infile.read()

        if is_not_null(data):
            if format == "pkl":

                try:
                    data = pickle.loads(data)
                except TypeError:
                    data = None
                except ValueError:
                    if "attempt_count" not in io_kwargs:
                        io_kwargs["attempt_count"] = 1
                    print(
                        "Insecure pickle string; probably a concurrent read-write, \
                        will try again in 5 seconds (attempt #{})".format(
                            io_kwargs["attempt_count"]))
                    time.sleep(5)
                    if io_kwargs["attempt_count"] <= 3:
                        io_kwargs["attempt_count"] += 1
                        data = self.read(key,
                                         format=format,
                                         hash_key=hash_key,
                                         **io_kwargs)
                    else:
                        data = None
                except Exception as e:
                    print("Couldn't load pickle!  {}".format(e))
                    data = None

            elif format in ["tab", "csv"]:

                if format == "tab":
                    io_kwargs["delimiter"] = "\t"
                try:
                    data = pd.read_csv(BytesIO(data), **io_kwargs)
                except:
                    data = pd.read_csv(StringIO(data), **io_kwargs)

            elif format in ["xlsx", "xls"]:
                try:
                    data = pd.read_excel(BytesIO(data), **io_kwargs)
                except:
                    data = pd.read_excel(StringIO(data), **io_kwargs)

            elif format == "json":
                try:
                    data = json.loads(data)
                except:
                    pass

            elif format == "dta":

                try:
                    data = pd.read_stata(BytesIO(data), **io_kwargs)
                except:
                    data = pd.read_stata(StringIO(data), **io_kwargs)

        return data
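
Finally, a sketch of calling read() with format-specific io_kwargs. The handler instance and file keys are placeholders (note that keys are passed without a file suffix), and the handler is assumed to be constructed as in Example #2.

def load_outputs(h):
    # "h" is assumed to be a handler instance like the one constructed in Example #2
    obj = h.read("my_model")                                 # default format is pkl
    df = h.read("results", format="csv", encoding="utf-8")   # kwargs forwarded to pandas.read_csv
    wide = h.read("survey", format="xlsx", sheet_name=0)     # kwargs forwarded to pandas.read_excel
    return obj, df, wide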