def test_is_null(self):
    """Exercise ``is_null`` / ``is_not_null`` with default nulls, custom nulls, and empty containers."""
    import numpy as np
    import pandas as pd
    from pewtils import is_null, is_not_null

    # Values this test expects to be recognised as null without any extra configuration
    default_nulls = (
        None,
        "None",
        "nan",
        "",
        " ",
        "NaN",
        "none",
        "n/a",
        "NONE",
        "N/A",
    )
    for null_value in default_nulls:
        self.assertTrue(is_null(null_value))

    self.assertTrue(is_null(np.nan))
    # Zero is a real value, not a null
    self.assertTrue(is_not_null(0.0))
    self.assertTrue(is_null("-9", custom_nulls=["-9"]))

    # Empty containers only count as null when ``empty_lists_are_null`` is switched on
    for make_empty in (list, pd.Series, pd.DataFrame):
        self.assertTrue(is_null(make_empty(), empty_lists_are_null=True))
        self.assertFalse(is_null(make_empty(), empty_lists_are_null=False))
def __init__(self, path, use_s3=None, aws_access=None, aws_secret=None, bucket=None):
    """
    Set up the handler to work against either an S3 bucket or a local directory.

    :param path: Directory (or S3 key prefix) that files are stored under
    :param use_s3: Whether to use S3; forced to ``False`` when no bucket is available
    :param aws_access: AWS access key; falls back to the ``AWS_ACCESS_KEY_ID`` environment variable
    :param aws_secret: AWS secret key; falls back to the ``AWS_SECRET_ACCESS_KEY`` environment variable
    :param bucket: S3 bucket name; falls back to the ``S3_BUCKET`` environment variable
    """
    # Anything that was not passed explicitly is looked up in the environment
    aws_access = (
        os.environ.get("AWS_ACCESS_KEY_ID", None) if aws_access is None else aws_access
    )
    aws_secret = (
        os.environ.get("AWS_SECRET_ACCESS_KEY", None)
        if aws_secret is None
        else aws_secret
    )
    bucket = os.environ.get("S3_BUCKET", None) if bucket is None else bucket

    self.path = path
    # S3 can only be used when a bucket is known
    if is_not_null(bucket):
        self.use_s3 = use_s3
    else:
        self.use_s3 = False

    if not self.use_s3:
        # Local mode: make sure the target directory exists (best effort, failures are only reported)
        self.path = os.path.join(self.path)
        if not os.path.exists(self.path):
            try:
                os.makedirs(self.path)
            except Exception as e:
                print("Warning: couldn't make directory '{}'".format(self.path))
                print(e)
        return

    # S3 mode: only pass credentials explicitly when an access key is available
    connection_kwargs = {}
    if aws_access is not None:
        connection_kwargs["aws_access_key_id"] = aws_access
        connection_kwargs["aws_secret_access_key"] = aws_secret
    if "." in bucket:
        # Bucket names containing dots get the ordinary (path-style) calling format
        connection_kwargs["calling_format"] = OrdinaryCallingFormat()
    self.s3 = S3Connection(**connection_kwargs).get_bucket(bucket)
def fit(self, df=None, **kwargs):
    """
    Fits a model using the method that was chosen when the ``TopicModel`` was initialized. Keyword arguments
    are passed through ``self.get_fit_params`` and forwarded to the underlying implementation. The options
    for each method are listed below.

    **sklearn_lda**

    Uses :py:class:`sklearn.decomposition.LatentDirichletAllocation`. Official documentation:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: Document-topic density. Higher values mean documents are made up of more topics; lower
        values mean documents are dominated by only a few topics. Used in place of sklearn's
        ``doc_topic_prior``, which is derived as ``doc_topic_prior = alpha / num_topics``
    :param beta: Topic-word density. Higher values mean topics are made up of more words; lower values mean
        only a few words load onto each topic. Used in place of sklearn's ``topic_word_prior``, which is
        derived as ``topic_word_prior = beta / num_topics``
    :param learning_decay: See sklearn documentation.
    :param learning_offset: See sklearn documentation.
    :param learning_method: See sklearn documentation.
    :param max_iter: See sklearn documentation.
    :param batch_size: See sklearn documentation.
    :param verbose: See sklearn documentation.

    **sklearn_nmf**

    Uses :py:class:`sklearn.decomposition.NMF`. Official documentation:
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: See sklearn documentation.
    :param l1_ratio: See sklearn documentation.
    :param tol: See sklearn documentation.
    :param max_iter: See sklearn documentation.
    :param shuffle: See sklearn documentation.

    **gensim_lda**

    Uses :py:class:`gensim.models.LdaModel`, or :py:class:`gensim.models.ldamulticore.LdaMulticore` when
    ``use_multicore`` is True. Official documentation:

    - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
    - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: Document-topic density (higher values: more topics per document; lower values: only a few
        topics per document). Gensim accepts different values than sklearn; refer to the gensim
        documentation for the accepted options.
    :param beta: Topic-word density (higher values: more words per topic; lower values: only a few words per
        topic). Gensim accepts different values than sklearn; refer to the gensim documentation for the
        accepted options. Gensim calls this parameter ``eta``; it is renamed here to be consistent with the
        sklearn implementations.
    :param chunksize: See gensim documentation.
    :param passes: See gensim documentation.
    :param decay: See gensim documentation.
    :param offset: See gensim documentation.
    :param workers: Number of cores to use (if using multicore)
    :param use_multicore: Whether or not to use multicore

    **gensim_hdp**

    Uses the gensim HDP implementation. Unlike LDA and NMF, HDP tries to detect the number of topics on its
    own. In practice it fits ``T`` topics (150 by default), many of which are extremely rare. To find the
    useful ones, ``self.df`` is passed through the trained model after fitting, and only topics that make up
    at least 1% of a document in at least 1% of all documents are kept. Later use of the model only relies
    on the topics that meet this threshold. Official documentation:
    https://radimrehurek.com/gensim/models/hdpmodel.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param max_chunks: See gensim documentation.
    :param max_time: See gensim documentation.
    :param chunksize: See gensim documentation.
    :param kappa: See gensim documentation.
    :param tau: See gensim documentation.
    :param T: See gensim documentation.
    :param K: See gensim documentation.
    :param alpha: See gensim documentation.
    :param beta: See gensim documentation.
    :param gamma: See gensim documentation.
    :param scale: See gensim documentation.
    :param var_converge: See gensim documentation.

    **corex**

    Fits a CorEx topic model. Anchors can be given as a list of lists, where each inner list is a set of
    words used to seed one topic. For example:

    .. code-block:: python

        anchors=[
            ['cat', 'kitten'],
            ['dog', 'puppy']
        ]

    There cannot be more anchor lists than topics, and every anchor word must exist in the vocabulary. The
    ``anchor_strength`` parameter controls how much the model may override the suggested words based on the
    data; higher values "insist" more strongly that the anchor words stay together in one topic. Official
    documentation: https://github.com/gregversteeg/corex_topic

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param anchors: A list of lists that contain words that the model should try to group together into topics
    :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data
    """

    fit_params = self.get_fit_params(**kwargs)
    method = self.method
    has_df = is_not_null(df)

    if method in ("sklearn_lda", "sklearn_nmf"):
        estimator_class = (
            LatentDirichletAllocation if method == "sklearn_lda" else NMF
        )
        self.model = estimator_class(n_components=self.num_topics, **fit_params)
        # Fall back to the features computed at initialization when no new data is given
        features = self.get_features(df) if has_df else self.train_features
        self.model.fit(features)

    elif method in ("gensim_lda", "gensim_hdp"):
        # gensim wants an {id: token} mapping for the vocabulary
        vocab_dict = dict(enumerate(self.ngrams))
        if has_df:
            features = self.get_features(df, keep_sparse=True)
        else:
            features = self.train_features
        matrix = gensim.matutils.Sparse2Corpus(features, documents_columns=False)

        if method == "gensim_lda":
            fit_params["num_topics"] = self.num_topics
            fit_params["id2word"] = vocab_dict
            # ``use_multicore`` is our own switch, so it is always removed before calling gensim
            if fit_params.pop("use_multicore"):
                model_class = gensim.models.ldamulticore.LdaMulticore
            else:
                model_class = gensim.models.LdaModel
                # ``workers`` is only forwarded to the multicore implementation
                fit_params.pop("workers")
            self.model = model_class(**fit_params)
            self.model.update(matrix)
        else:
            self.model = gensim.models.hdpmodel.HdpModel(
                matrix, vocab_dict, **fit_params
            )
            # Keep topics that reach 1% of a document in at least 1% of the documents
            doc_topics = self.get_document_topics(self.df)
            topic_is_common = (doc_topics >= 0.01).astype(int).mean() >= 0.01
            topics = topic_is_common.astype(int)
            kept_columns = topics[topics == 1].index
            self.topic_ids = [
                int(col.split("_")[-1])
                for col in kept_columns
                if col.startswith("topic_")
            ]
            self.num_topics = len(self.topic_ids)

    elif method == "corex":
        source_df = df if has_df else self.train_df
        features = self.get_features(source_df, keep_sparse=True)
        self.model = corextopic.Corex(n_hidden=self.num_topics)
        self.model.fit(features, words=self.ngrams, **fit_params)
def compute_mutual_info(y, x, weights=None, col_names=None, l=0, normalize=True):
    """
    Computes pointwise mutual information for a set of observations partitioned into two groups.

    :param y: An array or, preferably, a :py:class:`pandas.Series`
    :param x: A matrix, :py:class:`pandas.DataFrame`, or preferably a :py:class:`scipy.sparse.csr_matrix`
    :param weights: (Optional) Weights corresponding to each observation. Must support ``fillna``
        (e.g. a :py:class:`pandas.Series`); missing weights are treated as zero.
    :param col_names: The feature names associated with the columns in matrix 'x'
    :type col_names: list
    :param l: An optional Laplace smoothing parameter
    :type l: int or float
    :param normalize: Toggle normalization on or off (to control for feature prevalence), on by default
    :type normalize: bool
    :return: A :py:class:`pandas.DataFrame` of features with a variety of computed metrics including mutual
        information.

    The function expects ``y`` to correspond to a list or series of values indicating which partition an
    observation belongs to. ``y`` must be a binary flag. ``x`` is a set of features (either a
    :py:class:`pandas.DataFrame` or sparse matrix) where the rows correspond to observations and the columns
    represent the presence of features (you can technically run this using non-binary features but the results
    will not be as readily interpretable.) The function returns a :py:class:`pandas.DataFrame` of metrics
    computed for each feature, including the following columns:

    - ``MI1``: The feature's mutual information for the positive class
    - ``MI0``: The feature's mutual information for the negative class
    - ``total``: The total number of times a feature appeared
    - ``total_pos_with_term``: The total number of times a feature appeared in positive cases
    - ``total_neg_with_term``: The total number of times a feature appeared in negative cases
    - ``total_pos_neg_with_term_diff``: The raw difference in the number of times a feature appeared in
      positive cases relative to negative cases
    - ``pct_with_term``: The proportion of all cases that had the feature
    - ``pct_pos_with_term``: The proportion of positive cases that had the feature
    - ``pct_neg_with_term``: The proportion of negative cases that had the feature
    - ``pct_pos_neg_with_term_diff``: The percentage point difference between ``pct_pos_with_term`` and
      ``pct_neg_with_term``
    - ``pct_pos_neg_with_term_ratio``: A likelihood ratio indicating the degree to which a positive case was
      more likely to have the feature than a negative case
    - ``pct_term_pos``: Of the cases that had a feature, the proportion that were in the positive class
    - ``pct_term_neg``: Of the cases that had a feature, the proportion that were in the negative class
    - ``pct_term_pos_neg_diff``: The percentage point difference between the proportion of cases with the
      feature that were positive vs. negative
    - ``pct_term_pos_neg_ratio``: A likelihood ratio indicating the degree to which a feature was more likely
      to appear in a positive case relative to a negative one (may not be meaningful when classes are
      imbalanced)

    .. note:: Note that ``pct_term_pos`` and ``pct_term_neg`` may not be directly comparable if classes are
        imbalanced, and in such cases a ``pct_term_pos_neg_diff`` above zero or ``pct_term_pos_neg_ratio``
        above 1 may not indicate a true association with the positive class if positive cases outnumber
        negative ones.

    .. note:: Mutual information can be a difficult metric to explain to others. We've found that the
        ``pct_pos_neg_with_term_ratio`` can serve as a more interpretable alternative method for identifying
        meaningful differences between groups.

    .. note:: When ``x`` is a sparse matrix, rows are selected using the index labels of ``y``, so ``y`` is
        expected to be a :py:class:`pandas.Series` whose index matches the row positions of ``x``
        (e.g. a default ``RangeIndex``).

    Usage::

        from pewanalytics.stats.mutual_info import compute_mutual_info
        import nltk
        import pandas as pd
        from sklearn.metrics.pairwise import linear_kernel
        from sklearn.feature_extraction.text import TfidfVectorizer

        nltk.download("inaugural")
        df = pd.DataFrame([
            {"speech": fileid, "text": nltk.corpus.inaugural.raw(fileid)}
            for fileid in nltk.corpus.inaugural.fileids()
        ])
        df['year'] = df['speech'].map(lambda x: int(x.split("-")[0]))
        df['21st_century'] = df['year'].map(lambda x: 1 if x >= 2000 else 0)
        vec = TfidfVectorizer(min_df=10, max_df=.9).fit(df['text'])
        tfidf = vec.transform(df['text'])

        # Here are the terms most distinctive of inaugural addresses in the 21st century vs. years prior

        >>> results = compute_mutual_info(df['21st_century'], tfidf, col_names=vec.get_feature_names())

        >>> results.sort_values("MI1", ascending=False).index[:25]
        Index(['america', 'thank', 'bless', 'schools', 'ideals', 'americans',
               'meaning', 'you', 'move', 'across', 'courage', 'child', 'birth',
               'generation', 'families', 'build', 'hard', 'promise', 'choice', 'women',
               'guided', 'words', 'blood', 'dignity', 'because'],
              dtype='object')

    """

    def _log2_or_zero(v):
        # log2 is undefined for non-positive values (NaN also fails the comparison), so fall back to 0
        return math.log(v, 2) if v > 0 else 0

    has_weights = is_not_null(weights)

    # Class sizes: weighted sums when weights are given, plain counts otherwise
    if has_weights:
        weights = weights.fillna(0)
        y0 = sum(weights[y == 0])
        y1 = sum(weights[y == 1])
        total = sum(weights)
    else:
        y0 = len(y[y == 0])
        y1 = len(y[y == 1])
        total = y1 + y0

    if type(x).__name__ == "csr_matrix":

        if has_weights:
            # Scale every row (observation) by its weight
            x = x.transpose().multiply(csr_matrix(weights)).transpose()
        x1 = pd.Series(x.sum(axis=0).tolist()[0])
        x0 = total - x1
        x1y0 = pd.Series(
            x[np.ravel(np.array(y[y == 0].index)), :].sum(axis=0).tolist()[0]
        )
        x1y1 = pd.Series(
            x[np.ravel(np.array(y[y == 1].index)), :].sum(axis=0).tolist()[0]
        )

    else:

        if type(x).__name__ != "DataFrame":
            x = pd.DataFrame(x, columns=col_names)

        if has_weights:
            # The complement has to be taken from the unweighted features, and the weights must be
            # applied exactly once; applying them to an already-weighted matrix squares them and makes
            # ``total``/``px1`` disagree with the per-class counts and with the sparse branch.
            x0 = ((x * -1) + 1).multiply(weights, axis="index").sum()
            x = x.multiply(weights, axis="index")
        else:
            x0 = ((x * -1) + 1).sum()
        x1 = x.sum()
        x1y0 = x[y == 0].sum()
        x1y1 = x[y == 1].sum()

    # Joint and marginal probabilities
    px1y0 = x1y0 / total
    px1y1 = x1y1 / total
    px0y0 = (y0 - x1y0) / total
    px0y1 = (y1 - x1y1) / total

    px1 = x1 / total
    px0 = x0 / total
    py1 = float(y1) / float(total)
    py0 = float(y0) / float(total)

    MI1 = (px1y1 / (px1 * py1) + l).map(_log2_or_zero)
    if normalize:
        MI1 = MI1 / (-1 * px1y1.map(_log2_or_zero))

    MI0 = (px1y0 / (px1 * py0) + l).map(_log2_or_zero)
    if normalize:
        MI0 = MI0 / (-1 * px1y0.map(_log2_or_zero))

    df = pd.DataFrame()

    df["MI1"] = MI1
    df["MI0"] = MI0

    df["total"] = x1
    df["total_pos_with_term"] = x1y1  # total_pos_mention
    df["total_neg_with_term"] = x1y0  # total_neg_mention
    df["total_pos_neg_with_term_diff"] = (
        df["total_pos_with_term"] - df["total_neg_with_term"]
    )
    df["pct_with_term"] = x1 / (x1 + x0)
    df["pct_pos_with_term"] = x1y1 / y1  # pct_pos_mention
    df["pct_neg_with_term"] = x1y0 / y0  # pct_neg_mention
    df["pct_pos_neg_with_term_diff"] = (
        df["pct_pos_with_term"] - df["pct_neg_with_term"]
    )  # pct_pos_neg_mention_diff
    df["pct_pos_neg_with_term_ratio"] = df["pct_pos_with_term"] / (
        df["pct_neg_with_term"]
    )  # pct_pos_neg_mention_ratio

    df["pct_term_pos"] = x1y1 / x1  # pct_mention_pos
    df["pct_term_neg"] = x1y0 / x1  # pct_mention_neg
    df["pct_term_pos_neg_diff"] = (
        df["pct_term_pos"] - df["pct_term_neg"]
    )  # pct_mention_pos_neg_diff
    df["pct_term_pos_neg_ratio"] = df["pct_term_pos"] / df["pct_term_neg"]

    # ``col_names`` may be a list, numpy array or pandas Index; a bare truth test raises for the latter two
    if col_names is not None and len(col_names) > 0:
        df.index = col_names

    return df
def compute_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column=None,
    pos_label=None,
):
    """
    Computes a variety of inter-rater reliability scores, including Cohen's kappa, Krippendorff's alpha,
    precision, and recall. The input data must consist of a :py:class:`pandas.DataFrame` with the following
    columns:

        - A column with values that indicate the coder (like a name)
        - A column with values that indicate the document (like an ID)
        - A column with values that indicate the code value
        - (Optional) A column with document weights

    This function will return a dictionary of agreement scores between the two specified coders. Only
    documents that were coded by both coders are compared.

    :param coder_df: A :py:class:`pandas.DataFrame` of codes
    :type coder_df: :py:class:`pandas.DataFrame`
    :param coder1: The value in ``coder_column`` for rows corresponding to the first coder
    :type coder1: str or int
    :param coder2: The value in ``coder_column`` for rows corresponding to the second coder
    :type coder2: str or int
    :param outcome_column: The column that contains the codes
    :type outcome_column: str
    :param document_column: The column that contains IDs for the documents
    :type document_column: str
    :param coder_column: The column containing values that indicate which coder assigned the code
    :type coder_column: str
    :param weight_column: The column that contains sampling weights
    :type weight_column: str
    :param pos_label: The value indicating a positive label (optional). Any value other than ``None``
        (including ``0``) turns the codes into a binary flag before scoring.
    :type pos_label: str or int
    :return: A dictionary of scores
    :rtype: dict

    .. note:: If using a multi-class (non-binary) code, some scores may come back null or not compute as
        expected. We recommend running the function separately for each specific code value as a binary flag
        by providing each unique value to the ``pos_label`` argument. If ``pos_label`` is not provided for
        multi-class codes, this function will attempt to compute scores based on support-weighted averages.

    .. note:: Scores are computed with NumPy floating point errors raised as exceptions. The caller's NumPy
        error settings are restored when the function exits, whether it returns or raises.

    Usage::

        from pewanalytics.stats.irr import compute_scores
        import pandas as pd

        df = pd.DataFrame([
            {"coder": "coder1", "document": 1, "code": "2"},
            {"coder": "coder2", "document": 1, "code": "2"},
            {"coder": "coder1", "document": 2, "code": "1"},
            {"coder": "coder2", "document": 2, "code": "2"},
            {"coder": "coder1", "document": 3, "code": "0"},
            {"coder": "coder2", "document": 3, "code": "0"},
        ])

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': None,
         'coder1_mean_unweighted': 1.0,
         'coder1_std_unweighted': 0.5773502691896257,
         'coder2_mean_unweighted': 1.3333333333333333,
         'coder2_std_unweighted': 0.6666666666666666,
         'alpha_unweighted': 0.5454545454545454,
         'accuracy': 0.6666666666666666,
         'f1': 0.5555555555555555,
         'precision': 0.5,
         'recall': 0.6666666666666666,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.6123724356957946,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="0")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '0',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.3333333333333333,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 1.0,
         'cohens_kappa': 1.0,
         'accuracy': 1.0,
         'f1': 1.0,
         'precision': 1.0,
         'recall': 1.0,
         'precision_recall_min': 1.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': 1.0,
         'pct_agree_unweighted': 1.0}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="1")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '1',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.0,
         'coder2_std_unweighted': 0.0,
         'alpha_unweighted': 0.0,
         'cohens_kappa': 0.0,
         'accuracy': 0.6666666666666666,
         'f1': 0.0,
         'precision': 0.0,
         'recall': 0.0,
         'precision_recall_min': 0.0,
         'matthews_corrcoef': 1.0,
         'roc_auc': None,
         'pct_agree_unweighted': 0.6666666666666666}

        >>> compute_scores(df, "coder1", "coder2", "code", "document", "coder", pos_label="2")
        {'coder1': 'coder1',
         'coder2': 'coder2',
         'n': 3,
         'outcome_column': 'code',
         'pos_label': '2',
         'coder1_mean_unweighted': 0.3333333333333333,
         'coder1_std_unweighted': 0.3333333333333333,
         'coder2_mean_unweighted': 0.6666666666666666,
         'coder2_std_unweighted': 0.3333333333333333,
         'alpha_unweighted': 0.4444444444444444,
         'cohens_kappa': 0.3999999999999999,
         'accuracy': 0.6666666666666666,
         'f1': 0.6666666666666666,
         'precision': 0.5,
         'recall': 1.0,
         'precision_recall_min': 0.5,
         'matthews_corrcoef': 0.5,
         'roc_auc': 0.75,
         'pct_agree_unweighted': 0.6666666666666666}

    """

    # A context manager (instead of seterr + manual restore) guarantees the caller's floating point
    # error settings come back even when one of the scorers raises.
    with np.errstate(all="raise"):
        return _compute_pairwise_scores(
            coder_df,
            coder1,
            coder2,
            outcome_column,
            document_column,
            coder_column,
            weight_column,
            pos_label,
        )


def _compute_pairwise_scores(
    coder_df,
    coder1,
    coder2,
    outcome_column,
    document_column,
    coder_column,
    weight_column,
    pos_label,
):
    """Build the score dictionary for ``compute_scores``; expects NumPy errors to be raised."""

    # ``pos_label=0`` is a legitimate label, so test against None rather than truthiness
    binary = pos_label is not None

    coder_df = copy.deepcopy(coder_df)
    if binary:
        coder_df[outcome_column] = (coder_df[outcome_column] == pos_label).astype(int)

    # Index each coder's rows by document, keep the shared documents, and align coder2 to coder1's order
    coder1_df = coder_df[coder_df[coder_column] == coder1]
    coder1_df.index = coder1_df[document_column]
    coder2_df = coder_df[coder_df[coder_column] == coder2]
    coder2_df.index = coder2_df[document_column]
    coder1_df = coder1_df[coder1_df.index.isin(coder2_df.index)]
    coder2_df = coder2_df[coder2_df.index.isin(coder1_df.index)].loc[coder1_df.index]

    codes1 = coder1_df[outcome_column]
    codes2 = coder2_df[outcome_column]
    # Weights are taken from coder1's rows for every weighted score
    sample_weight = coder1_df[weight_column] if weight_column else None

    row = {
        "coder1": coder1,
        "coder2": coder2,
        "n": len(coder1_df),
        "outcome_column": outcome_column,
        "pos_label": pos_label,
    }

    for labelsetname, labelset in (("coder1", codes1), ("coder2", codes2)):
        if weight_column:
            _add_descriptive_stats(
                row,
                labelset,
                sample_weight,
                "{}_mean".format(labelsetname),
                "{}_std".format(labelsetname),
            )
        _add_descriptive_stats(
            row,
            labelset,
            [1.0 for _ in labelset],
            "{}_mean_unweighted".format(labelsetname),
            "{}_std_unweighted".format(labelsetname),
        )

    # Krippendorff's alpha is computed on all rows of coder_df, not only the two selected coders
    task = AnnotationTask(
        data=coder_df[[coder_column, document_column, outcome_column]].values
    )
    try:
        alpha = task.alpha()
    except (ZeroDivisionError, ValueError):
        alpha = None
    row["alpha_unweighted"] = alpha

    labels = np.unique(coder_df[outcome_column])
    average = "binary" if binary else "weighted"

    if len(labels) <= 2:
        try:
            row["cohens_kappa"] = cohen_kappa_score(
                codes1, codes2, sample_weight=sample_weight, labels=labels
            )
        except FloatingPointError:
            row["cohens_kappa"] = 1.0

    try:
        row["accuracy"] = accuracy_score(codes1, codes2, sample_weight=sample_weight)
    except ValueError:
        row["accuracy"] = None

    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        try:
            row[metric_name] = scorer(
                codes1,
                codes2,
                sample_weight=sample_weight,
                labels=labels,
                average=average,
            )
        except ValueError:
            row[metric_name] = None

    if is_not_null(row["precision"]) and is_not_null(row["recall"]):
        row["precision_recall_min"] = min([row["precision"], row["recall"]])
    else:
        row["precision_recall_min"] = None

    try:
        row["matthews_corrcoef"] = matthews_corrcoef(
            codes1, codes2, sample_weight=sample_weight
        )
    except ValueError:
        row["matthews_corrcoef"] = None
    except FloatingPointError:
        row["matthews_corrcoef"] = 1.0

    row["roc_auc"] = _safe_roc_auc(
        codes1,
        codes2,
        sample_weight,
        None if binary else "weighted",
        labels,
    )

    row["pct_agree_unweighted"] = np.average(
        [1 if c1 == c2 else 0 for c1, c2 in zip(codes1, codes2)]
    )

    # For some weird reason, some of the sklearn scorers return 1-tuples sometimes
    for k, v in row.items():
        if isinstance(v, tuple):
            row[k] = v[0]

    return row


def _add_descriptive_stats(row, labelset, weights, mean_key, std_key):
    """Store the mean and std-of-mean of ``labelset`` in ``row``, retrying with int-cast labels."""

    for cast_to_int in (False, True):
        try:
            values = labelset.astype(int) if cast_to_int else labelset
            stats = DescrStatsW(values, weights=weights)
            if stats:
                row[mean_key] = stats.mean
                row[std_key] = stats.std_mean
            return
        except (TypeError, ValueError):
            # First failure: try again with integer labels; second failure: leave the keys unset
            continue


def _safe_roc_auc(codes1, codes2, sample_weight, average, labels):
    """Return the ROC AUC, or None when a coder used a single value or the score cannot be computed."""

    def _score(**extra):
        if len(np.unique(codes1)) > 1 and len(np.unique(codes2)) > 1:
            return roc_auc_score(
                codes1, codes2, sample_weight=sample_weight, average=average, **extra
            )
        return None

    try:
        return _score(labels=labels)
    except TypeError:
        # Retry without ``labels`` in case the installed scorer does not accept that argument
        try:
            return _score()
        except (ValueError, TypeError):
            return None
    except ValueError:
        return None
def trim_get_parameters(url, session=None, timeout=30, user_agent=None):
    """
    Takes a URL (presumed to be the final end point) and iterates over GET parameters, attempting to find
    optional ones that can be removed without generating any redirects.

    :param url: The URL to trim
    :type url: str
    :param session: (Optional) A persistent session that can optionally be passed (useful if you're
        processing many links at once)
    :type session: :py:class:`requests.Session` object
    :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests
        Session is not provided
    :type user_agent: str
    :param timeout: Timeout for requests
    :type timeout: int or float
    :return: The original URL with optional GET parameters removed
    :rtype: str

    .. note:: Only read timeouts are tolerated while probing; any other request error propagates to the
        caller. A session created by this function is closed in either case, while a session passed in by
        the caller is left open.

    .. note:: Parameters that occur more than once in the query string are left out of the probe URLs and,
        if any parameter is removed, out of the returned URL as well.

    Usage::

        from pewtils.http import trim_get_parameters

        >>> trim_get_parameters("https://httpbin.org/status/200?param=1")
        "https://httpbin.org/status/200"

    """

    close_session = False
    if not session:
        close_session = True
        session = requests.Session()
        session.headers.update({"User-Agent": user_agent})

    try:
        # Often there's extra information about social sharing and referral sources that can be removed
        ditch_params = []
        parsed = urlparse.urlparse(url)
        if parsed.query:
            params = urlparse.parse_qs(parsed.query)
            compare_old = url.split("?", 1)[0]
            for k, v in params.items():
                # We iterate over all of the GET parameters and try holding each one out
                name = k.lower()
                first_value = v[0].lower()
                # If the parameter is named something that's probably a unique ID, we'll keep it
                if any(skipper in name for skipper in ("document", "article", "id", "qs")):
                    continue
                # Same goes for parameters that contain URL information
                if any(skipper in first_value for skipper in ("html", "http")):
                    continue

                new_params = {
                    k2: v2[0] for k2, v2 in params.items() if k2 != k and len(v2) == 1
                }
                new_parsed = parsed._replace(query=urlparse.urlencode(new_params))
                new_url = urlparse.urlunparse(new_parsed)
                try:
                    resp = session.head(new_url, allow_redirects=True, timeout=timeout)
                except ReadTimeout:
                    resp = None
                if is_not_null(resp):
                    new_parsed = urlparse.urlparse(resp.url)
                    if new_parsed.query != "" or new_parsed.path not in ["", "/"]:
                        # If removing a parameter didn't redirect to a root domain...
                        compare_new = resp.url.split("?", 1)[0]
                        if compare_new == compare_old:
                            # And the domain is the same as it was before, then the parameter was
                            # probably unnecessary
                            ditch_params.append(k)

            if len(ditch_params) > 0:
                # Now we remove all of the unnecessary get parameters and finalize the URL
                new_params = {
                    k: v[0]
                    for k, v in params.items()
                    if len(v) == 1 and k not in ditch_params
                }
                parsed = parsed._replace(query=urlparse.urlencode(new_params))
                url = urlparse.urlunparse(parsed)

        return url
    finally:
        # Release the session we created even when a request raised something other than ReadTimeout
        if close_session:
            session.close()
def read(self, key, format="pkl", hash_key=False, **io_kwargs):
    """
    Reads a file from the directory or S3 path, returning its contents.

    :param key: The name of the file to read (without a suffix!)
    :type key: str
    :param format: The format of the file (pkl/json/csv/dta/xls/xlsx/tab); expects the file extension to match
    :type format: str
    :param hash_key: Whether the key should be hashed prior to looking for and retrieving the file.
    :type hash_key: bool
    :param io_kwargs: Optional arguments to be passed to the specific load function (dependent on file format)
    :return: The file contents, in the requested format. ``None`` is returned when the file does not exist
        or a pickle could not be loaded; JSON that cannot be parsed is returned as the raw file contents.

    .. note:: You can pass optional ``io_kwargs`` that will be forwarded to the function below that
        corresponds to the format of the file you're trying to read in

        - `dta`: :py:func:`pandas.read_stata`
        - `csv`: :py:func:`pandas.read_csv`
        - `tab`: :py:func:`pandas.read_csv`
        - `xlsx`: :py:func:`pandas.read_excel`
        - `xls`: :py:func:`pandas.read_excel`

    .. note:: Pickle files are loaded with :py:func:`pickle.loads`, which can execute arbitrary code. Only
        read pickles from locations you trust.
    """

    format = format.strip(".")

    if hash_key:
        key = self.get_key_hash(key)

    data = None
    filepath = "/".join([self.path, "{}.{}".format(key, format)])

    if self.use_s3:
        k = self.s3.get_key(filepath)
        if k:
            try:
                data = k.get_contents_as_string()
            except ValueError:
                pass
    elif os.path.exists(filepath):
        if format in ("pkl", "dta", "xls", "xlsx"):
            # Binary formats must never go through text decoding: when the bytes happen to decode,
            # the loaders below would receive a str and fail (pickles would silently come back as None)
            with open(filepath, "rb") as infile:
                data = infile.read()
        else:
            try:
                with open(filepath, "r") as infile:
                    data = infile.read()
            except Exception:
                # Could not be read as text (e.g. undecodable bytes); fall back to the raw bytes
                with open(filepath, "rb") as infile:
                    data = infile.read()

    if is_not_null(data):
        if format == "pkl":

            try:
                # NOTE(security): pickle.loads runs arbitrary code from the file; trusted sources only
                data = pickle.loads(data)
            except TypeError:
                data = None
            except ValueError:
                if "attempt_count" not in io_kwargs:
                    io_kwargs["attempt_count"] = 1
                print(
                    "Insecure pickle string; probably a concurrent read-write, "
                    "will try again in 5 seconds (attempt #{})".format(
                        io_kwargs["attempt_count"]
                    )
                )
                time.sleep(5)
                if io_kwargs["attempt_count"] <= 3:
                    io_kwargs["attempt_count"] += 1
                    # ``key`` was already hashed above when requested, so it must not be hashed
                    # again on the retry or a different file would be looked up
                    data = self.read(key, format=format, hash_key=False, **io_kwargs)
                else:
                    data = None
            except Exception as e:
                print("Couldn't load pickle!  {}".format(e))
                data = None

        elif format in ["tab", "csv"]:

            if format == "tab":
                io_kwargs["delimiter"] = "\t"
            try:
                data = pd.read_csv(BytesIO(data), **io_kwargs)
            except Exception:
                # ``data`` is text rather than bytes
                data = pd.read_csv(StringIO(data), **io_kwargs)

        elif format in ["xlsx", "xls"]:
            try:
                data = pd.read_excel(BytesIO(data), **io_kwargs)
            except Exception:
                data = pd.read_excel(StringIO(data), **io_kwargs)

        elif format == "json":
            try:
                data = json.loads(data)
            except Exception:
                # Best effort: hand back the raw contents when they are not valid JSON
                pass

        elif format == "dta":

            try:
                data = pd.read_stata(BytesIO(data), **io_kwargs)
            except Exception:
                data = pd.read_stata(StringIO(data), **io_kwargs)

    return data