Example #1
def correlation_bias(frame1, frame2, verbose=False):
    """
    Given two DataFrames of word vectors that we don't want to associate with
    each other, find the strongest association for each item in `frame2`
    and compare it to the average.

    Returns a bias value (the average difference between the strongest
    association and the average association) and a confidence interval on that
    value.

    Set 'verbose=True' if you want to see the most biased associations and
    be either sad or confused.
    """
    bias_numbers = []

    centered1 = l2_normalize_rows(subtract_mean_vector(frame1))
    centered2 = l2_normalize_rows(subtract_mean_vector(frame2))
    grid = centered1.dot(centered2.T)

    for i in range(grid.shape[1]):
        col_bias = np.max(grid.iloc[:, i]) - np.mean(grid.iloc[:, i])
        if verbose:
            most_biased = grid.iloc[:, i].idxmax()  # index label of the strongest association
            comparison = centered2.index[i]
            print("%4.4f %s => %s" % (col_bias, comparison, most_biased))
        bias_numbers.append(col_bias)

    mean = np.mean(bias_numbers)
    sem = scipy.stats.sem(bias_numbers)
    return pd.Series([mean, mean - sem * 2, mean + sem * 2],
                     index=['bias', 'low', 'high'])
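
A hedged usage sketch (not from the source project): the two DataFrames below are hypothetical placeholders for word vectors, and correlation_bias, l2_normalize_rows and subtract_mean_vector are assumed to be importable from the module shown above.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)

# Hypothetical stand-ins for two sets of word vectors we hope are unrelated.
group_terms = pd.DataFrame(rng.normal(size=(5, 300)),
                           index=['term_%d' % i for i in range(5)])
attribute_terms = pd.DataFrame(rng.normal(size=(8, 300)),
                               index=['attr_%d' % i for i in range(8)])

# Series with the mean bias and a two-standard-error interval around it.
result = correlation_bias(group_terms, attribute_terms, verbose=True)
print(result['bias'], result['low'], result['high'])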
Example #2
def test_l2_normalize_rows():
    vectors = l2_normalize_rows(TEST_FRAME)

    lengths = np.sqrt(np.sum(np.power(vectors, 2), axis='columns'))
    for length in lengths:
        assert_almost_equal(length, 1.0, places=4)

    # Check if a data frame of all zeros will be normalized to zeros
    frame = pd.DataFrame(np.zeros(shape=(1, 10)))
    frame = l2_normalize_rows(frame)
    lengths = np.sqrt(np.sum(np.power(frame, 2), axis=1))
    ok_(all(length == 0 for length in lengths))
Example #3
def test_l2_normalize_rows(simple_frame):
    vectors = l2_normalize_rows(simple_frame)

    lengths = np.sqrt(np.sum(np.power(vectors, 2), axis='columns'))
    for length in lengths:
        assert length == pytest.approx(1.0)

    # Check if a data frame of all zeros will be normalized to zeros
    frame = pd.DataFrame(np.zeros(shape=(1, 10)))
    frame = l2_normalize_rows(frame)
    lengths = np.sqrt(np.sum(np.power(frame, 2), axis=1))
    assert all(length == 0 for length in lengths)
Example #5
def test_l2_normalize_rows(frame=None):
    if frame is None:
        frame = DATA + '/vectors/glove12-840B.h5'
    vectors = load_any_embeddings(frame)

    vectors = l2_normalize_rows(vectors)

    lengths = np.sqrt(np.sum(np.power(vectors, 2), axis='columns'))
    for length in lengths:
        assert_almost_equal(length, 1.0, places=4)

    # Check if a data frame of all zeroes will be normalized to NaN
    frame = pd.DataFrame(np.zeros(shape=(1, 10)))
    frame = l2_normalize_rows(frame)
    lengths = np.sqrt(np.sum(np.power(frame, 2), axis=1))
    ok_(all(np.isnan(length) for length in lengths))
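
The tests above pin down what l2_normalize_rows is expected to do: divide each row of a DataFrame by its Euclidean length. They differ on the zero-row case: Example #2 expects all-zero rows to stay zero, while Example #5 expects them to become NaN. The offset keyword passed in Examples #6, #9 and #14 suggests the zero-safe behavior comes from adding a small constant to each length. A minimal sketch consistent with those tests, assuming that interface rather than reproducing the project's actual implementation:

import numpy as np
import pandas as pd

def l2_normalize_rows_sketch(frame, offset=0.0):
    # Euclidean length of each row; `offset` guards against dividing by zero.
    lengths = np.sqrt(np.sum(np.power(frame, 2), axis=1)) + offset
    # With offset=0 an all-zero row becomes 0/0 = NaN (Example #5);
    # with a small positive offset it stays at zero (Example #2).
    return frame.div(lengths, axis=0)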
Example #6
def de_bias_binary(frame, pos_examples, neg_examples, left_examples,
                   right_examples):
    """
    De-bias a distinction that is presumed - for the purposes of de-biasing -
    to form two ends of a scale. The prototypical example is male vs. female,
    where words that are not inherently defined by gender end up being "more
    male" or "more female" due to stereotypes and biases in the data.

    The goal is not to remove the distinction from every word in the system's
    vocabulary, only those where making the distinction is inappropriate. A
    gender distinction between "she" and "he" is appropriate. A gender
    distinction between "doctor" and "nurse" is inappropriate.

    This function takes in four lists of vocabulary:

    - "Positive examples": examples of words that *should* be de-biased,
      such as "doctor" and "nurse" in the case of gender.

    - "Negative examples": examples of words that *should not* be de-biased,
      such as "she" and "he".

    - "Left examples": words that define one end of the distinction to be
      de-biased, such as "man".

    - "Right examples": words that define the other end of the distinction,
      such as "woman".

    The left and right examples are probably also good negative examples:
    they appropriately represent the distinction to be made, so they should
    not be de-biased.
    """
    # Make the SVM that distinguishes positive examples (words that should
    # be de-biased) from negative examples.
    category_predictor = two_class_svm(frame, pos_examples, neg_examples)

    # The SVM can predict the probability, for each vector in the frame, that
    # it's in each class. The positive class is column 1 of this prediction.
    # This gives us a vector of how much each word in the vocabulary should be
    # de-biased.
    applicability = category_predictor.predict_proba(frame)[:, 1]

    # The bias axis is the vector difference between the average right example
    # and the average left example.
    bias_axis = get_category_axis(frame, right_examples) - get_category_axis(
        frame, left_examples)

    # Make a modified version of the space that projects the bias axis to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.
    modified_component = reject_subspace(frame, [bias_axis]).mul(applicability,
                                                                 axis=0)

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    original_component = frame.mul(1 - applicability, axis=0)

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    return l2_normalize_rows(original_component + modified_component,
                             offset=1e-9)
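
A hedged usage sketch with hypothetical word lists. In the actual pipeline the frame is presumably indexed by ConceptNet URIs (compare standardized_uri in Example #14), so the plain English words here are purely illustrative, and `frame` stands for the word-vector DataFrame to de-bias.

pos_examples = ['doctor', 'nurse', 'engineer', 'teacher']   # should be de-biased
neg_examples = ['he', 'she', 'father', 'mother']            # should keep the distinction
left_examples = ['man', 'he']                               # one end of the gender axis
right_examples = ['woman', 'she']                           # the other end

debiased_frame = de_bias_binary(frame, pos_examples, neg_examples,
                                left_examples, right_examples)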
Example #7
def build_ppmi(input_path, output_path):
    spmat, index = sparse_from_parallel_text(
        pathlib.Path(input_path), ['de', 'en', 'es', 'fa', 'fr', 'it', 'pt'])
    ppmi = counts_to_ppmi(spmat)
    u, s, vT = linalg.svds(ppmi, 300)
    v = vT.T
    values = (u + v) * (s**0.5)
    ppmi_frame = l2_normalize_rows(pd.DataFrame(values, index=index))
    save_hdf(ppmi_frame, output_path)
Example #8
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.

        TODO: is this sometimes returning a DataFrame? Should it accept a
        Series as well as a DataFrame?
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[:self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count('/') >= 3
            # TODO: Is this duplicating something that field_match was supposed
            # to do?
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx:idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_key = filter
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + '0'
                try:
                    start_idx = search_frame.index.get_loc(start_key,
                                                           method='bfill')
                except KeyError:
                    start_idx = len(search_frame.index)
                try:
                    end_idx = search_frame.index.get_loc(end_key,
                                                         method='bfill')
                except KeyError:
                    end_idx = len(search_frame.index)
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame,
                                        small_vec,
                                        limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f'))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
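
The '0' trick in the comment above relies on plain string ordering: '0' is the first character after '/' in ASCII, so `filter + '0'` sorts immediately after every key that starts with `filter + '/'`. A standalone sketch of the same range lookup on a toy sorted index, using Index.searchsorted, which on a sorted index behaves like the `method='bfill'` lookup above:

import pandas as pd

index = pd.Index(sorted([
    '/c/en/apple', '/c/en/apple_pie', '/c/en/banana',
    '/c/fr/poire', '/c/fr/pomme',
]))

prefix = '/c/en'
start_idx = index.searchsorted(prefix)         # first key >= '/c/en'
end_idx = index.searchsorted(prefix + '0')     # first key past the '/c/en/...' block
print(index[start_idx:end_idx])                # only the English entries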
Example #9
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of
    its rows have any correlation with any rows of `vecs`, by subtracting
    the outer product of `frame` with each normalized row of `vecs`.
    """
    current_array = frame.copy()
    for vec in vecs:
        vec = normalize_vec(vec)
        projection = current_array.dot(vec)
        current_array -= np.outer(projection, vec)

    return l2_normalize_rows(current_array, offset=1e-9)
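
A tiny standalone check of the idea, using plain numpy rather than the project's normalize_vec helper: after subtracting the projection onto a unit vector as an outer product, every row of the frame is (numerically) orthogonal to that vector.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
frame = pd.DataFrame(rng.normal(size=(4, 3)))

vec = np.array([1.0, 2.0, 2.0])
vec = vec / np.linalg.norm(vec)          # unit-length direction to reject

projection = frame.dot(vec)              # how far each row extends along `vec`
rejected = frame - np.outer(projection, vec)
print(rejected.dot(vec))                 # all entries are (near) zero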
Example #10
def _load_vectors():
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label for label in frame.index
        if label.startswith('/c/en/') and '_' not in label and '#' not in label
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]
    # Make sure all the words in Codenames are represented
    wordlist = [
        standardized_uri('en', line.strip()) for line in open(
            resource_filename('codenames', 'data/codenames-words.txt'))
    ]
    additions = [word for word in wordlist if word not in selections]
    selections += additions
    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)
Example #11
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a DataFrame of terms ranked by their similarity to the query.
        The query can be:

        - A DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        if filter:
            exact_only = filter.count("/") >= 3
            if filter.endswith("/."):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_key = filter
                # '0' is the character after '/', so end_key is the first possible
                # key that's not a descendant of the given filter key
                end_key = filter + "0"
                try:
                    start_idx = search_frame.index.get_loc(start_key, method="bfill")
                except KeyError:
                    start_idx = len(search_frame.index)
                try:
                    end_idx = search_frame.index.get_loc(end_key, method="bfill")
                except KeyError:
                    end_idx = len(search_frame.index)
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(self.frame.loc[similar_sloppy.index].astype("f"))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
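
A hedged usage sketch: `wrapper` below reuses _load_vectors from Example #10, and the ConceptNet URIs and weights are illustrative only.

wrapper = _load_vectors()   # a VectorSpaceWrapper over normalized vectors (Example #10)

# A single term
print(wrapper.similar_terms('/c/en/guitar', limit=5))

# A dictionary of weighted terms, restricted to English entries via the prefix filter
query = {'/c/en/guitar': 1.0, '/c/en/drum': 0.5}
print(wrapper.similar_terms(query, filter='/c/en', limit=5))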
Example #12
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        If the query contains 5 or fewer terms, it will be expanded using the
        out-of-vocab strategy.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[:self.small_k]
        search_frame = self.small_frame
        # TODO: document filter
        if filter:
            exact_only = filter.count('/') >= 3
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx:idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_idx, end_idx = self._index_prefix_range(filter + '/')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame,
                                        small_vec,
                                        limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f'))

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
Example #13
    def similar_terms(self, query, filter=None, limit=20):
        """
        Get a Series of terms ranked by their similarity to the query.
        The query can be:

        - A pandas Series of weighted terms
        - A pandas DataFrame of weighted terms
        - A dictionary from terms to weights
        - A list of (term, weight) tuples
        - A single term
        - An existing vector

        If the query contains 5 or fewer terms, it will be expanded to include
        neighboring terms in ConceptNet.
        """
        self.load()
        vec = self.get_vector(query)
        small_vec = vec[: self.small_k]
        search_frame = self.small_frame
        # TODO: document filter
        if filter:
            exact_only = filter.count('/') >= 3
            if filter.endswith('/.'):
                filter = filter[:-2]
                exact_only = True
            if exact_only:
                if filter in search_frame.index:
                    idx = search_frame.index.get_loc(filter)
                    search_frame = search_frame[idx : idx + 1]
                else:
                    search_frame = search_frame.iloc[0:0]
            else:
                start_idx, end_idx = self._index_prefix_range(filter + '/')
                search_frame = search_frame.iloc[start_idx:end_idx]
        similar_sloppy = similar_to_vec(search_frame, small_vec, limit=limit * 50)
        similar_choices = l2_normalize_rows(
            self.frame.loc[similar_sloppy.index].astype('f')
        )

        similar = similar_to_vec(similar_choices, vec, limit=limit)
        return similar
Example #14
def de_bias_category(frame, category_examples, bias_examples):
    """
    Remove correlations between a class of words that should have biases
    removed (category_examples) and a set of words reflecting those biases
    (bias_examples). For example, the `category_examples` may be ethnicities,
    and `bias_examples` may be stereotypes about them.

    The check for whether a word should be de-biased works like
    `de_bias_binary`, where the category words are positive examples and the
    bias words are negative examples (because the words that define the bias
    presumably should not be de-biased).

    The words that should be de-biased will have their correlations with
    each of the bias words removed.
    """
    # Make an SVM that distinguishes words that are in the category to be
    # de-biased from words that are not.
    category_predictor = two_class_svm(frame, category_examples, bias_examples)

    # Predict the probability of each word in the vocabulary being in the
    # category.
    applicability = category_predictor.predict_proba(frame)[:, 1]

    # Make a matrix of vectors representing the correlations to remove.
    vocab = [standardized_uri('en', term) for term in bias_examples]
    components_to_reject = frame.loc[vocab].values

    # Make a modified version of the space that projects the bias vectors to 0.
    # Then weight each row of that space by "applicability", the probability
    # that each row should be de-biased.
    modified_component = reject_subspace(frame, components_to_reject).mul(
        applicability, axis=0)

    # Make another component representing the vectors that should not be
    # de-biased: the original space times (1 - applicability).
    original_component = frame.mul(1 - applicability, axis=0)

    # The sum of these two components is the de-biased space, where de-biasing
    # applies to each row proportional to its applicability.
    return l2_normalize_rows(original_component + modified_component,
                             offset=1e-9)
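
A hedged usage sketch with placeholder vocabularies; the docstring suggests ethnicity terms as category_examples and stereotype words as bias_examples, and `frame` stands for the word-vector DataFrame to de-bias.

# Hypothetical placeholders; in practice these are English terms that
# standardized_uri can map into the frame's index.
category_examples = ['group_term_1', 'group_term_2', 'group_term_3']
bias_examples = ['stereotype_term_1', 'stereotype_term_2']

debiased_frame = de_bias_category(frame, category_examples, bias_examples)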