Example #1
0
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    wrapper = VectorSpaceWrapper(frame=frame)
    wrapper.load()

    # Axis pointing from the male direction toward the female direction.
    gender_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))

    # Project the normalized difference of each stereotyped word pair
    # onto the gender axis.
    projections = []
    for fem_word, masc_word in GENDER_BIAS_PAIRS:
        fem_uri = standardized_uri('en', fem_word)
        masc_uri = standardized_uri('en', masc_word)
        delta = normalize_vec(
            wrapper.get_vector(fem_uri) - wrapper.get_vector(masc_uri))
        projections.append(delta.dot(gender_axis))

    # Summarize as a mean with a ~95% confidence interval (2 standard errors).
    center = np.mean(projections)
    margin = scipy.stats.sem(projections) * 2
    gender_bias = pd.Series(
        [center, center - margin, center + margin],
        index=['bias', 'low', 'high'])

    fine_ethnic_bias = correlation_bias(
        get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY),
        get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS))

    coarse_ethnic_bias = correlation_bias(
        get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS),
        get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS))

    # For names, build one category axis per name set and stack them.
    name_axes = pd.DataFrame(
        np.vstack(
            [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]))
    name_ethnic_bias = correlation_bias(
        name_axes,
        get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS))

    belief_bias = correlation_bias(
        get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF),
        get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS))

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
Example #2
0
    def get_vector(self, query, oov_vector=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `oov_vector=True`, this
        will allow expanded_vector to use an out-of-vocab strategy to find missing
        terms.
        """
        self.load()

        # A raw vector needs no lookup at all.
        if isinstance(query, np.ndarray):
            return query

        # Normalize every other query form to a list of (term, weight) pairs.
        if isinstance(query, (pd.Series, dict)):
            weighted_terms = list(query.items())
        elif isinstance(query, pd.DataFrame):
            weighted_terms = list(query.to_records())
        elif isinstance(query, str):
            weighted_terms = [(query, 1.)]
        elif isinstance(query, list):
            weighted_terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))

        cache_key = tuple(weighted_terms + [oov_vector])
        try:
            return self.cache[cache_key]
        except KeyError:
            pass

        # Only use the OOV strategy on short queries.
        use_oov = oov_vector and (len(weighted_terms) <= 5)
        result = normalize_vec(
            self.expanded_vector(weighted_terms, oov_vector=use_oov))
        self.cache[cache_key] = result
        return result
Example #3
0
    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()

        # An ndarray is already a vector; hand it back unchanged.
        if isinstance(query, np.ndarray):
            return query

        # Everything else becomes a list of (term, weight) pairs.
        if isinstance(query, pd.DataFrame):
            term_weights = list(query.to_records())
        elif isinstance(query, (pd.Series, dict)):
            term_weights = list(query.items())
        elif isinstance(query, list):
            term_weights = query
        elif isinstance(query, str):
            term_weights = [(query, 1.)]
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))

        key = tuple(term_weights + [include_neighbors])
        if key not in self.cache:
            # Neighbor lookup is only worthwhile for short queries.
            allow_neighbors = include_neighbors and (len(term_weights) <= 5)
            expanded = self.expanded_vector(
                term_weights, include_neighbors=allow_neighbors)
            self.cache[key] = normalize_vec(expanded)
        return self.cache[key]
Example #4
0
    def get_vector(self, query, oov_vector=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `oov_vector=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()

        if isinstance(query, np.ndarray):
            # Already a vector; nothing to build.
            return query

        # Coerce the query into a list of (term, weight) pairs.
        if isinstance(query, str):
            pairs = [(query, 1.)]
        elif isinstance(query, (pd.Series, dict)):
            pairs = list(query.items())
        elif isinstance(query, pd.DataFrame):
            pairs = list(query.to_records())
        elif isinstance(query, list):
            pairs = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))

        cache_key = tuple(pairs + [oov_vector])
        cached = self.cache.get(cache_key) if hasattr(self.cache, 'get') else None
        if cache_key in self.cache:
            return self.cache[cache_key]

        # The OOV fallback is restricted to short queries.
        oov_allowed = oov_vector and (len(pairs) <= 5)
        self.cache[cache_key] = normalize_vec(
            self.expanded_vector(pairs, oov_vector=oov_allowed))
        return self.cache[cache_key]
Example #5
0
def get_weighted_vector(frame, weighted_terms):
    """
    Given a list of (term, weight) pairs, get a unit vector corresponding
    to the weighted average of those term vectors.

    A simplified version of VectorSpaceWrapper.get_vector().
    """
    # Start from a zero vector that carries the frame's column labels.
    combined = frame.iloc[0] * 0.
    for term, weight in weighted_terms:
        # Terms absent from the vocabulary contribute nothing.
        if term not in frame.index:
            continue
        combined = combined + frame.loc[term] * weight
    return normalize_vec(combined)
Example #6
0
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of
    its rows have any correlation with any rows of `vecs`, by subtracting
    the outer product of `frame` with each normalized row of `vecs`.
    """
    adjusted = frame.copy()
    for direction in vecs:
        unit = normalize_vec(direction)
        # Remove each row's component along `unit`.
        coefficients = adjusted.dot(unit)
        adjusted = adjusted - np.outer(coefficients, unit)

    return l2_normalize_rows(adjusted, offset=1e-9)
Example #7
0
def get_weighted_vector(frame, weighted_terms):
    """
    Given a list of (term, weight) pairs, get a unit vector corresponding
    to the weighted average of those term vectors.

    A simplified version of VectorSpaceWrapper.get_vector().
    """
    # Zero vector with the same columns as the frame.
    accumulator = frame.iloc[0] * 0.
    # Keep only the terms that are actually in the vocabulary.
    present = [(t, w) for (t, w) in weighted_terms if t in frame.index]
    for term, weight in present:
        accumulator += frame.loc[term] * weight
    return normalize_vec(accumulator)
Example #8
0
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of
    its rows have any correlation with any rows of `vecs`, by subtracting
    the outer product of `frame` with each normalized row of `vecs`.
    """
    # Work on a raw array copy so the subtraction can happen in place.
    matrix = frame.copy().values
    for raw_vec in vecs:
        unit = normalize_vec(raw_vec)
        # Per-row components along `unit`, removed in place.
        components = matrix.dot(unit)
        np.subtract(matrix, np.outer(components, unit), out=matrix)

    # Rescale every row to unit length, still in place.
    normalize(matrix, norm='l2', copy=False)

    result = pd.DataFrame(matrix, index=frame.index)
    result.fillna(0, inplace=True)
    return result
Example #9
0
def reject_subspace(frame, vecs):
    """
    Return a modification of the vector space `frame` where none of
    its rows have any correlation with any rows of `vecs`, by subtracting
    the outer product of `frame` with each normalized row of `vecs`.
    """
    values = frame.copy().values
    for candidate in vecs:
        direction = normalize_vec(candidate)
        # Project onto `direction` and subtract that component in place.
        np.subtract(
            values, np.outer(values.dot(direction), direction), out=values)

    # L2-normalize the rows in place, then restore the original row labels.
    normalize(values, norm='l2', copy=False)
    rebuilt = pd.DataFrame(values, index=frame.index)
    rebuilt.fillna(0, inplace=True)
    return rebuilt
Example #10
0
    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()
        if isinstance(query, np.ndarray):
            # A raw vector needs no lookup.
            return query
        elif isinstance(query, pd.Series) or isinstance(query, dict):
            # A Series or dict maps terms to weights.
            terms = list(query.items())
        elif isinstance(query, pd.DataFrame):
            # DataFrame.items() would yield (column, Series) pairs, not
            # (term, weight) pairs, so convert its rows to records instead.
            terms = list(query.to_records())
        elif isinstance(query, str):
            terms = [(query, 1.0)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))
        # Neighbor lookup is only fast enough for short queries.
        include_neighbors = include_neighbors and (len(terms) <= 5)
        vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
        return normalize_vec(vec)
Example #11
0
    def get_vector(self, query, include_neighbors=True):
        """
        Given one of the possible types of queries (see `similar_terms`), make
        a vector to look up from it.

        If there are 5 or fewer terms involved and `include_neighbors=True`, this
        will allow expanded_vector to look up neighboring terms in ConceptNet.
        """
        self.load()
        if isinstance(query, np.ndarray):
            # A raw vector needs no lookup.
            return query
        elif isinstance(query, pd.Series) or isinstance(query, dict):
            # A Series or dict maps terms to weights.
            terms = list(query.items())
        elif isinstance(query, pd.DataFrame):
            # DataFrame.items() would yield (column, Series) pairs, not
            # (term, weight) pairs, so convert its rows to records instead.
            terms = list(query.to_records())
        elif isinstance(query, str):
            terms = [(query, 1.)]
        elif isinstance(query, list):
            terms = query
        else:
            raise ValueError("Can't make a query out of type %s" % type(query))
        # Neighbor lookup is only fast enough for short queries.
        include_neighbors = include_neighbors and (len(terms) <= 5)
        vec = self.expanded_vector(terms, include_neighbors=include_neighbors)
        return normalize_vec(vec)