Code example #1
def test_cache_with_oov():
    vectors = VectorSpaceWrapper(frame=TEST_FRAME)
    vectors.load()
    # Check that a vector of all zeros is returned if the term is not present
    ok_(not vectors.get_vector('/c/en/test', oov_vector=False).any())

    # If oov_vector=True, the neighbor of 'test' in ConceptNet ('trial')
    # will be used to approximate its vector
    ok_(vectors.get_vector('/c/en/test', oov_vector=True).any())
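
This test exercises the out-of-vocabulary (OOV) path of VectorSpaceWrapper.get_vector: with oov_vector=False an unknown term comes back as an all-zero vector, while oov_vector=True lets the wrapper approximate it from a related term. Below is a minimal usage sketch along the same lines, assuming VectorSpaceWrapper is importable from conceptnet5.vectors.query and accepts a small pandas frame indexed by ConceptNet URIs; the toy frame and its terms are illustrative, not part of the original test.

import numpy as np
import pandas as pd
from conceptnet5.vectors.query import VectorSpaceWrapper  # assumed import path

# Illustrative toy frame: rows are ConceptNet URIs, columns are embedding dimensions.
toy_frame = pd.DataFrame(
    np.random.RandomState(0).randn(2, 4),
    index=['/c/en/trial', '/c/en/experiment'])

wrapper = VectorSpaceWrapper(frame=toy_frame, use_db=False)
wrapper.load()

# Unknown term: all zeros without OOV handling, possibly non-zero with it.
print(wrapper.get_vector('/c/en/test', oov_vector=False).any())  # False
print(wrapper.get_vector('/c/en/test', oov_vector=True).any())   # True if a related term is found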
Code example #2
File: bias.py  Project: akiratu/conceptnet5
def measure_bias(frame):
    """
    Return a DataFrame that measures biases in a semantic space, on four
    data sets:

    - Gender
    - Fine-grained ethnicity
    - Coarse-grained ethnicity
    - Religious beliefs
    """
    vsw = VectorSpaceWrapper(frame=frame)
    vsw.load()

    gender_binary_axis = normalize_vec(
        get_category_axis(frame, FEMALE_WORDS) -
        get_category_axis(frame, MALE_WORDS))
    gender_bias_numbers = []
    for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS:
        female_biased_uri = standardized_uri('en', female_biased_word)
        male_biased_uri = standardized_uri('en', male_biased_word)
        diff = normalize_vec(
            vsw.get_vector(female_biased_uri) -
            vsw.get_vector(male_biased_uri)).dot(gender_binary_axis)
        gender_bias_numbers.append(diff)

    mean = np.mean(gender_bias_numbers)
    sem = scipy.stats.sem(gender_bias_numbers)
    gender_bias = pd.Series([mean, mean - sem * 2, mean + sem * 2],
                            index=['bias', 'low', 'high'])

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_ETHNICITY)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    fine_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, COARSE_ETHNICITY_TERMS)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    coarse_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = pd.DataFrame(
        np.vstack(
            [get_category_axis(frame, names) for names in ETHNIC_NAME_SETS]))
    stereotype_vecs_2 = get_vocabulary_vectors(frame, ETHNIC_STEREOTYPE_TERMS)
    name_ethnic_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    stereotype_vecs_1 = get_vocabulary_vectors(frame, PEOPLE_BY_BELIEF)
    stereotype_vecs_2 = get_vocabulary_vectors(frame, BELIEF_STEREOTYPE_TERMS)
    belief_bias = correlation_bias(stereotype_vecs_1, stereotype_vecs_2)

    return pd.DataFrame({
        'gender': gender_bias,
        'ethnicity-fine': fine_ethnic_bias,
        'ethnicity-coarse': coarse_ethnic_bias,
        'ethnicity-names': name_ethnic_bias,
        'beliefs': belief_bias
    }).T
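
measure_bias returns a DataFrame with one row per measurement ('gender', 'ethnicity-fine', 'ethnicity-coarse', 'ethnicity-names', 'beliefs'); for the gender row the columns are a point estimate plus a low/high band of two standard errors. A hedged usage sketch follows, assuming the embeddings are read with load_hdf from conceptnet5.vectors.formats; the file path is illustrative.

from conceptnet5.vectors.formats import load_hdf  # assumed loader for .h5 embedding frames

frame = load_hdf('data/vectors/numberbatch-biased.h5')  # illustrative path
bias_table = measure_bias(frame)

# Rows: gender, ethnicity-fine, ethnicity-coarse, ethnicity-names, beliefs.
# Columns include 'bias', 'low', 'high' for the gender measurement.
print(bias_table)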
Code example #3
class RelatednessClassifier(AttributeClassifier):
    def __init__(self):
        self.wrap = VectorSpaceWrapper(
            get_external_data_filename('numberbatch-20180108-biased.h5'),
            use_db=False)
        self.cache = {}

    def get_vector(self, uri):
        if uri in self.cache:
            return self.cache[uri]
        else:
            vec = normalize_vec(self.wrap.get_vector(uri))
            self.cache[uri] = vec
            return vec

    def get_similarity(self, uri1, uri2):
        return self.get_vector(uri1).dot(self.get_vector(uri2))

    def direct_relatedness(self, example):
        match1 = max(0, self.get_similarity(example.node1(),
                                            example.att_node()))**0.5
        match2 = max(0, self.get_similarity(example.node2(),
                                            example.att_node()))**0.5
        return match1 - match2

    def classify(self, examples, mode):
        return np.array(
            [self.direct_relatedness(example) > .1 for example in examples])
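
RelatednessClassifier labels an example as discriminative when word1 is more related to the attribute than word2 by a margin above 0.1, after square-rooting the non-negative similarities. A hedged sketch of driving it, where examples stands for any iterable of objects exposing node1(), node2(), and att_node() as used above (that variable is assumed, not defined here):

classifier = RelatednessClassifier()

# `examples` is assumed: an iterable of objects with node1(), node2(),
# and att_node() methods that return ConceptNet URIs.
predictions = classifier.classify(examples, mode='validation')

# predictions is a boolean NumPy array: True where the attribute is judged
# noticeably more related to word1 than to word2.
print(predictions.mean())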
Code example #4
def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)"
                    % (quad[0], quad[1], quad[2], result, answer)
                    )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])
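
eval_analogies answers each Google analogy question by nearest-neighbour search over a 200,000-word English vocabulary and reports accuracy together with a binomial confidence interval from statsmodels' proportion_confint. Reading the result might look like this, where frame is assumed to be a term-by-dimension embedding DataFrame as elsewhere in these examples:

results = eval_analogies(frame)  # frame: embedding DataFrame (assumed)
print("accuracy %.3f (95%% CI %.3f-%.3f)"
      % (results['acc'], results['low'], results['high']))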
Code example #5
def test_cache_with_oov(multi_ling_frame):
    vectors = VectorSpaceWrapper(frame=multi_ling_frame)
    vectors.load()
    # Check that a vector of all zeros is returned if the term is not present
    assert not vectors.get_vector('/c/en/test', oov_vector=False).any()
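
This pytest version takes its frame from a multi_ling_frame fixture, whose definition is not shown here. A minimal sketch of one possible stand-in, assuming a tiny hand-built multilingual frame (URIs and values are purely illustrative):

import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def multi_ling_frame():
    # Illustrative stand-in: a few terms from different languages with toy vectors.
    uris = ['/c/en/trial', '/c/fr/essai', '/c/de/versuch']
    return pd.DataFrame(np.eye(len(uris)), index=uris)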
Code example #6
class MultipleFeaturesClassifier(AttributeClassifier):
    """
    Compute a number of numeric features from the examples, based on different
    data sources. Then use the concatenation of all these features as input to
    an SVM.

    The values of each feature are cached in the `discriminatt/results`
    directory. If you change the code of a feature, delete its corresponding cache
    files from that directory.
    """
    def __init__(self, ablate=()):
        self.wrap = VectorSpaceWrapper(
            get_external_data_filename('numberbatch-20180108-biased.h5'),
            use_db=False)
        self.cache = {}
        self.wp_db = None
        self.sme = None
        self.queries = None
        self.phrases = None
        self.svm = None
        self.ablate = ablate

        self.feature_methods = [
            self.direct_relatedness_features, self.sme_features,
            self.wikipedia_relatedness_features,
            self.wordnet_relatedness_features, self.phrase_hit_features
        ]

        self.feature_names = [
            'ConceptNet vector relatedness',
            'SME: RelatedTo',
            'SME: (x IsA a)',
            'SME: (x HasA a)',
            'SME: (x PartOf a)',
            'SME: (x CapableOf a)',
            'SME: (x UsedFor a)',
            'SME: (x HasContext a)',
            'SME: (x HasProperty a)',
            'SME: (x AtLocation a)',
            'SME: (a PartOf x)',
            'SME: (a AtLocation x)',
            'Wikipedia lead sections',
            'WordNet relatedness',
            'Google Ngrams',
        ]

    def get_vector(self, uri):
        if uri in self.cache:
            return self.cache[uri]
        else:
            vec = normalize_vec(self.wrap.get_vector(uri))
            self.cache[uri] = vec
            return vec

    def get_similarity(self, uri1, uri2):
        return self.get_vector(uri1).dot(self.get_vector(uri2))

    def direct_relatedness_features(self, example):
        match1 = max(self.get_similarity(example.node1(), example.att_node()),
                     0)**0.5
        match2 = max(self.get_similarity(example.node2(), example.att_node()),
                     0)**0.5
        return np.array([match1 - match2])

    def wikipedia_relatedness_features(self, example):
        if self.wp_db is None:
            self.wp_db = sqlite3.connect(
                get_external_data_filename('wikipedia-summary.db'))
        connected1 = [example.node1()] + wikipedia_connected_conceptnet_nodes(
            self.wp_db, example.word1)
        connected2 = [example.node2()] + wikipedia_connected_conceptnet_nodes(
            self.wp_db, example.word2)
        return self.max_relatedness_features(connected1, connected2,
                                             example.att_node())

    def wordnet_relatedness_features(self, example):
        connected1 = [example.node1()] + wordnet_connected_conceptnet_nodes(
            example.word1)
        connected2 = [example.node2()] + wordnet_connected_conceptnet_nodes(
            example.word2)
        return self.max_relatedness_features(connected1, connected2,
                                             example.att_node())

    def max_relatedness_features(self, conn1, conn2, att_node):
        match1 = max([self.get_similarity(c, att_node) for c in conn1])
        match2 = max([self.get_similarity(c, att_node) for c in conn2])
        return np.array([match1 - match2])

    def sme_features(self, example):
        if self.sme is None:
            self.sme = StandaloneSMEModel(
                get_external_data_filename('sme-20180129'))
        node1 = example.node1()
        node2 = example.node2()
        att = example.att_node()
        if node1 in self.sme and node2 in self.sme and att in self.sme:
            return self.sme.predict_discriminative_relations(
                node1, att) - self.sme.predict_discriminative_relations(
                    node2, att)
        else:
            return np.zeros(self.sme.num_rels())

    def phrase_hit_features(self, example):
        if self.phrases is None:
            self.phrases = sqlite3.connect(
                get_external_data_filename('phrases.db'))
        weight_pair1 = phrase_weight(self.phrases, example.lemma1(),
                                     example.lemma_att())
        weight_pair2 = phrase_weight(self.phrases, example.lemma2(),
                                     example.lemma_att())
        return weight_pair1 - weight_pair2

    def search_query_features(self, example):
        if self.queries is None:
            self.queries = read_search_queries()
        word1_queries = self.queries[example.word1]
        word2_queries = self.queries[example.word2]
        att_queries = self.queries[example.attribute]
        int1 = set(word1_queries).intersection(att_queries)
        int2 = set(word2_queries).intersection(att_queries)
        difference = len(int1) - len(int2)
        if difference > 0:
            return np.log(difference)
        else:
            return 0

    def extract_features(self, examples, mode='train'):
        subarrays = []
        for i, method in enumerate(self.feature_methods):
            name = method.__name__
            feature_filename = get_result_filename('{}.{}.npy'.format(
                name, mode))
            try:
                os.mkdir(os.path.dirname(feature_filename))
            except FileExistsError:
                pass
            if os.access(feature_filename, os.R_OK):
                features = np.load(feature_filename)
            else:
                feature_list = []
                for example in progress_bar(examples, desc=name):
                    feature_list.append(method(example))
                features = np.vstack(feature_list)
                np.save(feature_filename, features)

            # Set a selected feature source to all zeroes
            if i in self.ablate:
                features *= 0
            subarrays.append(features)
        return np.hstack(subarrays)

    def train(self, examples):
        self.svm = LinearSVC()
        inputs = normalize(self.extract_features(examples, mode='train'),
                           axis=0,
                           norm='l2')
        outputs = np.array([example.discriminative for example in examples])
        self.svm.fit(inputs, outputs)

        # Zero out features that get a negative weight -- these features were
        # intended to be positive, so one that comes out negative is probably
        # overfitting
        self.svm.coef_ = np.maximum(0, self.svm.coef_)
        coef_series = pd.Series(self.svm.coef_[0], index=self.feature_names)
        if self.ablate:
            used_feature_names = [
                self.feature_methods[a].__name__ for a in range(5)
                if a not in self.ablate
            ]
            print("Used [{}]".format(', '.join(used_feature_names)))
        if not self.ablate or self.ablate == (1, 2, 3, 4):
            print(coef_series)
            print("Intercept:", self.svm.intercept_)

    def classify(self, examples, mode):
        inputs = normalize(self.extract_features(examples, mode=mode),
                           axis=0,
                           norm='l2')
        predictions = self.svm.predict(inputs)
        return predictions
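
End to end, the classifier extracts its feature blocks (caching them under discriminatt/results), L2-normalizes them per column, fits a LinearSVC in train, and reuses the same pipeline in classify. A hedged usage sketch, where train_examples and test_examples are assumed lists of example objects with the methods used above:

classifier = MultipleFeaturesClassifier()
# To ablate feature sources, pass their indices, e.g.
# MultipleFeaturesClassifier(ablate=(1, 2, 3, 4)) keeps only the first method.

classifier.train(train_examples)                          # assumed training set
predictions = classifier.classify(test_examples, 'test')  # assumed held-out set

gold = np.array([example.discriminative for example in test_examples])
print("held-out accuracy:", (predictions == gold).mean())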