Esempio n. 1
0
    def load(self):
        """
        Ensure that all the data is loaded.

        This is a no-op once `self.small_frame` has been set. Otherwise it
        reads `self.vector_filename` with `load_hdf` (unless `self.frame` was
        already provided), prefixes labels that aren't ConceptNet URIs with
        '/c/en/', sorts the index if it isn't already increasing, keeps a copy
        of the first `self.small_k` columns in `self.small_frame`, and finally
        calls `self._build_trie()`.

        Raises MissingVectorSpace, chained from the underlying OSError, if the
        vector file can't be read.
        """
        if self.small_frame is not None:
            return

        if self.frame is None:
            # Reading the file is the step that is expected to fail with
            # OSError, so it is the only one guarded by the `try`.
            try:
                self.frame = load_hdf(self.vector_filename)
            except OSError as err:
                # Chain the original error so that its details remain visible
                # in the traceback.
                raise MissingVectorSpace(
                    "Couldn't load the vector space %r. Do you need to build or "
                    "download it?" % self.vector_filename
                ) from err

        # NOTE(review): this inspects the second label, so a frame with fewer
        # than two rows raises IndexError here -- confirm that's acceptable.
        if not self.frame.index[1].startswith('/c/'):
            # These terms weren't in ConceptNet standard form. Assume
            # they're in English, and stick the English language tag on
            # them without any further transformation, so we can be sure
            # we're evaluating the vectors as provided.
            self.finder = None
            self.frame.index = ['/c/en/' + label for label in self.frame.index]

        if not self.frame.index.is_monotonic_increasing:
            self.frame = self.frame.sort_index()

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Copy, so that `small_frame` doesn't share storage with `frame`.
        self.small_frame = self.frame.iloc[:, : self.small_k].copy()
        self._build_trie()
Esempio n. 2
0
    def load(self):
        """
        Ensure that all the data is loaded.

        Returns immediately if `self.small_frame` is already set. Otherwise
        the frame is read from `self.vector_filename` with `load_hdf` (unless
        `self.frame` was supplied beforehand), labels that aren't ConceptNet
        URIs get '/c/en/' put in front of them, the index is sorted if it
        isn't already increasing, a copy of the first `self.small_k` columns
        is stored in `self.small_frame`, and `self._build_trie()` is called.

        Raises MissingVectorSpace, chained from the underlying OSError, if the
        vector file can't be read.
        """
        if self.small_frame is not None:
            return

        if self.frame is None:
            # Only the file read is expected to raise OSError, so keep the
            # `try` limited to it rather than relabeling unrelated errors.
            try:
                self.frame = load_hdf(self.vector_filename)
            except OSError as err:
                # `from err` keeps the original failure attached as the cause.
                raise MissingVectorSpace(
                    "Couldn't load the vector space %r. Do you need to build or "
                    "download it?" % self.vector_filename) from err

        # NOTE(review): this inspects the second label, so a frame with fewer
        # than two rows raises IndexError here -- confirm that's acceptable.
        if not self.frame.index[1].startswith('/c/'):
            # These terms weren't in ConceptNet standard form. Assume
            # they're in English, and stick the English language tag on
            # them without any further transformation, so we can be sure
            # we're evaluating the vectors as provided.
            self.frame.index = [
                '/c/en/' + label for label in self.frame.index
            ]

        if not self.frame.index.is_monotonic_increasing:
            self.frame = self.frame.sort_index()

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Copy, so that `small_frame` doesn't share storage with `frame`.
        self.small_frame = self.frame.iloc[:, :self.small_k].copy()
        self._build_trie()
Esempio n. 3
0
    def load(self):
        """
        Ensure that all the data is loaded.

        Returns immediately if `self.small_frame` is already set. Otherwise
        the frame is read from `self.vector_filename` with `load_hdf` (unless
        `self.frame` was supplied beforehand). `self.standardized` records
        whether the first label already starts with '/c/'; if it doesn't,
        every label gets '/c/en/' put in front of it and `self.finder` is
        reset to None. A copy of the first `self.small_k` columns is stored in
        `self.small_frame`.

        Raises MissingVectorSpace, chained from the underlying OSError, if the
        vector file can't be read.
        """
        if self.small_frame is not None:
            return

        if self.frame is None:
            # Only the file read is expected to raise OSError, so keep the
            # `try` limited to it rather than relabeling unrelated errors.
            try:
                self.frame = load_hdf(self.vector_filename)
            except OSError as err:
                # `from err` keeps the original failure attached as the cause.
                raise MissingVectorSpace(
                    "Couldn't load the vector space %r. Do you need to build or "
                    "download it?" % self.vector_filename) from err

        # FIXME: is self.standardized used for anything?
        # NOTE(review): an empty frame raises IndexError on `index[0]`.
        self.standardized = self.frame.index[0].startswith('/c/')
        if not self.standardized:
            # These terms weren't in ConceptNet standard form. Assume
            # they're in English, and stick the English language tag on
            # them without any further transformation, so we can be sure
            # we're evaluating the vectors as provided.
            self.finder = None
            self.frame.index = [
                '/c/en/' + label for label in self.frame.index
            ]

        self.k = self.frame.shape[1]
        self.small_k = 100
        # Copy, so that `small_frame` doesn't share storage with `frame`.
        self.small_frame = self.frame.iloc[:, :self.small_k].copy()
Esempio n. 4
0
def graph_comparison(table_filename, out_filename='data/stats/eval-graph.png'):
    """
    Save a grouped bar chart comparing word-embedding systems on several
    evaluations.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'acc', 'low', and 'high'.

    `out_filename` is where the image is written, at 300 dpi. It defaults to
    the path that used to be hard-coded, so existing callers are unaffected.

    As a side effect, this applies the 'bmh' style and changes the tick label
    sizes in matplotlib's global rcParams.
    """
    import matplotlib.pyplot as plt
    result = load_hdf(table_filename)
    plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    width = 0.15
    evals = ['men3000', 'rw', 'mturk', 'ws353', 'story-cloze', 'sat-analogies']
    eval_labels = ['MEN-3000', 'Rare Words', 'MTurk-771', 'WS353', 'Story Cloze', 'SAT analogies']
    colors = [props['color'] for props in plt.rcParams['axes.prop_cycle']]

    systems = [
        ('word2vec Google News', 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('LexVec', 'data/raw/vectors/lexvec.no-header.vectors.gz'),
        ('ConceptNet PPMI', 'data/vectors/conceptnet-55-ppmi.h5'),
        ('ConceptNet Numberbatch 16.09', 'data/vectors/numberbatch.h5')
    ]
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(12, 6))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        acc = eval_table['acc']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'acc' <= 'high').
        errs = [acc - eval_table['low'], eval_table['high'] - acc]
        ax.bar(ind + i * width, acc, width, color=colors[i], yerr=errs, ecolor='k')

    ax.set_ylim(0.0, 1.0)
    ax.legend([name for (name, _path) in systems])
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a bar-width
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    plt.ylabel('Evaluation score', fontsize='x-large')
    plt.savefig(out_filename, dpi=300)
Esempio n. 5
0
def graph_comparison(table_filename, out_filename):
    """
    Save a grouped bar chart comparing word-embedding systems on several
    evaluations.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'acc', 'low', and 'high'.

    The chart is written to `out_filename` at 300 dpi.

    As a side effect, this changes the tick label sizes in matplotlib's global
    rcParams.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = ['men3000', 'rw', 'mturk', 'ws353', 'semeval-2a-en']
    eval_labels = [
        'MEN-3000',
        'Rare Words',
        'MTurk-771',
        'WordSim-353',
        'SemEval 2017-2a',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]

    systems = [
        (
            'word2vec Google News',
            'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz',
        ),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch', 'data/vectors/numberbatch.h5'),
    ]
    # Each group of bars takes up 0.84 of the unit spacing between groups.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'acc' <= 'high').
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _path) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a slot
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel(
        'Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})', fontsize='x-large'
    )
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
Esempio n. 6
0
def load_any_embeddings(filename):
    """
    Load embeddings from `filename`, choosing the loader by file extension.

    '.bin.gz' files go to `load_word2vec_bin` and other '.gz' files go to
    `load_glove`, both called with 1000000 as the second argument; '.h5'
    files go to `load_hdf`. Any other extension raises ValueError.
    """
    # '.bin.gz' has to be tested before the more general '.gz'.
    if filename.endswith('.bin.gz'):
        return load_word2vec_bin(filename, 1000000)
    if filename.endswith('.gz'):
        return load_glove(filename, 1000000)
    if filename.endswith('.h5'):
        return load_hdf(filename)
    raise ValueError("Can't recognize file extension of %r" % filename)
Esempio n. 7
0
def load_any_embeddings(filename):
    """
    Load embeddings from `filename` with the loader that matches its
    extension.

    The extensions are tried in order: '.bin.gz' (`load_word2vec_bin`),
    '.gz' (`load_glove`), then '.h5' (`load_hdf`). The first two loaders are
    called with 1000000 as their second argument. A filename matching none of
    these raises ValueError.
    """
    # Order matters: '.bin.gz' must come before the more general '.gz'. The
    # loaders are wrapped in lambdas so they are only looked up when used.
    loaders = (
        ('.bin.gz', lambda name: load_word2vec_bin(name, 1000000)),
        ('.gz', lambda name: load_glove(name, 1000000)),
        ('.h5', lambda name: load_hdf(name)),
    )
    for suffix, loader in loaders:
        if filename.endswith(suffix):
            return loader(filename)
    raise ValueError("Can't recognize file extension of %r" % filename)
Esempio n. 8
0
def graph_bias_comparison(table_filename, out_filename):
    """
    Save a grouped bar chart comparing word-embedding systems on several bias
    measurements.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'bias', 'low', and 'high'.

    The chart is written to `out_filename` at 300 dpi.

    As a side effect, this changes the tick label sizes in matplotlib's global
    rcParams.
    """
    import matplotlib.pyplot as plt
    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = [
        'gender', 'beliefs', 'ethnicity-coarse', 'ethnicity-fine',
        'ethnicity-names'
    ]
    eval_labels = [
        'Gender bias', 'Religious bias', 'Ethnic bias (coarse)',
        'Ethnic bias (fine)', 'Bias from names'
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]

    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)',
         'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5')
    ]
    # Each group of bars takes up 0.84 of the unit spacing between groups.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['bias']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'bias' <= 'high').
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(ind + i * width,
               value,
               width * 0.9,
               color=colors[i],
               yerr=errs,
               ecolor='k')

    ax.set_ylim(0.0, 0.4)
    ax.set_yticks(np.arange(0.0, 0.5, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend([name for (name, _path) in systems],
              bbox_to_anchor=(1.02, 1),
              loc=2,
              borderaxespad=0.)
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a slot
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel('Correlation with stereotypes', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
Esempio n. 9
0
    def __init__(self, dirname):
        """
        Read the three files in `dirname` that were exported from a model
        trained with ConceptNet's implementation of Semantic Matching Energy.

        `relations.h5` holds the relation embeddings, a (24 x 10) matrix. It
        is stored as `self.rel_embeddings`.

        `terms-similar.h5` holds the term embeddings, a (|V| x 300) matrix. It
        is stored as `self.term_embeddings`. (The file has that name because
        using the embeddings directly, without applying a relation to them, is
        meant to represent the SimilarTo relation.)

        `assoc.npy` holds a 3-tensor of shape (10 x 300 x 300) that relates
        two term embeddings and a relation embedding: multiplying two of those
        vectors by the tensor in the appropriate dimensions gives a prediction
        for the third. It is stored as `self.assoc_tensor`.
        """
        base_dir = pathlib.Path(dirname)

        def in_dir(name):
            # The loaders are given plain string paths.
            return str(base_dir / name)

        self.rel_embeddings = load_hdf(in_dir('relations.h5'))
        self.term_embeddings = load_hdf(in_dir('terms-similar.h5'))
        self.assoc_tensor = np.load(in_dir('assoc.npy'))
Esempio n. 10
0
def graph_comparison(table_filename, out_filename):
    """
    Save a grouped bar chart comparing word-embedding systems on several
    evaluations.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'acc', 'low', and 'high'.

    The chart is written to `out_filename` at 300 dpi.

    As a side effect, this changes the tick label sizes in matplotlib's global
    rcParams.
    """
    import matplotlib.pyplot as plt
    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = ['men3000', 'rw', 'mturk', 'ws353', 'semeval-2a-en']
    eval_labels = [
        'MEN-3000', 'Rare Words', 'MTurk-771', 'WordSim-353', 'SemEval 2017-2a'
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]

    systems = [
        ('word2vec Google News',
         'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)',
         'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5')
    ]
    # Each group of bars takes up 0.84 of the unit spacing between groups.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['acc']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'acc' <= 'high').
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(ind + i * width,
               value,
               width * 0.9,
               color=colors[i],
               yerr=errs,
               ecolor='k')

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend([name for (name, _path) in systems],
              bbox_to_anchor=(1.02, 1),
              loc=2,
              borderaxespad=0.)
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a slot
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel('Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})',
               fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
Esempio n. 11
0
def read_embedding_vocabularies(filenames):
    """
    Return the union of the vocabularies of all the given vector embedding
    files.

    Each file in `filenames` is read with `load_hdf`; the files are assumed
    to be hdf5 files containing dataframes, whose indices are their
    vocabularies. With no filenames, the result is an empty index.
    """
    vocabulary = pd.Index([])
    for path in filenames:
        vocabulary = vocabulary.union(load_hdf(path).index)
    return vocabulary
Esempio n. 12
0
def read_embedding_vocabularies(filenames):
    """
    Collect the combined vocabulary of a collection of vector embedding
    files.

    Every file named in `filenames` is read with `load_hdf`. (The files are
    assumed to be hdf5 files containing dataframes, and the vocabularies are
    their indices.) The return value is the union of those indices, or an
    empty index if there are no filenames.
    """
    # The frames are loaded one at a time, as the loop below asks for them.
    indices = (load_hdf(name).index for name in filenames)
    merged = pd.Index([])
    for index in indices:
        merged = merged.union(index)
    return merged
Esempio n. 13
0
def _load_vectors():
    """
    Build the vector space from the 'codenames' package data.

    The frame in 'data/mini.h5' is cut down to the labels that start with
    '/c/en/', contain neither '_' nor '#', and whose text after the '/c/en/'
    prefix has a `wordfreq` Zipf frequency above 3.0 in English. Every word
    listed in 'data/codenames-words.txt' is then added if it wasn't selected
    already. The selected rows are converted with `astype('f')`, passed
    through `l2_normalize_rows`, and returned in a VectorSpaceWrapper.
    """
    frame = load_hdf(resource_filename('codenames', 'data/mini.h5'))
    selections = [
        label
        for label in frame.index
        if label.startswith('/c/en/')
        and '_' not in label
        and '#' not in label
        # label[6:] is the part of the label after '/c/en/'
        and wordfreq.zipf_frequency(label[6:], 'en') > 3.0
    ]

    # Make sure all the words in Codenames are represented
    words_path = resource_filename('codenames', 'data/codenames-words.txt')
    # Use a context manager so the word list file is closed once it's read.
    with open(words_path) as words_file:
        wordlist = [standardized_uri('en', line.strip()) for line in words_file]

    # Test membership against a set built once, instead of scanning the
    # list of selections for every word.
    already_selected = set(selections)
    additions = [word for word in wordlist if word not in already_selected]
    selections += additions
    frame = l2_normalize_rows(frame.loc[selections].astype('f'))
    return VectorSpaceWrapper(frame=frame)
Esempio n. 14
0
def graph_comparison(table_filename, out_filename):
    """
    Save a grouped, hatched bar chart comparing word-embedding systems on
    several evaluations.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'acc', 'low', and 'high'.

    The chart is written to `out_filename` at 300 dpi.

    As a side effect, this applies the 'bmh' style and changes the tick label
    sizes in matplotlib's global rcParams.
    """
    import matplotlib.pyplot as plt
    result = load_hdf(table_filename)
    plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    # One hatch pattern per system, so the bars differ by more than color.
    patterns = ["/", "\\", "//", "\\\\", " "]
    width = 0.15
    evals = ['men3000', 'rw', 'mturk', 'ws353', 'story-cloze', 'sat-analogies']
    eval_labels = ['MEN-3000', 'Rare Words', 'MTurk-771', 'WS353', 'Story Cloze', 'SAT analogies']
    colors = [props['color'] for props in plt.rcParams['axes.prop_cycle']]

    systems = [
        ('word2vec Google News', 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('LexVec: enWP + NewsCrawl', 'data/raw/vectors/lexvec.no-header.vectors.gz'),
        ('ConceptNet-PPMI', 'data/precomputed/vectors/conceptnet-55-ppmi.h5'),
        ('ConceptNet Numberbatch', 'data/precomputed/vectors/numberbatch.h5')
    ]
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        acc = eval_table['acc']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'acc' <= 'high').
        errs = [acc - eval_table['low'], eval_table['high'] - acc]
        ax.bar(ind + i * width, acc, width, hatch=patterns[i], color=colors[i], yerr=errs, ecolor='k')

    ax.set_ylim(0.0, 1.0)
    ax.set_yticks(np.arange(0.0, 1.1, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend([name for (name, _path) in systems], bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a bar-width
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    plt.ylabel('Evaluation score', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)
Esempio n. 15
0
def read_embedding_vocabularies(filenames):
    """
    Return the union of the indices of the frames stored in `filenames`.

    Each file is read with `load_hdf`. With no filenames, the result is an
    empty index.
    """
    combined = pd.Index([])
    for name in filenames:
        frame = load_hdf(name)
        combined = combined.union(frame.index)
    return combined
Esempio n. 16
0
def graph_bias_comparison(table_filename, out_filename):
    """
    Save a grouped bar chart comparing word-embedding systems on several bias
    measurements.

    `table_filename` is read with `load_hdf`. The resulting table is selected
    by vector file path on index level 0 and then by evaluation name, and must
    provide the columns 'bias', 'low', and 'high'.

    The chart is written to `out_filename` at 300 dpi.

    As a side effect, this changes the tick label sizes in matplotlib's global
    rcParams.
    """
    import matplotlib.pyplot as plt

    result = load_hdf(table_filename)
    # plt.style.use('bmh')
    plt.rcParams['xtick.labelsize'] = 'x-large'
    plt.rcParams['ytick.labelsize'] = 'x-large'

    evals = [
        'gender',
        'beliefs',
        'ethnicity-coarse',
        'ethnicity-fine',
        'ethnicity-names',
    ]
    eval_labels = [
        'Gender bias',
        'Religious bias',
        'Ethnic bias (coarse)',
        'Ethnic bias (fine)',
        'Bias from names',
    ]
    prop_cycle = list(plt.rcParams['axes.prop_cycle'])
    colors = [props['color'] for props in prop_cycle]

    systems = [
        (
            'word2vec Google News',
            'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz',
        ),
        ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'),
        ('GloVe renormalized', 'data/vectors/glove12-840B.h5'),
        ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'),
        # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'),
        ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5'),
    ]
    # Each group of bars takes up 0.84 of the unit spacing between groups.
    width = 0.84 / len(systems)
    ind = np.arange(len(evals))

    _fig, ax = plt.subplots(figsize=(16, 8))
    for i, (_sysname, syspath) in enumerate(systems):
        eval_table = result.xs(syspath, level=0).loc[evals]
        value = eval_table['bias']
        # `yerr` takes the downward extents first and the upward extents
        # second (assuming 'low' <= 'bias' <= 'high').
        errs = [value - eval_table['low'], eval_table['high'] - value]
        ax.bar(
            ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k'
        )

    ax.set_ylim(0.0, 0.4)
    ax.set_yticks(np.arange(0.0, 0.5, 0.1))
    # Anchor the legend just outside the right edge of the axes.
    ax.legend(
        [name for (name, _path) in systems],
        bbox_to_anchor=(1.02, 1),
        loc=2,
        borderaxespad=0.,
    )
    # NOTE(review): this offset centers the tick under each group only if the
    # bars are edge-aligned; with center-aligned bars it sits half a slot
    # to the right -- confirm against the matplotlib version in use.
    ax.set_xticks(ind + width * len(systems) / 2)
    ax.set_xticklabels(eval_labels)
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    ax.set_axisbelow(True)
    plt.ylabel('Correlation with stereotypes', fontsize='x-large')
    plt.savefig(out_filename, bbox_inches="tight", dpi=300)