Esempio n. 1
0
def truncate(
    space,
    output,
    size=('', 2000, 'New vector length.'),
    nvaa=('', False, 'Use only nouns, verbs, adjectives and adverbs as features.'),
    tagset=('', '', 'Tagset'),
):
    """Keep the ``size`` features with the lowest ids and write the smaller space.

    The tuple defaults look like ``(short option, default, help)`` declarations
    for a command-line wrapper that is not visible here (TODO confirm); when
    calling this function directly, pass plain values.

    :param space: object exposing ``matrix``, ``row_labels`` and
        ``column_labels`` (a DataFrame with an ``id`` column and, when ``nvaa``
        is used, a ``tag`` index level).
    :param output: destination handed to ``Space.write``.
    :param size: number of columns (features) to keep.
    :param nvaa: if true, keep only noun/verb/adjective/adverb features before
        truncating.
    :param tagset: ``'bnc'`` selects the tags SUBST/VERB/ADJ/ADV; any other
        value selects N/V/J/R.
    :raises AssertionError: if the space has fewer than ``size`` columns, or
        fewer than ``size`` features remain after tag filtering.
    """
    assert space.matrix.shape[1] >= size, (
        'The space has fewer columns than the requested size.'
    )

    features = space.column_labels
    if nvaa:
        if tagset == 'bnc':
            wanted_tags = ['SUBST', 'VERB', 'ADJ', 'ADV']
        else:
            wanted_tags = ['N', 'V', 'J', 'R']
        features = features[features.index.get_level_values('tag').isin(wanted_tags)]

    # It's important to sort by id to make sure that the most frequent features are selected.
    # ``DataFrame.sort`` no longer exists in pandas; ``sort_values`` replaces it.
    # The copy keeps the reindexing below from touching (or warning about)
    # ``space.column_labels``.
    features = features.sort_values('id').head(size).copy()

    # Fail before the matrix is converted and sliced if filtering left too few features.
    assert len(features) == size, (
        'Fewer features than the requested size are available.'
    )

    # Index with a plain array of column positions rather than a pandas Series.
    matrix = sparse.csc_matrix(space.matrix)[:, features['id'].values]

    # Reindex features
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )

    new_space.write(output)
Esempio n. 2
0
def truncate(
        space,
        output,
        size=('', 2000, 'New vector length.'),
        nvaa=('', False,
              'Use only nouns, verbs, adjectives and adverbs as features.'),
        tagset=('', '', 'Tagset'),
):
    """Keep the ``size`` features with the lowest ids and write the smaller space.

    The tuple defaults look like ``(short option, default, help)`` declarations
    for a command-line wrapper that is not visible here (TODO confirm); when
    calling this function directly, pass plain values.

    :param space: object exposing ``matrix``, ``row_labels`` and
        ``column_labels`` (a DataFrame with an ``id`` column and, when ``nvaa``
        is used, a ``tag`` index level).
    :param output: destination handed to ``Space.write``.
    :param size: number of columns (features) to keep.
    :param nvaa: if true, keep only noun/verb/adjective/adverb features before
        truncating.
    :param tagset: ``'bnc'`` selects the tags SUBST/VERB/ADJ/ADV; any other
        value selects N/V/J/R.
    :raises AssertionError: if the space has fewer than ``size`` columns, or
        fewer than ``size`` features remain after tag filtering.
    """
    assert space.matrix.shape[1] >= size, (
        'The space has fewer columns than the requested size.'
    )

    features = space.column_labels
    if nvaa:
        if tagset == 'bnc':
            wanted_tags = ['SUBST', 'VERB', 'ADJ', 'ADV']
        else:
            wanted_tags = ['N', 'V', 'J', 'R']
        tag_mask = features.index.get_level_values('tag').isin(wanted_tags)
        features = features[tag_mask]

    # It's important to sort by id to make sure that the most frequent features are selected.
    # ``DataFrame.sort`` no longer exists in pandas; ``sort_values`` replaces it.
    # The copy keeps the reindexing below from touching (or warning about)
    # ``space.column_labels``.
    features = features.sort_values('id').head(size).copy()

    # Fail before the matrix is converted and sliced if filtering left too few features.
    assert len(features) == size, (
        'Fewer features than the requested size are available.'
    )

    # Index with a plain array of column positions rather than a pandas Series.
    matrix = sparse.csc_matrix(space.matrix)[:, features['id'].values]

    # Reindex features
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )

    new_space.write(output)
Esempio n. 3
0
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
    word2vec_format=('', False, 'Word2vec_format.'),
    pos_separator=('', '', 'POS separator.'),
):
    """Read a word2vec file and save it as a space file.

    The tuple defaults look like ``(short option, default, help)`` declarations
    for a command-line wrapper that is not visible here (TODO confirm); when
    calling this function directly, pass plain values.

    :param word2vec: path of the model to load.
    :param output: destination handed to ``Space.write``.
    :param word2vec_format: if true, load with
        ``Word2Vec.load_word2vec_format(..., binary=True)``; otherwise use
        ``Word2Vec.load``.
    :param pos_separator: if non-empty, every vocabulary entry is split at the
        last occurrence of this string into an ngram and a tag. Entries that do
        not contain the separator keep their full text as the ngram and get the
        placeholder tag ``'_'``, the same tag used when no separator is given.
    """
    from gensim.models import Word2Vec

    if word2vec_format:
        model = Word2Vec.load_word2vec_format(word2vec, binary=True)
    else:
        model = Word2Vec.load(word2vec)

    words = model.index2word

    if not pos_separator:
        ngrams = words
        tags = '_'
    else:
        ngrams = []
        tags = []
        for word in words:
            # rpartition splits at the last separator, like rsplit(..., maxsplit=1),
            # but also reports whether the separator was present at all.
            ngram, found, tag = word.rpartition(pos_separator)
            if found:
                ngrams.append(ngram)
                tags.append(tag)
            else:
                # Previously such an entry crashed with an unpacking ValueError.
                ngrams.append(word)
                tags.append('_')

    targets = pd.DataFrame(
        {
            'id': range(len(words)),
            'ngram': ngrams,
            'tag': tags,
        },
    )
    targets.set_index(['ngram', 'tag'], inplace=True)

    # One context label per vector dimension, named by its position.
    dimensions = model.syn0.shape[1]
    context = pd.DataFrame(
        {
            'id': range(dimensions),
            'ngram': range(dimensions),
            'tag': '_',
        },
    )
    context.set_index(['ngram', 'tag'], inplace=True)

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )

    space.write(output)
Esempio n. 4
0
def to_space(
        word2vec=('', 'GoogleNews-vectors-negative300.bin.gz',
                  'Path to word2vec vectors.'),
        output=('o', 'space.h5', 'The output space file.'),
):
    """Convert binary word2vec vectors into a space file.

    The model at ``word2vec`` is loaded with
    ``Word2Vec.load_word2vec_format(..., binary=True)``. Rows are labelled by
    the model's vocabulary entries and columns by a default integer index, one
    per vector dimension; both label frames carry a running ``id`` column.
    The resulting ``Space`` is written to ``output``.
    """
    from gensim.models import Word2Vec

    model = Word2Vec.load_word2vec_format(word2vec, binary=True)

    # Row labels: one entry per vocabulary word, in model order.
    row_labels = pd.DataFrame(
        {'id': range(len(model.index2word))},
        index=model.index2word,
    )
    row_labels.index.name = 'ngram'

    # Column labels: one entry per vector dimension.
    dimension_ids = range(model.syn0.shape[1])
    column_labels = pd.DataFrame({'id': dimension_ids})
    column_labels.index.name = 'ngram'

    Space(
        data_ij=model.syn0,
        row_labels=row_labels,
        column_labels=column_labels,
    ).write(output)
Esempio n. 5
0
def transitive_verb_space(
    space_file,
    transitive_verb_arguments,
    execnet_hub,
    output=('o', 'space.h5', 'Output verb vector space.'),
    chunk_size=('', 100, 'The length of a chunk.'),
):
    """Build a verb space from per-verb results and write it to ``output``.

    ``transitive_verb_arguments`` is grouped by ``(verb_stem, verb_tag)``; each
    pickled group is passed to ``verb_space_builder`` through
    ``execnet_hub.run``. Every result is treated as a mapping from a verb label
    to a vector; vectors with the same label are summed. The merged vectors are
    stacked into one sparse matrix whose rows are labelled by the first two
    items of each label.

    ``chunk_size`` is accepted but not used in this function body.
    """
    # Message that ``init`` sends over a channel: the pickled space file reference.
    init_message = ('data', pickle.dumps({'space_file': space_file}))

    def init(channel):
        channel.send(init_message)

    # One group per distinct (verb_stem, verb_tag) pair.
    verb_groups = transitive_verb_arguments.groupby(['verb_stem', 'verb_tag'])

    progress = Bar(
        'Subject object Kronecker products',
        max=len(verb_groups),
        suffix='%(index)d/%(max)d',
    )
    pickled_groups = progress.iter(
        pickle.dumps(group) for group in verb_groups
    )

    partial_results = execnet_hub.run(
        remote_func=verb_space_builder,
        iterable=pickled_groups,
        init_func=init,
        verbose=False,
    )

    # Fold every later result into the first one, summing vectors per label.
    merged = next(partial_results)
    for partial in partial_results:
        for label, vector in partial.items():
            if label not in merged:
                merged[label] = vector
            else:
                merged[label] += vector

    labelled_vectors = list(merged.items())
    del merged

    verb_labels = [label for label, _ in labelled_vectors]
    verb_vectors = [vector for _, vector in labelled_vectors]

    # Drop intermediate containers as soon as they are no longer needed.
    del labelled_vectors

    matrix = sparse.vstack(verb_vectors)
    del verb_vectors

    row_frame = pd.DataFrame(
        {
            'ngram': [label[0] for label in verb_labels],
            'tag': [label[1] for label in verb_labels],
            'id': list(range(len(verb_labels))),
        }
    )
    row_labels = row_frame.set_index(['ngram', 'tag'])

    # Columns are labelled by their position in all three fields.
    dimensions = range(matrix.shape[1])
    column_frame = pd.DataFrame(
        {
            'ngram': list(dimensions),
            'tag': list(dimensions),
            'id': list(dimensions),
        }
    )
    column_labels = column_frame.set_index(['ngram', 'tag'])

    Space(
        matrix,
        row_labels=row_labels,
        column_labels=column_labels,
    ).write(output)