def truncate(
    space,
    output,
    size=('', 2000, 'New vector length.'),
    nvaa=('', False, 'Use only nouns, verbs, adjectives and adverbs as features.'),
    tagset=('', '', 'Tagset'),
):
    """Truncate the space to its ``size`` most frequent features.

    :param space: the input Space; must have at least ``size`` columns.
    :param output: path the truncated space is written to.
    :param size: number of feature columns to keep.
    :param nvaa: restrict features to open-class words first.
    :param tagset: ``'bnc'`` selects BNC-style tags, anything else the
        one-letter tagset.
    """
    assert space.matrix.shape[1] >= size

    features = space.column_labels
    if nvaa:
        # Keep only nouns, verbs, adjectives and adverbs; the tag values
        # depend on the tagset the corpus was annotated with.
        if tagset == 'bnc':
            features = features[
                features.index.get_level_values('tag').isin(['SUBST', 'VERB', 'ADJ', 'ADV'])
            ]
        else:
            features = features[
                features.index.get_level_values('tag').isin(['N', 'V', 'J', 'R'])
            ]

    # It's important to sort by id to make sure that the most frequent
    # features are selected. DataFrame.sort() was removed in pandas 0.20;
    # sort_values() is the supported replacement.
    features = features.sort_values('id').head(size)

    # CSC layout makes the column slice below efficient.
    matrix = sparse.csc_matrix(space.matrix)[:, features['id']]

    assert len(features) == size

    # Reindex features so column ids are again contiguous from 0.
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )
    new_space.write(output)
def truncate(
    space,
    output,
    size=('', 2000, 'New vector length.'),
    nvaa=('', False, 'Use only nouns, verbs, adjectives and adverbs as features.'),
    tagset=('', '', 'Tagset'),
):
    """Truncate the space to its ``size`` most frequent features.

    :param space: the input Space; must have at least ``size`` columns.
    :param output: path the truncated space is written to.
    :param size: number of feature columns to keep.
    :param nvaa: restrict features to open-class words first.
    :param tagset: ``'bnc'`` selects BNC-style tags, anything else the
        one-letter tagset.
    """
    assert space.matrix.shape[1] >= size

    features = space.column_labels
    if nvaa:
        # Open-class tags differ between tagsets.
        open_class = (
            ['SUBST', 'VERB', 'ADJ', 'ADV'] if tagset == 'bnc' else ['N', 'V', 'J', 'R']
        )
        tags = features.index.get_level_values('tag')
        features = features[tags.isin(open_class)]

    # It's important to sort by id to make sure that the most frequent
    # features are selected. DataFrame.sort() was removed in pandas 0.20;
    # sort_values() is the supported replacement.
    features = features.sort_values('id').head(size)

    # CSC layout makes the column slice below efficient.
    matrix = sparse.csc_matrix(space.matrix)[:, features['id']]

    assert len(features) == size

    # Reindex features so column ids are again contiguous from 0.
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )
    new_space.write(output)
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
    word2vec_format=('', False, 'Word2vec_format.'),
    pos_separator=('', '', 'POS separator.'),
):
    """Read a word2vec file and save it as a space file.

    :param word2vec: path to the word2vec model or vector file.
    :param output: path the resulting space is written to.
    :param word2vec_format: load via word2vec's binary vector format
        instead of gensim's native ``Word2Vec.load``.
    :param pos_separator: if non-empty, each vocabulary entry is split
        from the right on this separator into (ngram, tag).
    """
    from gensim.models import Word2Vec

    if word2vec_format:
        model = Word2Vec.load_word2vec_format(word2vec, binary=True)
    else:
        model = Word2Vec.load(word2vec)

    if not pos_separator:
        # No POS information: use a placeholder tag for every target.
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': model.index2word,
                'tag': '_',
            },
        )
    else:
        # rsplit() returns a single-element list when the separator is
        # missing from an entry; fall back to the placeholder tag '_' so
        # one malformed vocabulary item does not abort the conversion
        # with a tuple-unpacking ValueError.
        tokens = [s.rsplit(pos_separator, maxsplit=1) for s in model.index2word]
        tokens = [t if len(t) == 2 else [t[0], '_'] for t in tokens]
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': [n for n, _ in tokens],
                'tag': [t for _, t in tokens],
            },
        )
    targets.set_index(['ngram', 'tag'], inplace=True)

    # Context labels are synthetic: one positional id per vector dimension.
    context = pd.DataFrame(
        {
            'id': range(model.syn0.shape[1]),
            'ngram': range(model.syn0.shape[1]),
            'tag': '_'
        },
    )
    context.set_index(['ngram', 'tag'], inplace=True)

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )
    space.write(output)
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
):
    """Read a word2vec file and save it as a space file.

    :param word2vec: path to the binary word2vec vector file.
    :param output: path the resulting space is written to.
    """
    from gensim.models import Word2Vec

    model = Word2Vec.load_word2vec_format(word2vec, binary=True)

    vocabulary = model.index2word
    dimensions = model.syn0.shape[1]

    # Rows: one label per vocabulary word, indexed by the word itself.
    targets = pd.DataFrame({'id': range(len(vocabulary))}, index=vocabulary)
    targets.index.name = 'ngram'

    # Columns: synthetic positional labels, one per vector dimension.
    context = pd.DataFrame({'id': range(dimensions)})
    context.index.name = 'ngram'

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )
    space.write(output)
def transitive_verb_space(
    space_file,
    transitive_verb_arguments,
    execnet_hub,
    output=('o', 'space.h5', 'Output verb vector space.'),
    chunk_size=('', 100, 'The length of a chunk.'),
):
    """Build a verb vector space from transitive-verb argument pairs,
    distributing the per-verb computation over execnet workers.

    :param space_file: path to the underlying space; shipped (pickled)
        to each worker so it can load the space locally.
    :param transitive_verb_arguments: DataFrame of verb/argument rows
        with at least 'verb_stem' and 'verb_tag' columns.
    :param execnet_hub: hub object that fans the work out to remote
        ``verb_space_builder`` invocations.
    :param output: path the resulting verb space is written to.
    :param chunk_size: NOTE(review): accepted but unused in this body —
        possibly consumed by the hub elsewhere; confirm before removing.
    """
    # Initialisation payload sent once to every worker channel.
    data_to_send = (
        'data',
        pickle.dumps(
            {
                'space_file': space_file,
            },
        )
    )

    def init(channel):
        # Called per worker channel before any work items are sent.
        channel.send(data_to_send)

    # One group per (verb_stem, verb_tag); each group carries that verb's
    # subject/object argument rows.
    groups = transitive_verb_arguments.groupby(
        # ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag'],
        ['verb_stem', 'verb_tag']
    )

    # Wrap the pickled groups in a progress bar; the generator is consumed
    # lazily as the hub dispatches work.
    groups = Bar(
        'Subject object Kronecker products',
        max=len(groups),
        suffix='%(index)d/%(max)d',
    ).iter(
        pickle.dumps(g) for g in groups
    )

    results = execnet_hub.run(
        remote_func=verb_space_builder,
        iterable=groups,
        init_func=init,
        verbose=False,
    )

    # Merge partial results. Each worker yields a dict mapping a verb
    # label to its vector; vectors for the same label are summed.
    # NOTE(review): next() raises StopIteration if no results come back.
    result = next(results)
    for r in results:
        for k, v in r.items():
            if k in result:
                result[k] += v
            else:
                result[k] = v

    result = list(result.items())

    # Split the merged dict into parallel label/vector lists, preserving
    # order, then release intermediates to keep peak memory down.
    verb_labels = [l for l, _ in result]
    verb_vectors = [v for _, v in result]

    del result

    # Stack the per-verb row vectors into one sparse matrix.
    matrix = sparse.vstack(verb_vectors)
    del verb_vectors

    # Rows are indexed by (ngram, tag) of each verb, in stacking order.
    row_labels = pd.DataFrame(
        {
            'ngram': [l[0] for l in verb_labels],
            'tag': [l[1] for l in verb_labels],
            'id': [i for i, _ in enumerate(verb_labels)],
        }
    ).set_index(['ngram', 'tag'])

    # Column labels are synthetic positional ids, one per dimension.
    column_labels = pd.DataFrame(
        {
            'ngram': list(range(matrix.shape[1])),
            'tag': list(range(matrix.shape[1])),
            'id': list(range(matrix.shape[1])),
        }
    ).set_index(['ngram', 'tag'])

    space = Space(
        matrix,
        row_labels=row_labels,
        column_labels=column_labels,
    )
    space.write(output)