help='Get closes words to this word.') args = parser.parse_args() if args.create: # Build the corpus dictionary and the cooccurrence matrix. print 'Pre-processing corpus' if args.wiki: print 'Using wikipedia corpus' get_data = read_wikipedia_corpus else: get_data = read_corpus corpus_model = Corpus() corpus_model.fit(get_data(args.create), window=10) corpus_model.save('corpus.model') print 'Dict size: %s' % len(corpus_model.dictionary) print 'Collocations: %s' % corpus_model.matrix.nnz if args.train: # Train the GloVe model and save it to disk. if not args.create: # Try to load a corpus from disk. print 'Reading corpus statistics' corpus_model = Corpus.load('corpus.model') print 'Dict size: %s' % len(corpus_model.dictionary) print 'Collocations: %s' % corpus_model.matrix.nnz
default='', help='Get closes words to this word.') args = parser.parse_args() if args.create: # Build the corpus dictionary and the cooccurrence matrix. print 'Pre-processing corpus' if args.wiki: print 'Using wikipedia corpus' get_data = read_wikipedia_corpus else: get_data = read_corpus corpus_model = Corpus() corpus_model.fit(get_data(args.create), window=10) corpus_model.save('corpus.model') print 'Dict size: %s' % len(corpus_model.dictionary) print 'Collocations: %s' % corpus_model.matrix.nnz if args.train: # Train the GloVe model and save it to disk. if not args.create: # Try to load a corpus from disk. print 'Reading corpus statistics' corpus_model = Corpus.load('corpus.model') print 'Dict size: %s' % len(corpus_model.dictionary) print 'Collocations: %s' % corpus_model.matrix.nnz
with open('../../output/vocabs_100.txt', 'r') as vbf: for line in vbf.readlines(): vocab.append(line.strip()) # 建立词典,统计共现矩阵 dictionary = {} for i, word in enumerate(vocab): dictionary[word] = i corpus = [] with open('../../input/wiki.500.txt', 'r') as cf: for line in cf.readlines(): corpus.append([]) for word in line.split(): corpus[-1].append(word) corpus_obj = Corpus(dictionary=dictionary) corpus_obj.fit(corpus, window=10, ignore_missing=True) # 得到稀疏的上三角矩阵 corpus_obj.save('../../output/corpus_obj') # corpus_obj = Corpus.load('../../output/corpus_obj') # self.dictionary, self. matrix corpus_obj.matrix = corpus_obj.matrix.toarray() for i in range(corpus_obj.matrix.shape[0]): for j in range(i + 1, corpus_obj.matrix.shape[0]): if (corpus_obj.matrix[i][j] > 0.): corpus_obj.matrix[j][i] = corpus_obj.matrix[i][j] glove = GloVe(n=100, xmax=100, alpha=0.75, max_iter=20000, learning_rate=0.05, tol=1e-4, display_progress=100,