Example #1
                        help='Get closest words to this word.')
    args = parser.parse_args()


    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using Wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
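        # Accumulate co-occurrence statistics; window=10 restricts counting to
        # word pairs that appear within ten tokens of each other.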
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')
        
        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
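
The listing above is cut off before the body of the train branch. As a rough sketch, assuming the glove-python package that provides Corpus, training typically continues with its Glove class; the epoch count, thread count, and model file name below are illustrative rather than taken from the snippet:

    glove = Glove(no_components=100, learning_rate=0.05)
    # Factorize the co-occurrence matrix built (or loaded) above.
    glove.fit(corpus_model.matrix, epochs=10, no_threads=4, verbose=True)
    # Attach the dictionary so vectors can be looked up by word later.
    glove.add_dictionary(corpus_model.dictionary)
    glove.save('glove.model')
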
Example #2
                        default='',
                        help='Get closest words to this word.')
    args = parser.parse_args()

    if args.create:
        # Build the corpus dictionary and the cooccurrence matrix.
        print('Pre-processing corpus')

        if args.wiki:
            print('Using Wikipedia corpus')
            get_data = read_wikipedia_corpus
        else:
            get_data = read_corpus

        corpus_model = Corpus()
        corpus_model.fit(get_data(args.create), window=10)
        corpus_model.save('corpus.model')

        print('Dict size: %s' % len(corpus_model.dictionary))
        print('Collocations: %s' % corpus_model.matrix.nnz)

    if args.train:
        # Train the GloVe model and save it to disk.

        if not args.create:
            # Try to load a corpus from disk.
            print('Reading corpus statistics')
            corpus_model = Corpus.load('corpus.model')

            print('Dict size: %s' % len(corpus_model.dictionary))
            print('Collocations: %s' % corpus_model.matrix.nnz)
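
The help text above refers to querying the closest words to a given word; with glove-python this is typically done by loading a saved model and calling most_similar. A minimal sketch, where the model file name and the query word are assumptions:

    glove = Glove.load('glove.model')
    # List the words whose embeddings are closest to the query word.
    for word, similarity in glove.most_similar('physics', number=10):
        print('%s: %.3f' % (word, similarity))
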
Example #3
    with open('../../output/vocabs_100.txt', 'r') as vbf:
        for line in vbf.readlines():
            vocab.append(line.strip())

    # Build the dictionary and the co-occurrence matrix.
    dictionary = {}
    for i, word in enumerate(vocab):
        dictionary[word] = i
    corpus = []
    with open('../../input/wiki.500.txt', 'r') as cf:
        for line in cf.readlines():
            corpus.append([])
            for word in line.split():
                corpus[-1].append(word)
    corpus_obj = Corpus(dictionary=dictionary)
    corpus_obj.fit(corpus, window=10, ignore_missing=True)  # yields a sparse upper-triangular matrix
    corpus_obj.save('../../output/corpus_obj')
    # corpus_obj = Corpus.load('../../output/corpus_obj')  # loads self.dictionary and self.matrix
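    # Corpus.fit stores only the upper triangle of the co-occurrence counts;
    # densify it and mirror the values so the matrix used below is fully symmetric.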
    corpus_obj.matrix = corpus_obj.matrix.toarray()
    for i in range(corpus_obj.matrix.shape[0]):
        for j in range(i + 1, corpus_obj.matrix.shape[0]):
            if corpus_obj.matrix[i][j] > 0.:
                corpus_obj.matrix[j][i] = corpus_obj.matrix[i][j]

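    # xmax and alpha correspond to the weighting function in the GloVe paper,
    # f(x) = (x / xmax) ** alpha for x < xmax and 1 otherwise, which caps the
    # influence of very frequent co-occurrences.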
    glove = GloVe(n=100,
                  xmax=100,
                  alpha=0.75,
                  max_iter=20000,
                  learning_rate=0.05,
                  tol=1e-4,
                  display_progress=100,