import gensim
from word2veckeras.doc2veckeras import SentenceClassifier,Doc2VecClassifier
from word2veckeras.treebank import TreeBank

# Load the treebank stored under ./trees, train a sentence classifier on the
# 'train' split and report its score on the held-out 'test' split.
treebank = TreeBank('./trees')

# NOTE(review): only_root=False / pos_neg_label=False presumably select every
# sub-phrase with its fine-grained label -- confirm against
# TreeBank.sents_labels before relying on this.
Xtest, Ytest = treebank.sents_labels('test', only_root=False, pos_neg_label=False)
X, Y = treebank.sents_labels('train', only_root=False, pos_neg_label=False)

# Disabled alternative: train on the first 300 samples of the 'dev' split.
# X, Y = treebank.sents_labels('dev', only_root=False, pos_neg_label=False)
# n_sample = 300
# X = X[:n_sample]
# Y = Y[:n_sample]

# Wrap a default-configured gensim Doc2Vec model in the classifier.
clf1 = SentenceClassifier(doc2vec=gensim.models.doc2vec.Doc2Vec())
clf1.fit(X, Y)
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(clf1.score(Xtest, Ytest))


# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back to the old module on older installs.
try:
    from sklearn.model_selection import GridSearchCV, ParameterSampler, ParameterGrid
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV, ParameterSampler, ParameterGrid

# Exhaustive grid over Doc2VecClassifier parameters:
# 1 * 2 * 3 * 2 * 2 * 2 * 1 = 48 combinations, each evaluated with 3-fold CV.
tuned_parameters = [{
    'dm': [1],
    'dm_concat': [0, 1],
    'size': [200, 300, 400],
    'window': [4, 8],
    'min_count': [0, 9],
    'sample': [0, 1e-5],
    'iter': [1],
}]
# Smaller grid for quick experiments:
# tuned_parameters = [{'dm': [0, 1], 'size': [100, 200]}]

# Search on the training data (X, Y) using 4 parallel jobs.
clf2 = GridSearchCV(Doc2VecClassifier(), tuned_parameters, cv=3, n_jobs=4, verbose=1)
clf2.fit(X, Y)

# Report the winning estimator, its parameters and its cross-validated score.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(clf2.best_estimator_)
print(clf2.best_params_)
print(clf2.best_score_)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Licensed under the GNU Affero General Public License, version 3 - http://www.gnu.org/licenses/agpl-3.0.html

import nltk
import gensim
from word2veckeras.doc2veckeras import SentenceClassifier,Doc2VecClassifier

# Build a small labelled data set from the Brown corpus: up to n_sample
# sentences per genre, each labelled with the index of its genre in `genres`.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
n_sample = 40

# One flat comprehension instead of sum(list_of_lists, []), which re-copies
# the accumulated list on every addition (quadratic in the number of genres).
sents_labels = [
    (sent, i_g)
    for i_g, genre in enumerate(genres)
    for sent in nltk.corpus.brown.sents(categories=[genre])[:n_sample]
]
X = [sent for sent, _label in sents_labels]
Y = [label for _sent, label in sents_labels]

# Wrap a default-configured gensim Doc2Vec model in the classifier.
clf1 = SentenceClassifier(doc2vec=gensim.models.doc2vec.Doc2Vec())
clf1.fit(X, Y)
# The score is computed on the very data the classifier was fitted on.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(clf1.score(X, Y))

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back to the old module on older installs.
try:
    from sklearn.model_selection import GridSearchCV, ParameterSampler, ParameterGrid
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV, ParameterSampler, ParameterGrid

# Exhaustive grid over Doc2VecClassifier parameters:
# 1 * 2 * 3 * 2 * 2 * 2 * 1 = 48 combinations, each evaluated with 3-fold CV.
tuned_parameters = [{
    'dm': [1],
    'dm_concat': [0, 1],
    'size': [200, 300, 400],
    'window': [4, 8],
    'min_count': [0, 9],
    'sample': [0, 1e-5],
    'iter': [1],
}]
# Smaller grid for quick experiments:
# tuned_parameters = [{'dm': [1], 'size': [200, 300, 400]}]

# n_jobs=-1 asks scikit-learn to use every available CPU core.
clf2 = GridSearchCV(Doc2VecClassifier(), tuned_parameters, cv=3, n_jobs=-1, verbose=1)
clf2.fit(X, Y)

# Report the winning estimator, its parameters, its cross-validated score and
# finally its score on the data it was fitted on.
# Single-argument print(...) behaves the same on Python 2 and 3.
print(clf2.best_estimator_)
print(clf2.best_params_)
print(clf2.best_score_)
print(clf2.score(X, Y))
# Example no. 3
0
from word2veckeras.doc2veckeras import SentenceClassifier, Doc2VecClassifier
from word2veckeras.treebank import TreeBank

# Load the treebank stored under ./trees, train a sentence classifier on the
# 'train' split and report its score on the held-out 'test' split.
treebank = TreeBank('./trees')

# NOTE(review): only_root=False / pos_neg_label=False presumably select every
# sub-phrase with its fine-grained label -- confirm against
# TreeBank.sents_labels before relying on this.
Xtest, Ytest = treebank.sents_labels('test',
                                     only_root=False,
                                     pos_neg_label=False)

X, Y = treebank.sents_labels('train', only_root=False, pos_neg_label=False)

# Disabled alternative: train on the first 300 samples of the 'dev' split.
# X, Y = treebank.sents_labels('dev', only_root=False, pos_neg_label=False)
# n_sample = 300
# X = X[:n_sample]
# Y = Y[:n_sample]

# Wrap a default-configured gensim Doc2Vec model in the classifier.
clf1 = SentenceClassifier(doc2vec=gensim.models.doc2vec.Doc2Vec())
clf1.fit(X, Y)
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(clf1.score(Xtest, Ytest))

# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back to the old module on older installs.
try:
    from sklearn.model_selection import GridSearchCV, ParameterSampler, ParameterGrid
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import GridSearchCV, ParameterSampler, ParameterGrid

# Base estimator whose parameters are listed in the grid below.
clf2 = Doc2VecClassifier()

# Candidate values per parameter; the full cross product is
# 1 * 2 * 3 * 2 * 2 * 2 * 1 = 48 combinations.
# NOTE(review): the keys look like gensim Doc2Vec constructor arguments
# forwarded by Doc2VecClassifier -- confirm against word2veckeras.
tuned_parameters = [{
    'dm': [1],
    'dm_concat': [0, 1],
    'size': [200, 300, 400],
    'window': [4, 8],
    'min_count': [0, 9],
    'sample': [0, 1e-5],
    'iter': [1]
}]