def __init__(self, corrections=None, spoken=None, target=None):
    """Initialize dataset paths/splits, per-language tallies, and a correction helper.

    corrections, spoken, target -- passed straight through to CorrectionHelper;
    semantics are defined by that class (not visible here).
    """
    # Tallies of languages users know / are learning start out empty.
    self.languages_known = Counter()
    self.languages_learning = Counter()
    # Full list of supported languages (project helper).
    self.languages = languages()
    # Resolve the on-disk locations and load the three dataset splits.
    self.train_path, self.dev_path, self.test_path = filepaths()
    self.trainset, self.devset, self.testset = returnDatasets()
    # Helper encapsulating correction/spoken/target handling.
    self.ch = CorrectionHelper(corrections, spoken, target)
def languageChoices(self):
    """Return the list of supported language choices (delegates to utils)."""
    choices = utils.languages()
    return choices
# --- Training-script fragment: appears to start mid-function. `sp`, `test`,
# `train_x`/`dev_x`, `train_y`/`dev_y`, `s`, `train_lex`, `train_ne`,
# `nsentences`, `shuffle`, `contextwin`, `minibatch`, `RNN`, `languages`
# are all defined earlier in the file — not visible here. ---

# Evaluation entries/labels restricted to "spoken" data (project helper).
test_x, test_y = sp.returnEntriesWithSpoken(test)

# NOTE(review): list.append returns None, so all_train_x / all_train_y are
# bound to None; it also appends the dev set as ONE nested element. The
# intent was presumably `train_x + dev_x` / `train_y + dev_y` — confirm
# against how these names are used later before fixing.
all_train_x = train_x.append([i for i in dev_x])
all_train_y = train_y.append([j for j in dev_y])

# Seed both RNG sources for reproducible shuffling/initialization.
np.random.seed(s['seed'])
random.seed(s['seed'])

'''
nh :: dimension of the hidden layer
nc :: number of classes
ne :: number of word embeddings in the vocabulary
de :: dimension of the word embeddings
cs :: word window context size
'''
rnn = RNN(nh=s['nhidden'],
          nc=len(languages()),
          ne=s['vocab_size'],
          de=s['emb_dimension'],
          cs=s['win'])

# Best validation F1 seen so far; 'clr' is the current learning rate
# (presumably decayed later in the loop — not visible in this chunk).
best_f1 = -numpy.inf  # NOTE(review): mixes `numpy` with the `np` alias used above
s['clr'] = s['lr']

for e in range(s['nepochs']):
    # shuffle the parallel training arrays in unison each epoch
    shuffle([train_lex, train_ne, train_y], s['seed'])
    s['ce'] = e            # record current epoch number
    tic = time.time()      # epoch timer (reported later, outside this view)
    for i in xrange(nsentences):  # Python 2 (`xrange`)
        # Context-window encoding of sentence i, cut into minibatches of
        # int32 index arrays for the RNN. Loop body appears to continue
        # beyond this chunk.
        cwords = contextwin(train_lex[i], s['win'])
        words = map(lambda x: numpy.asarray(x).astype('int32'),
                    minibatch(cwords, s['bs']))
# --- Module fragment: `CountVectorizer`, `SGDClassifier`, `SetProcessing`,
# `languages`, and `time` appear to be imported/defined earlier in the file. ---
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
# NOTE(review): `metrics` is not importable from sklearn.metrics — likely
# meant `from sklearn import metrics`; confirm and fix.
from sklearn.metrics import metrics
# NOTE(review): sklearn.grid_search was removed in sklearn 0.20
# (use sklearn.model_selection) — this file targets an old sklearn.
from sklearn.grid_search import GridSearchCV
import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()

'''TODO'''
# Experiment-size knobs.
n_samples = 2000
n_features = 500
n_languages = len(languages())


def runSGDPipeline(entries, langs):
    """Fit an SGD classifier on count/tf-idf features of `entries`.

    entries -- iterable of raw text documents
    langs   -- parallel iterable of language labels

    (Fragment: the function appears to continue past this chunk — the
    constructed `sgd_pipeline` is never fitted within the visible code.)
    """
    t0 = time()  # assumes `from time import time` earlier — TODO confirm

    # Pipeline equivalent of the manual steps below (built, not fitted here).
    sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1),
                                                      max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=0.001, n_iter=5,
                                                   random_state=42))])

    # Manual feature extraction: raw counts -> tf-idf weights.
    vect = CountVectorizer(ngram_range=(1, 1), max_features=n_features)
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    # NOTE(review): fit() followed by fit_transform() refits the transformer;
    # transform() alone would suffice (same result here, wasted work).
    X_train_tfidf = tfidf.fit_transform(X_train_counts)

    clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001,
                        n_iter=5, random_state=42)
    clf.fit(X_train_tfidf, langs)
# --- Near-duplicate of the training fragment above; starts mid-function
# (`train_x`, `dev_x`, `s`, `train_lex`, `train_ne`, `train_y`,
# `nsentences`, and the helpers are defined elsewhere in the file). ---

# NOTE(review): list.append returns None, so both names below are bound to
# None, and the dev set is appended as a single nested element. Likely
# intended `train_x + dev_x` / `train_y + dev_y` — confirm before fixing.
all_train_x = train_x.append([i for i in dev_x])
all_train_y = train_y.append([j for j in dev_y])

# Seed both RNG sources for reproducibility.
np.random.seed(s['seed'])
random.seed(s['seed'])

'''
nh :: dimension of the hidden layer
nc :: number of classes
ne :: number of word embeddings in the vocabulary
de :: dimension of the word embeddings
cs :: word window context size
'''
rnn = RNN(nh=s['nhidden'],
          nc=len(languages()),
          ne=s['vocab_size'],
          de=s['emb_dimension'],
          cs=s['win'])

# Best validation F1 so far; 'clr' holds the current learning rate
# (decay, if any, happens outside this view).
best_f1 = -numpy.inf  # NOTE(review): mixes `numpy` with the `np` alias used above
s['clr'] = s['lr']

for e in range(s['nepochs']):
    # shuffle the parallel training arrays in unison each epoch
    shuffle([train_lex, train_ne, train_y], s['seed'])
    s['ce'] = e            # record current epoch number
    tic = time.time()      # epoch timer
    for i in xrange(nsentences):  # Python 2 (`xrange`)
        # Context-window encoding of sentence i, minibatched as int32
        # index arrays; loop body appears to continue past this chunk.
        cwords = contextwin(train_lex[i], s['win'])
        words = map(lambda x: numpy.asarray(x).astype('int32'),
                    minibatch(cwords, s['bs']))
def homepage(request):
    """Build the template context for the homepage view.

    Exposes whether the requester is anonymous (as 0/1) and the list of
    supported languages.
    """
    user_id = authenticated_userid(request)
    context = {
        'isanon': 1 if user_id else 0,
        'languages': utils.languages(),
    }
    return context
# --- Near-duplicate of the module header above; `SGDClassifier`,
# `SetProcessing`, `languages`, and `time` come from earlier in the file. ---
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
# NOTE(review): `metrics` is not importable from sklearn.metrics — likely
# meant `from sklearn import metrics`; confirm and fix.
from sklearn.metrics import metrics
# NOTE(review): sklearn.grid_search was removed in sklearn 0.20
# (use sklearn.model_selection) — this file targets an old sklearn.
from sklearn.grid_search import GridSearchCV
import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()

'''TODO'''
# Experiment-size knobs.
n_samples = 2000
n_features = 500
n_languages = len(languages())


def runSGDPipeline(entries, langs):
    """Build a count -> tf-idf -> SGD-classifier pipeline.

    entries -- iterable of raw text documents
    langs   -- parallel iterable of language labels

    (Fragment: the function is truncated here — the pipeline is constructed
    but neither fitted nor returned within the visible code.)
    """
    t0 = time()  # assumes `from time import time` earlier — TODO confirm
    sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1),
                                                      max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                                   alpha=0.001, n_iter=5,
                                                   random_state=42))])