Example no. 1
0
 def __init__(self, corrections=None, spoken=None, target=None):
     self.train_path, self.dev_path, self.test_path = filepaths()
     self.trainset, self.devset, self.testset = returnDatasets()
     self.languages_known = Counter()
     self.languages_learning = Counter()
     self.languages = languages()
     self.ch = CorrectionHelper(corrections, spoken, target)
Example no. 2
0
	def __init__(self, corrections=None, spoken=None, target=None):
		self.train_path, self.dev_path, self.test_path = filepaths()
		self.trainset, self.devset, self.testset = returnDatasets()
		self.languages_known = Counter()
		self.languages_learning = Counter()
		self.languages = languages()
		self.ch = CorrectionHelper(corrections, spoken, target)
Example no. 3
0
 def languageChoices(self):
     return utils.languages()
Example no. 4
0
    test_x, test_y = sp.returnEntriesWithSpoken(test)

    all_train_x = train_x + list(dev_x)  # concatenate train and dev entries (list.append would return None)
    all_train_y = train_y + list(dev_y)

    np.random.seed(s['seed'])
    random.seed(s['seed'])
    '''
    nh :: dimension of the hidden layer
    nc :: number of classes
    ne :: number of word embeddings in the vocabulary
    de :: dimension of the word embeddings
    cs :: word window context size
    '''
    rnn = RNN(nh=s['nhidden'],
              nc=len(languages()),
              ne=s['vocab_size'],
              de=s['emb_dimension'],
              cs=s['win'])

    best_f1 = -numpy.inf
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle
        shuffle([train_lex, train_ne, train_y], s['seed'])
        s['ce'] = e
        tic = time.time()
        for i in xrange(nsentences):
            cwords = contextwin(train_lex[i], s['win'])
            words  = map(lambda x: numpy.asarray(x).astype('int32'),\
                minibatch(cwords, s['bs']))
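
The loop above calls two helpers, contextwin and minibatch, that this excerpt does not show. A minimal sketch of one plausible implementation follows, modelled on the standard Theano RNN slot-filling tutorial that this training loop resembles; the -1 padding index and the cumulative batching scheme are assumptions, not taken from this code.

def contextwin(l, win):
    # Build a window of `win` word indices around every position in `l`,
    # padding the sentence borders with -1 (assumed padding index).
    assert win % 2 == 1 and win >= 1
    l = list(l)
    lpadded = (win // 2) * [-1] + l + (win // 2) * [-1]
    out = [lpadded[i:i + win] for i in range(len(l))]
    assert len(out) == len(l)
    return out

def minibatch(l, bs):
    # Return one growing slice per position: prefixes of up to `bs` elements,
    # then sliding windows of length `bs`, so the net sees an incrementally
    # longer left context at each update.
    out = [l[:i] for i in range(1, min(bs, len(l)) + 1)]
    out += [l[i - bs:i] for i in range(bs + 1, len(l) + 1)]
    assert len(out) == len(l)
    return out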
Example no. 5
0
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from time import time

import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()

'''TODO'''
n_samples = 2000
n_features = 500
n_languages = len(languages())

def runSGDPipeline(entries, langs):
	t0 = time()
	sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1,1), max_features=n_features)),
                      ('tfidf', TfidfTransformer(use_idf=True)),
                      ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                            alpha=0.001, n_iter=5, random_state=42))])

	vect = CountVectorizer(ngram_range=(1,1), max_features=n_features)
	X_train_counts = vect.fit_transform(entries)
	tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
	X_train_tfidf = tfidf.transform(X_train_counts)

	clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.001, n_iter=5, random_state=42)
	clf.fit(X_train_tfidf, langs)
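
For reference, a minimal, self-contained sketch of fitting and evaluating a TF-IDF + SGDClassifier pipeline like the one built in runSGDPipeline follows. The toy corpus, the language labels and the held-out split are invented placeholders, not data from this project, and the iteration count is left at its default so the sketch runs on both old and new scikit-learn versions.

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics

# Toy stand-ins for the real entries/langs pairs used above.
train_entries = ["I am learn English", "je veux apprendre le francais",
                 "I study English every day", "le chat dort sur la table"]
train_langs = ["English", "French", "English", "French"]
dev_entries = ["I am study English", "le francais est difficile"]
dev_langs = ["English", "French"]

pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1), max_features=500)),
                     ('tfidf', TfidfTransformer(use_idf=True)),
                     ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',
                                           alpha=0.001, random_state=42))])

pipeline.fit(train_entries, train_langs)
predicted = pipeline.predict(dev_entries)
print(metrics.classification_report(dev_langs, predicted))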
Example no. 6
0
	all_train_x = train_x + list(dev_x)  # concatenate train and dev entries (list.append would return None)
	all_train_y = train_y + list(dev_y)

	np.random.seed(s['seed'])
	random.seed(s['seed'])

	'''
	nh :: dimension of the hidden layer
	nc :: number of classes
	ne :: number of word embeddings in the vocabulary
	de :: dimension of the word embeddings
	cs :: word window context size
	'''
	rnn = RNN(	nh = s['nhidden'],
				nc = len(languages()),
				ne = s['vocab_size'],
				de = s['emb_dimension'],
				cs = s['win'])

	best_f1 = -numpy.inf
	s['clr'] = s['lr']
	for e in range(s['nepochs']):
		# shuffle
		shuffle([train_lex, train_ne, train_y], s['seed'])
		s['ce'] = e
		tic = time.time()
		for i in xrange(nsentences):
			cwords = contextwin(train_lex[i], s['win'])
			words  = map(lambda x: numpy.asarray(x).astype('int32'),\
							minibatch(cwords, s['bs']))
Example no. 7
0
def homepage(request):
    return {
        'isanon': 1 if authenticated_userid(request) else 0,
        'languages': utils.languages(),
    }
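
The homepage function above is a Pyramid-style view callable that returns a dict for a renderer. A hedged sketch of how such a view could be registered follows; the route name 'home', the URL pattern and the choice of the built-in 'json' renderer are illustrative assumptions, not taken from this project.

from pyramid.config import Configurator

def main(global_config, **settings):
    config = Configurator(settings=settings)
    config.add_route('home', '/')
    # Attach the homepage() view above to the route; the built-in 'json'
    # renderer serialises the returned dict into the response body.
    config.add_view(homepage, route_name='home', renderer='json')
    return config.make_wsgi_app()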
Example no. 8
0
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from time import time

import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()
'''TODO'''
n_samples = 2000
n_features = 500
n_languages = len(languages())


def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect',
                              CountVectorizer(ngram_range=(1, 1),
                                              max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf',
                              SGDClassifier(loss='squared_hinge',
                                            penalty='l2',
                                            alpha=0.001,
                                            n_iter=5,
                                            random_state=42))])