def bagOfWords(dataset):
    sp = SetProcessing()
    datalist = sp.convertDataToList(dataset)
    japanese, korean, mandarin = sp.organizeEasternLanguages(datalist)
    datalist = datalist[870:970]
    pairs = sp.buildSpeakingLearningPairs(datalist)
    print(pairs)

    # Pad the 100-entry slice with ten Korean and ten Japanese sets.
    korean = korean[:10]
    japanese = japanese[:10]
    datalist.extend(korean)
    datalist.extend(japanese)

    entries = []
    langs = []
    for data in datalist:
        entries.append(data[sp.ENTRY])
        langs.append(data[sp.SPEAKING])
    print(langs)

    # Bag-of-words counts, reweighted by TF-IDF.
    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts).toarray()

    clf = SGDClassifier()
    clf.fit(X_train_tfidf, langs)
    result = clf.predict(X_train_tfidf)
    print(np.mean(result == langs))
    # target_names must be the unique class labels, not the per-sample list.
    print(metrics.classification_report(langs, result,
                                        target_names=sorted(set(langs))))
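# NOTE: bagOfWords() above scores the classifier on the same TF-IDF matrix it was
# trained on, so the printed accuracy is a training score. Below is a minimal sketch
# of the same bag-of-words pipeline with a held-out split. The toy entries/langs
# data is invented for illustration, and train_test_split moved to
# sklearn.model_selection in newer scikit-learn releases.
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn import metrics

entries = ["I go to school yesterday",
           "Yesterday I at school was",
           "She not like cold weather",
           "He every day English study"] * 25   # 100 toy journal entries
langs = ["Korean", "Japanese", "Korean", "Japanese"] * 25

X_train, X_test, y_train, y_test = train_test_split(
    entries, langs, test_size=0.2, random_state=42)

vect = TfidfVectorizer(use_idf=True)         # CountVectorizer + TfidfTransformer in one step
X_train_tfidf = vect.fit_transform(X_train)
X_test_tfidf = vect.transform(X_test)        # transform only: the test set stays unseen

clf = SGDClassifier()
clf.fit(X_train_tfidf, y_train)
predicted = clf.predict(X_test_tfidf)
print(metrics.classification_report(y_test, predicted,
                                    target_names=sorted(set(y_test))))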
def w2vRetrieve(datalist, language_tag=None):
    if language_tag is None:
        modelpath = MODEL_PATH
    else:
        modelpath = 'wordvecs/dim%s_window%s_%s' % (VEC_DIM, CONTEXT, language_tag)
    sp = SetProcessing()
    t0 = time()

    sentences = []
    counter = 0
    for value in datalist:
        if counter % 100 == 0:  # '% 100 is 0' tested identity, not equality
            print("currently looked at %s sets" % counter)
        # Tokenize with the script-appropriate tokenizer.
        if isWestern(value[sp.STUDYING]):
            sentences.append(w2vWestern(value[sp.ENTRY]))
        else:
            sentences.append(w2vEastern(value[sp.ENTRY]))
        counter += 1

    wordvec = Word2Vec(sentences, size=VEC_DIM, window=CONTEXT, alpha=ALPHA,
                       workers=WORKERS, min_count=FREQ_SKIP)
    wordvec.init_sims(replace=True)
    wordvec.save(modelpath)
    print("Saved word2vec model in %s seconds!" % (time() - t0))
    return wordvec
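# A quick usage sketch for a model saved by w2vRetrieve(), using the same pre-1.0
# gensim API as the Word2Vec(size=..., window=...) call above. The model path and
# the query word are illustrative. Note that init_sims(replace=True) discards the
# un-normalized vectors, so the saved model can be queried but not trained further.
from gensim.models import Word2Vec

model = Word2Vec.load('wordvecs/dim100_window7_Korean')  # illustrative path
print(model.most_similar('school', topn=5))  # nearest neighbours in embedding space
print(model['school'].shape)                 # the raw VEC_DIM-dimensional vector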
scores = cross_validation.cross_val_score(clf, train_x, train_y, scoring='f1_weighted')

if __name__ == '__main__':
    s = {
        'fold': 3,             # 5 folds: 0, 1, 2, 3, 4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,        # decay the learning rate if improvement stops
        'win': 7,              # number of words in the context window
        'bs': 9,               # number of backprop-through-time steps
        'nhidden': 100,        # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of the word embeddings
        'nepochs': 50,
        'vocab_size': 1000
    }

    sp = SetProcessing()
    train = sp.convertDataToList(sp.train)
    dev = sp.convertDataToList(sp.dev)
    test = sp.convertDataToList(sp.test)

    train_x, train_y = sp.returnEntriesWithSpoken(train)
    dev_x, dev_y = sp.returnEntriesWithSpoken(dev)
    test_x, test_y = sp.returnEntriesWithSpoken(test)

    # list.append() returns None (and would nest the list); concatenate instead.
    all_train_x = train_x + dev_x
    all_train_y = train_y + dev_y

    np.random.seed(s['seed'])
    random.seed(s['seed'])

    '''
    nh :: dimension of the hidden layer
    '''
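# For reference, a self-contained sketch of feeding the merged train+dev lists into
# the cross_val_score call at the top of this excerpt. The toy lists and the
# CountVectorizer step are assumptions; clf and the real feature matrix are built
# elsewhere in the file.
from sklearn import cross_validation  # sklearn.model_selection in newer releases
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

train_x = ["I go to school", "She like music", "He study English", "We eat rice"] * 25
train_y = ["Korean", "Japanese", "Korean", "Japanese"] * 25
dev_x = ["They play soccer", "I read a book"] * 10
dev_y = ["Japanese", "Korean"] * 10

all_train_x = train_x + dev_x  # concatenation keeps a flat list
all_train_y = train_y + dev_y

X = CountVectorizer().fit_transform(all_train_x)
clf = SGDClassifier()
scores = cross_validation.cross_val_score(clf, X, all_train_y,
                                          cv=5, scoring='f1_weighted')
print(scores.mean(), scores.std())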
                         ('edit_dist', object)])
    for i, pair in enumerate(pairs):
        spoken, count = pair
        features['spoken'][i] = spoken
        features['edit_dist'][i] = count
    return features


if __name__ == '__main__':
    train, dev, test = returnDatasets()

    '''Set up the classes.'''
    #gfe = GiveawayFeatureExtraction()
    #sfe = SocialFeatureExtraction()
    sse = SyntacticStructExtraction()
    ce = CorrectionExtraction()
    sp = SetProcessing()

    datalist = sp.convertDataToList(train)
    #dev = sp.convertDataToList(dev)
    #test = sp.convertDataToList(test)
    #merged = sp.mergeLists(train, dev, test)
    #english, french, spanish, japanese, korean, mandarin = sp.returnSplitDatasets(train, 5, False)

    '''Return the individual sets by native language. Takes approx. 1 second.'''
    print("Collecting test sets...")
    western_native, eastern_native = sp.organizeDataByRegion(train)
    english_native, french_native, spanish_native = sp.organizeWesternLanguages(western_native)
    japanese_native, korean_native, mandarin_native = sp.organizeEasternLanguages(eastern_native)

    '''Return the individual sets by language being studied.'''
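# The feature-builder at the top of this excerpt fills a NumPy structured array.
# A minimal sketch of how such an array is declared and consumed; the dtype
# declaration is reconstructed by assumption from the surviving
# ('edit_dist', object)] tail, and the pairs are invented.
import numpy as np

pairs = [('Korean', 3), ('Japanese', 7), ('Mandarin', 2)]  # (language, edit distance)
features = np.zeros(len(pairs),
                    dtype=[('spoken', object), ('edit_dist', object)])
for i, (spoken, count) in enumerate(pairs):
    features['spoken'][i] = spoken
    features['edit_dist'][i] = count

print(features['spoken'])     # ['Korean' 'Japanese' 'Mandarin']
print(features['edit_dist'])  # [3 7 2]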
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics  # 'from sklearn.metrics import metrics' is not a valid import
from sklearn.grid_search import GridSearchCV
import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()

'''TODO'''
n_samples = 2000
n_features = 500
n_languages = len(languages())


def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1),
                                                      max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf', SGDClassifier(loss='squared_hinge', penalty='l2',