Example 1
def bagOfWords(dataset):
    sp = SetProcessing()
    datalist = sp.convertDataToList(dataset)
    japanese, korean, mandarin = sp.organizeEasternLanguages(datalist)
    datalist = datalist[870:970]
    pairs = sp.buildSpeakingLearningPairs(datalist)
    print(pairs)
    entries = []
    langs = []
    korean = korean[:10]
    japanese = japanese[:10]

    for s in korean:
        datalist.append(s)
    for fr in japanese:
        datalist.append(fr)

    for data in datalist:
        entries.append(data[sp.ENTRY])
        langs.append(data[sp.SPEAKING])

    print(langs)

    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(entries)
    tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
    X_train_tfidf = tfidf.transform(X_train_counts)
    X_train_tfidf = X_train_tfidf.toarray()

    tree = SGDClassifier()
    tree.fit(X_train_tfidf, langs)
    # Note: predictions are made on the training matrix itself, so this is
    # training accuracy rather than held-out performance.
    result = tree.predict(X_train_tfidf)
    print(np.mean(result == langs))
    # classification_report expects one display name per class, not one per sample
    print(metrics.classification_report(langs, result, target_names=sorted(set(langs))))
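
For comparison, a minimal sketch of the same bag-of-words pipeline evaluated on a held-out split (the train_entries/train_langs/test_entries/test_langs variables are assumed here, not part of the example above):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
import numpy as np

# Fit the vocabulary and IDF weights on the training text only,
# then reuse the fitted transformers on the held-out entries.
vect = CountVectorizer()
X_train_counts = vect.fit_transform(train_entries)
tfidf = TfidfTransformer(use_idf=True).fit(X_train_counts)
clf = SGDClassifier().fit(tfidf.transform(X_train_counts), train_langs)

X_test = tfidf.transform(vect.transform(test_entries))
predicted = clf.predict(X_test)
print(np.mean(predicted == np.asarray(test_langs)))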
Example 2
def w2vRetrieve(datalist, language_tag=None):
    if language_tag is None: 
        modelpath = MODEL_PATH
    else:
        modelpath = 'wordvecs/dim%s_window%s_%s' % (VEC_DIM, CONTEXT, language_tag)
    sp = SetProcessing()
    t0 = time()
    sentences = []
    counter = 0
    for value in datalist:
        if counter % 100 == 0:  # report progress every 100 processed entries
            print("currently looked at %s sets" % counter)
        if isWestern(value[sp.STUDYING]):
            sentences.append(w2vWestern(value[sp.ENTRY]))
            counter += 1
        else:
            sentences.append(w2vEastern(value[sp.ENTRY]))
            counter += 1
    wordvec = Word2Vec(sentences, size=VEC_DIM, window=CONTEXT, alpha=ALPHA, workers=WORKERS, min_count=FREQ_SKIP)
    wordvec.init_sims(replace=True)
    wordvec.save(modelpath)
    print("Saved word2vec model in %s seconds!" % (time()-t0))
    return wordvec
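
A saved model from w2vRetrieve can be reloaded and queried later; a minimal sketch, assuming MODEL_PATH points at a file written by wordvec.save() above and a gensim version matching the size=/init_sims API used in the example:

from gensim.models import Word2Vec

model = Word2Vec.load(MODEL_PATH)           # reload the persisted word2vec model
print(model.most_similar('study', topn=5))  # 'study' is an illustrative query word; any in-vocabulary token works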
Example 3
if __name__ == '__main__':
    s = {
        'fold': 3,  # 5 folds 0,1,2,3,4
        'lr': 0.0627142536696559,
        'verbose': 1,
        'decay': False,  # decay on the learning rate if improvement stops
        'win': 7,  # number of words in the context window
        'bs': 9,  # number of backprop through time steps
        'nhidden': 100,  # number of hidden units
        'seed': 345,
        'emb_dimension': 100,  # dimension of word embedding
        'nepochs': 50,
        'vocab_size': 1000
    }

    sp = SetProcessing()
    train = sp.convertDataToList(sp.train)
    dev = sp.convertDataToList(sp.dev)
    test = sp.convertDataToList(sp.test)

    train_x, train_y = sp.returnEntriesWithSpoken(train)
    dev_x, dev_y = sp.returnEntriesWithSpoken(dev)
    test_x, test_y = sp.returnEntriesWithSpoken(test)

    # list.append returns None and would nest the dev split as a single element;
    # concatenate the lists to merge the train and dev data instead
    all_train_x = train_x + dev_x
    all_train_y = train_y + dev_y

    np.random.seed(s['seed'])
    random.seed(s['seed'])
    '''
        nh :: dimension of the hidden layer
Example 4
	scores = cross_validation.cross_val_score(clf, train_x, train_y, scoring='f1_weighted')

if __name__ == '__main__':
	s = {'fold':3, # 5 folds 0,1,2,3,4
         'lr':0.0627142536696559,
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'win':7, # number of words in the context window
         'bs':9, # number of backprop through time steps
         'nhidden':100, # number of hidden units
         'seed':345,
         'emb_dimension':100, # dimension of word embedding
         'nepochs':50,
         'vocab_size':1000}

	sp = SetProcessing()
	train = sp.convertDataToList(sp.train)
	dev = sp.convertDataToList(sp.dev)
	test = sp.convertDataToList(sp.test)

	train_x, train_y = sp.returnEntriesWithSpoken(train)
	dev_x, dev_y = sp.returnEntriesWithSpoken(dev)
	test_x, test_y = sp.returnEntriesWithSpoken(test)

	# concatenate rather than append: list.append returns None
	all_train_x = train_x + dev_x
	all_train_y = train_y + dev_y

	np.random.seed(s['seed'])
	random.seed(s['seed'])

	'''
Example 5
                                      ('edit_dist', object)])
        for i, pair in enumerate(pairs):
            spoken, count = pair
            features['spoken'][i] = spoken
            features['edit_dist'][i] = count
        return features


if __name__ == '__main__':
    train, dev, test = returnDatasets()
    ''' Set up the classes. '''
    #gfe = GiveawayFeatureExtraction()
    #sfe = SocialFeatureExtraction()
    sse = SyntacticStructExtraction()
    ce = CorrectionExtraction()
    sp = SetProcessing()

    datalist = sp.convertDataToList(train)
    #dev = sp.convertDataToList(dev)
    #test = sp.convertDataToList(test)
    #merged = sp.mergeLists(train, dev, test)
    #english, french, spanish, japanese, korean, mandarin = sp.returnSplitDatasets(train, 5, False)
    '''Return the individual sets by native language.'''
    '''Takes approx. 1 second.'''
    print("Collecting test sets...")
    western_native, eastern_native = sp.organizeDataByRegion(train)
    english_native, french_native, spanish_native = sp.organizeWesternLanguages(
        western_native)
    japanese_native, korean_native, mandarin_native = sp.organizeEasternLanguages(
        eastern_native)
    '''Return the individual sets by language being studied.'''
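
The feature array built at the start of this example uses a NumPy structured dtype; a minimal sketch of that pattern with illustrative pairs (the 'spoken' and 'edit_dist' field names follow the fragment shown above):

import numpy as np

pairs = [('English', 2), ('Japanese', 0)]  # illustrative (spoken, edit_dist) pairs
features = np.zeros(len(pairs), dtype=[('spoken', object),
                                       ('edit_dist', object)])
for i, (spoken, count) in enumerate(pairs):
    features['spoken'][i] = spoken
    features['edit_dist'][i] = count
print(features['spoken'], features['edit_dist'])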
Example 6
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, SGDRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

import numpy as np
import pickle
import itertools
import random

sp = SetProcessing()
'''TODO'''
n_samples = 2000
n_features = 500
n_languages = len(languages())


def runSGDPipeline(entries, langs):
    t0 = time()
    sgd_pipeline = Pipeline([('vect',
                              CountVectorizer(ngram_range=(1, 1),
                                              max_features=n_features)),
                             ('tfidf', TfidfTransformer(use_idf=True)),
                             ('clf',
                              SGDClassifier(loss='squared_hinge',
                                            penalty='l2',