Esempio n. 1
0
def get_best_model_params(model, data, labels, model_type='b'):
    """Fit ``model`` on the training split and return the fitted estimator.

    The training features returned by ``get_data_sets`` are vectorized with a
    fresh ``dv()`` and densified before fitting. The held-out split is not
    used by this function.

    Args:
        model: Estimator to fit.
        data: Passed through to ``get_data_sets``.
        labels: Passed through to ``get_data_sets``.
        model_type: Key handed to ``get_hyper_parameters``. The value
            ``'gmm'`` skips the parameter search and fits ``model`` directly,
            without labels.

    Returns:
        ``model`` itself when ``model_type == 'gmm'``; otherwise the fitted
        ``gs`` search object built around ``model``.
    """
    x_train, _x_test, y_train, _y_test = get_data_sets(data, labels)
    train_matrix = dv().fit_transform(x_train).todense()

    is_gmm = model_type == 'gmm'
    if is_gmm:
        # No search and no labels on this path: the model is fitted as given.
        clf = model
        clf.fit(train_matrix)
    else:
        clf = gs(model, get_hyper_parameters(model_type))
        clf.fit(train_matrix, np.array(y_train))

    # Single-argument print(...) behaves the same under Python 2 and 3.
    print("Best parameters set found on development set:")
    print("")
    if not is_gmm:
        print(clf.best_estimator_)

    return clf
Esempio n. 2
0
def train():
    """Fit an SVM on two hand-written example rows and pickle it.

    The real training data (features and actions extracted from dependency
    parses) is not wired in yet; two example feature dicts stand in for it.

    Returns:
        The fitted ``svm.SVC`` classifier. As a side effect the classifier is
        also pickled to ``svm.pkl`` in the current working directory.
    """
    # TODO: for each sentence in the dependency parses, construct the tree.
    # Iterate through the sentence until the tree is formed; every iteration
    # considers several word pairs. For each pair, store its features as one
    # row and the correct action in the answer vector. Sketch:
    #     features, answers = make_tree('', sentence, True)
    #     array = vec.fit_transform(features).toarray()
    #     clf.fit(array, answers)

    # Example rows so that the SVM can be fitted.
    # Order of features -2,-1,0-,0+,1,2: pos,lex,ch-L-pos,ch-L-lex,ch-R-pos,ch-R-lex
    example_features1 = {
        'pos-2': ':', 'pos-1': 'NNS', 'pos-0': 'IN',
        # The last tag belongs to position 2; it used to be written under a
        # second 'pos1' key, which silently discarded the 'WP' tag.
        'pos0': 'NN', 'pos1': 'WP', 'pos2': ':',
        'lex-2': '-', 'lex-1': 'sellers', 'lex-0': 'of',
        'lex0': 'resort', 'lex1': 'who', 'lex2': '-',
        'chrrpos-1': 'DT', 'chrrpos0': 'JJ',
        'chrlex-1': 'the', 'chrlex0': 'last',
        'chlpos1': 'VBD', 'chllex1': 'were',
    }
    example_features2 = {
        'pos-2': 'NNS', 'pos-1': 'IN', 'pos-0': 'NN',
        'pos0': 'WP', 'pos1': ':',
        'lex-2': 'sellers', 'lex-1': 'of', 'lex-0': 'resort',
        'lex0': 'who', 'lex1': '-',
        'chrrpos-2': 'DT', 'chrrpos-0': 'JJ',
        'chrlex-2': 'the', 'chrlex-0': 'last',
        'chlpos0': 'VBD', 'chllex0': 'were',
    }
    # This feature matrix has two entries.
    example_features = [example_features1, example_features2]
    # One correct action (shift, right, left) per feature row.
    example_answers = np.array(['right', 'left'])

    # Vectorize the dicts to turn the string features into numerical values.
    vec = dv()
    example_array = vec.fit_transform(example_features).toarray()

    # TODO: a later goal: once we have the matrix, sort and split label
    # (left, right, split) data so that we can run three different models.

    # Fit the SVM and persist the model in a pickle.
    clf = svm.SVC()
    clf.fit(example_array, example_answers)
    # The context manager closes the file even if pickling fails.
    with open('svm.pkl', 'wb') as pkl:
        pickle.dump(clf, pkl)

    return clf
Esempio n. 3
0
def perform(target_label):
    """Return the mean accuracy of a 10-fold MultinomialNB run.

    Content predictions (``data_set/content_preds.csv`` without its
    ``target_label`` column) are inner-joined on ``profile_id`` with the label
    column from ``data_set/userinfo.csv``. In every stratified fold the
    training rows come from ``userinfo.csv`` and the test rows from the merged
    content predictions; both are vectorized together so that they share one
    feature space.

    Args:
        target_label: Name of the column to predict.

    Returns:
        Mean of the per-fold accuracy scores.

    Raises:
        ValueError: If the profile-id column cannot be located in the
            vectorized matrix of a fold.
    """
    data_pred = pd.read_csv('data_set/content_preds.csv', dtype=object).drop(target_label, axis=1)
    user_label = pd.read_csv('data_set/userinfo.csv', dtype=object)[['profile_id', target_label]]
    data_merged = pd.merge(data_pred, user_label, on='profile_id', how='inner')
    user_info = pd.read_csv('data_set/userinfo.csv', dtype=object, usecols=list(data_merged.columns))

    kf = StratifiedKFold(data_merged[target_label], n_folds=10, shuffle=True)
    list_of_each_fold_score = list()

    for tr_index, te_index in kf:
        # Build each id set once per fold instead of once per membership test.
        tr_ids = set(data_merged['profile_id'][tr_index])
        te_ids = set(data_merged['profile_id'][te_index])

        tr_criterion = [x in tr_ids for x in user_info['profile_id']]
        data_tr = user_info[tr_criterion].copy()
        data_te = data_merged.loc[te_index].copy()
        data_concatenated = data_tr.append(data_te, ignore_index=True)
        data_concatenated[['profile_id']] = data_concatenated[['profile_id']].astype(float)

        vectorizer = dv(sparse=False)
        data_x_dict = data_concatenated.drop([target_label], axis=1).T.to_dict().values()
        data_x_vec = vectorizer.fit_transform(data_x_dict)
        data_x_frame = pd.DataFrame(data_x_vec)

        # profile_id is the only column cast to a number before vectorizing,
        # so it is located as the last column holding more than two distinct
        # values (the remaining columns are presumably 0/1 indicators).
        profile_id_index = None
        for i in list(data_x_frame.columns):
            if len(data_x_frame[i].unique()) > 2:
                profile_id_index = i
        if profile_id_index is None:
            raise ValueError('could not locate the profile_id column in the vectorized data')

        indexed_x = data_x_frame.set_index(profile_id_index)
        # A real list (not a lazy map object) is required as a .loc indexer.
        data_tr_x = indexed_x.loc[[int(pid) for pid in tr_ids]].sort_index().copy()
        # NOTE(review): data_tr_x is sorted by profile id while data_tr_y keeps
        # the row order of userinfo.csv; confirm that the two orders agree.
        data_tr_y = data_tr[target_label].copy()
        data_te_x = indexed_x.loc[[int(pid) for pid in te_ids]].sort_index().copy()
        data_te_y = data_te[target_label].copy()

        clf = MultinomialNB()
        clf.fit(data_tr_x, data_tr_y)
        y_pred = clf.predict(data_te_x)

        score = metrics.accuracy_score(data_te_y, y_pred)
        list_of_each_fold_score.append(score)

    return np.asarray(list_of_each_fold_score).mean()
    # print type(x_num_scaled_matrix), type(cat_data)
    # print x_num_scaled_matrix
    # print cat_data

    return x_num_scaled_matrix, cat_data

# Split the training data into a numeric part and a categorical part.
# `encoding`, `train`, `numeric_cols` and `train_drop_cols` are defined
# outside this section.
num_train_matrix, cat_train_matrix = encoding(train, numeric_cols, train_drop_cols)

# One {column: value} dict per row (assumes cat_train_matrix is a pandas
# DataFrame - TODO confirm), the input format the vectorizer expects.
x_cat_train_data = cat_train_matrix.T.to_dict().values()

# num_matrix1 = data.drop(category_cols, axis=1)
# x_num_data = num_matrix1.T.to_dict().values()
# print x_cat_data
# print num_matrix

vectorized = dv(sparse = False)
# NOTE: calling transform() directly on the training data raises an error
# because no features have been learned yet. Call fit_transform() on the
# training data first and then transform() on the test data, so that the
# test features line up with the training features.
vec_x_cat_train = vectorized.fit_transform(x_cat_train_data)
# print vec_x_cat_train, vec_x_cat_train.shape
# Put the numeric columns and the vectorized categorical columns side by side.
x_train = np.hstack((num_train_matrix, vec_x_cat_train))


# print x_train, x_train.shape
# print x_test, x_test.shape
# print vectorized

# Fit a regressor on the combined matrix; the target `y` is defined outside
# this section.
sgd = SGDRegressor()
sgd.fit(x_train, y)
# print sgd.coef_
Esempio n. 5
0
# In[ ]:

# In[ ]:

# Small dict example. The bare `dd.keys()` expression only shows its value
# when run as a notebook cell; in a plain script it has no visible effect.
dd = {'ele1': 2, 'ele2': 3}
dd.keys()

# In[ ]:

# In[ ]:

# Preprocessing - sklearn.feature_extraction.DictVectorizer: transform
# feature-value mappings to vectors.
iris = datasets.load_iris()
y = iris.target
iris_dv = dv(sparse=False)
# One single-key dict per sample, holding the species name of that sample.
my_dict = [{'species': iris.target_names[i]} for i in y]
my_dict_trans = iris_dv.fit_transform(my_dict)

# In[ ]:

# Linear regression, fitted and then evaluated on the same data.
# NOTE(review): assuming `datasets` is sklearn.datasets, load_boston() was
# removed in scikit-learn 1.2 - confirm the pinned version before running.
boston = datasets.load_boston()
boston_X = boston.data
boston_y = boston.target

lr = LinearRegression()
lr.fit(boston_X, boston_y)
predictions = lr.predict(boston_X)

# In[ ]:
Esempio n. 6
0
def get_data(data, labels):
    """Return the held-out split as ``(dense feature matrix, label array)``.

    ``get_data_sets`` supplies the split; only its second (features) and
    fourth (labels) results are used.
    """
    _, held_out_rows, _, held_out_labels = get_data_sets(data, labels)

    # NOTE(review): a fresh vectorizer is fitted on the held-out rows, so the
    # column layout may differ from a matrix vectorized elsewhere - confirm
    # that callers expect this.
    vectorized = dv().fit_transform(held_out_rows)
    label_array = np.array(held_out_labels)

    return vectorized.todense(), label_array
# For HackerRank: comment out lines 12, 13 and 14 of the original script
# (presumably the three file-reading lines directly below) and replace each
# 'y.pop(0)' with 'input()'.
y = []
with open('quoraAnswerClassifier.txt') as f:
    y = f.readlines()
# Header line: number of training rows and a second integer (unused here).
n, m = [int(i) for i in y.pop(0).strip().split()]
train_label = []
_train = []

# Training rows: the first token is discarded, the second is the label and
# the remaining tokens are "index:value" feature pairs.
for i in range(n):
    a = y.pop(0).strip().split()
    a.pop(0)
    train_label.append(a.pop(0))
    b = [x.split(':') for x in a]
    _train.append({int(e[0]): float(e[1]) for e in b})

# Keep the fitted vectorizer so that the test rows can be mapped onto exactly
# the same columns as the training rows.
vectorizer = dv()
train = vectorizer.fit_transform(_train).toarray()

model = rf()
model.fit(train, train_label)

test_name = []
_test = []

# Test rows: a count line, then one row per line holding a name followed by
# "index:value" feature pairs.
for i in range(int(y.pop(0).strip())):
    a = y.pop(0).strip().split()
    test_name.append(a.pop(0))
    b = [x.split(':') for x in a]
    _test.append({int(e[0]): float(e[1]) for e in b})

# transform(), not fit_transform(): refitting on the test rows could produce a
# different column layout from the one the model was trained on.
test = vectorizer.transform(_test).toarray()
test_lable = model.predict(test)
Esempio n. 8
0
def train():
    """Fit an SVM on two hand-written example rows and pickle it.

    The real training data (features and actions extracted from dependency
    parses) is not wired in yet; two example feature dicts stand in for it.

    Returns:
        The fitted ``svm.SVC`` classifier. As a side effect the classifier is
        also pickled to ``svm.pkl`` in the current working directory.
    """
    # TODO: for each sentence in the dependency parses, construct the tree.
    # Iterate through the sentence until the tree is formed; every iteration
    # considers several word pairs. For each pair, store its features as one
    # row and the correct action in the answer vector. Sketch:
    #     features, answers = make_tree('', sentence, True)
    #     array = vec.fit_transform(features).toarray()
    #     clf.fit(array, answers)

    # Example rows so that the SVM can be fitted.
    # Order of features -2,-1,0-,0+,1,2: pos,lex,ch-L-pos,ch-L-lex,ch-R-pos,ch-R-lex
    example_features1 = {
        'pos-2': ':',
        'pos-1': 'NNS',
        'pos-0': 'IN',
        'pos0': 'NN',
        'pos1': 'WP',
        # This tag belongs to position 2; it used to be written under a second
        # 'pos1' key, which silently discarded the 'WP' tag above.
        'pos2': ':'
    }
    example_features1.update({
        'lex-2': '-',
        'lex-1': 'sellers',
        'lex-0': 'of',
        'lex0': 'resort',
        'lex1': 'who',
        'lex2': '-'
    })
    example_features1.update({
        'chrrpos-1': 'DT',
        'chrrpos0': 'JJ',
        'chrlex-1': 'the',
        'chrlex0': 'last'
    })
    example_features1.update({'chlpos1': 'VBD', 'chllex1': 'were'})
    example_features2 = {
        'pos-2': 'NNS',
        'pos-1': 'IN',
        'pos-0': 'NN',
        'pos0': 'WP',
        'pos1': ':'
    }
    example_features2.update({
        'lex-2': 'sellers',
        'lex-1': 'of',
        'lex-0': 'resort',
        'lex0': 'who',
        'lex1': '-'
    })
    example_features2.update({
        'chrrpos-2': 'DT',
        'chrrpos-0': 'JJ',
        'chrlex-2': 'the',
        'chrlex-0': 'last'
    })
    example_features2.update({'chlpos0': 'VBD', 'chllex0': 'were'})
    # This feature matrix has two entries.
    example_features = [example_features1, example_features2]
    # One correct action (shift, right, left) per feature row.
    example_answers = np.array(['right', 'left'])

    # Vectorize the dicts to turn the string features into numerical values.
    vec = dv()
    example_array = vec.fit_transform(example_features).toarray()

    # TODO: a later goal: once we have the matrix, sort and split label
    # (left, right, split) data so that we can run three different models.

    # Fit the SVM and persist the model in a pickle.
    clf = svm.SVC()
    clf.fit(example_array, example_answers)
    # The context manager closes the file even if pickling fails.
    with open('svm.pkl', 'wb') as pkl:
        pickle.dump(clf, pkl)

    return clf