Example #1
def classify(model, train_data, test_data, model_name):
    print("Testing with %s" % model_name)
    # The last column of each row is the label; the rest is the feature vector.
    X = [row[:-1] for row in train_data]
    y = [row[-1] for row in train_data]
    model.fit(X, y)

    X_test = [row[:-1] for row in test_data]
    y_real = [row[-1] for row in test_data]
    y_pred = model.predict(X_test)
    print(report(y_real, y_pred))

    # Map the string labels to 0/1 so the regression-style error metrics apply.
    tp = lambda x: 1 if x == 'spam' else 0
    real = [tp(v) for v in y_real]
    pred = [tp(v) for v in y_pred]
    print(mean_absolute_error(real, pred))
    print(mean_squared_error(real, pred))
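A minimal usage sketch for the helper above, assuming report is sklearn's classification_report (aliased as in the later examples) and that the error metrics come from sklearn.metrics; the MultinomialNB model and the toy spam/ham rows are illustrative, not from the original project:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report as report
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Toy rows: feature values followed by a string label in the last column.
train_data = [[1, 0, 3, 'spam'], [0, 2, 0, 'ham'],
              [2, 1, 4, 'spam'], [0, 1, 1, 'ham']]
test_data = [[1, 0, 2, 'spam'], [0, 2, 1, 'ham']]

classify(MultinomialNB(), train_data, test_data, 'MultinomialNB')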
Example #2
def test(config, model, datapath, usegpu):
    getdata = feature.transformer(config, datapath)

    trueTag = []
    predTag = []
    while True:
        data = getdata.get_data()
        if data is None:
            break

        data0, label0, seqlen = data
        if usegpu:
            testdata = Variable(torch.Tensor(data0).cuda())
            # label = Variable(torch.LongTensor(label0).cuda())
        else:
            testdata = Variable(torch.Tensor(data0))
            # label = Variable(torch.LongTensor(label0))

        model.init_hidden(usegpu)
        output = model.forward(testdata)
        _, pred = torch.max(output, 2)

        for i, l in enumerate(label0):
            trueTag += l
            predTag += pred[i].data.tolist()

    print(report(trueTag, predTag))
Example #3
def _fit(self, seed, test_size, X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=seed
    )
    model_instance = clone(self.model)
    model_instance.fit(X_train, y_train)
    y_pred = model_instance.predict(X_test)
    self.y = y
    return [
        model_instance,
        # classification_report expects (y_true, y_pred) in that order
        report(
            y_test, y_pred, output_dict=True
        )
    ]
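For reference, classification_report with output_dict=True (as used above) returns a nested dict keyed by class label plus 'accuracy', 'macro avg' and 'weighted avg'; a minimal standalone check with toy labels:

from sklearn.metrics import classification_report as report

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
d = report(y_true, y_pred, output_dict=True)
print(d['1']['precision'], d['accuracy'])  # per-class precision and overall accuracy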
Example #4
    def test_more(self, X_test, y_test, objs):
        n = int(
            input(
                '-- how many parts do you want to split test data into?: - '))
        r = input('-- ratios of them (eg: 1:3:6 if n=3): - ').split(':')
        r = [int(x) for x in r]
        sum_r = 0

        # The ratios are assumed to sum to 10 (as in the 1:3:6 example above).
        for i in range(n):
            size = round(r[i] / (10 - sum_r) * len(X_test))
            X_test_small, y_test_small = X_test[:size], y_test[:size]
            X_test, y_test = X_test[size:], y_test[size:]
            sum_r += r[i]

            for obj in objs:
                y_pred = obj[0].predict(X_test_small)
                print('--', obj[1])
                print(report(y_test_small, y_pred))
        """ // if you just want to test with an arbitary amount of test data:
Example #5
def evaluate(model, features, labels, mask):
    """Gives accuracy."""
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask].cpu().numpy()

        # Statistics
        _, indices = torch.max(logits, dim=1)
        prediction = indices.long().cpu().numpy()
        accuracy = (prediction == labels).sum() / len(prediction)
        precision, recall, fscore, _ = score(
            labels, prediction, average="macro"
        )

        class_based_report = report(labels, prediction)

        return accuracy, precision, recall, fscore, class_based_report
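Here score and report appear to be sklearn's precision_recall_fscore_support and classification_report; a minimal sketch of the same statistics without the model, mask, or tensors (toy labels only):

import numpy as np
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report as report

labels = np.array([0, 1, 2, 1, 0])
prediction = np.array([0, 1, 2, 0, 0])

accuracy = (prediction == labels).sum() / len(prediction)
precision, recall, fscore, _ = score(labels, prediction, average="macro")
print(accuracy, precision, recall, fscore)
print(report(labels, prediction))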
Example #6
def evaluate(model, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(features)
        logits = logits[mask]
        labels = labels[mask]

        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)

        # Statistics
        precision, recall, fscore, support = score(labels, indices)

        # Accuracy
        acc = correct.item() * 1.0 / len(labels)

        class_based_report = report(labels, indices)

        return acc, precision, recall, fscore, support, class_based_report
Example #7
def _fit(self, results, seed, X_train, X_test, y_train, y_test):
    self.model.fit(X_train, y_train)
    y_pred = self.model.predict(X_test)
    if self.model_type == "classification":
        report_dict = report(y_test, y_pred, output_dict=True)
    elif self.model_type == "regression":
        report_dict = self.report(y_test, y_pred)
    else:
        raise Exception("model_type must be regression or classification")
    report_dict["mask"] = self._get_mask(y_train, self.data.shape[0])
    report_dict["seed"] = seed
    report_dict["hyperparameters"] = self.hyperparameters
    if self.is_pipeline():
        if "coef_" in dir(self.model.named_steps["model"]):
            report_dict["coef"] = self.model.named_steps["model"].coef_
    else:
        if "coef_" in dir(self.model):
            report_dict["coef"] = self.model.coef_
    results.append(report_dict)
    return results
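As a side note, the "'coef_' in dir(...)" check above can be written with hasattr; a small self-contained sketch (the scaler/LinearRegression pipeline here is only illustrative):

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([('scale', StandardScaler()), ('model', LinearRegression())])
pipe.fit([[0.0], [1.0], [2.0]], [0.0, 1.0, 2.0])

# Equivalent to "'coef_' in dir(pipe.named_steps['model'])".
if hasattr(pipe.named_steps['model'], 'coef_'):
    print(pipe.named_steps['model'].coef_)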
Example #8
def evaluate(file1, file2):
    num = 0
    wrong = 0
    real = []
    predicted = []
    tags = set()
    with codecs.open(file1, "r", encoding="iso8859-15") as f1, \
         open(file2, "r", encoding="iso8859-15") as f2:
        for line1, line2 in zip(f1, f2):
            if len(line1) > 1 and len(line2) > 1:
                num += 1
                try:
                    word_1, tag_1 = line1.split()
                    word_2, tag_2 = line2.split()
                    real.append(tag_1)
                    tags.add(tag_1)
                    predicted.append(tag_2)
                    if tag_1 != tag_2:
                        wrong += 1
                except ValueError:
                    # Skip lines that do not contain exactly "word tag".
                    pass
                # print(tag_1, tag_2)
    try:
        from sklearn.metrics import classification_report as report
        print("REPORT", report(real, predicted, labels=list(tags)))
    except ImportError:
        print("scikit-learn not found, skipping classification report")

    print("accuracy:", 100.0 * (num - wrong) / num, "%")



#evaluate("data/tiger_test.txt","results.txt")
Example #9
def f1_scores(results, truth):
	print(report(truth['class'].tolist(), results['class'].tolist()))
Example #10
def run(x, y):
    print(accuracy(x, y))
    print(report(x, y))
    print(confusion_matrix(x, y))
    return 0
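A runnable sketch of what this presumably wraps, assuming the usual sklearn aliases (accuracy_score as accuracy, classification_report as report); the toy labels are illustrative:

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import classification_report as report
from sklearn.metrics import confusion_matrix

y_true = ['spam', 'ham', 'ham', 'spam']
y_pred = ['spam', 'ham', 'spam', 'spam']
print(accuracy(y_true, y_pred))
print(report(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))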
Example #11
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('dense', DenseTransformer()),
    ('clf', GaussianNB())
])

bernoulli = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', BernoulliNB())
])

kfold = KFold(n_splits=10)

print("\n\n############ MULTINOMIAL STARTED ########################")
multinomial_report = [report(classified_target[test], multinomial.fit(classified_data[train], classified_target[train])
                             .predict(classified_data[test])) for train, test in kfold.split(classified_data)]
for rep in multinomial_report:
    print(rep)


print("\n\n############ GAUSSIAN STARTED ########################")
gaussian_report = [report(classified_target[test], gaussian.fit(classified_data[train], classified_target[train])
                          .predict(classified_data[test])) for train, test in kfold.split(classified_data)]
for rep in gaussian_report:
    print(rep)


print("\n\n############ BERNOULLI STARTED ########################")
bernoulli_report = [report(classified_target[test], bernoulli.fit(classified_data[train], classified_target[train])
                           .predict(classified_data[test])) for train, test in kfold.split(classified_data)]
for rep in bernoulli_report:
    print(rep)
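The same per-fold report pattern in a compact, self-contained form; the toy texts, labels, and the single MultinomialNB pipeline below are illustrative stand-ins for classified_data / classified_target:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report as report

texts = np.array(["good movie", "bad movie", "great film", "awful film"] * 3)
target = np.array([1, 0, 1, 0] * 3)

multinomial = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', MultinomialNB())])

kfold = KFold(n_splits=3)
for train, test in kfold.split(texts):
    multinomial.fit(texts[train], target[train])
    print(report(target[test], multinomial.predict(texts[test])))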
Example #12


# feature selection
#df1=SelectPercentile(chi2,percentile=99).fit_transform(df1,label_train)
#df2=SelectPercentile(chi2,percentile=99).fit_transform(df2,label_eval)

# SVM models
model_svm = svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=1, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)  # degree is ignored by the rbf kernel
model_svm.fit(df1, label_train)
result_svm = model_svm.predict(df2)

report1 = report(label_eval, result_svm, digits=5)
print(report1)


model_svm = svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=2, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
model_svm.fit(df1, label_train)
result_svm = model_svm.predict(df2)

report11 = report(label_eval, result_svm, digits=5)
print(report11)


model_svm=svm.SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
Example #13
File: lstm.py Project: Spepsi/LSTM
    for i in range(n_updates):
        model.train(gradient_dataset, whole_train_dataset, i)

    ### EVALUATION OF NETWORK ###
    pred_list = []
    labels = []
    target_names = ['Anger', 'Boredom', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Neutral']
    classes = range(7)
    for idx, i in enumerate(trX):
        # Do the prediction for each frame
        prediction = list(model.predict(i))
        # Calculate the predominant class across frames
        pred_list.append(argmax_index([prediction.count(j) for j in range(n_classes)]))
        labels.append(trY[idx][0])
    # Classification report:
    print(report(labels, pred_list, target_names=target_names))
    # Compute accuracy:
    print("Accuracy: " + str(np.mean(np.asarray(labels) == np.asarray(pred_list))))

    pred_list = []
    labels = []
    for idx, i in enumerate(teX):
        # Do the prediction for each frame
        prediction = list(model.predict(i))
        # Calculate the predominant class across frames
        pred_list.append(argmax_index([prediction.count(j) for j in range(n_classes)]))
        labels.append(teY[idx][0])

    # Classification report:
    print(report(labels, pred_list, target_names=target_names))
    # Compute accuracy:
    print("Accuracy: " + str(np.mean(np.asarray(labels) == np.asarray(pred_list))))
Example #14

# In[2]:

### Initial Baseline Models

x_train, y_train = load_data("../data/Simulated_Data_Train.csv")
x_val, y_val = load_data("../data/Simulated_Data_Validation.csv")
x_test, y_test = load_data("../data/Simulated_Data_Test.csv")

# In[4]:

nn = feed_forward(x_train, y_train, width=32)
nn.train(20)

print("****** Initial Feed Forward Network *********")
print(report(y_test, nn.predict(x_test)))

# In[36]:


def tune_model_width(build_fn, x_train, y_train, x_val, y_val, max_width=50):
    """
    Takes a 3-layer neural network and expands its width to see if there
    are tangible benefits to increasing the width of the hidden layer
    in the model.
    
    Parameters: 
    build_fn - function that returns a keras nn model with the specified parameters 
    x_train - the data matrix 
    y_train - the response function
    x_val - validation data
Example #15
def evaluateSpacy(conll_test, max_sent=None, print_dicts=False):

    nlp = spacy.load('en_core_web_sm')

    test = loadConll(conll_test)

    if max_sent is not None and isinstance(max_sent, int):
        test_doc = list(nlp.pipe(test['text'][:max_sent]))
    else:
        test_doc = list(nlp.pipe(test['text']))
    # print('Elements in doc format: {}'.format(len(test_doc)))

    # Retokenization to merge '-' elements (e.g. dates, obj-obj)
    for doc in test_doc:
        with doc.retokenize() as retokenizer:
            index = 0
            startMerging = -1
            for token in doc:
                if token.whitespace_ == '' and startMerging == -1:
                    startMerging = index
                if (token.whitespace_ == ' ' or index == len(doc)-1) \
                   and startMerging != -1:
                    retokenizer.merge(doc[startMerging:index + 1])
                    startMerging = -1
                index += 1

    doc_spacy_test_list = []
    for doc in test_doc:
        for token in doc:
            if token.ent_type_ == '':
                key = token.ent_iob_
            else:
                key = token.ent_iob_ + '-' + token.ent_type_
            doc_spacy_test_list.append(converter(key))

    doc_conll_test_list = []
    for tag_list in test['NE_tag']:
        for tag in tag_list.split():
            doc_conll_test_list.append(tag)

    scores = report(doc_conll_test_list,
                    doc_spacy_test_list,
                    output_dict=True,
                    zero_division=0)
    print('Accuracy on spacy prediction: {:0.4f}\n'.format(scores['accuracy']))

    # Chunk accuracy (i.e entity accuracy)
    sent_idx = 0
    ref_list = []
    hyp_list = []
    for sent in test['text'][:max_sent]:
        token_idx = 0
        ref_token_list = []
        hyp_token_list = []
        for token in sent.split():
            ref_token_list.append(
                [token, test['NE_tag'][sent_idx].split()[token_idx]])

            if test_doc[sent_idx][token_idx].ent_type_ == '':
                hyp_token_list.append([
                    test_doc[sent_idx][token_idx].text,
                    test_doc[sent_idx][token_idx].ent_iob_
                ])
            else:
                hyp_token_list.append([
                    test_doc[sent_idx][token_idx].text,
                    test_doc[sent_idx][token_idx].ent_iob_ + '-' +
                    converter(test_doc[sent_idx][token_idx].ent_type_)
                ])

            token_idx += 1
        ref_list.append(ref_token_list)
        hyp_list.append(hyp_token_list)
        sent_idx += 1

    measures = conll.evaluate(ref_list, hyp_list)
    # Make fancy table:
    measureShow = pd.DataFrame().from_dict(measures, orient='index')
    print(measureShow.round(decimals=3))
Example #16
directory = "F:\\To_server\\model\\test_results\\"
#filename = "tagging.test.hyp.txt"
#filename = "test.txt"
filename = sys.argv[1]

with open(directory + filename) as f:
    prediction = []
    label = []
    unique_label = {}
    label_names = []
    for line in f:
        contentlist = line.split()
        # if (len(contentlist) != 0 and len(contentlist) != 3):
        #     print(line)
        #     print(len(contentlist))

        if len(contentlist) == 0 or contentlist[0] == "BOS" or contentlist[0] == "EOS":
            continue
        else:
            label.append(contentlist[1])
            prediction.append(contentlist[2])
            if contentlist[1] in unique_label:
                unique_label[contentlist[1]] += 1
            else:
                label_names.append(contentlist[1])
                unique_label[contentlist[1]] = 1

print(report(label, prediction, labels=label_names, target_names=label_names))


"""

import matplotlib.pyplot as plt
from sklearn import datasets, svm, metrics
from sklearn.metrics import classification_report as report
#
format1 = "Classification report for classifier %s:\n%s\n"
format2 = "Confusion matrix:\n%s"
digits = datasets.load_digits()
imageLabels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(imageLabels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)
n = len(digits.images)
data2 = digits.images.reshape((n, -1))
classifier = svm.SVC(gamma=0.001)
classifier.fit(data2[:n // 2], digits.target[:n // 2])
expected = digits.target[n // 2:]
predicted = classifier.predict(data2[n // 2:])
print(format1 % (classifier, report(expected, predicted)))
print(format2 % metrics.confusion_matrix(expected, predicted))
imageAndPredictions = list(zip(digits.images[n // 2:], predicted))
for index, (image, prediction) in enumerate(imageAndPredictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)
plt.show()
Example #18
import numpy as np
from sklearn.metrics import classification_report as report

a = np.array([1, 0, 0, 2, 2, 1, 0, 0, 0, 0])
b = np.array([1, 1, 2, 2, 0, 0, 1, 1, 0, 0])
res = np.array([100, 100, -50, -50, 20, -5, 100, 100, -30, -50])
positive = (a == 1) & (b == 1) | (a == 1) & (b == 0)
negative = (a == 2) & (b == 2) | (a == 2) & (b == 0)
print(report(a, b))

print(res[positive].sum() - res[negative].sum())
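For readers unfamiliar with NumPy boolean masks (used above to combine the label and prediction conditions), & and | operate elementwise, and each comparison needs its own parentheses because & and | bind tighter than ==; a tiny illustration:

import numpy as np

a = np.array([1, 0, 2, 1])
b = np.array([1, 1, 2, 0])

both_one = (a == 1) & (b == 1)   # elementwise AND
any_two = (a == 2) | (b == 2)    # elementwise OR
print(both_one, any_two, a[both_one].sum())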
Example #19
    def training(self):
        data_train = pd.read_csv(
            "D:/Python_Project/Keywords_extraction/train_balance.csv")
        data_test = pd.read_csv(
            "D:/Python_Project/Keywords_extraction/test_balance.csv")

        acc = 0
        # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']]
        # cols = [col for col in data_train.columns if col  in ['头词频','词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词平均','词偏度','词峰度','词差方差','最大词差','最小词差','最小句中位置','首次句位置','最后句位置','出现在第一句','出现在最后一句','句子出现频率','句平均','句偏度','包含英文','度中心性','接近中心性','s','f','v','d','k','x','i','l','un','包含数字']]
        '''
        cols=['词频','词长','IDF','出现在标题','首次出现词位置','最后出现词位置','词方差','词偏度','最大句中位置','最小句中位置',
              '平均句中位置','平均句长','首次句位置','出现在最后一句','句子出现频率','句方差',
              '句平均','句差方差','最大句差','包含英文','接近中心性','n', 't', 'v', 'z', 'q', 'd', 'k', 'x', 'y', '包含数字']

         ['词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差', '最大句中位置', '平均句中位置', 
         '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差', '句差方差', '最大句差', '度中心性',
          'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x', 'g', 'j', 'y', 'un', '包含数字']

         '''
        cols = [
            '词频', '词长', 'IDF', '出现在标题', '首次出现词位置', '词方差', '词平均', '最大词差',
            '最大句中位置', '平均句中位置', '首次句位置', '出现在第一句', '出现在最后一句', '句子出现频率', '句方差',
            '句差方差', '最大句差', '度中心性', 'n', 'v', 'a', 'z', 'd', 'h', 'k', 'x',
            'g', 'j', 'y', 'un', '包含数字'
        ]
        # cols = [col for col in data_train.columns if col not in ['id', '关键词', '标签']]
        x_train = data_train.loc[:, cols]
        y_train = data_train.loc[:, '标签']
        x_train = x_train.reset_index(drop=True)
        y_train = y_train.reset_index(drop=True)
        x_val = data_test.loc[:, cols]
        y_val = data_test.loc[:, '标签']
        x_val = x_val.reset_index(drop=True)
        y_val = y_val.reset_index(drop=True)

        # Test set is 30%, training set is 70%
        # x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=0)

        lgb_train = lgb.Dataset(x_train, y_train)

        lgb_eval = lgb.Dataset(x_val, y_val, reference=lgb_train)
        #     print('start training......')

        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'binary',
            'metric': {'auc'},
            'learning_rate': 0.025,
            'num_leaves': 100,
            'min_data_in_leaf': 70,
            'bagging_fraction': 0.85,
            'is_unbalance': 'true',
            'seed': 42
        }

        gbm = lgb.train(
            params,
            lgb_train,
            num_boost_round=5000,
            valid_sets=lgb_eval,
            early_stopping_rounds=30,
            verbose_eval=False,
        )

        y_pred = gbm.predict(x_val)
        y_pred = list(y_pred)
        Y_val = list(y_val)
        pos = 0       # actual positives
        pos_acc = 0   # true positives
        pos_pre = 0   # predicted positives
        for i, j in zip(Y_val, y_pred):
            if i >= 0.5:
                pos += 1
            if i >= 0.5 and j >= 0.5:
                pos_acc += 1
            if j >= 0.5:
                pos_pre += 1

        pos_r = pos_acc / pos      # recall
        pos_a = pos_acc / pos_pre  # precision
        print((pos_a * pos_r) / (pos_a + pos_r) * 2)  # F1 score on the validation set
        # Threshold the predicted probabilities at 0.5
        y_pred = [1 if p > 0.5 else 0 for p in y_pred]
        # print(report(Y_val, y_pred, digits=4))

        y_pred = gbm.predict(x_train)
        y_pred = list(y_pred)
        Y_train = list(y_train)

        # Threshold the predicted probabilities at 0.5
        y_pred = [1 if p > 0.5 else 0 for p in y_pred]
        print(report(Y_train, y_pred, digits=4))
        plt.rc('font', family='SimSun', size=13)
        # gbm.save_model('lgbmodel_allfeature.model')
        explainer = shap.TreeExplainer(gbm)
        shap_values = explainer.shap_values(x_train)
        # The baseline y_base is the mean of the model's fitted values on the training set.
        y_base = explainer.expected_value
        shap.initjs()
        # shap.summary_plot(shap_values[0], x_train, sort=True, color_bar_label=("FEATURE_VALUE0"))#1
        shap.summary_plot(shap_values[1],
                          x_train,
                          sort=True,
                          color_bar_label=("FEATURE_VALUE1"))  # 2
Example #20
def train(configpath, datapath, classes, usegpu):
    config = configparser.ConfigParser()
    config.read(configpath)
    getdata = feature.transformer(config, datapath)

    print('building net...')
    net = model.LSTM(config, classes)
    if usegpu:
        net = net.cuda()
    optimer = optim.Adam(net.parameters(),
                         lr=config.getfloat('train', 'learning_rate'))
    criterion = nn.CrossEntropyLoss()

    print('begin training ...')
    for epoch in range(config.getint('train', 'epoch')):
        print('epoch:', epoch)
        trueTag = []
        predTag = []
        while True:
            data = getdata.get_data()
            if data is None:
                break

            traindata, label0, seqlen = data

            if usegpu:
                traindata = Variable(torch.Tensor(traindata).cuda())
                label = Variable(torch.LongTensor(label0).cuda())
            else:
                traindata = Variable(torch.Tensor(traindata))
                label = Variable(torch.LongTensor(label0))

            net.init_hidden(usegpu)
            output = net.forward(traindata)
            # print(output.size())

            loss = 0

            _, pred = torch.max(output, 2)

            for i, l in enumerate(label0):

                trueTag += l
                predTag += pred[i].data.tolist()

            # print(output.size())
            for i, seq in enumerate(output):
                # print(seq.size())
                # print(label[i])
                # print(seq.size())
                for j, l in enumerate(label[i]):
                    # print(seq[j])
                    if l.data.tolist()[0] == 0:
                        loss += 0.05 * criterion(seq[j].view(1, -1), l)
                    else:
                        loss += criterion(seq[j].view(1, -1), l)
            # print(loss)
            optimer.zero_grad()
            loss.backward()
            optimer.step()
        print('train result')
        # print(trueTag)
        # print(predTag)
        print(report(trueTag, predTag))
        print('test result')
        test(config, net, '/Users/Smart/Desktop/code/Challenge_Cup/test1.json',
             usegpu)
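A similar down-weighting of class 0 (multiplied by 0.05 in the loop above) can also be expressed through CrossEntropyLoss's per-class weight argument; a minimal sketch with an illustrative 5-class setup:

import torch
import torch.nn as nn

num_classes = 5
weights = torch.ones(num_classes)
weights[0] = 0.05                 # give class 0 the same 0.05 weight as above
criterion = nn.CrossEntropyLoss(weight=weights)

logits = torch.randn(8, num_classes)           # (batch, classes)
targets = torch.randint(0, num_classes, (8,))  # (batch,)
print(criterion(logits, targets))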
Example #21
def main(trainpaths, testpath):
    #Load training data in
    train_dict = collect_train_data(trainpaths)
    print('Training data loaded successfully')

    #Load test data in
    test_data = collect_test_data(testpath)
    print('Test data loaded successfully')

    #Collect labels
    labels = collect_labels(train_dict)
    label_dict = create_label_dict(train_dict)
    inv_label_dict = {value: key for key, value in label_dict.items()}

    #Train Vectorizer
    v = DictVectorizer(sparse=False)
    train_features_list = []
    for chord_prog in train_dict.keys():
        for note_list in train_dict[chord_prog]:
            n = 4
            D = {
                'max_avg_ngrams': avg_max_pitch(note_list, n),
                'min_avg_ngrams': avg_min_pitch(note_list, n),
                'num_notes': len(note_list),
                'num_max_pitch': num_max_pitch(note_list, n),
                'num_min_pitch': num_min_pitch(note_list, n),
                'avg_pitch': avg_pitch(note_list),
                'max_pitch_diff': max_pitch_diff(note_list),
                'max_diff_avg_ngrams': max_pitch_diff_avg(note_list, n),
                'most_freq_pitch': most_common_pitch(note_list),
                'freq_pitch_diff_avg': normalized_pitch_diff_avg(note_list)
            }
            train_features_list.append(D)
    x_train = v.fit_transform(train_features_list)
    print('Train Features Step Complete')

    #Vectorize Test data for later use
    test_features_list = []
    for note_list in test_data:
        n = 4
        D = {
            'max_avg_ngrams': avg_max_pitch(note_list, n),
            'min_avg_ngrams': avg_min_pitch(note_list, n),
            'num_notes': len(note_list),
            'num_max_pitch': num_max_pitch(note_list, n),
            'num_min_pitch': num_min_pitch(note_list, n),
            'avg_pitch': avg_pitch(note_list),
            'max_pitch_diff': max_pitch_diff(note_list),
            'max_diff_avg_ngrams': max_pitch_diff_avg(note_list, n),
            'most_freq_pitch': most_common_pitch(note_list),
            'freq_pitch_diff_avg': normalized_pitch_diff_avg(note_list)
        }
        test_features_list.append(D)
    x_test = v.transform(test_features_list)
    print('Test Features Step Complete')

    #Train Classifier
    K = knn(n_neighbors=5)
    y_train = [label_dict[label] for label in labels]
    K = K.fit(x_train, y_train)
    print('KNN Classifier Training Step Complete')

    #For report later on
    y_pred = []

    #Predict chord progression using KNN
    for x in x_test:
        x_predict = []
        x_predict.append(x)
        predict = K.predict(x_predict)
        print(inv_label_dict[predict[0]])
        y_pred.append(predict[0])
    #Find Precision, Recall, F-1 scores for test data over chord progressions
    target_names = [prog for prog in label_dict.keys()]

    #Hardcoded for test data
    y_true = [0, 0, 1, 2, 3, 4]

    #Print Report
    print(report(y_true, y_pred, target_names=target_names))
Example #22
],
                      axis=1)

# Build model
rf_cv = RandomForestClassifier(n_estimators=300, max_depth=90, n_jobs=-1)
rf_model_cv = rf_cv.fit(X_cv_train, y_train)
y_prediction_cv = rf_model_cv.predict(X_cv_test)

precision, recall, fscore, train_support = f_score(y_test,
                                                   y_prediction_cv,
                                                   pos_label='spam',
                                                   average='micro')
print('Precision: {} --- Recall: {} --- F1-Score: {} --- Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(fscore, 3),
    round(accuracy(y_test, y_prediction_cv), 3)))
print(report(y_test, y_prediction_cv))
# ------------------------------------------------------------------------------------
# Making the Confusion Matrix: CountVectorizer
matrixcv = confusion_matrix(y_test, y_prediction_cv)
class_label = ['0', '1', '2', '3', '4']
matrixcv_df = pd.DataFrame(matrixcv, index=class_label, columns=class_label)
sns.heatmap(matrixcv_df, annot=True, fmt='d')
plt.title("Confusion Matrix of best CountVectorizer model")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()
# ------------------------------------------------------------------------------------
# Evaluation of model: on TfidfVectorizer
# num_estimator = 300, max_depth = None
# Define variables
tfidf = tfidfV(ngram_range=(2, 2), analyzer=cleandata)  # defined before
Example #23

label_train = getLabel([], 'train-labels.txt')
label_eval = getLabel([], 'eval-labels.txt')

x_vectorizer = HashingVectorizer(alternate_sign=False)  # MultinomialNB requires non-negative features

corpus1 = []
corpus2 = []
with open('train-tweets.txt', encoding='utf-8') as train:
    for line in train:
        line = line.replace("\n", "").split("\t")
        corpus1.append(line[1])
with open('eval-tweets.txt', encoding='utf-8') as train:
    for line in train:
        line = line.replace("\n", "").split("\t")
        corpus2.append(line[1])
# print(corpus)

X = x_vectorizer.fit_transform(corpus1)
#X.toarray()
# HashingVectorizer is stateless, so the same instance is reused for the eval corpus.
Y = x_vectorizer.transform(corpus2)
#print(x_vectorizer.get_feature_names())

model_NB = MultinomialNB()
model_NB.fit(X, label_train)
result_NB = model_NB.predict(Y)

report3 = report(label_eval, result_NB, digits=5)
print('\n', report3)
Example #24
# In[2]:


x_train, y_train = load_data("../data/Simulated_Data_Train.csv")
x_val, y_val = load_data("../data/Simulated_Data_Validation.csv")
x_test, y_test = load_data("../data/Simulated_Data_Test.csv")


# In[6]:


lr = log_reg(x_train, y_train)
yprob = lr.predict(x_test)
yhat = decide(yprob, 0.5)
print(report(y_test, lr.model.predict(x_test)))


# In[19]:


credit_data = load_data("../data/Simulated_Data_Test.csv", as_df=True)

coef_dict = {}
for var, coef in zip(credit_data.columns, lr.model.coef_[0]):
    coef_dict[var] = coef

coef_frame = pd.DataFrame.from_dict(coef_dict, orient='index', columns=["coefficient"])
coef_frame.to_latex("../report/coef.tex")