def execute_demo(language):
    """Build all baseline feature dictionaries, train, and score on test."""
    data = Dataset(language)

    print("{}: {} training - {} test".format(language, len(data.trainset),
                                             len(data.testset)))

    baseline = Baseline(language)

    # Pre-compute every feature resource from the training data once.
    train = data.trainset
    word_freq = baseline.word_frequences(train)
    char_freq = baseline.char_frequence(train)
    train_len = baseline.lengh_trainset(train)
    word_bigrams = baseline.bigram_counts_word(train)
    pos_dict = baseline.pos_dictionary(train)
    char_len = baseline.lengh_char(train)
    char_bigrams = baseline.bigram_counts_char(train)

    # Fit on the training set, then predict on the held-out test set,
    # passing the same resources in the same order both times.
    baseline.train(train, word_freq, pos_dict, word_bigrams, train_len,
                   char_freq, char_len, char_bigrams)
    predictions = baseline.test(data.testset, word_freq, pos_dict,
                                word_bigrams, train_len, char_freq,
                                char_len, char_bigrams)

    gold = [sent['gold_label'] for sent in data.testset]
    report_score(gold, predictions)
def execute_demo(language):
    """Run the Baseline twice on the dev set: once as a binary classifier
    (scored with report_score) and once as a regressor over the annotated
    complexity probabilities (scored with mean squared error)."""
    data = Dataset(language)

    print("{}: {} training - {} Test\n".format(language.upper(),
                                               len(data.trainset),
                                               len(data.devset)))

    #for sent in data.trainset:
    #    print(sent['sentence'], sent['target_word'], sent['gold_label'])

    # Binary task: is the target word complex or not.
    baseline = Baseline(language, type='classify')

    baseline.train(data.trainset)

    predictions = baseline.test(data.devset)

    gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions, detailed=True)

    ########################### Regression ###########################
    # Probabilistic task: predict the gold complexity probability directly.
    baseline2 = Baseline(language, type='regression')

    baseline2.train(data.trainset)

    predictions = baseline2.test(data.devset)

    gold_labels2 = [float(sent['gold_prob']) for sent in data.devset]

    print("Probabilistic classification task:\nMSE:",
          mean_squared_error(gold_labels2, predictions), "\n\n")
Esempio n. 3
0
def execute_demo(language):
    """Train and evaluate the embedding-based Model on the dev set.

    Args:
        language: corpus language; must have word embeddings available.

    Raises:
        ValueError: if no embeddings exist for ``language``.
    """
    # Both original branches forwarded `language` unchanged, so collapse them.
    # BUG FIX: any other language left `word_emb` unbound and crashed with a
    # confusing NameError at model.train(); fail fast with a clear error.
    if language not in ('english', 'spanish'):
        raise ValueError("unsupported language: {!r}".format(language))
    word_emb = load_word_embeddings(language)

    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.devset)))

    # Kept for parity with the original code; the instance is never used.
    baseline = Baseline(language)

    model = Model(language)
    model.train(data.trainset, word_emb)
    predictions = model.test(data.devset, word_emb)

    gold_labels = [sent['gold_label'] for sent in data.devset]
    report_score(gold_labels, predictions)
def execute_demo(language):
    """Train the bigram-feature baseline and score it on dev, then on test."""
    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.devset)))

    baseline = Baseline(language)
    baseline.train(data.trainset, data.bigram_dic)

    # Development-set evaluation.
    dev_preds = baseline.test(data.devset, data.bigram_dic)
    dev_gold = [sent['gold_label'] for sent in data.devset]
    report_score(dev_gold, dev_preds)

    # Held-out test-set evaluation.
    print("{} test".format(language))
    test_preds = baseline.test(data.testset, data.bigram_dic)
    test_gold = [sent['gold_label'] for sent in data.testset]
    report_score(test_gold, test_preds)
def execute_demo(language):
    """Train Baseline or MyLine and evaluate on one split.

    NOTE(review): `test` and `Base` are read from module scope — confirm both
    are defined (as booleans) before this function is called.
    """
    data = Dataset(language)

    # Idiom fix: `== True` comparisons replaced with plain truthiness, and the
    # evaluation split is chosen once instead of re-branching three times.
    evalset = data.testset if test else data.devset
    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(evalset)))

    # `Base` selects between the reference baseline and the improved system.
    baseline = Baseline(language) if Base else MyLine(language)

    baseline.train(data.trainset)

    predictions = baseline.test(evalset)
    gold_labels = [sent['gold_label'] for sent in evalset]

    report_score(gold_labels, predictions)
Esempio n. 6
0
def execute_demo(language, flag):
    """Train the baseline and evaluate on dev (flag == 0) or test (flag == 1).

    PERF FIX: the original always ran baseline.test on BOTH splits and built
    both gold-label lists even though only one pair was ever reported; now
    only the requested split is processed.
    """
    data = Dataset(language)

    if flag == 0:
        # data.trainset is the parsed training data; data.devset is used for
        # evaluation. (Translated from the original Chinese comment.)
        print("{}: {} training - {} dev".format(language, len(data.trainset),
                                                len(data.devset)))
    if flag == 1:
        print("{}: {} training - {} test".format(language, len(data.trainset),
                                                 len(data.testset)))
    # for sent in data.trainset:
    #    # print(sent['sentence'], sent['target_word'], sent['gold_label'])
    #    print(sent)

    baseline = Baseline(language)

    baseline.train(data.trainset)

    if flag == 0:
        print("Test by using dev set:")
        # Gold labels are binary 0/1 complexity judgements.
        gold_labels = [sent['gold_label'] for sent in data.devset]
        report_score(gold_labels, baseline.test(data.devset))
    if flag == 1:
        print("Test by using test set:")
        gold_labels = [sent['gold_label'] for sent in data.testset]
        report_score(gold_labels, baseline.test(data.testset))
Esempio n. 7
0
def execute_demo(language):
    """Train the baseline once and print detailed scores for dev and test."""
    data = Dataset(language)

    print("{}: {} training - {} test".format(language, len(data.trainset),
                                             len(data.testset)))

    baseline = Baseline(language)
    baseline.train(data.trainset)

    # Predict both splits with the single trained model.
    dev_preds = baseline.test(data.devset)
    test_preds = baseline.test(data.testset)

    dev_gold = [s['gold_label'] for s in data.devset]
    test_gold = [s['gold_label'] for s in data.testset]

    print("DEV result:")
    report_score(dev_gold, dev_preds, detailed=True)

    print("TEST result:")
    report_score(test_gold, test_preds, detailed=True)
def word_identifier(language):
    """Train LR and SVM classifiers with a fixed feature set and report their
    detailed scores on the test split."""
    data = Dataset(language)

    # NOTE(review): header label says "dev" but prints the test-set size —
    # confirm which split was intended.
    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.testset)))

    # for sent in data.trainset:
    #    print(sent['sentence'], sent['target_word'], sent['gold_label'])

    #Define gold labels
    # dev/train labels and train_data_size are only used by the disabled
    # learning-curve experiment in the triple-quoted block below.
    dev_gold_labels = [sent['gold_label'] for sent in data.devset]
    test_gold_labels = [sent['gold_label'] for sent in data.testset]
    train_gold_labels = [sent['gold_label'] for sent in data.trainset]
    train_data_size = len(data.trainset)

    #define using of  features for improved systems
    features = [
        'chars_len', 'tokens_len', 'vowels_len', 'first_upper',
        'word_frequency'
    ]

    # Logistic-regression system.
    LR_classifier = LR(language, features)
    LR_classifier.train(data.trainset)
    LR_predictions = LR_classifier.test(data.testset)
    report_score(test_gold_labels, LR_predictions, True)

    # Support-vector-machine system with the same features.
    SVM_classifier = SVM(language, features)
    SVM_classifier.train(data.trainset)
    SVM_predictions = SVM_classifier.test(data.testset)
    report_score(test_gold_labels, SVM_predictions, True)
    ''' 
    scores = []
    data_scale = 0
    while True:
        data_scale += 1000
        if train_data_size <data_scale:
            data_scale = train_data_size
            data_set = data.trainset
        else:
            data_set = data.trainset[0:data_scale]
        
          
        TB_baseline = WordLength(language)
        if language == 'english':
            length = 8
        elif language == 'spanish':
            length =10
        TB_predictions = TB_baseline.test(data.devset,length)
        fscore = report_score(dev_gold_labels, TB_predictions)
        scores.append((data_scale,fscore))
        
        if data_scale == train_data_size:
            break
        
    TB_baseline_results[language] = np.asarray(scores)   
    '''
    '''
def execute_demo(language):
    """Train the baseline with several precomputed lexical feature resources
    and report its score on the test split."""
    data = Dataset(language)

    baseline = Baseline(language)

    # All feature dictionaries are precomputed attributes of the Dataset.
    baseline.train(data.trainset, data.unigram, data.suffix, data.char_trigram, data.pos, data.dep, data.shape, data.frequency)

    predictions = baseline.test(data.testset)

    gold_labels = [sent['gold_label'] for sent in data.testset]

    report_score(gold_labels,predictions)
def execute_demo(language, size=0):
    """Train Improved on the (optionally truncated) trainset and report
    dev and test scores.

    Args:
        size: if non-zero, only the first `size` training instances are used
            and the printed headers include that size.
    """
    data = Dataset(language)
    if size:
        data.trainset = data.trainset[0:size]

    print("{}: {} training - {} dev - {} test".format(language,
                                                      len(data.trainset),
                                                      len(data.devset),
                                                      len(data.testset)))

    improved = Improved(language)
    improved.train(data.trainset)

    dev_preds = improved.test(data.devset)
    test_preds = improved.test(data.testset)
    dev_gold = [s['gold_label'] for s in data.devset]
    test_gold = [s['gold_label'] for s in data.testset]

    if size:
        print("dev score size = " + str(size))
        report_score(dev_gold, dev_preds)
        print("test score size = " + str(size))
        report_score(test_gold, test_preds)
        print('-' * 50)
    else:
        print("dev score")
        report_score(dev_gold, dev_preds)
        print("test score")
        report_score(test_gold, test_preds)
        print('-' * 50)
Esempio n. 11
0
def execute_demo(language):
    """Run ImprovedSys for English or Spanish, scoring on dev and test."""
    data = Dataset(language)

    # NOTE(review): int(5*len/5) equals len(data.trainset), so this copies the
    # whole training set; the 5/5 factor was presumably a subsampling knob.
    trainset_small = []
    for i in range(int(5*len(data.trainset)/5)):
        trainset_small.append(data.trainset[i])
        
#    trainset_small = []
#    for i in range(1000):
#        trainset_small.append(data.trainset[i])
        
# ***** Improved system ***** #
    system = ImprovedSys(language)
    
    if language == 'english':
        
        # Bag-of-words and lexicon built from Wikipedia resources.
        BoW, lexicon = system.create_engBoWLexicon_wiki()
#        BoW = system.create_BoW_data(data.trainset, data.devset, data.testset)
#        BoW = system.create_engBoW_brown()
#        lexicon = system.create_engLexicon_original(data.tnewsset, data.twikinewsset, data.twikiset)
#        pos_weight = system.create_posWeight(data.trainset)
#        aoa_dict, mean_rate = system.create_aoaDict()
        
        system.train_eng(trainset_small, BoW, lexicon)
        dev_predictions = system.test_eng(data.devset, BoW, lexicon)
        test_predictions = system.test_eng(data.testset, BoW, lexicon)

    if language == 'spanish':
        
        BoW, lexicon = system.create_espBoWLexicon()
#        BoW = system.create_BoW_data(data.trainset, data.devset, data.testset)
#        BoW = system.create_espBoW_wiki()
#        pos_weight = system.create_posWeight(data.trainset)
        
        system.train_esp(trainset_small, BoW, lexicon)
        dev_predictions = system.test_esp(data.devset, BoW, lexicon)
        test_predictions = system.test_esp(data.testset, BoW, lexicon)

    # NOTE(review): any other language leaves dev_predictions unbound and
    # raises NameError below — confirm callers only pass these two languages.
    dev_gold_labels = [sent['gold_label'] for sent in data.devset]
    test_gold_labels = [sent['gold_label'] for sent in data.testset]
    
    print("{}: {} training - {} dev".format(language, len(trainset_small), 
          len(data.devset)))
    report_score(dev_gold_labels, dev_predictions)
    print("{}: {} training - {} test".format(language, len(trainset_small), 
          len(data.testset)))
    report_score(test_gold_labels, test_predictions)
def execute_demo(language):
    """Train the RNN baseline and report its dev-set score.

    The commented-out sections below are alternative models (LSTM, TF, etc.)
    that were swapped in/out by hand during experimentation."""
    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

    # for sent in data.trainset:
    #    print(sent['sentence'], sent['target_word'], sent['gold_label'])

#for baselineLSTM
    # baselineLSTM = BaselineLSTM(language)

    # baselineLSTM.train(data.trainset)

    # predictions = baselineLSTM.test(data.devset)
#for baselineNew
    # baselineNew = BaselineNew(language)

    # baselineNew.train(data.trainset)

    # predictions = baselineNew.test(data.devset)

#for baseline
    # baseline = Baseline(language)

    # baseline.train(data.trainset)

    # predictions = baseline.test(data.devset)

#for baselineTF
    # baselineTF = BaselineTf(language)

    # baselineTF.train(data.trainset)

    # predictions = baselineTF.test(data.devset)

#for baselineRNN — the only model actually active.
    baselineRNN = BaselineRNN(language)

    baselineRNN.train(data.trainset)

    predictions = baselineRNN.test(data.devset)

    #gold_label is the binary result 
    gold_labels = [sent['gold_label'] for sent in data.devset]

    report_score(gold_labels, predictions)
Esempio n. 13
0
def execute(language):
    """Train CWI, score it on the test split, and plot the cumulative
    accuracy ("learning rate") over test instances with matplotlib."""
    data = Dataset(language)
    instance = CWI(language)
    # baseline = Baseline(language)
    print("{}: {} training - {} dev - {} test".format(language,
                                                      len(data.trainset),
                                                      len(data.devset),
                                                      len(data.testset)))

    instance.train(data.trainset)
    predictions = instance.test(data.testset)
    # baseline.train(data.trainset)
    # predBaseline = baseline.test(data.testset)
    gold_labels = [sent['gold_label'] for sent in data.testset]

    # Running accuracy: cumulative correct count divided elementwise by the
    # number of instances seen so far (1..N).
    accuracy = numpy.cumsum([
        prediction == sent
        for sent, prediction in zip(gold_labels, predictions)
    ]) / range(1,
               len(data.testset) + 1)
    # accuracy = numpy.cumsum([ prediction == sent for sent, prediction in zip(gold_labels, predBaseline) ]) / range(1,len(data.testset)+1)

    print("For", language, "language:")
    report_score(gold_labels, predictions)
    # report_score(gold_labels, predBaseline)

    # The first 10 points are skipped below to avoid the noisy start-up region.
    plt.figure("Learning graphs"
               )  #Creates the plot and set the title to Learning Graphs
    graph = plt.subplot2grid((1, 1), (0, 0))  #Create subplot in 0 coordinate
    title = "Learning Rate for Complex Words Identification of " + language  #Define title
    graph.set_title(title)  #Set the title of plot
    graph.plot(100. * accuracy[10:], 'g-',
               label="Accuracy")  #Plot line for accuracy
    graph.set_yscale('linear')  #set y scale linear
    graph.set_ylabel('Accuracy')  #Set y label Accuracy
    graph.set_xscale('linear')  #Set x scale linear
    graph.set_xlabel('Iterations')  #Set x label Iterations
    legend = plt.legend(loc='upper right')  #Declare the position of legends
    for label in legend.get_texts():
        label.set_fontsize('small')  #setting font size of label to small
    for label in legend.get_lines():
        label.set_linewidth(1)  #Setting line width of legend to 1
    plt.grid(True)
    plt.show()
Esempio n. 14
0
def execute_demo(language, algor):
    """Train Baseline(algor) with frequency and POS dictionaries; score on
    the test split.

    BUG FIX: the header previously said "dev" while printing the test-set
    size; the label now matches the split actually evaluated.
    """
    data = Dataset(language)

    print("{}: {} training - {} test".format(language, len(data.trainset),
                                             len(data.testset)))

    baseline = Baseline(language, algor)

    # Build lookup dictionaries over train + test (concatenated once instead
    # of twice) so that test-time lookups never miss.
    combined = data.trainset + data.testset
    freqdict1 = baseline.freqdict(combined)
    posindex1 = baseline.posdict(combined)

    baseline.train(data.trainset, freqdict1, posindex1)
    predictions = baseline.test(data.testset, freqdict1, posindex1)

    gold_labels = [sent['gold_label'] for sent in data.testset]
    report_score(gold_labels, predictions)
Esempio n. 15
0
def execute_demo(language, is_baseline = True, use_test = False):
    """Train the baseline or final Model and report its score on one split.

    Args:
        is_baseline: selects the baseline model when True, else the final one.
        use_test: evaluate on the test split when True, else on dev.
    """
    data = Dataset(language)
    model = Model(language, is_baseline)
    model.train(data.trainset)

    mod = "baseline" if is_baseline else "final"
    evalset = data.testset if use_test else data.devset

    if use_test:
        print("Evaluating {} model on {} using Test set".format(mod, language))
    else:
        print("Evaluating {} model on {} using Development set".format(mod, language))

    predictions = model.test(evalset)
    gold_labels = [sent['gold_label'] for sent in evalset]
    print("{} instances of training data, {} instances of evaluation data".format(len(data.trainset), len(evalset)))

    report_score(gold_labels, predictions, detailed = False)
Esempio n. 16
0
def execute_demo(language):
    """Score both the Baseline and an SVM on the test split (detailed)."""
    data = Dataset(language)

    print("{}: {} training - {} test".format(language, len(data.trainset),
                                             len(data.testset)))

    # Shared gold labels for both systems.
    gold_labels = [sent['gold_label'] for sent in data.testset]

    baseline = Baseline(language)
    baseline.train(data.trainset)
    baseline_preds = baseline.test(data.testset)
    report_score(gold_labels, baseline_preds, True)

    svm = SVM(language)
    svm.train(data.trainset)
    svm_preds = svm.test(data.testset)
    report_score(gold_labels, svm_preds, True)
Esempio n. 17
0
def execute_demo(language):
    """Train the baseline, then print detailed scores on dev and test."""
    data = Dataset(language)

    print("{}: {} training - {} dev - {} test".format(language,
                                                      len(data.trainset),
                                                      len(data.devset),
                                                      len(data.testset)))

    baseline = Baseline(language)
    baseline.train(data.trainset)

    # Dev-set pass (fine-tuning score).
    dev_preds = baseline.test(data.devset)
    dev_gold = [s['gold_label'] for s in data.devset]
    print("Fine-tuned Score - Dev Set")
    report_score(dev_gold, dev_preds, detailed=True)

    # Final held-out test pass.
    test_preds = baseline.test(data.testset)
    test_gold = [s['gold_label'] for s in data.testset]
    print("Final Score - Test Set")
    report_score(test_gold, test_preds, detailed=True)
Esempio n. 18
0
def execute_improve(language):
    """Train the Improved system and print detailed dev and test scores."""
    data = Dataset(language)

    print("{}: {} training - {} dev - {} test".format(language,
                                                      len(data.trainset),
                                                      len(data.devset),
                                                      len(data.testset)))

    improved = Improved(language)
    improved.train(data.trainset)

    # Development split first (fine-tuning score).
    dev_preds = improved.test(data.devset)
    dev_gold = [s['gold_label'] for s in data.devset]
    print("Fine-tuned Score")
    report_score(dev_gold, dev_preds, detailed=True)

    # Held-out test split (final score).
    test_preds = improved.test(data.testset)
    test_gold = [s['gold_label'] for s in data.testset]
    print("Final Score")
    report_score(test_gold, test_preds, detailed=True)
Esempio n. 19
0
def execute_demo(language):
    """Train Improved_system and report scores on the dev and test splits.

    BUG FIX: the header previously said "test" while printing the dev-set
    size; the label now matches the split whose size is shown.
    """
    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.devset)))

    baseline = Improved_system(language, data.trainset)
    baseline.train(data.trainset, data.devset)

    # Development-set evaluation.
    predictions = baseline.test(data.devset)
    gold_labels = [sent['gold_label'] for sent in data.devset]
    report_score(gold_labels, predictions)

    # Held-out test-set evaluation.
    predictions = baseline.test(data.testset)
    gold_labels = [sent['gold_label'] for sent in data.testset]
    report_score(gold_labels, predictions)
Esempio n. 20
0
def execute_demo(language, amountdata=100):
    """Train the Baseline and Improved systems, score both on dev and test,
    and return the joint mispredictions for error analysis.

    Returns:
        (results, results2): tuples of (baseline_pred, improved_pred, gold,
        target_word) — dev tuples filtered to cases BOTH models got wrong;
        test tuples unfiltered.
    """
    data = Dataset(language, amountdata)

    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.devset)))

    print('\nInitialising')
    baseline = Baseline(language)
    improved = Improved(language)

    print('Training')
    baseline.train(data.trainset)
    improved.train(data.trainset)

    print('Predicting')
    predictions = baseline.test(data.devset)
    predictionImp = improved.test(data.devset)
    gold_labels = [sent['gold_label'] for sent in data.devset]
    target = [sent['target_word'] for sent in data.devset]

    print("\nScore for baseline:")
    report_score(gold_labels, predictions)
    print("Score for improved model:")
    report_score(gold_labels, predictionImp)

    print('Predicting on testset')
    predictions2 = baseline.test(data.testset)
    predictionImp2 = improved.test(data.testset)
    gold_labels2 = [sent['gold_label'] for sent in data.testset]
    target2 = [sent['target_word'] for sent in data.testset]

    print("\nScore for baseline:")
    report_score(gold_labels2, predictions2)
    print("Score for improved model:")
    report_score(gold_labels2, predictionImp2)

    results = [(predictions[i], predictionImp[i], gold_labels[i], target[i])
               for i in range(len(target))]
    ####to show wrong predictions: keep only cases both models missed
    results = [tup for tup in results if tup[0] != tup[2] and tup[1] != tup[2]]

    results2 = [(predictions2[i], predictionImp2[i], gold_labels2[i],
                 target2[i]) for i in range(len(target2))]
    return results, results2
Esempio n. 21
0
def execute_system(language, modelName, featureSet):
    """Build, train and evaluate one System configuration on the test set."""
    data = Dataset(language)

    print("{}: {} training - {} test".format(language, len(data.trainset),
                                             len(data.testset)))
    print("Features: {}".format(featureSet))
    print("Model: {}".format(modelName))

    system = System(language, modelName, featureSet)

    print("Training...")
    system.train(data.trainset)

    print("Testing...")
    preds = system.test(data.testset)
    gold = [sent['gold_label'] for sent in data.testset]

    # report_score prints the detailed breakdown; its return value is unused.
    score = report_score(gold, preds, detailed=True)
Esempio n. 22
0
def execute_demo(language):
    """Score feature-based models, word2vec-based models, and several
    hard-voting ensembles on the dev set.

    Each model's .test() is assumed to return (name, labels) pairs where the
    labels are strings ('0'/'1') — TODO confirm against Baseline/Word2vec.
    """
    data = Dataset(language)

    print("{}: {} training - {} dev".format(language, len(data.trainset), len(data.devset)))

#    trainset = data.trainset[:int(len(data.trainset)*1/100)]
    print('Feature based models')
    baseline = Baseline(language)
    
    print('Training models')
    baseline.train(data.trainset)
#    baseline.train(trainset)

    print('Predicting labels')
    predictions = baseline.test(data.devset)

    # Convert each model's string labels to ints for vote averaging.
    predictions_int =[]
    for pred in predictions:
        pred_int = []
        for val in pred[1]:
            pred_int.append(int(val))
        predictions_int.append(pred_int)

    gold_labels = [sent['gold_label'] for sent in data.devset]
#    target_words = [sent['target_word'] for sent in data.devset]
        
    print('Calculating scores')
    for pred in predictions:
        print('Scores for' ,pred[0])
        report_score(gold_labels, pred[1])
    
    # Hard voting: mean of the 0/1 votes per instance, rounded back to a label.
    print('Scores for hard voting with all models')
    avg_pred_int = np.mean(np.array(predictions_int), axis = 0).tolist()
    avg_pred = [str(round(val)) for val in avg_pred_int]
    report_score(gold_labels, avg_pred)
    
#   Word2vec based models
    
    print('Word2vec based models')
    print('Loading w2v')
    w2v = Word2vec(language)
    
    print('Training models')
    w2v.train(data.trainset)
#    w2v.train(trainset)
    
    print('Predicting labels')    
    predictions_w2v = w2v.test(data.devset)
    
    predictions_w2v_int =[]
    for pred in predictions_w2v:
        pred_int = []
        for val in pred[1]:
            pred_int.append(int(val))
        predictions_w2v_int.append(pred_int)
    
    print('Calculating scores')
    for pred in predictions_w2v:
        print('Scores for' ,pred[0])
        report_score(gold_labels, pred[1])
    
    print('Scores for hard voting with all models')
    avg_pred_w2v_int = np.mean(np.array(predictions_w2v_int), axis = 0).tolist()
    avg_pred_w2v = [str(round(val)) for val in avg_pred_w2v_int]
    report_score(gold_labels, avg_pred_w2v)
    
    # Append the feature-based votes into the w2v vote matrix so the final
    # ensemble votes over BOTH model families.
    for pred in predictions:
        pred_int = []
        for val in pred[1]:
            pred_int.append(int(val))
        predictions_w2v_int.append(pred_int)
    
    print('Scores for hard voting with both types of models')
    avg_pred_all_int = np.mean(np.array(predictions_w2v_int), axis = 0).tolist()
    avg_pred_all = [str(round(val)) for val in avg_pred_all_int]
    report_score(gold_labels, avg_pred_all)
Esempio n. 23
0
File: main.py Progetto: hesah89/NLP
            elif model == 0:  # to skip the language
                continue
            #######################################################################
            scheme.train(data.trainset)
            predictions = scheme.test(dataSet)
            gold_labels = [sent['gold_label'] for sent in dataSet]

            # collect data for the training rate graph
            gmodel = language + str(model) + dataSetName
            graphData[gmodel] = cumsum([ pred == sent['gold_label']
                    for sent, pred in zip(dataSet, predictions) ]) / range(1,len(dataSet)+1)

            # collect and print results
            print("Using model", model)
            macroF1, accuracy = report_score(gold_labels, predictions)
            results.append([model, macroF1, 100.*accuracy, language, dataSetName])

            # log failures
            model = language + str(model) + dataSetName
            failures[model] = [
                    (sent['gold_label'], sent['target_word'])
                    for pred, sent in zip(predictions, dataSet)
                    if pred != sent['gold_label'] ]
            
            failures[model] = sorted(failures[model], key=lambda x: x[0])  # sort alphabetically

# print training rate graphs
from pylab import *
for language in languages:
    if   language == 'english':
Esempio n. 24
0
# Evaluate the trained classifier on the stance-detection test set and
# report both a raw hit ratio and the official scorer's result.
test_data_set = DataSet(path="data",
                        bodies="train_bodies.csv",
                        stances="test_stances.csv")
test_segments = segmentize_dataset(test_data_set)
entries = zip_segments(test_segments)
test_classifications = []
stance_features = []
predictions = []
for entry in tqdm(entries[:TESTING_SIZE]):
    headline, body, classification = entry
    prediction = classifier.predict(headline, body)
    predictions.append(prediction)
    if prediction != classification:
        # Log every misclassification for later inspection.
        logging.debug("Headline: {0}\n".format(headline))
        logging.debug("Body: {0}\n".format(body))
        logging.debug("correct: {0}, predicted: {1}\n\n\n".format(
            classification, prediction))

    test_classifications.append(classification)

# BUG FIX: zip() returns a lazy iterator in Python 3, so the original
# len(results) call raised TypeError; materialize the pairs as a list.
results = list(zip(predictions, test_classifications))
hits = sum(1 for p, tc in results if p == tc)

# NOTE(review): this prints a fraction in [0, 1], not a percentage — confirm
# whether a *100 was intended before changing the output format.
print("Percentage correct: {0}%".format(float(hits) / float(len(results))))
score = Scorer.report_score(test_classifications, predictions)
print(score)
def execute_sys(language):
    """Collect character/affix/POS features from the training data, train the
    System, score it on the test split, and return each ((target_word,
    gold_label), predicted_label) pair.

    BUG FIX: np.asscalar() was removed in NumPy 1.23; replaced with the
    documented equivalent ndarray.item().
    """
    data = Dataset(language)

    # NOTE(review): header label says "dev" but prints the test-set size —
    # confirm which split was intended.
    print("{}: {} training - {} dev".format(language, len(data.trainset),
                                            len(data.testset)))

    ### feature selection: dicts used as ordered sets of observed features
    training = []
    suffix = {}
    vowels_combo = {}
    pos_tags = {}

    chars = {}
    bigrams = {}
    trigrams = {}

    # Accented vowels included so Spanish targets are covered too.
    vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú']
    for sent in data.trainset:
        training.append((sent['target_word'], sent['gold_label']))
        tokenised = sent['target_word'].split(' ')

        for wd in tokenised:

            ### vowels features: every maximal run of consecutive vowels
            target = wd.lower()
            temp_combo = ''
            for char in target:
                if char in vowels:
                    temp_combo += char
                elif len(temp_combo) > 0:
                    vowels_combo[temp_combo] = 0
                    temp_combo = ''

            ### suffix features: last three characters of the word
            suffix[target[-3:]] = 0

            try:
                tag = nltk.pos_tag(nltk.word_tokenize(wd))[0][1]
                pos_tags[tag] = 0
            except IndexError:
                # word_tokenize yielded no tokens (e.g. punctuation-only word)
                pass

            for i in range(len(target)):
                chars[target[i]] = 0
            ### char bigram
            for i in range(len(target) - 1):
                bigrams[target[i] + target[i + 1]] = 0
            ### char trigram
            for i in range(len(target) - 2):
                trigrams[target[i] + target[i + 1] + target[i + 2]] = 0

    vowels_combo_list = list(vowels_combo.keys())
    suffix_to_list = list(suffix.keys())
    # Drop short-word suffixes so only true 3-char endings remain.
    suffix_list_len3 = [s for s in suffix_to_list if len(s) == 3]

    # Feature switches: only upper-case, all-chars and bigram features are on.
    sys_run = System(language,
                     Baseline_run=False,
                     vowels=False,
                     v_list=vowels_combo_list,
                     syllables=False,
                     upper=True,
                     suffix=False,
                     s_list=suffix_list_len3,
                     vc_ratio=False,
                     pos=False,
                     pos_dict=pos_tags,
                     all_chars=True,
                     all_chars_dict=chars,
                     bigrams=True,
                     bigrams_dict=bigrams,
                     trigrams=False,
                     trigrams_dict=trigrams)

    gold_labels = [sent['gold_label'] for sent in data.testset]

    sys_run.train(data.trainset)

    predictions = sys_run.test(data.testset)

    score = report_score(gold_labels, predictions, detailed=False)

    print(score)

    # Pair each (target word, gold label) with the predicted label.
    words = []
    for sent in data.testset:
        words.append((sent['target_word'], sent['gold_label']))
    predict = []
    for x in np.nditer(predictions):
        # .item() converts the 0-d array element to a plain Python scalar
        # (direct replacement for the removed np.asscalar).
        predict.append(x.item())
    word_pred = []
    for i in range(len(predict)):
        word_pred.append((words[i], predict[i]))

    return word_pred
Esempio n. 26
0
    sess.run(tf.global_variables_initializer())
    total_loss = 0
    # start training
    for i in range(30000):
        # get batch to learn easily
        batch_x, batch_y = train.next_batch(batch_size_train)
        feed_dict = {
            x: batch_x,
            onehot_labels: batch_y,
            keep_prob: train_keep_prob
        }
        _, current_loss = sess.run([opt_op, loss], feed_dict=feed_dict)
        total_loss += current_loss
        if i % 50 == 0:
            print(
                str(i) + " : " +
                str(compute_accuracy(validation.input, validation.labels)))

    #sess = tf.Session()
    print("Test accuracy : " + str(compute_accuracy(test.input, test.labels)))
    # input v_x to nn and get the result with y_pre
    y_pre = sess.run(prediction, feed_dict={x: test.input})
    # find how many right
    with tf.Session():
        predicted = tf.argmax(y_pre,
                              1).eval()  # transoform from tensor to np array
        actual = tf.argmax(test.labels, 1).eval()
        LABELS = ['agree', 'disagree', 'discuss']
        report_score([LABELS[e] for e in actual],
                     [LABELS[e] for e in predicted])
Esempio n. 27
0
import numpy
from pylab import *

# For each language: train the Features model, score it on the test split,
# and plot the cumulative ("learning rate") accuracy curve.
for lang in ["english", "spanish"]:
    data = Dataset(lang)
    model = Features(lang)
    # baseline = Baseline(lang)

    # baseline.train(data.trainset)
    # paseline = baseline.test(data.testset)
    
    
    print("{}: {} training - {} dev - {} test".format(lang, len(data.trainset), len(data.devset), len(data.testset)))

    model.train(data.trainset)
    predictions = model.test(data.testset)
    gold_labels = [sent['gold_label'] for sent in data.testset]

    # Running accuracy: cumulative correct count divided by instances seen.
    pl = numpy.cumsum([predic == sent['gold_label'] for sent, predic in zip(data.testset, predictions) ]) / range(1,len(data.testset)+1)
    # pl = numpy.cumsum([predic == sent['gold_label'] for sent, predic in zip(data.testset, paseline) ]) / range(1,len(data.testset)+1)

    report_score(gold_labels, predictions)
    # report_score(gold_labels, paseline)
    
    
    # The first 20 points are skipped to hide the noisy start of the curve.
    plt.title('graph for learning rate')
    plt.plot(100*pl[20:])
    plt.ylabel('accuracy score')
    plt.xlabel('iteration')
    plt.show()
Esempio n. 28
0
def execute_demo(language):
    """Train and evaluate the Baseline and Advanced models for *language*.

    Trains each model on the training set, reports its score on the dev
    set, and — when the ``debug`` flag below is enabled — prints a
    per-sentence error analysis comparing the two models.
    """
    data = Dataset(language)

    # Evaluate on the dev set; switch to data.testset for final numbers.
    test_data = data.devset

    print("{}: {} training - {} test".format(
        language, len(data.trainset), len(test_data)))

    baseline = Baseline(language)
    advanced = Advanced(language)

    models_to_run = [baseline, advanced]
    model_mistakes = {}

    gold_labels = [sent['gold_label'] for sent in test_data]

    # Kept around for the error-analysis printout below.
    sentences = [sent['sentence'] for sent in test_data]
    targets = [sent['target_word'] for sent in test_data]

    model_predictions = {}

    debug = False             # flip on to print per-sentence error analysis
    show_importances = False  # flip on to print RFC feature importances

    for model in models_to_run:
        # Train once and keep whatever train() returns.
        # (The original called train() twice back-to-back, doubling the
        # training cost just to capture the return value.)
        trained = model.train(data.trainset)

        # Feature importances only exist for the English Advanced model,
        # which is the one backed by a random-forest classifier.
        if show_importances and language == 'english' and model is advanced:
            importances = trained.feature_importances_
            ordered_feature_list = model.ordered_feature_list
            indices = np.argsort(importances)[::-1]
            for f in range(20):
                print("{}. & {} & ({:0.3}) \\\\ \\hline".format(
                    f + 1, ordered_feature_list[indices[f]],
                    importances[indices[f]]))

        predictions = model.test(test_data)
        model_predictions[model.name] = predictions

        print(model.name)
        report_score(gold_labels, predictions)

        if debug:
            # Record, per sentence index, which models got it wrong.
            look_at = 500
            for sent_i in range(look_at):
                if predictions[sent_i] != gold_labels[sent_i]:
                    model_mistakes.setdefault(sent_i, []).append(model.name)
                else:
                    model_mistakes.setdefault(sent_i, [])

    if debug:
        # Partition sentence indices by which model(s) got them right.
        both_right = []
        advanced_right = []
        baseline_right = []
        both_wrong = []

        for key, value in model_mistakes.items():
            if len(value) == 2:
                both_wrong.append(key)
            elif len(value) == 0:
                both_right.append(key)
            elif value[0] == 'Baseline':
                advanced_right.append(key)
            else:
                baseline_right.append(key)

        # Print up to max_wrong example sentences from each partition.
        # NOTE: identity (`is`) comparisons are required here — the original
        # used `==`, which compares lists by value and misfires whenever two
        # partitions are equal (e.g. both empty).
        max_wrong = 10
        for perm in [both_right, both_wrong, advanced_right, baseline_right]:
            curr_wrong = 0
            for item in perm:
                if curr_wrong == max_wrong:
                    break
                curr_wrong += 1

                sent = sentences[item]
                target = targets[item]
                gold = gold_labels[item]
                if perm is advanced_right:
                    predict = model_predictions['Advanced'][item]
                else:
                    predict = model_predictions['Baseline'][item]

                if perm is advanced_right:
                    perm_name = 'Advanced Correct, Baseline Incorrect'
                elif perm is baseline_right:
                    perm_name = 'Advanced Incorrect, Baseline Correct'
                elif perm is both_right:
                    perm_name = 'Both Correct'
                else:
                    perm_name = 'Both Incorrect'

                print("{}:\n Sent: {}\n Target: {}\n Predicted: {}\n Gold: {}\n".format(
                    perm_name, sent, target, predict, gold))
Esempio n. 29
0
# Perceptron demo: train on the English training split, evaluate on the
# news test split.
data = read_data('./datasets/english/English_Train.tsv')
dataset = build_dataset(data)  # renamed from `all`, which shadowed the builtin

progress('training perceptron')

per = Perceptron(eta=0.01,
                 epochs=opts.epochs,
                 avg=opts.avg,
                 shuffle=opts.shuffle)
per.train(dataset)

progress('perceptron trained in', per.convergance_epochs(), '/', opts.epochs,
         '(max) epochs')

if opts.graph:
    per.plot_training_error()

progress('reading test data')

data = read_data('./datasets/english/News_Test.tsv')
dataset = build_dataset(data)

progress('testing')

# Gold binary labels vs. predictions clamped at zero (the raw perceptron
# score can be negative; labels are non-negative).
gold = [x[2]['gsbin'] for x in dataset]
pred = [max(0, per.predict(x[0])) for x in dataset]

progress('evaluating')

report_score(gold, pred, detailed=True)
Esempio n. 30
0
            predictions.append(str(category_index))
        '''
        for i in range(n_predictions):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])
        '''


gold_label_dev = []
for sent in eng_dataset.devset:
    predict(sent['target_word'])
    gold_label_dev.append(sent['gold_label'])
print('The result for development dataset')
report_score(gold_label_dev, predictions)
predictions = []

gold_label_test = []
for sent in eng_dataset.testset:
    predict(sent['target_word'])
    gold_label_test.append(sent['gold_label'])
print('The result for test dataset')
report_score(gold_label_test, predictions)

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

plt.figure()
plt.plot(all_losses)
plt.show()