Example 1
def crossValidation(numIter, testProp):
	"""
	Input: The number of iterations to run and the proportion of sentences to hold out as the test set.
	Output: The average proportion of correctly parsed sentences
	"""
	numTest = int(testProp * 1000)
	numList = list(range(1000))
	counter = [0]*numIter

	# repeat numIter times
	for i in range(numIter):
		print("Starting iteration: ", i)

		# Shuffle list of indices of the dependency treebank sentences and set the first numTests
		# as the test sentences.
		random.shuffle(numList)
		tests = numList[:numTest]
		grams = numList[numTest:]
		grammar = dependencyGraphsToGrammar([dependency_treebank.parsed_sents()[sent] for sent in grams])
		parser = malt_parser.maltparser(grammar)

		# try every test sentence. See how the parser does
		for test in tests:
			testSent = parser.parse(dependency_treebank.sents()[test])
			refSent = dependency_treebank.parsed_sents()[test].tree()
			if compareDependencyTrees(testSent,refSent):
				counter[i] += 1

	# return the average score of all of the iterations.
	return sum(counter)/(numTest*numIter)
Example 2
def main():    
    '''
    Read and print sentence
    '''
    parsed = dp.parsed_sents(FILE)[0]
    tree = parsed.tree()
    print(tree.pprint())
Example 3
 def __init__(self, split_ratios=(0.9, 0.1, 0.0)):
     """
     Initialize the train, validation and test set
     :param split_ratios: (train, test, validation)
     """
     self.split_ratios = split_ratios
     self.sentences = dependency_treebank.parsed_sents()
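The snippet stores the ratios but never applies them; a minimal sketch of how the rest of such an __init__ might slice the corpus, assuming the (train, test, validation) order given in the docstring (the attribute names self.train, self.test and self.validation are illustrative, not from the original):

     n = len(self.sentences)
     train_end = int(n * split_ratios[0])
     test_end = train_end + int(n * split_ratios[1])
     self.train = self.sentences[:train_end]        # e.g. first 90% with the default ratios
     self.test = self.sentences[train_end:test_end]
     self.validation = self.sentences[test_end:]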
Example 4
def test_model(folders, vectorizer, model):
    """
    Takes a list of folders from which to draw data files to test the model.
    Parses sentences by iteratively looking at target nodes in the remaining 
    subtrees of the sentence. For each pair, the algorithm derives a list of 
    features, and uses the model to predict a construction action. The function
    then analyzes the performance of the model and prints out results.
    ========== INCOMPLETE ==========
    """
    for filepath in data_file_paths_for_folders(folders):
        for sentence in dt.parsed_sents(filepath):
            T = flattened_node_list(sentence)
            i = 0
            no_construction = True
            while len(T) >= 1:
                if i == len(T) - 1:
                    if no_construction:
                        break
                    no_construction = True
                    i = 0
                else:
                    target_features = get_contextual_features(T, i)
                    target_classification = estimate_classification(target_features, vectorizer, model)
                    construction(T, i, target_classification)
                    if target_classification != SHIFT:
                        no_construction = False
                i += 1
Example 5
 def fromRaw():
     for sent in dependency_treebank.parsed_sents():
         
         g = prune(graph(sent))
         
         if g.number_of_nodes()-1 <= 10:
             yield (g,sent)
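The helpers graph() and prune() are not part of this snippet; g.number_of_nodes() suggests a networkx graph, so graph() plausibly looks something like the sketch below (an assumption, with prune() left out), given the standard NLTK DependencyGraph node layout:

import networkx as nx

def graph(sent):
    # Build a directed head -> dependent graph with one node per token (address 0 is the root).
    g = nx.DiGraph()
    for address, node in sent.nodes.items():
        g.add_node(address, word=node['word'], tag=node['tag'])
        if node['head'] is not None:
            g.add_edge(node['head'], address)
    return g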
Example 6
 def __init__(self, num_iterations):
     self.num_iterations = num_iterations
     corpus = dependency_treebank.parsed_sents()
     sep = int(len(corpus) * (9 / 10))
     self.train_set, self.test_set = corpus[:sep], corpus[sep:]
     self.word_vec, self.tag_vec = self.word_tag_vectors_init()
     self.num_words, self.num_tags = len(self.word_vec), len(self.tag_vec)
     self.cur_weights = Counter()
     self.aggregate_weights = Counter()  # sum of weights
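The aggregate_weights counter ("sum of weights") suggests an averaged perceptron; a hedged sketch of how the averaged weights would typically be recovered at the end of training (the names below are illustrative, not from the snippet):

     # Illustrative only: average the summed weights over the total number of updates.
     total_updates = self.num_iterations * len(self.train_set)
     averaged_weights = Counter({feat: w / total_updates
                                 for feat, w in self.aggregate_weights.items()})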
Example 7
def sample_dep_parse():
    from nltk.corpus import dependency_treebank
    p = dependency_treebank.parsed_sents()[0]
    
    ''' with 'rel' from Stanford dependencies:
    [{'address': 0, 'deps': [8], 'rel': 'TOP',           'tag': 'TOP', 'word': None},
     {'address': 1, 'deps': [],  'rel': 'nn', 'head': 2, 'tag': 'NNP', 'word': Pierre},
      ...]
    '''

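The comment above spells out the per-node dictionary layout; inside the same function the structure can also be inspected programmatically (in current NLTK versions p.nodes is a dict keyed by token address):

    for address, node in sorted(p.nodes.items()):
        print(address, node['word'], node['tag'], node.get('head'), node['deps'])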
Example 8
def test():
    '''
    test model
    '''
    # load trained model and features
    with open('train.p', 'rb') as file:
        trained_model = pickle.load(file)
    with open('features.p', 'rb') as file:
        features = pickle.load(file)
    # load test data
    truelist = dp.parsed_sents(testfile)
    predictedlist = []
    # main loop
    print "Running main loop..."
    k = 0
    for true in truelist:
        k += 1
        print str(k) + " of " + str(len(truelist)) + " sentences"
        # erase tree structure
        Z = nodes(true)
        T = erase_structure(true)
        # initialize
        i = 1
        X = []
        j = 1
        Y = ""
        no_construction = False
        # build dependency structure
        print
        while len(Z) > 2 and j < 200:  # the 200 is used to prevent infinite looping for now
            j += 1
            if i == 1:
                if no_construction:
                    break
                no_construction = True
            x = get_contextual_features(T, Z, i, 2, 2)
            y = estimate_action(trained_model, features, x)
            Y += y
            (T, Z, i) = construction(T, Z, i, y)
            if y != 'S':
                no_construction = False
        predictedlist.append(T)
    # measure performance
    print "Calculating performance..."
    (dep_acc, root_acc, comp_acc) = accuracy(truelist, predictedlist)
    # print output
    print "Dependency accuracy: " + str(dep_acc)
    print "Root accuracy: " + str(root_acc)
    print "Complete accuracy: " + str(comp_acc)
Example 9
def import_data():

    #iterate through directory to get all the data
    '''for subdir, dirs, files in os.walk('./dep_treebank'):
        for row in files:
            for f in row:
                print f'''

    #example of what to do for each file
    path = "../../../../../Users/lurke/Documents/Harvard/Senior/CS187/final/dep_treebank"
    f = path + "/00/wsj_0001.mrg"
    t = dt.parsed_sents(f)[0]
    print t
Example 12
def main():
    '''Main function: either trains the model or tests it on a dataset.'''
    if currently_training:
        sents = sum([dp.parsed_sents(testfile) for testfile in testfiles], [])
        train = Train()
        p = Parser(train, lcontext, rcontext)
        trees = do_parse(p, sents)

        models = gen_svc(train)
        pkl = open('models.pkl','wb')
        pickle.dump(models, pkl)
        pkl.close()

    else:
        models = pickle.load(open('models.pkl','rb'))
        predict = Predict(models)
        p = Parser(predict, lcontext, rcontext)
        testfiles2_dir = INPATH + '23/'
        testfiles2 = [testfiles2_dir + file for file in os.listdir(testfiles2_dir)]
        sents = sum([dp.parsed_sents(testfile) for testfile in testfiles2], [])
        trees_predict = do_parse(p, sents)

        print 'ACCURACIES:'
        print accuracy(sents, trees_predict)
Example 13
def main():
    trees = dependency_treebank.parsed_sents()
    sents = list(dependency_treebank.sents())

    # for sent in sents_train:
    #     print(sent)
    #
    # TESTER LINES
    sents = sents
    trees = trees
    #############
    split_index = int(len(sents) * 0.9)
    trees_train = trees[:split_index]
    trees_test = trees[split_index:]
    sents_train = sents[:split_index]
    sents_test = sents[split_index:]

    corpus, POS_tags = get_corpus_and_tags_from_trees(trees)
    word_matrix_size, word_map = create_matrix(corpus)
    tag_matrix_size, tag_map = create_matrix(POS_tags)
    f = 0

    t_list_feat_dict = create_list_of_feature_dic(
        sents_test,
        word_matrix_size,
        word_map,
        tag_matrix_size,
        tag_map,
        pickles=True,
        pkl_path="list_feat_dict_test.pkl")
    # Q3
    w = perceptron(trees_train, sents_train, word_matrix_size, word_map,
                   tag_matrix_size, tag_map)
    print(np.sum(w))
    # Q4
    # w4 = perceptron(trees_train, sents_train, word_matrix_size, word_map, tag_matrix_size, tag_map, augmented=True)

    # Evaluate:
    score = evaluate(theta=w,
                     tree_test=trees_test,
                     sent_test=sents_test,
                     t_list_feat_dict=t_list_feat_dict)
    print(score)
Example 14
def main():
    sents = sum([dp.parsed_sents(testfile) for testfile in testfiles], [])
    train = Train()
    p = Parser(train)
    trees = do_parse(p, sents)

    vec, svc = gen_svc(train)
    predict = Predict(vec, svc)
    p = Parser(predict)
    trees_predict = do_parse(p, sents)

    for train,predict,actual in zip(trees, trees_predict, sents):
        if train != predict:
            print train
            print predict

        if train != actual.tree():
#            print train
#            print actual.tree()
            pass
Example 16
def train():
    '''
    train model
    '''
    Y = []
    X = []
    print "Main loop"
    for section in SECTIONS:
        print "section " + str(section)
        inpath = INPATH + "/" + section + "/"
        for file in os.listdir(inpath):
            print "file " + str(file)
            file = inpath + file
            parsed_file = dp.parsed_sents(file)
            nsents = 1
            for sentence in parsed_file:
                nsents += 1
                Z = nodes(sentence)
                T = erase_structure(sentence)
                i = 1
                j = 1
                while len(Z) > 2 and j < 200:
                    j += 1
                    y = get_action(sentence, T, Z, i)
                    x = get_contextual_features(T, Z, i, 2, 2)
                    X.append(x)
                    Y.append(y)
                    (T, Z, i) = construction(T, Z, i, y)
    vec = DictVectorizer()
    print "Converting features to sparse matrix..."
    xx = vec.fit_transform(X)
    features = vec.get_feature_names()
    trained_model = LinearSVC()
    print "SVM learning..."
    trained_model.fit(xx, Y)
    print "Saving..."
    with open('train.p', 'wb') as f:
        pickle.dump(trained_model, f)
    with open('features.p', 'wb') as f:
        pickle.dump(features, f)
    print "Training completed"
Example 18
def read_dependency_treebank():
    """Read the dependency treebank included in NLTK, and convert it to a
    simplified format.
    
    A sentence x is represented as a list of word/part-of-speech pairs, the
    first of which is a special dummy token '<TOP>' which will be the root of
    every parse tree. A parse tree y is represented as a list of integers corresponding to the
    positions of the heads of the respective tokens; the first integer in this
    list is always -1, meaning that the dummy token has no head.

    For instance, if we have the sentence "John sleeps.", it will be represented
    as the list 

        [('<TOP>', '<TOP>'), ('John', 'NNP'), ('sleeps', 'VBZ'), ('.', '.')]

    and its parse tree will be

        [-1, 2, 0, 2]

    """
    XY = (convert_dependency_tree(t) for t in dependency_treebank.parsed_sents())
    X, Y = (list(t) for t in zip(*XY))
    return X, Y
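convert_dependency_tree is not shown here; a minimal sketch of what it might look like, following the format described in the docstring and the standard NLTK DependencyGraph layout (address-keyed nodes with 'word', 'tag' and 'head' fields):

def convert_dependency_tree(graph):
    # Position 0 is NLTK's artificial top node; replace it with the dummy token / head -1.
    x = [('<TOP>', '<TOP>')]
    y = [-1]
    for address in sorted(graph.nodes)[1:]:
        node = graph.nodes[address]
        x.append((node['word'], node['tag']))
        y.append(node['head'])
    return x, y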
Example 19
def train_model(folders):
    """
    Takes a list of folders from which to draw data files to train the model.
    Parses sentences in a similar way to when testing, by iteratively looking at
    target nodes in the remaining subtrees of the sentence. For each pair, the
    algorithm derives a list of features and a correct construction action. Once
    these are all found, it uses them to generate a model, which is returned.
    ========== INCOMPLETE ==========
    """
    raw_features = []
    classifications = []
    for filepath in data_file_paths_for_folders(folders):
        for sentence in dt.parsed_sents(filepath):
            T = flattened_node_list(sentence)
            i = 0
            no_construction = True
            while len(T) >= 1:
                if i == len(T) - 1:
                    if no_construction:
                        break
                    no_construction = True
                    i = 0
                else:
                    target_features = get_contextual_features(T, i)
                    target_classification = get_classification(T, i, sentence)
                    raw_features.append(target_features)
                    classifications.append(target_classification)
                    construction(T, i, target_classification)
                    if target_classification != SHIFT:
                        no_construction = False
                i += 1
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(raw_features)
    feature_names = vectorizer.get_feature_names()
    model = multiclass.OneVsOneClassifier(svm.LinearSVC())
    model.fit(feature_matrix, classifications)
    return vectorizer, model
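Taken together with test_model from Example 4, the intended usage appears to be along these lines (the folder lists are placeholders, not from the original):

# Hypothetical driver; train_folders/test_folders stand in for real treebank section folders.
vectorizer, model = train_model(train_folders)
test_model(test_folders, vectorizer, model)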
Example 21
import ex3
from nltk.corpus import dependency_treebank
from scipy.sparse import dok_matrix, csr_matrix
import numpy as np
import Node, Edge

corpus_sentences = dependency_treebank.parsed_sents()

training_size = round(len(corpus_sentences) * 0.9)
training_set = corpus_sentences[:training_size]
test_set = corpus_sentences[training_size:]


def test_b():
    tree = training_set[0]
    sentence = ex3.create_sentence(tree)
    node1 = tree.nodes[1]
    node2 = tree.nodes[2]
    f = ex3.feature_function(node1, node2, sentence)
    print(f)


def test_b_and_e():
    edges_set = ex3.calc_right_tree(training_set[0])
    sentence = ex3.create_sentence(training_set[0])
    print(sentence)
    # for edge in edges_set:
    #     print(edge.out_node.word, edge.in_node.word)
    #     print(ex3.feature_function(edge.out_node, edge.in_node, sentence))

    sentence = []
Example 22
 def __init__(self, feature_function):
     self.sents = dependency_treebank.parsed_sents()
     threshold = int(len(self.sents) * 0.9)
     self.train = self.sents[:threshold]
     self.test = self.sents[threshold:]
     self.feature = feature_function
Example 23
                                             arc_score_func)
        learnt_arcs_dict = min_spanning_arborescence_nx(arc_lst, 0)
        set_of_arcs = set([(arc[0], arc[2])
                           for arc in learnt_arcs_dict.values()])
        for tail in graph.nodes.values():
            for head_idx in tail['deps']['']:
                if (tail['address'], head_idx) in set_of_arcs:
                    sum_per_sentence += 1
        sum += sum_per_sentence / (len(graph.nodes) - 1)
    return sum / len(test_set)


if __name__ == '__main__':
    nltk.download('dependency_treebank')
    train_examples, test_examples = \
        train_test_split(dependency_treebank.parsed_sents(), train_size=0.9)

    vocabulary, tags = get_vocabulary_and_tags(train_examples)
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    tag_to_idx = {tag: i for i, tag in enumerate(tags)}

    w1 = perceptron_train(2, 1, train_examples,
                          len(vocabulary)**2 + len(tags)**2, word_to_idx,
                          tag_to_idx, get_graph_feature_vector, arc_score)

    # with open('w1.pickle', 'wb') as f:
    #     pickle.dump(w1, f)
    # pickle_in = open("w1.pickle", "rb")
    # w1 = pickle.load(pickle_in)

    res = evaluate(w1, test_examples, word_to_idx, tag_to_idx, arc_score)
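Example 24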
            )

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, "wb"))"""
        finally:
            remove(input_file.name)


# Evaluate the features comparing performance to the original
print()
print()
print("Optional and advanced part")
print()
transition_parser = TransitionParser("arc-standard")
transition_parser.train(dependency_treebank.parsed_sents()[:100],
                        "transition_parser.model")
parses = transition_parser.parse(dependency_treebank.parsed_sents()[-10:],
                                 "transition_parser.model")
print(len(parses))
dependency_evaluator = DependencyEvaluator(
    parses,
    dependency_treebank.parsed_sents()[-10:])
standard_parser_evaluation = dependency_evaluator.eval()
print(
    f"The scores of the standard TransitionParser are: {standard_parser_evaluation}"
)
print()

my_transition_parser = ModifyiedTransitionParser("arc-standard")
my_transition_parser.train(dependency_treebank.parsed_sents()[:100],
Example 25
    """
    Return a custom score function that obeys the BigramScoreFunction interface.
    It can use information stored in the local file sf.dat
    """
    # Complete this if you want extra credit
    return None

if __name__ == "__main__":

    # Load the score function
    sf = BigramInterpScoreFunction("tb_counts.words", "tb_counts.tag")

    total_right = 0
    total_edges = 0

    for ss in dt.parsed_sents():
        words = list(dependency_element(ss, 'word'))
        tags = list(dependency_element(ss, 'tag'))

        chart = EisnerParser(words, tags, sf)

        chart.initialize_chart()
        chart.fill_chart()

#        for ii, jj in correct_positions(ss):
        for ii, jj in chart.reconstruct():
            # Subtract 1 to account for the head we added
            print(ii, jj, words[ii - 1], words[jj - 1],
                  tags[ii - 1], tags[jj - 1],
                  sf(words[ii - 1], words[jj - 1],
                     tags[ii - 1], tags[jj - 1], ii, jj))
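dependency_element is not defined in this fragment; judging from how it is used above (one attribute per token, with the artificial root excluded), it plausibly looks like this hedged sketch:

def dependency_element(graph, key):
    # Yield e.g. the 'word' or 'tag' of every token in address order, skipping the root at 0.
    for address in sorted(graph.nodes):
        if address == 0:
            continue
        yield graph.nodes[address][key]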
Example 27
"""
Step 1: Gather Dependencies from Penn Treebank Sample and append them to a list of Dependency Structures
"""
from nltk.corpus import dependency_treebank
from nltk.parse import DependencyGraph, ProbabilisticProjectiveDependencyParser

t = dependency_treebank.parsed_sents()
dependency_structures = []
for i in range(0, len(t)):
    dependency_structures.append(t[i].to_conll(3))
"""
Step 2: Convert to Dependency Graphs and Append them to a list of Dependency Graphs
"""
dependency_trees = []
for i in range(0, len(dependency_structures)):
    dependency_trees.append(DependencyGraph(dependency_structures[i]))
"""
Step 3: Train Probabilistic Dependency Parsing
"""
pbDp = ProbabilisticProjectiveDependencyParser()
pbDp.train(dependency_trees)
"""
Step 4: Return Dependency Graphs
"""

output_graphs = pbDp.parse([
        'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
        'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29',
        '.'
Example 28
@author: dimitri
"""
import nltk

nltk.download('dependency_treebank')
from nltk.corpus import dependency_treebank
from nltk.parse import DependencyEvaluator

#TESTING OF EXERCISE 2 CODE

#COMPARISON PARSER WITH NEW FEATURE WITH STANDARD ONE
from exercise2_parser_1 import *

tp = TransitionParser('arc-standard')
tp.train(dependency_treebank.parsed_sents()[:200], 'tp.model')
parses = tp.parse(dependency_treebank.parsed_sents()[-50:], 'tp.model')
de = DependencyEvaluator(parses, dependency_treebank.parsed_sents()[-50:])
las, uas = de.eval()
# print las and uas
print("result of the parser with new feature \n")
print(las)
print(uas)
from nltk.parse.transitionparser import TransitionParser

tp = TransitionParser('arc-standard')
tp.train(dependency_treebank.parsed_sents()[:200], 'tp.model')
parses = tp.parse(dependency_treebank.parsed_sents()[-50:], 'tp.model')
de = DependencyEvaluator(parses, dependency_treebank.parsed_sents()[-50:])
las, uas = de.eval()
# print las and uas
Example 29
from nltk.corpus import dependency_treebank
import random
import mst
from sparse_vector import sparse_vector

# read parsed data ('gold' parsed sentences) and add 'ROOT' node with 'ROOT' tag
parsed_sents = dependency_treebank.parsed_sents()
for sent in parsed_sents:
    sent.nodes[0].update({'word': 'ROOT', 'tag': 'ROOT', 'ctag': 'ROOT'})
# read tagged data and add the word 'ROOT'
tagged_sents_orig = dependency_treebank.tagged_sents()
tagged_sents = []
for sent in tagged_sents_orig:
    tagged_sents.append([('ROOT', 'ROOT')] + sent)

# split train and test, from the parsed and from the tagged-only sentences
train_tagged = tagged_sents[:int(len(parsed_sents) * 0.9)]
train_parsed = parsed_sents[:int(len(parsed_sents) * 0.9)]
test_parsed = parsed_sents[int(len(parsed_sents) * 0.9):]
test_tagged = tagged_sents[int(len(tagged_sents) * 0.9):]


# create set of all possible tags and words.
def get_all_possible_tags_and_words(data_set):
    all_words = set()
    all_tags = set()
    for sen in data_set:
        for tagged_word in sen:
            all_words.add(tagged_word[0])
            all_tags.add(tagged_word[1])
    all_tags.add('ROOT')
Example 30
def main():

    ####################################################################################################################
    # Get the data
    ####################################################################################################################
    print("Getting data")
    train_set = []
    test_set = []
    try:
        # nltk.download()
        parsed_sents = dependency_treebank.parsed_sents()  # Load all the parsed sentences
        train_set = parsed_sents[:(int(len(parsed_sents) * 0.3))]
        test_set = parsed_sents[(int(len(parsed_sents) * 0.95)):]
    except:
        print("couldn't get the data")
        exit(1)

    print("Finished Getting data")


    ####################################################################################################################
    #  Calculate the error rate using the feature vector
    ####################################################################################################################
    print("Starting perceptron algorithm ")
    weight_vector_w = perceptron_algorithm(train_set, with_distance=False)
    print("finish to calculate the weight vector")
    error_count = []
    counter = 0
    test_set_size = str(len(test_set))
    for sentence in test_set:
        print("sen num "+str(counter) + " from : " + test_set_size)

        counter += 1
        arcs_vector = get_arcs_vector(sentence, weight_vector_w, with_distance=False)
        mst = Chu_Liu_Edmonds_algorithm.min_spanning_arborescence(arcs_vector, SINK)
        arcs_dict = get_arcs_from_sentence(sentence)
        current_error_rate = get_error_gold_vs_result(arcs_dict, mst) / len(sentence.nodes)
        error_count.append(current_error_rate)
    avg = np.average(error_count)
    print("the avg without distance is: " + str(avg), file=open("output.txt", "a"))

    f = open('result.txt', 'w+')
    f.write(json.dumps(weight_vector_w))
    weight_vector_w = None
    ####################################################################################################################
    #  Calculate the error rate using the feature and distance vectors
    ####################################################################################################################
    print("Starting perceptron algorithm ")
    weight_vector_w = perceptron_algorithm(train_set, with_distance=True)
    print("finish to calculate the weight vector")
    error_count = []
    counter = 0
    for sentence in test_set:
        print("sen num "+str(counter) + " from : " + test_set_size)
        counter += 1
        arcs_vector = get_arcs_vector(sentence, weight_vector_w, with_distance=True)
        mst = Chu_Liu_Edmonds_algorithm.min_spanning_arborescence(arcs_vector, SINK)
        arcs_dict = get_arcs_from_sentence(sentence)
        current_error_rate = get_error_gold_vs_result(arcs_dict, mst) / len(sentence.nodes)
        error_count.append(current_error_rate)
    avg = np.average(error_count)
    print("the avg with distance is: " + str(avg), file=open("output.txt", "a"))
    try:
        f = open('result2.txt', 'w+')
        f.write(json.dumps(weight_vector_w))
    # the code in this try block has not been tested thoroughly
    except:
        print("couldn't save the vector")