def crossValidation(numIter, testProp):
	"""
	Input: The number of iterations to run and the proportion of sentences to hold out as the test set.
	Output: The average proportion of correctly parsed sentences
	"""
	numTest = int(testProp * 1000)
	numList = list(range(1000))
	counter = [0]*numIter

	# repeat numIter times
	for i in range(numIter):
		print("Starting iteration: ", i)

	# Shuffle the list of indices of the dependency treebank sentences and use the first numTest
	# indices as the test sentences.
		random.shuffle(numList)
		tests = numList[:numTest]
		grams = numList[numTest:]
		grammar = dependencyGraphsToGrammar([dependency_treebank.parsed_sents()[sent] for sent in grams])
		parser = malt_parser.maltparser(grammar)

		# try every test sentence. See how the parser does
		for test in tests:
			testSent = parser.parse(dependency_treebank.sents()[test])
			refSent = dependency_treebank.parsed_sents()[test].tree()
			if compareDependencyTrees(testSent,refSent):
				counter[i] += 1

	# return the average score of all of the iterations.
	return sum(counter)/(numTest*numIter)
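
A hedged, minimal sketch of the compareDependencyTrees helper that the example above assumes (hypothetical; it simply counts a sentence as correct when the candidate parse matches the reference tree exactly):

def compareDependencyTrees(candidate, reference):
	# Hypothetical helper, not part of the original example: report an exact
	# match between the candidate parse tree and the reference tree.
	return candidate == reference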
Example #2
def main():    
    '''
    Read and print sentence
    '''
    parsed = dp.parsed_sents(FILE)[0]
    tree = parsed.tree()
    tree.pprint()
Example #3
 def __init__(self, split_ratios=(0.9, 0.1, 0.0)):
     """
     Initialize the train, validation and test set
     :param split_ratios: (train, test, validation)
     """
     self.split_ratios = split_ratios
     self.sentences = dependency_treebank.parsed_sents()
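
The constructor above only stores the ratios; a hypothetical continuation that actually slices the parsed sentences (assuming the (train, test, validation) ordering stated in the docstring) might look like this:

 def split(self):
     # Hypothetical sketch, not part of the original class: slice the parsed
     # sentences according to split_ratios, assumed to be ordered
     # (train, test, validation) as in the docstring above.
     n = len(self.sentences)
     n_train = int(n * self.split_ratios[0])
     n_test = int(n * self.split_ratios[1])
     train = self.sentences[:n_train]
     test = self.sentences[n_train:n_train + n_test]
     validation = self.sentences[n_train + n_test:]
     return train, validation, test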
Example #4
def test_model(folders, vectorizer, model):
    """
    Takes a list of folders from which to draw data files to test the model.
    Parses sentences by iteratively looking at target nodes in the remaining 
    subtrees of the sentence. For each pair, the algorithm derives a list of 
    features, and uses the model to predict a construction action. The function
    then analyzes the performance of the model and prints out results.
    ========== INCOMPLETE ==========
    """
    for filepath in data_file_paths_for_folders(folders):
        for sentence in dt.parsed_sents(filepath):
            T = flattened_node_list(sentence)
            i = 0
            no_construction = True
            while len(T) >= 1:
                if i == len(T) - 1:
                    if no_construction:
                        break
                    no_construction = True
                    i = 0
                else:
                    target_features = get_contextual_features(T, i)
                    target_classification = estimate_classification(target_features, vectorizer, model)
                    construction(T, i, target_classification)
                    if target_classification != SHIFT:
                        no_construction = False
                i += 1
 def fromRaw():
     for sent in dependency_treebank.parsed_sents():
         
         g = prune(graph(sent))
         
         if g.number_of_nodes()-1 <= 10:
             yield (g,sent)
Example #6
 def __init__(self, num_iterations):
     self.num_iterations = num_iterations
     corpus = dependency_treebank.parsed_sents()
     sep = int(len(corpus) * (9 / 10))
     self.train_set, self.test_set = corpus[:sep], corpus[sep:]
     self.word_vec, self.tag_vec = self.word_tag_vectors_init()
     self.num_words, self.num_tags = len(self.word_vec), len(self.tag_vec)
     self.cur_weights = Counter()
     self.aggregate_weights = Counter()  # sum of weights
Example #7
def sample_dep_parse():
    from nltk.corpus import dependency_treebank
    p = dependency_treebank.parsed_sents()[0]
    
    ''' with 'rel' from Stanford dependencies:
    [{'address': 0, 'deps': [8], 'rel': 'TOP',           'tag': 'TOP', 'word': None},
     {'address': 1, 'deps': [],  'rel': 'nn', 'head': 2, 'tag': 'NNP', 'word': Pierre},
      ...]
    '''

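The node dictionaries shown above can also be inspected programmatically through the graph's nodes attribute; a small illustrative sketch (the helper name is hypothetical):

def print_dep_nodes(graph):
    # Illustrative sketch: walk the nodes dictionary of an NLTK DependencyGraph
    # and print each token with its tag and the address of its head.
    for address in sorted(graph.nodes):
        node = graph.nodes[address]
        print(address, node.get('word'), node.get('tag'), node.get('head'))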
Example #8
def test():
    '''
    test model
    '''
    # load trained model and features
    with open('train.p', 'rb') as file:
        trained_model = pickle.load(file)
    with open('features.p', 'rb') as file:
        features = pickle.load(file)
    # load test data
    truelist = dp.parsed_sents(testfile)
    predictedlist = []
    # main loop
    print "Running main loop..."
    k = 1
    for true in truelist:
        k += 1
        print str(k) + " of " + str(len(truelist)) + " sentences"
        # erase tree structure
        Z = nodes(true)
        T = erase_structure(true)
        # initialize
        i = 1
        X = []
        j = 1
        Y = ""
        no_construction = False
        # build dependency structure
        print
        while len(Z) > 2 and j < 200:  # the 200 is used to prevent infinite looping for now
            j += 1
            if i == 1:
                if no_construction == True:
                    break
                no_construction = True
            x = get_contextual_features(T, Z, i, 2, 2)
            y = estimate_action(trained_model, features, x)
            Y += y
            (T, Z, i) = construction(T, Z, i, y)
            if y != 'S':
                no_construction = False
        predictedlist.append(T)
    # measure performance
    print "Calculating performance..."
    (dep_acc, root_acc, comp_acc) = accuracy(truelist, predictedlist)
    # print output
    print "Dependency accuracy: " + str(dep_acc)
    print "Root accuracy: " + str(root_acc)
    print "Complete accuracy: " + str(comp_acc)
Example #9
def import_data():

    #iterate through directory to get all the data
    '''for subdir, dirs, files in os.walk('./dep_treebank'):
        for row in files:
            for f in row:
                print f'''

    #example of what to do for each file
    path = "../../../../../Users/lurke/Documents/Harvard/Senior/CS187/final/dep_treebank"
    f = path + "/00/wsj_0001.mrg"
    t = dt.parsed_sents(f)[0]
    print t
Example #12
def main():
    '''main function. either trains the model or tests it on a dataset'''
    if currently_training:
        sents = sum([dp.parsed_sents(testfile) for testfile in testfiles], [])
        train = Train()
        p = Parser(train, lcontext, rcontext)
        trees = do_parse(p, sents)

        models = gen_svc(train)
        pkl = open('models.pkl','wb')
        pickle.dump(models, pkl)
        pkl.close()

    else:
        models = pickle.load(open('models.pkl','rb'))
        predict = Predict(models)
        p = Parser(predict, lcontext, rcontext)
        testfiles2_dir = INPATH + '23/'
        testfiles2 = [testfiles2_dir + file for file in os.listdir(testfiles2_dir)]
        sents = sum([dp.parsed_sents(testfile) for testfile in testfiles2], [])
        trees_predict = do_parse(p, sents)

        print 'ACCURACIES:'
        print accuracy(sents, trees_predict)
Example #13
def main():
    trees = dependency_treebank.parsed_sents()
    sents = list(dependency_treebank.sents())

    # for sent in sents_train:
    #     print(sent)
    #
    # TESTER LINES
    sents = sents
    trees = trees
    #############
    split_index = int(len(sents) * 0.9)
    trees_train = trees[:split_index]
    trees_test = trees[split_index:]
    sents_train = sents[:split_index]
    sents_test = sents[split_index:]

    corpus, POS_tags = get_corpus_and_tags_from_trees(trees)
    word_matrix_size, word_map = create_matrix(corpus)
    tag_matrix_size, tag_map = create_matrix(POS_tags)
    f = 0

    t_list_feat_dict = create_list_of_feature_dic(
        sents_test,
        word_matrix_size,
        word_map,
        tag_matrix_size,
        tag_map,
        pickles=True,
        pkl_path="list_feat_dict_test.pkl")
    # Q3
    w = perceptron(trees_train, sents_train, word_matrix_size, word_map,
                   tag_matrix_size, tag_map)
    print(np.sum(w))
    # Q4
    # w4 = perceptron(trees_train, sents_train, word_matrix_size, word_map, tag_matrix_size, tag_map, augmented=True)

    # Evaluate:
    score = evaluate(theta=w,
                     tree_test=trees_test,
                     sent_test=sents_test,
                     t_list_feat_dict=t_list_feat_dict)
    print(score)
Example #14
def main():
    sents = sum([dp.parsed_sents(testfile) for testfile in testfiles], [])
    train = Train()
    p = Parser(train)
    trees = do_parse(p, sents)

    vec, svc = gen_svc(train)
    predict = Predict(vec, svc)
    p = Parser(predict)
    trees_predict = do_parse(p, sents)

    for train,predict,actual in zip(trees, trees_predict, sents):
        if train != predict:
            print train
            print predict

        if train != actual.tree():
#            print train
#            print actual.tree()
            pass
Example #16
def train():
    '''
    train model
    '''
    Y = []
    X = []
    print "Main loop"
    for section in SECTIONS:
        print "section " + str(section)
        inpath = INPATH + "/" + section + "/"
        for file in os.listdir(inpath):
            print "file " + str(file)
            file = inpath + file
            parsed_file = dp.parsed_sents(file)
            nsents = 1
            for sentence in parsed_file:
                nsents += 1
                Z = nodes(sentence)
                T = erase_structure(sentence)
                i = 1
                j = 1
                while len(Z) > 2 and j < 200:
                    j += 1
                    y = get_action(sentence, T, Z, i)
                    x = get_contextual_features(T, Z, i, 2, 2)
                    X.append(x)
                    Y.append(y)
                    (T, Z, i) = construction(T, Z, i, y)
    vec = DictVectorizer()
    print "Converting features to sparse matrix..."
    xx = vec.fit_transform(X)
    features = vec.get_feature_names()
    trained_model = LinearSVC()
    print "SVM learning..."
    trained_model.fit(xx, Y)
    print "Saving..."
    with open('train.p', 'wb') as f:
        pickle.dump(trained_model, f)
    with open('features.p', 'wb') as f:
        pickle.dump(features, f)
    print "Training completed"
def read_dependency_treebank():
    """Read the dependency treebank included in NLTK, and convert it to a
    simplified format.
    
    A sentence x is represented as a list of word/part-of-speech pairs, the
    first of which is a special dummy token '<TOP>' which will be the root of
    every parse tree. A parse tree y is represented as a list of integers corresponding to the
    positions of the heads of the respective tokens; the first integer in this
    list is always -1, meaning that the dummy token has no head.

    For instance, if we have the sentence "John sleeps.", it will be represented
    as the list 

        [('<TOP>', '<TOP>'), ('John', 'NNP'), ('sleeps', 'VBZ'), ('.', '.')]

    and its parse tree will be

        [-1, 2, 0, 2]

    """
    XY = (convert_dependency_tree(t) for t in dependency_treebank.parsed_sents())
    X, Y = (list(t) for t in zip(*XY))
    return X, Y
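
The convert_dependency_tree helper used above is not shown; a minimal, hypothetical sketch based only on the format described in the docstring (and on the standard nodes dictionary of NLTK's DependencyGraph) could be:

def convert_dependency_tree(graph):
    # Hypothetical sketch: map an NLTK DependencyGraph to the simplified
    # (sentence, heads) representation described above. Position 0 is the
    # dummy <TOP> token with head -1; every other head index is taken
    # straight from the graph.
    x = [('<TOP>', '<TOP>')]
    y = [-1]
    for address in range(1, len(graph.nodes)):
        node = graph.nodes[address]
        x.append((node['word'], node['tag']))
        y.append(node['head'])
    return x, y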
Example #19
def train_model(folders):
    """
    Takes a list of folders from which to draw data files to train the model.
    Parses sentences in a similar way to when testing, by iteratively looking at
    target nodes in the remaining subtrees of the sentence. For each pair, the
    algorithm derives a list of features and a correct construction action. Once
    these are all found, it uses them to generate a model, which is returned.
    ========== INCOMPLETE ==========
    """
    raw_features = []
    classifications = []
    for filepath in data_file_paths_for_folders(folders):
        for sentence in dt.parsed_sents(filepath):
            T = flattened_node_list(sentence)
            i = 0
            no_construction = True
            while len(T) >= 1:
                if i == len(T) - 1:
                    if no_construction:
                        break
                    no_construction = True
                    i = 0
                else:
                    target_features = get_contextual_features(T, i)
                    target_classification = get_classification(T, i, sentence)
                    raw_features.append(target_features)
                    classifications.append(target_classification)
                    construction(T, i, target_classification)
                    if target_classification != SHIFT:
                        no_construction = False
                i += 1
    vectorizer = DictVectorizer()
    feature_matrix = vectorizer.fit_transform(raw_features)
    feature_names = vectorizer.get_feature_names()
    model = multiclass.OneVsOneClassifier(svm.LinearSVC())
    model.fit(feature_matrix, classifications)
    return vectorizer, model
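
Taken together with test_model earlier in this listing, a hypothetical driver could train on some folders and evaluate on others (the folder names below are placeholders):

vectorizer, model = train_model(['02', '03', '04'])  # placeholder training folders
test_model(['23'], vectorizer, model)                # placeholder test folder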
import ex3
from nltk.corpus import dependency_treebank
from scipy.sparse import dok_matrix, csr_matrix
import numpy as np
import Node, Edge

corpus_sentences = dependency_treebank.parsed_sents()

training_size = round(len(corpus_sentences) * 0.9)
training_set = corpus_sentences[:training_size]
test_set = corpus_sentences[training_size:]


def test_b():
    tree = training_set[0]
    sentence = ex3.create_sentence(tree)
    node1 = tree.nodes[1]
    node2 = tree.nodes[2]
    f = ex3.feature_function(node1, node2, sentence)
    print(f)


def test_b_and_e():
    edges_set = ex3.calc_right_tree(training_set[0])
    sentence = ex3.create_sentence(training_set[0])
    print(sentence)
    # for edge in edges_set:
    #     print(edge.out_node.word, edge.in_node.word)
    #     print(ex3.feature_function(edge.out_node, edge.in_node, sentence))

    sentence = []
 def __init__(self, feature_function):
     self.sents = dependency_treebank.parsed_sents()
     threshold = int(len(self.sents) * 0.9)
     self.train = self.sents[:threshold]
     self.test = self.sents[threshold:]
     self.feature = feature_function
Example #23
                                             arc_score_func)
        learnt_arcs_dict = min_spanning_arborescence_nx(arc_lst, 0)
        set_of_arcs = set([(arc[0], arc[2])
                           for arc in learnt_arcs_dict.values()])
        for tail in graph.nodes.values():
            for head_idx in tail['deps']['']:
                if (tail['address'], head_idx) in set_of_arcs:
                    sum_per_sentence += 1
        sum += sum_per_sentence / (len(graph.nodes) - 1)
    return sum / len(test_set)


if __name__ == '__main__':
    nltk.download('dependency_treebank')
    train_examples, test_examples = \
        train_test_split(dependency_treebank.parsed_sents(), train_size=0.9)

    vocabulary, tags = get_vocabulary_and_tags(train_examples)
    word_to_idx = {word: i for i, word in enumerate(vocabulary)}
    tag_to_idx = {tag: i for i, tag in enumerate(tags)}

    w1 = perceptron_train(2, 1, train_examples,
                          len(vocabulary)**2 + len(tags)**2, word_to_idx,
                          tag_to_idx, get_graph_feature_vector, arc_score)

    # with open('w1.pickle', 'wb') as f:
    #     pickle.dump(w1, f)
    # pickle_in = open("w1.pickle", "rb")
    # w1 = pickle.load(pickle_in)

    res = evaluate(w1, test_examples, word_to_idx, tag_to_idx, arc_score)
            )

            model.fit(x_train, y_train)
            # Save the model to file name (as pickle)
            pickle.dump(model, open(modelfile, "wb"))"""
        finally:
            remove(input_file.name)


# Evaluate the features by comparing performance to the original
print()
print()
print("Optional and advanced part")
print()
transition_parser = TransitionParser("arc-standard")
transition_parser.train(dependency_treebank.parsed_sents()[:100],
                        "transition_parser.model")
parses = transition_parser.parse(dependency_treebank.parsed_sents()[-10:],
                                 "transition_parser.model")
print(len(parses))
dependency_evaluator = DependencyEvaluator(
    parses,
    dependency_treebank.parsed_sents()[-10:])
standard_parser_evaluation = dependency_evaluator.eval()
print(
    f"The scores of the standard TransitionParser are: {standard_parser_evaluation}"
)
print()

my_transition_parser = ModifyiedTransitionParser("arc-standard")
my_transition_parser.train(dependency_treebank.parsed_sents()[:100],
Example #25
    """
    Return a custom score function that obeys the BigramScoreFunction interface.
    It can use information stored in the local file sf.dat
    """
    # Complete this if you want extra credit
    return None

if __name__ == "__main__":

    # Load the score function
    sf = BigramInterpScoreFunction("tb_counts.words", "tb_counts.tag")

    total_right = 0
    total_edges = 0

    for ss in dt.parsed_sents():
        words = list(dependency_element(ss, 'word'))
        tags = list(dependency_element(ss, 'tag'))

        chart = EisnerParser(words, tags, sf)

        chart.initialize_chart()
        chart.fill_chart()

#        for ii, jj in correct_positions(ss):
        for ii, jj in chart.reconstruct():
            # Subtract 1 to account for the head we added
            print(ii, jj, words[ii - 1], words[jj - 1],
                  tags[ii - 1], tags[jj - 1],
                  sf(words[ii - 1], words[jj - 1],
                     tags[ii - 1], tags[jj - 1], ii, jj))
Example #27
"""
Step 1: Gather Dependencies from Penn Treebank Sample and append them to a list of Dependency Structures
"""
from nltk.corpus import dependency_treebank
from nltk.parse import DependencyGraph, ProbabilisticProjectiveDependencyParser

t = dependency_treebank.parsed_sents()
dependency_structures = []
for i in range(0, len(t)):
    dependency_structures.append(t[i].to_conll(3))
"""
Step 2: Convert to Dependency Graphs and Append them to a list of Dependency Graphs
"""
dependency_trees = []
for i in range(0, len(dependency_structures)):
    dependency_trees.append(DependencyGraph(dependency_structures[i]))
"""
Step 3: Train Probabilistic Dependency Parsing
"""
pbDp = ProbabilisticProjectiveDependencyParser()
ProbabilisticProjectiveDependencyParser.train(pbDp, dependency_trees)
"""
Step 4: Return Dependency Graphs
"""

output_graphs = ProbabilisticProjectiveDependencyParser.parse(
    pbDp, [
        'Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join',
        'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29',
        '.'
Example #28
@author: dimitri
"""
import nltk

nltk.download('dependency_treebank')
from nltk.corpus import dependency_treebank
from nltk.parse import DependencyEvaluator

#TESTING OF EXERCISE 2 CODE

#COMPARISON OF THE PARSER WITH THE NEW FEATURE AGAINST THE STANDARD ONE
from exercise2_parser_1 import *

tp = TransitionParser('arc-standard')
tp.train(dependency_treebank.parsed_sents()[:200], 'tp.model')
parses = tp.parse(dependency_treebank.parsed_sents()[-50:], 'tp.model')
de = DependencyEvaluator(parses, dependency_treebank.parsed_sents()[-50:])
las, uas = de.eval()
# print las and uas
print("result of the parser with new feature \n")
print(las)
print(uas)
from nltk.parse.transitionparser import TransitionParser

tp = TransitionParser('arc-standard')
tp.train(dependency_treebank.parsed_sents()[:200], 'tp.model')
parses = tp.parse(dependency_treebank.parsed_sents()[-50:], 'tp.model')
de = DependencyEvaluator(parses, dependency_treebank.parsed_sents()[-50:])
las, uas = de.eval()
# print las and uas
Example #29
from nltk.corpus import dependency_treebank
import random
import mst
from sparse_vector import sparse_vector

# read parsed data ('gold' parsed sentences) and add 'ROOT' node with 'ROOT' tag
parsed_sents = dependency_treebank.parsed_sents()
for sent in parsed_sents:
    sent.nodes[0].update({'word': 'ROOT', 'tag': 'ROOT', 'ctag': 'ROOT'})
# read taged data and add the word 'ROOT'
tagged_sents_orig = dependency_treebank.tagged_sents()
tagged_sents = []
for sent in tagged_sents_orig:
    tagged_sents.append([('ROOT', 'ROOT')] + sent)

# split into train and test, from both the parsed and the tagged-only sentences
train_tagged = tagged_sents[:int(len(parsed_sents) * 0.9)]
train_parsed = parsed_sents[:int(len(parsed_sents) * 0.9)]
test_parsed = parsed_sents[int(len(parsed_sents) * 0.9):]
test_tagged = tagged_sents[int(len(tagged_sents) * 0.9):]


# create set of all possible tags and words.
def get_all_possible_tags_and_words(data_set):
    all_words = set()
    all_tags = set()
    for sen in data_set:
        for tagged_word in sen:
            all_words.add(tagged_word[0])
            all_tags.add(tagged_word[1])
    all_tags.add('ROOT')
Example #30
def main():

    ####################################################################################################################
    # Get the data
    ####################################################################################################################
    print("Getting data")
    train_set = []
    test_set = []
    try:
        # nltk.download()
        parsed_sents = dependency_treebank.parsed_sents()  # Load all the parsed sentences
        train_set = parsed_sents[:(int(len(parsed_sents) * 0.3))]
        test_set = parsed_sents[(int(len(parsed_sents) * 0.95)):]
    except:
        print("couldn't get the data")
        exit(1)

    print("Finished Getting data")


    ####################################################################################################################
    #  Calculate the error rate using the feature vector
    ####################################################################################################################
    print("Starting perceptron algorithm ")
    weight_vector_w = perceptron_algorithm(train_set, with_distance=False)
    print("finish to calculate the weight vector")
    error_count = []
    counter = 0
    test_set_size = str(len(test_set))
    for sentence in test_set:
        print("sen num "+str(counter) + " from : " + test_set_size)

        counter += 1
        arcs_vector = get_arcs_vector(sentence, weight_vector_w, with_distance=False)
        mst = Chu_Liu_Edmonds_algorithm.min_spanning_arborescence(arcs_vector, SINK)
        arcs_dict = get_arcs_from_sentence(sentence)
        current_error_rate = get_error_gold_vs_result(arcs_dict, mst) / len(sentence.nodes)
        error_count.append(current_error_rate)
    avg = np.average(error_count)
    print("the avg without distance is: " + str(avg), file=open("output.txt", "a"))

    f = open('result.txt', 'w+')
    f.write(json.dumps(weight_vector_w))
    weight_vector_w = None
    ####################################################################################################################
    #  Calculate the error rate using the feature and distance vectors
    ####################################################################################################################
    print("Starting perceptron algorithm ")
    weight_vector_w = perceptron_algorithm(train_set, with_distance=True)
    print("finish to calculate the weight vector")
    error_count = []
    counter = 0
    for sentence in test_set:
        print("sen num "+str(counter) + " from : " + test_set_size)
        counter += 1
        arcs_vector = get_arcs_vector(sentence, weight_vector_w, with_distance=True)
        mst = Chu_Liu_Edmonds_algorithm.min_spanning_arborescence(arcs_vector, SINK)
        arcs_dict = get_arcs_from_sentence(sentence)
        current_error_rate = get_error_gold_vs_result(arcs_dict, mst) / len(sentence.nodes)
        error_count.append(current_error_rate)
    avg = np.average(error_count)
    print("the avg with distance is: " + str(avg), file=open("output.txt", "a"))
    try:
        f = open('result2.txt', 'w+')
        f.write(json.dumps(weight_vector_w))
    # the code in the try block has not been thoroughly tested
    except:
        print("couldn't save the vector")