Beispiel #1
0
def extract_features(formatted_corpus, feature_names, training=True, model=None):
    """Collect feature rows and transition labels for every sentence.

    For each sentence the parser state (stack, queue, graph) is initialised
    and then driven by ``reference`` until the queue is empty.  Before each
    step, ``features.extract`` turns the current state into one feature row;
    the transition returned by ``reference`` is recorded as that row's label.

    Side effect: after a sentence has been processed, the ``'head'`` entry of
    each of its words is overwritten with the head stored in the graph that
    was built, so the corpus passed in is modified in place.

    Args:
        formatted_corpus: iterable of sentences, each an iterable of word
            dicts that have at least the keys ``'id'`` and ``'head'``.
            ``len()`` is called on it for the progress message, which is
            printed every 1000 sentences.
        feature_names: forwarded unchanged to ``features.extract``.
        training: accepted but not used by this function.
        model: accepted but not used by this function.

    Returns:
        A tuple ``(rows, labels)`` of two parallel lists: the feature rows
        and the transitions chosen for them.
    """
    # Sentences whose rebuilt graph differs from the annotated one
    # (presumably the non-projective ones).  Collected but never read here.
    mismatched = []
    rows = []
    labels = []

    for number, sentence in enumerate(formatted_corpus, start=1):
        if number % 1000 == 0:
            print(number, 'sentences on', len(formatted_corpus), flush=True)

        # Initial parser state: empty stack, all words queued, and a graph
        # that only knows the artificial root node '0'.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        sentence_rows = []
        sentence_labels = []
        while queue:
            # The features describe the state *before* the transition.
            sentence_rows.append(
                features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, action = reference(stack, queue, graph)
            sentence_labels.append(action)
        stack, graph = transition.empty_stack(stack, graph)

        rows += sentence_rows
        labels += sentence_labels

        if not transition.equal_graphs(sentence, graph):
            mismatched.append(sentence)

        # Poor man's projectivization: adopt the heads of the graph that was
        # actually built so the sentence ends up with a well-formed graph.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return rows, labels
Beispiel #2
0
def extract_features(formatted_corpus, feature_names):
    """Collect feature rows and transition labels, keeping matching sentences only.

    Each sentence is processed by repeatedly calling ``reference`` until the
    queue is empty.  Before every step ``extract`` produces one feature row
    from the current state, and the transition returned by ``reference``
    becomes that row's label.  A sentence contributes its rows and labels
    only when ``transition.equal_graphs`` reports that the graph built this
    way equals the sentence's own annotation; other sentences are dropped.

    Args:
        formatted_corpus: iterable of sentences (each an iterable of words).
        feature_names: forwarded unchanged to ``extract``.

    Returns:
        A tuple ``(rows, labels)`` of two parallel lists.
    """
    rows = []
    labels = []
    for sentence in formatted_corpus:
        # Initial parser state: empty stack, all words queued, and a graph
        # that only knows the artificial root node '0'.
        stack, queue = [], list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        sentence_rows, sentence_labels = [], []
        while queue:
            # The features describe the state *before* the transition.
            sentence_rows.append(
                extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, action = reference(stack, queue, graph)
            sentence_labels.append(action)
        stack, graph = transition.empty_stack(stack, graph)

        # Skip sentences whose rebuilt graph does not match the annotation.
        if not transition.equal_graphs(sentence, graph):
            continue
        rows.extend(sentence_rows)
        labels.extend(sentence_labels)

    return rows, labels
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
    column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        while queue:
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        print(transitions)
        print(graph)
Beispiel #4
0
        graph['deprels'] = {
        }  # Define another dictionary with dependency relations stored inside, again inside graph
        graph['deprels'][
            '0'] = 'ROOT'  # Make the first element in the dependency relations dictionary the 'ROOT' keyword
        transitions = [
        ]  # List of the transitions that each sentence will have to create its dependency tree
        while queue:  # While you still have things in your input token queue
            stack, queue, graph, trans = reference(stack, queue, graph)
            # Stack, queue, and graph may have been modified
            # trans holds a string for the kind of transition that should be made (shift, reduce, left-arc, right-arc)
            transitions.append(
                trans
            )  # Append the transition string to the list of transitions for this sentence
        stack, graph = transition.empty_stack(stack, graph)
        # print('Equal graphs:', transition.equal_graphs(sentence, graph))
        if not transition.equal_graphs(sentence, graph):
            # print('NOT EQUAL GRAPHS')
            nonprojective_sentences.append(sentence)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            # "Replace" the head value on the word to the head value that we calculated with the dependency graph
            word['head'] = graph['heads'][word['id']]

        # print(transitions)  # We print out the transitions that we make
        # print(graph)  # And we print out the graph of arc heads and the dependency relation between them
    # End of the for sentence in formatted_corpus loop
    print('end?')
    shortest_nonprojective_sentence = min(nonprojective_sentences, key=len)
    for word in shortest_nonprojective_sentence:
        print(word)
Beispiel #5
0
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
    column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        while queue:
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        print(transitions)
        print(graph)