def extract_features_sent(sentence, feature_names, classifier, dict_classes,
                          vec):
    """Parse one sentence with a trained classifier and annotate its words.

    At every step a six-value feature row is built from the first stack
    item, the first queue item and two parser-state checks, the classifier
    predicts a class number, and the matching transition is applied through
    ``parse_ml`` until the queue is empty.

    Args:
        sentence: Sequence of word dicts; each must provide the keys
            ``'id'``, ``'cpostag'`` and ``'form'``.
        feature_names: Names paired (in order) with the six feature values:
            stack[0] cpostag, stack[0] form, queue[0] cpostag,
            queue[0] form, ``can_reduce``, ``can_leftarc``.
        classifier: Object whose ``predict`` returns an indexable result;
            element 0 is used as the class number.
        dict_classes: Mapping from class number to the transition handed
            to ``parse_ml``.
        vec: Object whose ``transform`` turns the feature dict into the
            classifier input.

    Returns:
        The graph dict with its ``'heads'`` and ``'deprels'`` mappings.

    Side effects:
        Sets ``'head'`` and ``'deprel'`` on every word of ``sentence`` and
        prints each predicted class number.
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    while queue:
        if stack:
            row = [stack[0]['cpostag'], stack[0]['form']]
        else:
            row = ['nil', 'nil']
        # The loop condition guarantees the queue is non-empty here, so no
        # 'nil' padding is ever needed for the queue features.
        row.append(queue[0]['cpostag'])
        row.append(queue[0]['form'])
        row.append(transition.can_reduce(stack, graph))
        row.append(transition.can_leftarc(stack, graph))
        feature_dict = dict(zip(feature_names, row))

        trans_nr = classifier.predict(vec.transform(feature_dict))
        print(trans_nr[0])
        trans = dict_classes[trans_nr[0]]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)

    # The return value is discarded, so the lookups below rely on
    # empty_stack updating `graph` in place.
    # NOTE(review): confirm against transition.empty_stack.
    transition.empty_stack(stack, graph)
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return graph
Beispiel #2
0
def extract_all_features(formatted_corpus):
    """Collect feature rows and transitions for every sentence of a corpus.

    For each sentence the parser state is reset, then, while the queue is
    not empty, one feature row is extracted with ``features.extract`` (using
    the module-level ``FEATURE_NAMES``) and one transition is obtained from
    ``reference``.

    Args:
        formatted_corpus: Iterable of sentences; each sentence is an
            iterable of word dicts that provide at least the key ``'id'``.

    Returns:
        A tuple ``(X_dict, y_symbols)``: the list of feature rows and the
        list of transitions, aligned by position.

    Side effects:
        Overwrites ``word['head']`` of every word with the head recorded in
        the graph built for its sentence.
    """
    y_symbols = []  # Transitions, one per feature row
    X_dict = []  # Feature rows

    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            # The row must describe the state *before* the transition is
            # applied, so extract first and only then advance the parser.
            X_dict.append(
                features.extract(stack, queue, graph, FEATURE_NAMES, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            y_symbols.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return X_dict, y_symbols
Beispiel #3
0
def predict_sentence(test_file='swedish_talbanken05_test_blind.conll',
                     output_file='test'):
    """Parse every sentence of a test corpus and save the annotated result.

    The corpus is read with ``conll.read_sentences`` / ``conll.split_rows``
    using the six-column test layout. Each sentence is parsed step by step:
    features come from ``features.extract_3``, are encoded with the
    module-level ``vec``, classified by the module-level ``model`` and
    decoded with the module-level ``label`` before being applied through
    ``parse_ml``.

    Args:
        test_file: Path handed to ``conll.read_sentences``. Defaults to the
            file name that used to be hard-coded.
        output_file: First argument handed to ``conll.save``. Defaults to
            the previously hard-coded ``'test'``.

    Side effects:
        Sets ``'head'`` and ``'deprel'`` on every word, prints every
        predicted transition, prints a progress line every 1000 sentences
        and writes the corpus through ``conll.save``.
    """
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra'
    ]

    sentences = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006_test)

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            feat = vec.transform(feat)
            trans_nr = model.predict(feat)
            trans = label.inverse_transform(trans_nr)
            print(trans)
            # NOTE(review): the original author marked this call with
            # "wrong graph" (Swedish: "fel Graph") -- verify the graph
            # produced by parse_ml.
            stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                  trans[0])

        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    # Saved with the ten-column layout because head/deprel were just added.
    conll.save(output_file, formatted_corpus, column_names_2006)
Beispiel #4
0
def train_model(train_file='swedish_talbanken05_train.conll'):
    """Build the training feature rows and transitions from a corpus.

    Despite its name, this function does not fit a model: it reads the
    corpus, walks every sentence with ``reference`` and records, for each
    step, the feature row from ``features.extract_3`` and the transition
    that ``reference`` returned.

    Args:
        train_file: Path handed to ``conll.read_sentences``. Defaults to
            the file name that used to be hard-coded.

    Returns:
        A tuple ``(x_vect, y_vect)``: feature rows and transitions, aligned
        by position.

    Side effects:
        Overwrites ``word['head']`` of every word with the head recorded in
        the graph built for its sentence and prints a progress line every
        1000 sentences.
    """
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra'
    ]

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    x_vect = []
    y_vect = []

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            # Extract before advancing so the row describes the state the
            # transition was chosen in.
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            stack, queue, graph, trans = reference(stack, queue, graph)
            x_vect.append(feat)
            y_vect.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return x_vect, y_vect
Beispiel #5
0
def extract_features(formatted_corpus, feature_names, training=True, model=None):
    """Return feature rows and transitions for a whole corpus.

    Each sentence is walked with ``reference`` until its queue is empty;
    before every step a feature row is taken with ``features.extract``.

    Args:
        formatted_corpus: Sized collection of sentences (``len`` is used for
            the progress line); each sentence is an iterable of word dicts
            with at least the key ``'id'``.
        feature_names: Passed through to ``features.extract``.
        training: Accepted but not read by this function.
        model: Accepted but not read by this function.

    Returns:
        A tuple ``(rows, transitions)`` of two position-aligned lists.

    Side effects:
        Overwrites ``word['head']`` of every word with the head stored in
        the graph built for its sentence; prints a progress line every
        1000 sentences.
    """
    all_rows, all_transitions = [], []
    # Sentences whose rebuilt graph differs from the annotated one. The list
    # is only collected here; it is neither returned nor printed.
    mismatched = []

    for index, sentence in enumerate(formatted_corpus, start=1):
        if index % 1000 == 0:
            print(index, 'sentences on', len(formatted_corpus), flush=True)

        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        stack, queue = [], list(sentence)
        sentence_rows, sentence_transitions = [], []

        while queue:
            sentence_rows.append(
                features.extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, trans = reference(stack, queue, graph)
            sentence_transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        all_rows += sentence_rows
        all_transitions += sentence_transitions

        if not transition.equal_graphs(sentence, graph):
            mismatched.append(sentence)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return (all_rows, all_transitions)
Beispiel #6
0
def calculateSomething(filen, model=None, dict_vect=None, label_enc=None):
    """Walk a CoNLL corpus and collect feature rows and transitions.

    When ``model``, ``dict_vect`` and ``label_enc`` are all given, every
    transition is predicted by the model and applied with ``parse_ml``.
    If any of them is ``None``, transitions come from ``reference`` instead.

    Args:
        filen: Path handed to ``conll.read_sentences``.
        model: Object whose ``predict`` classifies an encoded feature row.
        dict_vect: Object whose ``transform`` encodes a feature row.
        label_enc: Object whose ``inverse_transform`` decodes the predicted
            class numbers; element 0 of the result is the transition.

    Returns:
        A tuple ``(X_unEncoded, y_unEncoded)``: the unencoded feature rows
        and the transition returned for each row, aligned by position.

    Side effects:
        Overwrites ``word['head']`` of every word with the head stored in
        the parser state; in prediction mode prints each predicted
        transition.
    """
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
                         'head', 'deprel', 'phead', 'pdeprel']
    sentences = conll.read_sentences(filen)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    # The choice does not change between steps, so decide it once.
    use_reference = model is None or dict_vect is None or label_enc is None

    X_unEncoded = []
    y_unEncoded = []
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        state = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            # NOTE(review): the feature-name list is passed as [] exactly as
            # before -- confirm `extract` ignores or tolerates it.
            featureRow = extract(stack, queue, state, [], sentence)
            if use_reference:
                stack, queue, state, trans = reference(stack, queue, state)
            else:
                featureRow_encoded = dict_vect.transform(featureRow)
                trans_nr = model.predict(featureRow_encoded)
                # Use the `label_enc` argument; the previous code referred
                # to an undefined name `le`.
                trans = label_enc.inverse_transform(trans_nr)
                print(trans[0])

                # Apply the single predicted transition to `state`. The
                # previous code read and wrote an undefined `graph`, so the
                # parser state was never updated in this branch.
                stack, queue, state, trans = parse_ml(stack, queue, state,
                                                      trans[0])

            X_unEncoded.append(featureRow)
            y_unEncoded.append(trans)

        stack, state = transition.empty_stack(stack, state)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = state['heads'][word['id']]
    return X_unEncoded, y_unEncoded
Beispiel #7
0
def extract_features(formatted_corpus, feature_names):
    """Return feature rows and transitions, skipping mismatching sentences.

    Each sentence is walked with ``reference`` until its queue is empty,
    taking one feature row with ``extract`` before every step. The rows of
    a sentence are kept only if ``transition.equal_graphs`` accepts the
    graph that was built for it.

    Args:
        formatted_corpus: Iterable of sentences (iterables of word dicts).
        feature_names: Passed through to ``extract``.

    Returns:
        A tuple ``(X, Y)`` of two position-aligned lists: feature rows and
        transitions.
    """
    rows, labels = [], []
    for sentence in formatted_corpus:
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        stack, queue = [], list(sentence)

        # (row, transition) pairs of this sentence, held back until the
        # graph check below has passed.
        pending = []
        while queue:
            row = extract(stack, queue, graph, feature_names, sentence)
            stack, queue, graph, trans = reference(stack, queue, graph)
            pending.append((row, trans))
        stack, graph = transition.empty_stack(stack, graph)

        if not transition.equal_graphs(sentence, graph):
            continue
        rows.extend(row for row, _ in pending)
        labels.extend(trans for _, trans in pending)

    return rows, labels
Beispiel #8
0
def extract_features_sent(sentence, feature_names, classifier, dict_classes,
                          vec):
    """Parse one sentence with a trained classifier and annotate its words.

    At every step a ten-value feature row is built from the first two stack
    items, the first two queue items and two parser-state checks. The
    classifier predicts a class number and the matching transition is
    applied through ``parse_ml`` until the queue is empty.

    Args:
        sentence: Sequence of word dicts; each must provide the keys
            ``'id'``, ``'cpostag'`` and ``'form'``.
        feature_names: Names paired (in order) with the feature values:
            stack[0] cpostag, stack[1] cpostag, stack[0] form,
            stack[1] form, queue[0] cpostag, queue[1] cpostag,
            queue[0] form, queue[1] form, ``can_reduce``, ``can_leftarc``.
        classifier: Object whose ``predict`` returns an indexable result;
            element 0 is used as the class number.
        dict_classes: Mapping from class number to the transition handed
            to ``parse_ml``.
        vec: Object whose ``transform`` turns the feature dict into the
            classifier input.

    Returns:
        The feature dict of the last parsing step, or an empty list when
        the sentence has no words.

    Side effects:
        Sets ``'head'`` and ``'deprel'`` on every word of ``sentence``.
    """

    def _value(items, index, key):
        # Value of `key` for the item at `index`, or 'nil' when absent.
        return items[index][key] if len(items) > index else 'nil'

    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    # Stays an empty list if the loop never runs (kept for compatibility
    # with the original return value).
    X = []
    while queue:
        row = [
            _value(stack, 0, 'cpostag'),
            _value(stack, 1, 'cpostag'),
            _value(stack, 0, 'form'),
            _value(stack, 1, 'form'),
            _value(queue, 0, 'cpostag'),
            _value(queue, 1, 'cpostag'),
            _value(queue, 0, 'form'),
            _value(queue, 1, 'form'),
            transition.can_reduce(stack, graph),
            transition.can_leftarc(stack, graph),
        ]
        X = dict(zip(feature_names, row))
        trans_nr = classifier.predict(vec.transform(X))[0]
        trans = dict_classes[trans_nr]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)

    # The return value is discarded, so the lookups below rely on
    # empty_stack updating `graph` in place.
    # NOTE(review): confirm against transition.empty_stack.
    transition.empty_stack(stack, graph)
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return X
    # NOTE(review): every statement from here to the next `def` follows a
    # `return` at the same indentation, so as written none of it can run.
    # It reads like the body of a separate corpus-walking routine whose
    # `def` line is missing. `train_file` is not defined anywhere in the
    # visible code.
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
    column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        # Progress line every 1000 sentences.
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        # Fresh parser state for each sentence; the graph starts with only
        # the '0' entry, whose head is '0' and whose relation is 'ROOT'.
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        # Apply `reference` until the queue is empty, recording each
        # transition it returns.
        while queue:
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poorman's projectivization to have well-formed graphs:
        # each word's head is overwritten with the head stored in `graph`.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        print(transitions)
        print(graph)
def extract_features(formatted_corpus,
                     mode,
                     test_mode=False,
                     vec=None,
                     classifier=None):
    """Extract feature rows and transitions, optionally parsing with a model.

    In training mode (``test_mode`` false) the transitions come from
    ``reference``. In test mode every feature row is encoded with ``vec``,
    classified with ``classifier`` and applied with ``parse_ml``; the parsed
    corpus is then written with ``conll.save``.

    Args:
        formatted_corpus: Iterable of sentences; each sentence is an
            iterable of word dicts with at least the key ``'id'``.
        mode: ``1``, ``2`` or ``3``; selects ``extract_mode_1``,
            ``extract_mode_2`` or ``extract_mode_3``.
        test_mode: When true, predict transitions instead of using
            ``reference``.
        vec: Object whose ``transform`` encodes a feature row (test mode).
        classifier: Object whose ``predict`` classifies an encoded row
            (test mode).

    Returns:
        A tuple ``(X, transitions)`` of two position-aligned lists.

    Raises:
        ValueError: If ``mode`` is not 1, 2 or 3 and there is at least one
            word to process. Previously this surfaced later as an
            ``UnboundLocalError`` on ``X_row``.

    Side effects:
        Prints the first six (row, transition) pairs. In test mode, sets
        ``'head'`` and ``'deprel'`` on every word and saves the corpus.
    """
    # EXTRACT FEATURES
    feature_names_1 = [
        'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word', 'can-re',
        'can-la'
    ]
    feature_names_2 = [
        'stack1_POS', 'stack1_word', 'queue1_POS', 'queue1_word'
    ]
    feature_names_3 = ['left_POS', 'left_word', 'right_POS', 'right_word']

    # The whole mapping is handed to the extract_mode_* helpers.
    feature_names = {
        'mode1': feature_names_1,
        'mode2': feature_names_2,
        'mode3': feature_names_3
    }
    X = []
    transitions = []
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            if mode == 3:
                X_row = extract_mode_3(stack, queue, graph, feature_names,
                                       sentence)
            elif mode == 2:
                X_row = extract_mode_2(stack, queue, graph, feature_names,
                                       sentence)
            elif mode == 1:
                X_row = extract_mode_1(stack, queue, graph, feature_names,
                                       sentence)
            else:
                raise ValueError(
                    'mode must be 1, 2 or 3, got {!r}'.format(mode))

            if test_mode:
                X_row_vec = vec.transform(X_row)
                trans_nr = classifier.predict(X_row_vec)
                # NOTE(review): `trans_nr` is the raw classifier output and
                # is passed on undecoded -- confirm parse_ml accepts it.
                stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                      trans_nr)
            else:
                stack, queue, graph, trans = reference(stack, queue, graph)
            X.append(X_row)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        if test_mode:
            for word in sentence:
                word['head'] = graph['heads'][word['id']]
                word['deprel'] = graph['deprels'][word['id']]

    for pos, e in enumerate(X[:6]):
        print("x = {}, y= {}".format(e, transitions[pos]))

    if test_mode:
        # NOTE(review): `column_names_2006` is not defined in this function;
        # it must exist at module level for this call to work.
        conll.save('out_{}_mode_{}.conll'.format("test", mode),
                   formatted_corpus, column_names_2006)
    return X, transitions
Beispiel #11
0
    # Fragment: the enclosing `def` and the definitions of
    # `formatted_corpus`, `sent_cnt` and `feature_names2` are not visible.
    # Collects one feature row (X) and one transition (Y) per parsing step
    # over the whole corpus, then prints all feature rows.
    X = list()
    Y = list()
    for sentence in formatted_corpus:
        sent_cnt += 1
        # Progress line every 1000 sentences.
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        # Fresh parser state for each sentence; the state starts with only
        # the '0' entry, whose head is '0' and whose relation is 'ROOT'.
        stack = []
        queue = list(sentence)
        state = {}
        state['heads'] = {}
        state['heads']['0'] = '0'
        state['deprels'] = {}
        state['deprels']['0'] = 'ROOT'
        transitions = []
        #features.extract(stack,queue,state,feature_names,sentence)
        #if sent_cnt<2:
        # The feature row is taken before `reference` advances the parser,
        # so X[i] describes the state in which Y[i] was chosen.
        while queue:
                X.append(features.extract2(stack,queue,state,feature_names2,sentence))
                stack, queue, state, trans = reference(stack, queue, state)
                transitions.append(trans)
                Y.append(trans)
        stack, state = transition.empty_stack(stack, state)
        #print('Equal graphs:', transition.equal_graphs(sentence, state))

        # Poorman's projectivization to have well-formed graphs.
        # (Disabled here: the words' heads are left untouched.)
        #for word in sentence:
        #    word['head'] = state['heads'][word['id']]
        #print(transitions)
        #print(state)
    print(X)
    #print(Y)
Beispiel #12
0
    # Fragment: the enclosing `def` and the definition of `train_file` are
    # not visible. Reads a corpus, walks every sentence with `reference`
    # and prints the transitions and the resulting graph.
    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
    column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    sent_cnt = 0
    for sentence in formatted_corpus:
        sent_cnt += 1
        # Progress line every 1000 sentences.
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
        # Fresh parser state for each sentence; the graph starts with only
        # the '0' entry, whose head is '0' and whose relation is 'ROOT'.
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        # Apply `reference` until the queue is empty, recording each
        # transition it returns.
        while queue:
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
        stack, graph = transition.empty_stack(stack, graph)
        print('Equal graphs:', transition.equal_graphs(sentence, graph))

        # Poorman's projectivization to have well-formed graphs:
        # each word's head is overwritten with the head stored in `graph`.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        print(transitions)
        print(graph)