Example No. 1
import pickle

import conll  # the course's CoNLL corpus reader


# Assumes perform_prediction is defined elsewhere in this file.
def main():
    test_corpus = '../Corpus/swedish_talbanken05_test_blind-conllx.txt'

    training_column_names = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
    test_column_names = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    test_sentences = conll.read_sentences(test_corpus)

    # Split the sentences into a useful format:
    # the corpus is a list of sentences, each sentence is a list of words,
    # and each word is a dictionary keyed by the column names
    split_test_sentences = conll.split_rows(test_sentences, training_column_names)
    feature_names_1 = ['stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
                       'can_ra', 'can_la']
    feature_names_2 = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
                       'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
                       'can_ra', 'can_la']
    feature_names_3 = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
                       'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
                       'stack0_next_word_POS', 'stack0_next_word_word', 'stack0_prev_word_POS', 'stack0_prev_word_word',
                       'can_ra', 'can_la']
    feature_sets = [feature_names_1, feature_names_2, feature_names_3]

    for i, feature_set in enumerate(feature_sets, start=1):
        classifier_filename = 'ml_classifier' + str(i) + '.pickle'
        model_filename = 'ml_model' + str(i) + '.pickle'
        # Load the classifier/model pair trained for this feature set
        with open(classifier_filename, 'rb') as classifier_file, \
                open(model_filename, 'rb') as model_file:
            classifier = pickle.load(classifier_file)
            model = pickle.load(model_file)
        perform_prediction(split_test_sentences, feature_set, classifier, model)
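For context, the classifier and model pickles loaded above have to be produced by an earlier training run. A minimal sketch of that step, assuming the train_the_model helper and split_training_sentences from Example No. 12:

import pickle

for i, feature_set in enumerate(feature_sets, start=1):
    # Train one classifier/model pair per feature set and serialize it
    # under the file names the prediction loop above expects.
    classifier, model = train_the_model(split_training_sentences, feature_set)
    with open('ml_classifier' + str(i) + '.pickle', 'wb') as f:
        pickle.dump(classifier, f)
    with open('ml_model' + str(i) + '.pickle', 'wb') as f:
        pickle.dump(model, f)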
Example No. 2
# Assumes module-level globals fitted elsewhere: model (the classifier),
# vec (a DictVectorizer), and label (a LabelEncoder), plus the course
# helpers conll, features, transition, and parse_ml.
def predict_sentence():
    train_file = 'swedish_talbanken05_train.conll'
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]

    sentences = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006_test)
    features1 = ['word_s0', 'pos_s0', 'word_q0', 'pos_q0', 'can_re', 'can_ra']
    features2 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'can_re', 'can_ra'
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra'
    ]

    sent_cnt = 0

    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences of', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        while queue:

            feat = features.extract_3(stack, queue, graph, features3, sentence)
            # print(feat)
            feat = vec.transform(feat)
            trans_nr = model.predict(feat)
            # print(trans_nr)
            trans = label.inverse_transform(trans_nr)
            print(trans)
            # wrong graph
            stack, queue, graph, trans = parse_ml(stack, queue, graph,
                                                  trans[0])

        stack, graph = transition.empty_stack(stack, graph)

        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    conll.save("test", formatted_corpus, column_names_2006)
Example No. 3
# Assumes the course helpers conll, features, transition, and the oracle
# function reference are available at module level.
def train_model():
    train_file = 'swedish_talbanken05_train.conll'
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)
    features1 = ['word_s0', 'pos_s0', 'word_q0', 'pos_q0', 'can_re', 'can_ra']
    features2 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'can_re', 'can_ra'
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1', 'word_q0', 'pos_q0',
        'word_q1', 'pos_q1', 'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra'
    ]

    sent_cnt = 0
    x_vect = []
    y_vect = []

    for sentence in formatted_corpus:
        sent_cnt += 1
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences of', len(formatted_corpus), flush=True)
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        transitions = []
        while queue:
            feat = features.extract_3(stack, queue, graph, features3, sentence)
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
            x_vect.append(feat)
            y_vect.append(trans)
            # print(feat, " = ", trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poor man's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
        # print(transitions)
        # print(graph)
    return x_vect, y_vect
Example No. 4
# Assumes the course helpers conll, extract, reference, transition, and
# parse_ml are available at module level.
def calculateSomething(filen, model=None, dict_vect=None, label_enc=None):
        column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']
        column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']
        sentences = conll.read_sentences(filen)
        formatted_corpus = conll.split_rows(sentences, column_names_2006)
        sent_cnt = 0
        X_unEncoded = []
        y_unEncoded = []
        for sentence in formatted_corpus:
            sent_cnt += 1
            #if sent_cnt % 1000 == 0:
            #    print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)
            stack = []
            queue = list(sentence)
            state = {}
            state['heads'] = {}
            state['heads']['0'] = '0'
            state['deprels'] = {}
            state['deprels']['0'] = 'ROOT'
            transitions = []
            while queue:

                featureRow = extract(stack, queue, state, [], sentence)
                if model is None or dict_vect is None or label_enc is None:
                    stack, queue, state, trans = reference(stack, queue, state)
                    transitions.append(trans)
                else:
                    featureRow_encoded = dict_vect.transform(featureRow)
                    trans_nr = model.predict(featureRow_encoded)
                    trans = label_enc.inverse_transform(trans_nr)
                    print(trans[0])

                    stack, queue, state, trans = parse_ml(stack, queue, state, trans[0])

                X_unEncoded.append(featureRow)
                y_unEncoded.append(trans)

            stack, state = transition.empty_stack(stack, state)

            #print('Equal graphs:', transition.equal_graphs(sentence, state))

            # Poor man's projectivization to have well-formed graphs.
            for word in sentence:
                word['head'] = state['heads'][word['id']]
        return X_unEncoded, y_unEncoded
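calculateSomething doubles as an oracle-based feature extractor (no model passed) and as a parser (model, vectorizer, and label encoder passed). A sketch of both call modes, with file names assumed to match the other examples:

# First pass: run the oracle over the training corpus to collect raw
# feature dicts and gold transitions.
X_dict, y_symbols = calculateSomething('swedish_talbanken05_train.conll')

# ... fit dict_vect (a DictVectorizer), label_enc (a LabelEncoder), and a
# classifier model on X_dict / y_symbols, as in the sketch after
# Example No. 2 ...

# Second pass: parse the blind test corpus with the fitted objects.
calculateSomething('swedish_talbanken05_test_blind.conll',
                   model=model, dict_vect=dict_vect, label_enc=label_enc)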
Example No. 5
def computeVectors(file):
        column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel']

        sentences = conll.read_sentences(file)
        formatted_corpus = conll.split_rows(sentences, column_names_2006)

        X = []
        y = []

        feature_names = [
            'stack0_POS',
            'stack0_word',
            'queue0_POS',
            'queue0_word',
            'can-re',
            'can-la'
        ]
        #feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS',
        #                 'queue0_word', 'queue1_word','can-re', 'can-la']

        #feature_names=['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS',
        #               'queue0_word', 'queue1_word','next_word_POS','next_word', 'prev_word_POS', 'prev_word',
        #               'can-re', 'can-la']
        for sentence in formatted_corpus:
            pass  # loop body truncated in this snippet
Example No. 6
        dev_file = base_path + 'CoNLL2009-ST-English-development-pos.txt'
        test_file = base_path + 'CoNLL2009-ST-test-words-pos.txt'
    elif corpus == 'CoNLL-U':
        column_names = [
            'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL',
            'DEPS', 'MISC'
        ]
        POS_key = 'UPOS'
        base_path = '/Users/pierre/Documents/Cours/EDAN20/corpus/ud-treebanks-v2.6/'
        train_file = base_path + 'UD_English-EWT/en_ewt-ud-train.conllu'
        dev_file = base_path + 'UD_English-EWT/en_ewt-ud-dev.conllu'
        test_file = base_path + 'UD_English-EWT/en_ewt-ud-test.conllu'

    train_sentences = conll.read_sentences(train_file)
    formatted_corpus = [
        conll.split_rows(sentence, column_names)
        for sentence in train_sentences
    ]
    # print(formatted_corpus[0])

    counts = Counts(formatted_corpus, column_names, POS_key)
    counts.count_all()
    # counts.print_stats()
    counts.print_debug()
    # print(counts.sentences[0])
    # exit()
    if corpus == 'CoNLL2009':
        cm = ConfusionMatrix(formatted_corpus, POS_key)
        cm.compute_matrix()
        cm.print()
        print("Accuracy: ", cm.compute_accuracy())
Example No. 7
        for pair, head in collector_pair.items():
            if head_reference == head:
                triples.append((pair[0], pair[1], _object))
                break
    return triples


if __name__ == "__main__":
    column_names_u = [
        'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel',
        'deps', 'misc'
    ]
    fname = sys.argv[1]
    print(fname)
    sentences = conll.read_sentences(fname)
    formatted_corpus = conll.split_rows(sentences, column_names_u)
    frequencies = {}
    for sentence in formatted_corpus:
        for token in sentence:
            if token['deprel'] == 'nsubj':
                subject = token['form']
                head = token['head']
                verb = sentence[int(head)]['form']
                pair = (subject.lower(), verb.lower())
                if pair in frequencies:
                    frequencies[pair] += 1
                else:
                    frequencies[pair] = 1

    cnt = Counter(frequencies)
    print(cnt.most_common(5))
Example No. 8
    print("# of pairs:", counter)
    print("Morst frequent triples:")
    for i, triple in enumerate(triples_sorted):
        if i >= 5:
            break
        print(triple[1], triple[0])


import conll

if __name__ == '__main__':
    column_names = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    train_file = "./corpus/swedish_talbanken05_train.conll"
    train_corpus = conll.read_sentences(train_file)
    train_corpus = conll.split_rows(train_corpus, column_names)

    print(train_file, len(train_corpus))
    get_pairs(train_corpus)
    get_triples(train_corpus)

    column_names = [
        'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel',
        'deps', 'misc'
    ]
    SUB = 'nsubj'
    OBJ = 'obj'

    files = conll.get_files("/usr/local/cs/EDAN20/ud-treebanks-v2.4/",
                            "train.conllu")
    for file in files:
        pass  # loop body truncated in this snippet
Example No. 9
            if not connected(sentence[i], sentence[id_head], sentence):
                np_links.append(word)
                break
    return np_links


import conll

if __name__ == '__main__':
    train_file = '../../../corpus/conllx/sv/swedish_talbanken05_train.conll'
    # train_file = 'test_x'
    # test_file = '../../../corpus/conllx/sv/swedish_talbanken05_test.conll'
    test_file = '../../../corpus/conllx/sv/swedish_talbanken05_test_blind.conll'

    column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
                         'head', 'deprel', 'phead', 'pdeprel']
    column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']

    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)
    # print(formatted_corpus[0])

    for sentence in formatted_corpus:
        if sentence[1]['form'] == 'Vad' and sentence[2]['form'] == 'beror':
            print_sentence(sentence)
            print(sentence)
            np_links = nonprojective_links(sentence)
            print("nonprojective Links", np_links)

            projective_order = []
            inorder(sentence[0], sentence, projective_order)
            print(projective_order)
Example No. 10

    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats'
    ]

    #for i in range(1, 4):
    vec = DictVectorizer(sparse=True)
    classifier = linear_model.LogisticRegression(penalty='l2',
                                                 dual=True,
                                                 solver='liblinear',
                                                 verbose=1)
    # classifier = linear_model.Perceptron(penalty='l2')
    # classifier = tree.DecisionTreeClassifier()
    # classifier = linear_model.SGDClassifier(penalty='l2')
    sentences = conll.read_sentences(train_file)
    sentences_test = conll.read_sentences(test_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)
    formatted_corpus_test = conll.split_rows(sentences_test,
                                             column_names_2006_test)

    X, y = extract_features(formatted_corpus, mode=3)

    train_model(X, y, classifier, vec)

    X_test, y_test = extract_features(formatted_corpus_test,
                                      mode=3,
                                      test_mode=True,
                                      vec=vec,
                                      classifier=classifier)
    test_model(X_test, y_test, classifier, vec)
Example No. 11
            for i in range(2):
                feature_names.append('stack_front' + str(i * 2 - 1) + '_' +
                                     element)
    feature_names.extend(['can-re', 'can-la'])
    return feature_names


import conll

if __name__ == '__main__':
    train_file = 'swedish_talbanken05_train.conll'
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    sentences = conll.read_sentences(train_file)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    settings = [(1, False), (2, False), (2, True)]
    feature_names = []
    file_names = []
    for i in range(3):
        file_names.append('linear_model_' + str(i + 1))
        feature_names.append(create_feature_names(settings[i]))
    """
    TRAINING THE MODELS
    """
    print("Extracting the features...")
    feature_lists_X = []
    y = []
    y_completed = False
    for i in range(len(settings)):
        pass  # loop body truncated in this snippet
Example No. 12
    test_column_names = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats']
    # feature_names = ['stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
    #                  'can_ra', 'can_la']
    feature_names = [
        'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS',
        'queue1_POS', 'queue0_word', 'queue1_word', 'can_ra', 'can_la'
    ]
    # feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
    #                  'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
    #                  'stack0_next_word_POS', 'stack0_next_word_word', 'stack0_prev_word_POS', 'stack0_prev_word_word',
    #                  'can_ra', 'can_la']

    # Read the corpus in and break into sentences
    training_sentences = conll.read_sentences(training_corpus)
    test_sentences = conll.read_sentences(test_corpus)

    # Split the sentences into a useful format:
    # the corpus is a list of sentences, each sentence is a list of words,
    # and each word is a dictionary keyed by the column names
    split_training_sentences = conll.split_rows(training_sentences,
                                                training_column_names)
    split_test_sentences = conll.split_rows(test_sentences,
                                            training_column_names)

    vec = DictVectorizer(sparse=True)

    classifier, model = train_the_model(split_training_sentences,
                                        feature_names)

    perform_prediction(split_test_sentences, feature_names, classifier, model)
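train_the_model and perform_prediction are defined elsewhere in the file. A hypothetical sketch of train_the_model, assuming it runs the transition oracle over the training sentences, one-hot encodes the feature dicts, and fits a scikit-learn classifier; extract_features_and_transitions is a made-up stand-in for the course's oracle-driven extraction:

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression


def train_the_model(split_training_sentences, feature_names):
    # Oracle pass: one feature dict and one gold transition per parser step
    # (extract_features_and_transitions is hypothetical, not the course API).
    X_dict, y = extract_features_and_transitions(split_training_sentences,
                                                 feature_names)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(X_dict)  # one-hot encoding
    classifier = LogisticRegression(max_iter=1000)
    model = classifier.fit(X, y)   # fit() returns the fitted estimator
    return classifier, model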
Example No. 13
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

import conll
import features

if __name__ == '__main__':

    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel',
        'phead', 'pdeprel'
    ]
    train_file = '../../corpus/conllx/sv/swedish_talbanken05_train.conll'
    test_file = '../../corpus/conllx/sv/swedish_talbanken05_test.conll'

    train_sentences = conll.read_sentences(train_file)
    test_sentences = conll.read_sentences(test_file)
    formatted_train_corpus = conll.split_rows(train_sentences,
                                              column_names_2006)
    formatted_test_corpus = conll.split_rows(test_sentences, column_names_2006)
    for mode in [1, 3]:
        print("Extracting the features...")
        X_dict, y = features.extract_features(formatted_train_corpus, mode)
        print("Encoding the features...")
        # Vectorize the feature matrix and carry out a one-hot encoding
        vec = DictVectorizer(sparse=True)
        X = vec.fit_transform(X_dict)

        print("Training the model...")
        classifier = linear_model.LogisticRegression(penalty='l2',
                                                     dual=True,
                                                     solver='liblinear',
                                                     multi_class='ovr')
        model = classifier.fit(X, y)
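        # A plausible continuation, not in the original snippet: encode the
        # test features with the already fitted vectorizer and score the
        # predicted transitions against the oracle's. Assumes
        # features.extract_features handles the test corpus the same way.
        X_test_dict, y_test = features.extract_features(formatted_test_corpus,
                                                        mode)
        X_test = vec.transform(X_test_dict)  # reuse the training encoding
        y_pred = model.predict(X_test)
        print('Mode', mode, 'accuracy:',
              metrics.accuracy_score(y_test, y_pred))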