def main():
    """Print the most frequent subject-verb pairs and subject-verb-object triples.

    The CoNLL-X Swedish training corpus is processed first, followed by every
    CoNLL-U training file returned by ``conll.get_files``. After the last
    corpus has been reported the interpreter is terminated with status 0.
    """
    top_n = 5  # How many pairs/triples to print at the end

    # Corpora/Information for CoNLL-X
    conllx_columns = [
        'id', 'form', 'lemma', 'cpostag', 'postag',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    ]
    conllx_train_path = '../Corpus/swedish_talbanken05_train-conllx.txt'
    conllx_test_path = '../Corpus/swedish_talbanken05_test-conllx.txt'  # not used below

    # In the deps field, there may be a '0:root' string.
    # This signifies that that word is the root of the dependency tree.
    conllu_columns = [
        'id', 'form', 'lemma', 'upostag', 'xpostag',
        'feats', 'head', 'deprel', 'deps', 'misc',
    ]
    conllu_paths = conll.get_files(
        '../Corpus/Universal_Dependencies_Corpora_Only', 'train.conllu')

    # Count and report on the CoNLL-X corpus.
    conllx_rows = conll.split_rows_conllx(
        conll.read_sentences(conllx_train_path), conllx_columns)
    conllx_pair_counts = count_subject_verb_pairs_conllx(conllx_rows)
    conllx_triple_counts = count_subject_verb_object_triples_conllx(conllx_rows)
    print_n_most_frequent(
        conllx_pair_counts, conllx_triple_counts, top_n, conllx_train_path)

    # Count and report on each CoNLL-U corpus in turn.
    # NOTE: es_gsd-ud-train.conllu identifies %'s as an nsubj, so when %'s are
    # NOTE: printed out it is technically correct. To drop them, add another
    # NOTE: check when performing the count and exclude %'s.
    for conllu_path in conllu_paths:
        conllu_rows = conll.split_rows_conllu(
            conll.read_sentences(conllu_path), conllu_columns)
        conllu_pair_counts = count_subject_verb_pairs_conllu(conllu_rows)
        conllu_triple_counts = count_subject_verb_object_triples_conllu(conllu_rows)
        print_n_most_frequent(
            conllu_pair_counts, conllu_triple_counts, top_n, conllu_path)

    exit(0)  # Exit code of 0 is good.
def main():
    """Run each pickled classifier/model pair over the blind test corpus.

    For feature set number ``i`` (1-based) the files ``ml_classifier<i>.pickle``
    and ``ml_model<i>.pickle`` are loaded from the current working directory and
    handed to ``perform_prediction`` together with the split test sentences.

    Raises:
        OSError: if a pickle file cannot be opened (e.g. FileNotFoundError).
    """
    test_corpus = '../Corpus/swedish_talbanken05_test_blind-conllx.txt'
    training_column_names = [
        'id', 'form', 'lemma', 'cpostag', 'postag',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    ]

    test_sentences = conll.read_sentences(test_corpus)

    # Take the sentences and break them up into a useful format.
    # Each corpus is a list of lists of dictionaries:
    # each sentence is a list, each word is a dictionary.
    # NOTE(review): the blind test corpus is split with the 10-column training
    # layout, exactly as before -- confirm this is intended.
    split_test_sentences = conll.split_rows(test_sentences, training_column_names)

    feature_names_1 = [
        'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
        'can_ra', 'can_la',
    ]
    feature_names_2 = [
        'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
        'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
        'can_ra', 'can_la',
    ]
    feature_names_3 = [
        'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
        'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
        'stack0_next_word_POS', 'stack0_next_word_word',
        'stack0_prev_word_POS', 'stack0_prev_word_word',
        'can_ra', 'can_la',
    ]
    feature_sets = [feature_names_1, feature_names_2, feature_names_3]

    for i, feature_set in enumerate(feature_sets, start=1):
        classifier_filename = 'ml_classifier' + str(i) + '.pickle'
        model_filename = 'ml_model' + str(i) + '.pickle'

        # Open both files before loading either (same failure order as before)
        # and close them deterministically instead of leaking the handles.
        # NOTE: pickle.load executes arbitrary code from the file; only load
        # pickles produced by the accompanying training script.
        with open(classifier_filename, 'rb') as classifier_file, \
                open(model_filename, 'rb') as model_file:
            classifier = pickle.load(classifier_file)
            model = pickle.load(model_file)

        perform_prediction(split_test_sentences, feature_set, classifier, model)
def predict_sentence():
    """Parse the blind test corpus with the trained model and save the result.

    Every sentence is parsed transition by transition: features are extracted
    with ``features.extract_3``, encoded with the module-level ``vec``,
    classified with the module-level ``model`` and decoded with the
    module-level ``label``. The predicted head and dependency relation are
    written back into each word, and the whole corpus is finally passed to
    ``conll.save`` under the name ``"test"``.
    """
    train_file = 'swedish_talbanken05_train.conll'  # not used below
    test_file = 'swedish_talbanken05_test_blind.conll'
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    ]
    column_names_2006_test = [
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    ]

    formatted_corpus = conll.split_rows(
        conll.read_sentences(test_file), column_names_2006_test)

    # Three feature templates of increasing size; only features3 is used here.
    features1 = ['word_s0', 'pos_s0', 'word_q0', 'pos_q0', 'can_re', 'can_ra']
    features2 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'can_re', 'can_ra',
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra',
    ]

    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

        # Fresh parser state: empty stack, full queue, graph holding only ROOT.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        transitions = []  # never filled in this function

        while queue:
            feature_row = features.extract_3(
                stack, queue, graph, features3, sentence)
            encoded_row = vec.transform(feature_row)
            trans_nr = model.predict(encoded_row)
            trans = label.inverse_transform(trans_nr)
            print(trans)
            # Author's note (translated from Swedish "fel Graph"): wrong graph.
            stack, queue, graph, trans = parse_ml(stack, queue, graph, trans[0])

        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]
            word['deprel'] = graph['deprels'][word['id']]

    conll.save("test", formatted_corpus, column_names_2006)
def train_model():
    """Build the training examples from the training corpus.

    Each sentence is parsed with the ``reference`` transition function. Before
    every transition the current features are extracted with
    ``features.extract_3``; the feature row and the transition taken form one
    training example.

    Returns:
        A pair ``(x_vect, y_vect)``: the list of extracted feature rows and the
        parallel list of transitions returned by ``reference``.
    """
    train_file = 'swedish_talbanken05_train.conll'
    test_file = 'swedish_talbanken05_test_blind.conll'  # not used below
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    ]
    column_names_2006_test = [  # not used below
        'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
    ]

    formatted_corpus = conll.split_rows(
        conll.read_sentences(train_file), column_names_2006)

    # Three feature templates of increasing size; only features3 is used here.
    features1 = ['word_s0', 'pos_s0', 'word_q0', 'pos_q0', 'can_re', 'can_ra']
    features2 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'can_re', 'can_ra',
    ]
    features3 = [
        'word_s0', 'pos_s0', 'word_s1', 'pos_s1',
        'word_q0', 'pos_q0', 'word_q1', 'pos_q1',
        'word_n0', 'pos_n0', 'word_n1', 'pos_n1',
        'can_re', 'can_ra',
    ]

    x_vect, y_vect = [], []
    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

        # Fresh parser state: empty stack, full queue, graph holding only ROOT.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        transitions = []

        while queue:
            # Features describe the state *before* the transition is applied.
            feature_row = features.extract_3(
                stack, queue, graph, features3, sentence)
            stack, queue, graph, trans = reference(stack, queue, graph)
            transitions.append(trans)
            x_vect.append(feature_row)
            y_vect.append(trans)

        stack, graph = transition.empty_stack(stack, graph)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return x_vect, y_vect
def calculateSomething(filen, model=None, dict_vect=None, label_enc=None):
    """Parse every sentence of a CoNLL-X file and collect (features, transition) rows.

    The function runs in one of two modes:

    * Reference mode (any of ``model``, ``dict_vect``, ``label_enc`` is None):
      each transition is chosen by ``reference``.
    * Prediction mode (all three given): the feature row is encoded with
      ``dict_vect``, classified with ``model``, decoded with ``label_enc`` and
      applied with ``parse_ml``.

    After parsing, each word's ``'head'`` entry is overwritten with the head
    found in the resulting parser state.

    Args:
        filen: path of the corpus file passed to ``conll.read_sentences``.
        model: fitted classifier exposing ``predict``, or None.
        dict_vect: fitted vectorizer exposing ``transform``, or None.
        label_enc: fitted label encoder exposing ``inverse_transform``, or None.

    Returns:
        A pair ``(X_unEncoded, y_unEncoded)``: the unencoded feature rows and
        the parallel list of transitions that were applied.
    """
    column_names_2006 = [
        'id', 'form', 'lemma', 'cpostag', 'postag',
        'feats', 'head', 'deprel', 'phead', 'pdeprel',
    ]

    sentences = conll.read_sentences(filen)
    formatted_corpus = conll.split_rows(sentences, column_names_2006)

    # The mode never changes while parsing, so decide it once.
    use_reference = model is None or dict_vect is None or label_enc is None

    X_unEncoded = []
    y_unEncoded = []
    for sentence in formatted_corpus:
        # Fresh parser state: empty stack, full queue, graph holding only ROOT.
        stack = []
        queue = list(sentence)
        state = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            featureRow = extract(stack, queue, state, [], sentence)
            if use_reference:
                stack, queue, state, trans = reference(stack, queue, state)
            else:
                featureRow_encoded = dict_vect.transform(featureRow)
                trans_nr = model.predict(featureRow_encoded)
                # Fixed: decode with the label_enc argument (was the undefined
                # name `le`).
                trans = label_enc.inverse_transform(trans_nr)
                print(trans[0])
                # Fixed: thread the parser state through `state` (was the
                # undefined local `graph`, which raised UnboundLocalError).
                # NOTE(review): `trans` is passed on as returned by
                # inverse_transform, as before; other callers of parse_ml pass a
                # single label (trans[0]) -- confirm which one parse_ml expects.
                stack, queue, state, trans = parse_ml(stack, queue, state, trans)
            X_unEncoded.append(featureRow)
            y_unEncoded.append(trans)

        stack, state = transition.empty_stack(stack, state)

        # Poorman's projectivization to have well-formed graphs.
        for word in sentence:
            word['head'] = state['heads'][word['id']]

    return X_unEncoded, y_unEncoded
def computeVectors(file): column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel'] sentences = conll.read_sentences(file) formatted_corpus = conll.split_rows(sentences, column_names_2006) X = [] y = [] feature_names = [ 'stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word', 'can-re', 'can-la' ] #feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', # 'queue0_word', 'queue1_word','can-re', 'can-la'] #feature_names=['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', # 'queue0_word', 'queue1_word','next_word_POS','next_word', 'prev_word_POS', 'prev_word', # 'can-re', 'can-la'] for sentence in formatted_corpus:
print("# of pairs:", counter) print("Morst frequent triples:") for i, triple in enumerate(triples_sorted): if i >= 5: break print(triple[1], triple[0]) if __name__ == '__main__': column_names = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel' ] train_file = "./corpus/swedish_talbanken05_train.conll" train_corpus = conll.read_sentences(train_file) train_corpus = conll.split_rows(train_corpus, column_names) print(train_file, len(train_corpus)) get_pairs(train_corpus) get_triples(train_corpus) column_names = [ 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc' ] SUB = 'nsubj' OBJ = 'obj' files = conll.get_files("/usr/local/cs/EDAN20/ud-treebanks-v2.4/", "train.conllu")
X = (dict(zip(feature_names, x))) trans_nr = classifier.predict(vec.transform(X))[0] trans = dict_classes[trans_nr] stack, queue, graph, trans = parse_ml(stack, queue, graph, trans) x = list() transition.empty_stack(stack, graph) for word in sentence: word['head'] = graph['heads'][word['id']] word['deprel'] = graph['deprels'][word['id']] return X if __name__ == '__main__': test_file = 'swedish_talbanken05_test_blind.conll' sentences = conll.read_sentences(test_file) column_names_2006 = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel' ] column_names_2006_test = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats' ] formatted_corpus = conll.split_rows(sentences, column_names_2006) feature_names = [ 'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word', 'can-re', 'can-la', 'before_word', 'before_POS', 'after_word', 'after_POS' ] classifier = pickle.load(open('model3.pkl', 'rb')) #print(classifier)
if not connected(sentence[i], sentence[id_head], sentence): np_links.append(word) break return np_links if __name__ == '__main__': train_file = '../../../corpus/conllx/sv/swedish_talbanken05_train.conll' # train_file = 'test_x' # test_file = '../../../corpus/conllx/sv/swedish_talbanken05_test.conll' test_file = '../../../corpus/conllx/sv/swedish_talbanken05_test_blind.conll' column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel'] column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats'] sentences = conll.read_sentences(train_file) formatted_corpus = conll.split_rows(sentences, column_names_2006) # print(formatted_corpus[0]) for sentence in formatted_corpus: if sentence[1]['form'] == 'Vad' and sentence[2]['form'] == 'beror': print_sentence(sentence) print(sentence) np_links = nonprojective_links(sentence) print("nonprojective Links", np_links) projective_order = [] inorder(sentence[0], sentence, projective_order) print(projective_order)
# print('re', stack[0]['cpostag'], queue[0]['cpostag']) stack, queue, graph = transition.reduce(stack, queue, graph) return stack, queue, graph, 're' # Shift # print('sh', [], queue[0]['cpostag']) stack, queue, graph = transition.shift(stack, queue, graph) return stack, queue, graph, 'sh' if __name__ == '__main__': train_file = 'swedish_talbanken05_train.conll' test_file = 'swedish_talbanken05_test_blind.conll' column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel'] column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats'] sentences = conll.read_sentences(train_file) formatted_corpus = conll.split_rows(sentences, column_names_2006) sent_cnt = 0 for sentence in formatted_corpus: sent_cnt += 1 if sent_cnt % 1000 == 0: print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True) stack = [] queue = list(sentence) graph = {} graph['heads'] = {} graph['heads']['0'] = '0' graph['deprels'] = {} graph['deprels']['0'] = 'ROOT' transitions = []
'phead', 'pdeprel' ] column_names_2006_test = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats' ] #for i in range(1, 4): vec = DictVectorizer(sparse=True) classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear', verbose=1) # classifier = linear_model.Perceptron(penalty='l2') # classifier = tree.DecisionTreeClassifier() # classifier = linear_model.SGDClassifier(penalty='l2') sentences = conll.read_sentences(train_file) sentences_test = conll.read_sentences(test_file) formatted_corpus = conll.split_rows(sentences, column_names_2006) formatted_corpus_test = conll.split_rows(sentences_test, column_names_2006_test) X, y = extract_features(formatted_corpus, mode=3) train_model(X, y, classifier, vec) X_test, y_test = extract_features(formatted_corpus_test, mode=3, test_mode=True, vec=vec, classifier=classifier) test_model(X_test, y_test, classifier, vec)
# X, y = extract_features(formatted_corpus, feature_names) # # for i in range(9): # print(str(X[i]) + " " + str(y[i])) start_time = time.clock() train_corpus = 'swedish_talbanken05_train.conll' test_corpus = 'swedish_talbanken05_test_blind.conll' column_names_2006 = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel'] column_names_2006_test = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats'] feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word', 'can-re', 'can-la', 'before_word', 'before_POS', 'after_word', 'after_POS'] train_sentences = conll.read_sentences(train_corpus) formatted_corpus = conll.split_rows(train_sentences, column_names_2006) print("Extracting the features...") X_dict, y_symbols = extract_features(formatted_corpus, feature_names) print("Encoding the features and classes...") # Vectorize the feature matrix and carry out a one-hot encoding vec = DictVectorizer(sparse=True) X = vec.fit_transform(X_dict) # The statement below will swallow a considerable memory # X = vec.fit_transform(X_dict).toarray() # print(vec.get_feature_names()) y, dict_classes, inv_dict_classes = encode_classes(y_symbols)
'phead', 'pdeprel' ] test_column_names = ['id', 'form', 'lemma', 'cpostag', 'postag', 'feats'] # feature_names = ['stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word', # 'can_ra', 'can_la'] feature_names = [ 'stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', 'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word', 'can_ra', 'can_la' ] # feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word', # 'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word', # 'stack0_next_word_POS', 'stack0_next_word_word', 'stack0_prev_word_POS', 'stack0_prev_word_word', # 'can_ra', 'can_la'] # Read the corpus in and break into sentences training_sentences = conll.read_sentences(training_corpus) test_sentences = conll.read_sentences(test_corpus) # Take the sentences and break them up into a useful format # Each corpus is a list of lists of dictionaries # Corpus = dictionary, each sentence is a list, each word is a dictionary split_training_sentences = conll.split_rows(training_sentences, training_column_names) split_test_sentences = conll.split_rows(test_sentences, training_column_names) vec = DictVectorizer(sparse=True) classifier, model = train_the_model(split_training_sentences, feature_names)
triples = [] for tup in collector_objects: _object = tup[0] head_reference = tup[1] for pair, head in collector_pair.items(): if (head_reference == head): triples.append((pair[0], pair[1], _object)) break return triples column_names_2006 = [ 'id', 'form', 'lemma', 'cpostag', 'postag', 'feats', 'head', 'deprel', 'phead', 'pdeprel' ] sentences = conll.read_sentences('./swedish_talbanken05_train.conll.txt') formatted_corpus = conll.split_rows(sentences, column_names_2006) frequencies = {} for sentence in formatted_corpus: for token in sentence: if (token['deprel'] == 'SS'): subject = token['form'] head = token['head'] verb = sentence[int(head)]['form'] pair = (subject.lower(), verb.lower()) if pair in frequencies: frequencies[pair] += 1 else: frequencies[pair] = 1
head_reference = tup[1] for pair, head in collector_pair.items(): if (head_reference == head): triples.append((pair[0], pair[1], _object)) break return triples if __name__ == "__main__": column_names_u = [ 'id', 'form', 'lemma', 'upostag', 'xpostag', 'feats', 'head', 'deprel', 'deps', 'misc' ] fname = sys.argv[1] print(fname) sentences = conll.read_sentences(fname) formatted_corpus = conll.split_rows(sentences, column_names_u) frequencies = {} for sentence in formatted_corpus: for token in sentence: if (token['deprel'] == 'nsubj'): subject = token['form'] head = token['head'] verb = sentence[int(head)]['form'] pair = (subject.lower(), verb.lower()) if pair in frequencies: frequencies[pair] += 1 else: frequencies[pair] = 1 cnt = Counter(frequencies)
base_path = '/Users/pierre/Documents/Cours/EDA171/programs/pos_tagging/corpus/en/' train_file = base_path + 'CoNLL2009-ST-English-train-pos.txt' dev_file = base_path + 'CoNLL2009-ST-English-development-pos.txt' test_file = base_path + 'CoNLL2009-ST-test-words-pos.txt' elif corpus == 'CoNLL-U': column_names = [ 'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC' ] POS_key = 'UPOS' base_path = '/Users/pierre/Documents/Cours/EDAN20/corpus/ud-treebanks-v2.6/' train_file = base_path + 'UD_English-EWT/en_ewt-ud-train.conllu' dev_file = base_path + 'UD_English-EWT/en_ewt-ud-dev.conllu' test_file = base_path + 'UD_English-EWT/en_ewt-ud-test.conllu' train_sentences = conll.read_sentences(train_file) formatted_corpus = [ conll.split_rows(sentence, column_names) for sentence in train_sentences ] # print(formatted_corpus[0]) counts = Counts(formatted_corpus, column_names, POS_key) counts.count_all() # counts.print_stats() counts.print_debug() # print(counts.sentences[0]) # exit() if corpus == 'CoNLL2009': cm = ConfusionMatrix(formatted_corpus, POS_key) cm.compute_matrix()
print(X_dict) print(len(X_dict)) print(len(sent_tok_seq)) if __name__ == '__main__': column_names = [ 'ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC' ] base_path = '/Users/pierre/Documents/Cours/EDAN20/corpus/ud-treebanks-v2.6/' train_file = base_path + 'UD_English-EWT/en_ewt-ud-train.conllu' dev_file = base_path + 'UD_English-EWT/en_ewt-ud-dev.conllu' test_file = base_path + 'UD_English-EWT/en_ewt-ud-test.conllu' train_sentences = conll.read_sentences(train_file) formatted_train_corpus = [ conll.split_rows(sentence, column_names) for sentence in train_sentences ] train_sent_texts = [ conll.get_text(sentence) for sentence in train_sentences ] test_sentences = conll.read_sentences(test_file) formatted_test_corpus = [ conll.split_rows(sentence, column_names) for sentence in test_sentences ] test_sent_texts = [conll.get_text(sentence) for sentence in test_sentences] # Training a model X_symb_seq = []