def extract_features(formatted_corpus, feature_names, training=True, model=None):
    """Build feature rows and matching transitions for a whole corpus.

    Every sentence is parsed step by step: before each step the current
    parser state is converted to a feature row with ``features.extract``,
    then ``reference`` advances the state and reports the transition it
    applied. Rows and transitions of all sentences are concatenated in
    corpus order, so ``X[i]`` is the state in which ``y[i]`` was chosen.

    Side effect: once a sentence has been parsed, the ``'head'`` entry of
    each of its words is overwritten with the head stored in the parser
    graph ("poor man's projectivization"). The corpus is therefore
    modified in place.

    Args:
        formatted_corpus: Iterable of sentences. Each sentence is a
            sequence of word mappings that provide ``'id'`` and accept an
            assignment to ``'head'``. ``len()`` is only needed for the
            progress message printed every 1000 sentences.
        feature_names: Passed through unchanged to ``features.extract``.
        training: Unused; kept so existing callers keep working.
        model: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(X, y)`` of two lists: the feature rows and the
        transitions returned by ``reference``.
    """
    X_1 = []
    y_1 = []
    for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
        if sent_cnt % 1000 == 0:
            print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

        # Initial parser state: empty stack, all words queued, and a graph
        # that only knows the artificial root node '0'.
        stack = []
        queue = list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        while queue:
            # The features must describe the state *before* the transition.
            X_1.append(
                features.extract(stack, queue, graph, feature_names, sentence)
            )
            stack, queue, graph, trans = reference(stack, queue, graph)
            y_1.append(trans)
        stack, graph = transition.empty_stack(stack, graph)

        # Poor man's projectivization: make the annotated heads agree with
        # the graph that the transition sequence actually produced.
        # NOTE(review): the earlier version also ran
        # transition.equal_graphs() here, only to fill a list that was never
        # read; that call is dropped on the assumption that it is a pure
        # predicate -- confirm against the transition module.
        for word in sentence:
            word['head'] = graph['heads'][word['id']]

    return (X_1, y_1)
def extract_features(formatted_corpus, feature_names):
    """Gather feature rows and transitions from the sentences of a corpus.

    Each sentence is parsed from an empty stack and a graph containing only
    the root node '0'. Before every step a feature row is produced with
    ``extract``; ``reference`` then advances the parser state and returns
    the transition it applied.

    The rows and transitions of a sentence are kept only when
    ``transition.equal_graphs(sentence, graph)`` is true for the final
    graph; otherwise the whole sentence is left out of the result.

    Args:
        formatted_corpus: Iterable of sentences (sequences of words).
        feature_names: Passed through unchanged to ``extract``.

    Returns:
        ``X, Y``: the list of feature rows and the list of transitions,
        aligned index by index.
    """
    X = []
    Y = []
    for sentence in formatted_corpus:
        stack, queue = [], list(sentence)
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

        # Buffer per sentence: nothing may reach X/Y until the final graph
        # has been checked.
        rows = []
        actions = []
        while queue:
            # One row of X, describing the state before the transition.
            row = extract(stack, queue, graph, feature_names, sentence)
            stack, queue, graph, action = reference(stack, queue, graph)
            rows.append(row)
            actions.append(action)
        stack, graph = transition.empty_stack(stack, graph)

        if not transition.equal_graphs(sentence, graph):
            continue
        X.extend(rows)
        Y.extend(actions)
    return X, Y
# Column names handed to conll.split_rows; the *_test variant stops before
# the 'head' column.
column_names_2006 = [
    'id', 'form', 'lemma', 'cpostag', 'postag',
    'feats', 'head', 'deprel', 'phead', 'pdeprel',
]
column_names_2006_test = [
    'id', 'form', 'lemma', 'cpostag', 'postag', 'feats',
]

sentences = conll.read_sentences(train_file)
formatted_corpus = conll.split_rows(sentences, column_names_2006)

# Parse every sentence with `reference` and print the transition sequence
# and the resulting graph.
sent_cnt = 0  # stays 0 when the corpus is empty
for sent_cnt, sentence in enumerate(formatted_corpus, start=1):
    if sent_cnt % 1000 == 0:
        print(sent_cnt, 'sentences on', len(formatted_corpus), flush=True)

    # Fresh parser state: empty stack, all words queued, graph holding only
    # the root node '0'.
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    transitions = []
    while queue:
        stack, queue, graph, trans = reference(stack, queue, graph)
        transitions.append(trans)
    stack, graph = transition.empty_stack(stack, graph)
    print('Equal graphs:', transition.equal_graphs(sentence, graph))

    # Poorman's projectivization to have well-formed graphs: overwrite each
    # word's head with the head recorded in the parser graph.
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
    print(transitions)
    print(graph)
graph['deprels'] = { } # Define another dictionary with dependency relations stored inside, again inside graph graph['deprels'][ '0'] = 'ROOT' # Make the first element in the dependency relations dictionary the 'ROOT' keyword transitions = [ ] # List of the transitions that each sentence will have to create its dependency tree while queue: # While you still have things in your input token queue stack, queue, graph, trans = reference(stack, queue, graph) # Stack, queue, and graph may have been modified # trans holds a string for the kind of transition that should be made (shift, reduce, left-arc, right-arc) transitions.append( trans ) # Append the transition string to the list of transitions for this sentence stack, graph = transition.empty_stack(stack, graph) # print('Equal graphs:', transition.equal_graphs(sentence, graph)) if not transition.equal_graphs(sentence, graph): # print('NOT EQUAL GRAPHS') nonprojective_sentences.append(sentence) # Poorman's projectivization to have well-formed graphs. for word in sentence: # "Replace" the head value on the word to the head value that we calculated with the dependency graph word['head'] = graph['heads'][word['id']] # print(transitions) # We print out the transitions that we make # print(graph) # And we print out the graph of arc heads and the dependency relation between them # End of the for sentence in formatted_corpus loop print('end?') shortest_nonprojective_sentence = min(nonprojective_sentences, key=len) for word in shortest_nonprojective_sentence: print(word)