def extract(stack, queue, state, feature_names, sentence):
    """Extract one feature dict for the current parser configuration.

    Features: form/postag of the two topmost stack items, form/postag of
    the two front queue items, then can-rightarc and can-reduce.

    :param stack: parser stack (list of CoNLL word dicts)
    :param queue: input queue (list of CoNLL word dicts)
    :param state: partial graph, passed to transition.can_reduce
    :param feature_names: names zipped with the extracted values
    :param sentence: unused here; kept for a uniform extractor signature
    :return: dict mapping feature_names to values
    """
    x = []
    stack_feats = ['nil'] * 4
    queue_feats = ['nil'] * 4
    # Explicit length checks replace the original bare `except:` clauses,
    # which silently swallowed KeyErrors as well as IndexErrors.
    if len(stack) > 0:
        stack_feats[0] = stack[0]['form']
        stack_feats[1] = stack[0]['postag']
    if len(stack) > 1:
        stack_feats[2] = stack[1]['form']
        stack_feats[3] = stack[1]['postag']
    if len(queue) > 0:
        queue_feats[0] = queue[0]['form']
        queue_feats[1] = queue[0]['postag']
    if len(queue) > 1:
        queue_feats[2] = queue[1]['form']
        queue_feats[3] = queue[1]['postag']
    x.extend(stack_feats)
    x.extend(queue_feats)
    x.append(transition.can_rightarc(stack))
    x.append(transition.can_reduce(stack, state))
    return dict(zip(feature_names, x))
def reference(stack, queue, graph):
    """Gold standard (oracle) parsing step.

    Derives the next transition from a manually-annotated corpus:
    sh, re, ra.deprel or la.deprel, tried in that priority order.

    :param stack: the stack
    :param queue: the input list
    :param graph: the set of relations already parsed
    :return: (stack, queue, graph, transition-string)
    """
    front = queue[0]
    if stack:
        top = stack[0]
        # Right arc: the queue front depends on the stack top.
        if top['id'] == front['head']:
            label = '.' + front['deprel']
            stack, queue, graph = transition.right_arc(stack, queue, graph)
            return stack, queue, graph, 'ra' + label
        # Left arc: the stack top depends on the queue front.
        if front['id'] == top['head']:
            label = '.' + top['deprel']
            stack, queue, graph = transition.left_arc(stack, queue, graph)
            return stack, queue, graph, 'la' + label
        # Reduce: legal, and some stack word still relates to the front.
        if transition.can_reduce(stack, graph):
            for word in stack:
                if word['id'] == front['head'] or word['head'] == front['id']:
                    stack, queue, graph = transition.reduce(stack, queue, graph)
                    return stack, queue, graph, 're'
    # Default action: shift the queue front onto the stack.
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'
def line_extract(stack, queue, graph, feature_names, sentence, samples, special):
    """Generic feature extractor.

    Takes `samples` (postag, form) pairs from the stack and the queue;
    when `special` is true, also adds the words immediately before and
    after the stack top in the sentence. Ends with can-reduce and
    can-leftarc.

    :return: dict mapping feature_names to values
    """
    values = []
    for struct in (stack, queue):
        for key in ('postag', 'form'):
            for pos in range(samples):
                values.append(struct[pos][key] if pos < len(struct) else 'nil')
    if special:
        # Context window around the stack top (one word either side).
        for key in ('postag', 'form'):
            for offset in (-1, 1):
                if stack:
                    neighbour = int(stack[0]['id']) + offset
                    if 0 <= neighbour < len(sentence):
                        values.append(sentence[neighbour][key])
                    else:
                        values.append('nil')
                else:
                    values.append('nil')
    values.append(transition.can_reduce(stack, graph))
    values.append(transition.can_leftarc(stack, graph))
    return dict(zip(feature_names, values))
def extract2(stack, queue, graph, feature_names, sentence):
    """10-feature extractor: postag/form of the two topmost stack items
    (slots 0-3), postag/form of the two front queue items (slots 4-7),
    then can-reduce (8) and can-leftarc (9).

    Bug fixes vs. the original:
    * an empty queue crashed with IndexError because the `len(queue) < 2`
      branch was not chained with `elif` and still read queue[0];
    * the empty-queue branch filled slots 5-8 with 'nil', clobbering the
      can-reduce slot (8) and leaving slot 4 unset.
    """
    feat_vec = [''] * 10
    # Stack top and second element.
    if not stack:
        feat_vec[0] = feat_vec[1] = feat_vec[2] = feat_vec[3] = 'nil'
    elif len(stack) < 2:
        feat_vec[1] = feat_vec[3] = 'nil'
        feat_vec[0] = stack[0]['postag']
        feat_vec[2] = stack[0]['form']
    else:
        feat_vec[0] = stack[0]['postag']
        feat_vec[1] = stack[1]['postag']
        feat_vec[2] = stack[0]['form']
        feat_vec[3] = stack[1]['form']
    # Queue front and second element.
    if not queue:
        feat_vec[4] = feat_vec[5] = feat_vec[6] = feat_vec[7] = 'nil'
    elif len(queue) < 2:
        feat_vec[4] = queue[0]['postag']
        feat_vec[6] = queue[0]['form']
        feat_vec[5] = feat_vec[7] = 'nil'
    else:
        feat_vec[4] = queue[0]['postag']
        feat_vec[5] = queue[1]['postag']
        feat_vec[6] = queue[0]['form']
        feat_vec[7] = queue[1]['form']
    feat_vec[8] = transition.can_reduce(stack, graph)
    feat_vec[9] = transition.can_leftarc(stack, graph)
    return dict(zip(feature_names, feat_vec))
def extract_2(stack, queue, graph, feature_names, sentence):
    """10-feature extractor: (form, postag) of the two topmost stack and
    queue items, then can-reduce and can-leftarc.

    :return: dict mapping feature_names to values
    """
    def pair(structure, pos):
        # (form, postag) of structure[pos], or nils when out of range.
        if len(structure) > pos:
            return [structure[pos]['form'], structure[pos]['postag']]
        return ['nil', 'nil']

    row = pair(stack, 0) + pair(stack, 1) + pair(queue, 0) + pair(queue, 1)
    row.append(transition.can_reduce(stack, graph))
    row.append(transition.can_leftarc(stack, graph))
    return dict(zip(feature_names, row))
def extract(stack, queue, graph, feature_names, sentence):
    """Return one feature dict (row) for the current configuration.

    Uses padded copies of the stack/queue so indices 0 and 1 always exist,
    and a padded sentence for the context of the stack top.
    """
    # X contains one dict for each word, with each feature as a key in the dict
    # x is a row in X
    x = list()
    # Pad with two dummy entries; the sentinel id '-1' marks a padded
    # stack slot (i.e. the stack was empty at that depth).
    p_stack = stack + [{'id': '-1', 'form': 'nil', 'postag': 'nil'}]*2
    p_queue = queue + [{'form': 'nil', 'postag': 'nil'}]*2
    # Padded sentence: a nil slot plus a BOS marker prepended, sentence[0]
    # dropped (presumably the root token -- TODO confirm), EOS appended.
    p_sentence = [{'form': 'nil', 'postag': 'nil'}, {'form': 'BOS', 'postag': 'BOS'}] + sentence[1:] + [{'form': 'EOS', 'postag': 'EOS'}]
    # 1st feature set
    x.append(transition.can_reduce(stack, graph))
    x.append(transition.can_leftarc(stack, graph))
    x.append(p_stack[0]['postag'])
    x.append(p_stack[0]['form'])
    x.append(p_queue[0]['postag'])
    x.append(p_queue[0]['form'])
    # 2nd feature set
    x.append(p_stack[1]['postag'])
    x.append(p_stack[1]['form'])
    x.append(p_queue[1]['postag'])
    x.append(p_queue[1]['form'])
    # 3rd feature set
    i = int(p_stack[0]['id'])
    if i == -1:
        # Empty stack (padded entry): no sentence context available.
        x += ['nil']*4
    else:
        # With the two prepended slots, p_sentence[i] is the word BEFORE
        # the stack top and p_sentence[i + 2] the word after it
        # (assumes the word with id k sits at sentence[k] -- TODO confirm).
        w_pre, w_next = p_sentence[i], p_sentence[i + 2]
        x.append(w_pre['postag'])
        x.append(w_pre['form'])
        x.append(w_next['postag'])
        x.append(w_next['form'])
    return dict(zip(feature_names, x))
def extract(stack, queue, graph, feature_names, sentence):
    """Return a 6-value feature LIST (not a dict): stack-top postag/form,
    queue-front postag/form, can-reduce, can-leftarc.

    Bug fix vs. the original: the bare `except:` around the stack access
    (which also hid KeyErrors) is replaced by an explicit emptiness check;
    the if/else boolean ladders are collapsed to bool().
    """
    features = []
    if stack:
        features.append(stack[0]['postag'])
        features.append(stack[0]['form'])
    else:
        features.extend(['nil', 'nil'])
    # queue[0] is read unguarded; assumes the caller only invokes this
    # while the queue is non-empty -- TODO confirm against the parse loop.
    features.append(queue[0]['postag'])
    features.append(queue[0]['form'])
    features.append(bool(transition.can_reduce(stack, graph)))
    features.append(bool(transition.can_leftarc(stack, graph)))
    return features
def generate_feature_vector2(stack, queue, graph):
    """Build the 10-feature dict: POS/word of the two topmost stack and
    queue items plus the can-re / can-la flags.

    :return: dict keyed by the hard-coded feature names
    """
    feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
                     'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
                     'can-re', 'can-la']

    def pick(structure, pos, key):
        # 'nil' when the structure is shorter than pos + 1.
        try:
            return structure[pos][key]
        except IndexError:
            return 'nil'

    # Queue front is read unguarded: guaranteed non-empty by the caller's
    # while loop (as the original noted).
    can_left_arc = transition.can_leftarc(stack, graph)
    can_reduce = transition.can_reduce(stack, graph)
    values = [
        pick(stack, 0, 'postag'),
        pick(stack, 1, 'postag'),
        pick(stack, 0, 'form'),
        pick(stack, 1, 'form'),
        queue[0]['postag'],
        pick(queue, 1, 'postag'),
        queue[0]['form'],
        pick(queue, 1, 'form'),
        can_reduce,
        can_left_arc,
    ]
    return dict(zip(feature_names, values))
def extract_features_sent(sentence, feature_names):
    """Run the oracle over one sentence and collect training pairs.

    At every parser configuration, records a 6-feature dict (stack-top
    cpostag/form, queue-front cpostag/form, can-reduce, can-leftarc) and
    the gold transition chosen by dparser.reference.

    :param sentence: list of CoNLL word dicts
    :param feature_names: keys for each feature row
    :return: X (list of feature dicts), y (list of transition symbols)
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    X = []
    y = []
    while queue:
        row = []
        if stack:
            row += [stack[0]['cpostag'], stack[0]['form']]
        else:
            row += ['nil', 'nil']
        if queue:
            row += [queue[0]['cpostag'], queue[0]['form']]
        else:
            row += ['nil', 'nil']
        row.append(transition.can_reduce(stack, graph))
        row.append(transition.can_leftarc(stack, graph))
        X.append(dict(zip(feature_names, row)))
        # The oracle supplies the gold transition for this configuration.
        stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        y.append(trans)
    return X, y
def parse_ml(stack, queue, graph, trans):
    """Apply the predicted transition when it is legal; otherwise shift.

    :param trans: predicted transition string, e.g. 'ra.nsubj' or 'sh'
    :return: (stack, queue, graph, executed-transition-code)
    """
    code = trans[:2]
    if stack:
        if code == 'ra' and transition.can_rightarc(stack):
            stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
            return stack, queue, graph, 'ra'
        if code == 'la' and transition.can_leftarc(stack, graph):
            stack, queue, graph = transition.left_arc(stack, queue, graph, trans[3:])
            return stack, queue, graph, 'la'
        if code == 're' and transition.can_reduce(stack, graph):
            stack, queue, graph = transition.reduce(stack, queue, graph)
            return stack, queue, graph, 're'
    # Fallback: shift is always possible while the queue is non-empty.
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'
def reference(stack, queue, graph):
    """Oracle parser step: derive the gold transition for the current
    configuration from the annotated heads and deprels.

    The checks are tried in priority order -- right-arc, left-arc,
    reduce, shift -- and each successful branch returns immediately.

    :param stack: the stack
    :param queue: the input list
    :param graph: the set of relations already parsed
    :return: (stack, queue, graph, transition-string) where the string is
        'sh', 're', or 'ra.deprel' / 'la.deprel'
    """
    # Right arc: the queue front's head is the stack top.
    if stack and stack[0]['id'] == queue[0]['head']:
        label = 'ra.' + queue[0]['deprel']   # capture before the arc pops
        stack, queue, graph = transition.right_arc(stack, queue, graph)
        return stack, queue, graph, label
    # Left arc: the stack top's head is the queue front.
    if stack and queue[0]['id'] == stack[0]['head']:
        label = 'la.' + stack[0]['deprel']
        stack, queue, graph = transition.left_arc(stack, queue, graph)
        return stack, queue, graph, label
    # Reduce: only when legal and some stack word still links to the front.
    if stack and transition.can_reduce(stack, graph):
        front = queue[0]
        for word in stack:
            if word['id'] == front['head'] or word['head'] == front['id']:
                stack, queue, graph = transition.reduce(stack, queue, graph)
                return stack, queue, graph, 're'
    # Shift: reached only when no other operation applied.
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'
def extract3(stack, queue, graph, feature_names, sentence):
    """14-feature extractor: the 10 features of extract2 plus the
    postag/form of the sentence word before the stack top (slots 10-11)
    and after it (slots 12-13).

    Bug fixes vs. the original:
    * empty queue: branches are now chained with `elif` so queue[0] is not
      read, and the 'nil' fills go in slots 4-7 instead of 5-8;
    * the next-word guard compared the id against len(sentence) instead of
      len(sentence) - 1, so it crashed with IndexError on the last word.
    """
    feat_vec = [''] * 14
    # Stack top and second element.
    if not stack:
        feat_vec[0] = feat_vec[1] = feat_vec[2] = feat_vec[3] = 'nil'
    elif len(stack) < 2:
        feat_vec[1] = feat_vec[3] = 'nil'
        feat_vec[0] = stack[0]['postag']
        feat_vec[2] = stack[0]['form']
    else:
        feat_vec[0] = stack[0]['postag']
        feat_vec[1] = stack[1]['postag']
        feat_vec[2] = stack[0]['form']
        feat_vec[3] = stack[1]['form']
    # Queue front and second element.
    if not queue:
        feat_vec[4] = feat_vec[5] = feat_vec[6] = feat_vec[7] = 'nil'
    elif len(queue) < 2:
        feat_vec[4] = queue[0]['postag']
        feat_vec[6] = queue[0]['form']
        feat_vec[5] = feat_vec[7] = 'nil'
    else:
        feat_vec[4] = queue[0]['postag']
        feat_vec[5] = queue[1]['postag']
        feat_vec[6] = queue[0]['form']
        feat_vec[7] = queue[1]['form']
    feat_vec[8] = transition.can_reduce(stack, graph)
    feat_vec[9] = transition.can_leftarc(stack, graph)
    # Sentence context of the stack top: before -> 10/11, after -> 12/13
    # (assumes the word with id k sits at sentence[k] -- TODO confirm).
    if not stack:
        feat_vec[10] = feat_vec[11] = feat_vec[12] = feat_vec[13] = 'nil'
    else:
        st_id = int(stack[0]['id'])
        if st_id == 0:
            feat_vec[10] = feat_vec[11] = 'nil'
        else:
            feat_vec[10] = sentence[st_id - 1]['postag']
            feat_vec[11] = sentence[st_id - 1]['form']
        if st_id + 1 >= len(sentence):
            feat_vec[12] = feat_vec[13] = 'nil'
        else:
            feat_vec[12] = sentence[st_id + 1]['postag']
            feat_vec[13] = sentence[st_id + 1]['form']
    return dict(zip(feature_names, feat_vec))
def extract(stack, queue, graph, feature_names, sentence):
    """Build three nested feature dicts: X1 (first 6 features), X2 (first
    10), X3 (all features, including two context pairs).

    Bug fixes vs. the original:
    * X1 and X2 were aliases of the growing list X, not snapshots -- the
      code only worked because dict(zip(...)) truncated; real list copies
      are taken now;
    * the scan for the word after the stack top matched on `form` with no
      break, so a repeated form appended the context twice, and a
      non-matching scan appended nothing at all; it now records exactly
      one (possibly nil) pair;
    * bare `except:` clauses are narrowed to the specific exceptions.
    """
    def _pair(structure, pos):
        # (postag, form) of structure[pos], or nils when out of range.
        if pos < len(structure):
            return [structure[pos]['postag'], structure[pos]['form']]
        return ['nil', 'nil']

    X = [transition.can_leftarc(stack, graph),
         transition.can_reduce(stack, graph)]
    X += _pair(stack, 0)
    X += _pair(stack, 1)
    X1 = X[:]                      # 6-feature snapshot
    X += _pair(queue, 0)
    X += _pair(queue, 1)
    X2 = X[:]                      # 10-feature snapshot
    # Word following the stack top in the sentence (matched on form).
    following = ['nil', 'nil']
    if stack:
        for i, word in enumerate(sentence):
            if word['form'] == stack[0]['form']:
                if i + 1 < len(sentence):
                    following = [sentence[i + 1]['postag'],
                                 sentence[i + 1]['form']]
                break
    X += following
    # Head word of the second stack element.
    try:
        head = sentence[int(stack[1]['head'])]
        X += [head['postag'], head['form']]
    except (IndexError, KeyError, ValueError):
        X += ['nil', 'nil']
    X1 = dict(zip(feature_names[:6], X1))
    X2 = dict(zip(feature_names[:10], X2))
    X3 = dict(zip(feature_names, X))
    return X1, X2, X3
def extract_features_sent(sentence, feature_names, classifier, dict_classes, vec):
    """Parse one sentence, choosing each transition with the classifier.

    Side effect: writes the predicted 'head' and 'deprel' into every word
    of `sentence`.

    :param classifier: trained model with a predict() method
    :param dict_classes: maps predicted class numbers to transition strings
    :param vec: vectorizer with a transform() method for feature dicts
    :return: the dependency graph
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    while queue:
        row = []
        if stack:
            row += [stack[0]['cpostag'], stack[0]['form']]
        else:
            row += ['nil', 'nil']
        if queue:
            row += [queue[0]['cpostag'], queue[0]['form']]
        else:
            row += ['nil', 'nil']
        row.append(transition.can_reduce(stack, graph))
        row.append(transition.can_leftarc(stack, graph))
        X = dict(zip(feature_names, row))
        # Predict the next transition, then apply it with legality checks.
        trans_nr = classifier.predict(vec.transform(X))
        print(trans_nr[0])
        trans = dict_classes[trans_nr[0]]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
    transition.empty_stack(stack, graph)
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return graph
def extract_mode_1(stack, queue, graph, feature_names, sentence=None):
    """Minimal (mode-1) extractor: postag/form of the stack top and the
    queue front, plus can-reduce and can-leftarc.

    :param feature_names: dict of name lists; the 'mode1' entry is used
    :return: dict mapping the mode-1 feature names to values
    """
    row = []
    for structure in (stack, queue):
        if structure:
            head = structure[0]
            row += [head.get('postag'), head.get('form')]
        else:
            row += ['nil', 'nil']
    row.append(transition.can_reduce(stack, graph))
    row.append(transition.can_leftarc(stack, graph))
    return dict(zip(feature_names.get('mode1'), row))
def extract(stack, queue, graph, feature_names, sentence):
    """10-feature extractor: postag then form of the two topmost stack and
    queue items, followed by can-reduce and can-leftarc.

    :return: dict mapping feature_names to values
    """
    feats = []
    # Order matters: for each structure, both postags then both forms.
    for structure in (stack, queue):
        for key in ('postag', 'form'):
            for pos in (0, 1):
                feats.append(structure[pos][key] if len(structure) > pos else 'nil')
    feats.append(transition.can_reduce(stack, graph))
    feats.append(transition.can_leftarc(stack, graph))
    return dict(zip(feature_names, feats))
def extract(stack, queue, state, feature_names, sentence):
    """Build a feature dict keyed directly by feature name: stack/queue
    top-two postag+form, can_reduce/can_leftarc (stringified), and the
    sentence words around the queue front.

    Bug fix vs. the original: the nextWord/prevWord lookups read queue[0]
    unconditionally, crashing with IndexError when the queue was empty
    even though the queue0 features had just been set to "nil".
    """
    features = {}
    features["can_reduce"] = str(transition.can_reduce(stack, state))
    features["can_leftarc"] = str(transition.can_leftarc(stack, state))
    if stack:
        features["stack0_postag"] = stack[0]["postag"]
        features["stack0_form"] = stack[0]["form"]
    else:
        features["stack0_postag"] = "nil"
        features["stack0_form"] = "nil"
    if len(stack) > 1:
        features["stack1_postag"] = stack[1]["postag"]
        features["stack1_form"] = stack[1]["form"]
    else:
        features["stack1_postag"] = "nil"
        features["stack1_form"] = "nil"
    if queue:
        features["queue0_postag"] = queue[0]["postag"]
        features["queue0_form"] = queue[0]["form"]
    else:
        features["queue0_postag"] = "nil"
        features["queue0_form"] = "nil"
    if len(queue) > 1:
        features["queue1_postag"] = queue[1]["postag"]
        features["queue1_form"] = queue[1]["form"]
    else:
        features["queue1_postag"] = "nil"
        features["queue1_form"] = "nil"
    # Sentence context around the queue front; assumes the word with id k
    # sits at sentence[k] -- TODO confirm against the caller.
    features["nextWord_form"] = "nil"
    features["nextWord_postag"] = "nil"
    features["prevWord_form"] = "nil"
    features["prevWord_postag"] = "nil"
    if queue:
        front_id = int(queue[0]["id"])
        if front_id < len(sentence) - 1:
            w = sentence[front_id + 1]
            features["nextWord_form"] = w['form']
            features["nextWord_postag"] = w['postag']
        if front_id > 0:
            w = sentence[front_id - 1]
            features["prevWord_form"] = w['form']
            features["prevWord_postag"] = w['postag']
    return features
def parse_ml(stack, queue, graph, trans):
    """Execute the predicted transition when legal; otherwise shift.

    :param trans: predicted transition string ('ra.X', 'la.X', 're', 'sh')
    :return: (stack, queue, graph, executed-transition-code)
    """
    action = trans[:2]
    if stack and action == 'ra' and transition.can_rightarc(stack):
        stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'ra'
    if stack and action == 'la' and transition.can_leftarc(stack, graph):
        stack, queue, graph = transition.left_arc(stack, queue, graph, trans[3:])
        return stack, queue, graph, 'la'
    if stack and action == 're' and transition.can_reduce(stack, graph):
        stack, queue, graph = transition.reduce(stack, queue, graph)
        return stack, queue, graph, 're'
    # Fall through to shift when the prediction is illegal or 'sh'.
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'
def extract(stack, queue, graph, feature_names, sentence):
    """Extractor whose width follows len(feature_names): 6 (set 1),
    10 (sets 1-2) or 14 (sets 1-3, adding the sentence words after the
    stack top and after the queue front).

    :return: dict mapping feature_names to values
    """
    width = len(feature_names)
    feats = ['nil'] * 6
    # Feature set 1: stack top, queue front, can-la, can-re.
    if stack:
        feats[0], feats[1] = stack[0]['form'], stack[0]['postag']
    if queue:
        feats[2], feats[3] = queue[0]['form'], queue[0]['postag']
    feats[4] = transition.can_leftarc(stack, graph)
    feats[5] = transition.can_reduce(stack, graph)
    # Feature set 2: second stack and queue elements.
    if width in (10, 14):
        feats += ['nil'] * 4
        if len(stack) > 1:
            feats[6], feats[7] = stack[1]['form'], stack[1]['postag']
        if len(queue) > 1:
            feats[8], feats[9] = queue[1]['form'], queue[1]['postag']
    # Feature set 3: following words in the sentence.
    if width == 14:
        feats += ['nil'] * 4
        if stack and len(sentence) > int(stack[0]['id']) + 1:
            nxt = sentence[int(stack[0]['id']) + 1]
            feats[10], feats[11] = nxt['form'], nxt['postag']
        if queue and len(sentence) > int(queue[0]['id']) + 1:
            nxt = sentence[int(queue[0]['id']) + 1]
            feats[12], feats[13] = nxt['form'], nxt['postag']
    return dict(zip(feature_names, feats))
def parse_ml(stack, queue, graph, trans):
    """Apply the predicted transition if it is possible in the current
    configuration; otherwise shift.

    :param trans: predicted transition string ('ra.X', 'la.X', 're', 'sh')
    :return: (stack, queue, graph, executed-transition-code)
    """
    if stack:
        code = trans[:2]
        # Right arc
        if code == 'ra' and transition.can_rightarc(stack):
            stack, queue, graph = transition.right_arc(stack, queue, graph, trans[3:])
            return stack, queue, graph, 'ra'
        # Left arc
        if code == 'la' and transition.can_leftarc(stack, graph):
            stack, queue, graph = transition.left_arc(stack, queue, graph, trans[3:])
            return stack, queue, graph, 'la'
        # Reduce
        if code == 're' and transition.can_reduce(stack, graph):
            stack, queue, graph = transition.reduce(stack, queue, graph)
            return stack, queue, graph, 're'
    # Shift, either as the predicted action or as the fallback (the
    # original's two shift branches were identical, so they are merged).
    stack, queue, graph = transition.shift(stack, queue, graph)
    return stack, queue, graph, 'sh'
def generate_feature_vector3(stack, queue, graph, sentence):
    """Build the extended 14-feature dict: the 10 base features plus the
    sentence word after the stack top (form and POS), the POS of queue[3],
    and the POS of the word before the stack top.

    Bug fix vs. the original: when the stack top had id 0, the previous
    word lookup used sentence[-1], silently wrapping around to the LAST
    word of the sentence instead of yielding 'nil'; bounds are now checked
    explicitly instead of relying on IndexError.
    """
    feature_names = ['stack0_POS', 'stack1_POS', 'stack0_word', 'stack1_word',
                     'queue0_POS', 'queue1_POS', 'queue0_word', 'queue1_word',
                     'can-re', 'can-la', 'following_word', 'following_word_POS',
                     'queue3_POS', 'stack0_previous_word_POS']

    def pick(structure, pos, key):
        # 'nil' when the structure is shorter than pos + 1.
        try:
            return structure[pos][key]
        except IndexError:
            return 'nil'

    stack0_POS = pick(stack, 0, 'postag')
    stack1_POS = pick(stack, 1, 'postag')
    stack0_word = pick(stack, 0, 'form')
    stack1_word = pick(stack, 1, 'form')
    # Queue front guaranteed to exist because of the caller's while loop.
    queue0_POS = queue[0]['postag']
    queue0_word = queue[0]['form']
    queue1_POS = pick(queue, 1, 'postag')
    queue1_word = pick(queue, 1, 'form')
    queue3_POS = pick(queue, 3, 'postag')
    # Sentence context of the stack top (assumes the word with id k sits
    # at sentence[k] -- TODO confirm).
    stack0_previous_word_POS = 'nil'
    following_word_POS = 'nil'
    following_word = 'nil'
    if stack:
        idx = int(stack[0]['id'])
        if 0 < idx < len(sentence) + 1 and idx - 1 < len(sentence):
            stack0_previous_word_POS = sentence[idx - 1]['postag']
        if idx + 1 < len(sentence):
            following_word_POS = sentence[idx + 1]['postag']
            following_word = sentence[idx + 1]['form']
    can_left_arc = transition.can_leftarc(stack, graph)
    can_reduce = transition.can_reduce(stack, graph)
    return dict(zip(feature_names,
                    [stack0_POS, stack1_POS, stack0_word, stack1_word,
                     queue0_POS, queue1_POS, queue0_word, queue1_word,
                     can_reduce, can_left_arc, following_word,
                     following_word_POS, queue3_POS,
                     stack0_previous_word_POS]))
def extract(stack, queue, graph, feature_names, sentence):
    """Build a feature dict containing exactly the names requested in
    feature_names (stack/queue top-two POS+word, can-re/can-la, and the
    sentence words around the queue front).

    Bug fix vs. the original: prev_word and prev_word_POS reused the
    next-word guard (`id < len(sentence) - 1`), so for the word with id 0
    the index -1 wrapped around to the LAST sentence word; both now check
    `id > 0` instead.
    """
    features = {}
    for fn in feature_names:
        if fn == 'stack0_POS':
            features["stack0_POS"] = stack[0]["postag"] if stack else "nil"
        if fn == 'stack1_POS':
            features["stack1_POS"] = stack[1]["postag"] if len(stack) > 1 else "nil"
        if fn == 'stack0_word':
            features["stack0_word"] = stack[0]["form"] if stack else "nil"
        if fn == 'stack1_word':
            features["stack1_word"] = stack[1]["form"] if len(stack) > 1 else "nil"
        if fn == 'queue0_POS':
            features["queue0_POS"] = queue[0]["postag"] if queue else "nil"
        if fn == 'queue1_POS':
            features["queue1_POS"] = queue[1]["postag"] if len(queue) > 1 else "nil"
        if fn == 'queue0_word':
            features["queue0_word"] = queue[0]["form"] if queue else "nil"
        if fn == 'queue1_word':
            features["queue1_word"] = queue[1]["form"] if len(queue) > 1 else "nil"
        if fn == 'can-re':
            features["can-re"] = str(transition.can_reduce(stack, graph))
        if fn == 'can-la':
            features["can-la"] = str(transition.can_leftarc(stack, graph))
        # Sentence context around the queue front; assumes the word with
        # id k sits at sentence[k] -- TODO confirm against the caller.
        if fn == 'next_word_POS':
            features["next_word_POS"] = "nil"
            if int(queue[0]["id"]) < len(sentence) - 1:
                w = sentence[int(queue[0]["id"]) + 1]
                features["next_word_POS"] = w['postag']
        if fn == 'next_word':
            features["next_word"] = "nil"
            if int(queue[0]["id"]) < len(sentence) - 1:
                w = sentence[int(queue[0]["id"]) + 1]
                features["next_word"] = w['form']
        if fn == 'prev_word_POS':
            features["prev_word_POS"] = "nil"
            if int(queue[0]["id"]) > 0:
                w = sentence[int(queue[0]["id"]) - 1]
                features["prev_word_POS"] = w['postag']
        if fn == 'prev_word':
            features["prev_word"] = "nil"
            if int(queue[0]["id"]) > 0:
                w = sentence[int(queue[0]["id"]) - 1]
                features["prev_word"] = w['form']
    return features
def extract(stack, queue, graph, feature_names, sentence):
    """Return one feature row (a dict) whose contents depend on
    len(feature_names): 6, 10 or 13 features.

    The 13-feature variant also includes the sentence word right after
    the stack top and the can-rightarc flag.
    """
    NIL = 'nil'

    def word_and_pos(structure, pos):
        # (form, postag) at structure[pos], or nils when out of range.
        if len(structure) > pos:
            return structure[pos]['form'], structure[pos]['postag']
        return NIL, NIL

    s0_word, s0_pos = word_and_pos(stack, 0)
    s1_word, s1_pos = word_and_pos(stack, 1)
    q0_word, q0_pos = word_and_pos(queue, 0)
    q1_word, q1_pos = word_and_pos(queue, 1)

    n = len(feature_names)
    row = []
    if n == 6:
        row = [s0_word, s0_pos, q0_word, q0_pos,
               transition.can_reduce(stack, graph),
               transition.can_leftarc(stack, graph)]
    elif n == 10:
        row = [s0_word, s0_pos, s1_word, s1_pos,
               q0_word, q0_pos, q1_word, q1_pos,
               transition.can_reduce(stack, graph),
               transition.can_leftarc(stack, graph)]
    elif n == 13:
        # Word after the top of the stack in the sentence.
        after_word, after_pos = NIL, NIL
        if s0_word != NIL:
            top_id = int(stack[0]['id'])
            if len(sentence) - 1 != top_id:     # stack top is not last word
                nxt = sentence[top_id + 1]
                after_word, after_pos = nxt['form'], nxt['postag']
        row = [s0_word, s0_pos, s1_word, s1_pos,
               q0_word, q0_pos, q1_word, q1_pos,
               after_word, after_pos,
               transition.can_reduce(stack, graph),
               transition.can_leftarc(stack, graph),
               transition.can_rightarc(stack)]
    return dict(zip(feature_names, row))
def extract_features_sent(sentence, feature_names):
    """Run the oracle over one sentence with a 10-feature template.

    At each configuration records cpostag/form of the top-two stack and
    queue items plus can-reduce/can-leftarc, along with the gold
    transition chosen by dparser.reference.

    :param sentence: list of CoNLL word dicts
    :param feature_names: keys for each feature row
    :return: X (list of feature dicts), y (list of transition symbols)
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    X = []
    y = []

    def value(structure, pos, key):
        # 'nil' when the structure is shorter than pos + 1.
        return structure[pos][key] if len(structure) > pos else 'nil'

    while queue:
        row = [value(stack, 0, 'cpostag'), value(stack, 1, 'cpostag'),
               value(stack, 0, 'form'), value(stack, 1, 'form'),
               value(queue, 0, 'cpostag'), value(queue, 1, 'cpostag'),
               value(queue, 0, 'form'), value(queue, 1, 'form'),
               transition.can_reduce(stack, graph),
               transition.can_leftarc(stack, graph)]
        X.append(dict(zip(feature_names, row)))
        stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        y.append(trans)
    return X, y
def extract_3(stack, queue, graph, feature_names, sentence):
    """14-feature extractor: (form, postag) pairs for the two topmost
    stack and queue items, the sentence words just before and after the
    stack top, then can-reduce and can-leftarc.

    :return: dict mapping feature_names to values
    """
    def pair(structure, pos):
        # (form, postag) of structure[pos], or nils when out of range.
        if len(structure) > pos:
            return [structure[pos]['form'], structure[pos]['postag']]
        return ['nil', 'nil']

    feats = pair(stack, 0) + pair(stack, 1) + pair(queue, 0) + pair(queue, 1)

    # Sentence context of the stack top.
    prev_feats = ['nil', 'nil']
    next_feats = ['nil', 'nil']
    if stack:
        top_id = int(stack[0]['id'])
        if top_id > 0:
            prev = sentence[top_id - 1]
            prev_feats = [prev['form'], prev['postag']]
        if top_id < len(sentence) - 1:
            nxt = sentence[top_id + 1]
            next_feats = [nxt['form'], nxt['postag']]
    feats += prev_feats + next_feats

    feats.append(transition.can_reduce(stack, graph))
    feats.append(transition.can_leftarc(stack, graph))
    return dict(zip(feature_names, feats))
def extract_features_sent(sentence, feature_names, classifier, dict_classes, vec):
    """Parse one sentence with the classifier, using a 10-feature template.

    Side effect: writes the predicted 'head' and 'deprel' into every word
    of `sentence`.

    :param classifier: trained model with a predict() method
    :param dict_classes: maps predicted class numbers to transition strings
    :param vec: vectorizer with a transform() method for feature dicts
    :return: the feature dict of the final parser configuration
    """
    stack = []
    queue = list(sentence)
    graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
    X = []

    def value(structure, pos, key):
        # 'nil' when the structure is shorter than pos + 1.
        return structure[pos][key] if len(structure) > pos else 'nil'

    while queue:
        row = [value(stack, 0, 'cpostag'), value(stack, 1, 'cpostag'),
               value(stack, 0, 'form'), value(stack, 1, 'form'),
               value(queue, 0, 'cpostag'), value(queue, 1, 'cpostag'),
               value(queue, 0, 'form'), value(queue, 1, 'form'),
               transition.can_reduce(stack, graph),
               transition.can_leftarc(stack, graph)]
        X = dict(zip(feature_names, row))
        # Predict the next transition and apply it with legality checks.
        trans = dict_classes[classifier.predict(vec.transform(X))[0]]
        stack, queue, graph, trans = parse_ml(stack, queue, graph, trans)
    transition.empty_stack(stack, graph)
    for word in sentence:
        word['head'] = graph['heads'][word['id']]
        word['deprel'] = graph['deprels'][word['id']]
    return X
def extract(stack, queue, graph, feature_names, sentence):
    """
    Build the feature dictionary for the current parser state.

    Two feature sets are supported, selected by len(feature_names):
    6  -> top-of-stack and front-of-queue word + POS, plus the two
          action features (can_reduce, can_leftarc)
    10 -> the two topmost stack and queue items (word + POS), plus the
          two action features
    Any other length yields an empty dictionary.

    :return: dict mapping feature names to values ('nil' when absent)
    """
    NIL = "nil"

    def word_pos(seq, idx):
        # (form, postag) of seq[idx], or nils when that position is empty.
        if len(seq) > idx:
            return seq[idx]["form"], seq[idx]["postag"]
        return NIL, NIL

    s0_word, s0_pos = word_pos(stack, 0)
    s1_word, s1_pos = word_pos(stack, 1)
    q0_word, q0_pos = word_pos(queue, 0)
    q1_word, q1_pos = word_pos(queue, 1)

    if len(feature_names) == 6:
        values = [s0_word, s0_pos, q0_word, q0_pos,
                  transition.can_reduce(stack, graph),
                  transition.can_leftarc(stack, graph)]
    elif len(feature_names) == 10:
        values = [s0_word, s0_pos, s1_word, s1_pos,
                  q0_word, q0_pos, q1_word, q1_pos,
                  transition.can_reduce(stack, graph),
                  transition.can_leftarc(stack, graph)]
    else:
        values = []

    return dict(zip(feature_names, values))
def _stack0_neighbor(stack, sentence, offset):
    """Return (postag, form) of the sentence word at stack[0]'s position
    plus *offset*, or ('nil', 'nil') when the stack is empty, stack[0] is
    not found in the sentence, or the neighbor is out of range."""
    if not stack:
        return 'nil', 'nil'
    top_id = stack[0]['id']
    for pos, word in enumerate(sentence):
        if word['id'] == top_id:
            neighbor = pos + offset
            # Guard against Python's negative indexing: sentence[-1] is
            # the LAST word, not the one before the first.
            if 0 <= neighbor < len(sentence):
                return sentence[neighbor]['postag'], sentence[neighbor]['form']
            return 'nil', 'nil'
    return 'nil', 'nil'


def extract(stack, queue, graph, feature_names, sentence):
    """
    Extract a 14-value feature list for the current parser state:
    stack[0], stack[1], queue[0], queue[1] (postag then form each),
    the word following stack[0] in the sentence, the word preceding it,
    and the two action features (can_reduce, can_leftarc).
    'nil' fills every position that does not exist, so the list always
    lines up with feature_names.

    :return: list of feature values
    """
    NIL = 'nil'
    features = []

    # stack[0] and stack[1]: postag then form ('nil' when absent).
    # Fixes the original queue handling, where a missing queue[1] caused
    # queue[0]'s features to be appended twice (misaligned vector) and an
    # empty queue raised IndexError from inside the except clause.
    for seq in (stack, queue):
        for i in (0, 1):
            if len(seq) > i:
                features.append(seq[i]['postag'])
                features.append(seq[i]['form'])
            else:
                features.append(NIL)
                features.append(NIL)

    # Word following stack[0] in the sentence.
    features.extend(_stack0_neighbor(stack, sentence, +1))
    # Word preceding stack[0] in the sentence.
    features.extend(_stack0_neighbor(stack, sentence, -1))

    # Parser-state features: which actions are currently legal.
    features.append(transition.can_reduce(stack, graph))
    features.append(transition.can_leftarc(stack, graph))
    return features
def extract(stack, queue, graph, feature_names, sentence):
    """
    Build the feature dictionary for the current parser state.

    Three feature sets are supported, selected by len(feature_names):
    6  -> stack[0]/queue[0] word + POS, plus the two action features
    10 -> stack[0..1]/queue[0..1] word + POS, plus the action features
    14 -> the 10-feature set plus the word/POS following and preceding
          stack[0] in the sentence
    Any other length yields an empty dictionary.

    :return: dict mapping feature names to values ('nil' when absent)
    """
    features = list()
    POS_TAG = "postag"
    WORD_TAG = "form"
    NULL_VALUE = "nil"

    if stack:
        stack_0_pos = stack[0][POS_TAG]
        stack_0_word = stack[0][WORD_TAG]
    else:
        stack_0_pos = NULL_VALUE
        stack_0_word = NULL_VALUE
    if len(stack) > 1:
        stack_1_pos = stack[1][POS_TAG]
        stack_1_word = stack[1][WORD_TAG]
    else:
        stack_1_pos = NULL_VALUE
        stack_1_word = NULL_VALUE
    if queue:
        queue_0_pos = queue[0][POS_TAG]
        queue_0_word = queue[0][WORD_TAG]
    else:
        queue_0_pos = NULL_VALUE
        queue_0_word = NULL_VALUE
    if len(queue) > 1:
        queue_1_pos = queue[1][POS_TAG]
        queue_1_word = queue[1][WORD_TAG]
    else:
        queue_1_pos = NULL_VALUE
        queue_1_word = NULL_VALUE

    if len(feature_names) == 6:
        features.append(stack_0_word)
        features.append(stack_0_pos)
        features.append(queue_0_word)
        features.append(queue_0_pos)
        features.append(transition.can_reduce(stack, graph))
        features.append(transition.can_leftarc(stack, graph))
    elif len(feature_names) == 10:
        features.append(stack_0_word)
        features.append(stack_0_pos)
        features.append(stack_1_word)
        features.append(stack_1_pos)
        features.append(queue_0_word)
        features.append(queue_0_pos)
        features.append(queue_1_word)
        features.append(queue_1_pos)
        features.append(transition.can_reduce(stack, graph))
        features.append(transition.can_leftarc(stack, graph))
    elif len(feature_names) == 14:
        if stack_0_word == NULL_VALUE:
            # Empty stack: no sentence context around stack[0].
            after_stack_0_word = NULL_VALUE
            after_stack_0_pos = NULL_VALUE
            before_stack_0_word = NULL_VALUE
            before_stack_0_pos = NULL_VALUE
        else:
            # NOTE(review): assumes sentence[id] is the word whose 'id' is
            # id (the forward lookup below and the last-word guard both
            # rely on it) -- confirm against the sentence builder.
            id_stack_0 = int(stack[0]['id'])
            # Word following stack[0] ('nil' when stack[0] is last).
            if id_stack_0 == len(sentence) - 1:
                after_stack_0_word = NULL_VALUE
                after_stack_0_pos = NULL_VALUE
            else:
                next_word = sentence[id_stack_0 + 1]
                after_stack_0_word = next_word[WORD_TAG]
                after_stack_0_pos = next_word[POS_TAG]
            # Word preceding stack[0] ('nil' when stack[0] is first).
            if id_stack_0 == 0:
                before_stack_0_word = NULL_VALUE
                before_stack_0_pos = NULL_VALUE
            else:
                # BUG FIX: was sentence[id_stack_0], which is stack[0]
                # itself; the preceding word is at index id_stack_0 - 1.
                previous_word = sentence[id_stack_0 - 1]
                before_stack_0_word = previous_word[WORD_TAG]
                before_stack_0_pos = previous_word[POS_TAG]
        features.append(stack_0_word)
        features.append(stack_0_pos)
        features.append(stack_1_word)
        features.append(stack_1_pos)
        features.append(queue_0_word)
        features.append(queue_0_pos)
        features.append(queue_1_word)
        features.append(queue_1_pos)
        features.append(after_stack_0_word)
        features.append(after_stack_0_pos)
        features.append(before_stack_0_word)
        features.append(before_stack_0_pos)
        features.append(transition.can_reduce(stack, graph))
        features.append(transition.can_leftarc(stack, graph))
    features = dict(zip(feature_names, features))
    return features
def extract2(stack, queue, state, feature_names, sentence):
    """
    Extract an extended feature dictionary for the current parser state:
    the two topmost stack and queue items (form + postag), the two action
    features (can_rightarc, can_reduce), the word following stack[0] in
    the sentence, and the forms of stack[0]'s nearest left and right
    siblings (words already attached in state['heads'] to the same head).

    :param state: partial parse; state['heads'] maps word id -> head id
    :return: dict mapping feature_names to the extracted values
    """
    # Nearest siblings of stack[0]: among already-attached words sharing
    # stack[0]'s head, the closest one to its left and to its right.
    tmpsiblings = ['nil', 'nil']
    left = 0
    right = 1000  # sentinel larger than any sentence position
    if len(stack) > 0 and len(state) > 0:
        for key, value in state['heads'].items():
            if (int(key) < int(stack[0]['id'])
                    and int(value) == int(stack[0]['head'])):
                if left < int(key):
                    left = int(key)
            elif (int(key) > int(stack[0]['id'])
                    and int(value) == int(stack[0]['head'])):
                if right > int(key):
                    right = int(key)
    if right < 1000:
        tmpsiblings[1] = sentence[right]['form']
    if left > 0:
        tmpsiblings[0] = sentence[left]['form']

    x = []
    tmpListStack = ['nil', 'nil', 'nil', 'nil']
    tmpListQueue = ['nil', 'nil', 'nil', 'nil']
    tmpNextWord = ['nil', 'nil']
    # Word following stack[0] in the sentence (postag, form).
    try:
        # BUG FIX: both values were assigned to tmpNextWord[1], so the
        # postag was overwritten and slot 0 always stayed 'nil'.
        tmpNextWord[0] = sentence[int(stack[0]['id']) + 1]['postag']
        tmpNextWord[1] = sentence[int(stack[0]['id']) + 1]['form']
    except (IndexError, KeyError):
        pass
    # stack[0]/stack[1]: form then postag; 'nil' when absent.
    try:
        tmpListStack[0] = stack[0]['form']
        tmpListStack[1] = stack[0]['postag']
        tmpListStack[2] = stack[1]['form']
        tmpListStack[3] = stack[1]['postag']
    except (IndexError, KeyError):
        pass
    # queue[0]/queue[1]: form then postag; 'nil' when absent.
    try:
        tmpListQueue[0] = queue[0]['form']
        tmpListQueue[1] = queue[0]['postag']
        tmpListQueue[2] = queue[1]['form']
        tmpListQueue[3] = queue[1]['postag']
    except (IndexError, KeyError):
        pass

    x.extend(tmpListStack)
    x.extend(tmpListQueue)
    x.append(transition.can_rightarc(stack))
    x.append(transition.can_reduce(stack, state))
    x.extend(tmpNextWord)
    x.extend(tmpsiblings)
    return dict(zip(feature_names, x))