# Module-level setup assumed by the functions below (not shown in the original
# snippets): the parsing scaffolding modules dparser and transition, and a
# shared DictVectorizer instance named `vec`.
import os
import pickle

from sklearn import linear_model, metrics
from sklearn.feature_extraction import DictVectorizer

import dparser
import transition

vec = DictVectorizer(sparse=True)


def train_the_model(split_training_sentences, feature_names):
    print("Extracting the features...")
    X_matrix = []  # The matrix of feature vectors
    Y_vec = []  # The vector of desired/known-good outputs
    # Iterate over all sentences in the corpus
    for sentence in split_training_sentences:
        stack = []  # Stack of tokens manipulated with shift, reduce, left-arc, and right-arc
        queue = list(sentence)  # Queue of input tokens to be cycled through
        graph = {}  # The dependency graph in which we store the dependency arcs
        graph['heads'] = {}  # Dictionary of heads for each sentence
        graph['heads']['0'] = '0'  # The root token is its own head
        graph['deprels'] = {}  # Dictionary of dependency relations
        graph['deprels']['0'] = 'ROOT'  # The root token carries the 'ROOT' relation
        while queue:  # While there are still tokens in the input queue
            x_elem = extract(stack, queue, graph, feature_names, sentence)
            stack, queue, graph, y_elem = dparser.reference(stack, queue, graph)
            X_matrix.extend(x_elem)
            Y_vec.append(y_elem)
    # for i in range(0, len(Y_vec)):
    #     print('x =', X_matrix[i], '\ty =', Y_vec[i])
    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    X = vec.fit_transform(X_matrix)
    if os.path.isfile('./ml_classifier.pickle') and os.path.isfile('./ml_model.pickle'):
        # Reuse the previously trained model if it has been pickled
        with open('ml_classifier.pickle', 'rb') as classifier_file, \
                open('ml_model.pickle', 'rb') as model_file:
            classifier = pickle.load(classifier_file)
            model = pickle.load(model_file)
    else:
        print("Training the model...")
        classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
        model = classifier.fit(X, Y_vec)
        # Save the trained objects so the next run can skip training
        with open('ml_classifier.pickle', 'wb') as classifier_file, \
                open('ml_model.pickle', 'wb') as model_file:
            pickle.dump(classifier, classifier_file)
            pickle.dump(model, model_file)
    Y_classifier_predicted = classifier.predict(X)
    print("Classification report for classifier %s:\n%s\n"
          % (classifier, metrics.classification_report(Y_vec, Y_classifier_predicted)))
    return classifier, model
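
# Why a single shared DictVectorizer matters: fit_transform() learns the
# feature-to-column mapping on the training features, and transform() must
# reuse that same mapping at prediction time (see perform_prediction below).
# A self-contained illustration with made-up feature dictionaries:
def _demo_shared_vectorizer():
    demo_vec = DictVectorizer(sparse=True)
    train_feats = [{'stack0_POS': 'NN', 'can-re': False},
                   {'stack0_POS': 'VB', 'can-re': True}]
    test_feats = [{'stack0_POS': 'NN', 'can-re': True}]
    X_train = demo_vec.fit_transform(train_feats)  # learns the columns
    X_test = demo_vec.transform(test_feats)        # reuses the same columns
    assert X_train.shape[1] == X_test.shape[1]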
def extract_features_sent(sentence, feature_names): """ Extract the features from one sentence returns X and y, where X is a list of dictionaries and y is a list of symbols :param sentence: :param w_size: :return: """ #sentence = sentence.splitlines() stack = [] graph = {} queue = list(sentence) graph['heads'] = {} graph['heads']['0'] = '0' graph['deprels'] = {} graph['deprels']['0'] = 'ROOT' transitions = [] x = list() X = list() y = list() while queue: if (len(stack) > 0): x.append(stack[0]['cpostag']) x.append(stack[0]['form']) else: x.append('nil') x.append('nil') if (queue): x.append(queue[0]['cpostag']) x.append(queue[0]['form']) else: x.append('nil') x.append('nil') x.append(transition.can_reduce(stack, graph)) x.append(transition.can_leftarc(stack, graph)) X.append(dict(zip(feature_names, x))) #remove reference, predict what action should be done(equiv to trans) stack, queue, graph, trans = dparser.reference(stack, queue, graph) y.append(trans) x = list() #stack, graph = transition.empty_stack(stack, graph) #for word in queue: #print(word['form']) #stack, queue, graph, trans = reference(stack, queue, graph) #transitions.append(trans) # stack, graph = transition.empty_stack(stack, graph) return X, y
def train_the_model(split_training_sentences, feature_names, vec, i):
    print("Extracting the features...")
    X_matrix = []  # The matrix of feature vectors
    Y_vec = []  # The vector of desired/known-good outputs
    # Iterate over all sentences in the corpus
    for sentence in split_training_sentences:
        stack = []  # Stack of tokens manipulated with shift, reduce, left-arc, and right-arc
        queue = list(sentence)  # Queue of input tokens to be cycled through
        graph = {}  # The dependency graph in which we store the dependency arcs
        graph['heads'] = {}  # Dictionary of heads for each sentence
        graph['heads']['0'] = '0'  # The root token is its own head
        graph['deprels'] = {}  # Dictionary of dependency relations
        graph['deprels']['0'] = 'ROOT'  # The root token carries the 'ROOT' relation
        while queue:  # While there are still tokens in the input queue
            x_elem = extract(stack, queue, graph, feature_names, sentence, i)
            stack, queue, graph, y_elem = dparser.reference(stack, queue, graph)
            X_matrix.extend(x_elem)
            Y_vec.append(y_elem)
    print("Encoding the features...")
    # Vectorize the feature matrix and carry out a one-hot encoding
    X = vec.fit_transform(X_matrix)

    # print("Training the model...")
    # this_classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
    # this_model = this_classifier.fit(X, Y_vec)
    # TODO: Uncomment the lines above and comment out the lines below to retrain
    classifier_filename = 'ml_classifier' + str(i) + '.pickle'
    model_filename = 'ml_model' + str(i) + '.pickle'
    with open(classifier_filename, 'rb') as a_classifier, \
            open(model_filename, 'rb') as a_model:
        this_classifier = pickle.load(a_classifier)
        this_model = pickle.load(a_model)

    # Y_classifier_predicted = this_classifier.predict(X)
    # print("Classification report for classifier %s:\n%s\n"
    #       % (this_classifier, metrics.classification_report(Y_vec, Y_classifier_predicted)))
    return this_classifier, this_model
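
# The retraining branch above is commented out; a sketch of the matching save
# step for feature set i, mirroring the pickling in the first train_the_model()
# variant (the helper name save_model is hypothetical):
def save_model(this_classifier, this_model, i):
    with open('ml_classifier' + str(i) + '.pickle', 'wb') as classifier_file:
        pickle.dump(this_classifier, classifier_file)
    with open('ml_model' + str(i) + '.pickle', 'wb') as model_file:
        pickle.dump(this_model, model_file)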
def extract(stack, queue, graph, feature_names, sentence, samples, special=False):
    X = []
    y = []
    while queue:
        x = []
        structures = [stack, queue]
        elements = ['postag', 'form']
        # POS tag and form of the first `samples` tokens on the stack and in the queue
        for structure in structures:
            for element in elements:
                for i in range(samples):
                    if len(structure) > i:
                        x.append(structure[i][element])
                    else:
                        x.append('nil')
        if special:
            # The words immediately before and after the top of the stack
            for element in elements:
                for i in [-1, 1]:
                    if len(stack) > 0:
                        index = int(stack[0]['id']) + i
                        if 0 <= index < len(sentence):
                            x.append(sentence[index][element])
                        else:
                            x.append('nil')
                    else:
                        x.append('nil')
        # Whether reduce and left-arc are valid in the current configuration
        x.append(transition.can_reduce(stack, graph))
        x.append(transition.can_leftarc(stack, graph))
        X.append(dict(zip(feature_names, x)))
        # Use the oracle to obtain the gold-standard transition
        stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        y.append(trans)
    return X, y
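
# The dictionaries produced by extract() rely on feature_names lining up with
# the order in which values are appended above; a sketch of a matching
# generator (the helper name build_feature_names is hypothetical):
def build_feature_names(samples, special=False):
    names = []
    for structure in ['stack', 'queue']:
        for element in ['POS', 'word']:
            for i in range(samples):
                names.append('%s%d_%s' % (structure, i, element))
    if special:
        # Context of the token on top of the stack: one word before, one after
        for element in ['POS', 'word']:
            for position in ['before', 'after']:
                names.append('%s_top_%s' % (position, element))
    names.append('can-re')
    names.append('can-la')
    return names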
def perform_prediction(split_test_sentences, feature_names, classifier, model):
    print("Extracting features from the test corpus")
    X_matrix_test = []  # The matrix of feature vectors
    Y_vec_test = []  # The vector of gold-standard outputs
    # Iterate over all sentences in the corpus
    for sentence in split_test_sentences:
        stack = []  # Stack of tokens manipulated with shift, reduce, left-arc, and right-arc
        queue = list(sentence)  # Queue of input tokens to be cycled through
        graph = {}  # The dependency graph in which we store the dependency arcs
        graph['heads'] = {}  # Dictionary of heads for each sentence
        graph['heads']['0'] = '0'  # The root token is its own head
        graph['deprels'] = {}  # Dictionary of dependency relations
        graph['deprels']['0'] = 'ROOT'  # The root token carries the 'ROOT' relation
        while queue:
            x_elem = extract(stack, queue, graph, feature_names, sentence)
            X_matrix_test.extend(x_elem)
            stack, queue, graph, y_elem = dparser.reference(stack, queue, graph)
            # Y_vec_test holds the gold-standard transitions; it is only used
            # for the classification report below, not for prediction
            Y_vec_test.append(y_elem)
    # Vectorize the feature matrix with the encoding learned on the training set
    print("Encoding the test features")
    X_test = vec.transform(X_matrix_test)
    print("Predicting the arc actions to take")
    y_test_predicted = classifier.predict(X_test)
    print("Classification report for classifier %s:\n%s\n"
          % (classifier, metrics.classification_report(Y_vec_test, y_test_predicted)))
    print(y_test_predicted)
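
# A minimal end-to-end sketch; the CoNLL reading/splitting step that produces
# `split_training_sentences` and `split_test_sentences` is assumed and not
# shown in these snippets:
# feature_names = ['stack0_POS', 'stack0_word', 'queue0_POS', 'queue0_word',
#                  'can-re', 'can-la']
# classifier, model = train_the_model(split_training_sentences, feature_names)
# perform_prediction(split_test_sentences, feature_names, classifier, model)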
def extract_features(formatted_corpus, mode):
    X = []
    Y = []
    for sentence in formatted_corpus:
        stack = []
        queue = list(sentence)
        graph = {}
        graph['heads'] = {}
        graph['heads']['0'] = '0'
        graph['deprels'] = {}
        graph['deprels']['0'] = 'ROOT'
        while queue:
            # Select the feature set according to the requested mode
            if mode == 3:
                features = generate_feature_vector3(stack, queue, graph, sentence)
            elif mode == 2:
                features = generate_feature_vector2(stack, queue, graph)
            else:  # mode == 1
                features = generate_feature_vector1(stack, queue, graph)
            stack, queue, graph, trans = dparser.reference(stack, queue, graph)
            X.append(features)
            Y.append(trans)
    return X, Y
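
# generate_feature_vector1/2/3 are referenced above but not defined in these
# snippets; a hedged sketch of what the smallest (mode 1) feature set could
# look like, following the stack/queue window pattern of the other variants
# (the token keys 'postag' and 'form' are assumptions):
def generate_feature_vector1(stack, queue, graph):
    features = {}
    features['stack0_POS'] = stack[0]['postag'] if stack else 'nil'
    features['stack0_word'] = stack[0]['form'] if stack else 'nil'
    features['queue0_POS'] = queue[0]['postag'] if queue else 'nil'
    features['queue0_word'] = queue[0]['form'] if queue else 'nil'
    features['can-re'] = transition.can_reduce(stack, graph)
    features['can-la'] = transition.can_leftarc(stack, graph)
    return features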
def extract_features_sent(sentence, feature_names): """ Extract the features from one sentence returns X and y, where X is a list of dictionaries and y is a list of symbols :param sentence: :param feature_names :return: """ #sentence = sentence.splitlines() stack = [] graph = {} queue = list(sentence) graph['heads'] = {} graph['heads']['0'] = '0' graph['deprels'] = {} graph['deprels']['0'] = 'ROOT' transitions = [] x = list() X = list() y = list() while queue: if (len(stack) > 0): x.append(stack[0]['cpostag']) else: x.append('nil') if (len(stack) > 1): x.append(stack[1]['cpostag']) else: x.append('nil') if (len(stack) > 0): x.append(stack[0]['form']) else: x.append('nil') if (len(stack) > 1): x.append(stack[1]['form']) else: x.append('nil') if (queue): x.append(queue[0]['cpostag']) else: x.append('nil') if (len(queue) > 1): x.append(queue[1]['cpostag']) else: x.append('nil') if (queue): x.append(queue[0]['form']) else: x.append('nil') if (len(queue) > 1): x.append(queue[1]['form']) else: x.append('nil') x.append(transition.can_reduce(stack, graph)) x.append(transition.can_leftarc(stack, graph)) X.append(dict(zip(feature_names, x))) stack, queue, graph, trans = dparser.reference(stack, queue, graph) y.append(trans) x = list() # x.append(stack[0]['cpostag']) return X, y
def extract_features_sent(stack, queue, graph, feature_names, sentence, train):
    # Walk through the oracle's transition sequence and extract one feature
    # vector per parser configuration
    X = list()
    y = list()
    while queue:
        # Save the configuration as it was before the oracle moved
        prev_stack = stack
        prev_queue = queue
        stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        x = list()
        positions = {'stack0': (prev_stack, 0), 'stack1': (prev_stack, 1),
                     'stack2': (prev_stack, 2), 'queue0': (prev_queue, 0),
                     'queue1': (prev_queue, 1), 'queue2': (prev_queue, 2)}
        keys = {'POS': 'postag', 'word': 'form'}
        for name in feature_names:
            if name == 'can-re':
                # Reduce is possible when the stack is not empty
                x.append(len(prev_stack) >= 1)
            elif name == 'can-la':
                # Left-arc needs both a non-empty stack and a non-empty queue
                x.append(len(prev_stack) >= 1 and len(prev_queue) >= 1)
            elif '_' in name and name.split('_')[0] in positions:
                # Names such as 'stack0_POS' or 'queue2_word'
                place, key = name.split('_')
                structure, index = positions[place]
                if len(structure) > index:
                    x.append(structure[index][keys[key]])
                else:
                    x.append('nill')
        X.append(dict(zip(feature_names, x)))
        y.append(trans)
        if not train:
            break
    return X, y
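
# Usage note: with train=True the oracle is followed to the end of the
# sentence, returning the full (X, y) sequence; with train=False the loop
# breaks after the first transition, so a single configuration's features
# come back. This variant calls dparser.reference() either way, so it still
# needs gold-standard heads. A hedged sketch of pairing the one-step mode
# with the trained classifier (`classifier` and `vec` come from the training
# code above):
# X_step, _ = extract_features_sent(stack, queue, graph, feature_names,
#                                   sentence, train=False)
# next_trans = classifier.predict(vec.transform(X_step))[0]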