Esempio n. 1
0
def train_the_model(split_training_sentences, feature_names):
    """
    Build the training data from a corpus, then load or train the classifier.

    Every sentence is replayed with ``dparser.reference``: before each
    transition the current parser state is handed to ``extract`` and the
    transition chosen by the reference parser is recorded as the label.
    The collected features are encoded with the module-level ``vec``.
    If both pickle files already exist the classifier and model are loaded
    from them, otherwise a logistic regression is trained and pickled.
    A classification report on the training data is printed in both cases.

    :param split_training_sentences: iterable of sentences; each sentence is
        turned into the initial queue with ``list(sentence)``
    :param feature_names: forwarded unchanged to ``extract``
    :return: tuple ``(classifier, model)``
    """
    classifier_path = 'ml_classifier.pickle'
    model_path = 'ml_model.pickle'

    print("Extracting the features...")
    X_matrix = []  # The matrix of feature vectors
    Y_vec = []  # The vector of desired/known-good outputs
    for sentence in split_training_sentences:
        stack = []
        queue = list(sentence)
        # Arc store: entry '0' is the root, which is its own head.
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            # NOTE(review): assumes extract() yields the feature dict(s) for
            # the current state only -- confirm against its definition.
            x_elem = extract(stack, queue, graph, feature_names, sentence)
            stack, queue, graph, y_elem = dparser.reference(
                stack, queue, graph)
            X_matrix.extend(x_elem)
            Y_vec.append(y_elem)

    # The encoding is needed on both paths: the report below predicts on X
    # even when the classifier comes from the pickle cache.
    print("Encoding the features...")
    X = vec.fit_transform(X_matrix)

    if os.path.isfile(classifier_path) and os.path.isfile(model_path):
        # NOTE: pickle executes arbitrary code on load; only use cache files
        # that were written by this function.
        with open(classifier_path, 'rb') as classifier_file:
            classifier = pickle.load(classifier_file)
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
    else:
        print("Training the model...")
        classifier = linear_model.LogisticRegression(penalty='l2',
                                                     dual=True,
                                                     solver='liblinear')
        model = classifier.fit(X, Y_vec)
        with open(classifier_path, 'wb') as classifier_file:
            pickle.dump(classifier, classifier_file)
        with open(model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

    Y_classifier_predicted = classifier.predict(X)
    print("Classification report for classifier %s:\n%s\n" %
          (classifier,
           metrics.classification_report(Y_vec, Y_classifier_predicted)))
    return classifier, model
def extract_features_sent(sentence, feature_names):
    """
    Replay one sentence with the reference parser and record every step.

    For each step the feature values are, in order: ``cpostag`` and ``form``
    of ``stack[0]``, ``cpostag`` and ``form`` of ``queue[0]`` (``'nil'``
    twice when the structure is empty), then the results of
    ``transition.can_reduce`` and ``transition.can_leftarc``.

    :param sentence: iterable of tokens supporting ``token['cpostag']`` and
        ``token['form']``
    :param feature_names: keys that are zipped with the feature values
    :return: ``(X, y)`` where X is a list of feature dictionaries and y the
        list of transitions returned by ``dparser.reference``
    """
    parser_stack = []
    pending = list(sentence)
    # Arc store: entry '0' is the root, which is its own head.
    arcs = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    samples = []
    labels = []
    while pending:
        values = []
        for structure in (parser_stack, pending):
            if len(structure) > 0:
                front = structure[0]
                values.append(front['cpostag'])
                values.append(front['form'])
            else:
                values.extend(['nil', 'nil'])

        values.append(transition.can_reduce(parser_stack, arcs))
        values.append(transition.can_leftarc(parser_stack, arcs))
        samples.append(dict(zip(feature_names, values)))

        # The reference parser both advances the state and names the
        # transition, which serves as the label for the features above.
        parser_stack, pending, arcs, action = dparser.reference(
            parser_stack, pending, arcs)
        labels.append(action)

    return samples, labels
Esempio n. 3
0
def train_the_model(split_training_sentences, feature_names, vec, i):
    """
    Fit the vectorizer on the corpus and load a previously pickled model.

    Every sentence is replayed with ``dparser.reference``; before each
    transition the parser state is handed to ``extract``. The collected
    features are used to fit ``vec``. Training itself is currently disabled:
    the classifier and model are read from ``ml_classifier<i>.pickle`` and
    ``ml_model<i>.pickle`` in the working directory.

    :param split_training_sentences: iterable of sentences; each sentence is
        turned into the initial queue with ``list(sentence)``
    :param feature_names: forwarded unchanged to ``extract``
    :param vec: object providing ``fit_transform``; it is fitted in place
    :param i: forwarded to ``extract`` as its last argument and also used,
        via ``str(i)``, as the suffix of the pickle file names
    :return: tuple ``(classifier, model)`` as loaded from the pickle files
    :raises OSError: if one of the pickle files cannot be opened
    """
    print("Extracting the features...")
    X_matrix = []  # The matrix of feature vectors
    Y_vec = []  # The vector of desired/known-good outputs
    for sentence in split_training_sentences:
        stack = []
        queue = list(sentence)
        # Arc store: entry '0' is the root, which is its own head.
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            # NOTE(review): assumes extract() yields the feature dict(s) for
            # the current state only -- confirm against its definition.
            x_elem = extract(stack, queue, graph, feature_names, sentence, i)
            stack, queue, graph, y_elem = dparser.reference(
                stack, queue, graph)
            X_matrix.extend(x_elem)
            Y_vec.append(y_elem)

    print("Encoding the features...")
    # The encoded matrix is not used while training is disabled, but the
    # call still fits ``vec`` for the caller.
    X = vec.fit_transform(X_matrix)
    # TODO: re-enable training (and pickle the result) instead of loading:
    # this_classifier = linear_model.LogisticRegression(penalty='l2', dual=True, solver='liblinear')
    # this_model = this_classifier.fit(X, Y_vec)
    classifier_filename = 'ml_classifier' + str(i) + '.pickle'
    model_filename = 'ml_model' + str(i) + '.pickle'
    # NOTE: pickle executes arbitrary code on load; only use trusted files.
    with open(classifier_filename, 'rb') as classifier_file:
        this_classifier = pickle.load(classifier_file)
    with open(model_filename, 'rb') as model_file:
        this_model = pickle.load(model_file)

    return this_classifier, this_model
Esempio n. 4
0
def extract(stack, queue, graph, feature_names, sentence, samples,
            special=False):
    """
    Replay the reference parser until the queue is empty, recording features.

    For each step the feature values are, in order: for the stack and then
    the queue, the ``postag`` of the first ``samples`` entries followed by
    the ``form`` of the first ``samples`` entries (``'nil'`` where the
    structure is too short); optionally the context values described under
    ``special``; then the results of ``transition.can_reduce`` and
    ``transition.can_leftarc``.

    :param stack: current parser stack
    :param queue: current input queue; the loop runs while it is non-empty
    :param graph: arc store passed to ``transition`` and ``dparser``
    :param feature_names: keys that are zipped with the feature values
    :param sentence: indexable sentence, only read when ``special`` is set
    :param samples: how many leading entries of each structure to describe
    :param special: when true, also add ``postag`` and then ``form`` of the
        sentence entries at positions ``int(stack[0]['id']) - 1`` and
        ``+ 1`` (``'nil'`` if the stack is empty or the position is outside
        the sentence)
    :return: ``(X, y)``: list of feature dictionaries and list of the
        transitions returned by ``dparser.reference``
    """
    fields = ('postag', 'form')
    rows = []
    actions = []

    while queue:
        values = [
            structure[depth][field] if len(structure) > depth else 'nil'
            for structure in (stack, queue)
            for field in fields
            for depth in range(samples)
        ]

        if special:
            # Neighbours of the first stack entry within the sentence.
            for field in fields:
                for offset in (-1, 1):
                    neighbour = 'nil'
                    if len(stack) > 0:
                        position = int(stack[0]['id']) + offset
                        if 0 <= position < len(sentence):
                            neighbour = sentence[position][field]
                    values.append(neighbour)

        values.append(transition.can_reduce(stack, graph))
        values.append(transition.can_leftarc(stack, graph))
        rows.append(dict(zip(feature_names, values)))

        stack, queue, graph, action = dparser.reference(stack, queue, graph)
        actions.append(action)

    return rows, actions
Esempio n. 5
0
def perform_prediction(split_test_sentences, feature_names, classifier, model):
    """
    Predict transitions for a test corpus and print a classification report.

    Every sentence is replayed with ``dparser.reference``; before each
    transition the parser state is handed to ``extract``. The collected
    features are encoded with the module-level ``vec`` (``transform`` only)
    and classified. The transitions chosen by the reference parser serve as
    the expected labels of the printed report. Nothing is returned.

    :param split_test_sentences: iterable of sentences; each sentence is
        turned into the initial queue with ``list(sentence)``
    :param feature_names: forwarded unchanged to ``extract``
    :param classifier: object providing ``predict``
    :param model: not used by this function
    """
    print("Extracting features from test Corpus")
    test_rows = []  # feature vectors of every visited parser state
    reference_actions = []  # transitions chosen by the reference parser

    for sentence in split_test_sentences:
        stack = []
        queue = list(sentence)
        # Arc store: entry '0' is the root, which is its own head.
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            test_rows.extend(
                extract(stack, queue, graph, feature_names, sentence))
            stack, queue, graph, action = dparser.reference(
                stack, queue, graph)
            reference_actions.append(action)

    print("Encoding the test features")
    X_test = vec.transform(test_rows)
    print("Predicting the arc actions to take")
    predicted = classifier.predict(X_test)
    report = metrics.classification_report(reference_actions, predicted)
    print("Classification report for classifier %s:\n%s\n" %
          (classifier, report))
    print(predicted)
Esempio n. 6
0
def extract_features(formatted_corpus, mode):
    """
    Collect feature vectors and reference transitions for a whole corpus.

    Every sentence is replayed with ``dparser.reference``. Before each
    transition the parser state is described by one of the
    ``generate_feature_vector*`` functions, selected by ``mode``.

    :param formatted_corpus: iterable of sentences; each sentence is turned
        into the initial queue with ``list(sentence)``
    :param mode: 3 selects ``generate_feature_vector3`` (which also receives
        the sentence), 2 selects ``generate_feature_vector2``, any other
        value selects ``generate_feature_vector1``
    :return: ``(X, Y)``: list of feature vectors and list of transitions
    """

    def describe(stack, queue, graph, sentence):
        # Choose the feature set according to the requested mode.
        if mode == 3:
            return generate_feature_vector3(stack, queue, graph, sentence)
        if mode == 2:
            return generate_feature_vector2(stack, queue, graph)
        return generate_feature_vector1(stack, queue, graph)

    samples = []
    labels = []
    for sentence in formatted_corpus:
        stack, queue = [], list(sentence)
        # Arc store: entry '0' is the root, which is its own head.
        graph = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}
        while queue:
            row = describe(stack, queue, graph, sentence)
            stack, queue, graph, action = dparser.reference(
                stack, queue, graph)
            samples.append(row)
            labels.append(action)
    return samples, labels
Esempio n. 7
0
def extract_features_sent(sentence, feature_names):
    """
    Replay one sentence with the reference parser and record every step.

    For each step the feature values are, in order: for the stack and then
    the queue, the ``cpostag`` of entries 0 and 1 followed by the ``form``
    of entries 0 and 1 (``'nil'`` where the structure is too short), then
    the results of ``transition.can_reduce`` and ``transition.can_leftarc``.

    :param sentence: iterable of tokens supporting ``token['cpostag']`` and
        ``token['form']``
    :param feature_names: keys that are zipped with the feature values
    :return: ``(X, y)`` where X is a list of feature dictionaries and y the
        list of transitions returned by ``dparser.reference``
    """
    parser_stack = []
    pending = list(sentence)
    # Arc store: entry '0' is the root, which is its own head.
    arcs = {'heads': {'0': '0'}, 'deprels': {'0': 'ROOT'}}

    samples = []
    labels = []
    while pending:
        # Same order as the feature names: structure, then field, then depth.
        values = [
            structure[depth][field] if len(structure) > depth else 'nil'
            for structure in (parser_stack, pending)
            for field in ('cpostag', 'form')
            for depth in (0, 1)
        ]
        values.append(transition.can_reduce(parser_stack, arcs))
        values.append(transition.can_leftarc(parser_stack, arcs))
        samples.append(dict(zip(feature_names, values)))

        parser_stack, pending, arcs, action = dparser.reference(
            parser_stack, pending, arcs)
        labels.append(action)

    return samples, labels
Esempio n. 8
0
def _parser_state_features(stack, queue, feature_names):
    """Map every recognised feature name to its value for the given state."""
    structures = {'stack': stack, 'queue': queue}
    fields = {'POS': 'postag', 'word': 'form'}
    features = {}
    for name in feature_names:
        if name == 'can-re':
            features[name] = len(stack) >= 1
        elif name == 'can-la':
            features[name] = len(stack) >= 1 and len(queue) >= 1
        else:
            # Positional names look like '<stack|queue><0-2>_<POS|word>'.
            slot, _, suffix = name.partition('_')
            structure = structures.get(slot[:-1])
            if (structure is None or suffix not in fields
                    or slot[-1:] not in ('0', '1', '2')):
                # Unknown name: leave it out rather than letting it shift
                # the remaining values onto the wrong keys.
                continue
            depth = int(slot[-1])
            if len(structure) > depth:
                features[name] = structure[depth][fields[suffix]]
            else:
                # 'nill' (sic) is the padding value this extractor has
                # always emitted; it is kept so the feature values do not
                # change.
                features[name] = 'nill'
    return features


def extract_features_sent(stack, queue, graph, feature_names, sentence, train):
    """
    Record parser-state features and the reference transition for each step.

    Supported feature names are ``stack{0,1,2}_POS``, ``stack{0,1,2}_word``,
    ``queue{0,1,2}_POS``, ``queue{0,1,2}_word`` (``postag`` / ``form`` of
    the entry at that depth, ``'nill'`` if the structure is too short),
    ``can-re`` (true when the stack is non-empty) and ``can-la`` (true when
    both stack and queue are non-empty). Each value is stored under its own
    name, so an unrecognised name is skipped instead of misaligning the
    others. Features are read before ``dparser.reference`` is applied so
    they always describe the state the transition was chosen in.

    :param stack: current parser stack
    :param queue: current input queue; nothing is produced if it is empty
    :param graph: arc store passed to ``dparser.reference``
    :param feature_names: iterable of feature names to compute
    :param sentence: not used by this function
    :param train: if false, stop after the first step
    :return: ``(X, y)``: list of feature dictionaries and list of the
        transitions returned by ``dparser.reference``
    """
    X = []
    y = []

    while queue:
        state = _parser_state_features(stack, queue, feature_names)
        stack, queue, graph, trans = dparser.reference(stack, queue, graph)
        X.append(state)
        y.append(trans)
        if not train:
            break

    return X, y