Example #1
# Imports assumed from the surrounding CliNER package; the exact module
# paths of keras_ml and the list helpers are assumptions here.
from cliner.feature_extraction.features import extract_features
from cliner.machine_learning import crf, keras_ml
from cliner.tools import flatten, save_list_structure, reconstruct_list


def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm):
    '''
    generic_predict()

    Predict labels with a model that works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices
                              (LSTM) or a fitted feature vectorizer (CRF).
    @param clf.             The trained model (keras LSTM or CRF).
    @param use_lstm.        Bool indicating whether clf is an LSTM (True) or a
                              CRF (False).
    '''

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print('\tnothing to predict ' + p_or_n)
        return []

    print('\tvectorizing words ' + p_or_n)

    if use_lstm:
        # vectorize tokenized sentences
        X = []
        for sent in tokenized_sents:
            id_seq = []
            for w in sent:
                if w in vocab:
                    id_seq.append(vocab[w])
                else:
                    id_seq.append(vocab['oov'])
            X.append(id_seq)
    else:
        # vectorize validation X
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print('\tpredicting labels ' + p_or_n)

    # Predict labels
    if use_lstm:
        predictions = keras_ml.predict(clf, X)
    else:
        predictions = crf.predict(clf, X)

    # Return the predicted label sequences
    return predictions
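
The LSTM branch above maps each word to its vocabulary index, falling back to the reserved 'oov' index for unknown words. That lookup is easy to exercise in isolation; a minimal, self-contained sketch, with a toy vocabulary and sentences invented for illustration:

# Toy vocabulary; one index is reserved for out-of-vocabulary words,
# mirroring the vocab['oov'] fallback in generic_predict() above.
vocab = {'oov': 0, 'patient': 1, 'denies': 2, 'chest': 3, 'pain': 4}

tokenized_sents = [['patient', 'denies', 'chest', 'pain'],
                   ['patient', 'reports', 'dizziness']]

X = [[vocab.get(w, vocab['oov']) for w in sent] for sent in tokenized_sents]

print(X)  # [[1, 2, 3, 4], [1, 0, 0]]

Using dict.get with the 'oov' index as its default collapses the explicit membership test into a single expression.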
Example #2
import copy
import os
import shutil
import sys

import tensorflow as tf

# LSTM support modules and list helpers assumed importable from the
# surrounding CliNER package (module names follow the original snippet;
# the cliner.tools path is an assumption).
import helper_dataset as hd
import DatasetCliner_experimental as Exp
import entity_lstm as entity_model
import training_predict_LSTM
from cliner.tools import flatten, save_list_structure, reconstruct_list


def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm,
                    hyperparams):
    '''
    generic_predict()

    Predict labels with a model that works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices
                              (LSTM) or a fitted feature vectorizer (CRF).
    @param clf.             The trained model (keras LSTM or CRF).
    @param use_lstm.        Bool indicating whether clf is an LSTM (True) or a
                              CRF (False).
    @param hyperparams.     Extra hyperparameter settings (unused in this
                              snippet).

    The LSTM path returns (predictions, model); the CRF path returns just
    the predicted label sequences.
    '''
    if use_lstm:
        # Load the LSTM hyperparameters and force pretrained-model mode.
        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        sys.stdout.write('\n use_lstm \n')
        dataset = Exp.Dataset()

        # Build dummy all-'O' label sequences: deploy data has no gold labels,
        # but the dataset loader expects a parallel label structure.
        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        # NOTE: tokens_to_vec and pretrained_dataset are not defined in this
        # snippet; they are assumed to be module-level state prepared when the
        # pretrained model was first loaded.
        token_to_vector = dataset.load_dataset(
            Datasets_tokens,
            Datasets_labels,
            "",
            parameters,
            token_to_vector=tokens_to_vec,
            pretrained_dataset=pretrained_dataset)

        print(dataset.token_indices.keys())

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False

        dataset.update_dataset("", ['deploy'], Datasets_tokens,
                               Datasets_labels)

        del Datasets_tokens
        del Datasets_labels

        # Build the LSTM graph for this dataset and parameter set.
        model = entity_model.EntityLSTM(dataset, parameters)

        # Temporary CoNLL-style result folders required by the prediction step.
        os.mkdir(parameters['conll_like_result_folder'])

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        sess = tf.Session()
        with sess.as_default():

            # Restore weights from the pretrained model; returns the trained
            # CRF transition parameters used during decoding.
            transition_params_trained = model.restore_from_pretrained_model(
                parameters,
                dataset,
                sess,
                token_to_vector=token_to_vector,
                pretrained_dataset=pretrained_dataset)
            del token_to_vector
            # Decode the 'deploy' split with the restored model.
            predictions = training_predict_LSTM.prediction_step(
                sess, dataset, "deploy", model, 0,
                parameters['conll_like_result_folder'],
                transition_params_trained)
            sess.close()

        tf.reset_default_graph()

        # Clean up the temporary result folders.
        shutil.rmtree(parameters['conll_like_result_folder'])
        return predictions, model

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        sys.stdout.write('\tnothing to predict %s\n' % p_or_n)
        return []

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    # Only the CRF path reaches this point (the LSTM path returned above).
    from cliner.feature_extraction.features import extract_features

    # vectorize validation X
    text_features = extract_features(tokenized_sents)
    flat_X_feats = vocab.transform(flatten(text_features))
    X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    sys.stdout.write('\tpredicting  labels %s\n' % p_or_n)

    # Predict labels with the CRF (deferred import keeps the CRF machinery
    # out of the LSTM-only code path).
    from cliner.machine_learning import crf
    predictions = crf.predict(clf, X)

    # Return the predicted label sequences
    return predictions
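
Both examples round-trip the nested sentence structure through flatten, save_list_structure, and reconstruct_list, which are not shown on this page: the vectorizer must see one flat list of per-token feature dicts, while predictions have to come back grouped per sentence. A plausible minimal implementation of that round-trip, written here as an assumption about what the CliNER helpers do:

def flatten(nested):
    # Collapse a list of sentences into one flat list of items.
    return [item for sent in nested for item in sent]

def save_list_structure(nested):
    # Record cumulative sentence-end offsets so the nesting can be rebuilt.
    offsets, total = [], 0
    for sent in nested:
        total += len(sent)
        offsets.append(total)
    return offsets

def reconstruct_list(flat, offsets):
    # Re-split a flat list at the saved offsets.
    return [flat[start:end] for start, end in zip([0] + offsets[:-1], offsets)]

# Round-trip demo
sents = [['a', 'b'], ['c'], ['d', 'e', 'f']]
offsets = save_list_structure(sents)  # [2, 3, 6]
assert reconstruct_list(flatten(sents), offsets) == sents

Any implementation with the same contract would do; the important part is that save_list_structure is taken before flattening, so the CRF's flat predictions can be re-nested to match tokenized_sents.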