Ejemplo n.º 1
0
    def __generic_first_predict(self,
                                p_or_n,
                                text_features,
                                dvect,
                                clf,
                                do_grid=False):
        '''
        Model::__generic_first_predict()

        Purpose: First-pass prediction that works for both prose and nonprose

        @param p_or_n.        <string> either "prose" or "nonprose"
        @param text_features. <list-of-lists> of feature dictionaries
        @param dvect.         <DictVectorizer>
        @param clf.           scikit-learn classifier
        @param do_grid.       <boolean> indicating whether to perform grid search
        '''
        # Nothing to label means nothing to do
        if not text_features:
            print('\tnothing to predict (pass one) ' + p_or_n)
            return []

        # Remember the nested list shape so flat output can be regrouped
        sentence_offsets = save_list_structure(text_features)

        if globals_cliner.verbosity > 0:
            print('\tvectorizing features (pass one) ' + p_or_n)

        # Flatten the per-token feature dicts and vectorize them
        vectorized = dvect.transform(flatten(text_features))

        if globals_cliner.verbosity > 0:
            print('\tpredicting    labels (pass one) ' + p_or_n)

        # The CRF backend consumes per-sentence sequences, so the flat
        # matrix is regrouped first; the sci backend takes it flat
        if self._crf_enabled:
            vectorized = reconstruct_list(list(vectorized), sentence_offsets)
            backend = crf
        else:
            backend = sci

        # Predict IOB labels, then restore the per-sentence structure
        flat_output = backend.predict(clf, vectorized)
        predictions = reconstruct_list(flat_output, sentence_offsets)
        return predictions
Ejemplo n.º 2
0
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict concept labels; works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''
    # Bail out early when there are no sentences to label
    if not tokenized_sents:
        print('\tnothing to predict ' + p_or_n)
        return []

    print('\tvectorizing words ' + p_or_n)

    # Extract per-token features, vectorize them flat, then restore the
    # original per-sentence nesting
    feats = extract_features(tokenized_sents)
    nesting = save_list_structure(feats)
    X = reconstruct_list(vocab.transform(flatten(feats)), nesting)

    print('\tpredicting  labels ' + p_or_n)

    # The CRF returns one label sequence per sentence
    return crf_ml.predict(clf, X)
Ejemplo n.º 3
0
def generic_predict(p_or_n, tokenized_sents, vocab, clf):
    '''
    generic_predict()

    Predict concept labels with a trained CRF; works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained model.
    '''
    # Without input sentences there is nothing to label
    if len(tokenized_sents) == 0:
        print('\tnothing to predict ' + p_or_n)
        return []

    print('\tvectorizing words ' + p_or_n)

    # Vectorize token features; the flat matrix is regrouped so each
    # sentence keeps its own block of rows
    token_feats = extract_features(tokenized_sents)
    flat_feats = vocab.transform(flatten(token_feats))
    shape = save_list_structure(token_feats)
    X = reconstruct_list(flat_feats, shape)

    print('\tpredicting  labels ' + p_or_n)

    # Run the CRF and hand back its per-sentence label sequences
    predictions = crf_ml.predict(clf, X)
    return predictions
Ejemplo n.º 4
0
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm):
    '''
    generic_predict()

    Predict concept labels; works for both prose and nonprose

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    '''
    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        print('\tnothing to predict ' + p_or_n)
        return []

    print('\tvectorizing words ' + p_or_n)

    if use_lstm:
        # vectorize tokenized sentences into word-id sequences; unknown
        # words map to the reserved 'oov' id (dict.get replaces the
        # membership-test loop, same behavior)
        X = [[vocab.get(w, vocab['oov']) for w in sent]
             for sent in tokenized_sents]
    else:
        # vectorize validation X: extract features, vectorize flat, then
        # restore per-sentence nesting
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    print('\tpredicting  labels ' + p_or_n)

    # Predict labels with the model matching the vectorization above
    if use_lstm:
        predictions = keras_ml.predict(clf, X)
    else:
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
Ejemplo n.º 5
0
    def __generic_first_train(self,
                              p_or_n,
                              text_features,
                              iob_labels,
                              do_grid=False):
        '''
        Model::__generic_first_train()

        Purpose: First-pass training that works for both prose and nonprose

        @param p_or_n.        <string> either "prose" or "nonprose"
        @param text_features. <list-of-lists> of feature dictionaries
        @param iob_labels.    <list> of "I", "O", and "B" labels
        @param do_grid.       <boolean> indicating whether to perform grid search
        '''
        # Cannot train without examples
        if not text_features:
            raise Exception('Training must have %s training examples' % p_or_n)

        # Map IOB tags onto their numeric label ids
        numeric_labels = [IOB_labels[tag] for tag in iob_labels]

        # Remember the nested list shape so flat vectors can be regrouped
        sentence_offsets = save_list_structure(text_features)

        if globals_cliner.verbosity > 0:
            print('\tvectorizing features (pass one) ' + p_or_n)

        # Fit the vectorizer on the flattened per-token feature dicts
        dvect = DictVectorizer()
        vectorized = dvect.fit_transform(flatten(text_features))

        # The CRF backend trains on per-sentence sequences; the sci
        # backend trains on the flat matrix
        if self._crf_enabled:
            vectorized = reconstruct_list(list(vectorized), sentence_offsets)
            numeric_labels = reconstruct_list(numeric_labels, sentence_offsets)
            backend = crf
        else:
            backend = sci

        if globals_cliner.verbosity > 0:
            print('\ttraining classifiers (pass one) ' + p_or_n)

        # Train classifier
        clf = backend.train(vectorized, numeric_labels, do_grid)

        return dvect, clf
Ejemplo n.º 6
0
def generic_train(p_or_n, tokenized_sents, iob_nested_labels,
                  val_sents=None, val_labels=None, dev_split=None):
    '''
    generic_train()

    Train a CRF model that works for both prose and nonprose

    @param p_or_n.             A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.    A list of sentences, where each sentence is tokenized
                                 into words
    @param iob_nested_labels.  Parallel to `tokenized_sents`, 7-way labels for
                                 concept spans
    @param val_sents.          Validation data. Same format as tokenized_sents
    @param val_labels.         Validation data. Same format as iob_nested_labels
    @param dev_split.          A real number from 0 to 1

    @return A (vocab, clf, dev_score, enabled_features) tuple.
    '''
    # Must have data to train on
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if no explicit validation data was given, optionally carve a dev
    # split out of the training data
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
       and (len(tokenized_sents) > 10):

        p = int(dev_split * 100)
        print('\tCreating %d/%d train/dev split' % (100 - p, p))

        # shuffle so the split is random (list() so shuffle works on Py3 too)
        perm = list(range(len(tokenized_sents)))
        random.shuffle(perm)

        tokenized_sents = [tokenized_sents[i] for i in perm]
        iob_nested_labels = [iob_nested_labels[i] for i in perm]

        ind = int(dev_split * len(tokenized_sents))

        val_sents = tokenized_sents[:ind]
        val_labels = iob_nested_labels[:ind]
        tokenized_sents = tokenized_sents[ind:]
        iob_nested_labels = iob_nested_labels[ind:]

    print('\tvectorizing words %s' % p_or_n)

    # extract per-token features
    text_features = extract_features(tokenized_sents)

    # Collect list of feature types (all prev*/next* variants collapse
    # into single PREV*/NEXT* entries)
    enabled_features = set()
    for sf in text_features:
        for wf in sf:
            for (feature_type, instance), value in wf.items():
                if feature_type.startswith('prev'):
                    feature_type = 'PREV*'
                if feature_type.startswith('next'):
                    feature_type = 'NEXT*'
                enabled_features.add(feature_type)
    enabled_features = sorted(enabled_features)

    # Vectorize features flat, then restore per-sentence nesting
    vocab = DictVectorizer()
    flat_X_feats = vocab.fit_transform(flatten(text_features))
    X_feats = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    # vectorize IOB labels
    Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

    # sanity check: features and labels must stay parallel
    assert len(X_feats) == len(Y_labels)
    for i in range(len(X_feats)):
        assert X_feats[i].shape[0] == len(Y_labels[i])

    # BUGFIX: default to None so crf_ml.train() can be called even when no
    # validation data exists (previously val_X/val_Y were unbound -> NameError)
    val_X = None
    val_Y = None

    # if there is specified validation data, then vectorize it
    if val_sents:
        # vectorize validation X
        val_text_features = extract_features(val_sents)
        flat_val_X_feats = vocab.transform(flatten(val_text_features))
        val_X = reconstruct_list(flat_val_X_feats,
                                 save_list_structure(val_text_features))
        # vectorize validation Y
        val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    print('\ttraining classifiers %s' % p_or_n)

    # train using crf
    clf, dev_score = crf_ml.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
Ejemplo n.º 7
0
def generic_train(p_or_n,
                  tokenized_sents,
                  iob_nested_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a sequence model (CRF or LSTM) that works for both prose and nonprose

    @param p_or_n.             A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents.    A list of sentences, where each sentence is tokenized
                                 into words
    @param iob_nested_labels.  Parallel to `tokenized_sents`, 7-way labels for
                                 concept spans
    @param use_lstm            Bool indicating whether to train CRF or LSTM.
    @param val_sents.          Validation data. Same format as tokenized_sents
    @param val_labels.         Validation data. Same format as iob_nested_labels
    @param dev_split.          A real number from 0 to 1

    @return A (vocab, clf, dev_score, enabled_features) tuple.
    '''

    # Must have data to train on:
    if len(tokenized_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if no explicit validation data was given, optionally carve a dev
    # split out of the training data
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
       and (len(tokenized_sents) > 10):

        p = int(dev_split * 100)
        print('\tCreating %d/%d train/dev split' % (100 - p, p))

        # shuffle so the split is random (list() so shuffle works on Py3 too)
        perm = list(range(len(tokenized_sents)))
        random.shuffle(perm)

        tokenized_sents = [tokenized_sents[i] for i in perm]
        iob_nested_labels = [iob_nested_labels[i] for i in perm]

        ind = int(dev_split * len(tokenized_sents))

        val_sents = tokenized_sents[:ind]
        val_labels = iob_nested_labels[:ind]
        tokenized_sents = tokenized_sents[ind:]
        iob_nested_labels = iob_nested_labels[ind:]

    print('\tvectorizing words %s' % p_or_n)

    # words excluded from the vocabulary are mapped to OOV (currently none)
    oov = set()

    # BUGFIX: make sure these names are always bound. Previously the LSTM
    # branch never set `enabled_features` (NameError on return) and both
    # branches raised NameError on val_X/val_Y when no validation data
    # was available.
    enabled_features = []
    val_X = None
    val_Y = None

    if use_lstm:
        ########
        # LSTM
        ########

        # build vocabulary of words (ids start at 1; 'oov' gets the last id)
        vocab = {}
        for sent in tokenized_sents:
            for w in sent:
                if (w not in vocab) and (w not in oov):
                    vocab[w] = len(vocab) + 1
        vocab['oov'] = len(vocab) + 1

        # vectorize tokenized sentences as sequences of word ids
        X_seq_ids = [[vocab.get(w, vocab['oov']) for w in sent]
                     for sent in tokenized_sents]

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        # if there is specified validation data, then vectorize it
        if val_sents:
            val_X = [[vocab.get(w, vocab['oov']) for w in sent]
                     for sent in val_sents]
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    else:
        ########
        # CRF
        ########

        # extract per-token features
        text_features = extract_features(tokenized_sents)

        # Collect list of feature types (all prev*/next* variants collapse
        # into single PREV*/NEXT* entries)
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features flat, then restore per-sentence nesting
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in iob_nested_labels]

        # sanity check: features and labels must stay parallel
        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

    print('\ttraining classifiers %s' % p_or_n)

    if use_lstm:
        # train using lstm
        clf, dev_score = keras_ml.train(X_seq_ids,
                                        Y_labels,
                                        tag2id,
                                        len(vocab),
                                        val_X_ids=val_X,
                                        val_Y_ids=val_Y)
    else:
        # train using crf
        clf, dev_score = crf.train(X_feats, Y_labels, val_X=val_X, val_Y=val_Y)

    return vocab, clf, dev_score, enabled_features
Ejemplo n.º 8
0
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm,
                    hyperparams):
    '''
    generic_predict()

    Predict concept labels; works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                              into words
    @param vocab.           A dictionary mapping word tokens to numeric indices.
    @param clf.             An encoding of the trained keras model.
    @param use_lstm.        Bool indicating whether clf is a CRF or LSTM.
    @param hyperparams.     Extra model settings (never referenced in this body).

    @return LSTM path: (predictions, model) tuple.
            CRF path:  per-sentence label predictions.
    '''
    # use_lstm=self._use_lstm
    if use_lstm:

        # NOTE(review): `parameters` is mutated below but never defined in
        # this function -- the load call is commented out -- so this branch
        # relies on module-level `parameters`, `tokens_to_vec` and
        # `pretrained_dataset`; confirm those globals exist.
        #parameters=hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        #model_folder="./models/NN_models"
        predictions = []
        sys.stdout.write('\n use_lstm \n')
        dataset = Exp.Dataset()

        # Build dummy all-'O' label sequences shaped like the input, since
        # the dataset loader expects one label per token.
        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {}
        Datasets_labels = {}

        # single 'deploy' split: predict over everything we were given
        Datasets_tokens['deploy'] = tokenized_sents
        Datasets_labels['deploy'] = fictional_labels

        token_to_vector = dataset.load_dataset(
            Datasets_tokens,
            Datasets_labels,
            "",
            parameters,
            token_to_vector=tokens_to_vec,
            pretrained_dataset=pretrained_dataset)

        print(dataset.token_indices.keys())

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False

        dataset.update_dataset("", ['deploy'], Datasets_tokens,
                               Datasets_labels)

        # free the raw token/label dicts; the dataset object keeps its own copy
        del Datasets_tokens
        del Datasets_labels

        #model=current_model
        model = entity_model.EntityLSTM(dataset, parameters)

        # scratch folders for CoNLL-style prediction output
        # NOTE(review): os.mkdir raises OSError if the folder already
        # exists (e.g. after a crashed run) -- confirm cleanup elsewhere.
        os.mkdir(parameters['conll_like_result_folder'])

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        sess = tf.Session()
        with sess.as_default():

            # restore weights from the pretrained model, then run a single
            # prediction pass over the 'deploy' split
            #model=entity_model.EntityLSTM(dataset,parameters)
            transition_params_trained = model.restore_from_pretrained_model(
                parameters,
                dataset,
                sess,
                token_to_vector=token_to_vector,
                pretrained_dataset=pretrained_dataset)
            del token_to_vector
            predictions = training_predict_LSTM.prediction_step(
                sess, dataset, "deploy", model, 0,
                parameters['conll_like_result_folder'],
                transition_params_trained)
            sess.close()

        tf.reset_default_graph()

        # remove the scratch output folder before returning
        shutil.rmtree(parameters['conll_like_result_folder'])
        return predictions, model

    # If nothing to predict, skip actual prediction
    if len(tokenized_sents) == 0:
        sys.stdout.write('\tnothing to predict %s\n' % p_or_n)
        return []

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    # NOTE(review): use_lstm is always False from here on (the LSTM branch
    # above returns), so both `if use_lstm:` blocks below are dead code.
    if use_lstm:
        print('todo: incorporate lstm')
        # vectorize tokenized sentences
        #X = []
        #for sent in tokenized_sents:
        #   id_seq = []
        #   for w in sent:
        #      if w in vocab:
        #           id_seq.append(vocab[w])
        #       else:
        #        id_seq.append(vocab['oov'])
        #  X.append(id_seq)
    else:
        from feature_extraction.features import extract_features

        # vectorize validation X: extract features, vectorize flat, then
        # restore per-sentence nesting
        text_features = extract_features(tokenized_sents)
        flat_X_feats = vocab.transform(flatten(text_features))
        X = reconstruct_list(flat_X_feats, save_list_structure(text_features))

    sys.stdout.write('\tpredicting  labels %s\n' % p_or_n)

    # Predict labels
    if use_lstm:
        print("TEST_PREDICT")
        exit()

    else:
        from machine_learning import crf
        predictions = crf.predict(clf, X)

    # Format labels from output
    return predictions
Ejemplo n.º 9
0
def generic_train(p_or_n,
                  train_sents,
                  train_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  test_sents=None,
                  test_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.         A string that indicates "prose", "nonprose", or "all"
    @param train_sents.    A list of sentences; each sentence is tokenized into words
    @param train_labels.   Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm        Bool indicating whether to train CRF or LSTM.
    @param val_sents.      Validation data. Same format as train_sents
    @param val_labels.     Validation data. Same format as train_labels
    @param test_sents.     Test data. Same format as train_sents
    @param test_labels.    Test data. Same format as train_labels
    @param dev_split.      A real number from 0 to 1

    @return LSTM path: (parameters, dataset, best validation F1).
            CRF path:  (vocab, clf, dev_score, enabled_features).
    '''

    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
       and (len(train_sents) > 10):

        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        perm = list(range(len(train_sents)))
        random.shuffle(perm)

        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        ind = int(dev_split * len(train_sents))

        val_sents = train_sents[:ind]
        train_sents = train_sents[ind:]

        val_labels = train_labels[:ind]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        ########
        # LSTM: build a TF dataset, train EntityLSTM, track best valid F1
        ########
        print("TESTING NEW DATSET OBJECT")
        dataset = Exp.Dataset()

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = False

        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['train'] = train_sents
        Datasets_labels['train'] = train_labels

        if val_sents is not None:
            Datasets_tokens['valid'] = val_sents
            Datasets_labels['valid'] = val_labels

        if test_sents is not None:
            Datasets_tokens['test'] = test_sents
            Datasets_labels['test'] = test_labels

        dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters)
        # persist the dataset object alongside the model checkpoints
        pickle.dump(
            dataset,
            open(os.path.join(parameters['model_folder'], 'dataset.pickle'),
                 'wb'))

        # NOTE(review): these raise KeyError when no valid/test split was
        # provided -- confirm callers always pass both.
        print(Datasets_tokens['valid'][0])
        print(Datasets_tokens['test'][0])

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        parameters['learning_rate'] = 0.005

        sess = tf.Session()
        number_of_sent = list(range(len(dataset.token_indices['train'])))

        with sess.as_default():
            model = entity_model.EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            epoch_number = -1
            transition_params_trained = np.random.rand(5 + 2, 5 + 2)
            values = {}
            values["best"] = 0

            f1_dictionary = {}
            f1_dictionary['best'] = 0

            model_saver = tf.train.Saver(max_to_keep=100)

        print("START TRAINING")

        # per-run scratch folder for CoNLL-style evaluation files
        eval_dir = os.path.join(
            tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep)
        parameters['conll_like_result_folder'] = eval_dir

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(parameters['conll_like_result_folder'])
        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        while epoch_number < 90:
            average_loss_per_phrase = 0
            accuracy_per_phase = 0
            step = 0

            epoch_number += 1
            if epoch_number != 0:
                # one pass over the shuffled training sentences
                sequence_numbers = list(
                    range(len(dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                for sequence_number in sequence_numbers:
                    loss, accuracy, transition_params_trained = training_predict_LSTM.train_step(
                        sess, dataset, sequence_number, model)
                    average_loss_per_phrase += loss
                    accuracy_per_phase += accuracy
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done\n'.format(
                            step / len(sequence_numbers) * 100))

                # checkpoint after every epoch
                model_saver.save(
                    sess,
                    os.path.join(parameters['model_folder'],
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                total_loss = average_loss_per_phrase
                total_accuracy = accuracy_per_phase

                average_loss_per_phrase = average_loss_per_phrase / len(
                    number_of_sent)
                accuracy_per_phase = accuracy_per_phase / len(number_of_sent)

            if epoch_number > 0:
                # evaluate on test/train/valid and track the best-so-far
                f1, predictions = training_predict_LSTM.prediction_step(
                    sess, dataset, "test", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_train, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "train", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_valid, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "valid", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)

                correctly_predicted_tokens = training_predict_LSTM.compute_train_accuracy(
                    parameters['conll_like_result_folder'] + "valid" + os.sep +
                    "epoche_" + str(epoch_number) + ".txt")

                if f1_dictionary['best'] < float(f1_valid):
                    f1_dictionary['epoche'] = epoch_number
                    f1_dictionary['best'] = float(f1_valid)

                if values["best"] < correctly_predicted_tokens:
                    values["epoche"] = epoch_number
                    values["best"] = correctly_predicted_tokens

                print("NEW EPOCHE" + " " + str(epoch_number))

                print("Current F1 on train" + " " + str(f1_train))
                print("Current F1 on valid" + " " + str(f1_valid))
                print("Current F1 on test" + " " + str(f1))

                print("Current F1 best (validation): ")
                print(f1_dictionary)

        shutil.rmtree(parameters['conll_like_result_folder'])
        return parameters, dataset, f1_dictionary['best']

    else:
        ########
        # CRF
        ########

        # vectorize tokenized sentences
        text_features = extract_features(train_sents)

        # Collect list of feature types (all prev*/next* variants collapse
        # into single PREV*/NEXT* entries)
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features flat, then restore per-sentence nesting
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        # sanity check: features and labels must stay parallel
        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        # BUGFIX: default val_X/val_Y to None so crf.train() is callable
        # even without validation data (previously an unbound NameError)
        if val_sents:
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]
        else:
            val_X = None
            val_Y = None

        # if there is specified test data, then vectorize it
        if test_sents:
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats,
                                      save_list_structure(test_text_features))
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]
        else:
            test_X = None
            test_Y = None

    sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

    # Only the CRF path reaches this point (the LSTM branch returns above);
    # the previous keras_ml branch here was unreachable and referenced an
    # undefined X_seq_ids, so it was removed.
    clf, dev_score = crf.train(X_feats,
                               Y_labels,
                               val_X=val_X,
                               val_Y=val_Y,
                               test_X=test_X,
                               test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features
Ejemplo n.º 10
0
def generic_train(p_or_n,
                  train_sents,
                  train_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  test_sents=None,
                  test_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.         A string that indicates "prose", "nonprose", or "all"
    @param train_sents.    A list of sentences; each sentence is tokenized into words
    @param train_labels.   Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm        Bool indicating whether to train CRF or LSTM.
    @param val_sents.      Validation data. Same format as train_sents
    @param val_labels.     Validation data. Same format as train_labels
    @param test_sents.     Test data. Same format as train_sents
    @param test_labels.    Test data. Same format as train_labels
    @param dev_split.      A real number from 0 to 1

    @return (vocab, clf, dev_score, enabled_features)
    '''

    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    # NOTE: explicit None check — `None > 0.0` raises TypeError on Python 3
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
            and (len(train_sents) > 10):

        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        # shuffle so the dev split is not biased by document order
        perm = list(range(len(train_sents)))
        random.shuffle(perm)

        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        # index where the held-out dev portion ends
        ind = int(dev_split * len(train_sents))

        val_sents = train_sents[:ind]
        train_sents = train_sents[ind:]

        val_labels = train_labels[:ind]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    # Default to "no data" so the train() calls below never hit a NameError
    # when validation/test sets are absent (the originals were only assigned
    # inside `if val_sents:` / `if test_sents:`).
    val_X = None
    val_Y = None
    test_X = None
    test_Y = None

    if use_lstm:
        ########
        # LSTM
        ########

        # NOTE(review): hierarchical LSTM training is not implemented; this
        # branch dumps its inputs and terminates the process.
        sys.stdout.write('%s\n' % train_sents)
        sys.stdout.write('%s\n' % train_labels)
        sys.stdout.write('incorportate hierarchical LSTM\n')
        exit()

    else:
        ########
        # CRF
        ########

        # vectorize tokenized sentences into per-token feature dicts
        text_features = extract_features(train_sents)

        # Collect list of feature types; all prev*/next* context features are
        # collapsed into single 'PREV*'/'NEXT*' tags for reporting
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features (vocabulary is fit on training data only)
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        # sanity check: one feature row per label in every sentence
        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X (transform only — reuse training vocab)
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

        # if there is specified test data, then vectorize it
        if test_sents:
            # vectorize test X (transform only — reuse training vocab)
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats,
                                      save_list_structure(test_text_features))
            # vectorize test Y
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]

    sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

    if use_lstm:
        # train using lstm
        # NOTE(review): unreachable today (the LSTM branch above calls exit());
        # X_seq_ids is not defined in this function — TODO confirm before
        # enabling LSTM training.
        clf, dev_score = keras_ml.train(X_seq_ids,
                                        Y_labels,
                                        tag2id,
                                        len(vocab),
                                        val_X_ids=val_X,
                                        val_Y_ids=val_Y,
                                        test_X_ids=test_X,
                                        test_Y_ids=test_Y)
    else:
        # train using crf
        clf, dev_score = crf.train(X_feats,
                                   Y_labels,
                                   val_X=val_X,
                                   val_Y=val_Y,
                                   test_X=test_X,
                                   test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features
Ejemplo n.º 11
0
    # convert text features to design matrix
    offsets = save_list_structure(text_features)
    flat_text_features = flatten(text_features)
    flat_train_X = dvec.fit_transform(flat_text_features)

    # vectorize labels
    flat_tags = flatten(tags)

    tag2ind = {tag: i for i, tag in enumerate(set(flat_tags))}
    ind2tag = {i: tag for tag, i in tag2ind.items()}

    flat_train_Y = [tag2ind[tag] for tag in flat_tags]

    # reconstruct list structures
    train_X = reconstruct_list(list(flat_train_X), offsets)
    train_Y = reconstruct_list(flat_train_Y, offsets)

    # build CRF model
    crf_model = crf.train(train_X, train_Y)

    # save all important info
    with open(model_path, 'wb') as f:
        pickle.dump(dvec, f)
        pickle.dump(tag2ind, f)
        pickle.dump(crf_model, f)

    ######################################################################
    #                            PREDICTING                              #
    ######################################################################
Ejemplo n.º 12
0
        ######################################################################
        #                            FEATURE ENGINEERING                     #
        ######################################################################

        text_features = extract_features(sents)

        ######################################################################
        #                         FORMATTING DATA                            #
        ######################################################################

        # convert text features to design matrix
        flat_pred_X = dvec.transform(flatten(text_features))

        # reconstruct list structures
        offsets = save_list_structure(text_features)
        pred_X = reconstruct_list(list(flat_pred_X), offsets)

        ######################################################################
        #                            PREDICTING                              #
        ######################################################################

        # make the predictions
        pred_Y = crf.predict(crf_model, pred_X)
        pred_tags = [[ind2tag[p] for p in P] for P in pred_Y]

        assert len(pred_Y) == len(pred_X)
        for i in range(len(pred_Y)):
            assert len(pred_Y[i]) == len(pred_X[i])

        # correct illegal predictions (AKA bad I -> legal B)
        for lineno, preds in enumerate(pred_tags):
Ejemplo n.º 13
0
def generic_train(p_or_n,
                  train_sents,
                  train_labels,
                  use_lstm,
                  val_sents=None,
                  val_labels=None,
                  test_sents=None,
                  test_labels=None,
                  dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose

    @param p_or_n.         A string that indicates "prose", "nonprose", or "all"
    @param train_sents.    A list of sentences; each sentence is tokenized into words
    @param train_labels.   Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm        Bool indicating whether to train CRF or LSTM.
    @param val_sents.      Validation data. Same format as train_sents
    @param val_labels.     Validation data. Same format as train_labels
    @param test_sents.     Test data. Same format as train_sents
    @param test_labels.    Test data. Same format as train_labels
    @param dev_split.      A real number from 0 to 1

    @return LSTM path:  (parameters, dataset, best validation F1)
            CRF path:   (vocab, clf, dev_score, enabled_features)
    '''

    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    # NOTE: explicit None check — `None > 0.0` raises TypeError on Python 3
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
            and (len(train_sents) > 10):

        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        perm = list(range(len(train_sents)))

        # fixed seed keeps the train/dev split reproducible across runs
        random.seed(101)
        random.shuffle(perm)

        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        # index where the held-out dev portion ends
        ind = int(dev_split * len(train_sents))

        val_sents = train_sents[:ind]
        train_sents = train_sents[ind:]

        val_labels = train_labels[:ind]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        ########
        # LSTM (experimental TensorFlow implementation)
        ########
        print("TESTING NEW DATSET OBJECT")
        import DatasetCliner_experimental as Exp
        dataset = Exp.Dataset()
        import helper_dataset as hd
        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = False

        # Bundle the splits into the dict layout Dataset.load_dataset expects
        Datasets_tokens = {}
        Datasets_labels = {}

        Datasets_tokens['train'] = train_sents
        Datasets_labels['train'] = train_labels

        if val_sents is not None:
            Datasets_tokens['valid'] = val_sents
            Datasets_labels['valid'] = val_labels

        if test_sents is not None:
            Datasets_tokens['test'] = test_sents
            Datasets_labels['test'] = test_labels

        parameters['token_pretrained_embedding_filepath'] = ''

        dataset.load_dataset(Datasets_tokens, Datasets_labels, "",
                             parameters)  # {}

        # persist the vectorized dataset next to the model checkpoints
        import pickle
        pickle.dump(
            dataset,
            open(os.path.join(parameters['model_folder'], 'dataset.pickle'),
                 'wb'))

        # debug peek — guarded so missing valid/test splits don't KeyError
        if 'valid' in Datasets_tokens:
            print(Datasets_tokens['valid'][0])
        if 'test' in Datasets_tokens:
            print(Datasets_tokens['test'][0])

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        parameters['learning_rate'] = 0.005

        sess = tf.Session()
        number_of_sent = list(range(len(dataset.token_indices['train'])))

        # Build graph, initialize weights, and load pretrained embeddings
        with sess.as_default():
            model = entity_model.EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)
            epoch_number = -1
            # CRF transition matrix: num_tags + 2 for the start/end states
            transition_params_trained = np.random.rand(5 + 2, 5 + 2)
            values = {}
            values["best"] = 0

            f1_dictionary = {}
            f1_dictionary['best'] = 0

            model_saver = tf.train.Saver(max_to_keep=100)

        print("START TRAINING")

        # scratch folders for the CoNLL-style per-epoch evaluation output
        eval_dir = os.path.join(
            tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep)
        parameters['conll_like_result_folder'] = eval_dir

        test_temp = os.path.join(parameters['conll_like_result_folder'],
                                 'test/')
        train_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'train/')
        valid_temp = os.path.join(parameters['conll_like_result_folder'],
                                  'valid/')

        os.mkdir(parameters['conll_like_result_folder'])
        os.mkdir(test_temp)
        os.mkdir(train_temp)
        os.mkdir(valid_temp)

        while epoch_number < 90:
            average_loss_per_phrase = 0
            accuracy_per_phase = 0
            step = 0

            epoch_number += 1
            # epoch 0 is evaluation-only; training starts at epoch 1
            if epoch_number != 0:
                sequence_numbers = list(
                    range(len(dataset.token_indices['train'])))
                random.shuffle(sequence_numbers)
                for sequence_number in sequence_numbers:
                    loss, accuracy, transition_params_trained = training_predict_LSTM.train_step(
                        sess, dataset, sequence_number, model)
                    average_loss_per_phrase += loss
                    accuracy_per_phase += accuracy
                    step += 1
                    if step % 10 == 0:
                        print('Training {0:.2f}% done\n'.format(
                            step / len(sequence_numbers) * 100))

                model_saver.save(
                    sess,
                    os.path.join(parameters['model_folder'],
                                 'model_{0:05d}.ckpt'.format(epoch_number)))

                total_loss = average_loss_per_phrase
                total_accuracy = accuracy_per_phase

                average_loss_per_phrase = average_loss_per_phrase / len(
                    number_of_sent)
                accuracy_per_phase = accuracy_per_phase / len(number_of_sent)

            if epoch_number > 0:
                ""
                # evaluate on all three splits after each training epoch
                f1, predictions = training_predict_LSTM.prediction_step(
                    sess, dataset, "test", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_train, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "train", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)
                f1_valid, _ = training_predict_LSTM.prediction_step(
                    sess, dataset, "valid", model, epoch_number,
                    parameters['conll_like_result_folder'],
                    transition_params_trained)

                correctly_predicted_tokens = training_predict_LSTM.compute_train_accuracy(
                    parameters['conll_like_result_folder'] + "valid" + os.sep +
                    "epoche_" + str(epoch_number) + ".txt")

                # track best validation F1 and best token accuracy so far
                if f1_dictionary['best'] < float(f1_valid):
                    f1_dictionary['epoche'] = epoch_number
                    f1_dictionary['best'] = float(f1_valid)

                if values["best"] < correctly_predicted_tokens:
                    values["epoche"] = epoch_number
                    values["best"] = correctly_predicted_tokens

                print("NEW EPOCHE" + " " + str(epoch_number))

                print("Current F1 on train" + " " + str(f1_train))
                print("Current F1 on valid" + " " + str(f1_valid))
                print("Current F1 on test" + " " + str(f1))

                print("Current F1 best (validation): ")
                print(f1_dictionary)

        # clean up the scratch evaluation folders
        shutil.rmtree(parameters['conll_like_result_folder'])
        return parameters, dataset, f1_dictionary['best']

    else:
        ########
        # CRF
        ########

        from feature_extraction.features import extract_features

        # vectorize tokenized sentences into per-token feature dicts, e.g.
        #   {('word', ','): 1, ('mitre', 'PUNCTUATION'): 1, ...}
        text_features = extract_features(train_sents)

        # Collect list of feature types; all prev*/next* context features are
        # collapsed into single 'PREV*'/'NEXT*' tags for reporting
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)

        enabled_features = sorted(enabled_features)

        # Vectorize features (vocabulary is fit on training data only);
        # reconstruct_list turns the flat sparse matrix back into a list of
        # per-sentence sparse matrices
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))

        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels (tag2id maps e.g. 'B-problem' -> 1)
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        # sanity check: one feature row per label in every sentence
        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # Default to "no data" so crf.train below never hits a NameError when
        # validation data is absent (val_X was only assigned inside the `if`)
        val_X = None
        val_Y = None

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X (transform only — reuse training vocab)
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]

        # if there is specified test data, then vectorize it
        if test_sents:
            # vectorize test X (transform only — reuse training vocab)
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats,
                                      save_list_structure(test_text_features))
            # vectorize test Y
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]
        else:
            test_X = None
            test_Y = None

    sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

    if use_lstm:
        # train using lstm
        # NOTE(review): unreachable today (the LSTM branch above returns);
        # X_seq_ids is not defined in this function — TODO confirm before
        # enabling this path.
        clf, dev_score = keras_ml.train(X_seq_ids,
                                        Y_labels,
                                        tag2id,
                                        len(vocab),
                                        val_X_ids=val_X,
                                        val_Y_ids=val_Y,
                                        test_X_ids=test_X,
                                        test_Y_ids=test_Y)
    else:
        # train using crf
        from machine_learning import crf
        clf, dev_score = crf.train(X_feats,
                                   Y_labels,
                                   val_X=val_X,
                                   val_Y=val_Y,
                                   test_X=test_X,
                                   test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features