# NOTE: this function relies on module-level imports defined earlier in the
# file (sys, os, random, shutil, numpy as np, tensorflow as tf, entity_model,
# training_predict_LSTM, DictVectorizer) and on module-level helpers
# (flatten, save_list_structure, reconstruct_list, tag2id, tmo_dir).
def generic_train(p_or_n, train_sents, train_labels, use_lstm,
                  val_sents=None, val_labels=None,
                  test_sents=None, test_labels=None, dev_split=None):
    '''
    generic_train()

    Train a model that works for both prose and nonprose.

    @param p_or_n.       A string that indicates "prose", "nonprose", or "all"
    @param train_sents.  A list of sentences; each sentence is tokenized into words
    @param train_labels. Parallel to `train_sents`, 7-way labels for concept spans
    @param use_lstm.     Bool indicating whether to train a CRF or an LSTM
    @param val_sents.    Validation data. Same format as train_sents
    @param val_labels.   Validation data. Same format as train_labels
    @param test_sents.   Test data. Same format as train_sents
    @param test_labels.  Test data. Same format as train_labels
    @param dev_split.    A real number from 0 to 1; fraction of the training
                         data to hold out as a dev set when no val data is given
    '''
    # Must have data to train on:
    if len(train_sents) == 0:
        raise Exception('Training must have %s training examples' % p_or_n)

    # if you should split the data into train/dev yourself
    if (not val_sents) and (dev_split is not None) and (dev_split > 0.0) \
            and (len(train_sents) > 10):
        p = int(dev_split * 100)
        sys.stdout.write('\tCreating %d/%d train/dev split\n' % (100 - p, p))

        # shuffle so the held-out slice is a random sample
        perm = list(range(len(train_sents)))
        random.shuffle(perm)
        train_sents = [train_sents[i] for i in perm]
        train_labels = [train_labels[i] for i in perm]

        # hold out the first `dev_split` fraction as validation data
        ind = int(dev_split * len(train_sents))
        val_sents = train_sents[:ind]
        val_labels = train_labels[:ind]
        train_sents = train_sents[ind:]
        train_labels = train_labels[ind:]
    else:
        sys.stdout.write('\tUsing existing validation data\n')

    sys.stdout.write('\tvectorizing words %s\n' % p_or_n)

    if use_lstm:
        ########
        # LSTM
        ########
        import DatasetCliner_experimental as Exp
        import helper_dataset as hd
        import pickle

        dataset = Exp.Dataset()

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = False
        parameters['token_pretrained_embedding_filepath'] = ''

        # bundle train/valid/test splits for the dataset loader
        Datasets_tokens = {'train': train_sents}
        Datasets_labels = {'train': train_labels}
        if val_sents is not None:
            Datasets_tokens['valid'] = val_sents
            Datasets_labels['valid'] = val_labels
        if test_sents is not None:
            Datasets_tokens['test'] = test_sents
            Datasets_labels['test'] = test_labels

        dataset.load_dataset(Datasets_tokens, Datasets_labels, "", parameters)
        pickle.dump(dataset,
                    open(os.path.join(parameters['model_folder'],
                                      'dataset.pickle'), 'wb'))

        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        parameters['learning_rate'] = 0.005

        sess = tf.Session()
        number_of_sent = list(range(len(dataset.token_indices['train'])))

        with sess.as_default():
            model = entity_model.EntityLSTM(dataset, parameters)
            sess.run(tf.global_variables_initializer())
            model.load_pretrained_token_embeddings(sess, dataset, parameters)

            epoch_number = -1
            # random initial CRF transition matrix (7x7 for the 7-way tag set)
            transition_params_trained = np.random.rand(5 + 2, 5 + 2)
            values = {'best': 0}          # best token-level accuracy on valid
            f1_dictionary = {'best': 0}   # best F1 on valid
            model_saver = tf.train.Saver(max_to_keep=100)

            print("START TRAINING")
            eval_dir = os.path.join(
                tmo_dir, 'cliner_eval_%d' % random.randint(0, 256) + os.sep)
            parameters['conll_like_result_folder'] = eval_dir

            # per-split folders for CoNLL-style evaluation output
            test_temp = os.path.join(eval_dir, 'test/')
            train_temp = os.path.join(eval_dir, 'train/')
            valid_temp = os.path.join(eval_dir, 'valid/')
            os.mkdir(eval_dir)
            os.mkdir(test_temp)
            os.mkdir(train_temp)
            os.mkdir(valid_temp)

            while epoch_number < 90:
                average_loss_per_epoch = 0
                accuracy_per_epoch = 0
                step = 0
                epoch_number += 1

                # epoch 0 is a warm-up pass: no training, no evaluation
                if epoch_number != 0:
                    sequence_numbers = list(
                        range(len(dataset.token_indices['train'])))
                    random.shuffle(sequence_numbers)

                    for sequence_number in sequence_numbers:
                        loss, accuracy, transition_params_trained = \
                            training_predict_LSTM.train_step(
                                sess, dataset, sequence_number, model)
                        average_loss_per_epoch += loss
                        accuracy_per_epoch += accuracy
                        step += 1
                        if step % 10 == 0:
                            print('Training {0:.2f}% done\n'.format(
                                step / len(sequence_numbers) * 100))

                    model_saver.save(
                        sess,
                        os.path.join(parameters['model_folder'],
                                     'model_{0:05d}.ckpt'.format(epoch_number)))

                    average_loss_per_epoch /= len(number_of_sent)
                    accuracy_per_epoch /= len(number_of_sent)

                if epoch_number > 0:
                    # evaluate on all three splits; prediction_step also writes
                    # a per-epoch CoNLL file named "epoche_<n>.txt"
                    f1_test, _ = training_predict_LSTM.prediction_step(
                        sess, dataset, "test", model, epoch_number,
                        eval_dir, transition_params_trained)
                    f1_train, _ = training_predict_LSTM.prediction_step(
                        sess, dataset, "train", model, epoch_number,
                        eval_dir, transition_params_trained)
                    f1_valid, _ = training_predict_LSTM.prediction_step(
                        sess, dataset, "valid", model, epoch_number,
                        eval_dir, transition_params_trained)

                    correctly_predicted_tokens = \
                        training_predict_LSTM.compute_train_accuracy(
                            eval_dir + "valid" + os.sep +
                            "epoche_" + str(epoch_number) + ".txt")

                    # track the best epoch by validation F1 ...
                    if f1_dictionary['best'] < float(f1_valid):
                        f1_dictionary['epoche'] = epoch_number
                        f1_dictionary['best'] = float(f1_valid)

                    # ... and by validation token accuracy
                    if values['best'] < correctly_predicted_tokens:
                        values['epoche'] = epoch_number
                        values['best'] = correctly_predicted_tokens

                    print("NEW EPOCH " + str(epoch_number))
                    print("Current F1 on train " + str(f1_train))
                    print("Current F1 on valid " + str(f1_valid))
                    print("Current F1 on test " + str(f1_test))
                    print("Current F1 best (validation): ")
                    print(f1_dictionary)

            shutil.rmtree(eval_dir)

        return parameters, dataset, f1_dictionary['best']

    else:
        ########
        # CRF
        ########
        from cliner.feature_extraction.features import extract_features

        # vectorize tokenized sentences, e.g.
        #   ['medications', ':', 'darvocet-n', '__num__', 'one', 'tablet', 'p.o', '.']
        # into per-word feature dicts such as
        #   {('word', ','): 1, ('stem_porter', ','): 1, ('length', ''): 1,
        #    ('mitre', 'PUNCTUATION'): 1, ('word_shape', 'SYMBOL'): 1, ...}
        text_features = extract_features(train_sents)

        # Collect list of feature types (collapsing context features to PREV*/NEXT*)
        enabled_features = set()
        for sf in text_features:
            for wf in sf:
                for (feature_type, instance), value in wf.items():
                    if feature_type.startswith('prev'):
                        feature_type = 'PREV*'
                    if feature_type.startswith('next'):
                        feature_type = 'NEXT*'
                    enabled_features.add(feature_type)
        enabled_features = sorted(enabled_features)

        # Vectorize features
        vocab = DictVectorizer()
        flat_X_feats = vocab.fit_transform(flatten(text_features))
        # X_feats: one sparse (n_tokens x n_features) CSR matrix per sentence
        X_feats = reconstruct_list(flat_X_feats,
                                   save_list_structure(text_features))

        # vectorize IOB labels; tag2id maps the 7-way tag set:
        #   {'O': 0, 'B-problem': 1, 'B-test': 2, 'B-treatment': 3,
        #    'I-problem': 4, 'I-test': 5, 'I-treatment': 6}
        Y_labels = [[tag2id[y] for y in y_seq] for y_seq in train_labels]

        # sanity check: one feature row per token
        assert len(X_feats) == len(Y_labels)
        for i in range(len(X_feats)):
            assert X_feats[i].shape[0] == len(Y_labels[i])

        # if there is specified validation data, then vectorize it
        if val_sents:
            # vectorize validation X
            val_text_features = extract_features(val_sents)
            flat_val_X_feats = vocab.transform(flatten(val_text_features))
            val_X = reconstruct_list(flat_val_X_feats,
                                     save_list_structure(val_text_features))
            # vectorize validation Y
            val_Y = [[tag2id[y] for y in y_seq] for y_seq in val_labels]
        else:
            val_X = None
            val_Y = None

        # if there is specified test data, then vectorize it
        if test_sents:
            # vectorize test X
            test_text_features = extract_features(test_sents)
            flat_test_X_feats = vocab.transform(flatten(test_text_features))
            test_X = reconstruct_list(flat_test_X_feats,
                                      save_list_structure(test_text_features))
            # vectorize test Y
            test_Y = [[tag2id[y] for y in y_seq] for y_seq in test_labels]
        else:
            test_X = None
            test_Y = None

        sys.stdout.write('\ttraining classifiers %s\n' % p_or_n)

        # train using crf (the LSTM path above returns before reaching here)
        from machine_learning import crf
        clf, dev_score = crf.train(X_feats, Y_labels,
                                   val_X=val_X, val_Y=val_Y,
                                   test_X=test_X, test_Y=test_Y)

    return vocab, clf, dev_score, enabled_features
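# A minimal usage sketch of the CRF path. The sentences, labels, and split
# value below are made up for illustration; only `generic_train` itself comes
# from this module:
#
#   train_sents  = [['Patient', 'denies', 'chest', 'pain', '.'], ...]
#   train_labels = [['O', 'O', 'B-problem', 'I-problem', 'O'], ...]
#   vocab, clf, dev_score, enabled_features = generic_train(
#       'all', train_sents, train_labels, use_lstm=False, dev_split=0.1)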
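# The CRF featurization above relies on a flatten -> DictVectorizer ->
# reconstruct round trip. The helpers below are simplified stand-ins for the
# `flatten`, `save_list_structure`, and `reconstruct_list` utilities imported
# elsewhere in this module -- a sketch of the idea, not the exact
# implementations:

def _flatten_demo(list_of_lists):
    # concatenate per-sentence feature dicts into one flat list
    return [item for sent in list_of_lists for item in sent]

def _save_list_structure_demo(list_of_lists):
    # cumulative end offset of each sentence within the flattened list
    ends = []
    total = 0
    for sent in list_of_lists:
        total += len(sent)
        ends.append(total)
    return ends

def _reconstruct_list_demo(flat_matrix, ends):
    # slice the flat (n_tokens x n_features) matrix back into per-sentence blocks
    starts = [0] + ends[:-1]
    return [flat_matrix[i:j] for i, j in zip(starts, ends)]

# Example round trip with two toy "sentences" of feature dicts:
#
#   from sklearn.feature_extraction import DictVectorizer
#   sents = [[{('word', 'chest'): 1}, {('word', 'pain'): 1}],
#            [{('word', 'ok'): 1}]]
#   vec = DictVectorizer()
#   flat_X = vec.fit_transform(_flatten_demo(sents))   # 3 x n_features CSR matrix
#   X = _reconstruct_list_demo(flat_X, _save_list_structure_demo(sents))
#   assert X[0].shape[0] == 2 and X[1].shape[0] == 1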
def generic_predict(p_or_n, tokenized_sents, vocab, clf, use_lstm, hyperparams):
    '''
    generic_predict()

    Predict concept labels; works for both prose and nonprose.

    @param p_or_n.          A string that indicates "prose", "nonprose", or "all"
    @param tokenized_sents. A list of sentences, where each sentence is tokenized
                            into words
    '''
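# The body of generic_predict is not included in this section. For the CRF
# path it presumably mirrors the vectorization in generic_train; a hedged
# sketch only (`clf.predict` and the exact return value are assumptions):
#
#   text_features = extract_features(tokenized_sents)
#   flat_X = vocab.transform(flatten(text_features))
#   X_feats = reconstruct_list(flat_X, save_list_structure(text_features))
#   predictions = clf.predict(X_feats)   # hypothetical API of the trained model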
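# A small worked example of the IOB label encoding used above (demo only;
# runs when this module is executed directly). The 7-way mapping below is
# recovered from this module's debugging notes; the real `tag2id` is defined
# elsewhere in the file.
if __name__ == '__main__':
    tag2id_demo = {'O': 0, 'B-problem': 1, 'B-test': 2, 'B-treatment': 3,
                   'I-problem': 4, 'I-test': 5, 'I-treatment': 6}
    labels_demo = [['B-problem', 'I-problem', 'O'], ['O', 'B-test']]
    ids_demo = [[tag2id_demo[y] for y in seq] for seq in labels_demo]
    assert ids_demo == [[1, 4, 0], [0, 2]]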