def infer_cnn(data_path, model_path, word_vocab_path, pos_vocab_path,
              label_vocab_path, word_emb_path, pos_emb_path, batch_size,
              pred_save_path=None):
    # init vocab dicts and pre-trained embeddings
    word_vocab, pos_vocab, label_vocab = load_vocab(word_vocab_path), \
        load_vocab(pos_vocab_path), load_vocab(label_vocab_path)
    word_emb, pos_emb = load_pkl(word_emb_path), load_pkl(pos_emb_path)
    word_test, pos_test = test_reader(data_path, word_vocab, pos_vocab, label_vocab)
    # init model
    model = Model(config.max_len, word_emb, pos_emb, label_vocab=label_vocab)
    ckpt_path = get_ckpt_path(model_path)
    if ckpt_path:
        print("Read model parameters from %s" % ckpt_path)
        model.saver.restore(model.sess, ckpt_path)
    else:
        print("Can't find the checkpoint, stopping.")
        return
    label_pred = model.predict(word_test, pos_test, batch_size)
    save(label_pred, pred_save_path=pred_save_path)
    print("Finished prediction.")
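# A minimal usage sketch for infer_cnn; the file paths and batch size below are
# hypothetical placeholders, not paths taken from this project.
if __name__ == '__main__':
    infer_cnn(data_path='data/test.txt',
              model_path='output/checkpoints/',
              word_vocab_path='output/word_vocab.txt',
              pos_vocab_path='output/pos_vocab.txt',
              label_vocab_path='output/label_vocab.txt',
              word_emb_path='output/word_emb.pkl',
              pos_emb_path='output/pos_emb.pkl',
              batch_size=64,
              pred_save_path='output/pred_result.txt')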
def train(save_vocab_path='', train_path='', test_path='', train_seg_path='',
          test_seg_path='', model_save_dir='', vocab_max_size=5000,
          vocab_min_count=5, hidden_dim=512, batch_size=64, use_cuda=False):
    # batch_size is used below but was missing from the signature; a default is added here
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            avg_cost = train_model()
            optimizer = optimizer_func(hidden_dim)
            optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)

    seg_data(train_path, test_path)
    train_texts = build_dataset(train_seg_path)
    if os.path.exists(save_vocab_path):
        vocab = load_vocab(save_vocab_path)
    else:
        vocab, reverse_vocab = build_vocab(train_texts, min_count=vocab_min_count)
        write_vocab(vocab, save_vocab_path)
        vocab = load_vocab(save_vocab_path)
    train_set = read_data(train_seg_path)
    train_set_ids = transform_data(train_set, vocab)
    num_encoder_tokens = len(train_set_ids)
    max_input_texts_len = max([len(text) for text in train_texts])
    print('num of samples:', len(train_texts))
    print('num of unique input tokens:', num_encoder_tokens)
    print('max sequence length for inputs:', max_input_texts_len)
    # save_word_dict(vocab2id, save_vocab_path)

    train_reader = data_generator(train_set_ids)
    train_data = paddle.batch(
        paddle.reader.shuffle(train_reader, buf_size=10000),
        batch_size=batch_size)
    feeder = fluid.DataFeeder(
        feed_list=['question_word', 'dialogue_word', 'report_word', 'report_next_word'],
        place=place,
        program=train_prog)

    exe.run(startup_prog)
    EPOCH_NUM = 20
    for pass_id in six.moves.xrange(EPOCH_NUM):
        batch_id = 0
        for data in train_data():
            cost = exe.run(train_prog,
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])[0]
            print('pass_id: %d, batch_id: %d, loss: %f' % (pass_id, batch_id, cost))
            batch_id += 1
        fluid.io.save_params(exe, model_save_dir, main_program=train_prog)
def main(config, eval_folder):
    # load the vocab files
    text_words_vocab = load_vocab(config.text_words_path)
    text_chars_vocab = load_vocab(config.text_chars_path)
    inv_text_vocab = {v: k for k, v in text_words_vocab.items()}

    # get the processing function
    processing_word = get_processing_word(text_words_vocab, text_chars_vocab,
                                          lowercase=True, chars=True)

    # load features
    word_features = get_trimmed_features(config.word_embeddings_trimmed_path)
    examples = read_examples(eval_folder, processing_word)

    # build WImpModel
    model = WImpModel(config, word_features, None, text_words_vocab["$UNK$"],
                      inv_text_vocab, None)
    model.build_graph()

    words, word_feats, speech_interval_feats = [], [], []
    for sent_key in examples.keys():
        words_, word_feats_, speech_feats_ = zip(*examples[sent_key])
        word_feats_ = list(zip(*word_feats_))
        word_feats.append(word_feats_)
        speech_interval_feats.append(speech_feats_)
        words.append(words_)

    speech_interval_feats_pad_, speech_lengths = pad_sequences(
        speech_interval_feats, pad_tok=[0] * config.speech_features_dim, nlevels=2)
    speech_feats = speech_interval_feats_pad_[:, :, :, config.speech_lexical_features_dim:]
    speech_lexical_feats = speech_interval_feats_pad_[:, :, 0, :config.speech_lexical_features_dim]

    feed, sequence_lengths = model.get_feed_dict(words=word_feats, dropout=1.0)
    feed[model.speech_features] = speech_feats
    feed[model.speech_lexical_features] = speech_lexical_feats
    feed[model.speech_lengths] = speech_lengths
    predictions = model.test(feed)

    print("\n")
    print("WORD IMPORTANCE PREDICTION OUTPUT")
    print("=================================")
    for sent_id in range(len(words)):
        scores = predictions[0][:sequence_lengths[sent_id]]
        tokens = words[sent_id]
        result = ["%s (%f)" % (w, s) for w, s in zip(tokens, scores)]
        print("--> " + " ".join(result) + "\n")
def train(train_data, val_data, fold_idx=None):
    train_dataset = MyDataset(train_data)
    val_dataset = MyDataset(val_data)
    train_loader = DataLoader(train_dataset, batch_size=config.batch_size)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size)

    from models.hmm import HMM
    word2id, id2word = load_vocab()
    model = HMM(len(config.label2id), len(word2id))

    if fold_idx is None:
        print('start')
        model_save_path = os.path.join(config.model_path, '{}.bin'.format(model_name))
    else:
        print('start fold: {}'.format(fold_idx + 1))
        model_save_path = os.path.join(
            config.model_path, '{}_fold{}.bin'.format(model_name, fold_idx))

    word_id_list = train_dataset.x_data
    label_id_list = train_dataset.y_data
    model.train(word_id_list, label_id_list)

    y_pred_list = model.predict(train_dataset.x_data)
    train_score = get_score(train_dataset.y_data, y_pred_list)
    y_pred_list = model.predict(val_dataset.x_data)
    val_score = get_score(val_dataset.y_data, y_pred_list)
    msg = 'train score: {0:>6.2%}, val score: {1:>6.2%}'
    print(msg.format(train_score, val_score))
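# A hedged sketch of a k-fold driver for the HMM train() above; it assumes the raw
# data is a pandas DataFrame (as expected by MyDataset) and that splitting by row
# index with sklearn's KFold is appropriate for this task.
from sklearn.model_selection import KFold

def train_folds(df, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=0)
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(df)):
        train(df.iloc[train_idx], df.iloc[val_idx], fold_idx=fold_idx)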
def __init__(self):
    """
    Creates output directories if they don't exist and loads the vocabulary.
    Defines the attributes that depend on the vocab; see the __init__ comments
    in the class attributes.
    """
    # check that the reload directory exists
    if self.dir_reload is not None and not os.path.exists(self.dir_reload):
        print("Weights directory not found ({})".format(self.dir_reload))
        self.dir_reload = None

    # directories for training outputs
    if not os.path.exists(self.dir_output):
        os.makedirs(self.dir_output)
    if not os.path.exists(self.model_output):
        os.makedirs(self.model_output)
    if not os.path.exists(self.dir_plots):
        os.makedirs(self.dir_plots)

    # initialize the files for answers
    with open(self.path_results, "a") as f:
        pass
    with open(self.path_results_final, "a") as f:
        pass

    self.vocab = load_vocab(self.path_vocab)
    self.vocab_size = len(self.vocab)
    self.attn_cell_config["num_proj"] = self.vocab_size
    self.id_PAD = self.vocab[PAD]
    self.id_END = self.vocab[END]

    self.logger = get_logger(self.path_log)
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data=data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save_predict_result(pred_output, ture_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)

    # evaluate
    if true_labels:
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
        except Exception:
            print("error. no true labels")

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight_dict = load_dict(config.lr_feature_weight_path)
        pred_labels = cal_multiclass_lr_predict(data_set, feature_weight_dict, id_label)
        print(pred_labels[:5])
def infer_deep_model(model_type='cnn', data_path='', model_save_path='',
                     label_vocab_path='', max_len=300, batch_size=128,
                     col_sep='\t', pred_save_path=None):
    from keras.models import load_model
    # load data content
    data_set, true_labels = data_reader(data_path, col_sep)
    # init feature
    # the han model needs a [doc, sentence] feature (3-dim); the others use a [sentence] feature (2-dim)
    if model_type == 'han':
        feature_type = 'doc_vectorize'
    else:
        feature_type = 'vectorize'
    feature = Feature(data_set, feature_type=feature_type, is_infer=True, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    model = load_model(model_save_path)
    # predict; in keras, predict_proba is the same as predict
    pred_label_probs = model.predict(data_feature, batch_size=batch_size)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [prob.argmax() for prob in pred_label_probs]
    pred_labels = [id_label[i] for i in pred_labels]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to: %s" % pred_save_path)
    save_predict_result(pred_output, ture_labels=None,
                        pred_save_path=pred_save_path, data_set=data_set)

    if true_labels:
        # evaluate
        assert len(pred_labels) == len(true_labels)
        for label, prob in zip(true_labels, pred_label_probs):
            logger.debug('label_true:%s\tprob_label:%s\tprob:%s'
                         % (label, id_label[prob.argmax()], prob.max()))
        print('total eval:')
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
def init(self):
    if not os.path.exists(self.model_dir):
        os.makedirs(self.model_dir)
    if not os.path.exists(self.summary_dir):
        os.makedirs(self.summary_dir)
    if not os.path.exists(self.logger_dir):
        os.makedirs(self.logger_dir)
    self.logger = self.get_logger(self.logger_path)

    self.vocab_words = load_vocab(self.words_file)
    self.vocab_tags = load_vocab(self.tags_file)
    if self.use_lexicon:
        self.vocab_lexicons = load_vocab(self.lexicons_file)
        self.lexicon_z_embeddings = load_z_vectors(self.lexicon_z_file)
    if self.use_chars:
        self.vocab_chars = load_vocab(self.chars_file)
    else:
        self.vocab_chars = {}
def __init__(self, df, mode='train'):
    self.mode = mode
    self.word2id, _ = load_vocab()
    self.x_data = []
    self.y_data = []
    for i, row in df.iterrows():
        x, y = self.row_to_tensor(row)
        self.x_data.append(x)
        self.y_data.append(y)
def run_evaluate(self, sess, val_set, lr_schedule=None, path_results=None):
    """
    Performs an epoch of evaluation.

    Args:
        sess: (tf.Session)
        val_set: Dataset instance
        lr_schedule: (instance of Lr schedule) optional
        path_results: (string) where to write the results

    Returns:
        scores: dict with bleu score, exact match score and perplexity
    """
    vocab = load_vocab(self.config.path_vocab)
    rev_vocab = {idx: word for word, idx in vocab.items()}
    references, hypotheses = [], []
    n_words, ce_words = 0, 0  # for perplexity: sum of ce over all words + number of words

    for img, formula in minibatches(val_set, self.config.batch_size):
        fd = self.get_feed_dict(img, training=False, formula=formula, dropout=1)
        ce_words_eval, n_words_eval, ids_eval = sess.run(
            [self.ce_words, self.n_words, self.pred_test.ids], feed_dict=fd)

        if self.config.decoding == "greedy":
            ids_eval = np.expand_dims(ids_eval, axis=1)
        elif self.config.decoding == "beam_search":
            ids_eval = np.transpose(ids_eval, [0, 2, 1])

        n_words += n_words_eval
        ce_words += ce_words_eval
        for form, pred in zip(formula, ids_eval):
            # pred is of shape (number of hypotheses, time)
            references.append([form])
            hypotheses.append(pred)

    if path_results is None:
        path_results = self.config.path_results

    scores = evaluate(references, hypotheses, rev_vocab, path_results, self.config.id_END)
    ce_mean = ce_words / float(n_words)
    scores["perplexity"] = np.exp(ce_mean)

    if lr_schedule is not None:
        lr_schedule.update(score=scores["perplexity"])

    return scores
def embed2vec_with_english(spanish_embedding_file, english_embedding_file, vocab_file, dim):
    """embedding -> numpy"""
    vocab = load_vocab(vocab_file)
    print('vocab size is {}'.format(len(vocab)))
    weight_matrix = norm_weight(len(vocab), dim)
    weight_matrix[0] = 0.0
    words_found = 0
    unfind_words = []
    find_words = []
    spanish_embedding_vec = load_embedding(spanish_embedding_file)
    english_embedding_vec = load_embedding(english_embedding_file)
    for index, word in enumerate(vocab):
        if word in spanish_embedding_vec:
            weight_matrix[index] = spanish_embedding_vec[word]
            words_found += 1
            find_words.append(word)
        elif word in english_embedding_vec:
            weight_matrix[index] = english_embedding_vec[word]
            find_words.append(word)
        else:
            unfind_words.append(word)
    print('Found embeddings for {} words, {} words were not found.'.format(
        words_found, len(vocab) - words_found))
    np.savez(config.all_vocab_embedding_file, weights=weight_matrix)
    with open('{}unfind_word.txt'.format(config.multi_task_path), 'wt', encoding='utf-8') as f:
        for line in unfind_words:
            f.write(line + '\n')
    print('Done')
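# The saved .npz is consumed elsewhere (e.g. the Config class below) via
# np.load(...)['weights']; a quick hedged check of that round trip, assuming
# config.all_vocab_embedding_file points at the file written above.
weights = np.load(config.all_vocab_embedding_file)['weights']
print(weights.shape)  # expected: (len(vocab), dim)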
def build_pos_embedding(path, overwrite=False, pos_vocab_path=None,
                        pos_vocab_start=1, pos_dim=64):
    if os.path.exists(path) and not overwrite:
        print("already has %s and use it." % path)
        return load_pkl(path)
    pos_vocab = load_vocab(pos_vocab_path)
    pos_vocab_count = len(pos_vocab) + pos_vocab_start
    pos_emb = np.random.normal(size=(pos_vocab_count, pos_dim)).astype('float32')
    # keep the leading rows (reserved ids such as padding) as zero vectors
    for i in range(pos_vocab_start):
        pos_emb[i, :] = 0.
    # save
    dump_pkl(pos_emb, path, overwrite=True)
    return pos_emb
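# A minimal usage sketch for build_pos_embedding; the paths below are hypothetical.
# Row 0 stays a zero vector, so POS id 0 can serve as the padding index.
pos_emb = build_pos_embedding('output/pos_emb.pkl',
                              pos_vocab_path='output/pos_vocab.txt',
                              pos_vocab_start=1,
                              pos_dim=64)
print(pos_emb.shape)  # (len(pos_vocab) + pos_vocab_start, pos_dim)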
def test(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)

        # Creating the test model
        # Hacky way to get seq_len
        test_set = load_pickle(args, split='test')
        args.config.seq_len = test_set[0]['sentence_len']
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None
        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)

        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        for split in args.eval_splits.split(','):
            test_set = load_pickle(args, split=split)
            results, losses = evaluate(sess, model_test, test_set, args)
            if args.mode != 'train':
                detailed_results(args, split, test_set, rev_vocab, results)
            percent_correct = float(len(results['correct'])) * 100.0 / len(test_set)
            logger.info("correct predictions on %s - %.4f. Eval Losses - %.4f",
                        split, percent_correct, losses)
def analysis(args):
    if args.thread_restrict is True:
        cfg_proto = tf.ConfigProto(intra_op_parallelism_threads=2)
    else:
        cfg_proto = None
    with tf.Session(config=cfg_proto) as sess:
        # Loading the vocabulary files
        vocab, rev_vocab = load_vocab(args)
        args.vocab_size = len(rev_vocab)

        # Creating the eval model; hacky way to get seq_len
        train_set = load_pickle(args, split='train')
        args.config.seq_len = train_set[0]['sentence_len']
        args.config.eval_batch_size = 1
        if args.config.elmo is True:
            elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)
        else:
            elmo = None
        with tf.variable_scope("model", reuse=None):
            model_test = SentimentModel(args, queue=None, mode='eval', elmo=elmo)

        # Reload model from checkpoints, if any
        steps_done = initialize_weights(sess, model_test, args, mode='test')
        logger.info("loaded %d completed steps", steps_done)

        logicnn.append_features(args, train_set, model_test, vocab, rev_vocab)
        dev_set = load_pickle(args, split='dev')
        logicnn.append_features(args, dev_set, model_test, vocab, rev_vocab)
        test_set = load_pickle(args, split='test')
        logicnn.append_features(args, test_set, model_test, vocab, rev_vocab)

        if args.config.elmo is True:
            elmo_embedding_analysis(sess, model_test, test_set)
        else:
            w2v_embedding_analysis(sess, model_test, test_set)
def build_word_embedding(path, overwrite=False, sentence_w2v_path=None,
                         word_vocab_path=None, word_vocab_start=2, w2v_dim=256):
    if os.path.exists(path) and not overwrite:
        print("already has %s and use it." % path)
        return load_pkl(path)
    word_vocab = load_vocab(word_vocab_path)
    w2v_dict_full = load_pkl(sentence_w2v_path)
    word_vocab_count = len(w2v_dict_full) + word_vocab_start
    word_emb = np.zeros((word_vocab_count, w2v_dim), dtype='float32')
    for word in word_vocab:
        index = word_vocab[word]
        if word in w2v_dict_full:
            word_emb[index, :] = w2v_dict_full[word]
        else:
            random_vec = np.random.uniform(-0.25, 0.25, size=(w2v_dim,)).astype('float32')
            word_emb[index, :] = random_vec
    # save
    dump_pkl(word_emb, path, overwrite=True)
    return word_emb
def infer_classic(model_type='xgboost_lr', model_save_path='', label_vocab_path='',
                  test_data_path='', pred_save_path='', feature_vec_path='',
                  col_sep='\t', feature_type='tfidf_word'):
    # load data content
    data_set, true_labels = data_reader(test_data_path, col_sep)
    # init feature
    feature = Feature(data_set, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, is_infer=True)
    # get data feature
    data_feature = feature.get_feature()
    # load model
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path)
    else:
        model = load_pkl(model_save_path)
    # predict
    pred_label_probs = model.predict_proba(data_feature)

    # label id map
    label_id = load_vocab(label_vocab_path)
    id_label = {v: k for k, v in label_id.items()}
    pred_labels = [id_label[prob.argmax()] for prob in pred_label_probs]
    pred_output = [id_label[prob.argmax()] + col_sep + str(prob.max())
                   for prob in pred_label_probs]
    logger.info("save infer label and prob result to:%s" % pred_save_path)
    save(pred_output, ture_labels=None, pred_save_path=pred_save_path, data_set=data_set)

    if 'logistic_regression' in model_save_path and config.is_debug:
        count = 0
        features = load_pkl('output/lr_features.pkl')
        for line in data_set:
            if count > 5:
                break
            count += 1
            logger.debug(line)
            words = line.split()
            for category, category_feature in features.items():
                logger.debug('*' * 43)
                logger.debug(category)
                category_score = 0
                for w in words:
                    if w in category_feature:
                        category_score += category_feature[w]
                        logger.debug("%s:%s" % (w, category_feature[w]))
                logger.debug("%s\t%f" % (category, category_score))
            logger.debug('=' * 43)

    if true_labels:
        # evaluate
        try:
            print(classification_report(true_labels, pred_labels))
            print(confusion_matrix(true_labels, pred_labels))
        except UnicodeEncodeError:
            true_labels_id = [label_id[i] for i in true_labels]
            pred_labels_id = [label_id[i] for i in pred_labels]
            print(classification_report(true_labels_id, pred_labels_id))
            print(confusion_matrix(true_labels_id, pred_labels_id))
def get_vocabs(vocab_file):
    words = load_vocab(vocab_file)
    word_dict = {}
    for i in range(len(words)):
        word_dict[words[i]] = i
    return word_dict
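# An equivalent, more idiomatic form of get_vocabs as a dict comprehension;
# it assumes load_vocab(vocab_file) returns an ordered sequence of words.
def get_vocabs_compact(vocab_file):
    return {word: i for i, word in enumerate(load_vocab(vocab_file))}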
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='',
                  pr_figure_path=''):
    logger.info("train classic model, model_type:{}, feature_type:{}".format(
        model_type, feature_type))
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    word_id = load_vocab(word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    print(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)
    logger.info('data size:%d' % len(data_content))
    logger.info('label size:%d' % len(data_lbl))

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.error('feature type error, using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab,
                      is_infer=False)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        save_pkl(model, model_save_path, overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)

    # analysis lr model
    if config.debug and model_type == "logistic_regression":
        feature_weight = {}
        word_dict_rev = sorted(word_id.items(), key=lambda x: x[1])
        for feature, index in word_dict_rev:
            feature_weight[feature] = list(map(float, model.coef_[:, index]))
        save_dict(feature_weight, config.lr_feature_weight_path)
# The two filter lists must have the same length
filters = [3, 5, 7]
num_filters = [128, 128, 128]
assert len(filters) == len(num_filters)

# - Training
epochs = 10
batch = 256
lr = 0.001
cuda = True
model = "cnn"  # 'cnn' or 'rnn'

# Load vocabulary and make dictionaries
vocabs = load_vocab('data/imdb/imdb.vocab')
w2i = {w: i for i, w in enumerate(vocabs)}
i2w = {i: w for i, w in enumerate(vocabs)}
vocab_size = len(vocabs)

# Load Data
train_x, train_y = load_data('data/', train=True)
train_x, train_y = preprocess(train_x, train_y, w2i, maxlen)

# Build Model & Loss & Optimizer
model = RNN(embedding, rnn_hidden, num_layers, bi, output_dim, vocab_size) \
    if model == 'rnn' else \
    CNN(filters, num_filters, maxlen, vocab_size, embedding, output_dim)

# Loss function & Optimizer
criterion = nn.BCELoss()
optim = torch.optim.Adam(model.parameters(), lr)
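# A hedged sketch of the training loop this setup implies; batching, padding and
# the exact tensor shapes depend on preprocess(), so treat it as illustrative only.
device = torch.device('cuda' if cuda and torch.cuda.is_available() else 'cpu')
model = model.to(device)
for epoch in range(epochs):
    total_loss = 0.0
    for start in range(0, len(train_x), batch):
        x = torch.as_tensor(train_x[start:start + batch], dtype=torch.long, device=device)
        y = torch.as_tensor(train_y[start:start + batch], dtype=torch.float, device=device)
        optim.zero_grad()
        out = model(x).squeeze(-1)   # assumes the model emits one sigmoid score per sample
        loss = criterion(out, y)     # BCELoss expects probabilities and float targets
        loss.backward()
        optim.step()
        total_loss += loss.item()
    print('epoch %d, loss %.4f' % (epoch, total_loss))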
def train_classic(model_type='logistic_regression', data_path='', model_save_path='',
                  feature_vec_path='', col_sep='\t', feature_type='tfidf_word',
                  min_count=1, word_vocab_path='', label_vocab_path='',
                  pr_figure_path=''):
    # load data
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    # save label vocab
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    num_classes = len(set(data_label))
    logger.info('num_classes:%d' % num_classes)

    # init feature
    if feature_type in ['doc_vectorize', 'vectorize']:
        logger.info('feature type error, using tfidf_word instead.')
        feature_type = 'tfidf_word'
    feature = Feature(data=data_content, feature_type=feature_type,
                      feature_vec_path=feature_vec_path, word_vocab=word_vocab)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'xgboost_lr':
        model = XGBLR(model_save_path=model_save_path)
    else:
        model = get_model(model_type)
    # fit
    model.fit(X_train, y_train)
    # save model
    if model_type != 'xgboost_lr':
        dump_pkl(model, model_save_path, overwrite=True)

    # analysis lr model
    if model_type == "logistic_regression" and config.is_debug:
        # show the top features of each category
        weights = model.coef_
        vectorizer = load_pkl(feature_vec_path)
        logger.debug("20 top features of each category:")
        features = dict()
        for idx, weight in enumerate(weights):
            feature_sorted = sorted(zip(vectorizer.get_feature_names(), weight),
                                    key=lambda k: k[1], reverse=True)
            logger.debug("category_" + str(idx) + ":")
            logger.debug(feature_sorted[:20])
            feature_dict = {k[0]: k[1] for k in feature_sorted}
            features[idx] = feature_dict
        dump_pkl(features, 'output/lr_features.pkl', overwrite=True)
    # evaluate
    eval(model, X_val, y_val, num_classes=num_classes, pr_figure_path=pr_figure_path)
def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=512,
                     filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split())

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes: %d', num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s', data_label.shape)

    # init feature
    # the han model needs a [doc, sentence] feature (3-dim); the others use a [sentence] feature (2-dim)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'
    feature = Feature(data=data_content, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = cnn_model(max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          num_filters=num_filters,
                          filter_sizes=filter_sizes,
                          num_classses=num_classes,
                          dropout=dropout)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    cp = ModelCheckpoint(model_save_path, monitor='val_acc', verbose=1, save_best_only=True)
    # fit and save model
    history = model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=nb_epoch,
                        validation_data=(X_val, y_val),
                        callbacks=[cp])
    logger.info('save model:%s' % model_save_path)
    plt_history(history, model_name=model_type)
def __init__(self, input_file_path, seg_input_file_path='', word_vocab_path='',
             label_vocab_path='', feature_vec_path='', model_save_path='',
             pred_save_path='', feature_type='tf_word', model_type='logistic',
             num_classes=2, col_sep='\t', min_count=1, lower_thres=0.5,
             upper_thres=0.85, label_ratio=0.9, label_min_size=200, batch_size=10,
             warmstart_size=0.02, stop_words_path='data/stop_words.txt'):
    self.input_file_path = input_file_path
    self.seg_input_file_path = seg_input_file_path if seg_input_file_path else input_file_path + "_seg"
    self.stop_words_path = stop_words_path
    self.word_vocab_path = word_vocab_path if word_vocab_path else "word_vocab.txt"
    self.label_vocab_path = label_vocab_path if label_vocab_path else "label_vocab.txt"
    self.feature_vec_path = feature_vec_path if feature_vec_path else "feature_vec.pkl"
    self.model_save_path = model_save_path if model_save_path else "model.pkl"
    self.pred_save_path = pred_save_path if pred_save_path else "predict.txt"
    self.feature_type = feature_type
    self.num_classes = num_classes
    self.col_sep = col_sep
    self.min_count = min_count
    self.lower_thres = lower_thres
    self.upper_thres = upper_thres
    self.label_ratio = label_ratio

    # 1. load segment data
    if not os.path.exists(self.seg_input_file_path):
        start_time = time()
        seg_data(self.input_file_path, self.seg_input_file_path,
                 col_sep=self.col_sep, stop_words_path=self.stop_words_path)
        logger.info("spend time: %s s" % (time() - start_time))
    self.seg_contents, self.data_lbl = data_reader(self.seg_input_file_path, self.col_sep)

    # 2. load original data
    self.content, _ = data_reader(self.input_file_path, self.col_sep)

    # 3. load feature
    word_lst = []
    for i in self.seg_contents:
        word_lst.extend(i.split())
    # word vocab
    self.word_vocab = build_vocab(word_lst, min_count=self.min_count, sort=True, lower=True)
    # save word vocab
    write_vocab(self.word_vocab, self.word_vocab_path)
    # label
    label_vocab = build_vocab(self.data_lbl)
    # save label vocab
    write_vocab(label_vocab, self.label_vocab_path)
    label_id = load_vocab(self.label_vocab_path)
    print("label_id: %s" % label_id)
    self.set_label_id(label_id)
    self.id_label = {v: k for k, v in label_id.items()}
    print('num_classes:%d' % self.num_classes)
    self.data_feature = self._get_feature(self.word_vocab)

    # 4. assemble sample DataObject
    self.samples = self._get_samples(self.data_feature)
    self.batch_num = batch_size if batch_size > 1 else batch_size * len(self.samples)
    self.warmstart_num = warmstart_size if warmstart_size > 1 else warmstart_size * len(self.samples)
    self.label_min_num = label_min_size if label_min_size > 1 else label_min_size * len(self.samples)

    # 5. init model
    self.model = get_model(model_type)
class Config:
    multi_task_path = "../input/processing/multi_task_learn/"
    vocab_path = '../input/words.txt'
    vocab_char_path = '../input/char_vocab.txt'
    embed_path = './fasttext/embedding_matrix.npz'
    fasttext_file_path = './fasttext/wiki.es.vec'
    fasttext_file_english_path = './fasttext/wiki.en.vec'
    all_vocab_path = "{}{}".format(multi_task_path, 'all_vocab.txt')
    all_vocab_embedding_file = "{}{}".format(multi_task_path, 'embedding_matrix.npz')
    all_char_vocab_path = "{}{}".format(multi_task_path, 'all_char_vocab.txt')
    all_char_vocab = load_char_vocab(all_char_vocab_path)

    word_vocabs = load_vocab(vocab_path)
    char_vocabs = load_char_vocab(vocab_char_path)
    PAD_WORD = '<PAD>'
    UNK_WORD = '<UNK>'
    vocab_size = len(word_vocabs)

    # character parameters
    use_char_emb = True
    char_vocab_size = len(char_vocabs)
    # char_dim = 50
    char_hidden_size = 64  # for char lstm
    max_word_length = 10
    CHAR_PAD = '@'

    # data path
    input_path = '../input/processing/'
    train_path = input_path + 'train_data.txt'
    english_train_path = input_path + 'english_train.txt'
    spanish_train_path = input_path + 'spanish_train_dedup.txt'
    valid_path = input_path + 'valid_data.txt'
    test_path = input_path + 'test_b.txt'
    model_save_path = './checkpoints/'

    num_workers = 1
    pad_index = 0
    save_model = True
    restore = False  # for restore training
    eval_every = 100
    embed_size = 300
    # hidden_size = random.randint(256, 512)
    hidden_size = 256
    lstm_size = 256
    fc_hidden_size = 512
    linear_size = fc_hidden_size
    num_classes = 2
    max_lengths = random.choice([20, 25])
    batch_size = 64
    lr = random.choice([1e-3])
    # lr = 1e-3
    start_lr = lr
    lr_decay_step = 5
    epochs = 60
    dropout = 0.5
    rnn_dropout = 0.2
    clip = 5.0
    rnn_layers = 3
    batch_not_imporved_throld = 20
    five_fold_path = '../input/processing/5fold/'
    five_fold_save_path = './result/'

    # CNN character parameters
    import random
    user_char_emb = True
    char_dim = 50
    char_kernel_sizes = random.choices([1, 2, 3, 4, 5], k=3)
    char_kernel_nums = random.choices([64, 64, 128, 128], k=3)
    char_output_dim = 128
    embed_dropout = random.uniform(0.1, 0.3)
    pretrained_emb = np.load(embed_path)['weights']

    # for multi task learning
    # multi_task = random.choice([False, True])
    multi_task = True
    multi_task_vocabs = load_vocab(all_vocab_path)
    multi_task_pretraind_emb = np.load(all_vocab_embedding_file)['weights']
    if multi_task:
        char_vocabs = multi_task_vocabs
        pretrained_emb = multi_task_pretraind_emb
        vocab_size = len(multi_task_vocabs)
        char_vocab_size = len(all_char_vocab)
    print('vocab size is ', vocab_size)
    onehot = False
    # early stop

    # ESIM model
    # num_units = 300 + char_output_dim + pos_embedding_size
    num_units = 300
    char_num_units = num_units + char_output_dim
    project_input = True  # whether to project input embeddings to a different dimensionality
    # number of different distances with biases used in the intra-attention model
    distance_biases = random.randint(15, 30)
    max_sentence_length = max_lengths

    # StackBiLSTMMaxout (SSE)
    h_size = [512, 1024, 2048]
    mlp_d = 1600

    # Decomposable Attention model

    # BIMPM
    num_perspective = 50
    word_dim = embed_size
    word_vocab_size = vocab_size
    max_word_len = max_word_length

    # extra features
    use_extra = False
    extra_path = "../input/processing/with_extra_features/"
    train_data_extra = extra_path + 'train_data.csv'
    valid_data_extra = extra_path + 'valid_data.csv'
    test_data_extra = extra_path + 'test.csv'
    if use_extra:
        extra_feature_nums = len(pd.read_csv(train_data_extra, sep='\t').columns) - 3
    else:
        extra_feature_nums = 0
    word_max_lengths = max_word_length
    CHAR_PAD_INDEX = 0
    CHAR_PAD = ' '

    # siamese lstm
    residual = True
num_layers = 2
wdrop = 0.25
dropouti = 0.25

# MPCNN
filter_widths = [1, 2, 3, np.inf]
hidden_layer_units = 512
n_holistic_filters = 300
n_per_dim_filters = 32

msg = ('parameters is hidden_size: {}, max_lengths: {} batch_size {} lr: {}, dropout: {}, use_english: {}'
       .format(hidden_size, max_lengths, batch_size, lr, dropout, multi_task))
print(msg)
attn_type = 'general'  # dot, general, concat
attn_dim = 128  # when concat

# - Training
epochs = 200
batch = 128
lr = 0.001
cuda = torch.cuda.is_available()

# - Attention visualization
show_attn = False
show_ex_num = 123

# Load Data and Build dictionaries
src_train_sent, tar_train_sent = load_data('data/', train=True, small=True)
src_dict, src_cand = load_vocab(src_train_sent)
tar_dict, tar_cand = load_vocab(tar_train_sent)
src_vocab_size = len(src_dict)
tar_vocab_size = len(tar_dict)
src_train, tar_train = preprocess(src_train_sent, tar_train_sent, src_dict, tar_dict, maxlen)

# Build Seq2Seq Model & Loss & Optimizer
model = Seq2seq(embedding_dim, rnn_hidden, num_layers, src_vocab_size,
                tar_vocab_size, bi, attention, attn_type, attn_dim)
criterion = nn.NLLLoss(ignore_index=3)
optim = torch.optim.Adam(model.parameters(), lr)
if cuda:
def train_deep_model(model_type='cnn', data_path='', model_save_path='',
                     word_vocab_path='', label_vocab_path='', min_count=1,
                     max_len=300, batch_size=128, nb_epoch=10, embedding_dim=128,
                     hidden_dim=128, col_sep='\t', num_filters=2,
                     filter_sizes='3,4,5', dropout=0.5):
    # data reader
    data_content, data_lbl = data_reader(data_path, col_sep)
    word_lst = []
    for i in data_content:
        word_lst.extend(i.split(" "))

    # word vocab
    word_vocab = build_vocab(word_lst, min_count=min_count, sort=True, lower=True)
    write_vocab(word_vocab, word_vocab_path)
    # label
    label_vocab = build_vocab(data_lbl)
    write_vocab(label_vocab, label_vocab_path)
    label_id = load_vocab(label_vocab_path)
    logger.info(label_id)
    data_label = [label_id[i] for i in data_lbl]
    # category
    num_classes = len(set(data_label))
    logger.info('num_classes: %d', num_classes)
    data_label = to_categorical(data_label, num_classes=num_classes)
    logger.info('Shape of Label Tensor: %s', data_label.shape)

    # init feature
    # the han model needs a [doc, sentence] feature (3-dim); the others use a [sentence] feature (2-dim)
    if model_type == 'han':
        logger.info('Hierarchical Attention Network model feature_type must be: doc_vectorize')
        feature_type = 'doc_vectorize'
    else:
        logger.info('feature_type: vectorize')
        feature_type = 'vectorize'

    word_dic = {}
    count = 1
    for word in word_vocab:
        word_dic[word] = count
        count += 1
    data_filter = []
    for line in data_content:
        line_filter = " ".join(list(filter(lambda x: x in word_dic, line.split(" "))))
        data_filter.append(line_filter)
    feature = Feature(data=data_filter, feature_type=feature_type,
                      word_vocab=word_vocab, max_len=max_len)
    # get data feature
    data_feature = feature.get_feature()

    X_train, X_val, y_train, y_val = train_test_split(
        data_feature, data_label, test_size=0.1, random_state=0)
    if model_type == 'fasttext':
        model = fasttext_model(max_len=max_len,
                               vocabulary_size=len(word_vocab),
                               embedding_dim=embedding_dim,
                               num_classes=num_classes)
    elif model_type == 'cnn':
        model = load_model(model_save_path)
    elif model_type == 'rnn':
        model = rnn_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)
    else:
        model = han_model(max_len=max_len,
                          vocabulary_size=len(word_vocab),
                          embedding_dim=embedding_dim,
                          hidden_dim=hidden_dim,
                          num_classes=num_classes)

    # loss, accuracy = model.evaluate(X_val, y_val)
    # print(loss, accuracy)
    pre_label = model.predict(X_val, batch_size=32, verbose=0, steps=None)
    print(y_val)
    print(type(y_val))
    with open("./output/result", "w") as f:
        for i in range(len(y_val)):
            f.write("%s\t%f\n" % (y_val[i][2], pre_label[i][2]))