def main():
    pos_train = './data/train_pos_full.txt'
    neg_train = './data/train_neg_full.txt'
    vocab = './data/vocab.dat'
    inv_vocab = './data/inv_vocab.dat'
    build_vocab([pos_train, neg_train], DataSet.PAD_WORD, vocab, inv_vocab)
def _process_keras(
        features_train: List[str],
        labels: List[int],
        path_embeddings: str
) -> Tuple[Dict[str, int], Optional[Word2Vec], Dict[str, int]]:
    """
    Processes the training sentences for a Keras model.

    :param features_train: training sentences
    :param labels: label set
    :param path_embeddings: path to the pre-trained embeddings
    :return: dataset vocabulary, word2vec model, word2vec vocabulary
    """
    print("Loading pre-trained embeddings...")
    # load the w2v matrix with gensim
    w2v = gensim.models.KeyedVectors.load_word2vec_format(path_embeddings, binary=True)

    # drop empty sentences and their labels; iterating backwards avoids
    # skipping elements while deleting from the list
    for i in range(len(features_train) - 1, -1, -1):
        if not features_train[i]:
            del features_train[i]
            del labels[i]

    # build the vocab from the w2v model
    w2v_vocab = preprocess.vocab_from_w2v(w2v)
    print("Word2Vec model vocab len:", len(w2v_vocab))

    # build the vocab from the dataset
    data_vocab = preprocess.build_vocab([features_train])

    # restrict the pre-trained w2v to the words that occur in the dataset
    w2v = utils.restrict_w2v(w2v, set(data_vocab.keys()))
    w2v_vocab = preprocess.vocab_from_w2v(w2v)
    utils.write_dictionary(config.TRAIN_VOCAB, w2v_vocab)
    print("Cleaned vocab len:", len(w2v_vocab))

    return data_vocab, w2v, w2v_vocab
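# A minimal sketch of the two project helpers used above, assuming a gensim
# 4.x KeyedVectors object; `vocab_from_w2v` and `restrict_w2v` are this
# project's own utilities, so the real implementations may differ.
from typing import Dict, Set
import numpy as np
from gensim.models import KeyedVectors

def vocab_from_w2v(w2v: KeyedVectors) -> Dict[str, int]:
    # map each embedded word to its row index in the vector matrix
    return {word: idx for idx, word in enumerate(w2v.index_to_key)}

def restrict_w2v(w2v: KeyedVectors, keep: Set[str]) -> KeyedVectors:
    # rebuild the model with only the words that appear in `keep`
    kept = [w for w in w2v.index_to_key if w in keep]
    restricted = KeyedVectors(vector_size=w2v.vector_size)
    restricted.add_vectors(kept, np.stack([w2v[w] for w in kept]))
    return restricted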
def get_data():
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    vocab_size = len(vocabulary)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # split train/dev set
    # there are a total of 10662 labeled examples to train on
    x_train, x_dev = x_shuffled[:-1000], x_shuffled[-1000:]
    y_train, y_dev = y_shuffled[:-1000], y_shuffled[-1000:]
    sentence_size = x_train.shape[1]

    print('Train/Dev split: %d/%d' % (len(y_train), len(y_dev)))
    print('train shape:', x_train.shape)
    print('dev shape:', x_dev.shape)
    print('vocab_size', vocab_size)
    print('sentence max words', sentence_size)

    return Data(x_train, y_train, x_dev, y_dev, vocab_size, sentence_size)
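# Minimal sketches of the preprocessing helpers called above (hypothetical
# implementations following the common CNN-for-text recipe; the "<PAD/>"
# token and exact ordering are assumptions, not necessarily this repo's):
from collections import Counter
import itertools
import numpy as np

def pad_sentences(sentences, pad_token="<PAD/>"):
    # right-pad every tokenized sentence to the length of the longest one
    max_len = max(len(s) for s in sentences)
    return [s + [pad_token] * (max_len - len(s)) for s in sentences]

def build_vocab(sentences):
    # most frequent word gets index 0
    counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [word for word, _ in counts.most_common()]
    vocabulary = {word: i for i, word in enumerate(vocabulary_inv)}
    return vocabulary, vocabulary_inv

def build_input_data(sentences, labels, vocabulary):
    # map tokens to integer ids and wrap everything in numpy arrays
    x = np.array([[vocabulary[w] for w in s] for s in sentences])
    y = np.array(labels)
    return x, y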
def main(args):
    data = json.load(open(args.input_refexps_json, 'r'))

    # find the longest tokenized referring expression
    max_length = 0
    all_refexps = []
    for key in data:
        for ref_id in data[key]:
            all_refexps.append(data[key][ref_id])
    for r in all_refexps:
        t = tokenize(r, punct_to_keep=[',', ';'], punct_to_remove=['?', '.'])
        if len(t) > max_length:
            max_length = len(t)

    refexp_token_to_idx = build_vocab(
        all_refexps,
        punct_to_keep=[',', ';'],
        punct_to_remove=['?', '.']
    )
    with open(args.output_vocab_json, 'w') as f:
        json.dump(refexp_token_to_idx, f)

    with h5py.File(args.output_refexps_h5df, 'w') as f:
        for key in data:
            one_image_refexps = [data[key][ref_id] for ref_id in data[key]]

            # encode every expression for this image
            one_image_refexps_to_idx = []
            for refexp in one_image_refexps:
                tokens = tokenize(refexp, punct_to_remove=['?', '.'],
                                  punct_to_keep=[';', ','])
                one_image_refexps_to_idx.append(encode(tokens, refexp_token_to_idx))

            # pad every encoded expression to max_length with <NULL>
            for refexp_ in one_image_refexps_to_idx:
                num_null = max_length - len(refexp_)
                if num_null > 0:
                    refexp_ += [refexp_token_to_idx['<NULL>']] * num_null

            f.create_dataset(key, data=np.asarray(one_image_refexps_to_idx,
                                                  dtype=np.int32))
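# The `tokenize` and `encode` helpers above follow the CLEVR-style
# preprocessing utilities; a minimal sketch under that assumption (the
# project's real versions may differ in detail):
def tokenize(s, delim=' ', punct_to_keep=None, punct_to_remove=None):
    # detach kept punctuation into its own token, strip removed punctuation
    if punct_to_keep is not None:
        for p in punct_to_keep:
            s = s.replace(p, '%s%s' % (delim, p))
    if punct_to_remove is not None:
        for p in punct_to_remove:
            s = s.replace(p, '')
    return s.split(delim)

def encode(tokens, token_to_idx, allow_unk=False):
    # map tokens to integer ids, optionally falling back to <UNK>
    seq = []
    for token in tokens:
        if token not in token_to_idx:
            if not allow_unk:
                raise KeyError('Token "%s" not in vocab' % token)
            token = '<UNK>'
        seq.append(token_to_idx[token])
    return seq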
def main():
    pos_file = './data/train_pos.txt'
    neg_file = './data/train_neg.txt'
    validation = './data/test_data.txt'
    stopwords = './data/stopwords.txt'
    vocab_file = 'vocab.dat'
    inv_vocab_file = 'inv_vocab.dat'
    cooc_file = 'cooc.dat'
    embeddings_file = 'embeddings.dat'
    label_file = 'labels.dat'
    submission_file = 'submission.csv'

    glove_seed = 1234
    kmeans_seed = 4321
    xgb_seed = 1337
    sampler_seed = 7331

    # pipeline: vocabulary -> co-occurrence matrix -> GloVe embeddings
    # -> k-means cluster labels -> XGBoost classifier
    build_vocab([pos_file, neg_file], stopwords, vocab_file, inv_vocab_file,
                cutoff=5)
    vocab = load_pickled(vocab_file)
    inv_vocab = load_pickled(inv_vocab_file)
    build_cooc([pos_file, neg_file], vocab, cooc_file)
    train_glove(cooc_file, embeddings_file, glove_seed)
    train_kmeans(embeddings_file, label_file, kmeans_seed)
    train_xgb(vocab_file, pos_file, neg_file, label_file, validation,
              submission_file, xgb_seed, sampler_seed)
def main():
    NAME_IDX = 5
    data_file = '../data/the_office_scripts.csv'

    length = get_num_lines(data_file)
    data_names = get_data(data_file, NAME_IDX, length)
    vocab = build_vocab(data_names)
    data_ids = convert_to_id(vocab, data_names)
    train_data, test_data = split_data(data_ids, length)

    num_tokens = len(vocab)
    model = RNN_WORD_Model(num_tokens)
    for i in range(40):
        train(model, train_data)
    test(model, test_data)
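# Minimal sketches of the two vocabulary helpers above (hypothetical
# implementations; the project's real versions may differ):
def build_vocab(tokens):
    # one id per unique token
    return {token: i for i, token in enumerate(sorted(set(tokens)))}

def convert_to_id(vocab, tokens):
    # replace every token with its vocabulary id
    return [vocab[token] for token in tokens]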
for row in val_data[tags_predicted].iterrows():
    val_targets.append(list(row[1].values))

train_sample_num = len(train_targets)
if params['loss_weight_on']:
    # weight each task's positive class by its negative/positive ratio
    loss_weights = {}
    for task_id in range(len(tags_predicted)):
        loss_weights[task_id] = torch.Tensor(
            [(train_sample_num - pos_num_tags[task_id]) / pos_num_tags[task_id]]
        ).to(device)
else:
    loss_weights = None

train_X = train_data[steps_token]
val_X = val_data[steps_token]
test_X = test_data[steps_token]

all_train_tokens = all_tokens_list(train_X)
max_vocab_size = len(set(all_train_tokens))
token2id, id2token = build_vocab(all_train_tokens, max_vocab_size)
emb_weight = build_emb_weight(words_emb_dict, id2token)

train_data_indices = token2index_dataset(train_X, token2id)
val_data_indices = token2index_dataset(val_X, token2id)
test_data_indices = token2index_dataset(test_X, token2id)

# batchify datasets
batch_size = params['batch_size']
max_sent_len = np.array([94, 86, 87, 90, 98, 91])
train_loader, val_loader, test_loader = create_dataset_obj(
    train_data_indices, val_data_indices, test_data_indices,
    train_targets, val_targets, test_targets,
    batch_size, max_sent_len, collate_func)

val_auc, val_acc, model_to_test = train_model(
    params, emb_weight, train_loader, val_loader, test_loader, loss_weights)
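# A minimal sketch of `build_emb_weight` (hypothetical; assumes `id2token` is
# a list whose index 0 is the padding token, and that `words_emb_dict` maps
# words to pre-trained vectors):
import numpy as np

def build_emb_weight(words_emb_dict, id2token, emb_dim=300):
    # id 0 stays all-zero for padding; unknown words get a small random vector
    weight = np.zeros((len(id2token), emb_dim), dtype=np.float32)
    for idx, token in enumerate(id2token):
        if idx == 0:
            continue
        vec = words_emb_dict.get(token)
        weight[idx] = vec if vec is not None else np.random.normal(scale=0.1, size=emb_dim)
    return weight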
n_epochs = 10000
embedding_dim = 500
hidden_dim = 126
layer_dim = 2
output_dim = 1
seq_dim = embedding_dim

configure(tensor_board_log_dir)

# %%
# Pre-process the data ========================================================
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

word_2_int, int_2_word = preprocess.build_vocab(train, test)
max_sent_len = preprocess.longest_sentence_length(train, test)
print('Longest sentence: {}'.format(max_sent_len))

train = preprocess.MSRPWordVectorsDataSet(train, word_2_int, max_sent_len, GPU)
test = preprocess.MSRPWordVectorsDataSet(test, word_2_int, max_sent_len, GPU)
train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=batch_size,
                                          shuffle=False)
def main(stopword_file, infiles):
    build_vocab(infiles, stopword_file, 'vocab.dat', 'inv_vocab.dat', cutoff=5)
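# A minimal sketch of the `build_vocab` signature shared by this snippet and
# the GloVe pipeline above (hypothetical implementation): count tokens across
# the input files, drop stopwords and words below `cutoff`, pickle both maps.
import pickle
from collections import Counter

def build_vocab(infiles, stopword_file, vocab_file, inv_vocab_file, cutoff=5):
    with open(stopword_file) as f:
        stopwords = set(f.read().split())
    counts = Counter()
    for path in infiles:
        with open(path) as f:
            for line in f:
                counts.update(t for t in line.split() if t not in stopwords)
    kept = [w for w, c in counts.most_common() if c >= cutoff]
    vocab = {w: i for i, w in enumerate(kept)}
    inv_vocab = {i: w for w, i in vocab.items()}
    with open(vocab_file, 'wb') as f:
        pickle.dump(vocab, f)
    with open(inv_vocab_file, 'wb') as f:
        pickle.dump(inv_vocab, f)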
# Data Preparation
# ==================================================

# Load data
print("Loading text data...")
qq, ll, cc, aa = preprocess.train_data()
assert len(qq) == len(ll) == len(cc) == len(aa)

# Build vocabulary using the same preprocessor
print("Building vocabularies...")
c = [y for x in cc for y in x]  # flatten contexts into single sentences
max_len = max([len(x) for x in c])
vocab = learn.preprocessing.VocabularyProcessor(max_len)
vocab.fit(c)
vocab.save('save/vocab.pkl')

print("Convert text to data...")
train_q = preprocess.build_vocab(qq, vocab)
train_c = [preprocess.build_vocab(x, vocab) for x in cc]
train_l = [preprocess.build_vocab(x, vocab) for x in ll]
train_a = [train_l[i][x - 1] for i, x in enumerate(aa)]
assert len(train_q) == len(train_c) == len(train_l) == len(train_a)

# Shuffle data
def shuf_data(data):
    '''Return shuffled data: [context, question, right_ans, wrong_ans]'''
    print("Shuffle data...")
    shuf_idx = np.random.permutation(np.arange(len(data)))
    data_shuf = [data[i] for i in shuf_idx]
    print("Done shuffle.")
    return data_shuf
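# `preprocess.build_vocab(texts, vocab)` above presumably wraps the fitted
# VocabularyProcessor's transform step; a minimal sketch under that assumption:
import numpy as np

def build_vocab(texts, vocab):
    # map each text to a fixed-length array of token ids
    return np.array(list(vocab.transform(texts)))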
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print("Must give one of --input_vocab_json or --output_vocab_json")
        return

    print("Loading questions...")
    with open(args.input_questions, 'r') as f:
        questions = f.read().split("\n")
    questions = questions[:-1]  # drop the trailing empty line

    print("Loading answers...")
    with open(args.input_answers, 'r') as f:
        answers = f.read().split("\n")
    answers = answers[:-1]

    # Either create the vocab or load it from disk
    if args.input_vocab_json == "" or args.expand_vocab == 1:
        print("Building vocab...")
        # map answer tokens to unique ids
        answer_token_to_idx = build_vocab(answers)
        # map the tokens of all questions to unique ids
        question_token_to_idx = build_vocab(
            questions,
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.'])

        # build the reverse idx-to-token mappings
        answer_idx_to_token = {v: k for k, v in answer_token_to_idx.items()}
        question_idx_to_token = {v: k for k, v in question_token_to_idx.items()}

        # bundle all four dictionaries into a single JSON-serializable vocab
        vocab = {
            "question_token_to_idx": question_token_to_idx,
            "answer_token_to_idx": answer_token_to_idx,
            "question_idx_to_token": question_idx_to_token,
            "answer_idx_to_token": answer_idx_to_token
        }

    if args.input_vocab_json != "":
        print("Loading vocab...")
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab["question_token_to_idx"]:
                if word not in vocab["question_token_to_idx"]:
                    print("Found new word %s" % word)
                    idx = len(vocab["question_token_to_idx"])
                    vocab["question_token_to_idx"][word] = idx
                    num_new_words += 1
            print("Found %d new words" % num_new_words)

    if args.output_vocab_json != "":
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions: convert question strings to integer id sequences
    print("Encoding data")
    questions_encoded = []
    _answers = []
    for question, answer in zip(questions, answers):
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab["question_token_to_idx"],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)
        _answers.append(vocab["answer_token_to_idx"][answer])

    # Pad encoded questions with <NULL> up to the longest question
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab["question_token_to_idx"]["<NULL>"])

    # Create the h5 dataset file
    print("Writing output")
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    print("Questions encoded shape is {}".format(questions_encoded.shape))
    print("Length of answer tokens is {}".format(len(vocab["answer_token_to_idx"])))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset("questions", data=questions_encoded)
        if len(_answers) > 0:
            f.create_dataset("answers", data=np.asarray(_answers))
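# The `build_vocab` used in this snippet and the next takes `min_token_count`
# and punctuation options; a minimal sketch in the CLEVR style (hypothetical:
# the special tokens and counting details may differ from the project's):
from collections import Counter

def build_vocab(sequences, min_token_count=1,
                punct_to_keep=None, punct_to_remove=None):
    # count tokens across all sequences, then assign ids after the specials
    counts = Counter()
    for seq in sequences:
        counts.update(tokenize(seq, punct_to_keep=punct_to_keep,
                               punct_to_remove=punct_to_remove))
    token_to_idx = {'<NULL>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3}
    for token, count in sorted(counts.items()):
        if count >= min_token_count:
            token_to_idx[token] = len(token_to_idx)
    return token_to_idx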
def main(args):
    if (args.input_vocab_json == '') and (args.output_vocab_json == ''):
        print('Must give one of --input_vocab_json or --output_vocab_json')
        return

    print('Loading data...')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab...')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((str(q['answer']) for q in questions))
        question_token_to_idx = build_vocab(
            [q['question'] for q in questions],
            min_token_count=args.unk_threshold,
            punct_to_keep=[';', ','],
            punct_to_remove=['?', '.']
        )
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_str(q['program'], args.mode)
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
        }

    if args.input_vocab_json != '':
        print('Loading vocab...')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str = program_to_str(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][str(q['answer'])])

    # Pad encoded questions and programs with <NULL>
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])
    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])

    # Create the h5 file
    print('Writing output')
    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = np.asarray(programs_encoded, dtype=np.int32)
    print('Questions encoded shape is {}'.format(questions_encoded.shape))
    print('Programs encoded shape is {}'.format(programs_encoded.shape))
    with h5py.File(args.output_h5_file, 'w') as f:
        f.create_dataset('questions', data=questions_encoded)
        f.create_dataset('image_idxs', data=np.asarray(image_idxs))
        f.create_dataset('orig_idxs', data=np.asarray(orig_idxs))
        if len(programs_encoded) > 0:
            f.create_dataset('programs', data=programs_encoded)
        if len(question_families) > 0:
            f.create_dataset('question_families', data=np.asarray(question_families))
        if len(answers) > 0:
            f.create_dataset('answers', data=np.asarray(answers))
print(metrics.classification_report(y_test_cls, y_pred_cls,
                                    target_names=categories))

# Confusion matrix
print("Confusion Matrix...")
cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
print(cm)

time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):
        # rebuild the vocabulary file if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    if sys.argv[1] == 'train':
        train()
    else:
        test()
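# This `build_vocab(train_dir, vocab_dir, vocab_size)` variant writes a plain
# text vocabulary file; a minimal character-level sketch (hypothetical: the
# assumed "<label>\t<content>" line format may differ from the repo's):
from collections import Counter

def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    counter = Counter()
    with open(train_dir, encoding='utf-8') as f:
        for line in f:
            # count the characters of the content, skipping the label column
            _, _, content = line.partition('\t')
            counter.update(content.strip())
    words = ['<PAD>'] + [w for w, _ in counter.most_common(vocab_size - 1)]
    with open(vocab_dir, 'w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')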