def is_stimm():
    """Sanity check: the reduced GloVe file should have one vector per vocab entry."""
    from preprocess import read_vocab
    from preprocess import vocab

    glove_reduced = "data/glove_100d.txt"
    vocab_file = "data/vocab"
    read_vocab(vocab_file)

    # Count the lines (one embedding per line) in the reduced GloVe file.
    counter = 0
    with open(glove_reduced, "r") as f:
        for _ in f:
            counter += 1

    if len(vocab) == counter:
        print("YESSSSSS")
    print(len(vocab), " <==> ", counter)
def get_dataset_gender_cooccurrences(data_dir, vocab, target_pos=None,
                                     female_nouns=None, male_nouns=None):
    """Get gender cooccurrences for a preprocessed dataset."""
    if isinstance(vocab, str):
        # Load vocab if a path was provided
        vocab = read_vocab(vocab)

    female_cooccur = Counter()
    male_cooccur = Counter()
    for fname in os.listdir(data_dir):
        fpath = os.path.join(data_dir, fname)
        # Read file
        sentences = read_preprocessed_file(fpath, vocab)
        female_cooccur_file, male_cooccur_file = \
            get_sentence_list_gender_cooccurrences(sentences, target_pos=target_pos)
        female_cooccur += female_cooccur_file
        male_cooccur += male_cooccur_file
    return female_cooccur, male_cooccur
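# Example usage (hypothetical paths; target_pos is forwarded unchanged to
# get_sentence_list_gender_cooccurrences, so its expected format depends on
# that function, which is not shown in this excerpt):
female_counts, male_counts = get_dataset_gender_cooccurrences(
    "data/preprocessed/", "data/vocab.txt")
print(female_counts.most_common(10))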
def get_dataset_gender_cooccurrences(data_dir, vocab, target_pos=None,
                                     female_nouns=None, male_nouns=None,
                                     n_jobs=1, verbose=False):
    """Get gender cooccurrences for a preprocessed dataset."""
    if isinstance(vocab, str):
        # Load vocab if a path was provided
        vocab = read_vocab(vocab)

    female_cooccur = Counter()
    male_cooccur = Counter()

    filepaths = [os.path.join(data_dir, fname) for fname in os.listdir(data_dir)]
    # Lazily build one argument tuple per file; files are read in the parent
    # process and the counting is farmed out to the worker pool.
    worker_args = ((read_preprocessed_file(fpath, vocab), target_pos,
                    female_nouns, male_nouns)
                   for fpath in filepaths)

    num_files = len(filepaths)
    LOGGER.info("Processing {} files.".format(num_files))
    update_interval = max(num_files // 1000, 1)

    last_time = time.time()
    with Pool(n_jobs) as pool:
        for idx, res in enumerate(pool.imap_unordered(file_worker, worker_args)):
            female_cooccur_file, male_cooccur_file = res
            female_cooccur += female_cooccur_file
            male_cooccur += male_cooccur_file
            if verbose and ((idx + 1) % update_interval == 0):
                curr_time = time.time()
                LOGGER.info("| Processed {}/{} files. Batch took {}s.".format(
                    idx + 1, num_files, curr_time - last_time))
                last_time = curr_time
    return female_cooccur, male_cooccur
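# file_worker is referenced above but not defined in this excerpt.  A minimal
# sketch, assuming it simply unpacks the per-file argument tuple and delegates
# to the same sentence-level routine the serial version uses (whether
# female_nouns / male_nouns are also forwarded depends on the signature of
# get_sentence_list_gender_cooccurrences, which is not shown):
def file_worker(args):
    sentences, target_pos, female_nouns, male_nouns = args
    return get_sentence_list_gender_cooccurrences(sentences, target_pos=target_pos)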
        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval  # total_loss.[0]
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      epoch, batch, len(train_data) // args.bptt, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()


# load vocab
vocab = preprocess.read_vocab(args.vocab)

# create a JSON index of filenames for the subsequent train/val/test split
# inds = jams.util.find_with_extension(args.data, 'bin')
inds = [os.path.join(args.data, x) for x in os.listdir(args.data) if x.endswith('bin')]

index_train = {}
index_train['id'] = {}
for iteration, ind in enumerate(inds):
    index_train['id'][iteration] = os.path.basename(ind)
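# The snippet builds index_train but stops before writing it out.  A minimal
# sketch of the save step, assuming the file name idx_train.json that the
# generation script below reads back with pandas:
import json

with open('idx_train.json', 'w') as f:
    json.dump(index_train, f)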
else:
    torch.cuda.manual_seed(args.seed)

if args.temperature < 1e-3:
    parser.error("--temperature has to be greater than or equal to 1e-3")

with open(args.checkpoint, 'rb') as f:
    model = torch.load(f)
model.eval()

if args.cuda:
    model.cuda()
else:
    model.cpu()

vocab = preprocess.read_vocab(os.path.join(args.data, 'VOCAB.txt'))
idx_train = pd.read_json('idx_train.json')
idx_val = pd.read_json('idx_val.json')
idx_test = pd.read_json('idx_test.json')

# Load pretrained embeddings
gn_glove_dir = './gn_glove/1b-vectors300-0.8-0.8.txt'
# 142527 tokens, last one is '<unk>'
ntokens = sum(1 for line in open(gn_glove_dir)) + 1

with open(gn_glove_dir) as f:
    gn_glove_vecs = np.zeros((ntokens, 300))
    words2idx_emb = {}
    idx2words_emb = []
    # ordered_words = []
    for i, line in enumerate(f):
        try:
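# The excerpt above is cut off inside the embedding-reading loop.  A minimal,
# self-contained sketch of how a GloVe-style text file is commonly parsed
# (hypothetical helper, not the original loop body; assumes one
# "<word> <float> ... <float>" entry per line):
import numpy as np

def load_text_embeddings(path, dim=300):
    vecs, words2idx, idx2words = [], {}, []
    with open(path) as fh:
        for line in fh:
            parts = line.rstrip().split()
            if len(parts) != dim + 1:
                continue  # skip malformed lines
            words2idx[parts[0]] = len(idx2words)
            idx2words.append(parts[0])
            vecs.append(np.asarray(parts[1:], dtype=np.float32))
    return np.vstack(vecs), words2idx, idx2words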
# # Keep sentences which are shorter than max_len = 25
# data_test = data_test[data_test['target'].str.split(" ").str.len() <= max_len]
# data_test = data_test[data_test['source'].str.split(" ").str.len() <= max_len]
data_test = data_test[(data_test['source'].str.split(" ").str.len() <= max_len) &
                      (data_test['target'].str.split(" ").str.len() <= max_len)]
data_test = data_test.reset_index(drop=True)

# Concatenate train and test data
entire_data = pd.concat([data, data_test], axis=0)

# Tokenize the sentences using the language-specific tokenizers
src_sents = pp.preprocess_func(entire_data['source'], tokenizers[source_lang], min_word_count)
trgt_sents = pp.preprocess_func(entire_data['target'], tokenizers[target_lang], min_word_count)

# Generate source and target vocab from the training data
src_vocab = pp.read_vocab(src_sents)
trgt_vocab = pp.read_vocab(trgt_sents)

# Split the data into train and test indices
training_indices = np.arange(0, 19431)
test_indices = np.arange(19431, 19947)

# Generate preprocessed training and test data from the indices
training_source = [src_sents[i] for i in training_indices]
test_source = [src_sents[i] for i in test_indices]
training_target = [trgt_sents[i] for i in training_indices]
test_target = [trgt_sents[i] for i in test_indices]

# Pad the tensors with 0's up to the maximum length (see the sketch below)
max_seq_length = max_len + 2  # 2 for EOS_token and SOS_token
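# A minimal sketch of the padding step described above (hypothetical helper;
# assumes sentences are already lists of token ids that include the SOS/EOS
# tokens and are at most max_seq_length long):
def pad_ids(token_ids, max_seq_length, pad_id=0):
    """Right-pad a tokenized sentence with pad_id up to max_seq_length."""
    return token_ids + [pad_id] * (max_seq_length - len(token_ids))

# e.g. pad_ids([1, 42, 7, 2], max_seq_length=27) -> length-27 list ending in 0's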
    print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))

    # Confusion matrix
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


if __name__ == '__main__':
    if len(sys.argv) != 2 or sys.argv[1] not in ['train', 'test']:
        raise ValueError("""usage: python run_cnn.py [train / test]""")

    print('Configuring CNN model...')
    config = TCNNConfig()
    if not os.path.exists(vocab_dir):  # rebuild the vocabulary if it does not exist
        build_vocab(train_dir, vocab_dir, config.vocab_size)
    categories, cat_to_id = read_category()
    words, word_to_id = read_vocab(vocab_dir)
    config.vocab_size = len(words)
    model = TextCNN(config)

    # train()
    if sys.argv[1] == 'train':
        train()
    else:
        test()
def vector_to_dict():
    """Map each vocabulary word to its embedding vector."""
    read_vocab()
    vector_read()
    dictionary = dict(zip(vocab_array, vector_array))
    return dictionary
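# Hypothetical usage, assuming read_vocab() and vector_read() populate the
# module-level vocab_array and vector_array in matching order:
embeddings = vector_to_dict()
some_vector = embeddings.get("the")  # None if "the" is not in the vocabulary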