if __name__ == '__main__':
    if len(sys.argv) > 1 and sys.argv[1] == 'conceptnet':
        preprocess_conceptnet('conceptnet-assertions-5.5.5.csv')
        exit(0)
    init_tokenizer()

    # --- middle
    # preprocess_dataset('./data/race/dev_middle')
    # preprocess_dataset('./data/race/test_middle')
    # preprocess_dataset('./data/race/train_middle')
    import utils
    train_data = utils.load_data('./data/race/train_middle-processed.json')
    dev_data = utils.load_data('./data/race/dev_middle-processed.json')
    # test_data = utils.load_data('./data/race/test_middle-processed.json')
    utils.build_vocab(train_data + dev_data)
    # utils.build_vocab(test_data)

    # --- high
    # preprocess_dataset_race('./data/dev_high')
    # preprocess_dataset_race('./data/test_high')
    # preprocess_dataset_race('./data/train_high')
    # preprocess_dataset_sciq('./data/train_sciq.json')
    # preprocess_dataset_sciq('./data/test_sciq.json')
    # preprocess_dataset_tqa('./data/tqa_v1_val.json')
    # preprocess_dataset_tqa('./data/tqa_v1_train.json')
    # preprocess_dataset_sciq('./data/valid_sciq.json')
    # import utils
    # train_data = utils.load_data('./data/train_high-processed.json')
    # dev_data = utils.load_data('./data/dev_high-processed.json')
import numpy as np
import torch
from torch.utils import data
import json

from consts import NONE, PAD, CLS, SEP, UNK, ATTITUDES, ARGUMENTS, ENTITIES
from utils import build_vocab
from pytorch_pretrained_bert import BertTokenizer

# init vocab
all_attitudes, attitude2idx, idx2attitude = build_vocab(ATTITUDES)
all_entities, entity2idx, idx2entity = build_vocab(ENTITIES)
all_arguments, argument2idx, idx2argument = build_vocab(ARGUMENTS, BIO_tagging=False)


class MPQADataset(data.Dataset):
    def __init__(self, fpath, bert_size='base'):
        self.sent_li, self.entities_li, self.attitudes_li, self.arguments_li = [], [], [], []
        bert_string = 'bert-{}-uncased'.format(bert_size)
        self.tokenizer = BertTokenizer.from_pretrained(
            bert_string, do_lower_case=True, never_split=(PAD, CLS, SEP, UNK))

        with open(fpath, 'r') as f:
            data = json.load(f)
            assert isinstance(data, list)
            for item_index, item in enumerate(data):
def __init__(self, config):
    super().__init__()
    self.config = config
    annos = read_json(config['anno_file'])[config['emo_type']]

    ##################################################
    # The authors had commented this block out; it toggles whether only the
    # training split is used. In train_test mode all data should be
    # preprocessed in advance.
    if not config["val_file"]:
        print("Caution! Loading Samples from {}".format(config['id_file']))
        ids = []
        tmp_annos = []
        with open(config['id_file']) as fin:
            for line in fin.readlines():
                ids.append(int(line.strip()))
        for jj, anno in enumerate(annos):
            if jj in ids:
                tmp_annos.append(anno)
        annos = tmp_annos
    ##################################################

    emo_num = 9 if config['emo_type'] == 'primary' else 14
    self.emotion_classes = EMOTIONS[:emo_num]
    data = read_json(config['data_file'])

    self.visual_features, self.audio_features, self.text_features = [], [], []
    self.visual_valids, self.audio_valids, self.text_valids = [], [], []
    ################################
    # Store the concepts here so the knowledge graph can be attached later.
    self.visual_concepts, self.audio_concepts, self.text_concepts = list(), list(), list()
    self.visual_concepts_lengths, self.audio_concepts_lengths, self.text_concepts_lengths = list(), list(), list()
    ################################
    self.labels = []
    self.charcaters_seq = []
    self.time_seq = []
    self.target_loc = []
    self.seg_len = []
    self.n_character = []

    vfe = VisualFeatureExtractor(config)
    afe = AudioFeatureExtractor(config)
    tfe = TextFeatureExtractor(config)
    pfe = PersonalityFeatureExtractor(config)
    self.personality_list = pfe.get_features()  # n_c
    self.personality_features = []

    ###################################
    print("Processing Concepts")
    self.concept2id_v, self.id2concept_v = build_vocab(config, 'visual')
    self.concept2id_a, self.id2concept_a = build_vocab(config, 'audio')
    self.concept2id_t, self.id2concept_t = build_vocab(config, 'text')
    vfe.concepts2id = self.concept2id_v
    afe.concepts2id = self.concept2id_a
    tfe.concepts2id = self.concept2id_t
    # print(self.concept2id_t)
    # print(afe.concepts2id)
    assert config["visual"]["concept_size"] == len(self.concept2id_v), \
        "the size of concept in config ({}) mismatches the size captured from data ({})".format(
            config["visual"]["concept_size"], len(self.concept2id_v))
    assert config["audio"]["concept_size"] == len(self.concept2id_a), \
        "the size of concept in config ({}) mismatches the size captured from data ({})".format(
            config["audio"]["concept_size"], len(self.concept2id_a))
    assert config["text"]["concept_size"] == len(self.concept2id_t), \
        "the size of concept in config ({}) mismatches the size captured from data ({})".format(
            config["text"]["concept_size"], len(self.concept2id_t))
    ###################################

    ###################################
    print("Processing Knowledge")
    vectors = Magnitude(config["knowledge"]["embedding_file"])
    self.embedding_concept_v = get_concept_embedding(self.concept2id_v, config, vectors)
    self.embedding_concept_a = get_concept_embedding(self.concept2id_a, config, vectors)
    self.embedding_concept_t = get_concept_embedding(self.concept2id_t, config, vectors)
    self.edge_matrix_v, self.affectiveness_v = build_kb(self.concept2id_v, config, "visual")
    self.edge_matrix_a, self.affectiveness_a = build_kb(self.concept2id_a, config, "audio")
    self.edge_matrix_t, self.affectiveness_t = build_kb(self.concept2id_t, config, "text")
    ###################################

    print('Processing Samples...')
    for jj, anno in enumerate(tqdm(annos)):
        # if jj >= 300: break
        clip = anno['clip']
        target_character = anno['character']
        target_moment = anno['moment']
        on_characters = data[clip]['on_character']
        if target_character not in on_characters:
            on_characters.append(target_character)
        on_characters = sorted(on_characters)

        charcaters_seq, time_seq, target_loc, personality_seq = [], [], [], []
        for ii in range(len(data[clip]['seg_start'])):
            for character in on_characters:
                charcaters_seq.append([0 if character != i else 1 for i in range(len(config['speakers']))])
                time_seq.append(ii)
                personality_seq.append(self.personality_list[character])
                if character == target_character and data[clip]['seg_start'][ii] <= target_moment < data[clip]['seg_end'][ii]:
                    target_loc.append(1)
                else:
                    target_loc.append(0)
        # for character in on_characters:
        #     for ii in range(len(data[clip]['seg_start'])):
        #         charcaters_seq.append([0 if character != i else 1 for i in range(len(config['speakers']))])
        #         time_seq.append(ii)
        #         personality_seq.append(self.personality_list[character])
        #         if character == target_character and data[clip]['seg_start'][ii] <= target_moment < data[clip]['seg_end'][ii]:
        #             target_loc.append(1)
        #         else:
        #             target_loc.append(0)

        ####################################################
        # The *c values are the corresponding concepts, read into lists;
        # how to handle the action features is still undecided.
        vf, v_valid, vc = vfe.get_feature(anno['clip'], target_character)  # seqlen * n_c, dim_features_v
        af, a_valid, ac = afe.get_feature(anno['clip'], target_character)  # seqlen * n_c, dim_features_a
        tf, t_valid, tc = tfe.get_feature(anno['clip'], target_character)  # seqlen * n_c, dim_features_t
        ####################################################

        self.n_character.append(len(on_characters))
        self.seg_len.append(len(data[clip]['seg_start']))
        self.personality_features.append(torch.stack(personality_seq))  # num_anno, seqlen * n_c, dim_features_p
        self.charcaters_seq.append(torch.tensor(charcaters_seq))  # num_anno, seqlen * n_c, some
        self.time_seq.append(torch.tensor(time_seq))  # num_anno, seqlen * n_c, some
        self.target_loc.append(torch.tensor(target_loc, dtype=torch.int8))  # num_anno, seqlen * n_c
        self.visual_features.append(vf)  # num_anno, seqlen * n_c, dim_features_v
        self.audio_features.append(af)  # num_anno, seqlen * n_c, dim_features_a
        self.text_features.append(tf)  # num_anno, seqlen * n_c, dim_features_t
        self.visual_valids.append(v_valid)  # num_anno, seqlen * n_c
        self.audio_valids.append(a_valid)  # num_anno, seqlen * n_c
        self.text_valids.append(t_valid)  # num_anno, seqlen * n_c

        #######################################################
        # Store the concepts sample by sample.
        lengths = list()
        vc_new = list()
        for concepts in vc:
            new = torch.zeros(512, dtype=torch.long)
            lengths.append(concepts.size(0))
            new[:concepts.size(0)] = concepts[:]
            vc_new.append(new)
        self.visual_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8))  # num_anno, seqlen
        # assert len(vc_new) == len(vc) and len(vc_new) == len(data[clip]['seg_start'])

        ac_new = list()
        lengths = list()
        for concepts in ac:
            # print(concepts)
            new = torch.zeros(512, dtype=torch.long)  # max_num_concept
            lengths.append(concepts.size(0))
            new[:concepts.size(0)] = concepts[:]
            ac_new.append(new)
        self.audio_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8))  # num_anno, seqlen

        tc_new = list()
        lengths = list()
        for concepts in tc:
            new = torch.zeros(512, dtype=torch.long)
            lengths.append(concepts.size(0))
            new[:concepts.size(0)] = concepts[:]
            tc_new.append(new)
        self.text_concepts_lengths.append(torch.tensor(lengths, dtype=torch.int8))  # num_anno, seqlen

        self.visual_concepts.append(torch.stack(vc_new, dim=0))  # num_anno, seqlen, max_num_concept
        # assert torch.stack(vc_new, dim=0).size(0) == len(data[clip]['seg_start'])
        self.audio_concepts.append(torch.stack(ac_new, dim=0))  # num_anno, seqlen, max_num_concept
        self.text_concepts.append(torch.stack(tc_new, dim=0))  # num_anno, seqlen, max_num_concept
        #######################################################

        self.labels.append(self.emotion_classes.index(anno['emotion']))
def main():
    global args
    args = parse_args()
    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")
    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log', mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [os.path.join(split, 'a.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files_b = [os.path.join(split, 'b.toks') for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data : %d ' % len(train_dataset))
    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data : %d ' % len(dev_dataset))
    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
        vocab.size(),
        args.input_dim,
        args.mem_dim,
        args.hidden_dim,
        args.num_classes,
        args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = \
            load_word_vectors(os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch
            }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint, '%s.pt' % os.path.join(args.save, args.expname))
    start_click[k] = 1

if len(start_click) < 2:
    sort_start_click = list(start_click.keys())
else:
    sort_start_click = sorted(start_click, key=start_click.get, reverse=True)[1:10000]

max_interact_len = 2 * int(np.mean([len(s) for s in data]))
print("Average length of the dataset:", np.mean([len(s) for s in data]),
      "max_interact_len:", max_interact_len)

fold = len(data) // 40  # integer division so the slice indices below stay ints
data_train = data[:(fold * 38)]
data_dev = data[(fold * 38):(fold * 39)]
data_test = data[(fold * 39):]

vocab, embed = build_vocab(data_train)
aid2index = {}
index2aid = {}
for i, a in enumerate(vocab):
    aid2index[a] = i
    index2aid[i] = a


def filter(d):
    new_d = []
    for i, s in enumerate(d):
        tmps = []
        for c in s:
            c["click"] = aid2index[c["click"]] if c["click"] in aid2index else UNK_ID
            c["rec_list"] = list(
def main(args):
    print('Loading data')
    with open(args.input_questions_json, 'r') as f:
        questions = json.load(f)['questions']

    # Either create the vocab or load it from disk
    if args.input_vocab_json == '' or args.expand_vocab == 1:
        print('Building vocab')
        if 'answer' in questions[0]:
            answer_token_to_idx = build_vocab((q['answer'] for q in questions))
        question_token_to_idx = build_vocab((q['question'] for q in questions),
                                            min_token_count=args.unk_threshold,
                                            punct_to_keep=[';', ','],
                                            punct_to_remove=['?', '.'],
                                            add_special=True)
        all_program_strs = []
        for q in questions:
            if 'program' not in q:
                continue
            program_str = program_to_strs(q['program'], args.mode)[0]
            if program_str is not None:
                all_program_strs.append(program_str)
        program_token_to_idx = build_vocab(all_program_strs, add_special=True)
        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'program_token_to_idx': program_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,  # no special tokens
        }

    if args.input_vocab_json != '':
        print('Loading vocab')
        if args.expand_vocab == 1:
            new_vocab = vocab
        with open(args.input_vocab_json, 'r') as f:
            vocab = json.load(f)
        if args.expand_vocab == 1:
            num_new_words = 0
            for word in new_vocab['question_token_to_idx']:
                if word not in vocab['question_token_to_idx']:
                    print('Found new word %s' % word)
                    idx = len(vocab['question_token_to_idx'])
                    vocab['question_token_to_idx'][word] = idx
                    num_new_words += 1
            print('Found %d new words' % num_new_words)

    if args.output_vocab_json != '':
        with open(args.output_vocab_json, 'w') as f:
            json.dump(vocab, f, indent=4)

    # Encode all questions and programs
    print('Encoding data')
    questions_encoded = []
    programs_encoded = []
    # value_inputs, encoded by question_token_to_idx in CLEVR
    # because all valid inputs are in question vocab
    program_inputs_encoded = []
    question_families = []
    orig_idxs = []
    image_idxs = []
    answers = []
    for orig_idx, q in enumerate(questions):
        question = q['question']
        orig_idxs.append(orig_idx)
        image_idxs.append(q['image_index'])
        if 'question_family_index' in q:
            question_families.append(q['question_family_index'])
        question_tokens = tokenize(question,
                                   punct_to_keep=[';', ','],
                                   punct_to_remove=['?', '.'])
        question_encoded = encode(question_tokens,
                                  vocab['question_token_to_idx'],
                                  allow_unk=args.encode_unk == 1)
        questions_encoded.append(question_encoded)

        if 'program' in q:
            program = q['program']
            program_str, input_str = program_to_strs(program, args.mode)
            program_tokens = tokenize(program_str)
            program_encoded = encode(program_tokens, vocab['program_token_to_idx'])
            programs_encoded.append(program_encoded)
            # program value_inputs
            input_tokens = tokenize(input_str)
            input_encoded = encode(input_tokens, vocab['question_token_to_idx'])
            assert len(input_encoded) == len(program_encoded)  # input should have the same len with func
            program_inputs_encoded.append(input_encoded)

        if 'answer' in q:
            answers.append(vocab['answer_token_to_idx'][q['answer']])

    # Pad encoded questions and programs
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        while len(qe) < max_question_length:
            qe.append(vocab['question_token_to_idx']['<NULL>'])

    if len(programs_encoded) > 0:
        max_program_length = max(len(x) for x in programs_encoded)
        for pe in programs_encoded:
            while len(pe) < max_program_length:
                pe.append(vocab['program_token_to_idx']['<NULL>'])
        for ie in program_inputs_encoded:
            while len(ie) < max_program_length:
                ie.append(vocab['question_token_to_idx']['<NULL>'])

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    programs_encoded = \
        np.asarray(programs_encoded, dtype=np.int32)
    program_inputs_encoded = np.asarray(program_inputs_encoded, dtype=np.int32)
    print(questions_encoded.shape)
    print(programs_encoded.shape)
    print(program_inputs_encoded.shape)

    print('Writing')
    obj = {
        'questions': questions_encoded,
        'image_idxs': np.asarray(image_idxs),
        'orig_idxs': np.asarray(orig_idxs),
        'programs': programs_encoded,
        'program_inputs': program_inputs_encoded,
        'question_families': question_families,
        'answers': answers,
    }
    with open(args.output_pt_file, 'wb') as f:
        pickle.dump(obj, f)
import os
from config import Config
import time
from importlib import import_module
import torch
from train_and_test import init_network, train, test
from utils import build_dataset, build_iterator, get_time_dif, load_vocabulary, build_vocab

config = Config()
full_train_path = os.path.join('./data', config.dataset, config.train_data_path)
full_test_path = os.path.join('./data', config.dataset, config.test_data_path)
full_valid_path = os.path.join('./data', config.dataset, config.valid_data_path)

build_vocab(os.path.join(full_train_path, config.input_file),
            os.path.join(config.vocab_path, 'in_vocab'))
build_vocab(os.path.join(full_train_path, config.slot_file),
            os.path.join(config.vocab_path, 'slot_vocab'), unk=False)
build_vocab(os.path.join(full_train_path, config.intent_file),
            os.path.join(config.vocab_path, 'intent_vocab'), pad=False, unk=False)

if config.dataset == 'snips':
    print('use snips dataset')
elif config.dataset == 'atis':
    print('use atis dataset')

model_name = 'Attention_RNN'

torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True

start_time = time.time()
print('Loading data...')
context_radius = args.context_radius
filepath = 'training-sets/' + output_dir
if not os.path.exists('/training-sets/'):
    os.makedirs('/training-sets')
if not os.path.exists(filepath):
    os.makedirs(filepath)
else:
    selection = input(
        'The directory /training-sets/{}/ already exists. Would you '
        'like to overwrite it? [Y/N]: '.format(output_dir))
    # compare strings by value, not identity ("is" on string literals is unreliable)
    while selection not in ('Y', 'y', 'N', 'n'):
        selection = input('You did not enter a valid option. Choose from [Y/N]: ')
    if selection in ('Y', 'y'):
        shutil.rmtree(filepath)
        os.makedirs(filepath)
    elif selection in ('N', 'n'):
        print('Exiting.')
        exit()

index_to_word, word_to_index = build_vocab(input_dir=input_dir,
                                           output_dir=output_dir,
                                           vocabulary_size=vocabulary_size)
build_training_set(input_dir=input_dir,
                   output_dir=output_dir,
                   index_to_word=index_to_word,
                   word_to_index=word_to_index,
                   context_radius=context_radius)
    type=str,
    default="Transformer-based-pretrained-model-for-event-extraction-master/save_model/latest_model.pt")
parser.add_argument("--batch_size", type=int, default=16)
hp = parser.parse_args()

if hp.PreTrain_Model not in MODELS_dict.keys():
    raise KeyError("PreTrain_Model is not in the list of available models")
tokenizer = MODELS_dict[hp.PreTrain_Model][1].from_pretrained(
    MODELS_dict[hp.PreTrain_Model][2])

# init vocab
all_triggers, trigger2idx, idx2trigger = build_vocab(TRIGGERS)
all_entities, entity2idx, idx2entity = build_vocab(ENTITIES)
all_postags, postag2idx, idx2postag = build_vocab(POSTAGS, BIO_tagging=False)
all_arguments, argument2idx, idx2argument = build_vocab(ARGUMENTS, BIO_tagging=False)


class ACE2005Dataset(data.Dataset):
    def __init__(self, fpath):
        self.sent_li, self.entities_li, self.postags_li, self.triggers_li, self.arguments_li, self.adjm_li = [], [], [], [], [], []
        with open(fpath, 'r') as f:
            data = json.load(f)
            for item in data:
                words = item['words']
                entities = [[NONE] for _ in range(len(words))]
import calendar
from os.path import isfile

import pandas as pd

from config import max_count
from utils import get_all_lines, pad_sentences, build_vocab

preprocessed_events_description = "data/barclays_events_description_preprocessed.csv"
assert isfile(preprocessed_events_description)
assert isfile("data/barclays_events.csv")

months = list(calendar.month_abbr)
df = pd.read_csv("data/barclays_events.csv", sep=", ")
preprocessed_descriptions = get_all_lines(preprocessed_events_description)
padded_description = pad_sentences(preprocessed_descriptions)
vocabulary, vocabulary_inv, word_counts = build_vocab(padded_description)
print("Length of vocab is: {}".format(len(vocabulary)))


def get_encoded_sentence(sentence):
    padded_sentece = [0] * max_count
    words = sentence.split(" ")
    for i in range(min(max_count, len(words))):
        padded_sentece[i] = vocabulary.get(words[i].strip(), 0)
    return padded_sentece


def get_all_events():
    events_info = {}
    with open(preprocessed_events_description) as f:
        desciptions = f.readlines()[1:]
def train():
    lines = [line.strip() for line in open("data/data.csv", "r").readlines()]
    lines = [(json.loads(line)["dream"], json.loads(line)["decode"]) for line in lines]
    inputs = [" ".join(list(q)) for q, a in lines]
    outputs = [" ".join(list(a)) for q, a in lines]
    all_info = ' '.join(inputs + outputs).split()
    if os.path.exists(args.vocab_file):
        dictionary_input, rev_dictionary_input = read_vocab(args.vocab_file)
    else:
        dictionary_input, rev_dictionary_input = build_vocab(all_info, args.vocab_file)
    dictionary_output, rev_dictionary_output = dictionary_input, rev_dictionary_input

    min_line_length = 2
    max_line_length = 100
    data_filter = [(q, a) for q, a in zip(inputs, outputs)
                   if len_check(q, min_line_length, max_line_length)
                   and len_check(a, min_line_length, max_line_length)]
    random.shuffle(data_filter)
    inputs = [q for q, a in data_filter]
    outputs = [a + ' EOS' for q, a in data_filter]
    tf.logging.info("sample size: %s", len(inputs))

    inputs_dev = inputs[0:100]
    outputs_dev = outputs[0:100]
    inputs_train = inputs[100:]
    outputs_train = outputs[100:]

    inputs_train = str_idx(inputs_train, dictionary_input, dictionary_input['UNK'])
    print(inputs_train[:2])
    outputs_train = str_idx(outputs_train, dictionary_output, dictionary_output['UNK'])
    print(outputs_train[:2])
    inputs_dev = str_idx(inputs_dev, dictionary_input, dictionary_input['UNK'])
    outputs_dev = str_idx(outputs_dev, dictionary_output, dictionary_output['UNK'])

    model = Seq2Seq(args.size_layer, args.num_layers, args.embedded_size,
                    len(dictionary_input), len(dictionary_output),
                    args.learning_rate, dictionary_input)

    with tf.Session() as sess:
        with tf.device("/cpu:0"):
            ckpt = tf.train.get_checkpoint_state(args.checkpoint_dir)
            if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                tf.logging.info("restore model from patch: %s", ckpt.model_checkpoint_path)
                # restore the pretrained model
                saver = tf.train.Saver(max_to_keep=4)
                saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                saver = tf.train.Saver(max_to_keep=4)
                sess.run(tf.global_variables_initializer())

            global_step = 0
            for epoch_index in range(args.epoch):
                total_loss, total_accuracy = 0, 0
                batch_num = 0
                for k in range(0, len(inputs_train), args.batch_size):
                    batch_num = batch_num + 1
                    index = min(k + args.batch_size, len(inputs_train))
                    batch_x, seq_x = pad_sentence_batch(inputs_train[k:index], dictionary_input["PAD"])
                    batch_y, seq_y = pad_sentence_batch(outputs_train[k:index], dictionary_input["PAD"])
                    predicted, accuracy, loss, _, global_step = sess.run(
                        fetches=[
                            model.predicting_ids, model.accuracy, model.cost,
                            model.optimizer, model.global_step
                        ],
                        feed_dict={
                            model.X: batch_x,
                            model.Y: batch_y
                        })
                    total_loss += loss
                    total_accuracy += accuracy
                    if global_step % 100 == 0:
                        print('%s epoch: %d, global_step: %d, loss: %f, accuracy: %f'
                              % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                 epoch_index + 1, global_step, loss, accuracy))
                        saver.save(sess,
                                   os.path.join(args.checkpoint_dir, "seq2seq.ckpt"),
                                   global_step=global_step)
                        print("+" * 20)
                        for i in range(4):
                            print('row %d' % (i + 1))
                            print('dream:',
                                  ''.join([rev_dictionary_input[n] for n in batch_x[i]
                                           if n not in [0, 1, 2, 3]]))
                            print('real meaning:',
                                  ''.join([rev_dictionary_output[n] for n in batch_y[i]
                                           if n not in [0, 1, 2, 3]]))
                            print('dream decoding:',
                                  ''.join([rev_dictionary_output[n] for n in predicted[i]
                                           if n not in [0, 1, 2, 3]]), '')

                        index = list(range(len(inputs_dev)))
                        random.shuffle(index)
                        batch_x, _ = pad_sentence_batch(
                            [inputs_dev[i] for i in index][:args.batch_size],
                            dictionary_input["PAD"])
                        batch_y, _ = pad_sentence_batch(
                            [outputs_dev[i] for i in index][:args.batch_size],
                            dictionary_input["PAD"])
                        predicted = sess.run(model.predicting_ids,
                                             feed_dict={model.X: batch_x})
                        print("-" * 20)
                        for i in range(4):
                            print('row %d' % (i + 1))
                            # print(batch_x[i])
                            # print(predicted[i])
                            print('dream:',
                                  ''.join([rev_dictionary_input[n] for n in batch_x[i]
                                           if n not in [0, 1, 2, 3]]))
                            print('real meaning:',
                                  ''.join([rev_dictionary_output[n] for n in batch_y[i]
                                           if n not in [0, 1, 2, 3]]))
                            print('dream decoding:',
                                  ''.join([rev_dictionary_output[n] for n in predicted[i]
                                           if n not in [0, 1, 2, 3]]), '')

                total_loss /= batch_num
                total_accuracy /= batch_num
                print('***%s epoch: %d, global_step: %d, avg loss: %f, avg accuracy: %f'
                      % (datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                         epoch_index + 1, global_step, total_loss, total_accuracy))
    for d in dial_list:
        for i in range(2, len(d) + 1):
            extend_dial_list += [d[:i]]
    return extend_dial_list


def save_data(out_dir, prefix, dial_list):
    out_dial_path = os.path.join(out_dir, prefix + "_text.txt")
    with open(out_dial_path, "w") as f:
        for dial in dial_list:
            f.write("<dial>\n")
            for uttr in dial:
                f.write(uttr + "\n")
            f.write("</dial>\n")


if __name__ == "__main__":
    out_dir = "./Data/DailyDialogues/"
    in_dir = "./Data/DailyDialogues/"
    os.makedirs(out_dir, exist_ok=True)

    dial_list, prefix = load_data(in_dir, "test")
    loginfo_and_print(logger, "Load {} dialogues from {}".format(len(dial_list), in_dir))

    if prefix == "train":
        build_vocab(out_dir, dial_list)
        print("Finish building vocab")

    dial_list = extend_dial(dial_list)
    loginfo_and_print(logger, "Extend to {} dialogues".format(len(dial_list)))
    save_data(out_dir, prefix, dial_list)
    print("Data saved successfully !!")
hp = parser.parse_args()
os.environ["CUDA_VISIBLE_DEVICES"] = hp.gpu
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if not os.path.exists(hp.model_path):
    print('Warning: There is no model on the path:', hp.model_path,
          'Please check the model_path parameter')
model = torch.load(hp.model_path)
if device == 'cuda':
    model = model.cuda()

if hp.module_arg == 'all':
    all_arguments, argument2idx, idx2argument = build_vocab(ARGUMENTS, BIO_tagging=False)
elif hp.module_arg in ARGUMENTS:
    all_arguments, argument2idx, idx2argument = build_vocab([hp.module_arg], BIO_tagging=False)

if hp.eval_finetune:
    novel_event = hp.novel_event
    test_dataset = ACE2005DatasetNovel(hp.testset, all_arguments, argument2idx,
                                       novel_event=novel_event, novel_shot=1000)
else:
    test_dataset = ACE2005Dataset(hp.testset, all_arguments, argument2idx)

test_iter = data.DataLoader(dataset=test_dataset,
    random.seed(args.seed)


if __name__ == '__main__':
    preprocess_dataset('./data/trial-data.json')
    preprocess_dataset('./data/dev-data.json')
    preprocess_dataset('./data/train-data.json')
    preprocess_dataset('./data/test-data.json', is_test_set=True)

    import utils
    trial_data = utils.load_data('./data/trial-data-processed.json')
    train_data = utils.load_data('./data/train-data-processed.json')
    dev_data = utils.load_data('./data/dev-data-processed.json')
    test_data = utils.load_data('./data/test-data-processed.json')
    utils.build_vocab(trial_data + train_data + dev_data + test_data)

    build_vocab()
    train_data = load_data('./data/train-data-processed.json')
    train_data += load_data('./data/trial-data-processed.json')
    dev_data = load_data('./data/dev-data-processed.json')
    if args.test_mode:
        # use validation data as training data
        train_data += dev_data
        dev_data = []
    model = Model(args)

    best_dev_acc = 0.0
    os.makedirs('./checkpoint', exist_ok=True)
    checkpoint_path = './checkpoint/%d-%s.mdl' % (args.seed, datetime.now().isoformat())
    print('Trained model will be saved to %s' % checkpoint_path)
    for w in row[3].split():
        types.add(w)
        num_tok += 1
print("num types", len(types), "num tokens", num_tok)
print("HADM_ID: {}".format(len(dfnl['HADM_ID'].unique())))
print("SUBJECT_ID: {}".format(len(dfnl['SUBJECT_ID'].unique())))

# step 6: split data into train dev test
fname = '%s/notes_labeled.csv' % args.MIMIC_3_DIR
base_name = "%s/disch" % args.MIMIC_3_DIR  # for output
tr, dv, te = split_data(fname, base_name, args.MIMIC_3_DIR)

vocab_min = 3
vname = '%s/vocab.csv' % args.MIMIC_3_DIR
build_vocab(vocab_min, tr, vname)

# step 7: sort data by its note length, add length to the last column
for splt in ['train', 'dev', 'test']:
    filename = '%s/disch_%s_split.csv' % (args.MIMIC_3_DIR, splt)
    df = pd.read_csv(filename)
    df['length'] = df.apply(lambda row: len(str(row['TEXT']).split()), axis=1)
    df = df.sort_values(['length'])
    df.to_csv('%s/%s_full.csv' % (args.MIMIC_3_DIR, splt), index=False)

# step 8: train word embeddings via word2vec and fasttext
w2v_file = word_embeddings('full', '%s/disch_full.csv' % args.MIMIC_3_DIR, 100, 0, 5)
gensim_to_embeddings('%s/processed_full.w2v' % args.MIMIC_3_DIR,
                     '%s/vocab.csv' % args.MIMIC_3_DIR, Y)
fasttext_file = fasttext_embeddings('full', '%s/disch_full.csv' % args.MIMIC_3_DIR, 100, 0, 5)
gensim_to_fasttext_embeddings('%s/processed_full.fasttext' % args.MIMIC_3_DIR,
                              '%s/vocab.csv' % args.MIMIC_3_DIR, Y)
Author: Susheel Suresh
Last Modified: 04/03/2019
"""

from classifier import BinaryClassifier
from sgd import SGDHinge, SGDLog, SGDHingeReg, SGDLogReg
from utils import read_data, build_vocab
import utils
from config import args

if __name__ == '__main__':
    filepath = '../data/given/'
    build_vocab(filepath, vocab_size=args.vocab_size)
    train_data, test_data = read_data(filepath)

    sgd_l_classifier = SGDLog(args)
    sgd_l_classifier.fit(train_data)
    acc, prec, rec, f1 = sgd_l_classifier.evaluate(test_data)
    print('\nSGD Log Loss (No Regularization) :')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    sgd_l_r_classifier = SGDLogReg(args)
    sgd_l_r_classifier.fit(train_data)
    acc, prec, rec, f1 = sgd_l_r_classifier.evaluate(test_data)
    print('\nSGD Log Loss (With Regularization) :')
    print('Accuracy: %.2f, Precision: %.2f, Recall: %.2f, F1: %.2f' % (acc, prec, rec, f1))

    sgd_h_classifier = SGDHinge(args)
def main():
    print(args)
    # local
    """
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    VALID_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    VALID_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    EVAL_X = os.path.join(data_dir, 'train/valid.article.filter.txt')
    EVAL_Y = os.path.join(data_dir, 'train/valid.title.filter.txt')
    """
    # server
    data_dir = 'sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article_01_new.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title_01_new.txt')
    VALID_X = os.path.join(data_dir, 'train/train.article_000_new.txt')
    VALID_Y = os.path.join(data_dir, 'train/train.title_000_new.txt')
    EVAL_X = os.path.join(data_dir, 'train/train.article_001_new.txt')
    EVAL_Y = os.path.join(data_dir, 'train/train.title_001_new.txt')

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y], small_vocab_file, vocab_size=80000)

    # bert embeddings
    emb_file = 'sumdata/bert-large-uncased.30522.1024d.vec'
    vocab, embeddings = load_word2vec_embedding(emb_file)

    max_src_len = 101
    max_tgt_len = 47
    bs = args.batch_size
    n_train = args.n_train
    n_valid = args.n_valid
    n_eval = args.n_eval

    # vocab = small_vocab
    train_x = BatchManager(load_data(TRAIN_X, max_src_len, n_train), bs, vocab)
    train_y = BatchManager(load_data(TRAIN_Y, max_tgt_len, n_train), bs, vocab)
    valid_x = BatchManager(load_data(VALID_X, max_src_len, n_valid), bs, vocab)
    valid_y = BatchManager(load_data(VALID_Y, max_tgt_len, n_valid), bs, vocab)
    eval_x = BatchManager(load_data(EVAL_X, max_src_len, n_eval), bs, vocab)
    eval_y = BatchManager(load_data(EVAL_Y, max_tgt_len, n_eval), bs, vocab)

    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 4, 256,
    #                     64, 64, 1024, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = Transformer(len(vocab), len(vocab), max_src_len, max_tgt_len, 1, 6, 300,
    #                     50, 50, 1200, src_tgt_emb_share=True, tgt_prj_emb_share=True).cuda()
    # model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 300, 50, 50, 1200, False).cuda()
    model = TransformerShareEmbedding(len(vocab), max_src_len, 1, 6, 1024,
                                      50, 50, 1200, False, embeddings=embeddings).cuda()
    # print(model)

    saved_state = {'epoch': 0, 'lr': 0.001}
    if os.path.exists(args.ckpt_file):
        saved_state = torch.load(args.ckpt_file)
        model.load_state_dict(saved_state['state_dict'])
        logging.info('Load model parameters from %s' % args.ckpt_file)

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=saved_state['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.3)
    scheduler.step()  # last_epoch=-1, which will not update lr at the first time

    # eval_model(valid_x, valid_y, vocab, model)
    # train(train_x, train_y, valid_x, valid_y, model, optimizer, vocab, scheduler,
    #       args.n_epochs, saved_state['epoch'])
    myeval(eval_x, eval_y, vocab, model)
# -*- coding:utf-8 -*-
from utils import tokenize, build_vocab, read_vocab
import tensorflow as tf
from model import NerModel
import tensorflow_addons as tf_ad
import os
import numpy as np
from args_help import args
from my_log import logger

if not (os.path.exists(args.vocab_file) and os.path.exists(args.tag_file)):
    logger.info("building vocab file")
    build_vocab([args.train_path], args.vocab_file, args.tag_file)
else:
    logger.info("vocab file exists!!")

vocab2id, id2vocab = read_vocab(args.vocab_file)
tag2id, id2tag = read_vocab(args.tag_file)
text_sequences, label_sequences = tokenize(args.train_path, vocab2id, tag2id)

train_dataset = tf.data.Dataset.from_tensor_slices((text_sequences, label_sequences))
print(type(train_dataset))
train_dataset = train_dataset.shuffle(len(text_sequences)).batch(args.batch_size, drop_remainder=True)
from config import args
from utils import load_data, build_vocab, gen_submission, gen_final_submission, eval_based_on_outputs
from model import Model

if __name__ == '__main__':
    if not args.pretrained:
        print('No pretrained model specified.')
        exit(0)
    build_vocab()

    if args.test_mode:
        dev_data = load_data('./data/test-data-processed.json')
        show_data = load_data('./data/show-data-processed.json')
    else:
        dev_data = load_data('./data/dev-data-processed.json')
        # show_data = load_data('./data/last_show_data.json')
        # show_data = load_data('./data/final_show_data_processed.json')
        # show_data = load_data('./data/first.json')
        # show_data = load_data('./data/second.json')
        # show_data = load_data('./data/four1.json')
        # show_data = load_data('./data/four2.json')
        # show_data = load_data('./data/Seven1.json')
        # show_data = load_data('./data/Seven2.json')
        # show_data = load_data('./data/Eight1.json')
        # show_data = load_data('./data/Eight2.json')
        # show_data = load_data('./data/401_1.json')
        show_data = load_data('./data/401_2.json')
        # show_data = load_data('./data/365.json')
        # show_data = load_data('./data/387_0.json')
        # show_data = load_data('./data/387_1.json')

    model_path_list = args.pretrained.split(',')