def tokenizer(text):
    # the stray `return text.split()` in the source is the tail of this tokenizer
    return text.split()

user_stop_words = {'.', ','}
STOP_WORDS.update(user_stop_words)
stop_words = STOP_WORDS

# Pretrained model
PRE_TRAIN_MODEL_BASE_PATH = '/home/ubuntu/likun/nlp_vectors'
PRE_TRAIN_MODEL_DIR = 'glove'
PRE_TRAIN_MODEL_NAME = 'glove.6B.200d.txt'
USE_PRE_TRAIN_MODEL = True
cache = '.vector_cache'
vector_path = os.path.join(PRE_TRAIN_MODEL_BASE_PATH, PRE_TRAIN_MODEL_DIR,
                           PRE_TRAIN_MODEL_NAME)
vectors = Vectors(name=vector_path, cache=cache) if USE_PRE_TRAIN_MODEL else None

# Build dataset
TEXT = data.Field(unk_token=UNK_TOKEN, tokenize=tokenizer, lower=False,
                  stop_words=stop_words, batch_first=True)
LABEL = data.LabelField()
train_data = data.TabularDataset(path=os.path.join(DATA_BASE_PATH, DATA_DIR,
                                                   DATA_TRAIN_FILE_NAME),
                                 format='csv',
                                 fields=[('text', TEXT), ('label', LABEL)],
                                 skip_header=True)
test_data = data.TabularDataset(path=os.path.join(DATA_BASE_PATH, DATA_DIR,
                                                  DATA_TEST_FILE_NAME),  # constant name assumed; source truncated here
                                format='csv',
                                fields=[('text', TEXT), ('label', LABEL)],
                                skip_header=True)
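# --- A possible continuation (not in the excerpt above): attach the loaded vectors
# while building the vocabularies, then wrap the datasets in BucketIterators. The
# batch size is an arbitrary placeholder.
TEXT.build_vocab(train_data, vectors=vectors)  # rows of TEXT.vocab.vectors follow TEXT.vocab.itos
LABEL.build_vocab(train_data)
train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data), batch_size=64,
    sort_key=lambda x: len(x.text))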
def get_vectors(self, path: str):
    logger.info('loading vectors from {}'.format(path))
    vectors = Vectors(path)
    logger.info('successfully loaded vectors')
    return vectors
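# --- Hypothetical usage of get_vectors (the `loader` instance and the file path
# below are assumptions, not from the snippet):
vectors = loader.get_vectors('/data/embeddings/glove.6B.100d.txt')
print(vectors.dim)             # embedding dimensionality parsed from the file
print(vectors['hello'].shape)  # torch.Size([dim]); OOV tokens return zero vectors by default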
fields = [('sentiment', LABEL), ('title', None), ('review', TEXT)]
reviews = TabularDataset(path=args.data_dir + "/" + args.sensitive_filename,
                         format='csv', fields=fields, skip_header=True)
train_private = reviews
phrase_count_complete = create_clean_counter(reviews, add_space_split=True)
train_vocab = torchtext.vocab.Vocab(counter=phrase_count_complete)

# Attach GloVe embeddings
embedding_dims = args.embedding_size
vectors = Vectors(args.vectors_dir + "/" + args.vectors_filename,
                  max_vectors=100_000)
train_vocab.load_vectors(vectors)

# Create approximate nearest neighbor index
num_trees = 50
ann_index = AnnoyIndex(embedding_dims, 'euclidean')
ann_filename = join(args.artifact_output_dir, "index.ann")
for vector_num, vector in enumerate(train_vocab.vectors):
    ann_index.add_item(vector_num, vector)
print("Building annoy index...")
assert ann_index.build(num_trees)
ann_index.save(ann_filename)
print("Annoy index built")
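# --- Sketch: querying the saved index for approximate nearest neighbours of a word.
# Names carry over from the snippet; the query word is an arbitrary example.
query_index = AnnoyIndex(embedding_dims, 'euclidean')
query_index.load(ann_filename)
word_id = train_vocab.stoi['good']  # assumes 'good' is in the vocabulary
neighbour_ids = query_index.get_nns_by_item(word_id, 10)
print([train_vocab.itos[i] for i in neighbour_ids])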
def load_data(self, w2v_file, train_file, test_file, val_file=None):
    '''
    Loads the data from files.
    Sets up iterators for training, validation and test data.
    Also builds the vocabulary and word embeddings from the data.

    Inputs:
        w2v_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
        train_file (String): absolute path to training file
        test_file (String): absolute path to test file
        val_file (String): absolute path to validation file
    '''
    NLP = spacy.load('en')
    tokenizer = lambda sent: [
        x.text for x in NLP.tokenizer(sent) if x.text != " "
    ]

    # Creating Fields for data
    TEXT = data.Field(sequential=True,
                      tokenize=tokenizer,
                      lower=True,
                      fix_length=self.config.max_sen_len)
    LABEL = data.Field(sequential=False, use_vocab=False)
    datafields = [("text", TEXT), ("label", LABEL)]

    # Load data from pd.DataFrame into torchtext.data.Dataset
    train_df = self.get_pandas_df(train_file)
    train_examples = [
        data.Example.fromlist(i, datafields) for i in train_df.values.tolist()
    ]
    train_data = data.Dataset(train_examples, datafields)

    test_df = self.get_pandas_df(test_file)
    test_examples = [
        data.Example.fromlist(i, datafields) for i in test_df.values.tolist()
    ]
    test_data = data.Dataset(test_examples, datafields)

    # If a validation file exists, load it; otherwise split validation data off the training data
    if val_file:
        val_df = self.get_pandas_df(val_file)
        val_examples = [
            data.Example.fromlist(i, datafields) for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)
    else:
        train_data, val_data = train_data.split(split_ratio=0.8)

    TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
    self.word_embeddings = TEXT.vocab.vectors
    self.vocab = TEXT.vocab

    self.train_iterator = data.BucketIterator(
        train_data,
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        (val_data, test_data),
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=False)

    print("Loaded {} training examples".format(len(train_data)))
    print("Loaded {} test examples".format(len(test_data)))
    print("Loaded {} validation examples".format(len(val_data)))
embedding_sd = checkpoint['embedding']
voc.__dict__ = checkpoint['voc_dict']

print('Building encoder and decoder ...')
# Initialize GNN
n_edge_types = dataset.n_edge_types
n_node = dataset.n_node
state_dim = dataset.state_dim
net = GGNN(state_dim, annotation_dim, n_edge_types, n_node, n_steps)
net.double()
print(net)

# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
weight_matrix = Vectors(glove_path)
voc.getEmb(weight_matrix)
print(torch.FloatTensor(np.array(voc.index2emb)).size())
embedding.weight.data.copy_(torch.FloatTensor(np.array(voc.index2emb)))
embedding.weight.requires_grad = False
if loadFilename:
    embedding.load_state_dict(embedding_sd)

# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size,
                              voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
seq2seq = Seq2Seq(encoder, decoder, net, opts)
# Use appropriate device
seq2seq.to(device)
def load_dataset(args):
    if args.dataset == '20newsgroup':
        train_classes, val_classes, test_classes, label_dict = _get_20newsgroup_classes(args)
    elif args.dataset == 'amazon':
        train_classes, val_classes, test_classes, label_dict = _get_amazon_classes(args)
    elif args.dataset == 'fewrel':
        train_classes, val_classes, test_classes, label_dict = _get_fewrel_classes(args)
    elif args.dataset == 'huffpost':
        train_classes, val_classes, test_classes, label_dict = _get_huffpost_classes(args)
    elif args.dataset == 'reuters':
        train_classes, val_classes, test_classes, label_dict = _get_reuters_classes(args)
    elif args.dataset == 'rcv1':
        train_classes, val_classes, test_classes, label_dict = _get_rcv1_classes(args)
    else:
        raise ValueError(
            'args.dataset should be one of '
            '[20newsgroup, amazon, fewrel, huffpost, reuters, rcv1]')

    assert len(train_classes) == args.n_train_class
    assert len(val_classes) == args.n_val_class
    assert len(test_classes) == args.n_test_class

    print("train_classes", train_classes)
    print("val_classes", val_classes)
    print("test_classes", test_classes)

    tprint('Loading data')
    all_data = _load_json(args.data_path)

    class_names = []
    class_name_words = []
    for ld in label_dict:
        class_name_dic = {}
        class_name_dic['label'] = label_dict[ld]
        class_name_dic['text'] = ld.lower().split()
        class_names.append(class_name_dic)
        class_name_words.append(class_name_dic['text'])

    tprint('Loading word vectors')
    vectors = Vectors(args.word_vector, cache=args.wv_path)
    vocab = Vocab(collections.Counter(_read_words(all_data, class_name_words)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

    # Print word embedding statistics
    wv_size = vocab.vectors.size()
    tprint('Total num. of words: {}, word vector dimension: {}'.format(
        wv_size[0], wv_size[1]))

    num_oov = wv_size[0] - torch.nonzero(
        torch.sum(torch.abs(vocab.vectors), dim=1)).size()[0]
    tprint(('Num. of out-of-vocabulary words '
            '(they are initialized to zeros): {}').format(num_oov))

    # Split into meta-train, meta-val, meta-test data
    train_data, val_data, test_data = _meta_split(all_data, train_classes,
                                                  val_classes, test_classes)
    tprint('#train {}, #val {}, #test {}'.format(len(train_data),
                                                 len(val_data),
                                                 len(test_data)))

    # Convert everything into np arrays for fast data loading
    class_names = _data_to_nparray(class_names, vocab, args)
    train_data = _data_to_nparray(train_data, vocab, args)
    val_data = _data_to_nparray(val_data, vocab, args)
    test_data = _data_to_nparray(test_data, vocab, args)

    # This tag is used for distinguishing train/val/test when creating the source pool
    train_data['is_train'] = True
    val_data['is_train'] = True
    test_data['is_train'] = True

    temp_num = np.argsort(class_names['label'])
    class_names['label'] = class_names['label'][temp_num]
    class_names['text'] = class_names['text'][temp_num]
    class_names['text_len'] = class_names['text_len'][temp_num]

    return train_data, val_data, test_data, class_names, vocab
def run(self):
    print("Running on", self.a.device)
    self.set_device(self.a.device)
    np.random.seed(self.a.seed)
    torch.manual_seed(self.a.seed)
    torch.backends.cudnn.benchmark = True

    #################### loading event extraction dataset ####################
    if self.a.test_ee:
        log('testing event extraction corpus from %s' % self.a.test_ee)

    # both for grounding and ee
    WordsField = Field(lower=True, include_lengths=True, batch_first=True)
    PosTagsField = Field(lower=True, batch_first=True)
    EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
    AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)

    # only for ee
    LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
    EventsField = EventField(lower=False, batch_first=True)
    SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)

    if self.a.amr:
        colcc = 'simple-parsing'
    else:
        colcc = 'combined-parsing'
    print(colcc)

    ee_fields = {"sentence_id": ("SENTID", SENTIDField),
                 "words": ("WORDS", WordsField),
                 "pos-tags": ("POSTAGS", PosTagsField),
                 "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                 colcc: ("ADJM", AdjMatrixField),
                 "golden-event-mentions": ("LABEL", LabelField),
                 "all-events": ("EVENT", EventsField),
                 "all-entities": ("ENTITIES", EntitiesField)}

    train_ee_set = ACE2005Dataset(path=self.a.train_ee, fields=ee_fields,
                                  amr=self.a.amr, keep_events=1)
    dev_ee_set = ACE2005Dataset(path=self.a.dev_ee, fields=ee_fields,
                                amr=self.a.amr, keep_events=0)
    # test_ee_set (fields=ee_fields, keep_events=0) is disabled in this script.

    print('self.a.train_ee', self.a.train_ee)
    LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
    print('LabelField.vocab.stoi', LabelField.vocab.stoi)
    EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)
    print('EventsField.vocab.stoi', EventsField.vocab.stoi)
    print('len(EventsField.vocab.itos)', len(EventsField.vocab.itos))
    print('len(EventsField.vocab.stoi)', len(EventsField.vocab.stoi))

    #################### loading SR dataset ####################
    # both for grounding and sr
    if self.a.train_sr:
        log('loading corpus from %s' % self.a.train_sr)

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
    vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
    vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

    # only need get_role_mask() and sr_mapping()
    train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                 EventsField.vocab.stoi, LabelField.vocab.stoi,
                                 self.a.imsitu_ontology_file, self.a.train_sr,
                                 self.a.verb_mapping_file, None, None, 0, transform,
                                 filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                 load_object=False, filter_place=self.a.filter_place)

    #################### loading grounding dataset ####################
    if self.a.train_grounding:
        log('loading grounding corpus from %s' % self.a.train_grounding)

    # only for grounding
    IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True)

    grounding_fields = {"id": ("IMAGEID", IMAGEIDField),
                        "sentence_id": ("SENTID", SENTIDField),
                        "words": ("WORDS", WordsField),
                        "pos-tags": ("POSTAGS", PosTagsField),
                        "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                        colcc: ("ADJM", AdjMatrixField),
                        "all-entities": ("ENTITIES", EntitiesField)}
                        # "image": ("IMAGE", IMAGEField) stays disabled

    train_grounding_set = GroundingDataset(path=self.a.train_grounding, img_dir=None,
                                           fields=grounding_fields,
                                           transform=transform, amr=self.a.amr)
    dev_grounding_set = GroundingDataset(path=self.a.dev_grounding, img_dir=None,
                                         fields=grounding_fields,
                                         transform=transform, amr=self.a.amr)
    # test_grounding_set (fields=grounding_fields) is disabled in this script.

    #################### build vocabulary ####################
    if self.a.webd:
        pretrained_embedding = Vectors(self.a.webd, ".",
                                       unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS,
                               train_grounding_set.WORDS, dev_grounding_set.WORDS,
                               vectors=pretrained_embedding)
    else:
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS,
                               train_grounding_set.WORDS, dev_grounding_set.WORDS)
    PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS,
                             train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS)
    EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS,
                                  train_grounding_set.ENTITYLABELS,
                                  dev_grounding_set.ENTITYLABELS)

    consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
    # print("O label is", consts.O_LABEL)
    consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
    # print("O label for AE is", consts.ROLE_O_LABEL)

    # dev_ee_set1 / test_ee_set1 (keep_events=1, only_keep=True) and the associated
    # dataset-length printouts are disabled in this script.

    # sr model initialization
    if not self.a.sr_hps_path:
        self.a.sr_hps = eval(self.a.sr_hps)
    embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
    embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
    embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)
    if "wvemb_size" not in self.a.sr_hps:
        self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word)
    if "wremb_size" not in self.a.sr_hps:
        self.a.sr_hps["wremb_size"] = len(vocab_role.id2word)
    if "wnemb_size" not in self.a.sr_hps:
        self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word)

    # The ee label/argument weights and the ee_hps initialization that mirror the
    # training script are disabled in this script.

    if "oc" not in self.a.sr_hps:
        self.a.sr_hps["oc"] = len(LabelField.vocab.itos)
    if "ae_oc" not in self.a.sr_hps:
        self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos)

    ace_classifier = ACEClassifier(self.a.sr_hps["wemb_dim"], self.a.sr_hps["oc"],
                                   self.a.sr_hps["ae_oc"], self.device)

    # The ee model (load_ee_model, with optional finetuning) is disabled in this script.
    ee_model = None

    if self.a.finetune_sr:
        log('init sr model from ' + self.a.finetune_sr)
        sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                 embeddingMatrix_role, self.a.finetune_sr, self.device,
                                 ace_classifier, add_object=self.a.add_object)
        log('sr model loaded, there are %i sets of params'
            % len(sr_model.parameters_requires_grads()))
    else:
        sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                 embeddingMatrix_role, None, self.device,
                                 ace_classifier, add_object=self.a.add_object)
        log('sr model created from scratch, there are %i sets of params'
            % len(sr_model.parameters_requires_grads()))

    model = GroundingModel(ee_model, sr_model, self.get_device())
    # The DataParallel wrappers and the optimizer construction are disabled in this script.

    if not os.path.exists(self.a.out):
        os.mkdir(self.a.out)
    # Dumping the word/pos/entity vocabularies is disabled in this script.
    with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
        pickle.dump(LabelField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
        pickle.dump(EventsField.vocab.stoi, f)
    log('init complete\n')

    # The ee mappings (word/label/role itos) and the optional ee role mask are
    # disabled in this script.

    # sr mappings
    self.a.sr_word_i2s = vocab_noun.id2word
    self.a.sr_label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
    self.a.sr_role_i2s = vocab_role.id2word
    self.a.role_masks = train_sr_set.get_role_mask().to_dense().to(self.device)

    writer = SummaryWriter(os.path.join(self.a.out, "exp"))
    self.a.writer = writer

    # loading testing data
    voa_image_dir = self.a.test_voa_image
    gt_voa_image = self.a.gt_voa_image
    gt_voa_text = self.a.gt_voa_text
    gt_voa_align = self.a.gt_voa_align
    sr_verb_mapping, sr_role_mapping = train_sr_set.get_sr_mapping()

    test_m2e2_set = M2E2Dataset(path=gt_voa_text,
                                img_dir=voa_image_dir,
                                fields={"image": ("IMAGEID", IMAGEIDField),
                                        "sentence_id": ("SENTID", SENTIDField),
                                        "words": ("WORDS", WordsField),
                                        "pos-tags": ("POSTAGS", PosTagsField),
                                        "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                        colcc: ("ADJM", AdjMatrixField),
                                        "all-entities": ("ENTITIES", EntitiesField),
                                        "golden-event-mentions": ("LABEL", LabelField),
                                        "all-events": ("EVENT", EventsField)},
                                transform=transform,
                                amr=self.a.amr,
                                load_object=self.a.add_object,
                                object_ontology_file=self.a.object_class_map_file,
                                object_detection_pkl_file=self.a.object_detection_pkl_file,
                                object_detection_threshold=self.a.object_detection_threshold,
                                keep_events=self.a.keep_events)
    object_results, object_label, object_detection_threshold = test_m2e2_set.get_object_results()

    # build batches on cpu
    test_m2e2_iter = BucketIterator(test_m2e2_set, batch_size=1, train=False,
                                    shuffle=False, device=-1,
                                    sort_key=lambda x: len(x.POSTAGS))

    print("\nStarting testing...\n")

    # The ee/sr/grounding testers and the brat visualizer are disabled in this
    # script; only the joint tester is used.
    j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test)

    image_gt = json.load(open(gt_voa_image))
    vision_result = dict()
    if self.a.visual_voa_sr_path is not None and not os.path.exists(self.a.visual_voa_sr_path):
        os.makedirs(self.a.visual_voa_sr_path, exist_ok=True)

    doc_done = set()
    with torch.no_grad():
        model.eval()
        for batch in test_m2e2_iter:
            # kwargs for the disabled ee testers/visualizers are omitted from this call
            vision_result = joint_test_batch(
                model_g=model,
                batch_g=batch,
                device=self.device,
                transform=transform,
                img_dir=voa_image_dir,
                sr_noun_i2s=self.a.sr_word_i2s,
                sr_verb_i2s=self.a.sr_label_i2s,
                sr_role_i2s=self.a.sr_role_i2s,
                role_masks=self.a.role_masks,
                image_gt=image_gt,
                verb2type=sr_verb_mapping,
                role2role=sr_role_mapping,
                vision_result=vision_result,
                load_object=self.a.add_object,
                object_results=object_results,
                object_label=object_label,
                object_detection_threshold=object_detection_threshold,
                vocab_objlabel=vocab_noun.word2id,
                keep_events_sr=self.a.keep_events_sr,
                doc_done=doc_done,
            )
    print('vision_result size', len(vision_result))

    evt_p, evt_r, evt_f1, role_scores = j_tester.calculate_report(
        vision_result, voa_image_dir, self.a.visual_voa_sr_path,
        self.a.add_object, keep_events_sr=self.a.keep_events_sr)
    print('image event ep, er, ef \n', evt_p, '\n', evt_r, '\n', evt_f1)

    # The per-metric printouts for attention/object IoU scores are disabled;
    # all role scores are printed generically below.
    for key in role_scores:
        print(key)
        for key_ in role_scores[key]:
            print(key_, role_scores[key][key_])
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)
# Note: when saving the csv from pandas, cast the label to int, otherwise this raises an error.

train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='drive/My Drive/',
    train='4/train.csv',
    validation='4/validation.csv',
    test='4/test.csv',
    format='csv',
    fields=[('text', TEXT), ('Label', LABEL)])

print(len(train_ds))
print(len(val_ds))
print(len(test_ds))

from torchtext.vocab import Vectors

japanese_word2vec_vectors = Vectors(name='drive/My Drive/tweets133_.vec')
print(japanese_word2vec_vectors.dim)
print(len(japanese_word2vec_vectors.itos))

# Build the vocabulary
TEXT.build_vocab(train_ds, vectors=japanese_word2vec_vectors)
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.stoi)

train_dl = torchtext.data.Iterator(train_ds, batch_size=64, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=64, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=64, train=False, sort=False)

batch = next(iter(val_dl))
print(batch.text)
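# --- Sketch (not part of the notebook): the aligned rows in TEXT.vocab.vectors can
# seed a trainable embedding layer; from_pretrained is standard PyTorch.
import torch.nn as nn
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
out = embedding(batch.text)  # (batch, seq_len, dim) if TEXT was built with batch_first=True
print(out.shape)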
def __init__(self, batch_size=128, fix_length=32, singer=None,
             target_vocab_size=5000, vector_path=VEC_PATH, device=None):
    """
    Data preprocessing and batch generation for the lyrics-generation task.

    Each batch fed to the network contains:
        encoder_input:  encoder input, shape: (batch_size, time_step, word_id)
        encoder_length: valid lengths of the encoder inputs, shape: (batch_size,)
        decoder_input:  decoder input, shape: (batch_size, time_step, word_id)
        decoder_length: valid lengths of the decoder inputs, shape: (batch_size,)
        target:         decoder target used for the loss, shape: (batch_size, time_step, word_id)

    :param batch_size: size of each batch. Default: 128
    :param fix_length: maximum sequence length; shorter sentences are padded with
        "<pad>", longer ones are truncated. Default: 32
    :param singer: if None, read all songs; otherwise only songs by this singer. Default: None
    :param target_vocab_size: size of the target (decoder output) vocabulary; only the
        target_vocab_size most frequent words are kept on the output side, all other
        words are replaced by "<unk>". Default: 5000
    :param vector_path: path to the word2vec model. Note: must be a .txt file
    :param device: "cuda" or "cpu". Default: None, picks "cuda" if available, else "cpu"
    """
    self.batch_size = batch_size
    self.fix_length = fix_length
    self.singer = singer
    self.target_vocab_size = target_vocab_size
    self.vector_path = vector_path
    self.DEVICE = device or torch.device(
        "cuda" if torch.cuda.is_available() else "cpu")
    self.tokenize = lambda x: jieba.lcut(x, HMM=False)  # word segmentation

    # Define the three torchtext Fields used for preprocessing. The ENCODER does not
    # actually need the start token "<go>" or the end token "<eos>", but they are
    # declared here so that all three Fields encode/decode text consistently; they
    # are removed again after the vocabulary is built (see self._build_vocab).
    self.ENCODER = Field(
        sequential=True,
        tokenize=self.tokenize,
        batch_first=True,            # first dimension is the batch (default is time_step)
        fix_length=self.fix_length,  # fixed length: pad short sentences with "<pad>", truncate long ones
        include_lengths=True,        # also return the lengths of the encoded texts
        init_token="<go>",           # "<go>" is prepended to every sentence
        eos_token="<eos>")           # "<eos>" is appended to every sentence
    self.DECODER = Field(sequential=True,
                         tokenize=self.tokenize,
                         batch_first=True,
                         fix_length=self.fix_length,
                         include_lengths=True,
                         init_token="<go>",
                         eos_token="<eos>")
    # `target` is `decoder` shifted left by one position, so it needs no "<go>" token
    self.TARGET = Field(sequential=True,
                        tokenize=self.tokenize,
                        batch_first=True,
                        fix_length=self.fix_length,
                        eos_token="<eos>")

    # Data processing
    self._proprecess()  # read the corpus and convert it to a .json format torchtext can parse
    self.dataset = self._build_dataset()  # load the processed data into a torchtext Dataset
    self.vectors = Vectors(name=self.vector_path,
                           cache=FILE_PATH + "/temp")  # load the word2vec vectors
    self._build_vocab()   # build the vocabulary mappings
    self._build_vector()  # build the word-vector mappings

    self.stoi = self.ENCODER.vocab.stoi  # word -> id mapping
    self.itos = self.ENCODER.vocab.itos  # id -> word mapping
    self.vocab_size = len(self.ENCODER.vocab)  # vocabulary size
    self.vector_dim = self.vectors.dim  # word-vector dimensionality
    self.vector_weights = self.ENCODER.vocab.vectors  # word-vector weights
    # Reassign: with "<eos>" and the other special tokens added, the actual
    # vocabulary is larger than the original target_vocab_size
    self.target_vocab_size = len(self.TARGET.vocab)

    # Iterator that generates batches during training
    self.data_iter = BucketIterator(
        self.dataset,
        batch_size=self.batch_size,
        shuffle=True,  # shuffle the original data order
        device=self.DEVICE)
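# --- Hypothetical usage sketch: the class name LyricsData is an assumption (the
# snippet only shows __init__), as is the embedding wiring below.
import torch.nn as nn
loader = LyricsData(batch_size=64)  # hypothetical class name
embedding = nn.Embedding(loader.vocab_size, loader.vector_dim)
embedding.weight.data.copy_(loader.vector_weights)  # copy the word2vec rows aligned to the vocab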
def load_data(opt):
    # no fix_length is set here
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # words or characters
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load from word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    # debug overrides: point every split at the same local file
    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path, text_field=TEXT, label_field=LABEL,
                         text_type=opt.text_type, test=False, aug=opt.aug)
    val = GrandDataset(val_path, text_field=TEXT, label_field=LABEL,
                       text_type=opt.text_type, test=False)
    test = GrandDataset(test_path, text_field=TEXT, label_field=None,
                        text_type=opt.text_type, test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type,
                                           opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('load word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens missing from the vectors

    # Build the vocabulary
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # Build the iterators.
    # For test_iter, shuffle, sort and repeat must all be False, otherwise torchtext
    # scrambles the sample order. For variable-length inputs, set
    # sort_within_batch=True so that each batch is sorted in descending order by sort_key.
    train_iter = data.BucketIterator(dataset=train, batch_size=opt.batch_size,
                                     shuffle=True, sort_within_batch=False,
                                     repeat=False, device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size,
    #                                sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True,
    #                            repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val, batch_size=opt.batch_size, shuffle=False,
                             sort=False, repeat=False, device=opt.device)
    test_iter = data.Iterator(dataset=test, batch_size=opt.batch_size, shuffle=False,
                              sort=False, repeat=False, device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
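# --- Consumption sketch (assumes opt is configured as above and that GrandDataset
# exposes 'text' and 'label' fields):
train_iter, val_iter, test_iter, vocab_size, embedding_vectors = load_data(opt)
for batch in train_iter:
    x, y = batch.text, batch.label  # field names assumed from GrandDataset
    break
print(vocab_size, embedding_vectors.shape)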
def load_data(batch_size, device):
    # label
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    # text
    SEN1 = data.Field(sequential=True, tokenize=tokenizer, fix_length=50,
                      lower=True, batch_first=True)
    SEN2 = data.Field(sequential=True, tokenize=tokenizer, fix_length=50,
                      lower=True, batch_first=True)

    # Build the DataSets
    train, valid = data.TabularDataset.splits(
        path='./snli_1.0/',
        skip_header=True,
        train="train4.csv",
        validation="dev3.csv",
        format='csv',
        fields=[("label", LABEL), ("sentence1", SEN1), ("sentence2", SEN2)],
    )
    test = data.TabularDataset(
        path='./snli_1.0/test3.csv',
        skip_header=True,
        format='csv',
        fields=[("sentence1", SEN1), ("sentence2", SEN2)],
    )

    # Build the vocabulary over both sentence fields and share it
    SEN1.build_vocab(train.sentence1, train.sentence2,
                     vectors=Vectors(name='/data/yinli/dataset/glove.840B.300d.txt'))
    SEN2.vocab = SEN1.vocab

    # Build the iterators
    train_iter = data.BucketIterator(train,
                                     sort_key=lambda x: len(x.sentence1),
                                     sort_within_batch=False,
                                     shuffle=True,
                                     batch_size=batch_size,
                                     repeat=False,
                                     device=device)
    valid_iter = data.Iterator(valid, sort=False, shuffle=False,
                               sort_within_batch=False, batch_size=batch_size,
                               repeat=False, train=False, device=device)
    test_iter = data.Iterator(test, sort=False, shuffle=False,
                              sort_within_batch=False, batch_size=batch_size,
                              repeat=False, train=False, device=device)

    return train_iter, valid_iter, test_iter, SEN1.vocab, SEN2.vocab


# An earlier variant of load_data (kept for reference) built separate vocabularies
# for SEN1 and SEN2 from train2.csv, each loading the GloVe vectors independently,
# built a LABEL vocabulary with LABEL.build_vocab(train), and returned only
# train_iter plus the two vocabularies.

# Scratch code for sanity-checking the iterators (kept for reference): it iterated
# one batch, printed the label/sentence tensors, looked up 'frown' and '<unk>' in
# both vocabularies, and concatenated the two embedding matrices with
# torch.cat((sentence2_vocab.vectors, sentence1_vocab.vectors[2:]), 0).
# label2id = {'<unk>': 0, '<pad>': 1, 'neutral': 2, 'contradiction': 3, 'entailment': 4}
def __init__(self, qid_path, train_path, test_path, word_path, char_path,
             num_folds=10, batch_size=32, seed=2018):
    question_df = pd.read_csv(qid_path)
    question_df = question_df.set_index('qid')
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    self.num_folds = num_folds
    self.batch_size = batch_size
    self.seed = seed
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    train_df['q1_wid'] = train_df['qid1'].apply(lambda qid: question_df.loc[qid]['wid'])
    train_df['q2_wid'] = train_df['qid2'].apply(lambda qid: question_df.loc[qid]['wid'])
    train_df['q1_cid'] = train_df['qid1'].apply(lambda qid: question_df.loc[qid]['cid'])
    train_df['q2_cid'] = train_df['qid2'].apply(lambda qid: question_df.loc[qid]['cid'])
    self.train_df = train_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid', 'label']]

    test_df['q1_wid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['wid'])
    test_df['q2_wid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['wid'])
    test_df['q1_cid'] = test_df['qid1'].apply(lambda qid: question_df.loc[qid]['cid'])
    test_df['q2_cid'] = test_df['qid2'].apply(lambda qid: question_df.loc[qid]['cid'])
    self.test_df = test_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid']]

    self.word_embedding_path = word_path
    self.char_embedding_path = char_path
    cache = '../cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    self.word_vectors = Vectors(self.word_embedding_path, cache)
    self.char_vectors = Vectors(self.char_embedding_path, cache)
    self.word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
    self.char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)

    self.wordTEXT = data.Field(batch_first=True)
    self.charTEXT = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False, use_vocab=False, dtype=torch.float)

    train_dataset = self.generate_dataset()
    test_dataset = self.generate_dataset(role='test')
    self.wordTEXT.build_vocab(train_dataset, test_dataset, min_freq=1,
                              vectors=self.word_vectors)
    self.charTEXT.build_vocab(train_dataset, test_dataset, min_freq=1,
                              vectors=self.char_vectors)
    self.word_embedding = self.wordTEXT.vocab.vectors
    self.char_embedding = self.charTEXT.vocab.vectors
import collections

import gensim
from torchtext.vocab import Vectors, Vocab

model = gensim.models.KeyedVectors.load_word2vec_format('input/vector.bin', binary=True)
print(model['中国'])

# word2vec stored in a human-readable (plain-text) format;
# word_vector, wv_path and words are assumed to be defined elsewhere
vectors = Vectors(word_vector, cache=wv_path)
vocab = Vocab(collections.Counter(words),
              vectors=vectors,
              specials=['<pad>', '<unk>'],
              min_freq=1)
wv_size = vocab.vectors.size()
vocab.stoi['<unk>']
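# --- Sketch of the alignment Vocab performs: for a word that occurs both in `words`
# and in the vector file, its row in vocab.vectors equals the raw Vectors lookup.
import torch
w = '中国'  # assumed to be present in both
assert torch.equal(vocab.vectors[vocab.stoi[w]], vectors[w])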
def run(self):
    print("Running on", self.a.device)
    self.set_device(self.a.device)
    np.random.seed(self.a.seed)
    torch.manual_seed(self.a.seed)

    # create training set
    if self.a.train:
        log('loading corpus from %s' % self.a.train)

    WordsField = Field(lower=True, include_lengths=True, batch_first=True)
    PosTagsField = Field(lower=True, batch_first=True)
    EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
    AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    LabelField = Field(lower=False, batch_first=True, pad_token=None, unk_token=None)
    EventsField = EventField(lower=False, batch_first=True)
    EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)

    ee_fields = {"words": ("WORDS", WordsField),
                 "pos-tags": ("POSTAGS", PosTagsField),
                 "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                 "stanford-colcc": ("ADJM", AdjMatrixField),
                 "golden-event-mentions": ("LABEL", LabelField),
                 "all-events": ("EVENT", EventsField),
                 "all-entities": ("ENTITIES", EntitiesField)}

    train_set = ACE2005Dataset(path=self.a.train, fields=ee_fields, keep_events=1)
    dev_set = ACE2005Dataset(path=self.a.dev, fields=ee_fields, keep_events=0)
    test_set = ACE2005Dataset(path=self.a.test, fields=ee_fields, keep_events=0)

    if self.a.webd:
        pretrained_embedding = Vectors(self.a.webd, ".",
                                       unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
        WordsField.build_vocab(train_set.WORDS, dev_set.WORDS, vectors=pretrained_embedding)
    else:
        WordsField.build_vocab(train_set.WORDS, dev_set.WORDS)
    PosTagsField.build_vocab(train_set.POSTAGS, dev_set.POSTAGS)
    EntityLabelsField.build_vocab(train_set.ENTITYLABELS, dev_set.ENTITYLABELS)
    LabelField.build_vocab(train_set.LABEL, dev_set.LABEL)
    EventsField.build_vocab(train_set.EVENT, dev_set.EVENT)

    consts.O_LABEL = LabelField.vocab.stoi["O"]
    # print("O label is", consts.O_LABEL)
    consts.ROLE_O_LABEL = EventsField.vocab.stoi["OTHER"]
    # print("O label for AE is", consts.ROLE_O_LABEL)

    dev_set1 = ACE2005Dataset(path=self.a.dev, fields=ee_fields,
                              keep_events=1, only_keep=True)
    test_set1 = ACE2005Dataset(path=self.a.test, fields=ee_fields,
                               keep_events=1, only_keep=True)

    print("dev set length", len(dev_set))
    print("dev set 1/1 length", len(dev_set1))
    print("test set length", len(test_set))
    print("test set 1/1 length", len(test_set1))

    self.a.label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
    self.a.label_weight[consts.O_LABEL] = 1.0

    self.a.hps = eval(self.a.hps)
    if "wemb_size" not in self.a.hps:
        self.a.hps["wemb_size"] = len(WordsField.vocab.itos)
    if "pemb_size" not in self.a.hps:
        self.a.hps["pemb_size"] = len(PosTagsField.vocab.itos)
    if "psemb_size" not in self.a.hps:
        self.a.hps["psemb_size"] = max([train_set.longest(), dev_set.longest(),
                                        test_set.longest()]) + 2
    if "eemb_size" not in self.a.hps:
        self.a.hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
    if "oc" not in self.a.hps:
        self.a.hps["oc"] = len(LabelField.vocab.itos)
    if "ae_oc" not in self.a.hps:
        self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

    tester = self.get_tester(LabelField.vocab.itos)

    if self.a.finetune:
        log('init model from ' + self.a.finetune)
        model = self.load_model(self.a.finetune)
        log('model loaded, there are %i sets of params'
            % len(model.parameters_requires_grads()))
    else:
        model = self.load_model(None)
        log('model created from scratch, there are %i sets of params'
            % len(model.parameters_requires_grads()))

    if self.a.optimizer == "adadelta":
        optimizer_constructor = partial(torch.optim.Adadelta,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay)
    elif self.a.optimizer == "adam":
        optimizer_constructor = partial(torch.optim.Adam,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay)
    else:
        optimizer_constructor = partial(torch.optim.SGD,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay,
                                        momentum=0.9)
    log('optimizer in use: %s' % str(self.a.optimizer))

    if not os.path.exists(self.a.out):
        os.mkdir(self.a.out)
    with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
        pickle.dump(WordsField.vocab, f)
    with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
        pickle.dump(PosTagsField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
        pickle.dump(EntityLabelsField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
        pickle.dump(LabelField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
        pickle.dump(EventsField.vocab.stoi, f)
    log('init complete\n')

    self.a.word_i2s = WordsField.vocab.itos
    self.a.label_i2s = LabelField.vocab.itos
    self.a.role_i2s = EventsField.vocab.itos
    writer = SummaryWriter(os.path.join(self.a.out, "exp"))
    self.a.writer = writer

    train(
        model=model,
        train_set=train_set,
        dev_set=dev_set,
        test_set=test_set,
        optimizer_constructor=optimizer_constructor,
        epochs=self.a.epochs,
        tester=tester,
        parser=self.a,
        other_testsets={
            "dev 1/1": dev_set1,
            "test 1/1": test_set1,
        }
    )
    log('Done!')
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))
labels = [ex.label for ex in train.examples]

train_iter, _, _ = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=args.bsz, device=-1, repeat=False)
_, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=10, device=-1)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
# simple_vec = TEXT.vocab.vectors.clone()
# url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec'
# TEXT.vocab.load_vectors(vectors=Vectors('wiki.en.vec', url=url))
# complex_vec = TEXT.vocab.vectors


def output_test(model):
    "All models should be able to be run with the following command."
    upload = []
    loss.reduce = False
    for batch in test_iter:
        # Your prediction data here (don't cheat!)
        x = batch.text
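# --- Coverage check sketch: with the default unk_init, words missing from
# wiki.simple.vec keep all-zero rows, so counting non-zero rows shows how much of
# the vocabulary the pretrained file covered.
covered = (TEXT.vocab.vectors.abs().sum(dim=1) != 0).sum().item()
print('pretrained coverage: {}/{}'.format(covered, len(TEXT.vocab)))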
def run(self):
    print("Running on", self.a.device)
    self.set_device(self.a.device)
    np.random.seed(self.a.seed)
    torch.manual_seed(self.a.seed)
    torch.backends.cudnn.benchmark = True

    #################### loading event extraction dataset ####################
    if self.a.train_ee:
        log('loading event extraction corpus from %s' % self.a.train_ee)

    # both for grounding and ee
    WordsField = Field(lower=True, include_lengths=True, batch_first=True)
    PosTagsField = Field(lower=True, batch_first=True)
    EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
    AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)

    # only for ee
    LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
    EventsField = EventField(lower=False, batch_first=True)
    SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)

    if self.a.amr:
        colcc = 'simple-parsing'
    else:
        colcc = 'combined-parsing'
    print(colcc)

    ee_fields = {"sentence_id": ("SENTID", SENTIDField),
                 "words": ("WORDS", WordsField),
                 "pos-tags": ("POSTAGS", PosTagsField),
                 "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                 colcc: ("ADJM", AdjMatrixField),
                 "golden-event-mentions": ("LABEL", LabelField),
                 "all-events": ("EVENT", EventsField),
                 "all-entities": ("ENTITIES", EntitiesField)}

    train_ee_set = ACE2005Dataset(path=self.a.train_ee, fields=ee_fields,
                                  amr=self.a.amr, keep_events=1)
    dev_ee_set = ACE2005Dataset(path=self.a.dev_ee, fields=ee_fields,
                                amr=self.a.amr, keep_events=0)
    test_ee_set = ACE2005Dataset(path=self.a.test_ee, fields=ee_fields,
                                 amr=self.a.amr, keep_events=0)

    if self.a.webd:
        pretrained_embedding = Vectors(self.a.webd, ".",
                                       unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
        LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL,
                               vectors=pretrained_embedding)
        EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT,
                                vectors=pretrained_embedding)
    else:
        LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
        EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)

    # add role mask
    self.a.role_mask = event_role_mask(self.a.train_ee, self.a.dev_ee,
                                       LabelField.vocab.stoi,
                                       EventsField.vocab.stoi, self.device)

    #################### loading SR dataset ####################
    # both for grounding and sr
    if self.a.train_sr:
        log('loading corpus from %s' % self.a.train_sr)

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])

    vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
    vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
    vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

    # The imsitu_loader-based train/dev/test SR loaders are disabled here in favour
    # of the ImSituDataset objects below.
    train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                 LabelField.vocab.stoi, EventsField.vocab.stoi,
                                 self.a.imsitu_ontology_file, self.a.train_sr,
                                 self.a.verb_mapping_file, self.a.object_class_map_file,
                                 self.a.object_detection_pkl_file,
                                 self.a.object_detection_threshold, transform,
                                 filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                 load_object=self.a.add_object,
                                 filter_place=self.a.filter_place)
    dev_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                               LabelField.vocab.stoi, EventsField.vocab.stoi,
                               self.a.imsitu_ontology_file, self.a.dev_sr,
                               self.a.verb_mapping_file, self.a.object_class_map_file,
                               self.a.object_detection_pkl_file,
                               self.a.object_detection_threshold, transform,
                               filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                               load_object=self.a.add_object,
                               filter_place=self.a.filter_place)
    test_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                LabelField.vocab.stoi, EventsField.vocab.stoi,
                                self.a.imsitu_ontology_file, self.a.test_sr,
                                self.a.verb_mapping_file, self.a.object_class_map_file,
                                self.a.object_detection_pkl_file,
                                self.a.object_detection_threshold, transform,
                                filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                load_object=self.a.add_object,
                                filter_place=self.a.filter_place)

    #################### loading grounding dataset ####################
    if self.a.train_grounding:
        log('loading grounding corpus from %s' % self.a.train_grounding)

    # only for grounding
    IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True)

    grounding_fields = {"id": ("IMAGEID", IMAGEIDField),
                        "sentence_id": ("SENTID", SENTIDField),
                        "words": ("WORDS", WordsField),
                        "pos-tags": ("POSTAGS", PosTagsField),
                        "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                        colcc: ("ADJM", AdjMatrixField),
                        "all-entities": ("ENTITIES", EntitiesField)}
                        # "image": ("IMAGE", IMAGEField) stays disabled

    train_grounding_set = GroundingDataset(path=self.a.train_grounding,
                                           img_dir=self.a.img_dir_grounding,
                                           fields=grounding_fields,
                                           transform=transform,
                                           amr=self.a.amr,
                                           load_object=self.a.add_object,
                                           object_ontology_file=self.a.object_class_map_file,
                                           object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                           object_detection_threshold=self.a.object_detection_threshold)
    dev_grounding_set = GroundingDataset(path=self.a.dev_grounding,
                                         img_dir=self.a.img_dir_grounding,
                                         fields=grounding_fields,
                                         transform=transform,
                                         amr=self.a.amr,
                                         load_object=self.a.add_object,
                                         object_ontology_file=self.a.object_class_map_file,
                                         object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                         object_detection_threshold=self.a.object_detection_threshold)
    test_grounding_set = GroundingDataset(path=self.a.test_grounding,
                                          img_dir=self.a.img_dir_grounding,
                                          fields=grounding_fields,
                                          transform=transform,
                                          amr=self.a.amr,
                                          load_object=self.a.add_object,
                                          object_ontology_file=self.a.object_class_map_file,
                                          object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                          object_detection_threshold=self.a.object_detection_threshold)

    #################### build vocabulary ####################
    if self.a.webd:
        pretrained_embedding = Vectors(self.a.webd, ".",
                                       unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS,
                               train_grounding_set.WORDS, dev_grounding_set.WORDS,
                               vectors=pretrained_embedding)
    else:
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS,
                               train_grounding_set.WORDS, dev_grounding_set.WORDS)
    PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS,
                             train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS)
    EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS,
                                  train_grounding_set.ENTITYLABELS,
                                  dev_grounding_set.ENTITYLABELS)

    consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
    # print("O label is", consts.O_LABEL)
    consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
    # print("O label for AE is", consts.ROLE_O_LABEL)

    dev_ee_set1 = ACE2005Dataset(path=self.a.dev_ee, fields=ee_fields,
                                 amr=self.a.amr, keep_events=1, only_keep=True)
    test_ee_set1 = ACE2005Dataset(path=self.a.test_ee, fields=ee_fields,
                                  amr=self.a.amr, keep_events=1, only_keep=True)

    print("train set length", len(train_ee_set))
    print("dev set length", len(dev_ee_set))
    print("dev set 1/1 length", len(dev_ee_set1))
    print("test set length", len(test_ee_set))
    print("test set 1/1 length", len(test_ee_set1))

    # sr model initialization
    if not self.a.sr_hps_path:
        self.a.sr_hps = eval(self.a.sr_hps)
    embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
    embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
    embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)
    if "wvemb_size" not in self.a.sr_hps:
        self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word)
    if "wremb_size" not in self.a.sr_hps:
        self.a.sr_hps["wremb_size"] = len(vocab_role.id2word)
    if "wnemb_size" not in self.a.sr_hps:
        self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word)

    self.a.ee_label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
    self.a.ee_label_weight[consts.O_LABEL] = 1.0
    self.a.ee_arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5

    self.a.ee_hps = eval(self.a.ee_hps)
    if "wemb_size" not in self.a.ee_hps:
        self.a.ee_hps["wemb_size"] = len(WordsField.vocab.itos)
    if "pemb_size" not in self.a.ee_hps:
        self.a.ee_hps["pemb_size"] = len(PosTagsField.vocab.itos)
    if "psemb_size" not in self.a.ee_hps:
        # self.a.ee_hps["psemb_size"] = max([train_grounding_set.longest(),
        #     dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
        self.a.ee_hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(),
                                           test_ee_set.longest(),
                                           train_grounding_set.longest(),
                                           dev_grounding_set.longest(),
                                           test_grounding_set.longest()]) + 2
    if "eemb_size" not in self.a.ee_hps:
        self.a.ee_hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
    if "oc" not in self.a.ee_hps:
        self.a.ee_hps["oc"] = len(LabelField.vocab.itos)
    if "ae_oc" not in self.a.ee_hps:
        self.a.ee_hps["ae_oc"] = len(EventsField.vocab.itos)
    if "oc" not in self.a.sr_hps:
        self.a.sr_hps["oc"] = len(LabelField.vocab.itos)
    if "ae_oc" not in self.a.sr_hps:
        self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos)

    ee_tester = EDTester(LabelField.vocab.itos, EventsField.vocab.itos,
                         self.a.ignore_time_test)
    sr_tester = SRTester()
    g_tester = GroundingTester()
    j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test)

    ace_classifier = ACEClassifier(2 * self.a.ee_hps["lstm_dim"],
                                   self.a.ee_hps["oc"], self.a.ee_hps["ae_oc"],
                                   self.device)

    if self.a.finetune_ee:
        log('init ee model from ' + self.a.finetune_ee)
        ee_model = load_ee_model(self.a.ee_hps, self.a.finetune_ee,
                                 WordsField.vocab.vectors, self.device, ace_classifier)
        log('ee model loaded, there are %i sets of params'
            % len(ee_model.parameters_requires_grads()))
    else:
        ee_model = load_ee_model(self.a.ee_hps, None, WordsField.vocab.vectors,
                                 self.device, ace_classifier)
        log('ee model created from scratch, there are %i sets of params'
            % len(ee_model.parameters_requires_grads()))

    if self.a.finetune_sr:
        log('init sr model from ' + self.a.finetune_sr)
        sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                 embeddingMatrix_role, self.a.finetune_sr, self.device,
                                 ace_classifier, add_object=self.a.add_object,
                                 load_partial=True)
        log('sr model loaded, there are %i sets of params'
            % len(sr_model.parameters_requires_grads()))
    else:
        sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                 embeddingMatrix_role, None, self.device,
                                 ace_classifier, add_object=self.a.add_object,
                                 load_partial=True)
        log('sr model created from scratch, there are %i sets of params'
            % len(sr_model.parameters_requires_grads()))

    model = GroundingModel(ee_model, sr_model, self.get_device())
    # ee_model = torch.nn.DataParallel(ee_model)
    # sr_model = torch.nn.DataParallel(sr_model)
    # model = torch.nn.DataParallel(model)

    if self.a.optimizer == "adadelta":
        optimizer_constructor = partial(torch.optim.Adadelta,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay)
    elif self.a.optimizer == "adam":
        optimizer_constructor = partial(torch.optim.Adam,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay)
    else:
        optimizer_constructor = partial(torch.optim.SGD,
                                        params=model.parameters_requires_grads(),
                                        weight_decay=self.a.l2decay,
                                        momentum=0.9)
    log('optimizer in use: %s' % str(self.a.optimizer))

    if not os.path.exists(self.a.out):
        os.mkdir(self.a.out)
    with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
        pickle.dump(WordsField.vocab, f)
    with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
        pickle.dump(PosTagsField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
        pickle.dump(EntityLabelsField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
        pickle.dump(LabelField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
        pickle.dump(EventsField.vocab.stoi, f)
    with open(os.path.join(self.a.out, "ee_hyps.json"), "w") as f:
        json.dump(self.a.ee_hps, f)
    with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f:
        json.dump(self.a.sr_hps, f)
    log('init complete\n')

    # ee mappings
    self.a.ee_word_i2s = WordsField.vocab.itos
    self.a.ee_label_i2s = LabelField.vocab.itos
    self.a.ee_role_i2s = EventsField.vocab.itos
    # sr mappings
    self.a.sr_word_i2s = vocab_noun.id2word
    self.a.sr_label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
    self.a.sr_role_i2s = vocab_role.id2word

    writer = SummaryWriter(os.path.join(self.a.out, "exp"))
    self.a.writer = writer

    joint_train(
        model_ee=ee_model,
        model_sr=sr_model,
        model_g=model,
        train_set_g=train_grounding_set,
        dev_set_g=dev_grounding_set,
        test_set_g=test_grounding_set,
        train_set_ee=train_ee_set,
        dev_set_ee=dev_ee_set,
        test_set_ee=test_ee_set,
        train_set_sr=train_sr_set,
        dev_set_sr=dev_sr_set,
        test_set_sr=test_sr_set,
        optimizer_constructor=optimizer_constructor,
        epochs=self.a.epochs,
        ee_tester=ee_tester,
        sr_tester=sr_tester,
        g_tester=g_tester,
        j_tester=j_tester,
        parser=self.a,
        other_testsets={
            "dev ee 1/1": dev_ee_set1,
            "test ee 1/1": test_ee_set1,
        },
        transform=transform,
        vocab_objlabel=vocab_noun.word2id
    )
    log('Done!')
def load_data(self, w2v_file, train_file, test_file, val_file=None):
    '''
    Reads the data from files and builds the iterators, vocabulary and embeddings.
    Inputs:
        w2v_file (String): pre-trained word-vector file (GloVe/Word2Vec)
        train_file (String): path to the training data
        test_file (String): path to the test data
        val_file (String): path to the validation data
    '''
    tokenizer = lambda sent: [
        x for x in nltk.word_tokenize(sent) if x != " "
    ]  # list comprehension that filters out " " tokens

    # Create the Field objects
    TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True,
                      fix_length=self.config.max_sen_len)
    # sequential must be False for LABEL; since the label is already an integer,
    # it needs no numericalization, hence use_vocab=False
    LABEL = data.Field(sequential=False, use_vocab=False)
    datafields = [("text", TEXT), ("label", LABEL)]

    # Load data from pd.DataFrame into torchtext.data.Dataset
    train_df = self.get_pandas_df(train_file)
    train_examples = [
        data.Example.fromlist(i, datafields) for i in train_df.values.tolist()
    ]  # build training examples
    train_data = data.Dataset(train_examples, datafields)

    test_df = self.get_pandas_df(test_file)
    test_examples = [
        data.Example.fromlist(i, datafields) for i in test_df.values.tolist()
    ]  # build test examples
    test_data = data.Dataset(test_examples, datafields)

    # Build the validation set
    if val_file:
        val_df = self.get_pandas_df(val_file)
        val_examples = [
            data.Example.fromlist(i, datafields) for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)
    else:
        train_data, val_data = train_data.split(split_ratio=0.8)  # split off 20% of train

    # Load the pre-trained word embeddings
    TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
    self.word_embeddings = TEXT.vocab.vectors
    self.vocab = TEXT.vocab

    # Iterator over the training data
    self.train_iterator = data.BucketIterator(
        train_data,
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    # Iterators over the validation and test data
    self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        (val_data, test_data),
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=False)

    print("Loaded {} training examples".format(len(train_data)))
    print("Loaded {} test examples".format(len(test_data)))
    print("Loaded {} validation examples".format(len(val_data)))
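# Quick illustration (not part of the original code) of what a loaded Vectors object
# provides; the GloVe file name below is an assumption for the example.
from torchtext.vocab import Vectors
vecs = Vectors('glove.6B.300d.txt')
print(vecs.dim, len(vecs.itos))  # embedding dimension, number of pretrained tokens
print(vecs['the'].shape)         # torch.Size([300]); words missing from the file map to zero vectors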
TEXT = Field(sequential=True, tokenize=spacy_tok, lower=True)
tst_datafields = [("comment_text", TEXT)]
tst = TabularDataset(path=data_review_csv_path, format='csv',
                     skip_header=True, fields=tst_datafields)
novel_datafields = [("novel", TEXT)]
novel = TabularDataset(path=data_novelty_csv_path, format='csv',
                       skip_header=True, fields=novel_datafields)
cache = '.vector_cache'
vectors = Vectors(name=glove_path, cache=cache)
TEXT.build_vocab(tst, vectors=vectors)
data_iter = Iterator(tst, batch_size=1, device=-1, sort=False,
                     sort_within_batch=False, repeat=False, shuffle=False)
novel_iter = Iterator(novel, batch_size=1, device=-1, sort=False,
                      sort_within_batch=False, repeat=False, shuffle=False)
def run(self):
    print("Running on", self.a.device)
    self.set_device(self.a.device)

    np.random.seed(self.a.seed)
    torch.manual_seed(self.a.seed)
    torch.backends.cudnn.benchmark = True

    # build text event vocab and ee_role vocab
    WordsField = Field(lower=True, include_lengths=True, batch_first=True)
    PosTagsField = Field(lower=True, batch_first=True)
    EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
    AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)
    # only for ee
    LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
    EventsField = EventField(lower=False, batch_first=True)
    colcc = 'stanford-colcc'

    train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=False, keep_events=1)

    pretrained_embedding = Vectors(self.a.webd, ".",
                                   unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
    LabelField.build_vocab(train_ee_set.LABEL, vectors=pretrained_embedding)
    EventsField.build_vocab(train_ee_set.EVENT, vectors=pretrained_embedding)
    # consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
    # print("O label is", consts.O_LABEL)
    # consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
    # print("O label for AE is", consts.ROLE_O_LABEL)

    # create testing set
    if self.a.test_sr:
        log('loading corpus from %s' % self.a.test_sr)

    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])

    vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
    vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
    vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

    # train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
    #                              EventsField.vocab.stoi, LabelField.vocab.stoi,
    #                              self.a.imsitu_ontology_file, self.a.train_sr, self.a.verb_mapping_file,
    #                              self.a.object_class_map_file, self.a.object_detection_pkl_file,
    #                              self.a.object_detection_threshold, transform,
    #                              filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
    #                              load_object=self.a.add_object, filter_place=self.a.filter_place)
    # dev_sr_set = ImSituDataset(...)  # same arguments as above with self.a.dev_sr
    test_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                EventsField.vocab.stoi, LabelField.vocab.stoi,
                                self.a.imsitu_ontology_file, self.a.test_sr, self.a.verb_mapping_file,
                                self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                self.a.object_detection_threshold, transform,
                                filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                load_object=self.a.add_object, filter_place=self.a.filter_place)

    embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
    embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
    embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)

    # consts.O_LABEL = vocab_verb.word2id['0']  # verb??
    # consts.ROLE_O_LABEL = vocab_role.word2id["OTHER"]  # ???
    # self.a.label_weight = torch.ones([len(vocab_sr.id2word)]) * 5  # more important to learn
    # self.a.label_weight[consts.O_LABEL] = 1.0  # ???

    if not self.a.hps_path:
        self.a.hps = eval(self.a.hps)
    if self.a.textontology:
        if "wvemb_size" not in self.a.hps:
            self.a.hps["wvemb_size"] = len(LabelField.vocab.stoi)
        if "wremb_size" not in self.a.hps:
            self.a.hps["wremb_size"] = len(EventsField.vocab.itos)
        if "wnemb_size" not in self.a.hps:
            self.a.hps["wnemb_size"] = len(vocab_noun.id2word)
        if "oc" not in self.a.hps:
            self.a.hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.hps:
            self.a.hps["ae_oc"] = len(EventsField.vocab.itos)
    else:
        if "wvemb_size" not in self.a.hps:
            self.a.hps["wvemb_size"] = len(vocab_verb.id2word)
        if "wremb_size" not in self.a.hps:
            self.a.hps["wremb_size"] = len(vocab_role.id2word)
        if "wnemb_size" not in self.a.hps:
            self.a.hps["wnemb_size"] = len(vocab_noun.id2word)
        if "oc" not in self.a.hps:
            self.a.hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.hps:
            self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

    tester = self.get_tester()

    if self.a.textontology:
        if self.a.finetune:
            log('init model from ' + self.a.finetune)
            model = load_sr_model(self.a.hps, embeddingMatrix_noun, LabelField.vocab.vectors,
                                  EventsField.vocab.vectors, self.a.finetune, self.device,
                                  add_object=self.a.add_object)
            log('sr model loaded, there are %i sets of params' % len(model.parameters_requires_grads()))
        else:
            model = load_sr_model(self.a.hps, embeddingMatrix_noun, LabelField.vocab.vectors,
                                  EventsField.vocab.vectors, None, self.device,
                                  add_object=self.a.add_object)
            log('sr model created from scratch, there are %i sets of params' % len(model.parameters_requires_grads()))
    else:
        if self.a.finetune:
            log('init model from ' + self.a.finetune)
            model = load_sr_model(self.a.hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                  embeddingMatrix_role, self.a.finetune, self.device,
                                  add_object=self.a.add_object)
            log('sr model loaded, there are %i sets of params' % len(model.parameters_requires_grads()))
        else:
            model = load_sr_model(self.a.hps, embeddingMatrix_noun, embeddingMatrix_verb,
                                  embeddingMatrix_role, None, self.device,
                                  add_object=self.a.add_object)
            log('sr model created from scratch, there are %i sets of params' % len(model.parameters_requires_grads()))

    # for name, para in model.named_parameters():
    #     if para.requires_grad:
    #         print(name)
    # exit(1)
    log('init complete\n')

    if not os.path.exists(self.a.out):
        os.mkdir(self.a.out)

    self.a.word_i2s = vocab_noun.id2word
    # if self.a.textontology:
    self.a.acelabel_i2s = LabelField.vocab.itos
    self.a.acerole_i2s = EventsField.vocab.itos
    # with open(os.path.join(self.a.out, "label_s2i.vec"), "wb") as f:
    #     pickle.dump(LabelField.vocab.stoi, f)
    # with open(os.path.join(self.a.out, "role_s2i.vec"), "wb") as f:
    #     pickle.dump(EventsField.vocab.stoi, f)
    # with open(os.path.join(self.a.out, "label_i2s.vec"), "wb") as f:
    #     pickle.dump(LabelField.vocab.itos, f)
    # with open(os.path.join(self.a.out, "role_i2s.vec"), "wb") as f:
    #     pickle.dump(EventsField.vocab.itos, f)
    # else:
    self.a.label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
    self.a.role_i2s = vocab_role.id2word
    # save as Vocab
    writer = SummaryWriter(os.path.join(self.a.out, "exp"))
    self.a.writer = writer

    # with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f:
    #     json.dump(self.a.hps, f)

    test_iter = torch.utils.data.DataLoader(dataset=test_sr_set, batch_size=self.a.batch,
                                            shuffle=False, num_workers=2,
                                            collate_fn=image_collate_fn)
    verb_roles = test_sr_set.get_verb_role_mapping()

    if 'visualize_path' not in self.a:
        visualize_path = None
    else:
        visualize_path = self.a.visualize_path

    test_loss, test_verb_p, test_verb_r, test_verb_f1, \
    test_role_p, test_role_r, test_role_f1, \
    test_noun_p, test_noun_r, test_noun_f1, \
    test_triple_p, test_triple_r, test_triple_f1, \
    test_noun_p_relaxed, test_noun_r_relaxed, test_noun_f1_relaxed, \
    test_triple_p_relaxed, test_triple_r_relaxed, test_triple_f1_relaxed = \
        run_over_data_sr(data_iter=test_iter,
                         optimizer=None,
                         model=model,
                         need_backward=False,
                         MAX_STEP=ceil(len(test_sr_set) / self.a.batch),
                         tester=tester,
                         hyps=model.hyperparams,
                         device=model.device,
                         maxnorm=self.a.maxnorm,
                         word_i2s=self.a.word_i2s,
                         label_i2s=self.a.label_i2s,
                         role_i2s=self.a.role_i2s,
                         verb_roles=verb_roles,
                         load_object=self.a.add_object,
                         visualize_path=visualize_path,
                         save_output=os.path.join(self.a.out, "test_final.txt"))

    print("\nFinal test loss: ", test_loss,
          "\ntest verb p: ", test_verb_p, " test verb r: ", test_verb_r, " test verb f1: ", test_verb_f1,
          "\ntest role p: ", test_role_p, " test role r: ", test_role_r, " test role f1: ", test_role_f1,
          "\ntest noun p: ", test_noun_p, " test noun r: ", test_noun_r, " test noun f1: ", test_noun_f1,
          "\ntest triple p: ", test_triple_p, " test triple r: ", test_triple_r, " test triple f1: ", test_triple_f1,
          "\ntest noun p relaxed: ", test_noun_p_relaxed, " test noun r relaxed: ", test_noun_r_relaxed,
          " test noun f1 relaxed: ", test_noun_f1_relaxed,
          "\ntest triple p relaxed: ", test_triple_p_relaxed, " test triple r relaxed: ", test_triple_r_relaxed,
          " test triple f1 relaxed: ", test_triple_f1_relaxed)
batch_size = 64
embedding_dim = 300
hidden_size = 128
n_filters = 200
filters_sizes = [2, 3, 4, 5]
sentence_max_len = 400
output_dim = 2
dropout = 0.5
num_epochs = 50
device = torch.device("cuda:5")
lr = 0.0001

if not os.path.exists('.vector_cache'):
    os.mkdir('.vector_cache')
vectors = Vectors(name='./glove.840B.300d.txt')

def tokenizer(text):
    # return [tok.text for tok in spacy_en.tokenize(text)]
    return [tok for tok in nltk.word_tokenize(text)]

TEXT = data.Field(sequential=True, stop_words=None, tokenize=tokenizer, lower=True,
                  fix_length=sentence_max_len, batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
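# Hedged continuation sketch (not in the original snippet): one typical way to wire the
# fields and GloVe vectors above into a dataset and iterator. The csv path and column
# layout ('train.csv' with text,label columns) are assumptions.
train_ds = data.TabularDataset(path='train.csv', format='csv',
                               fields=[('text', TEXT), ('label', LABEL)], skip_header=True)
TEXT.build_vocab(train_ds, vectors=vectors)  # embedding rows are aligned with TEXT.vocab.itos
train_iter = data.BucketIterator(train_ds, batch_size=batch_size,
                                 sort_key=lambda x: len(x.text), shuffle=True, device=device)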
def load_my_data(self, word_embedding_pkl, pairs_pkl):
    """
    Loads the data from file
    :param word_embedding_pkl: absolute path to word embeddings (GloVe/Word2Vec)
    :param pairs_pkl: pickle file containing the saved data pairs
    :param context_flag:
        # 0: only include the pairs
        # 1: include pairs and local context
        # 2: include pairs and global context
        # 3: include pairs, local context and global context
    :return:
    """
    tokenizer = lambda text: [x for x in text]
    TEXT = data.Field(sequential=True, tokenize=tokenizer, fix_length=self.config.max_sen_len)
    LABEL = data.Field(sequential=False, use_vocab=False)
    datafields = [("text", TEXT), ("label", LABEL)]

    # Load data from pd.DataFrame into torchtext.data.Dataset
    train_df, test_df, val_df = self.get_my_pandas_df(pairs_pkl, self.config.context_flag)
    train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
    train_data = data.Dataset(train_examples, datafields)
    test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
    test_data = data.Dataset(test_examples, datafields)
    val_examples = [data.Example.fromlist(i, datafields) for i in val_df.values.tolist()]
    val_data = data.Dataset(val_examples, datafields)

    TEXT.build_vocab(train_data, vectors=Vectors(name=word_embedding_pkl))
    self.word_embeddings = TEXT.vocab.vectors
    self.vocab = TEXT.vocab

    self.train_iterator = data.BucketIterator(
        train_data,
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)
    self.val_iterator, self.test_iterator = data.BucketIterator.splits(
        (val_data, test_data),
        batch_size=self.config.batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    print('Loaded %d training examples' % len(train_data))
    print('Loaded %d test examples' % len(test_data))
    print('Loaded %d validation examples' % len(val_data))
fix_length=40)
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)
# note: when saving the csv with pandas, cast the label to int first or this raises an error

train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='drive/My Drive/dataset/CBET/ekman',
    train='train.csv', validation='val.csv', test='test.csv',
    format='csv', fields=[('Text', TEXT), ('Label', LABEL)])

from torchtext.vocab import Vectors

english_fasttext_vectors = Vectors(name='drive/My Drive/wiki-news-300d-1M.vec')
print(english_fasttext_vectors.dim)
print(len(english_fasttext_vectors.itos))

# build the vocabulary
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors)
print(TEXT.vocab.stoi)

batch_size = 64
d_model = 300
hidden_size = 512
output_dim = 5
dropout_rate = 0.1
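# Hedged sketch (the downstream model is not shown in the original): one common way to
# hand the fastText matrix to an nn.Embedding layer whose width matches d_model=300.
import torch.nn as nn
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False,
                                         padding_idx=TEXT.vocab.stoi['<pad>'])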
# pair each csv column with its Field
fields = [('data', TEXT), ('label', LABEL)]
# note skip_header
train, val = TabularDataset.splits(path='data', train='train.csv', validation='val.csv',
                                   format='csv', fields=fields, skip_header=True)
# train, val = TabularDataset().splits(path='./data', train='train.csv', validation='val.csv',
#                                      format='csv', fields=fields, skip_header=True)

# load the word vectors from a local file
vectors = Vectors(name=bc.embedding_loc, cache=bc.cach)
# build the vocabulary
TEXT.build_vocab(train, val, vectors=vectors)
LABEL.build_vocab(train, val, vectors=vectors)
# print(LABEL.vocab.stoi['0'])  # '1': 2, '0': 3

train_iter = BucketIterator(train, batch_size=bc.batch_size,
                            sort_key=lambda x: len(x.data), sort_within_batch=True, shuffle=True)
val_iter = BucketIterator(val, batch_size=bc.batch_size,
                          sort_key=lambda x: len(x.data), sort_within_batch=True, shuffle=True)
vocab_size = TEXT.vocab.vectors.shape[0]  # number of rows in the embedding matrix
def run(self):
    print("Running on", self.a.device)
    self.set_device(self.a.device)

    np.random.seed(self.a.seed)
    torch.manual_seed(self.a.seed)
    torch.backends.cudnn.benchmark = True

    # create training set
    if self.a.test_ee:
        log('loading event extraction corpus from %s' % self.a.test_ee)

    WordsField = Field(lower=True, include_lengths=True, batch_first=True)
    PosTagsField = Field(lower=True, batch_first=True)
    EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
    AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
    LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
    EventsField = EventField(lower=False, batch_first=True)
    EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)

    if self.a.amr:
        colcc = 'amr-colcc'
    else:
        colcc = 'stanford-colcc'
    print(colcc)

    train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=self.a.amr, keep_events=1)
    dev_ee_set = ACE2005Dataset(path=self.a.dev_ee,
                                fields={"words": ("WORDS", WordsField),
                                        "pos-tags": ("POSTAGS", PosTagsField),
                                        "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                        colcc: ("ADJM", AdjMatrixField),
                                        "golden-event-mentions": ("LABEL", LabelField),
                                        "all-events": ("EVENT", EventsField),
                                        "all-entities": ("ENTITIES", EntitiesField)},
                                amr=self.a.amr, keep_events=0)
    test_ee_set = ACE2005Dataset(path=self.a.test_ee,
                                 fields={"words": ("WORDS", WordsField),
                                         "pos-tags": ("POSTAGS", PosTagsField),
                                         "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                         colcc: ("ADJM", AdjMatrixField),
                                         "golden-event-mentions": ("LABEL", LabelField),
                                         "all-events": ("EVENT", EventsField),
                                         "all-entities": ("ENTITIES", EntitiesField)},
                                 amr=self.a.amr, keep_events=0)

    if self.a.webd:
        pretrained_embedding = Vectors(self.a.webd, ".",
                                       unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, vectors=pretrained_embedding)
    else:
        WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS)
    PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS)
    EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS)
    LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
    EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)

    consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
    # print("O label is", consts.O_LABEL)
    consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
    # print("O label for AE is", consts.ROLE_O_LABEL)

    self.a.label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
    self.a.label_weight[consts.O_LABEL] = 1.0
    self.a.arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5
    # add role mask
    self.a.role_mask = event_role_mask(self.a.test_ee, self.a.train_ee, self.a.dev_ee,
                                       LabelField.vocab.stoi, EventsField.vocab.stoi, self.device)

    # print('self.a.hps', self.a.hps)
    if not self.a.hps_path:
        self.a.hps = eval(self.a.hps)
    if "wemb_size" not in self.a.hps:
        self.a.hps["wemb_size"] = len(WordsField.vocab.itos)
    if "pemb_size" not in self.a.hps:
        self.a.hps["pemb_size"] = len(PosTagsField.vocab.itos)
    if "psemb_size" not in self.a.hps:
        self.a.hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest()]) + 2
    if "eemb_size" not in self.a.hps:
        self.a.hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
    if "oc" not in self.a.hps:
        self.a.hps["oc"] = len(LabelField.vocab.itos)
    if "ae_oc" not in self.a.hps:
        self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

    tester = self.get_tester(LabelField.vocab.itos, EventsField.vocab.itos)

    if self.a.finetune:
        log('init model from ' + self.a.finetune)
        model = load_ee_model(self.a.hps, self.a.finetune, WordsField.vocab.vectors, self.device)
        log('model loaded, there are %i sets of params' % len(model.parameters_requires_grads()))
    else:
        model = load_ee_model(self.a.hps, None, WordsField.vocab.vectors, self.device)
        log('model created from scratch, there are %i sets of params' % len(model.parameters_requires_grads()))

    self.a.word_i2s = WordsField.vocab.itos
    self.a.label_i2s = LabelField.vocab.itos
    self.a.role_i2s = EventsField.vocab.itos
    writer = SummaryWriter(os.path.join(self.a.out, "exp"))
    self.a.writer = writer

    # train_iter = BucketIterator(train_ee_set, batch_size=self.a.batch, train=True,
    #                             shuffle=False, device=-1, sort_key=lambda x: len(x.POSTAGS))
    # dev_iter = BucketIterator(dev_ee_set, batch_size=self.a.batch, train=False,
    #                           shuffle=False, device=-1, sort_key=lambda x: len(x.POSTAGS))
    test_iter = BucketIterator(test_ee_set, batch_size=self.a.batch, train=False,
                               shuffle=False, device=-1, sort_key=lambda x: len(x.POSTAGS))

    print("\nStarting testing ...\n")
    # Testing phase
    test_loss, test_ed_p, test_ed_r, test_ed_f1, \
    test_ae_p, test_ae_r, test_ae_f1 = run_over_data(data_iter=test_iter,
                                                     optimizer=None,
                                                     model=model,
                                                     need_backward=False,
                                                     MAX_STEP=ceil(len(test_ee_set) / self.a.batch),
                                                     tester=tester,
                                                     hyps=model.hyperparams,
                                                     device=model.device,
                                                     maxnorm=self.a.maxnorm,
                                                     word_i2s=self.a.word_i2s,
                                                     label_i2s=self.a.label_i2s,
                                                     role_i2s=self.a.role_i2s,
                                                     weight=self.a.label_weight,
                                                     arg_weight=self.a.arg_weight,
                                                     save_output=os.path.join(self.a.out, "test_final.txt"),
                                                     role_mask=self.a.role_mask)
    print("\nFinal test loss: ", test_loss,
          "\ntest ed p: ", test_ed_p, " test ed r: ", test_ed_r, " test ed f1: ", test_ed_f1,
          "\ntest ae p: ", test_ae_p, " test ae r: ", test_ae_r, " test ae f1: ", test_ae_f1)
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=bs, device=-1, repeat=False)

# glove = GloVe(name='6B', dim=300)
TEXT.vocab.load_vectors(vectors=Vectors('glove.6B.300d.txt'))
glove = TEXT.vocab.vectors

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
print("Word embeddings size ", TEXT.vocab.vectors.size())
word2vec = TEXT.vocab.vectors

############################################
# With help from Yunjey Pytorch Tutorial on Github
# class CNN(nn.Module):
#     def __init__(self):
#         super(CNN, self).__init__()
def train(args, writer, is_train=True):
    # Build train dataset
    fields, train_dataset = build_and_cache_dataset(args, mode='train')
    # for i in range(5):
    #     print(train_dataset[i].category, train_dataset[i].news)
    # return

    # Build vocab
    ID, CATEGORY, NEWS = fields
    vectors = Vectors(name=args.embed_path, cache=args.data_dir)
    # NOTE: use train_dataset to build vocab!
    NEWS.build_vocab(
        train_dataset,
        max_size=args.vocab_size,
        vectors=vectors,
        unk_init=torch.nn.init.xavier_normal_,
    )
    CATEGORY.build_vocab(train_dataset)
    # print("the 1000th word: " + NEWS.vocab.itos[1000])
    # print("index of the word '每个': " + str(NEWS.vocab.stoi[r'每个']))
    # print("shape of the embedding matrix: " + str(NEWS.vocab.vectors.shape))
    # word_vec = NEWS.vocab.vectors[NEWS.vocab.stoi['每个']]
    # print("embedding of the word '每个': " + str(word_vec))
    # return

    # model = TextClassifier(
    #     vocab_size=len(NEWS.vocab),
    #     output_dim=args.num_labels,
    #     pad_idx=NEWS.vocab.stoi[NEWS.pad_token],
    #     dropout=args.dropout,
    # )
    # use a bidirectional GRU + attention model
    model = bigru_attention(
        vocab_size=len(NEWS.vocab),
        output_dim=args.num_labels,
        pad_idx=NEWS.vocab.stoi[NEWS.pad_token],
        dropout=args.dropout,
    )
    # Init embeddings for model: from_pretrained is a constructor whose return value
    # was previously discarded, so copy the weights into the existing layer instead.
    model.embedding.weight.data.copy_(NEWS.vocab.vectors)

    bucket_iterator = BucketIterator(
        train_dataset,
        batch_size=args.train_batch_size,
        sort_within_batch=True,
        shuffle=True,
        sort_key=lambda x: len(x.news),
        device=args.device,
    )
    f1_score = 0
    if os.listdir("output_dir"):
        f1_score = float(os.listdir("output_dir")[0].split("_")[1].split(".p")[0])
        model.load_state_dict(torch.load("output_dir/" + os.listdir("output_dir")[0]))
    model.to(args.device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    # scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=args.learning_rate,
    #                                     epochs=args.num_train_epochs,
    #                                     steps_per_epoch=len(bucket_iterator))
    # scheduler = lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1, last_epoch=-1)
    global_step = 0
    model.zero_grad()
    if is_train:
        train_trange = trange(0, args.num_train_epochs, desc="Train epoch")
        for _ in train_trange:
            epoch_iterator = tqdm(bucket_iterator, desc='Training')
            results_f1_score = 0
            for step, batch in enumerate(epoch_iterator):
                model.train()
                news, news_lengths = batch.news  # news.size() [seq_len, batch]
                category = batch.category        # category.size() [batch]
                # preds = model(news, news_lengths)
                preds = model(news)
                loss = criterion(preds, category)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()  # clear gradients so they do not accumulate across steps
                # scheduler.step()
                # Logging
                writer.add_scalar('Train/Loss', loss.item(), global_step)
                # writer.add_scalar('Train/lr', scheduler.get_last_lr()[0], global_step)
                # NOTE: Update model, optimizer should update before scheduler
                global_step += 1
                # NOTE: Evaluate
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    results = evaluate(args, model, CATEGORY.vocab, NEWS.vocab)
                    results_f1_score = results['f1']
                    for key, value in results.items():
                        writer.add_scalar("Eval/{}".format(key), value, global_step)
                # NOTE: save model
                # if args.save_steps > 0 and global_step % args.save_steps == 0:
                #     save_model(args, model, optimizer, scheduler, global_step)
                if results_f1_score > f1_score:
                    try:
                        os.remove("output_dir/model_" + str(f1_score) + ".pt")
                    except OSError:
                        print("no previous checkpoint to remove")
                    torch.save(model.state_dict(),
                               "output_dir/model_" + str(results_f1_score) + ".pt")
                    f1_score = results_f1_score
                    print("So far the best score is: " + str(f1_score))
        writer.close()
    else:
        test(args, model, CATEGORY.vocab, NEWS.vocab)
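# Aside on the embedding fix above (standard PyTorch behaviour, not project-specific):
# nn.Embedding.from_pretrained returns a brand-new layer, so its result must be
# assigned; copying into an existing layer's weight is the in-place alternative.
import torch
import torch.nn as nn
weights = torch.randn(10, 300)  # stand-in for a real matrix such as NEWS.vocab.vectors
emb = nn.Embedding.from_pretrained(weights, freeze=False)  # option 1: build a new layer
emb.weight.data.copy_(weights)                             # option 2: copy in place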
def _read_word_embeddings(self, file):
    from torchtext.vocab import Vectors
    from pathlib import Path
    path = Path(file)
    vectors = Vectors(name=path.name, cache=path.parent)
    return vectors
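# Hedged usage sketch for the helper above; the embedding path is illustrative.
# Splitting the path into name and parent makes torchtext keep its serialized
# cache right next to the source file.
from pathlib import Path
from torchtext.vocab import Vectors
path = Path('/data/embeddings/glove.6B.100d.txt')  # hypothetical location
vecs = Vectors(name=path.name, cache=path.parent)  # the same split the helper performs
print(vecs.dim, len(vecs.itos))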
print(EN.vocab.freqs.most_common(10))
print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"])  # vocab index for <s>, </s>

BATCH_SIZE = 32
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE,
                                                  device=-1, repeat=False,
                                                  sort_key=lambda x: len(x.src))
batch = next(iter(train_iter))
print("Source size", batch.src.size())
print("Target size", batch.trg.size())

# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
if word2vec:
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
    EN.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))  # feel free to alter path
    print("Simple English embeddings size", EN.vocab.vectors.size())
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.de.vec'
    DE.vocab.load_vectors(vectors=Vectors('wiki.de.vec', url=url))  # feel free to alter path
    print("German embeddings size", DE.vocab.vectors.size())

print("REMINDER!!! Did you create ../../models/HW3?????")

unk_token = EN.vocab.stoi["<unk>"]
pad_token = EN.vocab.stoi["<pad>"]
sos_token = EN.vocab.stoi["<s>"]
eos_token = EN.vocab.stoi["</s>"]

'''
TODO
Fix bidirectional S2S
Does ppl change if you average loss the actual Yoon way?
def get_utterance_and_context_loader(max_length=256, batch_size=64):
    ID = torchtext.data.Field(sequential=False, use_vocab=False)
    UTTERANCE = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing,
                                     use_vocab=True, lower=True, include_lengths=True,
                                     batch_first=True, fix_length=max_length,
                                     init_token="<cls>", eos_token="<eos>",
                                     pad_token="<pad>", unk_token="<unk>")
    SPEAKER = torchtext.data.Field(sequential=False, use_vocab=True)
    CONTEXT_ALL = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing,
                                       use_vocab=True, lower=True, include_lengths=True,
                                       batch_first=True, fix_length=max_length,
                                       init_token="<cls>", eos_token="<eos>",
                                       pad_token="<pad>", unk_token="<unk>")
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False,
                                 preprocessing=lambda l: 0 if l == 'TRUE' else 1,
                                 is_target=True)
    # CONTEXT1 through CONTEXT11: eleven per-turn context fields with the same settings
    # as CONTEXT_ALL were defined here but are commented out and unused.

    ds = torchtext.data.TabularDataset(
        path='./MUStARD/sarcasm_data.csv',
        format='csv',
        fields=[("id", ID), ("utterance", UTTERANCE), ("speaker", SPEAKER),
                ("context_all", CONTEXT_ALL), ("label", LABEL)],
        skip_header=True)

    # test the dataset
    # print(f'number of examples: {len(ds)}')
    # print(f'first example: {vars(ds[1])}')

    # split ds into train, val, test at random with ratio 8:1:1
    train_ds, val_ds, test_ds = ds.split(split_ratio=[0.8, 0.1, 0.1],
                                         random_state=random.seed(1234))
    # test the split
    # print(f'train: {len(train_ds)}, validation: {len(val_ds)}, test: {len(test_ds)}')
    # print(f'first example: {vars(train_ds[1])}')

    english_fasttext_vectors = Vectors(name='data/wiki-news-300d-1M.vec')

    # Build a vectorized vocabulary shared by UTTERANCE and CONTEXT_ALL
    # (slightly unconventional: one vocab is built for both fields)
    UTTERANCE.build_vocab(ds.utterance, ds.context_all,
                          vectors=english_fasttext_vectors, min_freq=1)
    CONTEXT_ALL.vocab = UTTERANCE.vocab
    # the standard per-field build_vocab would be:
    # UTTERANCE.build_vocab(ds, vectors=english_fasttext_vectors, min_freq=1)
    # CONTEXT_ALL.build_vocab(ds, vectors=english_fasttext_vectors, min_freq=1)
    SPEAKER.build_vocab(ds)

    # inspect the vocabulary vectors
    # print(UTTERANCE.vocab.vectors.shape)
    # print(UTTERANCE.vocab.vectors)
    # inspect the token-to-index mapping
    # print(CONTEXT_ALL.vocab.stoi)

    # make dataloaders
    train_dl = torchtext.data.Iterator(train_ds, batch_size=24, train=True)
    val_dl = torchtext.data.Iterator(val_ds, batch_size=24, train=False, sort=False)
    test_dl = torchtext.data.Iterator(test_ds, batch_size=24, train=False, sort=False)

    # sanity check with one batch of training data
    batch = next(iter(train_dl))
    print(batch.utterance)
    print(batch.label)
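# Illustrative sanity check (not in the original; it would sit right after the
# CONTEXT_ALL.vocab = UTTERANCE.vocab line inside the function): because the vocab
# object is shared, both fields numericalize tokens identically.
# assert CONTEXT_ALL.vocab.stoi is UTTERANCE.vocab.stoi
# print(UTTERANCE.vocab.stoi['<pad>'] == CONTEXT_ALL.vocab.stoi['<pad>'])  # True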
device=torch_device,  # if running on a GPU, change this to the GPU's device index
                             sort_key=lambda x: len(x.sentence),  # BucketIterator needs the text length for bucketing
                             sort_within_batch=False,
                             repeat=False)
test_iter = data.Iterator(test, batch_size=test_batch_size, device=torch_device,
                          sort=False, sort_within_batch=False, repeat=False)

from torchtext.vocab import Vectors
import os

cache = '../vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = Vectors(name='glove.6B.' + str(vocab_dimension) + 'd.txt', cache=cache)

print("build vocab: start")
TEXT.build_vocab(train, vectors=vectors)
vocab = TEXT.vocab
weight_matrix = vocab.vectors
glove_vocabulary = set(vectors.stoi)
train_data_vocabulary = set(vocab.stoi)
print("GloVe vocab size / in-GloVe train words / out-of-GloVe train words = {}/{}/{}".format(
    len(glove_vocabulary),
    len(train_data_vocabulary & glove_vocabulary),
    len(train_data_vocabulary - glove_vocabulary)))
print("build vocab: end")
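# Hedged variant (unk_init is a standard torchtext Vectors option, not used in the
# original): draw random vectors for out-of-GloVe words instead of the default zeros.
import torch
vectors_random_unk = Vectors(name='glove.6B.' + str(vocab_dimension) + 'd.txt',
                             cache=cache, unk_init=torch.Tensor.normal_)
TEXT.build_vocab(train, vectors=vectors_random_unk)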