Example #1
def tokenizer(text):
    return text.split()


user_stop_words = {'.', ','}
STOP_WORDS.update(user_stop_words)
stop_words = STOP_WORDS

# Pretrain Model
PRE_TRAIN_MODEL_BASE_PATH = '/home/ubuntu/likun/nlp_vectors'
PRE_TRAIN_MODEL_DIR = 'glove'
PRE_TRAIN_MODEL_NAME = 'glove.6B.200d.txt'
USE_PRE_TRAIN_MODEL = True
cache = '.vector_cache'
vector_path = os.path.join(PRE_TRAIN_MODEL_BASE_PATH, PRE_TRAIN_MODEL_DIR,
                           PRE_TRAIN_MODEL_NAME)
vectors = Vectors(name=vector_path,
                  cache=cache) if USE_PRE_TRAIN_MODEL else None

# Build Dataset
TEXT = data.Field(unk_token=UNK_TOKEN,
                  tokenize=tokenizer,
                  lower=False,
                  stop_words=stop_words,
                  batch_first=True)
LABEL = data.LabelField()

train_data = data.TabularDataset(path=os.path.join(DATA_BASE_PATH, DATA_DIR,
                                                   DATA_TRAIN_FILE_NAME),
                                 format='csv',
                                 fields=[('text', TEXT), ('label', LABEL)],
                                 skip_header=True)
test_data = data.TabularDataset(path=os.path.join(DATA_BASE_PATH, DATA_DIR,
Example #2
 def get_vectors(self, path: str):
     logger.info('loading vectors from {}'.format(path))
     vectors = Vectors(path)
     logger.info('finished loading vectors')
     return vectors
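
For reference, a minimal sketch (with a hypothetical GloVe-style file path) of what the torchtext Vectors object returned above exposes:

from torchtext.vocab import Vectors

# Hypothetical path; any whitespace-separated "token v1 ... vn" text file works.
vectors = Vectors('glove.6B.100d.txt', cache='.vector_cache')
print(vectors.dim)            # embedding dimensionality
print(vectors.vectors.shape)  # (num_tokens, dim) float tensor
print(vectors['hello'])       # per-token lookup; unknown tokens go through unk_init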
Example #3
    fields = [('sentiment', LABEL), ('title', None), ('review', TEXT)]

    reviews = TabularDataset(path=args.data_dir + "/" +
                             args.sensitive_filename,
                             format='csv',
                             fields=fields,
                             skip_header=True)

    train_private = reviews

    phrase_count_complete = create_clean_counter(reviews, add_space_split=True)
    train_vocab = torchtext.vocab.Vocab(counter=phrase_count_complete)

    # Attach GloVe embeddings
    embedding_dims = args.embedding_size
    vectors = Vectors(args.vectors_dir + "/" + args.vectors_filename,
                      max_vectors=100_000)
    train_vocab.load_vectors(vectors)

    # Create approximate nearest neighbor index
    num_trees = 50

    ann_index = AnnoyIndex(embedding_dims, 'euclidean')

    ann_filename = join(args.artifact_output_dir, "index.ann")
    for vector_num, vector in enumerate(train_vocab.vectors):
        ann_index.add_item(vector_num, vector)

    print("Building annoy index...")
    assert ann_index.build(num_trees)
    ann_index.save(ann_filename)
    print("Annoy index built")
Example #4
    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also create vocabulary and word embeddings based on the data
        
        Inputs:
            w2v_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''

        NLP = spacy.load('en')
        tokenizer = lambda sent: [
            x.text for x in NLP.tokenizer(sent) if x.text != " "
        ]

        # Creating Field for data
        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          lower=True,
                          fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        # If validation file exists, load it. Otherwise get validation data from training data
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_examples = [
                data.Example.fromlist(i, datafields)
                for i in val_df.values.tolist()
            ]
            val_data = data.Dataset(val_examples, datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)

        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} test examples".format(len(test_data)))
        print("Loaded {} validation examples".format(len(val_data)))
Example #5
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']


print('Building encoder and decoder ...')
# Initialize GNN
n_edge_types = dataset.n_edge_types
n_node = dataset.n_node
state_dim = dataset.state_dim
net = GGNN(state_dim, annotation_dim, n_edge_types, n_node, n_steps)
net.double()
print(net)

# Initialize word embeddings
embedding = nn.Embedding(voc.num_words, hidden_size)
weight_matrix = Vectors(glove_path)
voc.getEmb(weight_matrix)
print(torch.FloatTensor(np.array(voc.index2emb)).size())
embedding.weight.data.copy_(torch.FloatTensor(np.array(voc.index2emb)))
embedding.weight.requires_grad = False
if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
seq2seq = Seq2Seq(encoder, decoder, net, opts)
# Use appropriate device
seq2seq.to(device)
Example #6
def load_dataset(args):
    if args.dataset == '20newsgroup':
        train_classes, val_classes, test_classes, label_dict = _get_20newsgroup_classes(
            args)
    elif args.dataset == 'amazon':
        train_classes, val_classes, test_classes, label_dict = _get_amazon_classes(
            args)
    elif args.dataset == 'fewrel':
        train_classes, val_classes, test_classes, label_dict = _get_fewrel_classes(
            args)
    elif args.dataset == 'huffpost':
        train_classes, val_classes, test_classes, label_dict = _get_huffpost_classes(
            args)
    elif args.dataset == 'reuters':
        train_classes, val_classes, test_classes, label_dict = _get_reuters_classes(
            args)
    elif args.dataset == 'rcv1':
        train_classes, val_classes, test_classes, label_dict = _get_rcv1_classes(
            args)
    else:
        raise ValueError(
            'args.dataset should be one of '
            '[20newsgroup, amazon, fewrel, huffpost, reuters, rcv1]')

    assert (len(train_classes) == args.n_train_class)
    assert (len(val_classes) == args.n_val_class)
    assert (len(test_classes) == args.n_test_class)

    print("train_classes", train_classes)
    print("val_classes", val_classes)
    print("test_classes", test_classes)

    tprint('Loading data')
    all_data = _load_json(args.data_path)
    class_names = []
    class_name_words = []
    for ld in label_dict:
        class_name_dic = {}
        class_name_dic['label'] = label_dict[ld]
        class_name_dic['text'] = ld.lower().split()
        class_names.append(class_name_dic)
        class_name_words.append(class_name_dic['text'])

    tprint('Loading word vectors')

    vectors = Vectors(args.word_vector, cache=args.wv_path)
    vocab = Vocab(collections.Counter(_read_words(all_data, class_name_words)),
                  vectors=vectors,
                  specials=['<pad>', '<unk>'],
                  min_freq=5)

    # print word embedding statistics
    wv_size = vocab.vectors.size()
    tprint('Total num. of words: {}, word vector dimension: {}'.format(
        wv_size[0], wv_size[1]))

    num_oov = wv_size[0] - torch.nonzero(
        torch.sum(torch.abs(vocab.vectors), dim=1)).size()[0]
    tprint(('Num. of out-of-vocabulary words '
            '(they are initialized to zeros): {}').format(num_oov))

    # Split into meta-train, meta-val, meta-test data
    train_data, val_data, test_data = _meta_split(all_data, train_classes,
                                                  val_classes, test_classes)
    tprint('#train {}, #val {}, #test {}'.format(len(train_data),
                                                 len(val_data),
                                                 len(test_data)))

    # Convert everything into np array for fast data loading
    class_names = _data_to_nparray(class_names, vocab, args)
    train_data = _data_to_nparray(train_data, vocab, args)
    val_data = _data_to_nparray(val_data, vocab, args)
    test_data = _data_to_nparray(test_data, vocab, args)

    train_data['is_train'] = True
    val_data['is_train'] = True
    test_data['is_train'] = True
    # this tag is used for distinguishing train/val/test when creating source pool

    temp_num = np.argsort(class_names['label'])
    class_names['label'] = class_names['label'][temp_num]
    class_names['text'] = class_names['text'][temp_num]
    class_names['text_len'] = class_names['text_len'][temp_num]

    return train_data, val_data, test_data, class_names, vocab
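
A tiny self-contained illustration of the out-of-vocabulary count computed above: words without a pretrained vector keep their zero-initialized rows, so subtracting the number of nonzero rows from the vocabulary size gives num_oov.

import torch

toy_vectors = torch.tensor([[0.1, 0.2],   # covered by the pretrained vectors
                            [0.0, 0.0],   # OOV, left at zero
                            [0.3, 0.1],
                            [0.0, 0.0]])  # OOV, left at zero
num_nonzero = torch.nonzero(torch.sum(torch.abs(toy_vectors), dim=1)).size()[0]
print(toy_vectors.size(0) - num_nonzero)  # 2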
Example #7
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)
        torch.backends.cudnn.benchmark = True

        ####################    loading event extraction dataset   ####################
        if self.a.test_ee:
            log('testing event extraction corpus from %s' % self.a.test_ee)

        # both for grounding and ee
        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)
        # only for ee
        LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)
        SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)

        if self.a.amr:
            colcc = 'simple-parsing'
        else:
            colcc = 'combined-parsing'
        print(colcc)

        train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                   fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           colcc: ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   amr=self.a.amr, keep_events=1)

        dev_ee_set = ACE2005Dataset(path=self.a.dev_ee,
                                 fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                         "pos-tags": ("POSTAGS", PosTagsField),
                                         "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                         colcc: ("ADJM", AdjMatrixField),
                                         "golden-event-mentions": ("LABEL", LabelField),
                                         "all-events": ("EVENT", EventsField),
                                         "all-entities": ("ENTITIES", EntitiesField)},
                                 amr=self.a.amr, keep_events=0)

        # test_ee_set = ACE2005Dataset(path=self.a.test_ee,
        #                           fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
        #                                   "pos-tags": ("POSTAGS", PosTagsField),
        #                                   "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
        #                                   colcc: ("ADJM", AdjMatrixField),
        #                                   "golden-event-mentions": ("LABEL", LabelField),
        #                                   "all-events": ("EVENT", EventsField),
        #                                   "all-entities": ("ENTITIES", EntitiesField)},
        #                           amr=self.a.amr, keep_events=0)

        print('self.a.train_ee', self.a.train_ee)
        LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
        print('LabelField.vocab.stoi', LabelField.vocab.stoi)
        EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)
        print('EventsField.vocab.stoi', EventsField.vocab.stoi)
        print('len(EventsField.vocab.itos)', len(EventsField.vocab.itos))
        print('len(EventsField.vocab.stoi)', len(EventsField.vocab.stoi))

        ####################    loading SR dataset   ####################
        # both for grounding and sr
        if self.a.train_sr:
            log('loading corpus from %s' % self.a.train_sr)

        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])

        vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
        vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
        vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

        # only need get_role_mask() and sr_mapping()
        train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                     EventsField.vocab.stoi, LabelField.vocab.stoi,
                                     self.a.imsitu_ontology_file,
                                     self.a.train_sr, self.a.verb_mapping_file,
                                     None, None,
                                     0,
                                     transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                     load_object=False, filter_place=self.a.filter_place)


        ####################    loading grounding dataset   ####################
        if self.a.train_grounding:
            log('loading grounding corpus from %s' % self.a.train_grounding)

        # only for grounding
        IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True)

        train_grounding_set = GroundingDataset(path=self.a.train_grounding,
                                               img_dir=None,
                                               fields={"id": ("IMAGEID", IMAGEIDField),
                                                       "sentence_id": ("SENTID", SENTIDField),
                                                       "words": ("WORDS", WordsField),
                                                       "pos-tags": ("POSTAGS", PosTagsField),
                                                       "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                       colcc: ("ADJM", AdjMatrixField),
                                                       "all-entities": ("ENTITIES", EntitiesField),
                                                       # "image": ("IMAGE", IMAGEField),
                                                       },
                                               transform=transform,
                                               amr=self.a.amr)

        dev_grounding_set = GroundingDataset(path=self.a.dev_grounding,
                                             img_dir=None,
                                             fields={"id": ("IMAGEID", IMAGEIDField),
                                                     "sentence_id": ("SENTID", SENTIDField),
                                                     "words": ("WORDS", WordsField),
                                                     "pos-tags": ("POSTAGS", PosTagsField),
                                                     "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                     colcc: ("ADJM", AdjMatrixField),
                                                     "all-entities": ("ENTITIES", EntitiesField),
                                                     # "image": ("IMAGE", IMAGEField),
                                                     },
                                             transform=transform,
                                             amr=self.a.amr)

        # test_grounding_set = GroundingDataset(path=self.a.test_grounding,
        #                                       img_dir=None,
        #                                       fields={"id": ("IMAGEID", IMAGEIDField),
        #                                               "sentence_id": ("SENTID", SENTIDField),
        #                                               "words": ("WORDS", WordsField),
        #                                               "pos-tags": ("POSTAGS", PosTagsField),
        #                                               "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
        #                                               colcc: ("ADJM", AdjMatrixField),
        #                                               "all-entities": ("ENTITIES", EntitiesField),
        #                                               # "image": ("IMAGE", IMAGEField),
        #                                               },
        #                                       transform=transform,
        #                                       amr=self.a.amr)

        ####################    build vocabulary   ####################

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS, vectors=pretrained_embedding)
        else:
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS)
        PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS, train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS)
        EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS,  train_grounding_set.ENTITYLABELS, dev_grounding_set.ENTITYLABELS)

        consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
        # print("O label is", consts.O_LABEL)
        consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
        # print("O label for AE is", consts.ROLE_O_LABEL)

        # dev_ee_set1 = ACE2005Dataset(path=self.a.dev_ee,
        #                           fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
        #                                   "pos-tags": ("POSTAGS", PosTagsField),
        #                                   "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
        #                                   colcc: ("ADJM", AdjMatrixField),
        #                                   "golden-event-mentions": ("LABEL", LabelField),
        #                                   "all-events": ("EVENT", EventsField),
        #                                   "all-entities": ("ENTITIES", EntitiesField)},
        #                           amr=self.a.amr, keep_events=1, only_keep=True)
        #
        # test_ee_set1 = ACE2005Dataset(path=self.a.test_ee,
        #                            fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
        #                                    "pos-tags": ("POSTAGS", PosTagsField),
        #                                    "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
        #                                    colcc: ("ADJM", AdjMatrixField),
        #                                    "golden-event-mentions": ("LABEL", LabelField),
        #                                    "all-events": ("EVENT", EventsField),
        #                                    "all-entities": ("ENTITIES", EntitiesField)},
        #                            amr=self.a.amr, keep_events=1, only_keep=True)
        # print("train set length", len(train_ee_set))
        #
        # print("dev set length", len(dev_ee_set))
        # print("dev set 1/1 length", len(dev_ee_set1))
        #
        # print("test set length", len(test_ee_set))
        # print("test set 1/1 length", len(test_ee_set1))

        # sr model initialization
        if not self.a.sr_hps_path:
            self.a.sr_hps = eval(self.a.sr_hps)
        embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
        embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
        embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)
        if "wvemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word)
        if "wremb_size" not in self.a.sr_hps:
            self.a.sr_hps["wremb_size"] = len(vocab_role.id2word)
        if "wnemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word)
        # if "ae_oc" not in self.a.sr_hps:
        #     self.a.sr_hps["ae_oc"] = len(vocab_role.id2word)

        # self.a.ee_label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
        # self.a.ee_label_weight[consts.O_LABEL] = 1.0
        # self.a.ee_arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5
        # if not self.a.ee_hps_path:
        #     self.a.ee_hps = eval(self.a.ee_hps)
        # if "wemb_size" not in self.a.ee_hps:
        #     self.a.ee_hps["wemb_size"] = len(WordsField.vocab.itos)
        # if "pemb_size" not in self.a.ee_hps:
        #     self.a.ee_hps["pemb_size"] = len(PosTagsField.vocab.itos)
        # if "psemb_size" not in self.a.ee_hps:
        #     # self.a.ee_hps["psemb_size"] = max([train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
        #     self.a.ee_hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest(), train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
        # if "eemb_size" not in self.a.ee_hps:
        #     self.a.ee_hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
        # if "oc" not in self.a.ee_hps:
        #     self.a.ee_hps["oc"] = len(LabelField.vocab.itos)
        # if "ae_oc" not in self.a.ee_hps:
        #     self.a.ee_hps["ae_oc"] = len(EventsField.vocab.itos)
        if "oc" not in self.a.sr_hps:
            self.a.sr_hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.sr_hps:
            self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos)



        ace_classifier = ACEClassifier(self.a.sr_hps["wemb_dim"], self.a.sr_hps["oc"], self.a.sr_hps["ae_oc"],
                                       self.device)

        ee_model = None
        # # if self.a.score_ee:
        # if  self.a.finetune_ee:
        #     log('init ee model from ' + self.a.finetune_ee)
        #     ee_model = load_ee_model(self.a.ee_hps, self.a.finetune_ee, WordsField.vocab.vectors, self.device, ace_classifier)
        #     log('ee model loaded, there are %i sets of params' % len(ee_model.parameters_requires_grads()))
        # else:
        #     ee_model = load_ee_model(self.a.ee_hps, None, WordsField.vocab.vectors, self.device, ace_classifier)
        #     log('ee model created from scratch, there are %i sets of params' % len(ee_model.parameters_requires_grads()))

        # if self.a.score_sr:
        if self.a.finetune_sr:
            log('init sr model from ' + self.a.finetune_sr)
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, self.a.finetune_sr, self.device, ace_classifier, add_object=self.a.add_object)
            log('sr model loaded, there are %i sets of params' % len(sr_model.parameters_requires_grads()))
        else:
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, None, self.device, ace_classifier, add_object=self.a.add_object)
            log('sr model created from scratch, there are %i sets of params' % len(sr_model.parameters_requires_grads()))

        model = GroundingModel(ee_model, sr_model, self.get_device())
        # ee_model = torch.nn.DataParallel(ee_model)
        # sr_model = torch.nn.DataParallel(sr_model)
        # model = torch.nn.DataParallel(model)

        # if self.a.optimizer == "adadelta":
        #     optimizer_constructor = partial(torch.optim.Adadelta, params=model.parameters_requires_grads(),
        #                                     weight_decay=self.a.l2decay)
        # elif self.a.optimizer == "adam":
        #     optimizer_constructor = partial(torch.optim.Adam, params=model.parameters_requires_grads(),
        #                                     weight_decay=self.a.l2decay)
        # else:
        #     optimizer_constructor = partial(torch.optim.SGD, params=model.parameters_requires_grads(),
        #                                     weight_decay=self.a.l2decay,
        #                                     momentum=0.9)

        # log('optimizer in use: %s' % str(self.a.optimizer))

        if not os.path.exists(self.a.out):
            os.mkdir(self.a.out)
        # with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
        #     pickle.dump(WordsField.vocab, f)
        # with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
        #     pickle.dump(PosTagsField.vocab.stoi, f)
        # with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
        #     pickle.dump(EntityLabelsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
            pickle.dump(LabelField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
            pickle.dump(EventsField.vocab.stoi, f)

        log('init complete\n')

        # # ee mappings
        # self.a.ee_word_i2s = WordsField.vocab.itos
        # self.a.ee_label_i2s = LabelField.vocab.itos
        # self.a.ee_role_i2s = EventsField.vocab.itos
        # self.a.ee_role_mask = None
        # if self.a.apply_ee_role_mask:
        #     self.a.ee_role_mask = event_role_mask(self.a.train_ee, self.a.dev_ee, LabelField.vocab.stoi, EventsField.vocab.stoi, self.device)
        # sr mappings
        self.a.sr_word_i2s = vocab_noun.id2word
        self.a.sr_label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
        self.a.sr_role_i2s = vocab_role.id2word
        self.a.role_masks = train_sr_set.get_role_mask().to_dense().to(self.device)
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        # loading testing data
        # voa_text = self.a.test_voa_text
        voa_image_dir = self.a.test_voa_image
        gt_voa_image = self.a.gt_voa_image
        gt_voa_text = self.a.gt_voa_text
        gt_voa_align = self.a.gt_voa_align

        sr_verb_mapping, sr_role_mapping = train_sr_set.get_sr_mapping()

        test_m2e2_set = M2E2Dataset(path=gt_voa_text,
                                    img_dir=voa_image_dir,
                                    fields={"image": ("IMAGEID", IMAGEIDField),
                                          "sentence_id": ("SENTID", SENTIDField),
                                          "words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "all-entities": ("ENTITIES", EntitiesField),
                                          # "image": ("IMAGE", IMAGEField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          },
                                    transform=transform,
                                    amr=self.a.amr,
                                    load_object=self.a.add_object,
                                    object_ontology_file=self.a.object_class_map_file,
                                    object_detection_pkl_file=self.a.object_detection_pkl_file,
                                    object_detection_threshold=self.a.object_detection_threshold,
                                    keep_events=self.a.keep_events,
                                    )

        object_results, object_label, object_detection_threshold = test_m2e2_set.get_object_results()

        # build batch on cpu
        test_m2e2_iter = BucketIterator(test_m2e2_set, batch_size=1, train=False,
                                   shuffle=False, device=-1,
                                   sort_key=lambda x: len(x.POSTAGS))

        # scores = 0.0
        # now_bad = 0
        # restart_used = 0
        print("\nStarting testing...\n")
        # lr = parser.lr
        # optimizer = optimizer_constructor(lr=lr)

        # ee_tester = EDTester(LabelField.vocab.itos, EventsField.vocab.itos, self.a.ignore_time_test)
        # sr_tester = SRTester()
        # g_tester = GroundingTester()
        j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test)
        # if self.a.visual_voa_ee_path is not None:
        #     ee_visualizer = EDVisualizer(self.a.gt_voa_text)
        # else:
        #     ee_visualizer = None
        image_gt = json.load(open(gt_voa_image))

        # all_y = []
        # all_y_ = []
        # all_events = []
        # all_events_ = []

        vision_result = dict()
        # if self.a.visual_voa_g_path is not None and not os.path.exists(self.a.visual_voa_g_path):
        #     os.makedirs(self.a.visual_voa_g_path, exist_ok=True)
        # if self.a.visual_voa_ee_path is not None and not os.path.exists(self.a.visual_voa_ee_path):
        #     os.makedirs(self.a.visual_voa_ee_path, exist_ok=True)
        if self.a.visual_voa_sr_path is not None and not os.path.exists(self.a.visual_voa_sr_path):
            os.makedirs(self.a.visual_voa_sr_path, exist_ok=True)
        # grounding_writer = open(self.a.visual_voa_g_path, 'w')
        doc_done = set()
        with torch.no_grad():
            model.eval()
            for batch in test_m2e2_iter:
                vision_result = joint_test_batch(
                    model_g=model,
                    batch_g=batch,
                    device=self.device,
                    transform=transform,
                    img_dir=voa_image_dir,
                    # ee_hyps=self.a.ee_hps,
                    # ee_word_i2s=self.a.ee_word_i2s,
                    # ee_label_i2s=self.a.ee_label_i2s,
                    # ee_role_i2s=self.a.ee_role_i2s,
                    # ee_tester=ee_tester,
                    # ee_visualizer=ee_visualizer,
                    sr_noun_i2s=self.a.sr_word_i2s,
                    sr_verb_i2s=self.a.sr_label_i2s,
                    sr_role_i2s=self.a.sr_role_i2s,
                    # sr_tester=sr_tester,
                    role_masks=self.a.role_masks,
                    # ee_role_mask=self.a.ee_role_mask,
                    # j_tester=j_tester,
                    image_gt=image_gt,
                    verb2type=sr_verb_mapping,
                    role2role=sr_role_mapping,
                    vision_result=vision_result,
                    # all_y=all_y,
                    # all_y_=all_y_,
                    # all_events=all_events,
                    # all_events_=all_events_,
                    # visual_g_path=self.a.visual_voa_g_path,
                    # visual_ee_path=self.a.visual_voa_ee_path,
                    load_object=self.a.add_object,
                    object_results=object_results,
                    object_label=object_label,
                    object_detection_threshold=object_detection_threshold,
                    vocab_objlabel=vocab_noun.word2id,
                    # apply_ee_role_mask=self.a.apply_ee_role_mask
                    keep_events_sr=self.a.keep_events_sr,
                    doc_done=doc_done,
                )

        print('vision_result size', len(vision_result))
        # pickle.dump(vision_result, open(os.path.join(self.a.out, 'vision_result.pkl'), 'w'))

        # ep, er, ef = ee_tester.calculate_report(all_y, all_y_, transform=True)
        # ap, ar, af = ee_tester.calculate_sets(all_events, all_events_)
        # if self.a.visual_voa_ee_path is not None:
        #     ee_visualizer.rewrite_brat(self.a.visual_voa_ee_path, self.a.visual_voa_ee_gt_ann)
        #
        # print('text ep, er, ef', ep, er, ef)
        # print('text ap, ar, af', ap, ar, af)

        evt_p, evt_r, evt_f1, role_scores = j_tester.calculate_report(
            vision_result, voa_image_dir, self.a.visual_voa_sr_path, self.a.add_object,
            keep_events_sr=self.a.keep_events_sr
        )#consts.O_LABEL, consts.ROLE_O_LABEL)

        print('image event ep, er, ef \n', evt_p, '\n', evt_r, '\n', evt_f1)
        # if not self.a.add_object:
        #     print('image att_iou ap, ar, af', role_scores['role_att_iou_p'], role_scores['role_att_iou_r'],
        #           role_scores['role_att_iou_f1'])
        #     print('image att_hit ap, ar, af', role_scores['role_att_hit_p'], role_scores['role_att_hit_r'],
        #           role_scores['role_att_hit_f1'])
        #     print('image att_cor ap, ar, af', role_scores['role_att_cor_p'], role_scores['role_att_cor_r'],
        #           role_scores['role_att_cor_f1'])
        # else:
        #     print('image obj_iou ap, ar, af', role_scores['role_obj_iou_p'], role_scores['role_obj_iou_r'],
        #           role_scores['role_obj_iou_f1'])
        #     print('image obj_iou_union ap, ar, af', role_scores['role_obj_iou_union_p'], role_scores['role_obj_iou_union_r'],
        #           role_scores['role_obj_iou_union_f1'])
        for key in role_scores:
            print(key)
            for key_ in role_scores[key]:
                print(key_, role_scores[key][key_])
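
As a side note on the unk_init passed to Vectors in the vocabulary-building step above, a hedged sketch (hypothetical file path) of its effect: tokens missing from the embedding file are initialized uniformly in [-0.15, 0.15] instead of zeros.

from functools import partial
import torch
from torchtext.vocab import Vectors

emb = Vectors('glove.6B.300d.txt', '.',
              unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
print(emb['token-definitely-not-in-glove'])  # random values in [-0.15, 0.15], not zeros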
Example #8
LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# Note: when saving the csv with pandas, cast the label column to int first, otherwise an error occurs
train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='drive/My Drive/', train='4/train.csv', validation='4/validation.csv',
    test='4/test.csv', format='csv', fields=[('text', TEXT), ('Label', LABEL)]
)

print(len(train_ds))
print(len(val_ds))
print(len(test_ds))

from torchtext.vocab import Vectors

japanese_word2vec_vectors = Vectors(
    name='drive/My Drive/tweets133_.vec')

print(japanese_word2vec_vectors.dim)
print(len(japanese_word2vec_vectors.itos))

# Build the vocabulary
TEXT.build_vocab(train_ds, vectors=japanese_word2vec_vectors)
print(TEXT.vocab.vectors.shape)
print(TEXT.vocab.stoi)

train_dl = torchtext.data.Iterator(train_ds, batch_size=64, train=True)
val_dl = torchtext.data.Iterator(val_ds, batch_size=64, train=False, sort=False)
test_dl = torchtext.data.Iterator(test_ds, batch_size=64, train=False, sort=False)

batch = next(iter(val_dl))
print(batch.text)
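
A short, hedged follow-up (standard PyTorch API, not part of the original snippet) showing one common way to hand the attached vectors to a model; this assumes batch.text is the plain id tensor printed above:

import torch.nn as nn

# Initialize an embedding layer from the word2vec weights attached to TEXT.vocab.
embedding = nn.Embedding.from_pretrained(TEXT.vocab.vectors, freeze=False)
embedded = embedding(batch.text)  # token ids -> dense vectors
print(embedded.shape)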
Example #9
    def __init__(self,
                 batch_size=128,
                 fix_length=32,
                 singer=None,
                 target_vocab_size=5000,
                 vector_path=VEC_PATH,
                 device=None):
        """
        用于生成歌词生成任务的数据预处理和Batch生成
        每次输入网络的数据包括:
            encoder_input:  编码器输入, shape: (batch_size, time_step, word_id)
            encoder_length: 编码器输入文本有效长度, shape: (batch_size, )
            decoder_input:  解码器输入, shape: (batch_size, time_step, word_id)
            decoder_length: 解码器输入文本有效长度, shape: (batch_size, )
            target: 解码器输出目标, 用于计算Loss, shape: (batch_size, time_step, word_id)
        :param batch_size: 每个batch的大小. 默认: 128
        :param fix_length: 每个序列的最大长度, 长度不足的句子会用"<pad>"补齐, 超过的句子会被截断. 默认: 32
        :param singer: 为None时读取所有歌曲; 否则只读取对应歌手的歌曲. 默认: None
        :param target_vocab_size: 目标词典(解码器输出)的长度, 在输出端(目标)只保留词频最高的前 target_vocab_size 个词语,
                            其它词语都会被"<unk>"替换. 默认: 5000
        :param vector_path: word2vec模型的路径. PS: 必须是.txt格式的文件
        :param device: 设备, "cuda"或"cpu". 默认: None, 自动选择"cuda"或"cpu"
        """
        self.batch_size = batch_size
        self.fix_length = fix_length
        self.singer = singer
        self.target_vocab_size = target_vocab_size
        self.vector_path = vector_path
        self.DEVICE = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.tokenize = lambda x: jieba.lcut(x, HMM=False)  # word segmentation with jieba

        # Define the three torchtext Field objects used for text preprocessing. ENCODER does not actually
        # need the start token "<go>" or end token "<eos>", but to keep encoding/decoding consistent across
        # the three Fields they are declared here and removed again after the vocabulary mapping is built;
        # see self._build_vocab for details.
        self.ENCODER = Field(
            sequential=True,
            tokenize=self.tokenize,
            batch_first=True,  # the first dimension of the data is the batch (default is time_step)
            fix_length=self.fix_length,  # fix the sentence length: shorter sentences are padded with "<pad>", longer ones are truncated
            include_lengths=True,  # also return the length of each text alongside the encoded text
            init_token="<go>",  # "<go>" is automatically prepended to each text
            eos_token="<eos>")  # "<eos>" is automatically appended to each text
        self.DECODER = Field(sequential=True,
                             tokenize=self.tokenize,
                             batch_first=True,
                             fix_length=self.fix_length,
                             include_lengths=True,
                             init_token="<go>",
                             eos_token="<eos>")
        self.TARGET = Field(
            sequential=True,
            tokenize=self.tokenize,
            batch_first=True,
            fix_length=self.fix_length,
            eos_token="<eos>")  # 由于`target`是`decoder`左移一位的结果, 所以不需要句首符"<go>"

        # Data processing
        self._proprecess()  # read the corpus and convert it to a .json format that torchtext can load
        self.dataset = self._build_dataset()  # load the processed data and build a torchtext Dataset
        self.vectors = Vectors(name=self.vector_path,
                               cache=FILE_PATH + "/temp")  # load the word2vec vectors
        self._build_vocab()  # build the vocabulary mapping
        self._build_vector()  # attach the word vectors
        self.stoi = self.ENCODER.vocab.stoi  # word-to-id mapping
        self.itos = self.ENCODER.vocab.itos  # id-to-word mapping
        self.vocab_size = len(self.ENCODER.vocab)  # vocabulary size
        self.vector_dim = self.vectors.dim  # word-vector dimensionality
        self.vector_weights = self.ENCODER.vocab.vectors  # word-vector weights
        self.target_vocab_size = len(self.TARGET.vocab)  # reassigned: with "<eos>" and other special tokens the actual vocabulary is larger than the original target_vocab_size

        # Iterator used to generate batches during training
        self.data_iter = BucketIterator(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=True,  # shuffle the original data order
            device=self.DEVICE)
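
To make the decoder_input/target relationship described in the docstring concrete, a tiny hypothetical illustration:

# target is decoder_input shifted left by one step, so the model predicts the next word at every position.
decoder_input = ["<go>", "w1", "w2", "w3"]
target = decoder_input[1:] + ["<eos>"]
print(target)  # ['w1', 'w2', 'w3', '<eos>']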
Example #10
def load_data(opt):
    # fix_length is taken from opt.max_text_len
    TEXT = data.Field(sequential=True, fix_length=opt.max_text_len)  # words or characters
    LABEL = data.Field(sequential=False, use_vocab=False)

    # load
    # word/ or article/
    train_path = opt.data_path + opt.text_type + '/train_set.csv'
    val_path = opt.data_path + opt.text_type + '/val_set.csv'
    test_path = opt.data_path + opt.text_type + '/test_set.csv'
    train_path = 'D:/git/dataset/val_set.csv'
    test_path = 'D:/git/dataset/val_set.csv'
    val_path = 'D:/git/dataset/val_set.csv'

    # aug for data augmentation
    if opt.aug:
        print('make augmentation datasets!')
    train = GrandDataset(train_path,
                         text_field=TEXT,
                         label_field=LABEL,
                         text_type=opt.text_type,
                         test=False,
                         aug=opt.aug)
    val = GrandDataset(val_path,
                       text_field=TEXT,
                       label_field=LABEL,
                       text_type=opt.text_type,
                       test=False)
    test = GrandDataset(test_path,
                        text_field=TEXT,
                        label_field=None,
                        text_type=opt.text_type,
                        test=True)

    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    embedding_path = '{}/{}_{}.txt'.format(opt.embedding_path, opt.text_type,
                                           opt.embedding_dim)
    vectors = Vectors(name=embedding_path, cache=cache)
    print('loading word2vec vectors from {}'.format(embedding_path))
    vectors.unk_init = init.xavier_uniform_  # initialization for tokens without a pretrained vector

    # Build the vocabulary
    print('building {} vocabulary......'.format(opt.text_type))
    TEXT.build_vocab(train, val, test, min_freq=5, vectors=vectors)
    # LABEL.build_vocab(train)

    # Build the iterators
    # For test_iter, shuffle, sort and repeat must all be set to False, otherwise torchtext will scramble the sample order
    # For variable-length inputs, set sort_within_batch=True so the data within each batch is sorted in descending order by sort_key
    train_iter = data.BucketIterator(dataset=train,
                                     batch_size=opt.batch_size,
                                     shuffle=True,
                                     sort_within_batch=False,
                                     repeat=False,
                                     device=opt.device)
    # val_iter = data.BucketIterator(dataset=val, batch_size=opt.batch_size, sort_within_batch=False, repeat=False,
    #                                device=opt.device)
    # train_iter = data.Iterator(dataset=train, batch_size=opt.batch_size, train=True, repeat=False, device=opt.device)
    val_iter = data.Iterator(dataset=val,
                             batch_size=opt.batch_size,
                             shuffle=False,
                             sort=False,
                             repeat=False,
                             device=opt.device)
    test_iter = data.Iterator(dataset=test,
                              batch_size=opt.batch_size,
                              shuffle=False,
                              sort=False,
                              repeat=False,
                              device=opt.device)

    return train_iter, val_iter, test_iter, len(TEXT.vocab), TEXT.vocab.vectors
Example #11
def load_data(batch_size, device):
    # label field
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    # text fields
    SEN1 = data.Field(sequential=True, tokenize=tokenizer,  fix_length=50, lower=True, batch_first=True)
    SEN2 = data.Field(sequential=True, tokenize=tokenizer,  fix_length=50, lower=True, batch_first=True)

    # Build the datasets
    train, valid = data.TabularDataset.splits(
        path='./snli_1.0/',
        skip_header=True,
        train="train4.csv",
        validation="dev3.csv",
        format='csv',
        fields=[("label", LABEL), ("sentence1", SEN1), ("sentence2", SEN2)],
    )

    test = data.TabularDataset(
        path='./snli_1.0/test3.csv',
        skip_header=True,
        format='csv',
        fields=[("sentence1", SEN1), ("sentence2", SEN2)],
    )

    # Build the vocabulary
    SEN1.build_vocab(train.sentence1, train.sentence2, vectors=Vectors(name='/data/yinli/dataset/glove.840B.300d.txt'))
    SEN2.vocab = SEN1.vocab

    # Build the iterators
    train_iter = data.BucketIterator(train,
                                sort_key=lambda x: len(x.sentence1),
                                sort_within_batch=False,
                                shuffle=True,
                                batch_size=batch_size,
                                repeat=False,
                                device=device)

    valid_iter = data.Iterator(valid,
                              sort=False,
                              shuffle=False,
                              sort_within_batch=False,
                              batch_size=batch_size,
                              repeat=False,
                              train=False,
                              device=device)

    test_iter = data.Iterator(test,
                               sort=False,
                               shuffle=False,
                               sort_within_batch=False,
                               batch_size=batch_size,
                               repeat=False,
                               train=False,
                               device=device)

    return train_iter, valid_iter, test_iter, SEN1.vocab, SEN2.vocab

# Load the dataset and build iterators
# def load_data(batch_size, device):
#     # label field
#     LABEL = data.Field(sequential=True, batch_first=True)
#     # text fields
#     SEN1 = data.Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True)
#     SEN2 = data.Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True)
#
#     # Build the dataset
#     train = data.TabularDataset(
#         path='./snli_1.0/train2.csv',
#         skip_header=True,
#         format='csv',
#         fields=[("label", LABEL), ("sentence1", SEN1), ("sentence2", SEN2)],
#     )
#
#     # Build the vocabulary
#     SEN1.build_vocab(train, vectors=Vectors(name='/data/yinli/dataset/glove.840B.300d.txt'))
#     SEN2.build_vocab(train, vectors=Vectors(name='/data/yinli/dataset/glove.840B.300d.txt'))
#     LABEL.build_vocab(train)
#
#     # Build the iterator
#     train_iter = data.BucketIterator(train,
#                                 sort_key=lambda x: len(x.SEN1),
#                                 sort_within_batch=False,
#                                 shuffle=True,
#                                 batch_size=batch_size,
#                                 repeat=False,
#                                 device=device)
#
#     return train_iter, SEN1.vocab, SEN2.vocab


# device = torch.device("cuda:1")
# train_iter, dev_iter, test_iter, sentence1_vocab, sentence2_vocab = load_data(5, 50, device)
#
# for batch in train_iter:
#     print(batch.label)
#     print(batch.sentence1)
#     print(batch.sentence2)
#     break
# print(len(sentence1_vocab.vectors))
#
# print(sentence1_vocab.stoi['frown'])
# print(sentence2_vocab.stoi['frown'])
# print(sentence1_vocab.stoi['<unk>'])
#
# del train_iter
# del dev_iter
# del test_iter
# del sentence1_vocab
# del sentence2_vocab

#
# embedding = torch.cat((sentence2_vocab.vectors ,sentence1_vocab.vectors[2:]), 0)
# print(embedding.size())
# vocab_size, embed_size = embedding.size()
# print(vocab_size)
# print(embed_size)
# print(len(label_vocab))
# print(label_vocab.stoi)
#label2id = {'<unk>': 0, '<pad>': 1, 'neutral': 2, 'contradiction': 3, 'entailment': 4}
Example #12
    def __init__(self,
                 qid_path,
                 train_path,
                 test_path,
                 word_path,
                 char_path,
                 num_folds=10,
                 batch_size=32,
                 seed=2018):
        question_df = pd.read_csv(qid_path)
        question_df = question_df.set_index('qid')

        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)

        self.num_folds = num_folds
        self.batch_size = batch_size
        self.seed = seed
        self.device = torch.device(
            'cuda:0' if torch.cuda.is_available() else 'cpu')

        train_df['q1_wid'] = train_df['qid1'].apply(
            lambda qid: question_df.loc[qid]['wid'])
        train_df['q2_wid'] = train_df['qid2'].apply(
            lambda qid: question_df.loc[qid]['wid'])
        train_df['q1_cid'] = train_df['qid1'].apply(
            lambda qid: question_df.loc[qid]['cid'])
        train_df['q2_cid'] = train_df['qid2'].apply(
            lambda qid: question_df.loc[qid]['cid'])
        self.train_df = train_df[[
            'q1_wid', 'q2_wid', 'q1_cid', 'q2_cid', 'label'
        ]]

        test_df['q1_wid'] = test_df['qid1'].apply(
            lambda qid: question_df.loc[qid]['wid'])
        test_df['q2_wid'] = test_df['qid2'].apply(
            lambda qid: question_df.loc[qid]['wid'])
        test_df['q1_cid'] = test_df['qid1'].apply(
            lambda qid: question_df.loc[qid]['cid'])
        test_df['q2_cid'] = test_df['qid2'].apply(
            lambda qid: question_df.loc[qid]['cid'])
        self.test_df = test_df[['q1_wid', 'q2_wid', 'q1_cid', 'q2_cid']]

        self.word_embedding_path = word_path
        self.char_embedding_path = char_path

        cache = '../cache'
        if not os.path.exists(cache):
            os.mkdir(cache)

        self.word_vectors = Vectors(self.word_embedding_path, cache)
        self.char_vectors = Vectors(self.char_embedding_path, cache)
        self.word_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
        self.char_vectors.unk_init = lambda x: init.uniform_(x, -0.05, 0.05)
        self.wordTEXT = data.Field(batch_first=True)
        self.charTEXT = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False,
                                use_vocab=False,
                                dtype=torch.float)

        train_dataset = self.generate_dataset()
        test_dataset = self.generate_dataset(role='test')
        self.wordTEXT.build_vocab(train_dataset,
                                  test_dataset,
                                  min_freq=1,
                                  vectors=self.word_vectors)
        self.charTEXT.build_vocab(train_dataset,
                                  test_dataset,
                                  min_freq=1,
                                  vectors=self.char_vectors)
        self.word_embedding = self.wordTEXT.vocab.vectors
        self.char_embedding = self.charTEXT.vocab.vectors
Example #13
import collections

import gensim
from torchtext.vocab import Vectors, Vocab

model = gensim.models.KeyedVectors.load_word2vec_format('input/vector.bin',
                                                        binary=True)
print(model['中国'])

# word2vec stored in human-readable (text) format
vectors = Vectors(word_vector, cache=wv_path)
vocab = Vocab(collections.Counter(words),
              vectors=vectors,
              specials=['<pad>', '<unk>'],
              min_freq=1)
wv_size = vocab.vectors.size()
vocab.stoi['<unk>']
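
Relatedly, a hedged sketch (standard gensim API, illustrative output path) of converting the binary model loaded above into the plain-text format that Vectors reads; torchtext typically skips the word2vec header line with a warning:

# Re-export the binary KeyedVectors as whitespace-separated text, then load it with torchtext.
model.save_word2vec_format('input/vector.txt', binary=False)
vectors = Vectors('input/vector.txt', cache='.vector_cache')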
Example #14
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)

        # create training set
        if self.a.train:
            log('loading corpus from %s' % self.a.train)

        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        LabelField = Field(lower=False, batch_first=True, pad_token=None, unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)
        EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)

        train_set = ACE2005Dataset(path=self.a.train,
                                   fields={"words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           "stanford-colcc": ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   keep_events=1)

        dev_set = ACE2005Dataset(path=self.a.dev,
                                 fields={"words": ("WORDS", WordsField),
                                         "pos-tags": ("POSTAGS", PosTagsField),
                                         "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                         "stanford-colcc": ("ADJM", AdjMatrixField),
                                         "golden-event-mentions": ("LABEL", LabelField),
                                         "all-events": ("EVENT", EventsField),
                                         "all-entities": ("ENTITIES", EntitiesField)},
                                 keep_events=0)

        test_set = ACE2005Dataset(path=self.a.test,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          "stanford-colcc": ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  keep_events=0)

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            WordsField.build_vocab(train_set.WORDS, dev_set.WORDS, vectors=pretrained_embedding)
        else:
            WordsField.build_vocab(train_set.WORDS, dev_set.WORDS)
        PosTagsField.build_vocab(train_set.POSTAGS, dev_set.POSTAGS)
        EntityLabelsField.build_vocab(train_set.ENTITYLABELS, dev_set.ENTITYLABELS)
        LabelField.build_vocab(train_set.LABEL, dev_set.LABEL)
        EventsField.build_vocab(train_set.EVENT, dev_set.EVENT)

        consts.O_LABEL = LabelField.vocab.stoi["O"]
        # print("O label is", consts.O_LABEL)
        consts.ROLE_O_LABEL = EventsField.vocab.stoi["OTHER"]
        # print("O label for AE is", consts.ROLE_O_LABEL)

        dev_set1 = ACE2005Dataset(path=self.a.dev,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          "stanford-colcc": ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  keep_events=1, only_keep=True)

        test_set1 = ACE2005Dataset(path=self.a.test,
                                   fields={"words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           "stanford-colcc": ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   keep_events=1, only_keep=True)

        print("dev set length", len(dev_set))
        print("dev set 1/1 length", len(dev_set1))

        print("test set length", len(test_set))
        print("test set 1/1 length", len(test_set1))

        self.a.label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
        self.a.label_weight[consts.O_LABEL] = 1.0
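        # Illustrative note (assumption, not from the original snippet): a per-class
        # weight vector like this one (5.0 for every event type, 1.0 for the "O" label)
        # is typically handed to the trigger-classification loss to counter class
        # imbalance, e.g. torch.nn.CrossEntropyLoss(weight=self.a.label_weight);
        # how train() actually consumes self.a.label_weight is not shown here.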

        self.a.hps = eval(self.a.hps)
        if "wemb_size" not in self.a.hps:
            self.a.hps["wemb_size"] = len(WordsField.vocab.itos)
        if "pemb_size" not in self.a.hps:
            self.a.hps["pemb_size"] = len(PosTagsField.vocab.itos)
        if "psemb_size" not in self.a.hps:
            self.a.hps["psemb_size"] = max([train_set.longest(), dev_set.longest(), test_set.longest()]) + 2
        if "eemb_size" not in self.a.hps:
            self.a.hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
        if "oc" not in self.a.hps:
            self.a.hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.hps:
            self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

        tester = self.get_tester(LabelField.vocab.itos)

        if self.a.finetune:
            log('init model from ' + self.a.finetune)
            model = self.load_model(self.a.finetune)
            log('model loaded, there are %i sets of params' % len(model.parameters_requires_grads()))
        else:
            model = self.load_model(None)
            log('model created from scratch, there are %i sets of params' % len(model.parameters_requires_grads()))

        if self.a.optimizer == "adadelta":
            optimizer_constructor = partial(torch.optim.Adadelta, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        elif self.a.optimizer == "adam":
            optimizer_constructor = partial(torch.optim.Adam, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        else:
            optimizer_constructor = partial(torch.optim.SGD, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay,
                                            momentum=0.9)

        log('optimizer in use: %s' % str(self.a.optimizer))

        if not os.path.exists(self.a.out):
            os.mkdir(self.a.out)
        with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
            pickle.dump(WordsField.vocab, f)
        with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
            pickle.dump(PosTagsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
            pickle.dump(EntityLabelsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
            pickle.dump(LabelField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
            pickle.dump(EventsField.vocab.stoi, f)
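        # Illustrative sketch (not from the original snippet): these pickled vocabularies
        # can be reloaded at inference time with the standard pickle API, for example:
        #   with open(os.path.join(self.a.out, "word.vec"), "rb") as f:
        #       words_vocab = pickle.load(f)   # illustrative variable name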

        log('init complete\n')

        self.a.word_i2s = WordsField.vocab.itos
        self.a.label_i2s = LabelField.vocab.itos
        self.a.role_i2s = EventsField.vocab.itos
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        train(
            model=model,
            train_set=train_set,
            dev_set=dev_set,
            test_set=test_set,
            optimizer_constructor=optimizer_constructor,
            epochs=self.a.epochs,
            tester=tester,
            parser=self.a,
            other_testsets={
                "dev 1/1": dev_set1,
                "test 1/1": test_set1,
            }
        )
        log('Done!')
Beispiel #15
0
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

labels = [ex.label for ex in train.examples]

train_iter, _, _ = torchtext.data.BucketIterator.splits((train, valid, test),
                                                        batch_size=args.bsz,
                                                        device=-1,
                                                        repeat=False)

_, valid_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, valid, test), batch_size=10, device=-1)

# Build the vocabulary with word embeddings
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))
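# Note (assumption about the targeted torchtext version): Vectors only downloads from
# `url` when 'wiki.simple.vec' is not already present in the local .vector_cache
# directory, and it then serializes a .pt cache there for faster reloads.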

#simple_vec = TEXT.vocab.vectors.clone()


#url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec'
#TEXT.vocab.load_vectors(vectors=Vectors('wiki.en.vec', url=url))
#complex_vec = TEXT.vocab.vectors
#
def output_test(model):
    "All models should be able to be run with following command."
    upload = []
    loss.reduce = False
    for batch in test_iter:
        # Your prediction data here (don't cheat!)
        x = batch.text
Beispiel #16
0
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)
        torch.backends.cudnn.benchmark = True

        ####################    loading event extraction dataset   ####################
        if self.a.train_ee:
            log('loading event extraction corpus from %s' % self.a.train_ee)

        # both for grounding and ee
        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)
        # only for ee
        LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)
        SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)

        if self.a.amr:
            colcc = 'simple-parsing'
        else:
            colcc = 'combined-parsing'
        print(colcc)

        train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                   fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           colcc: ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   amr=self.a.amr, keep_events=1)

        dev_ee_set = ACE2005Dataset(path=self.a.dev_ee,
                                 fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                         "pos-tags": ("POSTAGS", PosTagsField),
                                         "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                         colcc: ("ADJM", AdjMatrixField),
                                         "golden-event-mentions": ("LABEL", LabelField),
                                         "all-events": ("EVENT", EventsField),
                                         "all-entities": ("ENTITIES", EntitiesField)},
                                 amr=self.a.amr, keep_events=0)

        test_ee_set = ACE2005Dataset(path=self.a.test_ee,
                                  fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=self.a.amr, keep_events=0)

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL, vectors=pretrained_embedding)
            EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT, vectors=pretrained_embedding)
        else:
            LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
            EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)

        # add role mask
        self.a.role_mask = event_role_mask(self.a.train_ee, self.a.dev_ee, LabelField.vocab.stoi,
                                           EventsField.vocab.stoi, self.device)

        ####################    loading SR dataset   ####################
        # both for grounding and sr
        if self.a.train_sr:
            log('loading corpus from %s' % self.a.train_sr)

        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])
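        # Note (added comment): this is the standard ImageNet preprocessing pipeline:
        # resize, random crop/flip augmentation, and normalization with the ImageNet
        # channel means (0.485, 0.456, 0.406) and standard deviations (0.229, 0.224, 0.225).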

        vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
        vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
        vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

        # train_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.train_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)  #self.a.shuffle
        # dev_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.dev_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)
        # test_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.test_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)
        train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                     LabelField.vocab.stoi, EventsField.vocab.stoi,
                                     self.a.imsitu_ontology_file,
                                     self.a.train_sr, self.a.verb_mapping_file,
                                     self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                     self.a.object_detection_threshold,
                                     transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                     load_object=self.a.add_object, filter_place=self.a.filter_place)
        dev_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                   LabelField.vocab.stoi, EventsField.vocab.stoi,
                                   self.a.imsitu_ontology_file,
                                   self.a.dev_sr, self.a.verb_mapping_file,
                                   self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                   self.a.object_detection_threshold,
                                   transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                   load_object=self.a.add_object, filter_place=self.a.filter_place)
        test_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                    LabelField.vocab.stoi, EventsField.vocab.stoi,
                                    self.a.imsitu_ontology_file,
                                    self.a.test_sr, self.a.verb_mapping_file,
                                    self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                    self.a.object_detection_threshold,
                                    transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                    load_object=self.a.add_object, filter_place=self.a.filter_place)


        ####################    loading grounding dataset   ####################
        if self.a.train_grounding:
            log('loading grounding corpus from %s' % self.a.train_grounding)

        # only for grounding
        IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True)

        train_grounding_set = GroundingDataset(path=self.a.train_grounding,
                                               img_dir=self.a.img_dir_grounding,
                                               fields={"id": ("IMAGEID", IMAGEIDField),
                                                       "sentence_id": ("SENTID", SENTIDField),
                                                       "words": ("WORDS", WordsField),
                                                       "pos-tags": ("POSTAGS", PosTagsField),
                                                       "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                       colcc: ("ADJM", AdjMatrixField),
                                                       "all-entities": ("ENTITIES", EntitiesField),
                                                       # "image": ("IMAGE", IMAGEField),
                                                       },
                                               transform=transform,
                                               amr=self.a.amr,
                                               load_object=self.a.add_object,
                                               object_ontology_file=self.a.object_class_map_file,
                                               object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                               object_detection_threshold=self.a.object_detection_threshold,
                                               )

        dev_grounding_set = GroundingDataset(path=self.a.dev_grounding,
                                             img_dir=self.a.img_dir_grounding,
                                             fields={"id": ("IMAGEID", IMAGEIDField),
                                                     "sentence_id": ("SENTID", SENTIDField),
                                                     "words": ("WORDS", WordsField),
                                                     "pos-tags": ("POSTAGS", PosTagsField),
                                                     "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                     colcc: ("ADJM", AdjMatrixField),
                                                     "all-entities": ("ENTITIES", EntitiesField),
                                                     # "image": ("IMAGE", IMAGEField),
                                                     },
                                             transform=transform,
                                             amr=self.a.amr,
                                             load_object=self.a.add_object,
                                             object_ontology_file=self.a.object_class_map_file,
                                             object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                             object_detection_threshold=self.a.object_detection_threshold,
                                             )

        test_grounding_set = GroundingDataset(path=self.a.test_grounding,
                                              img_dir=self.a.img_dir_grounding,
                                              fields={"id": ("IMAGEID", IMAGEIDField),
                                                      "sentence_id": ("SENTID", SENTIDField),
                                                      "words": ("WORDS", WordsField),
                                                      "pos-tags": ("POSTAGS", PosTagsField),
                                                      "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                      colcc: ("ADJM", AdjMatrixField),
                                                      "all-entities": ("ENTITIES", EntitiesField),
                                                      # "image": ("IMAGE", IMAGEField),
                                                      },
                                              transform=transform,
                                              amr=self.a.amr,
                                              load_object=self.a.add_object,
                                              object_ontology_file=self.a.object_class_map_file,
                                              object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                              object_detection_threshold=self.a.object_detection_threshold,
                                              )

        ####################    build vocabulary   ####################

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS, vectors=pretrained_embedding)
        else:
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS)
        PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS, train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS)
        EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS,  train_grounding_set.ENTITYLABELS, dev_grounding_set.ENTITYLABELS)

        consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
        # print("O label is", consts.O_LABEL)
        consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
        # print("O label for AE is", consts.ROLE_O_LABEL)

        dev_ee_set1 = ACE2005Dataset(path=self.a.dev_ee,
                                  fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=self.a.amr, keep_events=1, only_keep=True)

        test_ee_set1 = ACE2005Dataset(path=self.a.test_ee,
                                   fields={"sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           colcc: ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   amr=self.a.amr, keep_events=1, only_keep=True)
        print("train set length", len(train_ee_set))

        print("dev set length", len(dev_ee_set))
        print("dev set 1/1 length", len(dev_ee_set1))

        print("test set length", len(test_ee_set))
        print("test set 1/1 length", len(test_ee_set1))

        # sr model initialization
        if not self.a.sr_hps_path:
            self.a.sr_hps = eval(self.a.sr_hps)
        embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
        embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
        embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)
        if "wvemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word)
        if "wremb_size" not in self.a.sr_hps:
            self.a.sr_hps["wremb_size"] = len(vocab_role.id2word)
        if "wnemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word)

        self.a.ee_label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
        self.a.ee_label_weight[consts.O_LABEL] = 1.0
        self.a.ee_arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5
        self.a.ee_hps = eval(self.a.ee_hps)
        if "wemb_size" not in self.a.ee_hps:
            self.a.ee_hps["wemb_size"] = len(WordsField.vocab.itos)
        if "pemb_size" not in self.a.ee_hps:
            self.a.ee_hps["pemb_size"] = len(PosTagsField.vocab.itos)
        if "psemb_size" not in self.a.ee_hps:
            # self.a.ee_hps["psemb_size"] = max([train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
            self.a.ee_hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest(), train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
        if "eemb_size" not in self.a.ee_hps:
            self.a.ee_hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
        if "oc" not in self.a.ee_hps:
            self.a.ee_hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.ee_hps:
            self.a.ee_hps["ae_oc"] = len(EventsField.vocab.itos)
        if "oc" not in self.a.sr_hps:
            self.a.sr_hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.sr_hps:
            self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos)

        ee_tester = EDTester(LabelField.vocab.itos, EventsField.vocab.itos, self.a.ignore_time_test)
        sr_tester = SRTester()
        g_tester = GroundingTester()
        j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test)

        ace_classifier = ACEClassifier(2 * self.a.ee_hps["lstm_dim"], self.a.ee_hps["oc"], self.a.ee_hps["ae_oc"], self.device)

        if self.a.finetune_ee:
            log('init ee model from ' + self.a.finetune_ee)
            ee_model = load_ee_model(self.a.ee_hps, self.a.finetune_ee, WordsField.vocab.vectors, self.device, ace_classifier)
            log('ee model loaded, there are %i sets of params' % len(ee_model.parameters_requires_grads()))
        else:
            ee_model = load_ee_model(self.a.ee_hps, None, WordsField.vocab.vectors, self.device, ace_classifier)
            log('ee model created from scratch, there are %i sets of params' % len(ee_model.parameters_requires_grads()))

        if self.a.finetune_sr:
            log('init sr model from ' + self.a.finetune_sr)
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, self.a.finetune_sr, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True)
            log('sr model loaded, there are %i sets of params' % len(sr_model.parameters_requires_grads()))
        else:
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, None, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True)
            log('sr model created from scratch, there are %i sets of params' % len(sr_model.parameters_requires_grads()))

        model = GroundingModel(ee_model, sr_model, self.get_device())
        # ee_model = torch.nn.DataParallel(ee_model)
        # sr_model = torch.nn.DataParallel(sr_model)
        # model = torch.nn.DataParallel(model)

        if self.a.optimizer == "adadelta":
            optimizer_constructor = partial(torch.optim.Adadelta, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        elif self.a.optimizer == "adam":
            optimizer_constructor = partial(torch.optim.Adam, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        else:
            optimizer_constructor = partial(torch.optim.SGD, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay,
                                            momentum=0.9)

        log('optimizer in use: %s' % str(self.a.optimizer))

        if not os.path.exists(self.a.out):
            os.mkdir(self.a.out)
        with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
            pickle.dump(WordsField.vocab, f)
        with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
            pickle.dump(PosTagsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
            pickle.dump(EntityLabelsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
            pickle.dump(LabelField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
            pickle.dump(EventsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "ee_hyps.json"), "w") as f:
            json.dump(self.a.ee_hps, f)
        with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f:
            json.dump(self.a.sr_hps, f)

        log('init complete\n')

        # ee mappings
        self.a.ee_word_i2s = WordsField.vocab.itos
        self.a.ee_label_i2s = LabelField.vocab.itos
        self.a.ee_role_i2s = EventsField.vocab.itos
        # sr mappings
        self.a.sr_word_i2s = vocab_noun.id2word
        self.a.sr_label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
        self.a.sr_role_i2s = vocab_role.id2word
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        joint_train(
            model_ee=ee_model,
            model_sr=sr_model,
            model_g=model,
            train_set_g=train_grounding_set,
            dev_set_g=dev_grounding_set,
            test_set_g=test_grounding_set,
            train_set_ee=train_ee_set,
            dev_set_ee=dev_ee_set,
            test_set_ee=test_ee_set,
            train_set_sr=train_sr_set,
            dev_set_sr=dev_sr_set,
            test_set_sr=test_sr_set,
            optimizer_constructor=optimizer_constructor,
            epochs=self.a.epochs,
            ee_tester=ee_tester,
            sr_tester=sr_tester,
            g_tester=g_tester,
            j_tester=j_tester,
            parser=self.a,
            other_testsets={
                "dev ee 1/1": dev_ee_set1,
                "test ee 1/1": test_ee_set1,
            },
            transform=transform,
            vocab_objlabel=vocab_noun.word2id
        )
        log('Done!')
Beispiel #17
0
    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        '''
        Reads the data from the files and builds the iterators, vocabulary and embeddings.
        Inputs:
            w2v_file (String): pre-trained word vector file (GloVe/Word2Vec)
            train_file (String): path to the training data
            test_file (String): path to the test data
            val_file (String): path to the validation data
        '''

        tokenizer = lambda sent: [
            x for x in nltk.word_tokenize(sent) if x != " "
        ]  # list comprehension that drops tokens that are just a space

        # Create the Field objects
        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          lower=True,
                          fix_length=self.config.max_sen_len)
        # sequential must be set to False for LABEL
        LABEL = data.Field(
            sequential=False, use_vocab=False
        )  # the labels are already integers, so no numericalization is needed and use_vocab=False
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load the data from pd.DataFrame into torchtext.data.Dataset
        train_df = self.get_pandas_df(train_file)
        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]  # build the training examples
        train_data = data.Dataset(train_examples, datafields)

        test_df = self.get_pandas_df(test_file)
        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]  # build the test examples
        test_data = data.Dataset(test_examples, datafields)

        # Build the validation set
        if val_file:
            val_df = self.get_pandas_df(val_file)
            val_example = [
                data.Example.fromlist(i, datafields)
                for i in val_df.values.tolist()
            ]
            val_data = data.Dataset(val_example, datafields)
        else:
            train_data, val_data = train_data.split(
                split_ratio=0.8)  # split off a validation set with split() (80/20)

        # Load the pre-trained word embeddings
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        # Build the iterator over the training data
        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        # Build the iterators over the validation and test data
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
        print("Local {} train examples".format(len(train_data)))
        print("Local {} test examples".format(len(test_data)))
        print("Local {} validation examples".format(len(val_data)))
Beispiel #18
0
TEXT = Field(sequential=True, tokenize=spacy_tok, lower=True)
tst_datafields = [("comment_text", TEXT)]
tst = TabularDataset(path=data_review_csv_path,
                     format='csv',
                     skip_header=True,
                     fields=tst_datafields)

novel_datafields = [("novel", TEXT)]
novel = TabularDataset(path=data_novelty_csv_path,
                       format='csv',
                       skip_header=True,
                       fields=novel_datafields)

cache = '.vector_cache'
vectors = Vectors(name=glove_path, cache=cache)
TEXT.build_vocab(tst, vectors=vectors)

data_iter = Iterator(tst,
                     batch_size=1,
                     device=-1,
                     sort=False,
                     sort_within_batch=False,
                     repeat=False,
                     shuffle=False)
novel_iter = Iterator(novel,
                      batch_size=1,
                      device=-1,
                      sort=False,
                      sort_within_batch=False,
                      repeat=False,
Beispiel #19
0
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)
        torch.backends.cudnn.benchmark = True

        # build text event vocab and ee_role vocab
        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False,
                                     use_vocab=False,
                                     batch_first=True)
        EntitiesField = EntityField(lower=False,
                                    batch_first=True,
                                    use_vocab=False)
        # only for ee
        LabelField = Field(lower=False,
                           batch_first=True,
                           pad_token='0',
                           unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)
        colcc = 'stanford-colcc'
        train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                      fields={
                                          "words": ("WORDS", WordsField),
                                          "pos-tags":
                                          ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions":
                                          ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions":
                                          ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities":
                                          ("ENTITIES", EntitiesField)
                                      },
                                      amr=False,
                                      keep_events=1)
        pretrained_embedding = Vectors(self.a.webd,
                                       ".",
                                       unk_init=partial(torch.nn.init.uniform_,
                                                        a=-0.15,
                                                        b=0.15))
        LabelField.build_vocab(train_ee_set.LABEL,
                               vectors=pretrained_embedding)
        EventsField.build_vocab(train_ee_set.EVENT,
                                vectors=pretrained_embedding)

        # consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
        # # print("O label is", consts.O_LABEL)
        # consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
        # # print("O label for AE is", consts.ROLE_O_LABEL)

        # create testing set
        if self.a.test_sr:
            log('loading corpus from %s' % self.a.test_sr)

        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
        ])

        vocab_noun = Vocab(os.path.join(self.a.vocab,
                                        'vocab_situation_noun.pkl'),
                           load=True)
        vocab_role = Vocab(os.path.join(self.a.vocab,
                                        'vocab_situation_role.pkl'),
                           load=True)
        vocab_verb = Vocab(os.path.join(self.a.vocab,
                                        'vocab_situation_verb.pkl'),
                           load=True)

        # train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
        #                              EventsField.vocab.stoi, LabelField.vocab.stoi,
        #                          self.a.imsitu_ontology_file,
        #                          self.a.train_sr, self.a.verb_mapping_file,
        #                          self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                          self.a.object_detection_threshold,
        #                          transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
        #                              load_object=self.a.add_object, filter_place=self.a.filter_place)
        # dev_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
        #                            EventsField.vocab.stoi, LabelField.vocab.stoi,
        #                          self.a.imsitu_ontology_file,
        #                          self.a.dev_sr, self.a.verb_mapping_file,
        #                          self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                          self.a.object_detection_threshold,
        #                          transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
        #                            load_object=self.a.add_object, filter_place=self.a.filter_place)
        test_sr_set = ImSituDataset(
            self.a.image_dir,
            vocab_noun,
            vocab_role,
            vocab_verb,
            EventsField.vocab.stoi,
            LabelField.vocab.stoi,
            self.a.imsitu_ontology_file,
            self.a.test_sr,
            self.a.verb_mapping_file,
            self.a.object_class_map_file,
            self.a.object_detection_pkl_file,
            self.a.object_detection_threshold,
            transform,
            filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
            load_object=self.a.add_object,
            filter_place=self.a.filter_place)

        embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(
            self.device)
        embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(
            self.device)
        embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(
            self.device)
        # consts.O_LABEL = vocab_verb.word2id['0'] # verb??
        # consts.ROLE_O_LABEL = vocab_role.word2id["OTHER"] #???

        # self.a.label_weight = torch.ones([len(vocab_sr.id2word)]) * 5 # more important to learn
        # self.a.label_weight[consts.O_LABEL] = 1.0 #???

        if not self.a.hps_path:
            self.a.hps = eval(self.a.hps)
        if self.a.textontology:
            if "wvemb_size" not in self.a.hps:
                self.a.hps["wvemb_size"] = len(LabelField.vocab.stoi)
            if "wremb_size" not in self.a.hps:
                self.a.hps["wremb_size"] = len(EventsField.vocab.itos)
            if "wnemb_size" not in self.a.hps:
                self.a.hps["wnemb_size"] = len(vocab_noun.id2word)
            if "oc" not in self.a.hps:
                self.a.hps["oc"] = len(LabelField.vocab.itos)
            if "ae_oc" not in self.a.hps:
                self.a.hps["ae_oc"] = len(EventsField.vocab.itos)
        else:
            if "wvemb_size" not in self.a.hps:
                self.a.hps["wvemb_size"] = len(vocab_verb.id2word)
            if "wremb_size" not in self.a.hps:
                self.a.hps["wremb_size"] = len(vocab_role.id2word)
            if "wnemb_size" not in self.a.hps:
                self.a.hps["wnemb_size"] = len(vocab_noun.id2word)
            if "oc" not in self.a.hps:
                self.a.hps["oc"] = len(LabelField.vocab.itos)
            if "ae_oc" not in self.a.hps:
                self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

        tester = self.get_tester()

        if self.a.textontology:
            if self.a.finetune:
                log('init model from ' + self.a.finetune)
                model = load_sr_model(self.a.hps,
                                      embeddingMatrix_noun,
                                      LabelField.vocab.vectors,
                                      EventsField.vocab.vectors,
                                      self.a.finetune,
                                      self.device,
                                      add_object=self.a.add_object)
                log('sr model loaded, there are %i sets of params' %
                    len(model.parameters_requires_grads()))
            else:
                model = load_sr_model(self.a.hps,
                                      embeddingMatrix_noun,
                                      LabelField.vocab.vectors,
                                      EventsField.vocab.vectors,
                                      None,
                                      self.device,
                                      add_object=self.a.add_object)
                log('sr model created from scratch, there are %i sets of params'
                    % len(model.parameters_requires_grads()))
        else:
            if self.a.finetune:
                log('init model from ' + self.a.finetune)
                model = load_sr_model(self.a.hps,
                                      embeddingMatrix_noun,
                                      embeddingMatrix_verb,
                                      embeddingMatrix_role,
                                      self.a.finetune,
                                      self.device,
                                      add_object=self.a.add_object)
                log('sr model loaded, there are %i sets of params' %
                    len(model.parameters_requires_grads()))
            else:
                model = load_sr_model(self.a.hps,
                                      embeddingMatrix_noun,
                                      embeddingMatrix_verb,
                                      embeddingMatrix_role,
                                      None,
                                      self.device,
                                      add_object=self.a.add_object)
                log('sr model created from scratch, there are %i sets of params'
                    % len(model.parameters_requires_grads()))

        # for name, para in model.named_parameters():
        #     if para.requires_grad:
        #         print(name)
        # exit(1)

        log('init complete\n')

        if not os.path.exists(self.a.out):
            os.mkdir(self.a.out)

        self.a.word_i2s = vocab_noun.id2word
        # if self.a.textontology:
        self.a.acelabel_i2s = LabelField.vocab.itos
        self.a.acerole_i2s = EventsField.vocab.itos
        # with open(os.path.join(self.a.out, "label_s2i.vec"), "wb") as f:
        #     pickle.dump(LabelField.vocab.stoi, f)
        # with open(os.path.join(self.a.out, "role_s2i.vec"), "wb") as f:
        #     pickle.dump(EventsField.vocab.stoi, f)
        # with open(os.path.join(self.a.out, "label_i2s.vec"), "wb") as f:
        #     pickle.dump(LabelField.vocab.itos, f)
        # with open(os.path.join(self.a.out, "role_i2s.vec"), "wb") as f:
        #     pickle.dump(EventsField.vocab.itos, f)
        # else:
        self.a.label_i2s = vocab_verb.id2word  #LabelField.vocab.itos
        self.a.role_i2s = vocab_role.id2word
        # save as Vocab
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        # with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f:
        #     json.dump(self.a.hps, f)

        test_iter = torch.utils.data.DataLoader(dataset=test_sr_set,
                                                batch_size=self.a.batch,
                                                shuffle=False,
                                                num_workers=2,
                                                collate_fn=image_collate_fn)

        verb_roles = test_sr_set.get_verb_role_mapping()

        if 'visualize_path' not in self.a:
            visualize_path = None
        else:
            visualize_path = self.a.visualize_path

        test_loss, test_verb_p, test_verb_r, test_verb_f1, \
        test_role_p, test_role_r, test_role_f1, \
        test_noun_p, test_noun_r, test_noun_f1, \
        test_triple_p, test_triple_r, test_triple_f1, \
        test_noun_p_relaxed, test_noun_r_relaxed, test_noun_f1_relaxed, \
        test_triple_p_relaxed, test_triple_r_relaxed, test_triple_f1_relaxed = run_over_data_sr(data_iter=test_iter,
                                                                                                optimizer=None,
                                                                                                model=model,
                                                                                                need_backward=False,
                                                                                                MAX_STEP=ceil(len(
                                                                                                    test_sr_set) / self.a.batch),
                                                                                                tester=tester,
                                                                                                hyps=model.hyperparams,
                                                                                                device=model.device,
                                                                                                maxnorm=self.a.maxnorm,
                                                                                                word_i2s=self.a.word_i2s,
                                                                                                label_i2s=self.a.label_i2s,
                                                                                                role_i2s=self.a.role_i2s,
                                                                                                verb_roles=verb_roles,
                                                                                                load_object=self.a.add_object,
                                                                                                visualize_path=visualize_path,
                                                                                                save_output=os.path.join(
                                                                                                    self.a.out,
                                                                                                    "test_final.txt"))
        print("\nFinally test loss: ", test_loss, "\ntest verb p: ",
              test_verb_p, " test verb r: ", test_verb_r, " test verb f1: ",
              test_verb_f1, "\ntest role p: ", test_role_p, " test role r: ",
              test_role_r, " test role f1: ", test_role_f1, "\ntest noun p: ",
              test_noun_p, " test noun r: ", test_noun_r, " test noun f1: ",
              test_noun_f1, "\ntest triple p: ", test_triple_p,
              " test triple r: ", test_triple_r, " test triple f1: ",
              test_triple_f1, "\ntest noun p relaxed: ", test_noun_p_relaxed,
              " test noun r relaxed: ", test_noun_r_relaxed,
              " test noun f1 relaxed: ", test_noun_f1_relaxed,
              "\ntest triple p relaxed: ", test_triple_p_relaxed,
              " test triple r relaxed: ", test_triple_r_relaxed,
              " test triple f1 relaxed: ", test_triple_f1_relaxed)
Beispiel #20
0
batch_size = 64
embedding_dim = 300
hidden_size = 128
n_filters = 200
filters_sizes = [2, 3, 4, 5]
sentence_max_len = 400
output_dim = 2
dropout = 0.5
num_epochs = 50
device = torch.device("cuda:5")
lr = 0.0001

if not os.path.exists('.vector_cache'):
    os.mkdir('.vector_cache')
vectors = Vectors(name='./glove.840B.300d.txt')
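# Note (assumption about the default behavior): Vectors defaults to cache='.vector_cache',
# which is why the directory is created above; on the first run the raw
# glove.840B.300d.txt file is parsed and a .pt cache is written there for faster reloads.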


def tokenizer(text):
    #return [tok.text for tok in spacy_en.tokenize(text)]
    return [tok for tok in nltk.word_tokenize(text)]
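# Note (added comment): nltk.word_tokenize depends on the 'punkt' tokenizer models;
# run nltk.download('punkt') once if they are not installed yet.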


TEXT = data.Field(sequential=True,
                  stop_words=None,
                  tokenize=tokenizer,
                  lower=True,
                  fix_length=sentence_max_len,
                  batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
Beispiel #21
0
    def load_my_data(self, word_embedding_pkl, pairs_pkl):
        """
        Loads the data from file
        :param word_embedding_pkl: absolute path to word_embeddings {Glove/Word2Vec}
        :param pairs_pkl:       # pkl file save data
        :param context_flag:    # 0: bairly include pairs
                                # 1: include pairs and local context
                                # 2: include pairs and global context
                                # 3: include pairs, local context and global context
        :return:
        """
        tokenizer = lambda text: [x for x in text]

        TEXT = data.Field(sequential=True,
                          tokenize=tokenizer,
                          fix_length=self.config.max_sen_len)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text", TEXT), ("label", LABEL)]

        # Load data from pd.DataFrame into torchtext.data.Dataset
        train_df, test_df, val_df = self.get_my_pandas_df(
            pairs_pkl, self.config.context_flag)

        train_examples = [
            data.Example.fromlist(i, datafields)
            for i in train_df.values.tolist()
        ]
        train_data = data.Dataset(train_examples, datafields)

        test_examples = [
            data.Example.fromlist(i, datafields)
            for i in test_df.values.tolist()
        ]
        test_data = data.Dataset(test_examples, datafields)

        val_examples = [
            data.Example.fromlist(i, datafields)
            for i in val_df.values.tolist()
        ]
        val_data = data.Dataset(val_examples, datafields)

        TEXT.build_vocab(train_data, vectors=Vectors(name=word_embedding_pkl))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)

        print('Loaded %d training examples' % len(train_data))
        print('Loaded %d test examples' % len(test_data))
        print('Loaded %d validation examples' % len(val_data))
Beispiel #22
0
                            fix_length=40)

LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

# NOTE: cast the label column to int before saving the csv with pandas, otherwise an error is raised here
train_ds, val_ds, test_ds = torchtext.data.TabularDataset.splits(
    path='drive/My Drive/dataset/CBET/ekman',
    train='train.csv',
    validation='val.csv',
    test='test.csv',
    format='csv',
    fields=[('Text', TEXT), ('Label', LABEL)])

from torchtext.vocab import Vectors

english_fasttext_vectors = Vectors(name='drive/My Drive/wiki-news-300d-1M.vec')

print(english_fasttext_vectors.dim)
print(len(english_fasttext_vectors.itos))

# Build the vocabulary
TEXT.build_vocab(train_ds, vectors=english_fasttext_vectors)

print(TEXT.vocab.stoi)

batch_size = 64
d_model = 300
hidden_size = 512
output_dim = 5
dropout_rate = 0.1
Beispiel #23
0
# Define the pairing between the csv columns and their Fields
fields = [('data', TEXT), ('label', LABEL)]

# note skip_header (the csv files have a header row)
train, val = TabularDataset.splits(path='data',
                                   train='train.csv',
                                   validation='val.csv',
                                   format='csv',
                                   fields=fields,
                                   skip_header=True)

# train, val = TabularDataset().splits(path='./data', train='train.csv', validation='val.csv',
#                                    format='csv', fields=fields, skip_header=True)

# Build the word vectors loaded from a local file
vectors = Vectors(name=bc.embedding_loc, cache=bc.cach)

# Build the vocabulary
TEXT.build_vocab(train, val, vectors=vectors)
LABEL.build_vocab(train, val, vectors=vectors)

# print(LABEL.vocab.stoi['0']) # '1':2, '0':3

train_iter = BucketIterator(train, batch_size=bc.batch_size, \
sort_key=lambda x: len(x.data), sort_within_batch=True, shuffle=True)

val_iter = BucketIterator(val, batch_size=bc.batch_size, \
sort_key=lambda x: len(x.data), sort_within_batch=True, shuffle=True)
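# Note (assumption): sort_within_batch=True keeps every batch sorted by the sort_key
# (sequence length), the order nn.utils.rnn.pack_padded_sequence expects downstream;
# whether this particular model packs its sequences is not shown in the snippet.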

vocab_size = TEXT.vocab.vectors.shape
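# Illustrative sketch (not from the original snippet): TEXT.vocab.vectors has shape
# (vocab_size, embedding_dim) and is commonly copied into an nn.Embedding layer, e.g.
#   embedding = torch.nn.Embedding(*TEXT.vocab.vectors.shape)
#   embedding.weight.data.copy_(TEXT.vocab.vectors)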
Beispiel #24
0
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)
        torch.backends.cudnn.benchmark = True

        # create training set
        if self.a.test_ee:
            log('loading event extraction corpus from %s' % self.a.test_ee)

        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)
        EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)
        if self.a.amr:
            colcc = 'amr-colcc'
        else:
            colcc = 'stanford-colcc'
        print(colcc)

        train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                      fields={"words": ("WORDS", WordsField),
                                              "pos-tags": ("POSTAGS", PosTagsField),
                                              "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                              colcc: ("ADJM", AdjMatrixField),
                                              "golden-event-mentions": ("LABEL", LabelField),
                                              "all-events": ("EVENT", EventsField),
                                              "all-entities": ("ENTITIES", EntitiesField)},
                                      amr=self.a.amr, keep_events=1)

        dev_ee_set = ACE2005Dataset(path=self.a.dev_ee,
                                    fields={"words": ("WORDS", WordsField),
                                            "pos-tags": ("POSTAGS", PosTagsField),
                                            "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                            colcc: ("ADJM", AdjMatrixField),
                                            "golden-event-mentions": ("LABEL", LabelField),
                                            "all-events": ("EVENT", EventsField),
                                            "all-entities": ("ENTITIES", EntitiesField)},
                                    amr=self.a.amr, keep_events=0)

        test_ee_set = ACE2005Dataset(path=self.a.test_ee,
                                     fields={"words": ("WORDS", WordsField),
                                             "pos-tags": ("POSTAGS", PosTagsField),
                                             "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                             colcc: ("ADJM", AdjMatrixField),
                                             "golden-event-mentions": ("LABEL", LabelField),
                                             "all-events": ("EVENT", EventsField),
                                             "all-entities": ("ENTITIES", EntitiesField)},
                                     amr=self.a.amr, keep_events=0)

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, vectors=pretrained_embedding)
        else:
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS)
        PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS)
        EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS)
        LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
        EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)
        consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
        # print("O label is", consts.O_LABEL)
        consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
        # print("O label for AE is", consts.ROLE_O_LABEL)

        self.a.label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
        self.a.label_weight[consts.O_LABEL] = 1.0
        self.a.arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5
        # add role mask
        self.a.role_mask = event_role_mask(self.a.test_ee, self.a.train_ee, self.a.dev_ee, LabelField.vocab.stoi,
                                           EventsField.vocab.stoi, self.device)
        # print('self.a.hps', self.a.hps)
        if not self.a.hps_path:
            self.a.hps = eval(self.a.hps)
        if "wemb_size" not in self.a.hps:
            self.a.hps["wemb_size"] = len(WordsField.vocab.itos)
        if "pemb_size" not in self.a.hps:
            self.a.hps["pemb_size"] = len(PosTagsField.vocab.itos)
        if "psemb_size" not in self.a.hps:
            self.a.hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest()]) + 2
        if "eemb_size" not in self.a.hps:
            self.a.hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
        if "oc" not in self.a.hps:
            self.a.hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.hps:
            self.a.hps["ae_oc"] = len(EventsField.vocab.itos)

        tester = self.get_tester(LabelField.vocab.itos, EventsField.vocab.itos)

        if self.a.finetune:
            log('init model from ' + self.a.finetune)
            model = load_ee_model(self.a.hps, self.a.finetune, WordsField.vocab.vectors, self.device)
            log('model loaded, there are %i sets of params' % len(model.parameters_requires_grads()))
        else:
            model = load_ee_model(self.a.hps, None, WordsField.vocab.vectors, self.device)
            log('model created from scratch, there are %i sets of params' % len(model.parameters_requires_grads()))

        self.a.word_i2s = WordsField.vocab.itos
        self.a.label_i2s = LabelField.vocab.itos
        self.a.role_i2s = EventsField.vocab.itos
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        # train_iter = BucketIterator(train_ee_set, batch_size=self.a.batch,
        #                             train=True, shuffle=False, device=-1,
        #                             sort_key=lambda x: len(x.POSTAGS))
        # dev_iter = BucketIterator(dev_ee_set, batch_size=self.a.batch, train=False,
        #                           shuffle=False, device=-1,
        #                           sort_key=lambda x: len(x.POSTAGS))
        test_iter = BucketIterator(test_ee_set, batch_size=self.a.batch, train=False,
                                   shuffle=False, device=-1,
                                   sort_key=lambda x: len(x.POSTAGS))

        print("\nStarting testing ...\n")

        # Testing phase
        test_loss, test_ed_p, test_ed_r, test_ed_f1, \
            test_ae_p, test_ae_r, test_ae_f1 = run_over_data(
                data_iter=test_iter,
                optimizer=None,
                model=model,
                need_backward=False,
                MAX_STEP=ceil(len(test_ee_set) / self.a.batch),
                tester=tester,
                hyps=model.hyperparams,
                device=model.device,
                maxnorm=self.a.maxnorm,
                word_i2s=self.a.word_i2s,
                label_i2s=self.a.label_i2s,
                role_i2s=self.a.role_i2s,
                weight=self.a.label_weight,
                arg_weight=self.a.arg_weight,
                save_output=os.path.join(self.a.out, "test_final.txt"),
                role_mask=self.a.role_mask)

        print("\nFinally test loss: ", test_loss,
              "\ntest ed p: ", test_ed_p,
              " test ed r: ", test_ed_r,
              " test ed f1: ", test_ed_f1,
              "\ntest ae p: ", test_ae_p,
              " test ae r: ", test_ae_r,
               " test ae f1: ", test_ae_f1)
Beispiel #25
0
train, val, test = torchtext.datasets.SST.splits(
    TEXT, LABEL, filter_pred=lambda ex: ex.label != 'neutral')

print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

TEXT.build_vocab(train)
LABEL.build_vocab(train)
print('len(TEXT.vocab)', len(TEXT.vocab))
print('len(LABEL.vocab)', len(LABEL.vocab))

train_iter, val_iter, test_iter = torchtext.data.BucketIterator.splits(
    (train, val, test), batch_size=bs, device=-1, repeat=False)

# glove = GloVe(name='6B',dim=300)
TEXT.vocab.load_vectors(vectors=Vectors('glove.6B.300d.txt'))
glove = TEXT.vocab.vectors

# Load fastText embeddings into the vocabulary (this replaces the GloVe vectors loaded above)
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
TEXT.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url))

print("Word embeddings size ", TEXT.vocab.vectors.size())
word2vec = TEXT.vocab.vectors
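
# Quick sketch (illustrative, not from the original notebook): look up a single
# token's vector from the vocabulary after load_vectors; 'film' is assumed to be
# in the SST training vocabulary.
film_idx = TEXT.vocab.stoi['film']
film_vec = TEXT.vocab.vectors[film_idx]  # 300-dim fastText vector for that token
print('film vector norm:', film_vec.norm().item())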

############################################
# With help from Yunjey Pytorch Tutorial on Github

# class CNN(nn.Module):
#   def __init__(self):
#       super(CNN, self).__init__()
Beispiel #26
0
def train(args, writer, is_train=True):

    # Build train dataset
    fields, train_dataset = build_and_cache_dataset(args, mode='train')
    # for i in range(5):
    #     print(train_dataset[i].category,train_dataset[i].news)
    # return

    # Build vocab
    ID, CATEGORY, NEWS = fields
    vectors = Vectors(name=args.embed_path, cache=args.data_dir)
    # NOTE: use train_dataset to build vocab!
    NEWS.build_vocab(
        train_dataset,
        max_size=args.vocab_size,
        vectors=vectors,
        unk_init=torch.nn.init.xavier_normal_,
    )
    CATEGORY.build_vocab(train_dataset)

    # print("查找第1000个单词:"+NEWS.vocab.itos[1000])
    # print("查找单词‘每个’的索引:"+str(NEWS.vocab.stoi[r'每个']))
    # print("词向量矩阵的维度:"+str(NEWS.vocab.vectors.shape))
    # word_vec = NEWS.vocab.vectors[NEWS.vocab.stoi['每个']]
    # print("单词‘每个’的词向量为:"+str(word_vec))
    # return

    # model = TextClassifier(
    #     vocab_size=len(NEWS.vocab),
    #     output_dim=args.num_labels,
    #     pad_idx=NEWS.vocab.stoi[NEWS.pad_token],
    #     dropout=args.dropout,
    # )

    # use the bidirectional GRU + attention model
    model = bigru_attention(
        vocab_size=len(NEWS.vocab),
        output_dim=args.num_labels,
        pad_idx=NEWS.vocab.stoi[NEWS.pad_token],
        dropout=args.dropout,
    )

    # Init embeddings for model.
    # NOTE: nn.Embedding.from_pretrained is a classmethod that returns a new layer,
    # so calling it on model.embedding does not load the weights; copy them instead.
    model.embedding.weight.data.copy_(NEWS.vocab.vectors)

    bucket_iterator = BucketIterator(
        train_dataset,
        batch_size=args.train_batch_size,
        sort_within_batch=True,
        shuffle=True,
        sort_key=lambda x: len(x.news),
        device=args.device,
    )
    f1_score = 0
    if os.listdir("output_dir"):
        f1_score = float(
            os.listdir("output_dir")[0].split("_")[1].split(".p")[0])
        model.load_state_dict(
            torch.load("output_dir/" + os.listdir("output_dir")[0]))
    model.to(args.device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(),
                     lr=args.learning_rate,
                     eps=args.adam_epsilon)
    # scheduler = lr_scheduler.OneCycleLR(optimizer,
    #                        max_lr=args.learning_rate,
    #                        epochs=args.num_train_epochs,
    #                        steps_per_epoch=len(bucket_iterator))
    #scheduler = lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1,last_epoch = -1 )

    global_step = 0
    model.zero_grad()

    if is_train:
        train_trange = trange(0, args.num_train_epochs, desc="Train epoch")
        for _ in train_trange:
            epoch_iterator = tqdm(bucket_iterator, desc='Training')
            results_f1_score = 0
            for step, batch in enumerate(epoch_iterator):
                model.train()

                news, news_lengths = batch.news  # news.size(): [8, 64]
                category = batch.category  # category.size(): [64]
                # preds = model(news, news_lengths)
                preds = model(news)
                loss = criterion(preds, category)
                optimizer.zero_grad()  # clear gradients accumulated from the previous step
                loss.backward()
                optimizer.step()
                # scheduler.step()
                # Logging
                writer.add_scalar('Train/Loss', loss.item(), global_step)
                # writer.add_scalar('Train/lr',
                #                   scheduler.get_last_lr()[0], global_step)
                # NOTE: Update model, optimizer should update before scheduler

                global_step += 1

                # NOTE:Evaluate
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    results = evaluate(args, model, CATEGORY.vocab, NEWS.vocab)
                    results_f1_score = results['f1']
                    for key, value in results.items():
                        writer.add_scalar("Eval/{}".format(key), value,
                                          global_step)

                # NOTE: save model
                # if args.save_steps > 0 and global_step % args.save_steps == 0:
                #     save_model(args, model, optimizer, scheduler, global_step)
                if results_f1_score > f1_score:
                    try:
                        os.remove("output_dir/model_" + str(f1_score) + ".pt")
                    except FileNotFoundError:
                        print("No previous checkpoint to remove.")
                    torch.save(
                        model.state_dict(),
                        "output_dir/model_" + str(results_f1_score) + ".pt")
                    f1_score = results_f1_score
                    print("So far the best score is:" + str(f1_score) +
                          "+++++++++++++++++++++++++++++++")
        writer.close()
    else:
        test(args, model, CATEGORY.vocab, NEWS.vocab)
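
# Side sketch (not part of the original training script): the two standard ways to
# seed an nn.Embedding with a pretrained matrix; shapes here are arbitrary stand-ins
# for NEWS.vocab.vectors.
import torch
import torch.nn as nn

pretrained = torch.randn(1000, 300)
frozen_emb = nn.Embedding.from_pretrained(pretrained)  # builds a new layer, frozen by default
trainable_emb = nn.Embedding(1000, 300)
trainable_emb.weight.data.copy_(pretrained)            # copies into an existing layer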
Beispiel #27
0
 def _read_word_embeddings(self, file):
     from torchtext.vocab import Vectors
     from pathlib import Path
     path = Path(file)
     vectors = Vectors(name=path.name, cache=path.parent)
     return vectors
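
# Usage sketch (the path below is hypothetical): after loading, Vectors exposes
# stoi/itos and the embedding matrix; unknown words fall back to unk_init (zeros by default).
from pathlib import Path
from torchtext.vocab import Vectors

p = Path('/data/embeddings/glove.6B.50d.txt')   # hypothetical location
vecs = Vectors(name=p.name, cache=p.parent)
print(len(vecs.itos), vecs.dim)                 # vocabulary size and dimensionality
print(vecs['the'].shape, vecs['zzzz-unknown'])  # known word vs. zero-vector fallback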
Beispiel #28
0
print(EN.vocab.freqs.most_common(10))
print("Size of English vocab", len(EN.vocab))
print(EN.vocab.stoi["<s>"], EN.vocab.stoi["</s>"]) # vocab index for <s>, </s>

BATCH_SIZE = 32
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=BATCH_SIZE, device=-1,
                                                  repeat=False, sort_key=lambda x: len(x.src))

batch = next(iter(train_iter))
print("Source size", batch.src.size())
print("Target size", batch.trg.size())

# https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
if word2vec:
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
    EN.vocab.load_vectors(vectors=Vectors('wiki.simple.vec', url=url)) # feel free to alter path
    print("Simple English embeddings size", EN.vocab.vectors.size())
    url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.de.vec'
    DE.vocab.load_vectors(vectors=Vectors('wiki.de.vec', url=url)) # feel free to alter path
    print("German embeddings size", DE.vocab.vectors.size())

print("REMINDER!!! Did you create ../../models/HW3?????")

unk_token = EN.vocab.stoi["<unk>"]
pad_token = EN.vocab.stoi["<pad>"]
sos_token = EN.vocab.stoi["<s>"]
eos_token = EN.vocab.stoi["</s>"]
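
# Sketch (not from the original homework code): the pad index defined above is
# typically passed to the loss so that padded target positions are ignored.
import torch.nn as nn

criterion = nn.CrossEntropyLoss(ignore_index=pad_token)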

''' TODO
Fix bidirectional S2S
Does ppl change if you average loss the actual Yoon way?
Beispiel #29
0
def get_utterance_and_context_loader(max_length=256, batch_size=64):
    max_length = max_length
    batch_size = batch_size

    ID = torchtext.data.Field(sequential=False, use_vocab=False)
    UTTERANCE = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")
    SPEAKER = torchtext.data.Field(sequential=False, use_vocab=True)
    CONTEXT_ALL = torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")
    LABEL = torchtext.data.Field(sequential=False, use_vocab=False, preprocessing=lambda l: 0 if l=='TRUE' else 1, is_target=True)
   #  CONTEXT1 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT2 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT3 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT4 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT5 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT6 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT7 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT8 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT9 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   #  CONTEXT10 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")
   # CONTEXT11 =  torchtext.data.Field(sequential=True, tokenize=tokenizer_with_preprocessing, use_vocab=True, lower=True, include_lengths=True, batch_first=True, fix_length=max_length, init_token="<cls>", eos_token="<eos>")


    ds = torchtext.data.TabularDataset(
        path='./MUStARD/sarcasm_data.csv', format='csv',
        fields=[("id", ID),
                ("utterance", UTTERANCE),
                ("speaker", SPEAKER),
                ("context_all", CONTEXT_ALL),
                ("label", LABEL)],
                skip_header=True)

    # test dataloader
    # print(f'number of examples: {len(ds)}')
    # print(f'first example: {vars(ds[1])}')

    # split ds into train, val, test at random with an 8:1:1 ratio
    train_ds, val_ds, test_ds = ds.split(split_ratio=[0.8, 0.1, 0.1], random_state=random.seed(1234))

    # test split
    # print(f'train size: {len(train_ds)}, validation size: {len(val_ds)}, test size: {len(test_ds)}')
    # print(f'first example: {vars(train_ds[1])}')


    english_fasttext_vectors = Vectors(name='data/wiki-news-300d-1M.vec')

    # Build the vocabulary with vectors attached (slightly unusual: UTTERANCE and CONTEXT_ALL share one vocab).
    UTTERANCE.build_vocab(ds.utterance, ds.context_all, vectors=english_fasttext_vectors, min_freq=1)
    CONTEXT_ALL.vocab = UTTERANCE.vocab
    # the plain per-field build_vocab alternative:
    # UTTERANCE.build_vocab(ds, vectors=english_fasttext_vectors, min_freq=1)
    # CONTEXT_ALL.build_vocab(ds, vectors=english_fasttext_vectors, min_freq=1)
    SPEAKER.build_vocab(ds)

    # inspect the vocabulary vectors
    # print(UTTERANCE.vocab.vectors.shape)
    # print(UTTERANCE.vocab.vectors)

    # inspect the word-to-index ordering of the vocabulary
    # print(CONTEXT_ALL.vocab.stoi)

    # make dataloader
    train_dl = torchtext.data.Iterator(train_ds, batch_size=24, train=True)
    val_dl = torchtext.data.Iterator(val_ds, batch_size=24, train=False, sort=False)
    test_dl = torchtext.data.Iterator(test_ds, batch_size=24, train=False, sort=False)
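
    # Small sketch (illustrative, not from the original notebook): because the fields
    # above set include_lengths=True, each text attribute of a batch is a
    # (token_ids, lengths) pair rather than a single tensor.
    sample_batch = next(iter(train_dl))
    utt_ids, utt_lens = sample_batch.utterance
    # utt_ids: (batch_size, fix_length) since batch_first=True; utt_lens: (batch_size,)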

    # sanity check using the training loader
    batch = next(iter(train_dl))
    print(batch.utterance)
    print(batch.label)

    # return the loaders so callers can actually consume them
    return train_dl, val_dl, test_dl
Beispiel #30
0
    device=torch_device,  # if running on a GPU, set this to the GPU device index
    sort_key=lambda x: len(x.sentence),  # BucketIterator needs the text length for bucketing
    sort_within_batch=False,
    repeat=False
)

test_iter = data.Iterator(test, batch_size=test_batch_size, device=torch_device, sort=False, sort_within_batch=False, repeat=False)


from torchtext.vocab import Vectors
import os


cache='../vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = Vectors(name='glove.6B.'+str(vocab_dimension)+'d.txt', cache=cache)

print("build vocab: start")
TEXT.build_vocab(train, vectors=vectors)

vocab = TEXT.vocab
weight_matrix = vocab.vectors
glove_vocabulary = set(vectors.stoi)
train_data_vocabulary = set(vocab.stoi)
print("字典词汇数/训练集登录词汇数/训练集未登录词汇数={}/{}/{}".format(len(glove_vocabulary),
            len(train_data_vocabulary & glove_vocabulary), len(train_data_vocabulary - glove_vocabulary)))

print("build vocab: end")