def __init__(self,
                 data_dir,
                 seq_length,
                 vocab_size=None,
                 vocab=None,
                 training=False):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "simpsons.txt"),
                  "r",
                  encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)
    def __init__(
        self,
        data_dir,
        vocab_size=None,
        vocab=None,
        seq_length=40,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):

        self.data_dir = data_dir
        self.vocab = Vocabulary(vocab_from_pretrained, do_lower_case)
        self.seq_length = seq_length

        data_all = pd.read_csv(os.path.join(self.data_dir, "combined-data.csv"), sep=' ', header=None, encoding="cp1252")
        data_all[1] = data_all[1] + " " + data_all[2]
        data_all = data_all[[0, 1]]
        data_all.columns = ['label', 'text']
        data_all = data_all[['text', 'label']]
        data_all = data_all[~data_all.text.isna()]
        data_all.label = data_all.label.apply(lambda x: int(x[-1]))
        data_all.text = data_all.text.apply(lambda x: x.lower())

        data_all = data_all.sample(1000)
        
        self.train_df = data_all.copy() #pd.DataFrame({"text": [], "label": []})
        self.val_df = pd.DataFrame({"text": [], "label": []})
        self.test_df = data_all.copy() # pd.DataFrame({"text": [], "label": []}) #data_all.copy()

        del data_all

        if training:
            self.train()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                self.vocab.add_text(
                    " ".join(pd.concat([self.train_df, self.val_df], sort=False).text.values)
                )
                self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.test()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                raise Exception("Vocab file is not specified in test mode!")
        
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)
    def __init__(self, data_dir, seq_length, vocab_size, vocab=None):
        # Note: the "mbcs" codec is Windows-only; use e.g. "latin-1" on other platforms.
        self.df = pd.read_csv(os.path.join(data_dir, 'spam.csv'),
                              encoding="mbcs")
        self.vocab = Vocabulary()
        self.labels = []
        for x in self.df.v1:
            if x == 'ham':
                self.labels.append(0)
            else:
                self.labels.append(1)
        self.seq_length = seq_length
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(" ".join(self.df["v2"].values))
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(" ".join(self.df["v2"].values))
        self.tokens = []
        for content in self.df["v2"].values:
            self.tokens.append(
                self.vocab.tokenize(self.vocab.clean_text(content)))
def build_vocab(anns, threshold=4):
    """Build a simple vocabulary wrapper."""

    counter = Counter()

    for i, ann in enumerate(anns):
        # print('Processing {}/{}...'.format(i+1, len(anns)))
        caption = ann.get('caption')
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(caption.lower())
        counter.update(tokens)

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    vocab = Vocabulary()
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for i, word in enumerate(words):
        vocab.add_word(word)
    return vocab
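
build_vocab() above only relies on a small Vocabulary interface (add_word plus index lookup). A minimal, self-contained sketch of such a wrapper, assuming a dict-backed word-to-index mapping with an <unk> fallback; the project's actual Vocabulary class may differ:

class MinimalVocabulary:
    """Hedged stand-in for the Vocabulary wrapper assumed by build_vocab()."""

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}

    def add_word(self, word):
        if word not in self.word2idx:
            idx = len(self.word2idx)
            self.word2idx[word] = idx
            self.idx2word[idx] = word

    def __call__(self, word):
        # Unknown words fall back to the <unk> index.
        return self.word2idx.get(word, self.word2idx.get('<unk>'))

    def __len__(self):
        return len(self.word2idx)
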
Example #5
    def __init__(
        self,
        data_dir,
        seq_length,
        vocab_size=None,
        vocab=None,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "rick_and_morty.txt"),
                  "r",
                  encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)
def parse_exist_vocab(path):
    with open(path, 'rb') as f:
        data = pickle.load(f)
    idx2word = data[2]
    word2idx = data[3]

    vocab = Vocabulary()
    vocab.set_content(word2idx=word2idx, idx2word=idx2word)
    return vocab
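
parse_exist_vocab() assumes a pickled sequence whose elements at positions 2 and 3 are the idx2word and word2idx mappings. A hedged sketch of writing a compatible file (the contents of positions 0 and 1 are unknown here and left as None):

import pickle

word2idx = {'<pad>': 0, '<unk>': 1, 'hello': 2}
idx2word = {idx: word for word, idx in word2idx.items()}

# Positions 0 and 1 are placeholders; only indices 2 and 3 are read above.
with open('vocab_dump.pkl', 'wb') as f:
    pickle.dump((None, None, idx2word, word2idx), f)

# vocab = parse_exist_vocab('vocab_dump.pkl')  # needs the project's Vocabulary.set_content
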
Example #7
    def __init__(self, mode):
        self.mode = mode
        assert self.mode in ['train', 'valid', 'test']
        self.root = os.path.join('data', 'yelp')
        voc_f = os.path.join(self.root, 'yelp.vocab')
        self.max_len = config.max_len

        self.sentences = []
        with open(os.path.join(self.root, '{}.txt'.format(self.mode))) as f:
            for line in f.readlines():
                if len(line.strip()) in [0, 1]:
                    continue
                words = line.strip().split()
                assert words[0] in [str(dig) for dig in range(5)], '{} does not start with the rating'.format(words)
                self.sentences.append(words[1:])

        print('Yelp data successfully read.')

        # Build vocabulary.
        if self.mode == 'train':
            print('----- Building vocab -----')
            build_vocab(self.sentences, voc_f, min_occur=1)  # TODO

        # Load vocabulary.
        print('----- Loading vocab -----')
        self.vocab = Vocabulary(voc_f)
        print('vocabulary size:', self.vocab.size)
        self.pad = self.vocab.word2id['<pad>']
        self.go = self.vocab.word2id['<go>']
        self.eos = self.vocab.word2id['<eos>']
        self.unk = self.vocab.word2id['<unk>']
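
The <pad>/<go>/<eos>/<unk> ids cached at the end are typically used to frame and pad each sentence before batching. A hedged sketch of that step, assuming a plain word2id dict; this is not the project's actual collate code:

def frame_sentence_sketch(words, word2id, max_len, pad, go, eos, unk):
    # <go> + truncated token ids + <eos>, then right-pad to a fixed length.
    ids = [go] + [word2id.get(w, unk) for w in words[:max_len]] + [eos]
    ids += [pad] * (max_len + 2 - len(ids))
    return ids
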
    def __init__(self, mode, noisy_for_train, sentiment, direction):
        self.mode = mode
        self.root = os.path.join('../data', 'yelp')
        self.noisy = self.mode == 'train' and noisy_for_train

        # Load data from domain 0 and domain 1.
        path = os.path.join(self.root,
                            'sentiment.{}.{}'.format(mode, sentiment))

        # Load vocabulary.
        print('----- Loading vocab -----')
        self.vocab = Vocabulary('../data/amazon/amazon.vocab')
        print('vocabulary size:', self.vocab.size)
        self.pad = self.vocab.word2id['<pad>']
        self.go = self.vocab.word2id['<go>']
        self.eos = self.vocab.word2id['<eos>']
        self.unk = self.vocab.word2id['<unk>']

        # Tokenize file content
        with open(path, 'r') as f:
            ids = []
            for line in f:
                words = ['<go>'] + line.split() + ['<eos>']
                if direction == 'forward':
                    pass
                elif direction == 'backward':
                    words.reverse()
                else:
                    raise ValueError()
                for word in words:
                    ids.append(self.vocab.word2id[word] if word in
                               self.vocab.word2id else self.unk)
        self.ids = torch.LongTensor(ids)  # (very_long, )
        self.ids = batchify(self.ids, config.batch_size,
                            config)  # shape = (???, batch_size)
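
batchify() itself is not shown here; the '(???, batch_size)' comment suggests the usual language-model layout where the flat id stream is cut into batch_size parallel columns. A hedged sketch of that behaviour (the project's batchify may differ, e.g. in how it uses config):

import torch

def batchify_sketch(ids, batch_size):
    # Drop the tail so the stream divides evenly, then arrange it as
    # (num_steps, batch_size): one contiguous sub-stream per column.
    num_steps = ids.size(0) // batch_size
    ids = ids[:num_steps * batch_size]
    return ids.view(batch_size, num_steps).t().contiguous()

# batchify_sketch(torch.arange(10), batch_size=3).shape == (3, 3)
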
    def __init__(self, test_cls, dataset):

        self.vocab = Vocabulary('../data/{}/{}.vocab'.format(dataset, dataset))
        self.Emb = nn.Embedding.from_pretrained(self.vocab.embedding,
                                                freeze=False)
        self.Emb = gpu_wrapper(self.Emb)
        if test_cls == 'TextCNN':
            self.C = Discriminator(kernels=config.textCNN_kernels,
                                   conv_dim=config.textCNN_conv_dim,
                                   dim_h=100,
                                   D=2,
                                   dropout=config.textCNN_dropout)
        else:
            raise ValueError()
        self.C = gpu_wrapper(self.C)

        self.train_set, self.test_set, self.val_set = None, None, None
        self.logger, self.optim, self.best_acc = None, None, 0
        self.iter_num = 0
        self.lr = config.textCNN_lr
        self.dataset = dataset
        self.model_name = test_cls + '-' + dataset
        self.noisy = True
        self.total_iters = 200000
        self.beta1 = 0.5
        self.beta2 = 0.999
        self.batch_size = 64
        self.num_workers = 8
        self.ROUND = 4
        self.sample_step = 4000
        self.lr_decay_step = 1000
        self.num_iters_decay = 0
        self.max_len = 20
Example #10
    def __init__(self):
        self.root = os.path.join('data', 'ptb')
        voc_f = os.path.join(self.root, 'ptb.vocab')
        self.max_len = config.max_len

        self.sentence_pairs = []
        sentences = []
        with open(os.path.join(self.root, 'interp_pairs.txt')) as f:
            for line in f.readlines():
                if len(line.strip()) == 0:
                    continue
                sent = line.strip().split()
                sentences.append(sent)
        for i in range(len(sentences)):
            for j in range(len(sentences)):
                if i == j:
                    continue
                self.sentence_pairs.append((sentences[i], sentences[j]))
        print('PTB Interpolation data successfully read.')

        # Load vocabulary.
        print('----- Loading vocab -----')
        self.vocab = Vocabulary(voc_f)
        print('vocabulary size:', self.vocab.size)
        self.pad = self.vocab.word2id['<pad>']
        self.go = self.vocab.word2id['<go>']
        self.eos = self.vocab.word2id['<eos>']
        self.unk = self.vocab.word2id['<unk>']
Example #11
    def __init__(self, mode):
        self.mode = mode
        assert self.mode in ['train', 'valid', 'test']
        self.root = os.path.join('data', 'switchboard')
        voc_f = os.path.join(self.root, 'switchboard.vocab')
        self.max_len = config.max_len

        self.posts = []
        self.responses = []
        with open(os.path.join(self.root, '{}.txt'.format(self.mode))) as f:
            for line in f.readlines():
                if len(line.strip()) == 0:
                    continue
                post, response = line.strip().split('\t')
                self.posts.append(post.split())
                self.responses.append(response.split())

        print('SwitchBoard data successfully read.')

        # Build vocabulary.
        if self.mode == 'train':
            print('----- Building vocab -----')
            build_vocab(self.posts + self.responses, voc_f,
                        min_occur=5)  # TODO

        # Load vocabulary.
        print('----- Loading vocab -----')
        self.vocab = Vocabulary(voc_f)
        print('vocabulary size:', self.vocab.size)
        self.pad = self.vocab.word2id['<pad>']
        self.go = self.vocab.word2id['<go>']
        self.eos = self.vocab.word2id['<eos>']
        self.unk = self.vocab.word2id['<unk>']
Example #12
def get_dialog_data_iter(vocab: Vocabulary,
                         dialog_file,
                         batch_size,
                         max_len=None,
                         max_turn=None,
                         model="HRED",
                         infer=False,
                         shuffle=False,
                         bucket_config=None):
    dialog_data = load_dialog_data(dialog_file, '</d>')
    dialog_idx = [[vocab.convert2idx(sent) for sent in dialog]
                  for dialog in dialog_data]

    if model == 'HRED':
        data_iter = DialogIterator(dialog_data=dialog_idx,
                                   batch_size=batch_size,
                                   sos_idx=vocab.sos_idx,
                                   eos_idx=vocab.eos_idx,
                                   pad_idx=vocab.pad_idx,
                                   max_len=max_len,
                                   max_turn=max_turn,
                                   infer=infer,
                                   shuffle=shuffle,
                                   bucket_config=bucket_config)
        return data_iter
    else:
        raise NotImplementedError("Not Implemented Data Iterator")
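
load_dialog_data() is defined elsewhere; from the call above it returns dialogs as lists of token lists, with '</d>' separating dialogs. A hedged sketch of a reader matching that assumption (the real file format may differ):

def load_dialog_data_sketch(path, dialog_delim='</d>'):
    # Assumed layout: one whitespace-tokenised sentence per line,
    # dialogs separated by a line containing only the delimiter.
    dialogs, current = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == dialog_delim:
                if current:
                    dialogs.append(current)
                current = []
            elif line:
                current.append(line.split())
    if current:
        dialogs.append(current)
    return dialogs
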
Example #13
    def __init__(self, mode, noisy_for_train):
        self.mode = mode
        self.root = os.path.join('../data', 'yelp')
        voc_f = os.path.join('../data/yelp', 'yelp.vocab')
        if self.mode == 'dev':
            self.max_len = 30
        else:
            self.max_len = 20
        self.noisy = self.mode == 'train' and noisy_for_train

        # Load data from domain 0 and domain 1.
        path0 = os.path.join(self.root, 'sentiment.{}.0'.format(mode))
        data0 = []
        self.remove0 = []
        with open(path0) as f:
            for i, line in enumerate(f):
                sent = line.split()
                if 4 < len(sent) < self.max_len:
                    data0.append(sent)
                else:
                    self.remove0.append(i)
        print('{}/{} removed from domain 0'.format(
            len(self.remove0),
            len(self.remove0) + len(data0)))
        path1 = os.path.join(self.root, 'sentiment.{}.1'.format(mode))
        data1 = []
        self.remove1 = []
        with open(path1) as f:
            for i, line in enumerate(f):
                sent = line.split()
                if 4 < len(sent) < self.max_len:
                    data1.append(sent)
                else:
                    self.remove1.append(i)
        print('{}/{} removed from domain 1'.format(
            len(self.remove1),
            len(self.remove1) + len(data1)))
        self.l0 = len(data0)
        self.l1 = len(data1)
        # Make up for the same length.
        if len(data0) < len(data1):
            data0 = makeup(data0, len(data1))
        if len(data1) < len(data0):
            data1 = makeup(data1, len(data0))
        assert len(data0) == len(data1)
        self.data0 = data0
        self.data1 = data1

        if self.mode == 'dev':
            self.max_len += 5

        # Load vocabulary.
        print('----- Loading vocab -----')
        self.vocab = Vocabulary(voc_f)
        print('vocabulary size:', self.vocab.size)
        self.pad = self.vocab.word2id['<pad>']
        self.go = self.vocab.word2id['<go>']
        self.eos = self.vocab.word2id['<eos>']
        self.unk = self.vocab.word2id['<unk>']
Example #14
    def __init__(self, data_path, vocab=Vocabulary(), predict=False):
        """
        Creates an object that gets data from a file.
        """
        super(Data, self).__init__(data_path, vocab)

        if not predict:
            self._train_test_split()
Example #15
    def __init__(self, data_path, vocab=Vocabulary()):
        self.vocab = vocab

        data = get_requests_from_file(data_path)
        print("Downloaded {} samples".format(len(data)))

        # Process each request once, then split the (request, length) pairs.
        processed = [self._process_request(request) for request in data]
        self.data = [x[0] for x in processed]
        self.lengths = [x[1] for x in processed]

        assert len(self.data) == len(self.lengths)
def prepare_training_data_from_init(mconf, num_tasks=7):

    vocab_save_path = mconf.vocab_save_dir_prefix + "vocab.c{}".format(
        mconf.vocab_cutoff)
    vocab = Vocabulary()

    if os.path.exists(vocab_save_path):
        vocab.init_from_saved_vocab(vocab_save_path)
    else:
        vocab.update_vocab(mconf.data_dir_prefix + "all_text")
        if not os.path.exists(mconf.vocab_save_dir_prefix):
            os.makedirs(mconf.vocab_save_dir_prefix)
        vocab.save_vocab(vocab_save_path)

    mconf.vocab_size = vocab._size

    X_a, y_a, X_b, y_b = get_meta_train_data(mconf,
                                             vocab,
                                             num_tasks,
                                             save=True)

    return X_a, y_a, X_b, y_b, vocab
Example #17
    args = _update_default_parameters(args)

    set_random_seed(seed=args["random_seed"])
    if os.path.exists(args["output_dir"]):
        shutil.rmtree(args["output_dir"])
    create_output_dir(args["output_dir"])

    dataset_reader = DatasetReader(args)
    train_data = dataset_reader.read("data/%s/train.txt" % args["dataset"])
    print_out("Load %d instances from train set." % (len(train_data)))
    dev_data = dataset_reader.read("data/%s/dev.txt" % args["dataset"])
    print_out("Load %d instances from dev set." % (len(dev_data)))
    test_data = dataset_reader.read("data/%s/test.txt" % args["dataset"])
    print_out("Load %d instances from test set." % (len(test_data)))

    datasets = {"train": train_data, "validation": dev_data, "test": test_data}
    vocab = Vocabulary.from_instances(
        (instance for dataset in datasets.values() for instance in dataset))
    vocab.save_to_files(os.path.join(args["output_dir"], "vocabulary"))
    train_iterator = BucketIterator(sorting_keys=[['tokens', 'tokens_length']],
                                    batch_size=args["batch_size"])
    train_iterator.index_with(vocab)
    dev_iterator = BasicIterator(batch_size=args["batch_size"])
    dev_iterator.index_with(vocab)

    model, metrics, model_paths = train_model(args)
    metrics["args"] = args

    final_evaluate(model, vocab, test_data, dev_iterator, _external_eval,
                   metrics, args, model_paths)
Example #18
class RickAndMortyDataset(BaseDataset):
    """ Wrapper class to process and produce training samples """
    def __init__(
        self,
        data_dir,
        seq_length,
        vocab_size=None,
        vocab=None,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, "rick_and_morty.txt"),
                  "r",
                  encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        input_ids = [
            self.vocab[word] for word in self.tokens[idx:idx + self.seq_length]
        ]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]

        attention_mask = [1] * len(input_ids)
        segment_ids = [1] * len(input_ids)

        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)
        y = torch.LongTensor(y)
        return input_ids, attention_mask, segment_ids, y
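
A hedged usage sketch for the dataset above, assuming a data_dir that contains rick_and_morty.txt and that the surrounding imports (torch, os, the project's Vocabulary) are in scope; paths and sizes are illustrative only:

from torch.utils.data import DataLoader

dataset = RickAndMortyDataset(data_dir="data/rick_and_morty",  # hypothetical path
                              seq_length=40,
                              vocab_size=5000)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
input_ids, attention_mask, segment_ids, y = next(iter(loader))
# input_ids: (32, 40) window of token ids; y: (32, 1) id of the next token
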
Example #19
def test(config):
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims = compute_dims(config)

    vocab = None
    vocab_size = None
    we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load(
            os.path.join(config['data_loader']['args']['data_dir'],
                         'attributes/dict.attr.json'))
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(
        name='data_loader',
        module=module_data,
        expert_dims=expert_dims,
        text_feat=config['experts']['text_feat'],
        text_dim=config['experts']['text_dim'],
    )

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        text_feat=config['experts']['text_feat'])
    trainer = TrainerJoint(
        model,
        loss=None,
        optimizer=None,
        config=config,
        data_loaders=data_loaders,
        lr_scheduler=None,
    )

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    metric = trainer._valid_epoch(save_textatt=True)

    if config._args.mode == 'val':
        for key, value in metric.items():
            if key == 'recall_avg':
                logger.info(f'[Avg Recall]     : {value}')
            elif key == 'recall_avg_corr':
                logger.info(f'[Avg Recall corr]: {value}')
            elif key == 'comb_avg':
                logger.info(f'[comb_avg]       : {value}')
            elif key == 'recall':
                for i, category in zip(value, trainer.categories):
                    if len(i) == 2:
                        logger.info(f'[{category}] r@10, r@50: {i[0]}\t{i[1]}')
                    elif len(i) == 4:
                        logger.info(
                            f'[{category}] comp corr r@10, r@50: {i[0]}\t{i[1]}\t{i[2]}\t{i[3]}'
                        )
            elif key == 'comb':
                combstr = "comb:"
                for i, category in zip(value, trainer.categories):
                    combstr += f' {i[0]} {i[1]}'
                logger.info(combstr)
    else:
        save_fname = config.save_dir / f'test_score.pt'
        tic = time.time()
        logger.info("Saving score matrix: {} ...".format(save_fname))
        torch.save(metric, save_fname)
        logger.info(f"Done in {time.time() - tic:.3f}s")
Example #20
def main():
    # Update path
    training_data = r'----------------/Data/Skipgram/hansards/training.en'
    dump_process_pkl = r'----------------/Data/Skipgram/hansards/processed_en_w.pkl'
    dump_context_dict = r'----------------/Data/Skipgram/hansards/context_dict_w.pkl'
    dump_context_list = r'----------------/Data/Skipgram/hansards/context_list_w.pkl'
    save_model_path = r'----------------/Data/Skipgram/hansards'
    embedding_txt = r'----------------/Data/Skipgram/hansards/embedding.txt'
    embedding_temp = r'----------------/Data/Skipgram/hansards/embedding_temp.txt'
    epochs = 20
    batch_size = 2**10
    window = 5
    num_neg_sample = 5
    writer = SummaryWriter()
    stop_words = set(stopwords.words('english'))  # renamed to avoid shadowing the nltk 'stopwords' module

    with open(training_data, 'r') as f:
        data = f.readlines()
        data = [line.replace('\n', '').split(' ') for line in data]
        data = [[word for word in line if word not in stop_words]
                for line in data]

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary()
        vocab.add_documents(data)
        vocab.build()

        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # Use the transformation only once, i.e. either when building the context dict and list, or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(data, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Here the transformation is required, since we sample indices directly from the table.
    sample_table = negative_sampling_table(vocab.token_counter(),
                                           transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=6)

    model_embedding = SkipGram(len(vocab.vocab), embedding_size=200)
    model_embedding.load_state_dict(
        torch.load(os.path.join(save_model_path, 'sk_model5_5.pkl')))
    model_embedding.to(device)
    optimizer_embedding = torch.optim.SparseAdam(model_embedding.parameters(),
                                                 lr=0.005)

    train(model_embedding,
          optimizer_embedding,
          context_dataloader,
          epochs,
          device,
          neg_sample,
          n_sample=num_neg_sample,
          save_path=save_model_path)
    word_embeddings = (model_embedding.out_embedding.weight.data +
                       model_embedding.in_embedding.weight.data) / 2
    word_embeddings = word_embeddings.cpu().numpy()

    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' +
                    ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)

    wv = KeyedVectors.load_word2vec_format(temp_file)

    result = wv.most_similar(positive=['woman', 'king'], negative=['man'])
    print("{}: {:.4f}".format(*result[0]))

    writer.close()
Example #21
def test(config):
    config.config['data_loader']['args']['mode'] = 'test'
    logger = config.get_logger('test')
    logger.info("Running test with configuration:")
    logger.info(config)

    expert_dims, raw_input_dims = compute_dims(config)

    if config['experts']['text_feat'] == 'learnable':
        # vocab
        vocab = Vocabulary()
        vocab.load('dataset/captions/dict.all_200k_gan.json')
        vocab_size = len(vocab)

        # word2vec
        if config['experts']['text_feat_init'] == True:
            # word2vec, download file and move to we_root-path directory
            # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
            we_rootpath = '/home/yj/pretrained_model'
            w2v_data_path = os.path.join(we_rootpath, "word2vec/", 'flickr',
                                         'vec500flickr30m')
            we_parameter = get_we_parameter(vocab, w2v_data_path)
        else:
            we_parameter = None
    else:
        vocab = None
        vocab_size = None
        we_parameter = None

    if "attr" in config['experts']['modalities']:
        attr_vocab = Vocabulary()
        attr_vocab.load('dataset/captions/dict.attr.json')
        attr_vocab_size = len(attr_vocab)
    else:
        attr_vocab = None
        attr_vocab_size = None

    data_loaders = config.init(name='data_loader',
                               module=module_data,
                               raw_input_dims=raw_input_dims,
                               text_feat=config['experts']['text_feat'],
                               text_dim=config['experts']['text_dim'],
                               vocab=vocab,
                               attr_vocab=attr_vocab,
                               pretrain=config['trainer']['pretrain'])

    model = config.init(name='arch',
                        module=module_arch,
                        expert_dims=expert_dims,
                        text_dim=config['experts']['text_dim'],
                        same_dim=config['experts']['ce_shared_dim'],
                        we_parameter=we_parameter,
                        vocab_size=vocab_size,
                        attr_vocab_size=attr_vocab_size,
                        text_feat=config['experts']['text_feat'])

    ckpt_path = Path(config._args.resume)
    logger.info(f"Loading checkpoint: {ckpt_path} ...")
    checkpoint = torch.load(ckpt_path)
    state_dict = checkpoint['state_dict']
    if config['n_gpu'] > 1:
        model = torch.nn.DataParallel(model)
    model.load_state_dict(state_dict)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Running test on {device}")

    model = model.to(device)
    model.eval()

    categories = ['dress', 'shirt', 'toptee']
    modalities = data_loaders[categories[0]].dataset.ordered_experts
    metric = {'score': dict()}

    for i, category in enumerate(categories):
        val_experts = {expert: list() for expert in modalities}
        target_ind = {expert: list() for expert in modalities}
        data_asin = []

        for batch in data_loaders[category + '_trg']:
            for key, val in batch['candidate_experts'].items():
                batch['candidate_experts'][key] = val.to(device)

            data_asin.extend(
                [meta['candidate'] for meta in batch['meta_info']])

            for key, val in batch['candidate_ind'].items():
                target_ind[key].append(val)

            with torch.no_grad():
                experts, _, _ = model(batch['candidate_experts'],
                                      batch['candidate_ind'],
                                      target=True)
                for modality, val in experts.items():
                    val_experts[modality].append(val)

        for modality, val in val_experts.items():
            val_experts[modality] = torch.cat(val)

        for modality, val in target_ind.items():
            target_ind[modality] = torch.cat(val)

        scores = []
        meta_infos = []
        val_size = val_experts['resnet'].size(0)

        for batch in data_loaders[category]:
            for experts in ['candidate_experts']:
                for key, val in batch[experts].items():
                    batch[experts][key] = val.to(device)
            batch["text"] = batch["text"].to(device)
            batch_size = batch["text"].size(0)

            meta_infos.extend(list(batch['meta_info']))

            with torch.no_grad():
                # composition_feature, text, moe_weights = model(batch['candidate_experts'],
                #                                                batch['candidate_ind'],
                #                                                batch['text'],
                #                                                batch['text_bow'],
                #                                                batch['text_lengths'])

                # batch_target = dict()
                # for mod in modalities:
                #     tmp = []
                #     for k in range(batch_size):
                #         tmp.append(model.target_composition(val_experts[mod], text[mod][k].expand(val_size, -1)))
                #     batch_target[mod] = torch.stack(tmp)

                src_experts = model.image_encoder(batch['candidate_experts'],
                                                  batch['candidate_ind'])
                src_text, moe_weights = model.get_text_feature(
                    batch['text'], batch['candidate_ind'], batch['text_bow'],
                    batch['text_lengths'])
                src_feature = model.get_combined_feature(src_experts, src_text)

                trg_text, _ = model.get_text_feature(batch['text'],
                                                     batch['target_ind'],
                                                     batch['text_bow'],
                                                     batch['text_lengths'],
                                                     target=True)
                # trg_text, _ = self.model.text_encoder['trg'](batch['text_mean'].unsqueeze(1), batch['target_ind'])

                batch_target = dict()
                for h, mod in enumerate(modalities):
                    tmp = []
                    for k in range(batch_size):
                        tmp.append(
                            model.trg_normalization_layer(
                                model.target_composition[h](
                                    val_experts[mod],
                                    trg_text[mod][k].expand(val_size, -1))))
                    batch_target[mod] = torch.stack(tmp)

                cross_view_conf_matrix = sharded_cross_view_inner_product(
                    vid_embds=batch_target,
                    text_embds=src_feature,
                    text_weights=moe_weights,
                    subspaces=model.image_encoder.modalities,
                    l2renorm=True,
                    dist=True,
                    val=True)

                scores.append(cross_view_conf_matrix)
        scores = torch.cat(scores)
        val_ids = data_loaders[category + '_trg'].dataset.data
        assert val_ids == data_asin
        metric['score'][category] = {
            'ids': val_ids,
            'matrix': scores,
            'meta_info': meta_infos
        }

    save_fname = ckpt_path.parent / f'test_score.pt'
    tic = time.time()
    logger.info("Saving score matrix: {} ...".format(save_fname))
    torch.save(metric, save_fname)
    logger.info(f"Done in {time.time() - tic:.3f}s")
class SimpsonsDataset(Dataset):
    """ Wrapper class to process and produce training samples """

    def __init__(self, data_dir, seq_length, vocab_size=None, vocab=None, training=False):
        self.data_dir = data_dir
        self.seq_length = seq_length
        self.vocab = Vocabulary()
        with open(os.path.join(data_dir, 'simpsons.txt'), 'r', encoding="utf-8") as f:
            self.text = f.read()

        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(self.text)
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(self.text)
        self.tokens = self.vocab.tokenize(self.text)

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        x = [self.vocab[word]
             for word in self.tokens[idx:idx + self.seq_length]]
        y = [self.vocab[self.tokens[idx + self.seq_length]]]
        x = torch.LongTensor(x)
        y = torch.LongTensor(y)
        return x, y
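
A hedged usage sketch for SimpsonsDataset, assuming simpsons.txt exists under the (hypothetical) data_dir and the project's Vocabulary is importable:

from torch.utils.data import DataLoader

dataset = SimpsonsDataset(data_dir="data/simpsons",  # hypothetical path
                          seq_length=40,
                          vocab_size=8000)
loader = DataLoader(dataset, batch_size=64, shuffle=True)
x, y = next(iter(loader))  # x: (64, 40) context ids, y: (64, 1) next-token id
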
Example #23
def get_text_vocab(texts):
    word_vocab = Vocabulary()
    char_vocab = Vocabulary(lower=False)

    for item in texts:
        word_vocab.add_documents(item)
        for words in item:
            char_vocab.add_documents(words)

    word_vocab.build()
    char_vocab.build()

    return word_vocab, char_vocab
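
get_text_vocab() expects `texts` to be an iterable of datasets, each a list of token lists (the inner loop feeds individual words to the character vocab). A hedged usage sketch with toy data:

# Toy token lists; each element of `texts` is one split (train/dev/...).
train_sents = [["The", "cat", "sat"], ["A", "dog", "ran"]]
dev_sents = [["The", "dog", "sat"]]

word_vocab, char_vocab = get_text_vocab([train_sents, dev_sents])
# word_vocab indexes (lower-cased) words; char_vocab indexes characters (lower=False).
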
def main(paths, params):
    path_to_train_input = paths.training
    path_to_valid_input = paths.develop
    path_to_test = paths.test
    ctd_file = paths.ctd_file
    c2m_file = paths.c2m_file
    toD_mesh = Convert2D(ctd_file, c2m_file)

    sentence_pad = False  # Don't pad sentences with begin/end tokens '<s>' and '</s>'

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()

    X = BratInput(path_to_train_input)
    X = X.transform()
    X = split_annotated_documents(X)

    X_valid = BratInput(path_to_valid_input)
    X_valid = X_valid.transform()
    X_valid = split_annotated_documents(X_valid)

    X_test = BratInput(path_to_test)
    X_test = X_test.transform()
    X_test = split_annotated_documents(X_test)

    if params.randomize:
        torch.manual_seed(5)
        random.seed(5)
        np.random.seed(5)

    # Obtain MeSH information
    mesh_file = paths.MeSH_file
    disease_file = paths.disease_file
    mesh_graph_file = paths.MeSH_graph_disease
    mesh_folder = paths.MeSH_folder
    mt_folder = paths.multitask_folder


    # read disease file
    with open(disease_file,'r') as f:
        disease_data = f.readlines()

    mesh_dict = read_mesh_file(mesh_file)

    mesh_graph = nx.read_gpickle(mesh_graph_file)
    mesh_graph = mesh_graph.to_undirected()
    scope_text, id2idx_dict, idx2id_dict = mesh_dict_to_tokens(mesh_dict, disease_data)
    node_list = list(idx2id_dict.values())

    # A_HAT matrix for GCN
    if not os.path.exists(os.path.join(mesh_folder, 'a_hat_matrix')):
        a_matrix = get_adjacancy_matrix(mesh_graph, node_list)

        a_matrix = sparse.coo_matrix(a_matrix)
        with open(os.path.join(mesh_folder, 'a_hat_matrix'), 'wb') as f:
            pickle.dump(a_matrix, f)
    else:
        with open(os.path.join(mesh_folder, 'a_hat_matrix'), 'rb') as f:
            a_matrix = pickle.load(f)

    i = torch.tensor([a_matrix.row, a_matrix.col], dtype=torch.long, device=device)
    v = torch.tensor(a_matrix.data, dtype=torch.float32, device=device)
    a_hat = torch.sparse.FloatTensor(i, v, torch.Size([len(node_list), len(node_list)])).to(device)

    # Construct usable data format
    x_tr_text, ner_tr_tags, x_tr_tokens = annotated_docs_to_tokens(X, sentence_pad=sentence_pad)
    x_val_text, ner_val_tags, x_val_tokens = annotated_docs_to_tokens(X_valid, sentence_pad=sentence_pad)
    x_test_text, ner_test_tags, x_test_tokens = annotated_docs_to_tokens(X_test, sentence_pad=sentence_pad)

    # elmo embeddings
    options_file = paths.elmo_options
    weight_file = paths.elmo_weights
    ELMO_folder = paths.elmo_folder
    elmo_dim = params.elmo_dim
    elmo = Elmo(options_file, weight_file, 2, dropout=0)
    elmo.to(device)

    with torch.no_grad():
        if not os.path.exists(os.path.join(mt_folder,'text_tr_elmo_split.pkl')):
            text_tr = get_elmo_representation(x_tr_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(mt_folder,'text_tr_elmo_split.pkl'),'wb+') as f:
                pickle.dump(text_tr, f)
        else:
            with open(os.path.join(mt_folder,'text_tr_elmo_split.pkl'),'rb+') as f:
                text_tr = pickle.load(f)
        
        if not os.path.exists(os.path.join(mt_folder,'text_val_elmo_split.pkl')):
            text_val = get_elmo_representation(x_val_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(mt_folder,'text_val_elmo_split.pkl'),'wb+') as f:
                pickle.dump(text_val, f)
        else:
            with open(os.path.join(mt_folder,'text_val_elmo_split.pkl'),'rb+') as f:
                text_val = pickle.load(f)

        if not os.path.exists(os.path.join(paths.multitask_folder,'text_test_elmo_split.pkl')):
            text_test = get_elmo_representation(x_test_text, elmo, elmo_dim=params.elmo_dim, device=device)
            with open(os.path.join(paths.multitask_folder,'text_test_elmo_split.pkl'),'wb+') as f:
                pickle.dump(text_test, f)
        else:
            with open(os.path.join(paths.multitask_folder,'text_test_elmo_split.pkl'),'rb+') as f:
                text_test = pickle.load(f)

    # NER label vocab
    ner_labels_vocab = Vocabulary(lower=False)
    ner_labels_vocab.add_documents(ner_tr_tags)
    ner_labels_vocab.build()

    # mesh scope embedding
    if not os.path.exists(os.path.join(paths.dump_folder, 'scope_emb.pkl')):
        scope_embedding, _ = get_scope_elmo(elmo, ELMO_folder, scope_text, elmo_dim, idx2id_dict, id2idx_dict, device=device)
        with open(os.path.join(paths.dump_folder, 'scope_emb.pkl'), 'wb') as f:
            pickle.dump(scope_embedding, f)
    else:
        with open(os.path.join(paths.dump_folder, 'scope_emb.pkl'), 'rb') as f:
            scope_embedding = pickle.load(f)
            
    train_el_set = EL_set(X, toD_mesh, id2idx_dict)
    val_el_set = EL_set(X_valid, toD_mesh, id2idx_dict)


    train(paths, params, X, text_tr, ner_tr_tags, train_el_set, X_valid, x_val_tokens, text_val,
            ner_val_tags, val_el_set, ner_labels_vocab, scope_text, scope_embedding, a_hat, mesh_graph, id2idx_dict, idx2id_dict, writer, device=device)
Example #25
params = {
    "batch_size": 128,
    "embed_size": 64,
    "hidden_size": 64,
    "num_layers": 2,
    "checkpoints": "./checkpoints/",
    "std_factor": 6.,
    "dropout": 0.7,
}

path_normal_data = "datasets/vulnbank_train.txt"
path_anomaly_data = "datasets/vulnbank_anomaly.txt"

create_checkpoints_dir(params["checkpoints"])

vocab = Vocabulary()
params["vocab"] = vocab

#d = Data(path_normal_data)
#####
x = np.linspace(0, 30, 105)
y = 2 * np.sin(x)

l1, = plt.plot(x[:85], y[:85], 'y', label='training samples')
l2, = plt.plot(x[85:], y[85:105], 'c--', label='test samples')
plt.legend(handles=[l1, l2], loc='upper left')
plt.show()

train_y = y.copy()

noise_factor = 0.5
Example #26
def train_node2vec(paths, params):
    dump_process_pkl = paths.dump_process
    dump_context_dict = paths.dump_context_dict
    dump_context_list = paths.dump_context_list
    dump_walks = paths.dump_walks
    save_model_path = paths.node2vec_base
    embedding_txt = paths.embedding_text
    embedding_temp = paths.embedding_temp
    embedding = paths.embedding
    mesh_graph_file = paths.MeSH_graph_disease

    if not params.randomize:
        np.random.seed(5)
        torch.manual_seed(5)
        random.seed(5)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()

    # ----------- Random walk --------------------
    directed_graph = False

    if not os.path.exists(dump_walks):
        num_walks = 30
        walk_length = 10
        nx_G = read_graph(mesh_graph_file, directed_graph)
        G = Graph(nx_G, is_directed=directed_graph, p=params.p, q=params.q)
        G.preprocess_transition_probs()
        walks = G.simulate_walks(num_walks, walk_length)
        with open(dump_walks, 'wb') as f:
            pickle.dump(walks, f)
    else:
        with open(dump_walks, 'rb') as f:
            walks = pickle.load(f)

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary(lower=False)
        vocab.add_documents(walks)
        vocab.build()

        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # ---------- build embedding model ----------
    mesh_file = paths.MeSH_file
    ELMO_folder = paths.elmo_folder
    options_file = paths.elmo_options
    weight_file = paths.elmo_weights

    elmo = Elmo(options_file, weight_file, 2, dropout=0)
    elmo.to(device)

    mesh_graph = nx.read_gpickle(mesh_graph_file)
    mesh_graph = mesh_graph.to_undirected()

    mesh_dict = read_mesh_file(mesh_file)

    # Get the list of nodes (idx 0 is '<pad>')
    node_list = list(vocab.vocab.keys())

    # Create the weight matrix using node_list order (which corresponds to the original vocab index order).
    elmo_embedding_dim = 1024
    if not os.path.exists(os.path.join(ELMO_folder, 'elmo_weights')):
        weight_list = []
        for idx, i in enumerate(node_list):
            if i in mesh_dict:
                node_idx = vocab.token_to_id(i)
                scope_note = mesh_dict[i].scope_note
                character_ids = batch_to_ids(scope_note).to(device)
                elmo_embeddings = elmo(character_ids)
                embeddings = elmo_embeddings['elmo_representations'][0]
                mask = elmo_embeddings['mask']
                embeddings = embeddings * mask.unsqueeze(2).expand(
                    mask.shape[0], mask.shape[1], embeddings.shape[2]).float()
                embeddings = embeddings.mean(dim=0).mean(dim=0)  # average
                weight_list.append(embeddings.cpu())
            else:
                weight_list.append(torch.zeros(elmo_embedding_dim))

        with open(os.path.join(ELMO_folder, 'elmo_weights'), 'wb') as f:
            pickle.dump(weight_list, f)
    else:
        with open(os.path.join(ELMO_folder, 'elmo_weights'), 'rb') as f:
            weight_list = pickle.load(f)

    weight = torch.stack(weight_list, dim=0)

    # ---------- train SkipGram -----------------
    epochs = params.epochs
    batch_size = params.batch_size
    window = params.window
    num_neg_sample = params.num_neg_sample
    writer = SummaryWriter()

    # Use the transformation only once, i.e. either when building the context dict and list, or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(walks, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    # Here the transformation is required, since we sample indices directly from the table.
    sample_table = negative_sampling_table(vocab.token_counter(),
                                           transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    pin_memory=True,
                                    num_workers=6)

    model_embedding = SkipGramModified(len(vocab.vocab),
                                       embedding_size=elmo_embedding_dim,
                                       weight=weight)
    model_embedding.to(device)
    optimizer_FC = torch.optim.Adam(list(model_embedding.parameters()),
                                    lr=0.005)  #+list(model_fc.parameters()

    train(model_embedding,
          optimizer_FC,
          context_dataloader,
          epochs,
          device,
          neg_sample,
          n_sample=num_neg_sample,
          writer=writer,
          save_path=save_model_path,
          l=l,
          d=d,
          vocab=vocab,
          batch_size=batch_size)

    node_idx = []
    for item in node_list:
        node_idx.append(vocab.token_to_id(item))

    x = torch.tensor(node_idx, device=device)
    y = torch.zeros(x.shape, device=device)
    z = torch.zeros(x.shape, device=device)

    x, y, z = model_embedding(x, y, z)

    word_embeddings = x.cpu().detach().numpy()

    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' +
                    ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)

    wv = KeyedVectors.load_word2vec_format(temp_file)
    wv.save(embedding)

    writer.close()
def train_node2vec(paths, params):
    dump_process_pkl = paths.dump_process
    dump_context_dict = paths.dump_context_dict
    dump_context_list = paths.dump_context_list
    dump_walks = paths.dump_walks
    save_model_path = paths.node2vec_base
    embedding_txt = paths.embedding_text
    embedding_temp = paths.embedding_temp
    embedding = paths.embedding
    mesh_graph_file = paths.MeSH_graph_disease

    if not params.randomize:
        np.random.seed(5)
        torch.manual_seed(5)
        random.seed(5)

    # ----------- Random walk --------------------
    directed_graph = False

    if not os.path.exists(dump_walks):
        num_walks = 30
        walk_length = 8
        nx_G = read_graph(mesh_graph_file, directed_graph)
        G = Graph(nx_G, is_directed=directed_graph, p=params.p, q=params.q)
        G.preprocess_transition_probs()
        walks = G.simulate_walks(num_walks, walk_length)
        with open(dump_walks, 'wb') as f:
            pickle.dump(walks, f)
    else:
        with open(dump_walks, 'rb') as f:
            walks = pickle.load(f)

    # ---------- train SkipGram -----------------
    epochs = params.epochs
    batch_size = params.batch_size
    window = params.window
    num_neg_sample = params.num_neg_sample
    writer = SummaryWriter()

    if os.path.exists(dump_process_pkl):
        with open(dump_process_pkl, 'rb') as f:
            vocab = pickle.load(f)
    else:
        vocab = Vocabulary(lower=False)
        vocab.add_documents(walks)
        vocab.build()

        with open(dump_process_pkl, 'wb') as f:
            pickle.dump(vocab, f)

    # Use the transformation only once, i.e. either when building the context dict and list, or during training.
    if not os.path.exists(dump_context_dict):
        l, d = multiprocess(walks, window=window, transform=vocab.doc2id)
        with open(dump_context_dict, 'wb') as f:
            pickle.dump(d, f)
        with open(dump_context_list, 'wb') as f:
            pickle.dump(l, f)
    else:
        with open(dump_context_dict, 'rb') as f:
            d = pickle.load(f)
        with open(dump_context_list, 'rb') as f:
            l = pickle.load(f)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Here the transformation is required, since we sample indices directly from the table.
    sample_table = negative_sampling_table(vocab.token_counter(),
                                           transform=vocab.token_to_id)
    neg_sample = np.random.choice(sample_table, size=(len(l), num_neg_sample))

    context_data = ContextData(l, d, neg_sample, n_sample=5, transform=None)
    context_dataloader = DataLoader(context_data,
                                    batch_size=batch_size,
                                    shuffle=True,
                                    num_workers=6)

    model_embedding = SkipGram(len(vocab.vocab), embedding_size=1024)
    model_embedding.to(device)
    optimizer_embedding = torch.optim.SparseAdam(model_embedding.parameters(),
                                                 lr=0.005)

    train(model_embedding,
          optimizer_embedding,
          context_dataloader,
          epochs,
          device,
          neg_sample,
          n_sample=num_neg_sample,
          transform=None,
          writer=writer,
          save_path=save_model_path,
          l=l,
          d=d,
          vocab=vocab,
          batch_size=batch_size)
    word_embeddings = (model_embedding.out_embedding.weight.data +
                       model_embedding.in_embedding.weight.data) / 2
    word_embeddings = word_embeddings.cpu().numpy()

    sorted_vocab_tuple = sorted(vocab.vocab.items(), key=lambda kv: kv[1])

    with open(embedding_txt, 'w') as f:
        for idx, item in enumerate(sorted_vocab_tuple):
            if item[0] == '\n':
                continue
            f.write(item[0] + ' ' +
                    ' '.join([str(i) for i in word_embeddings[idx]]) + '\n')

    glove_file = datapath(embedding_txt)
    temp_file = get_tmpfile(embedding_temp)
    _ = glove2word2vec(glove_file, temp_file)

    wv = KeyedVectors.load_word2vec_format(temp_file)
    wv.save(embedding)

    writer.close()


# if __name__ == '__main__':
#     base_path = '/media/druv022/Data2/Final'
#     paths = Paths(base_path, node2vec_type='1')

#     train_node2vec(paths)
class SpamData(Dataset):
    """ Wrapper class to process and produce training samples """
    def __init__(self, data_dir, seq_length, vocab_size, vocab=None):
        # Note: the "mbcs" codec is Windows-only; use e.g. "latin-1" on other platforms.
        self.df = pd.read_csv(os.path.join(data_dir, 'spam.csv'),
                              encoding="mbcs")
        self.vocab = Vocabulary()
        self.labels = []
        for x in self.df.v1:
            if x == 'ham':
                self.labels.append(0)
            else:
                self.labels.append(1)
        self.seq_length = seq_length
        if vocab is not None:
            if isinstance(vocab, str):
                self.vocab.load(vocab)
            elif isinstance(vocab, Vocabulary):
                self.vocab = vocab
        elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
            self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.vocab.add_text(" ".join(self.df["v2"].values))
            self.vocab.save(os.path.join(data_dir, "vocab.pkl"))

        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

        self.text = self.vocab.clean_text(" ".join(self.df["v2"].values))
        self.tokens = []
        for content in self.df["v2"].values:
            self.tokens.append(
                self.vocab.tokenize(self.vocab.clean_text(content)))

    def __len__(self):
        return len(self.tokens) - self.seq_length

    def __getitem__(self, idx):
        tokens_list = self.tokens[idx]
        if len(tokens_list) > self.seq_length:
            tokens_list = tokens_list[:self.seq_length]
        else:
            tokens_list.extend(['<pad>'] *
                               (self.seq_length - len(tokens_list)))
        x = [self.vocab[word] for word in tokens_list]
        y = [0, 0]
        y[int(self.labels[idx])] = 1
        x = torch.LongTensor(x)
        y = torch.FloatTensor([y])
        return x, y
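
A hedged usage sketch for SpamData, assuming spam.csv (columns v1 = ham/spam label, v2 = message text) sits under the hypothetical data_dir:

from torch.utils.data import DataLoader

dataset = SpamData(data_dir="data/spam",  # hypothetical path
                   seq_length=50,
                   vocab_size=4000)
loader = DataLoader(dataset, batch_size=32, shuffle=True)
x, y = next(iter(loader))  # x: (32, 50) padded token ids, y: (32, 1, 2) one-hot labels
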
Example #29
def main(config):
    logger = config.get_logger('train')
    expert_dims, raw_input_dims = compute_dims(config)
    seeds = [int(x) for x in config._args.seeds.split(',')]

    for seed in seeds:
        tic = time.time()
        logger.info(f"Setting experiment random seed to {seed}")
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        if config['experts']['text_feat'] == 'learnable':
            # vocab
            vocab = Vocabulary()
            vocab.load('dataset/captions/dict.all_200k_gan.json')
            vocab_size = len(vocab)
            if config['experts']['text_feat_init'] == True:
                # word2vec, download file and move to we_root-path directory
                # https://www.kaggle.com/jacksoncrow/word2vec-flickr30k/version/1
                we_rootpath = '/home/yj/pretrained_model'
                w2v_data_path = os.path.join(we_rootpath, "word2vec/",
                                             'flickr', 'vec500flickr30m')
                we_parameter = get_we_parameter(vocab, w2v_data_path)
            else:
                we_parameter = None
        else:
            vocab = None
            vocab_size = None
            we_parameter = None

        if "attr" in config['experts']['modalities']:
            attr_vocab = Vocabulary()
            attr_vocab.load('dataset/captions/dict.attr.json')
            attr_vocab_size = len(attr_vocab)
        else:
            attr_vocab = None
            attr_vocab_size = None

        data_loaders = config.init(name='data_loader',
                                   module=module_data,
                                   raw_input_dims=raw_input_dims,
                                   text_feat=config['experts']['text_feat'],
                                   text_dim=config['experts']['text_dim'],
                                   vocab=vocab,
                                   attr_vocab=attr_vocab,
                                   pretrain=config['trainer']['pretrain'])

        model = config.init(
            name='arch',
            module=module_arch,
            expert_dims=expert_dims,
            text_dim=config['experts']['text_dim'],
            same_dim=config['experts']['ce_shared_dim'],
            we_parameter=we_parameter,
            vocab_size=vocab_size,
            attr_vocab_size=attr_vocab_size,
            text_feat=config['experts']['text_feat'],
        )
        # logger.info(model)

        loss = config.init(name='loss', module=module_loss)

        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())
        optimizer = config.init('optimizer', torch.optim, trainable_params)
        lr_scheduler = config.init('lr_scheduler', torch.optim.lr_scheduler,
                                   optimizer)

        trainer = Trainer(
            model,
            loss,
            optimizer,
            config=config,
            data_loaders=data_loaders,
            lr_scheduler=lr_scheduler,
        )

        trainer.train()
        best_ckpt_path = config.save_dir / "trained_model.pth"
        duration = time.strftime('%Hh%Mm%Ss', time.gmtime(time.time() - tic))
        logger.info(f"Training took {duration}")

        test_args = argparse.ArgumentParser()
        test_args.add_argument("--device", default=config._args.device)
        test_args.add_argument("--resume", default=best_ckpt_path)
        test_config = ConfigParser(test_args)
        test(test_config)
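
# Reproducibility note (not part of the original example): the three seeding
# calls inside the loop above can be collected into a helper; the extra CUDA
# seeding call is an assumption, added for completeness.
def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)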
if __name__ == "__main__":
    args = parse_parameters()
    args = _update_default_parameters(args)

    set_random_seed(seed=args["random_seed"])
    if os.path.exists(args["output_dir"]):
        shutil.rmtree(args["output_dir"])
    create_output_dir(args["output_dir"])

    dataset_reader = DatasetReader(args)
    train_data = dataset_reader.read("data/%s/train.txt" % args["dataset"])
    print_out("Loaded %d instances from the train set." % len(train_data))
    dev_data = dataset_reader.read("data/%s/dev.txt" % args["dataset"])
    print_out("Loaded %d instances from the dev set." % len(dev_data))
    test_data = dataset_reader.read("data/%s/test.txt" % args["dataset"])
    print_out("Loaded %d instances from the test set." % len(test_data))

    datasets = {"train": train_data, "validation": dev_data, "test": test_data}
    vocab = Vocabulary.from_instances((instance for dataset in datasets.values() for instance in dataset))
    vocab.save_to_files(os.path.join(args["output_dir"], "vocabulary"))
    train_iterator = BucketIterator(sorting_keys=[['tokens', 'tokens_length']], batch_size=args["batch_size"])
    train_iterator.index_with(vocab)
    dev_iterator = BasicIterator(batch_size=args["batch_size"])
    dev_iterator.index_with(vocab)

    model, metrics, model_paths = train_model(args)
    metrics["args"] = args

    final_evaluate(model, vocab, test_data, dev_iterator, _external_eval, metrics, args, model_paths)
class EmailSpamDataset(BaseDataset):
    """ Wrapper class to process and produce training samples """

    def __init__(
        self,
        data_dir,
        vocab_size=None,
        vocab=None,
        seq_length=40,
        training=False,
        vocab_from_pretrained="bert-base-uncased",
        do_lower_case=True,
    ):

        self.data_dir = data_dir
        self.vocab = Vocabulary(vocab_from_pretrained, do_lower_case)
        self.seq_length = seq_length

        data_all = pd.read_csv(os.path.join(self.data_dir, "combined-data.csv"), sep=' ', header=None, encoding="cp1252")
        data_all[1] = data_all[1] + " " + data_all[2]
        data_all = data_all[[0, 1]]
        data_all.columns = ['label', 'text']
        data_all = data_all[['text', 'label']]
        data_all = data_all[~data_all.text.isna()]
        data_all.label = data_all.label.apply(lambda x: int(x[-1]))
        data_all.text = data_all.text.apply(lambda x: x.lower())

        # NOTE: keeps only a random subsample of 1000 rows; with the splits
        # below, train and test share the same rows and validation is empty.
        data_all = data_all.sample(1000)

        self.train_df = data_all.copy()
        self.val_df = pd.DataFrame({"text": [], "label": []})
        self.test_df = data_all.copy()

        del data_all

        if training:
            self.train()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                self.vocab.add_text(
                    " ".join(pd.concat([self.train_df, self.val_df], sort=False).text.values)
                )
                self.vocab.save(os.path.join(data_dir, "vocab.pkl"))
        else:
            self.test()
            if vocab is not None:
                if isinstance(vocab, str):
                    self.vocab.load(vocab)
                elif isinstance(vocab, Vocabulary):
                    self.vocab = vocab
            elif os.path.exists(os.path.join(data_dir, "vocab.pkl")):
                self.vocab.load(os.path.join(data_dir, "vocab.pkl"))
            else:
                raise Exception("Vocab file must be specified in test mode!")
        
        if vocab_size is not None:
            self.vocab = self.vocab.most_common(vocab_size - 2)

    def validation(self):
        self.text = self.val_df.text.values
        self.labels = self.val_df.label.values
        self.len = len(self.val_df)
        return True

    def train(self):
        self.text = self.train_df.text.values
        self.labels = self.train_df.label.values
        self.len = len(self.train_df)
        return True

    def test(self):
        self.text = self.test_df.text.values
        self.labels = self.test_df.label.values
        self.len = len(self.test_df)
        return True

    def __len__(self):
        # One sample per row of the active split; no off-by-one adjustment needed.
        return self.len

    def __getitem__(self, idx):
        y = self.labels[idx]
        text = self.text[idx]

        text = self.vocab.clean_text(text)
        input_ids, attention_mask, segment_ids = self.format_in_text(text)
        y = torch.LongTensor([y])

        return input_ids, attention_mask, segment_ids, y
    
    def format_in_text(self, text):
        text = self.vocab.clean_text(text)
        tokens_a = self.vocab.tokenize(text)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > self.seq_length - 2:
            tokens_a = tokens_a[: (self.seq_length - 2)]

        tokens = (
                [self.vocab.tokenizer.cls_token]
                + tokens_a
                + [self.vocab.tokenizer.sep_token]
        )
        segment_ids = [0] * len(tokens)
        # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
        input_ids = [self.vocab[x] for x in tokens]
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        attention_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length: input_ids with the tokenizer's pad
        # token id, attention_mask and segment_ids with zeros so padded
        # positions are ignored.
        pad_length = self.seq_length - len(input_ids)
        input_ids += [self.vocab.tokenizer.pad_token_id] * pad_length
        attention_mask += [0] * pad_length
        segment_ids += [0] * pad_length

        input_ids = torch.LongTensor(input_ids)
        attention_mask = torch.LongTensor(attention_mask)
        segment_ids = torch.LongTensor(segment_ids)
        return input_ids, attention_mask, segment_ids
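

# Usage sketch (not part of the original example): EmailSpamDataset reads
# combined-data.csv from data_dir and tokenizes with the BERT-backed Vocabulary,
# so it can be wrapped directly in a DataLoader. The directory and batch size
# are placeholders; training=True builds (or loads) vocab.pkl, while
# training=False requires it to exist already.
def build_email_spam_loader(data_dir="./data", batch_size=16):
    from torch.utils.data import DataLoader

    dataset = EmailSpamDataset(data_dir=data_dir, seq_length=40, training=True)
    # Each batch yields input_ids, attention_mask and segment_ids of shape
    # (batch, seq_length), plus y of shape (batch, 1) holding the integer label.
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)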