Example #1
def load_generation_data():
    """
    This function loads all data necessary for training and evaluation of a
    code/comment generation model. Data is loaded from a TSV file that
    contains all data instances. This file is found in the directory pointed to
    by data_path. Training, dev, and testing sets are created for the model
    using a torchtext BucketIterator that creates batches, indicated by the
    batch_size variable such that the batches have minimal padding. This
    function also loads pretrained word embedding vectors that are located in
    the data_path directory.

    :param batch_size: [int] -- amount of data elements per batch
    :returns: [Tuple] -- (TRAIN set of batches,
                          DEV set of batches,
                          TEST set of batches,
                          code pretrained vectors,
                          docstring pretrained vectors)
    """
    input_path = CODE_CORPUS / "input"
    # Create a field variable for each field that will be in our TSV file
    code_field = data.Field(sequential=True,
                            tokenize=lambda s: s.split(" "),
                            include_lengths=True,
                            use_vocab=True)

    comm_field = data.Field(sequential=True,
                            tokenize=lambda s: s.split(" "),
                            include_lengths=True,
                            use_vocab=True)

    # Used to create a tabular dataset from TSV
    train_val_fields = [("code", code_field), ("comm", comm_field)]

    # Build the large tabular dataset using the defined fields
    tsv_file_path = input_path / "generation_dataset.tsv"
    tab_data = data.TabularDataset(str(tsv_file_path), "TSV", train_val_fields)

    # Split the large dataset into TRAIN, DEV, TEST portions
    train_data, dev_data, test_data = tab_data.split(
        split_ratio=[0.85, 0.05, 0.1])

    # Load the pretrained word embedding vectors
    code_vec_path = input_path / "code-vectors.txt"
    comm_vec_path = input_path / "comm-vectors.txt"
    code_vectors = vocab.Vectors(str(code_vec_path), str(input_path))
    comm_vectors = vocab.Vectors(str(comm_vec_path), str(input_path))

    # Builds the known word vocab for code and comments from the pretrained vectors
    code_field.build_vocab(train_data,
                           dev_data,
                           test_data,
                           vectors=code_vectors)
    comm_field.build_vocab(train_data,
                           dev_data,
                           test_data,
                           vectors=comm_vectors)

    # We need to return the test sets and the field pretrained vectors
    return (train_data, dev_data, test_data, code_field.vocab,
            comm_field.vocab)
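
# Usage sketch (assumption, not from the original source): batching the
# returned splits with a torchtext BucketIterator, as mentioned in the
# docstring above; the batch size of 32 is illustrative.
train_data, dev_data, test_data, code_vocab, comm_vocab = load_generation_data()
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train_data, dev_data, test_data),
    batch_size=32,
    sort_key=lambda x: len(x.code),
    sort_within_batch=True)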
Example #2
def sst_word_char(path, word_field, char_field, label_field, batch_size,
                  device, word_emb_file, char_emb_file):

    fields = {
        'text': [('text_word', word_field), ('text_char', char_field)],
        'label': ('label', label_field)
    }
    # Note: skip_header must not be used with JSON data, otherwise the first
    # example of each split would be silently dropped.
    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.jsonl', validation='dev.jsonl',
        test='test.jsonl', format='json', fields=fields)
    
    word_vectors = vocab.Vectors(word_emb_file)
    char_vectors = vocab.Vectors(char_emb_file)

    word_field.build_vocab(
        train, dev, test, max_size=25000,
        vectors=word_vectors, unk_init=torch.Tensor.normal_)
    char_field.build_vocab(
        train, dev, test, max_size=94,
        vectors=char_vectors, unk_init=torch.Tensor.normal_)
    
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text_word),
        sort_within_batch=True,
        repeat=False,
        shuffle=True,
        device=device)

    return train_iter, dev_iter, test_iter
Example #3
    def __init__(self, batch_size=100):
        print('Device: ' + str(device))

        self.candidate_title = data.Field(sequential=True,
                                          lower=True,
                                          tokenize=tokenizer,
                                          include_lengths=True,
                                          use_vocab=True)
        self.candidate_resume = data.Field(sequential=True,
                                           lower=True,
                                           include_lengths=True,
                                           use_vocab=True)
        self.job_title = data.Field(sequential=True,
                                    lower=True,
                                    tokenize=tokenizer,
                                    include_lengths=True,
                                    use_vocab=True)
        self.job_description = data.Field(sequential=True,
                                          lower=True,
                                          include_lengths=True,
                                          use_vocab=True)
        self.match_status = data.Field(sequential=False, use_vocab=False)

        self.train_set, self.validation_set = data.TabularDataset.splits(
            path='./gdrive/My Drive/Colab Notebooks/data/TalentFox/',
            train='train_data.csv',
            validation='val_data.csv',
            format='csv',
            fields=[('index', None), ('job_title', self.job_title),
                    ('job_description', self.job_description),
                    ('candidate_title', self.candidate_title),
                    ('candidate_resume', self.candidate_resume),
                    ('match_status', self.match_status)],
            skip_header=True,
        )

        self.train_iter, self.validation_iter = data.BucketIterator.splits(
            (self.train_set, self.validation_set),
            batch_size=batch_size,
            shuffle=True,
            device=device,
            sort_key=lambda x: len(x.job_description),
            sort_within_batch=True,
            repeat=True)

        self.match_status.build_vocab(self.train_set)
        url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.de.vec'
        self.job_title.build_vocab(self.train_set,
                                   vectors=vocab.Vectors('wiki.de.vec',
                                                         url=url))
        self.job_description.build_vocab(self.train_set,
                                         vectors=vocab.Vectors('wiki.de.vec',
                                                               url=url))
        self.candidate_title.build_vocab(self.train_set,
                                         vectors=vocab.Vectors('wiki.de.vec',
                                                               url=url))
        self.candidate_resume.build_vocab(self.train_set,
                                          vectors=vocab.Vectors('wiki.de.vec',
                                                                url=url))
Example #4
def load_sst2(path, text_field, label_field, batch_size, device,
              embedding_file):

    train, dev, test = data.TabularDataset.splits(path=path,
                                                  train='train.tsv',
                                                  validation='dev.tsv',
                                                  test='test.tsv',
                                                  format='tsv',
                                                  skip_header=True,
                                                  fields=[('text', text_field),
                                                          ('label',
                                                           label_field)])
    print("the size of train: {}, dev:{}, test:{}".format(
        len(train.examples), len(dev.examples), len(test.examples)))
    vectors = vocab.Vectors(embedding_file)

    text_field.build_vocab(train,
                           dev,
                           test,
                           max_size=25000,
                           vectors=vectors,
                           unk_init=torch.Tensor.normal_)
    label_field.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(batch_size, len(dev), len(test)),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True,
        device=device)

    return train_iter, dev_iter, test_iter
Example #5
def get_embedding_weights(embedding):
    embeddings_file = ''
    cache = ''
    if embedding == 'glove_specific':
        embeddings_file = 'glove.vec'
        cache = 'specific-embeddings'
    elif embedding == 'glove_generic':
        embeddings_file = 'glove.6B.300d.txt'
        cache = '.vector_cache'
    elif embedding == 'fasttext_specific':
        embeddings_file = 'fasttext.vec'
        cache = 'specific-embeddings'
    elif embedding == 'fasttext_generic':
        embeddings_file = 'crawl-300d-2M.vec'
        cache = '.fasttext_cache'
    elif embedding == 'word2vec_specific':
        embeddings_file = 'word2vec.vec'
        cache = 'specific-embeddings'
    elif embedding == 'word2vec_generic':
        embeddings_file = 'embeddings.vec'
        cache = '.word2vec_cache'

    model = vocab.Vectors(name=embeddings_file, cache=cache)

    return torch.FloatTensor(model.vectors)
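
# Usage sketch (assumption, not from the original source): the returned weight
# matrix is typically wrapped in an embedding layer; 'glove_generic' is one of
# the keys handled above.
import torch.nn as nn

weights = get_embedding_weights('glove_generic')
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=False)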
Example #6
def create_text_and_label(SEED, ratio, filename):
    TEXT = data.Field(sequential=True, tokenize='spacy', include_lengths=True)
    LABEL = data.LabelField(tokenize='spacy', is_target=True, sequential=False)
    fields = [('text', TEXT), ('label', LABEL)]
    train_data = data.TabularDataset.splits(path='',
                                            train=filename,
                                            format='csv',
                                            fields=fields,
                                            skip_header=True)
    train_data = train_data[0]
    if ratio == 8:
        ratio = 0.8
    else:
        ratio = 0.7
    train_data, valid_data = train_data.split(split_ratio=ratio,
                                              random_state=random.seed(SEED))
    custom_embeddings = vocab.Vectors(
        name=os.path.join(GLOVE, 'glove.6B.100d.txt'))

    MAX_VOCAB_SIZE = 25000

    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors=custom_embeddings,
                     unk_init=torch.Tensor.normal_)

    LABEL.build_vocab(train_data)
    return TEXT, LABEL, train_data, valid_data
Example #7
def load_conv_data(file, g_sequence_len, embed_file=None, min_freq=1):
    TEXT = data.Field(tokenize=tokenize_en,
                      lower=True,
                      fix_length=g_sequence_len,
                      batch_first=True,
                      eos_token='<eos>',
                      init_token='<sos>')
    LABEL = data.Field(sequential=False, unk_token=None)
    tb = data.TabularDataset(file,
                             format='tsv',
                             fields=[('text1', TEXT), ('text2', TEXT),
                                     ('label', LABEL)])
    if embed_file:
        TEXT.build_vocab(tb,
                         vectors=vocab.Vectors(embed_file),
                         min_freq=min_freq)
    else:
        TEXT.build_vocab(tb, min_freq=min_freq)
    LABEL.build_vocab(tb)
    label_names = LABEL.vocab.itos
    label_examples = [[] for _ in label_names]
    for each in tb:
        label_examples[label_names.index(each.label)].append(each)
    label_datasets = [
        data.Dataset(label_examples[i],
                     fields=[('text1', TEXT), ('text2', TEXT),
                             ('label', LABEL)])
        for i in range(len(label_names))
    ]

    return tb, TEXT, LABEL, label_names, label_datasets
Example #8
def load_news(config, text_field, band_field):
    fields = {'text': ('text', text_field), 'label': ('label', band_field)}

    word_vectors = vocab.Vectors(config.embedding_file)

    train, val, test = data.TabularDataset.splits(path=config.data_path,
                                                  train='train.csv',
                                                  validation='val.csv',
                                                  test='test.csv',
                                                  format='csv',
                                                  fields=fields)

    print("the size of train: {}, dev:{}, test:{}".format(
        len(train.examples), len(val.examples), len(test.examples)))

    text_field.build_vocab(train,
                           val,
                           test,
                           max_size=config.n_vocab,
                           vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)
    band_field.build_vocab(train, val, test)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        sort=False,
        device=config.device,
        sort_within_batch=False,
        shuffle=False)

    return train_iter, val_iter, test_iter
Example #9
    def __init__(self, num_docs, text_field,
                 path):  # vec_dim, num_docs, num_words)
        super(DMemb, self).__init__()
        vectors = vocab.Vectors(path)
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                     vectors.dim)
        num_words, vec_dim = vectors.vectors.shape
        # paragraph matrix
        self._D = nn.Parameter(torch.randn(num_docs, vec_dim),
                               requires_grad=True)
        # word matrix

        self._W = nn.Embedding.from_pretrained(
            torch.FloatTensor(text_field.vocab.vectors))

        # Rows that still equal row 0 (the <unk> vector) received no
        # pretrained embedding; back-fill them with a case-insensitive match
        # from the pretrained vocabulary.
        z = self._W.weight[0, :]
        for key, value in text_field.vocab.stoi.items():
            a = self._W.weight[value, :]
            if bool(torch.all(torch.eq(z, a))):
                for keyv, valuev in vectors.stoi.items():
                    word = keyv.lower()
                    if word == key:
                        #print(key, keyv)
                        self._W.weight[value, :] = vectors.vectors[
                            vectors.stoi[keyv], :]

        # output layer parameters
        self._O = nn.Parameter(torch.FloatTensor(vec_dim, num_words).zero_(),
                               requires_grad=True)
Example #10
def load_race(path, id_field, word_field, label_field, train_batch_size,
              dev_batch_size, test_batch_size, device, word_embed_file,
              cache_dir):

    fields = {
        'race_id': ('race_id', id_field),
        'article': ('article', word_field),
        'question': ('question', word_field),
        'option_0': ('option_0', word_field),
        'option_1': ('option_1', word_field),
        'option_2': ('option_2', word_field),
        'option_3': ('option_3', word_field),
        'label': ('label', label_field)
    }

    word_vectors = vocab.Vectors(word_embed_file, cache_dir)

    train, dev, test = data.TabularDataset.splits(
        path=path, train='train.jsonl', validation='dev.jsonl',
        test='test.jsonl', format='json', fields=fields)
    
    print("the size of train: {}, dev:{}, test:{}".format(
        len(train.examples), len(dev.examples), len(test.examples)))
    
    word_field.build_vocab(train, dev, test, max_size=50000,
                           vectors=word_vectors, unk_init=torch.Tensor.normal_)
    
    label_field.build_vocab(train, dev, test)
    
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=(train_batch_size, dev_batch_size, test_batch_size),
        sort_key=lambda x: len(x.article),
        device=device,
        shuffle=True)

    return train_iter, dev_iter, test_iter
Example #11
def load_dataset(data_dir, embeddings_path, max_vocab_size):
    # Load train and test data
    text = data.Field(sequential=True, tokenize='spacy', include_lengths=True)
    label = data.LabelField(sequential=False, tokenize='spacy', is_target=True)
    fields = [('text', text), ('label', label)]
    train_data, test_data = data.TabularDataset.splits(
        path=data_dir,
        train='train.csv',
        test='test.csv',
        format='csv',
        fields=fields,
        skip_header=True,
    )

    # Load embeddings
    embeddings = vocab.Vectors(name=embeddings_path)

    # Build vocabulary
    text.build_vocab(train_data,
                     max_size=max_vocab_size,
                     vectors=embeddings,
                     unk_init=torch.Tensor.normal_)
    label.build_vocab(train_data)

    return train_data, test_data, text, label
Example #12
    def get_dataset(self):

        self.embeddings = vocab.Vectors(name=self.index_path,
                                        cache=self.cache_path)
        self.vocabulary = torchtext.data.Field()

        # Add pad and unk tokens: register the new indices and grow the
        # vectors tensor by two zero rows so that those indices are valid.
        self.embeddings.stoi[self.vocabulary.pad_token] = len(
            self.embeddings.stoi)
        self.embeddings.stoi[self.vocabulary.unk_token] = len(
            self.embeddings.stoi)
        self.embeddings.vectors = torch.cat(
            [self.embeddings.vectors,
             torch.zeros(2, self.embeddings.dim)], dim=0)

        for lang in ['en', 'hi', 'gu', 'pa', 'or', 'mr', 'bn']:
            for d in self.dataset:
                if self.lang_map[d["Target_ID"]] == lang:
                    try:
                        # Remove unknown tokens
                        self.targets.append(self.embeddings.vectors[
                            self.embeddings.stoi[d["Target_keyword"]]])
                        self.src_lang.append(self.lang_map[d["Source_ID"]])
                        self.target_lang.append(self.lang_map[d["Target_ID"]])
                        self.phrases.append(d["Source_text"])

                    except KeyError:
                        #print(d["Target_keyword"] + " not found")
                        pass
Example #13
def load_data(
    path,
    id_field,
    word_field,
    label_field,
    train_batch_size,
    dev_batch_size,
    test_batch_size,
    device,
    word_embed_file,
    cache_dir,
):

    fields = {
        "article": ("article", word_field),
        "question": ("question", word_field),
        "option_0": ("option_0", word_field),
        "option_1": ("option_1", word_field),
        "option_2": ("option_2", word_field),
        "option_3": ("option_3", word_field),
        "option_4": ("option_4", word_field),
        "label": ("label", label_field),
    }

    word_vectors = vocab.Vectors(word_embed_file, cache_dir)

    train, dev = data.TabularDataset.splits(
        path=path,
        train="Task_2_train_trial.jsonl",
        validation="Task_2_dev.jsonl",
        test=None,
        format="json",
        fields=fields,
    )

    print("the size of train: {}, dev:{},".format(
        len(train.examples),
        len(dev.examples),
    ))

    word_field.build_vocab(train,
                           dev,
                           max_size=50000,
                           vectors=word_vectors,
                           unk_init=torch.Tensor.normal_)

    label_field.build_vocab(train, dev)

    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=(train_batch_size, dev_batch_size),
        sort_key=lambda x: len(x.article),
        device=device,
        shuffle=True,
    )

    return train_iter, dev_iter
Example #14
def load_then_visualize_embeddings(path):
	"""Visualizes pretrained embeddings into tensorboard.

	Args:
		path: Path to the pretrained vector file.
	"""
	writer = SummaryWriter()
	v = vocab.Vectors(path)
	writer.add_embedding(v.vectors, v.itos) 
Example #15
    def load_embedding(self):

        embed_path = self.args_dict['embed_path']
        custom_embedding = vocab.Vectors(name=os.path.basename(embed_path),
                                         cache=os.path.dirname(embed_path))

        # e.g. custom_embedding.stoi['cat'], custom_embedding.vectors[6]

        return custom_embedding
Example #16
def load_custom_embeddings():
    weibo_word_vector = os.path.join('/', 'home', 'wzw',
                                     'pretrained_word_embeddings',
                                     custom_word_embedding)
    cache = os.path.join('/', 'home', 'wzw', 'pretrained_word_embeddings',
                         'cache.' + custom_word_embedding)
    custom_embeddings = vocab.Vectors(name=weibo_word_vector,
                                      cache=cache,
                                      unk_init=torch.Tensor.normal_)
    return custom_embeddings
Example #17
def load_sst2(path, text_field, label_field, batch_size, embedding_file,
              cache_file):
    # 2. Define the datasets
    train, dev = data.TabularDataset.splits(path=path,
                                            train='train.tsv',
                                            validation='dev.tsv',
                                            format='tsv',
                                            skip_header=True,
                                            fields=[('text', text_field),
                                                    ('label', label_field)])

    # Note that the test set is handled separately here; the splits method
    # cannot be used for it.
    test = data.TabularDataset(path + 'test.tsv',
                               format='tsv',
                               skip_header=True,
                               fields=[('index', label_field),
                                       ('text', text_field)])
    print("the size of train: {}, dev:{}, test:{}".format(
        len(train), len(dev), len(test)))
    print("the result of dataset: ", train[0].text, train[0].label)

    # 3. Build the vocab; its size is the number of words in text_field
    vectors = vocab.Vectors(embedding_file, cache_file)

    text_field.build_vocab(train,
                           dev,
                           test,
                           max_size=25000,
                           vectors=vectors,
                           unk_init=torch.Tensor.normal_)

    label_field.build_vocab(train, dev, test)

    # 4. Build the iterators
    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=(batch_size, batch_size),
        sort_key=lambda x: len(x.text),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)

    # The test iterator is likewise built separately
    test_iter = data.Iterator(test,
                              batch_size=len(test),
                              train=False,
                              sort=False)

    print("the size of train_iter: {}, dev_iter:{}, test_iter:{}".format(
        len(train_iter), len(dev_iter), len(test_iter)))
    #    for batch_idx, (X_train_var, y_train_var) in enumerate(train_iter):
    #        print("the shape of train_x: {}, train_y:{}".format(X_train_var.shape, y_train_var.shape))
    #        break

    return train_iter, dev_iter, test_iter
Example #18
    def build_vocab(self) -> None:
        def extend_vocab(field, word_lst, using_vector=False):
            cnt_add_w = 0
            for w in word_lst:
                if w not in field.vocab.stoi:
                    cnt_add_w += 1
                    field.vocab.itos.append(w)
                    field.vocab.stoi[w] = len(field.vocab.itos) - 1
                # else:
                #     self.logger.warning(w + ' is already in the field')
            if using_vector:
                # self.logger.info('Add ' + str(cnt_add_w) + ' zero vectors into vocab.vectors')
                field.vocab.vectors = torch.cat((field.vocab.vectors,
                                                 torch.zeros(cnt_add_w, self.word_embedding_size)), 0)

        self.logger.info('Building vocabularies')
        self.logger.info('Loading pretrained vectors from ' + self.pretrained_emb_path)
        pretrained_vec = vocab.Vectors(os.path.basename(self.pretrained_emb_path),
                                       os.path.dirname(self.pretrained_emb_path))
        self.WORDS.build_vocab(self.train_dataset, min_freq=self.min_freq, vectors=pretrained_vec)
        extend_vocab(self.WORDS, self.singletons, using_vector=True)

        # print vocab to file
        with open(os.path.join(self.save_to, 'vocab.txt'), 'w') as f_write:
            for w in self.WORDS.vocab.itos:
                f_write.write(w + '\n')

        cnt_zero = 0
        zero_words = []
        for cnt, each_vec in enumerate(self.WORDS.vocab.vectors):
            if each_vec.sum().item() == 0:
                cnt_zero += 1
                cur_word = self.WORDS.vocab.itos[cnt]
                assert cur_word.startswith('unk') or cur_word == '<unk>' or cur_word in self.singletons
                self.WORDS.vocab.vectors[cnt] = np.random.normal(0, 0.05)
                zero_words.append(cur_word)

        self.logger.info('There are ' + str(cnt_zero) + ' zero embeddings')
        print('Zero words = ', zero_words[-25:] + zero_words[:25])
        assert cnt_zero > 1

        self.POS_TAGS.build_vocab(self.train_dataset)
        self.NONTERMS.build_vocab(self.train_dataset)
        extend_vocab(self.NONTERMS, ['<w>'])

        self.ACTIONS.build_vocab()
        assert self.ACTIONS.vocab.itos[2] == 'NP(TOP -> S)'

        self.num_words = len(self.WORDS.vocab)
        self.num_pos = len(self.POS_TAGS.vocab)
        self.num_nt = len(self.NONTERMS.vocab)
        self.num_actions = len(self.ACTIONS.vocab)
        self.logger.info('Found %d words, %d POS tags, %d nonterminals, %d actions',
                         self.num_words, self.num_pos, self.num_nt, self.num_actions)
Example #19
    def __init__(self):
        glove = torchvocab.Vectors(name=os.path.join(
            Constants.Data.datadir, Constants.Data.glove_path))
        counter = Counter([w for w in glove.stoi])
        self.vocab = torchvocab.Vocab(counter,
                                      vectors=glove,
                                      specials=[
                                          Constants.SpecialTokens.pad,
                                          Constants.SpecialTokens.unk
                                      ])
        self.embedding_layer = nn.Embedding.from_pretrained(self.vocab.vectors)
Example #20
def load_w2v_vectors(fname):
    """
    load pre-trained word2vec word embeddings from local disk file.
    :param fname: file name
    :return: word2vec word embeddings
    """
    print("Loading word2vec model from {}".format(fname))
    if not os.path.exists('model/w2v.mod'):
        model = gensim.models.KeyedVectors.load_word2vec_format(fname, binary=True, limit=1000000)
        model.wv.save_word2vec_format('model/w2v.mod')
    return vocab.Vectors('model/w2v.mod')
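
# Usage sketch (assumption, not from the original source; file names are
# illustrative): the returned Vectors object plugs straight into
# Field.build_vocab.
TEXT = data.Field(lower=True)
train = data.TabularDataset('train.tsv', 'tsv', fields=[('text', TEXT)])
TEXT.build_vocab(train, vectors=load_w2v_vectors('GoogleNews-vectors-negative300.bin'))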
Example #21
	def generate_embedding(self, path, dimensions):
		r"""Generates an embedding layer based on a vector of pretrained embeddings.

		Args:
			path: Path to the vector file of pretrained embeddings.
			dimensions: Dimensions of embedding layer for sequences.
		"""

		v = vocab.Vectors(path)
		emb = build_embedding(self.vocab_dict, self.vocab_len, dimensions, v)
		return emb, dimensions, v
Example #22
def load_squad(path, raw_field, word_field, label_field, train_batch_size,
               dev_batch_size, device, word_embedding_file, cache_dir):

    if os.path.exists(cache_dir):
        print("dataset have cached, loding splits... ")

        list_fields = [('id', raw_field), ('s_idx', label_field),
                       ('e_idx', label_field), ('context', word_field),
                       ('question', word_field)]
        train_examples = torch.load(cache_dir + 'train_examples.pt')
        dev_examples = torch.load(cache_dir + "dev_examples.pt")

        train = data.Dataset(examples=train_examples, fields=list_fields)
        dev = data.Dataset(examples=dev_examples, fields=list_fields)

    else:

        dict_field = {
            'id': ('id', raw_field),
            's_idx': ('s_idx', label_field),
            'e_idx': ('e_idx', label_field),
            'context': ('context', word_field),
            'question': ('question', word_field)
        }

        train, dev = data.TabularDataset.splits(path=path,
                                                train='train.jsonl',
                                                validation='dev.jsonl',
                                                format='json',
                                                fields=dict_field)

        os.makedirs(cache_dir)
        torch.save(train.examples, cache_dir + 'train_examples.pt')
        torch.save(dev.examples, cache_dir + "dev_examples.pt")

    print("the size of train: {}, dev:{}".format(len(train.examples),
                                                 len(dev.examples)))

    word_field.build_vocab(train,
                           dev,
                           vectors=vocab.Vectors(word_embedding_file),
                           max_size=25000,
                           unk_init=torch.Tensor.normal_)

    print("building iterators...")

    train_iter, dev_iter = data.BucketIterator.splits(
        (train, dev),
        batch_sizes=[train_batch_size, dev_batch_size],
        device=device,
        sort_key=lambda x: len(x.context))

    return train_iter, dev_iter
Example #23
def main():
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    logging.critical('starting to load data')
    train_iter, dev_iter, total_steps = vulgar(text_field,
                                               label_field,
                                               args,
                                               device=-1,
                                               repeat=False)
    if args.load_vec:
        if args.load_vec == 'hi':
            args.load_vec = 'model/hi_1105_ml_100.w2v'

        logging.critical('start load word2vec')
        embeddings_file = args.load_vec
        vectors = vocab.Vectors(embeddings_file)
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                     vectors.dim)
        embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(text_field.vocab.vectors))
        args.embed_dim = vectors.dim
        embedding.weight.requires_grad = True
        # logging.critical(embedding.weight.requires_grad)
    else:
        # update args and print
        args.embed_num = len(text_field.vocab)
        embedding = nn.Embedding(args.embed_num, args.embed_dim)

    args.class_num = len(label_field.vocab) - 1  # the vocab contains an extra <unk> entry
    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
    del args.no_cuda
    args.kernel_sizes = [int(k)
                         for k in args.kernel_sizes.split(',')]  # hyphens in flag names become underscores in args
    args.save_dir = os.path.join(
        args.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    logging.critical('Parameters:')
    for attr, value in sorted(args.__dict__.items()):
        logging.critical("\t{}={}".format(attr.upper(), value))
    # model
    cnn = model.CNN_Text(args, embedding)
    if args.snapshot is not None:
        logging.critical('\nLoading model from {}...'.format(args.snapshot))
        cnn.load_state_dict(torch.load(args.snapshot))
    if args.cuda:
        torch.cuda.set_device(args.device)
        cnn = cnn.cuda()

    try:
        train.train(train_iter, dev_iter, cnn, args, total_steps)
    except KeyboardInterrupt:
        print('\n' + '-' * 89)
        print('Exiting from training early')
Example #24
    def iters(cls, config, **kwargs):
        """
        Create the iterator objects for splits of the SemEval dataset.
        :param batch_size: Batch_size
        :param device: Device to create batches, -1 for CPU and None for GPU.
        :param root: The root directory containing datasets files.
        :param vectors: Load pretrained vectors
        :param kwargs:
        :return:
        """

        vectors = vocab.Vectors(name=config.vectors, cache=config.cache)

        ID = data.RawField()
        TEXT = data.Field(batch_first=True,
                          tokenize=lambda x: x,
                          fix_length=20)
        TAG = data.Field(batch_first=True, tokenize=lambda x: x, fix_length=20)
        RAW = data.RawField()
        REL = data.Field(sequential=False,
                         use_vocab=False,
                         batch_first=True,
                         tensor_type=torch.FloatTensor,
                         postprocessing=data.Pipeline(get_class_probs))
        CONF = data.RawField()

        #TAG.preprocessing = shrink_chunk

        train, val, test = cls.splits(ID,
                                      TEXT,
                                      REL,
                                      CONF,
                                      RAW,
                                      TAG,
                                      root=config.datasets_dir,
                                      **kwargs)

        TEXT.build_vocab(train)
        config.n_embed = len(TEXT.vocab)
        config.d_embed = vectors.dim
        TEXT.vocab.load_vectors(vectors)

        config.weights = TEXT.vocab.vectors

        config.n_classes = 2

        return data.BucketIterator.splits((train, val, test),
                                          batch_size=config.batch_size,
                                          shuffle=config.shuffle,
                                          device=config.device,
                                          repeat=False)
Example #25
def GetIterator(TEXT, LABEL, path, args, **kwargs):
    """
    生成数据迭代器

    args:
        TEXT: torchtext.data生成的Field对象
        LABEL: torchtext.data生成的Field对象
        
    return:
        train_iter: 训练集迭代器
        dev_iter: 验证集迭代器

    """

    # Define the tokenize rule for TEXT
    TEXT.tokenize = tokenizer

    # Create the tabular datasets
    train_dataset, dev_dataset, test_dataset = data.TabularDataset.splits(
        path=path,
        format='csv',
        skip_header=True,
        train='cnews.train.csv',
        validation='cnews.val.csv',
        test='cnews.test.csv',
        fields=[
            ('label', LABEL),
            ('text', TEXT),
        ])

    if args.static and args.pretrainedEmbeddingName and args.pretrainedEmbeddingPath:
        # Load the pretrained word vectors; name: the file containing the vectors, cache: the directory containing them
        vectors = vocab.Vectors(name=args.pretrainedEmbeddingName,
                                cache=args.pretrainedEmbeddingPath)
        # Build the vocabulary for TEXT
        TEXT.build_vocab(train_dataset, dev_dataset, vectors=vectors)
    else:
        TEXT.build_vocab(train_dataset, dev_dataset)

    # Build the vocabulary for LABEL
    LABEL.build_vocab(train_dataset, dev_dataset)

    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_dataset, dev_dataset, test_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset) // 8,
                     len(test_dataset) // 8),
        sort_key=lambda x: len(x.text),
        **kwargs)

    return train_iter, dev_iter, test_iter
Example #26
def dataloader(text_field,
               label_field,
               user_field,
               args,
               wdir=None,
               u2vdir=None,
               **kargs):
    train_data, dev_data, test_data = mydatasets.MR.splits(text_field,
                                                           label_field,
                                                           user_field,
                                                           args=args)
    if args.pretrained_embed_words:
        custom_embed = vocab.Vectors(name=wdir, max_vectors=100000)
        text_field.build_vocab(train_data,
                               dev_data,
                               test_data,
                               vectors=custom_embed)
        # print(args.custom_embed)
    else:
        text_field.build_vocab(train_data, dev_data, test_data)
    if args.pretrained_embed_users:
        custom_embed_u = vocab.Vectors(name=u2vdir, max_vectors=8000)
        user_field.build_vocab(train_data,
                               dev_data,
                               test_data,
                               vectors=custom_embed_u)
    else:
        user_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    # split valid and train (10%)

    train_iter, dev_iter, test_iter = data.Iterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
Example #27
def dataset():

    build_csv()

    data_dir = "/home/donchan/Documents/DATA/jigsaw"

    start_t = time()

    vec = vocab.Vectors('glove.6B.100d.txt',
                        '/home/donchan/Documents/DATA/glove_embedding/')

    TEXT = Field(sequential=True, tokenize=tokenizer2, lower=True)
    LABEL = Field(sequential=False, use_vocab=False)

    datafields = [
        ("id",
         None),  # we won't be needing the id, so we pass in None as the field
        ("comment_text", TEXT),
        ("toxic", LABEL),
        ("severe_toxic", LABEL),
        ("obscene", LABEL),
        ("threat", LABEL),
        ("insult", LABEL),
        ("identity_hate", LABEL)
    ]

    train, val = TabularDataset.splits(path=data_dir,
                                       train='traindf.csv',
                                       validation='valdf.csv',
                                       format='csv',
                                       skip_header=True,
                                       fields=datafields)

    print("train val length", len(train), len(val))
    #print( train[0].comment_text )
    #print( train[0].toxic, train[0].severe_toxic, train[0].threat, train[0].insult, train[0].identity_hate  )

    TEXT.build_vocab(train, val, vectors=vec, min_freq=2)
    #LABEL.build_vocab(train, val)

    print("time to build vocab", (time() - start_t))
    print("length of vocaburary", len(TEXT.vocab), TEXT.vocab.vectors.shape)

    print("- " * 20)
    print("* most common words.")
    print(TEXT.vocab.freqs.most_common(20))

    return train, val, TEXT, LABEL
Example #28
def init_workspace():
    if not os.path.exists(prodirectory):
        print("directory at " + prodirectory)
        os.makedirs(prodirectory)
    else:
        print("warning: directory already exists")

    global multi_classes
    multi_classes = [data.LabelField() for _ in range(3)]
    word_field = data.Field(tokenize=lambda x: x.split(','),
                            include_lengths=True,
                            batch_first=True,
                            fix_length=MAX_SEQ_LEN)

    print("load torch data ")
    class_fields = [('w', word_field), ('cate1_id', multi_classes[0]),
                    ('cate2_id', multi_classes[1]),
                    ('cate3_id', multi_classes[2])]
    train = data.TabularDataset(TRAINFILE,
                                'tsv',
                                skip_header=True,
                                fields=class_fields)
    valid = data.TabularDataset(VALFILE,
                                'tsv',
                                skip_header=True,
                                fields=class_fields)
    test = data.TabularDataset(TESTFILE,
                               'tsv',
                               skip_header=True,
                               fields=[('w', word_field)])
    # discretization
    word_field.build_vocab(train, valid, test)

    for cls in multi_classes:
        cls.build_vocab(train, valid)

    trainiter = data.BucketIterator(train,
                                    batch_size=BATCH_SIZE,
                                    sort_key=lambda x: len(x.w),
                                    shuffle=True)
    valiter = data.BucketIterator(valid, batch_size=BATCH_SIZE, shuffle=False)
    testiter = data.BucketIterator(test, batch_size=BATCH_SIZE, shuffle=False)

    vectors = vocab.Vectors(W2VFILE)
    print("Word2vec model Loaded")
    word_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

    return word_field.vocab.vectors, trainiter, valiter, testiter
Example #29
    @classmethod
    def from_pretrained_embedding(cls,
                                  embedding_file_path,
                                  inner_module_init_func,
                                  cache_path="../data/cache"):
        word_embeddings = vocab.Vectors(embedding_file_path, cache=cache_path)
        padding_idx, embedding_length = word_embeddings.vectors.shape
        padding_embedding = np.zeros(embedding_length)
        inner_module = inner_module_init_func(embedding_length)
        torch_embedding = torch.from_numpy(
            np.row_stack((word_embeddings.vectors, padding_embedding)))
        nn_embedding = nn.Embedding.from_pretrained(embeddings=torch_embedding,
                                                    freeze=False,
                                                    padding_idx=padding_idx)
        return cls(inner_module,
                   pretrained_embedding=word_embeddings,
                   nn_embedding=nn_embedding)
def rnn_iter(train_path, test_path, batchsize, TEXT, LABEL):
    train = RnnDataset(train_path, text_field=TEXT, label_field=LABEL, aug=1)
    test = RnnDataset(test_path, text_field=TEXT, label_field=None, aug=1)
    # Pass in the dataset used to build the vocabulary
    vectors = vocab.Vectors(name="wordvec.txt", cache="data")
    TEXT.build_vocab(test, vectors=vectors)
    weight_matrix = TEXT.vocab.vectors
    # Build iterators for the training and test sets at the same time
    train_iter, test_iter = data.BucketIterator.splits(
        (train, test),
        batch_sizes=(batchsize, batchsize),
        device=torch.device('cuda'),
        sort_key=lambda x: len(x.text),
        sort_within_batch=False)

    return train_iter, test_iter, weight_matrix