Example #1
def load_data(train_dir, test_dir):
    NLP = spacy.load('en_core_web_sm')
    tokenizer = lambda sent: [
        x.text for x in NLP.tokenizer(sent) if x.text != " "
    ]

    TEXT = data.Field(sequential=True,
                      batch_first=True,
                      lower=True,
                      fix_length=50,
                      tokenize=tokenizer)
    LABEL = data.Field(sequential=False, batch_first=True)

    train_data = TabularDataset(path=train_dir,
                                skip_header=True,
                                format='csv',
                                fields=[('text', TEXT), ('label', LABEL)])
    test_data = TabularDataset(path=test_dir,
                               skip_header=True,
                               format='csv',
                               fields=[('text', TEXT), ('label', LABEL)])

    train_data, valid_data = train_data.split(split_ratio=0.8)

    return train_data, valid_data, test_data, TEXT, LABEL
Example #2
def load_dataset_from_csv(params, device):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied
    Field : A class that stores information about the way of preprocessing
    fix_length : An important property of TorchText is that we can let the input to be variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fi_length which
                 will pad each sequence to have a fix length of 200.

    build_vocab : It will first make a vocabulary or dictionary mapping all the unique words present in the train_data to an
                  idx and then after it will use GloVe word embedding to map the index to the corresponding word embedding.

    vocab.vectors : This returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.

    """
    # define tokenizer
    en = English()

    def tokenize(sentence):
        return [tok.text for tok in en.tokenizer(sentence)]

    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, eos_token='<eos>', batch_first=True, fix_length=128)
    LABEL = LabelField()

    fields_list = [('Unnamed: 0', None),
                   ('text', TEXT),
                   ('conf', None),
                   ('label', LABEL)]
    base_path = params.DATA_PATH
    train_path = os.path.join(base_path, "filtered_train.csv")
    test_path = os.path.join(base_path, "filtered_test.csv")
    train_data = TabularDataset(path=train_path,  # the root directory where the data lies
                                format='csv',
                                skip_header=True,
                                fields=fields_list)

    test_data = TabularDataset(path=test_path,  # the root directory where the data lies
                               format='csv',
                               skip_header=True,
                               fields=fields_list)

    if params.VOCAB_USE_GLOVE:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ, vectors=GloVe(name='6B', dim=300))
        logging.info("Loaded Glove embedding, Vector size of Text Vocabulary: " + str(TEXT.vocab.vectors.size()))

    else:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ)
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(TEXT.vocab)))

    train_iter, test_iter = data.BucketIterator.splits((train_data, test_data),
                                                       batch_sizes=(params.TRAIN_BATCH_SIZE, params.TRAIN_BATCH_SIZE),
                                                       sort_key=lambda x: len(x.text), repeat=False, shuffle=True,
                                                       device=device)
    # Disable shuffle
    test_iter.shuffle = False
    return TEXT, word_embeddings, train_iter, test_iter
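When GloVe is enabled above, word_embeddings is the (vocab_size x 300) tensor described in the docstring. A minimal, hedged usage sketch; the torch.nn import and the params/device objects are assumed to exist:

import torch.nn as nn

TEXT, word_embeddings, train_iter, test_iter = load_dataset_from_csv(params, device)

# seed an embedding layer with the pre-trained GloVe vectors
embedding = nn.Embedding.from_pretrained(word_embeddings, freeze=False)

for batch in train_iter:
    tokens = batch.text            # (batch_size, 128) because batch_first=True and fix_length=128
    labels = batch.label
    embedded = embedding(tokens)   # (batch_size, 128, 300)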
Example #3
 def build_dataset(self, field):
     train = TabularDataset(path=self.train_path, format='tsv',
                            fields=[('sent', field)])
     valid = TabularDataset(path=self.valid_path, format='tsv',
                            fields=[('sent', field)])
     test = TabularDataset(path=self.test_path, format='tsv',
                            fields=[('sent', field)])
     return train, valid, test
Example #4
def create_tabular_dataset(data_info, **args):
    disable = [
        'tagger', 'parser', 'ner', 'textcat',
        'entity_ruler', 'sentencizer', 'merge_noun_chunks', 'merge_entities',
        'merge_subtokens'
    ]

    lang = args.get('lang', 'en')
    pretrained_emb = args.get('pretrained_emb', 'glove.6B.300d')

    _, path_train_dataset, path_valid_dataset = analyze_datainfo_paths(
        data_info)

    try:
        spacy_en = spacy.load(f'{lang}_core_web_sm', disable=disable)

    except:
        log(f"Download {lang}")
        import importlib

        os.system(f"python -m spacy download {lang}")
        spacy_en = importlib.import_module(f'{lang}_core_web_sm').load(
            disable=disable)

    #    sleep(60)
    #    spacy_en = spacy.load( f'{lang}_core_web_sm', disable= disable)

    def tokenizer(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Creating field for text and label
    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
    LABEL = Field(sequential=False)

    print('Preprocessing the text...')
    # clean the text
    TEXT.preprocessing = torchtext.data.Pipeline(clean_str)

    print('Creating tabular datasets...It might take a while to finish!')
    train_datafield = [('text', TEXT), ('label', LABEL)]
    tabular_train = TabularDataset(path=path_train_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=train_datafield)

    valid_datafield = [('text', TEXT), ('label', LABEL)]

    tabular_valid = TabularDataset(path=path_valid_dataset,
                                   format='csv',
                                   skip_header=True,
                                   fields=valid_datafield)

    print('Building vocabulary...')
    TEXT.build_vocab(tabular_train, vectors=pretrained_emb)
    LABEL.build_vocab(tabular_train)

    return tabular_train, tabular_valid, TEXT.vocab
Example #5
 def build_dataset(self, ORIG, PARA):
     train_val = TabularDataset(path=self.train_path,
                                format='tsv',
                                fields=[('orig', ORIG), ('para', PARA)])
     train, val = train_val.split(split_ratio=0.8)
     # FIXME: test data is too large!
     test = TabularDataset(path=self.test_path,
                           format='tsv',
                           fields=[('orig', ORIG), ('para', PARA)])
     return train, val, test
Example #6
def get_datasets(path_to_data: PathOrStr, 
                 len_context_vocab: int,
                 len_title_vocab: int,
                 len_aut_vocab: int) -> BaseData:
    """
    Initializes torchtext Field and TabularDataset objects used for training.
    The vocab of the author, context and title fields is built *on the whole dataset*,
    capped at the given maximum vocab sizes. Train, valid and test splits are then loaded from separate CSV files. 
    
    ## Parameters:  
    
    - **path_to_data** *(PathOrStr)*:  Path object or string to a .csv dataset.   
    - **len_context_vocab** *(int)*:  Maximum size of the context vocab before adding special tokens.  
    - **len_title_vocab** *(int)*:  Maximum size of the title vocab before adding special tokens.  
    - **len_aut_vocab** *(int)*:  Maximum size of the author vocab before adding special tokens.   
    
    ## Output:  
    
    - **data** *(BaseData)*:  Container holding CNTXT (*Field*), TTL (*Field*), AUT (*Field*), 
        train (*TabularDataset*), valid (*TabularDataset*), test (*TabularDataset*) objects.
    """
    # set the seed for the data split
    random.seed(SEED)
    state = random.getstate()

    logger.info("Getting fields...")
    CNTXT, TTL, AUT = get_fields()
    # generate torchtext dataset from a .csv given the fields for each datatype
    # has to be single dataset in order to build proper vocabularies
    logger.info("Loading dataset...")
    dataset = TabularDataset(str(path_to_data), "CSV", 
                       [("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],
                       skip_header=True)
    #train = TabularDataset('/home/maria/input/mag_subset2_train.csv', "CSV",
    #                    [("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],skip_header=True)
    #valid = TabularDataset('/home/maria/input/mag_subset2_valid.csv', "CSV",
    #                    [("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],skip_header=True)
    #test = TabularDataset('/home/maria/input/mag_test.csv', "CSV",
    #                [("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],skip_header=True)


    train, valid, test = TabularDataset.splits(
        path='/home/maria/input',
        train='mag_subset2_train.csv', validation='mag_subset2_valid.csv', test='mag_test.csv',
        format='csv',
        fields=[("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],
        skip_header=True)

    # build field vocab before splitting data
    logger.info("Building vocab...")
    TTL.build_vocab(dataset, max_size=len_title_vocab)
    AUT.build_vocab(dataset, max_size=len_aut_vocab)
    CNTXT.build_vocab(dataset, max_size=len_context_vocab)

    # split dataset
    #train, valid, test = TabularDataset.splits(path='/home/maria/input', train='mag_subset2_train.csv',validation='mag_subset2_valid.csv', test='mag_test.csv',format='csv',
    #        fields=[("context", CNTXT), ("authors_citing", AUT), ("title_cited", TTL), ("authors_cited", AUT)],skip_header=True)
    #train, valid, test = dataset.split([0.5,0.45,0.05], random_state = state)
    return BaseData(cntxt=CNTXT, ttl=TTL, aut=AUT, train=train, valid=valid, test=test)
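BaseData is a project-specific container, so the attribute names below are inferred from the keyword arguments above; a hedged sketch of turning the returned splits into bucketed iterators with the legacy torchtext BucketIterator (import path and vocab sizes are assumptions):

from torchtext.data import BucketIterator

data = get_datasets("mag_data.csv", 20000, 20000, 20000)   # hypothetical path and vocab sizes

train_iter, valid_iter, test_iter = BucketIterator.splits(
    (data.train, data.valid, data.test),
    batch_size=64,
    sort_key=lambda x: len(x.context),    # bucket examples by context length
    sort_within_batch=True)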
Example #7
    def get_train_loader(self, batch_size):
        data_field = [('line', self.text_field), ('label', self.label_field)]

        train_set = TabularDataset('train_set.csv', 'csv', data_field)
        test_set = TabularDataset('test_set.csv', 'csv', data_field)
        self.text_field.build_vocab(train_set, test_set)
        self.label_field.build_vocab(train_set, test_set)

        train_loader = Iterator(train_set, batch_size)

        return train_loader
Example #8
 def build_dataset(self, DOCS, SUMM):
     fields = {
         'doc{}'.format(i): ('doc{}'.format(i), f)
         for i, f in enumerate(DOCS, 1)
     }
     fields['summ'] = ('summ', SUMM)
     data = TabularDataset(path=self.data_path,
                           format='json',
                           fields=fields)
     train, test, valid = data.split(split_ratio=[0.8, 0.1, 0.1])
     return train, valid, test
Example #9
    def __init__(self,
                 path=DATA_PATH,
                 device=torch.device('cpu'),
                 batch_size=DEFAULT_BATCH_SIZE,
                 train_test_val_ratio=TRAIN_TEST_VAL_RATIO):
        """Loads dataset examples and creates bucket iterators.

        Creates vocabulary from loaded examples. Train, test and validation
        splits and their iterators are created.

        Args:
            path (str, optional): Path to the dataset file. Default: constants.DATA_PATH.
            device (torch.device, optional): Torch device where tensors will be created.
                Default: torch.device('cpu').
            batch_size (int, optional): Size of batch. Default: 32.
            train_test_val_ratio (iterable, optional): Iterable of 3 elements denoting ratio of
                train, test and validation splits. Default: [0.90, 0.05, 0.05].
        """
        print(colorize('\nLoading dataset'))

        self._batch_size = batch_size
        self._device = device

        self._field = Field(tokenize='revtok', lower=True, batch_first=True)

        fields = [
            ('query', self._field),
            ('response', self._field),
        ]

        self.data = TabularDataset(path=path, format='csv', fields=fields)

        self._train, self._val, self._test = self.data.split(
            train_test_val_ratio)

        self.train_iter, self.validation_iter, self.test_iter = BucketIterator.splits(
            datasets=(self._train, self._val, self._test),
            batch_size=self._batch_size,
            repeat=False,
            sort_key=lambda ex: interleave_keys(len(ex.query), len(ex.response)
                                                ),
            device=self._device)

        self.iterator = BucketIterator(dataset=self.data,
                                       batch_size=self._batch_size,
                                       repeat=False,
                                       sort_key=lambda ex: interleave_keys(
                                           len(ex.query), len(ex.response)),
                                       device=self._device)

        print(colorize(' • Building vocabulary', color='yellow'))
        self._field.build_vocab(self.data)
        self.vocab = self._field.vocab
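A minimal sketch of consuming the iterators created above; the enclosing class name Corpus is hypothetical, and the default '<pad>' token of a torchtext Field is assumed:

corpus = Corpus(batch_size=32)            # hypothetical class name

pad_idx = corpus.vocab.stoi['<pad>']      # default pad token of a torchtext Field

for batch in corpus.train_iter:
    queries = batch.query                 # (batch_size, query_len) because batch_first=True
    responses = batch.response            # (batch_size, response_len)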
Example #10
def binary_classification(obj):
    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True,
                 tokenize=tokenize,
                 lower=True,
                 batch_first=True,
                 fix_length=obj.fix_length)

    LABEL = Field(sequential=False,
                  dtype=torch.float,
                  batch_first=True,
                  use_vocab=False)

    fields = [
        ('id', None),
        ('content', TEXT),
        ('trump_percentage', LABEL),
    ]

    train_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_train_trump_percentage.csv'
    test_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_test_trump_percentage.csv'

    train_dataset = TabularDataset(path=obj.data_path + '/' + train_csv,
                                   format='csv',
                                   skip_header=True,
                                   fields=fields)

    test_dataset = TabularDataset(path=obj.data_path + '/' + test_csv,
                                  format='csv',
                                  skip_header=True,
                                  fields=fields)

    TEXT.build_vocab(train_dataset,
                     vectors=GloVe(name=obj.Glove_name, dim=obj.embedding_dim))
    vocab_size = len(TEXT.vocab)
    word_embeddings = TEXT.vocab.vectors
    print("vector size of text vocabulary: ", TEXT.vocab.vectors.size())

    train_iter, test_iter = Iterator.splits(
        (train_dataset, test_dataset),
        sort_key=lambda x: len(x.content),
        batch_sizes=(obj.train_batch_size, obj.test_batch_size),
        device=torch.device(obj.device),
        sort_within_batch=True,
        repeat=False)

    train_iter_ = BatchWrapper(train_iter, 'content', ['trump_percentage'])
    test_iter_ = BatchWrapper(test_iter, 'content', ['trump_percentage'])

    return TEXT, vocab_size, word_embeddings, train_iter_, test_iter_
Example #11
def loadDataset():
    """
    Loading and Splitting the Dataset
    """
    train_data, test_data = [], []

    print("==> Loading Training Set")

    fields_tuple = {
        "question": ('Q', Q),
        "answer": ('A', A),
        "answer_sentence": ('Ans_Sen', Ans_Sen)
    }

    train_data = TabularDataset(path='data/Final_Dataset_Train.json',
                                format='json',
                                fields=fields_tuple)
    print("Size of Training Set : {}".format(len(train_data)))
    print("Training Set Example: {}".format(train_data[0].__dict__))

    print("==> Loading Test Set")
    test_data = TabularDataset(path='data/Final_Dataset_Test.json',
                               format='json',
                               fields=fields_tuple)

    #print(dataset.__dict__.keys())
    #dataset = extract_QnA_Ans_Sent(dataset)

    #print("==> Creating Training Set and Test Set")

    #train_data, test_data = dataset.split(split_ratio=0.8)
    #train, val = train_test_split(train, test_size=0.2)

    #train_data = make_torchtext(train,fields_tuple)
    #test_data = make_torchtext(test,fields_tuple)

    print("Size of Test Set : {}".format(len(test_data)))
    print("Test Set Example: {}".format(train_data[0].__dict__))

    #val_data = make_torchtext(val,fields_tuple)

    print("==> Building Vocabulary using Fasttext")

    Q.build_vocab(train_data, specials=['<sep>'], vectors='fasttext.en.300d')
    A.build_vocab(train_data, vectors='fasttext.en.300d')
    Ans_Sen.build_vocab(train_data, vectors='fasttext.en.300d')

    QnA_vocab = merge_vocabs([Q.vocab, A.vocab, Ans_Sen.vocab])

    #fields = [('A',A),('Q',Q),('Ans_Sen',Ans_Sen)]

    return train_data, test_data, QnA_vocab  #, fields
Example #12
def load_data(train_dir, test_dir):
    nlp = spacy.load('en_core_web_sm')
    tokenizer = lambda sent: [x.text for x in nlp.tokenizer(sent) if x.text != " "]

    text = data.Field(sequential=True, batch_first=True, lower=True, fix_length=50, tokenize=tokenizer)
    label = data.LabelField()

    train_data = TabularDataset(path=train_dir, skip_header=True, format='csv', fields=[('turn1', text), ('turn2', text), ('turn3', text), ('label', label)])
    test_data = TabularDataset(path=test_dir, skip_header=True, format='csv', fields=[('turn1', text), ('turn2', text), ('turn3', text), ('label', label)])

    train_data, valid_data = train_data.split(split_ratio=0.8)

    return train_data, valid_data, test_data, text, label
Example #13
    def get_iterators_and_fields(self) -> Tuple:
        """
        Builds train/test iterators, fields, vocab and tokenizer

        Returns: train/test iterators, data and fields for source/target

        """
        data = self.result_df[['text', 'title']]
        data.columns = ['src', 'trg']
        data.to_csv('data/all_data.csv', index=False)

        SRC = Field(tokenize=self.tokenize,
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

        TRG = Field(tokenize=self.tokenize,
                    init_token='<sos>',
                    eos_token='<eos>',
                    lower=True,
                    batch_first=True)

        data_fields = [('src', SRC), ('trg', TRG)]
        all_data = TabularDataset(path='data/all_data.csv',
                                  format='csv',
                                  fields=data_fields)

        train, test = train_test_split(data, test_size=self.test_size)
        train.to_csv(self.path_to_save_data.joinpath('train.csv'), index=False)
        test.to_csv(self.path_to_save_data.joinpath('val.csv'), index=False)

        train_data, test_data = TabularDataset.splits(path='data/',
                                                      train='train.csv',
                                                      validation='val.csv',
                                                      format='csv',
                                                      fields=data_fields)
        SRC.build_vocab(all_data, min_freq=self.min_freq)
        TRG.build_vocab(all_data,
                        min_freq=self.min_freq)
        # due to the limited amount of data we have to build the vocab on all data, otherwise we get too many <unk> tokens

        train_iterator, test_iterator = BucketIterator.splits(
            (train_data, test_data),
            batch_size=self.batch_size,
            device=self.device,
            sort=True,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src))

        return train_iterator, test_iterator, train_data, test_data, SRC, TRG
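A hedged sketch of feeding the returned iterators into a seq2seq training step; the preprocessor instance, the model and the training loop body are hypothetical placeholders:

import torch.nn as nn

train_iterator, test_iterator, train_data, test_data, SRC, TRG = preprocessor.get_iterators_and_fields()

pad_idx = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)   # ignore padded target positions

for batch in train_iterator:
    src = batch.src   # (batch_size, src_len) because batch_first=True
    trg = batch.trg   # (batch_size, trg_len)
    # output = model(src, trg); loss = criterion(...)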
Example #14
def get_dataset(fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        lower = True
    logging.info("预处理 csv......")
    prepare_csv()

    TEXT = Field(sequential=True,
                 fix_length=fix_length,
                 tokenize=tokenizer,
                 pad_first=True,
                 lower=lower)
    LABEL = Field(sequential=False, use_vocab=False)

    train_datafields = [("id", None), ("comment_text", TEXT), ("toxic", LABEL),
                        ("severe_toxic", LABEL), ("threat", LABEL),
                        ("obscene", LABEL), ("insult", LABEL),
                        ("identity_hate", LABEL)]

    logging.info("读取 train.csv......")
    train, val = TabularDataset.splits(path='cache',
                                       train='train.csv',
                                       validation="val.csv",
                                       format='csv',
                                       skip_header=True,
                                       fields=train_datafields)
    logging.info("读取 test.csv......")
    test = TabularDataset(path='cache/test.csv',
                          format='csv',
                          skip_header=True,
                          fields=[('id', None), ('comment_text', TEXT)])

    logging.info('Loading GloVe word vectors ...')
    # vectors = GloVe(name='6B', dim=300)  # downloads the word vectors
    # load local word vectors instead
    cache = '.vector_cache'
    if not os.path.exists(cache):
        os.mkdir(cache)
    vectors = Vectors(
        name='/home/sunyan/quora/input/embeddings/glove.840B.300d.txt',
        cache=cache,
        max_vectors=200000)
    vectors.unk_init = init.xavier_uniform_

    logging.info('Building the vocabulary ...')
    TEXT.build_vocab(train, test, max_size=20000, min_freq=50, vectors=vectors)

    print(TEXT.vocab.freqs.most_common(10))

    logging.info("预处理结束!")

    return (train, val, test), TEXT
Example #15
def load_tabular_set(file_path,format,fields,split_ratio=None,split_seed=None,skip_header=False,save_vocab_path=os.getcwd(),**args):

    """

    :param file_path:
    :param format:
    :param fields:
    :param split_ratio:
    :param split_seed:
    :param skip_header:
    :param save_vocab_path:
    :param args:
    :return:
    """
    if not os.path.exists(save_vocab_path):
        os.mkdir(save_vocab_path)

    dataset_fields = []

    for field in fields:
        dataset_fields.append((field.name,field.field))

    dataset = TabularDataset(file_path,format,dataset_fields,skip_header=skip_header,**args)

    for f_input in fields:
        name = f_input.name
        field = f_input.field
        vocab = f_input.vocab

        if vocab is None:

            field.build_vocab(dataset,max_size=f_input.max_size, min_freq=f_input.min_freq,
                 vectors=f_input.vectors, unk_init=f_input.unk_init, vectors_cache=f_input.vectors_cache)

            with open(os.path.join(save_vocab_path,"{}.json".format(name)), "w") as jfile:
                json.dump(field.vocab.stoi,jfile,sort_keys=True)

        else:
            with open(vocab, "r") as jfile:
                dict_ = json.load(jfile)

                field.build_vocab()
                field.vocab.stoi = dict_



    if split_ratio is not None:

        dataset = dataset.split(split_ratio,random_state=split_seed)

    return dataset
Example #16
def load_data(preprocessing=None):
    # Fields for the dataset
    # The actual review message

    #TEXT = Field(tokenize='spacy') # -- Old way, unclear exactly what language model is used
    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 preprocessing=preprocessing)
    LABEL = LabelField(dtype=torch.float)

    # Get the entire dataset that we will then split
    data = TabularDataset(path=path,
                          format='tsv',
                          fields=[('text', TEXT), ('label', LABEL)])

    # We should probably look at the proportion of fake to non-fake in each of these
    # sets to make sure it is fairly even. Though probabilistically it should be, I suppose.
    train_data, valid_data, test_data = data.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.seed(SEED))
    #valid_data, test_data = test_data.split(split_ratio=VAL_TEST_SPLIT, random_state=random.seed(SEED))

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))
    '''
    # Try loading in the IMDB dataset to label pos or negative
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # Get train/valid split!!
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))
    '''

    # Now we need to build the vocab for our actual data
    # Here we will use the pre-trained word vectors from "glove.6B.100d"
    TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    LABEL.build_vocab(train_data)

    # Print stuff for sanity checks
    print('Size of the vocab: ' + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return TEXT, train_itr, valid_itr, test_itr
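Since the vocab above is built with glove.6B.100d vectors, those vectors are typically copied into the model's embedding layer before training; a short sketch in which the model and its embedding attribute are hypothetical:

TEXT, train_itr, valid_itr, test_itr = load_data()

# copy the 100-dimensional GloVe vectors into a (len(TEXT.vocab), 100) embedding layer
model.embedding.weight.data.copy_(TEXT.vocab.vectors)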
Example #17
def generate_data(config):
    ## define how each field is processed

    tokenizer = lambda x: [one for one in x]
    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 fix_length=config.sen_max_length)  ## the truncation length directly affects acc!!!
    LABEL = Field(sequential=False, use_vocab=False)  ## use this when the label is numeric

    datafields = [("context", TEXT),
                  ("label_id", LABEL)]  ## TEXT field, LABEL field
    test_field = [("context", TEXT), ("label_id", LABEL)]
    train_file, valid_file = TabularDataset.splits(
        path=config.data_ori,
        train=config.train_path,
        validation=config.valid_path,
        format="csv",
        skip_header=True,
        fields=datafields)
    test_file = TabularDataset(path=config.data_ori + config.test_path,
                               format="csv",
                               skip_header=True,
                               fields=test_field)
    ## build the vocabulary
    vectors = Vectors(name=config.data_ori + config.embedding_path, cache="./")
    TEXT.build_vocab(train_file,
                     max_size=config.vocab_maxsize,
                     min_freq=config.vocab_minfreq,
                     vectors=vectors)
    TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

    train_iter, val_iter = BucketIterator.splits(
        (train_file, valid_file),
        batch_sizes=(config.batch_size, config.batch_size),
        device=config.device,
        sort_key=lambda x: len(x.context),
        sort_within_batch=True,
        # when using pack_padded_sequence, sort_within_batch must be set to True; the padded sequences are then converted to PackedSequence objects
        repeat=False)

    test_iter = Iterator(test_file,
                         batch_size=config.batch_size,
                         device=config.device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False)

    return train_iter, val_iter, test_iter, TEXT
Example #18
def predict_text_cnn(model_path, file_path, vocab_path, batch_size=64):

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    fields = [('sentence', TEXT)]

    test_data = TabularDataset(path=file_path,
                               format='tsv',
                               skip_header=True,
                               fields=fields)
    with open(vocab_path, 'rb') as handle:
        vocab = pickle.load(handle)
    TEXT.vocab = vocab

    device = torch.device('cuda:0')
    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         shuffle=False,
                         device=device)
    model = torch.load(model_path)

    sentiments = []
    model.eval()
    with torch.no_grad():

        for batch in test_iter:
            sentence = batch.sentence
            logit = model(sentence)
            prob = torch.softmax(logit, dim=-1)[:, 1].tolist()
            sentiments.extend(prob)

    return sentiments
Example #19
def load_question_dataset(batch_size, dataset, device=0):
    spacy_en = spacy.load('en')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    inp_lang = Field(tokenize=tokenize_en,
                     init_token='<sos>',
                     eos_token='<eos>')
    opt_lang = Field(tokenize=tokenize_en,
                     init_token='<sos>',
                     eos_token='<eos>')

    dataset = QGenDataset(dataset)

    # associate the text in the 'ans' column with the inp_lang field, and the 'que' column with opt_lang
    data_fields = [('ans', inp_lang), ('que', opt_lang)]
    train, val, test = TabularDataset.splits(path='./.data/',
                                             train='train.csv',
                                             validation='val.csv',
                                             test="test.csv",
                                             format='csv',
                                             fields=data_fields)

    inp_lang.build_vocab(train, val, test)
    opt_lang.build_vocab(train, val, test)

    train_iter = BucketIterator(train, batch_size=batch_size, \
            device=device, repeat=False , sort_key=lambda x: len(x.que), shuffle=True)
    val_iter = BucketIterator(val, batch_size=batch_size, \
            device=device, sort_key=lambda x: len(x.que), shuffle=True)
    test_iter = BucketIterator(test, batch_size=batch_size, \
            device=device, sort_key=lambda x: len(x.que), shuffle=True)

    return train_iter, val_iter, test_iter, inp_lang, opt_lang
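Because neither field sets batch_first, the batches produced above are sequence-major; a hedged sketch of iterating them, in which the dataset argument and batch size are placeholders:

import torch

train_iter, val_iter, test_iter, inp_lang, opt_lang = load_question_dataset(
    batch_size=32, dataset='questions.csv', device=torch.device('cpu'))  # hypothetical arguments

for batch in train_iter:
    ans = batch.ans   # (ans_len, batch_size)
    que = batch.que   # (que_len, batch_size), wrapped in <sos> ... <eos>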
Example #20
def prepare_vocab():
    '''
     Load the required vocabularies and iterators
    :return:
    '''

    # define the Fields
    PREV = Field(
        tokenize=chi_tokenizer, init_token='<bos>',
        eos_token='<eos>')  # many other useful arguments can be added here, e.g. pad_token, unk_token, stop_words
    NEXT = Field(tokenize=chi_tokenizer, init_token='<bos>', eos_token='<eos>')

    # define the pairing between columns and Fields
    fields = [('prev', PREV), ('next', NEXT)]

    # note skip_header
    train, val = TabularDataset.splits(path='data',
                                       train='train.csv',
                                       validation='test.csv',
                                       format='csv',
                                       fields=fields,
                                       skip_header=True)

    # both the train and val data are used when building the vocabulary
    PREV.build_vocab(train, val)
    NEXT.build_vocab(train, val)
    #  note that PREV and NEXT have different vocabularies

    # define the data iterators
    train_iter = BucketIterator(train, batch_size=bc.batch_size, \
                                sort_key=lambda x: len(x.prev), sort_within_batch=True, shuffle=True)

    val_iter = BucketIterator(val, batch_size=bc.batch_size, \
                              sort_key=lambda x: len(x.prev), sort_within_batch=True, shuffle=True)
    return PREV, NEXT, train_iter, val_iter
Example #21
def make_dataset(train_csv, val_csv, test_csv):
    '''
    Generates the training, validation and testing datasets as torchtext
    objects for easy incorporation with Pytorch (cleaning them in the process)

    Inputs:
        train_csv(str): name of training data csv
        val_csv(str): name of validation data csv
        test_csv(str): name of testing data csv

    Outputs:
        train: tabular dataset obj representing the training data
        test: tabular dataset obj representing the testing data
        val: tabular dataset obj representing the validation data
        text: torchtext field obj representing how text should be
            processed and stored
        label: torchtext labelfield obj representing how labels should be
            processed and stored
    '''
    text = Field(sequential=True, tokenize=word_tokenize,
                 preprocessing=normalize_tokens)
    label = LabelField(dtype=torch.float)
    data_fields = [('dab_id', None), ('alj_id', None), ('alj_text', text),
                   ('decision_binary', label), ('dab_year', None)]
    train, val, test = TabularDataset.splits(path='', train=train_csv,
                                             validation=val_csv, test=test_csv,
                                             format='csv', fields=data_fields,
                                             skip_header=True)
    return train, test, val, text, label
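The fields returned by make_dataset still need vocabularies and iterators before training; a minimal follow-up sketch using the legacy torchtext calls, where the GloVe vectors, batch size and import path are assumptions:

from torchtext.data import BucketIterator

train, test, val, text, label = make_dataset('train.csv', 'val.csv', 'test.csv')

text.build_vocab(train, max_size=25000, vectors='glove.6B.100d')
label.build_vocab(train)

train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test),
    batch_size=64,
    sort_key=lambda x: len(x.alj_text),   # bucket by document length
    sort_within_batch=True)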
Example #22
    def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(tensor_type=torch.FloatTensor)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)
Example #23
def preprocess(config: dict) -> None:

    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(message)s',
                        datefmt='%d-%b-%y %H:%M:%S')
    logger = logging.getLogger(__name__)

    base_path = config['base_path']
    vocab_path = os.path.join(base_path, 'vocab.pkl')
    embedding_path = os.path.join(base_path, 'embedding.npy')
    glove_source_path = config['glove_source_path']

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    fields = [('sentence', TEXT)]

    train_data = TabularDataset(path=os.path.join(base_path, 'train.tsv'),
                                format='tsv',
                                skip_header=True,
                                fields=fields)

    logger.info('build vocabulary')
    TEXT.build_vocab(train_data, specials=[UNK, PAD, SOS, EOS], max_size=50000)
    vocab = TEXT.vocab
    vocab_size = len(vocab.itos)
    logger.info('vocab_size: %d' % vocab_size)
    logger.info('save vocabulary')
    with open(vocab_path, 'wb') as handle:
        pickle.dump(vocab, handle)
    logger.info('load pretrained embedding')
    embedding = load_glove(glove_source_path, vocab_size, vocab.stoi)
    logger.info('save pretrained embedding')
    np.save(embedding_path, embedding)
    logger.info('finish')
Example #24
def load_data_loader(args, mode='train'):
    if mode == 'train':
        train_dir_s = os.path.join(args.data_path, 'train/train.en')
        train_dir_t = os.path.join(args.data_path, 'train/train.de')
        load_raw_data_to_csv(train_dir_s, train_dir_t, mode='train')
        path = './train.csv'
    elif mode == 'test':
        test_dir_s = os.path.join(args.data_path, 'test/test.en')
        test_dir_t = os.path.join(args.data_path, 'test/test.de')
        load_raw_data_to_csv(test_dir_s, test_dir_t, mode='test')
        path = './test.csv'

    source, target = create_fields(args)

    data = TabularDataset(path=path,
                          format='csv',
                          fields=[('source', source), ('target', target)])

    data_loader = BucketIterator(data,
                                 batch_size=args.batch_size,
                                 sort_key=lambda x: len(x.source),
                                 shuffle=True)

    source.build_vocab(data)
    target.build_vocab(data)

    if os.path.isfile('train.csv'):
        os.remove('train.csv')
    if os.path.isfile('test.csv'):
        os.remove('test.csv')

    return data_loader, source, target
Example #25
    def _process_data(self, filepath, train_dev_ratio):
        """ preprocess dataset

        Args:
            filepath: string, the path of dataset
            train_dev_ratio: a float, the ratio to split train and dev dataset

        Returns:
            A tuple of torchtext.data.Dataset objects: (train, dev)
        """
        train, dev = TabularDataset(
            path=filepath,
            format='csv',
            fields=[('text', self.text_field), ('label', self.label_field)],
            csv_reader_params=dict(delimiter='\t')).split(
                split_ratio=train_dev_ratio)

        train_words = list(map(lambda x: len(x.text), train.examples))
        train_labels = list(map(lambda x: int(x.label), train.examples))
        dev_words = list(map(lambda x: len(x.text), dev.examples))
        dev_labels = list(map(lambda x: int(x.label), dev.examples))

        print('----------------------------------------------------------')
        print('train: min words={}, max words={}, counter={}'.format(
            min(train_words), max(train_words), str(Counter(train_labels))))
        print('dev: min words={}, max words={}, counter={}'.format(
            min(dev_words), max(dev_words), str(Counter(dev_labels))))
        print('----------------------------------------------------------')
        print('\n')

        return train, dev
Example #26
def make_small_imdb(batch_size=8, device=-1, vectors=None):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # TEXT = data.Field(include_lengths=False, lower=True, batch_first=True)
    TEXT = data.Field(tokenize=get_tokenizer("basic_english"),
                      init_token='<sos>',
                      eos_token='<eos>',
                      lower=True,
                      batch_first=False)
    LABEL = data.LabelField()

    datafields = [('text', TEXT), ('label', LABEL)]
    train, test = TabularDataset.splits(path='.',
                                        train='train.csv',
                                        validation='cv.csv',
                                        format='csv',
                                        skip_header=True,
                                        fields=datafields)

    TEXT.build_vocab(train, test, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train, test)
    train_iter, test_iter = BucketIterator.splits(
        (train, test),
        batch_sizes=(128, 128),
        device=device,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        repeat=False)

    return train_iter, test_iter, TEXT, LABEL
Example #27
def preprocess(**kwargs):
    SRC = Field(include_lengths=False,
                init_token=None,
                pad_token="<pad>",
                unk_token="<unk>",
                lower=True,
                batch_first=False,
                tokenize=lambda text: list(text.strip()))
    _train, _test = TabularDataset.splits(
        path="data",
        root="data",
        train="train.weibo.txt",
        test="test.weibo.txt",
        format='tsv',
        skip_header=False,
        fields=[("text", SRC), ("label", SRC), ("target", SRC)],
        csv_reader_params={"quoting": csv.QUOTE_NONE})
    SRC.build_vocab(_train, min_freq=5)
    train_iter = BucketIterator(_train,
                                batch_size=kwargs["batch_size"],
                                train=True,
                                sort_within_batch=True,
                                sort_key=lambda x: (len(x.text)),
                                repeat=False,
                                device=device)
    test_iter = BucketIterator(_test, batch_size=1, train=False, device=device)
    return train_iter, test_iter, SRC
Example #28
def preprocess_data_for_RNN(vectors, batch_size, train_tagged_sentences,
                            max_vocab_size, min_frequency):
    """
    preprocess the train tagged sentences, and use BucketIterator for training.
    """
    df = build_corpus_text_df(train_tagged_sentences)
    df.to_csv('train_text_data.csv', index=False)
    text_field = Field(lower=True, batch_first=True)
    tags_field = Field(batch_first=True)

    fields = [('text', text_field), ('tags', tags_field)]
    # TabularDataset

    train_data = TabularDataset(path='train_text_data.csv',
                                format='CSV',
                                fields=fields,
                                skip_header=True)

    # Iterators
    data_iter = BucketIterator(train_data, batch_size=batch_size)

    # Vocabulary
    text_field.build_vocab(train_data,
                           vectors=vectors,
                           min_freq=min_frequency,
                           max_size=max_vocab_size)
    tags_field.build_vocab(train_data,
                           min_freq=min_frequency,
                           max_size=max_vocab_size)

    pad_index = text_field.vocab.stoi[text_field.pad_token]
    tag_pad_index = tags_field.vocab.stoi[tags_field.pad_token]
    return data_iter, pad_index, tag_pad_index, text_field, tags_field
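The two padding indices returned above are typically used to mask padded positions in the tagging loss; a minimal sketch in which the input arguments and the tagger model are hypothetical placeholders:

import torch.nn as nn

data_iter, pad_index, tag_pad_index, text_field, tags_field = preprocess_data_for_RNN(
    vectors, 32, train_tagged_sentences, max_vocab_size=25000, min_frequency=1)

criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_index)   # skip loss on padded tags

for batch in data_iter:
    words = batch.text   # (batch_size, seq_len) because batch_first=True
    tags = batch.tags
    # logits = model(words); loss = criterion(logits.view(-1, n_tags), tags.view(-1))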
Example #29
def build_and_cache_dataset(config: Config, mode='train'):
    """
    Returns the Field for each attribute, together with all of the attribute values:
    (id, category, news), datasets
    (Field, Field, Field), TabularDataset
    """
    # id has already been serialized
    ID = Field(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    NEWS = Field(
        sequential=True,
        tokenize=jieba.lcut,
        include_lengths=True,
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    logger.info("从当前目录创建特征 %s", config.dataset_dir)

    # `\t`-separated
    dataset = TabularDataset(
        os.path.join(config.dataset_dir, f'{mode}.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )

    # TabularDataset.split()
    features = ((ID, CATEGORY, NEWS), dataset)
    return features
Example #30
    def create_dataset(self):
        SOURCE = Field(
            sequential=True,
            tokenize=x_tokenize,
            use_vocab=False,
            batch_first=True,
            fix_length=self.fix_length,  # set fix_length for static padding; note it must be larger than the longest text
            eos_token=None,
            init_token=None,
            include_lengths=True,
            pad_token=0)

        TARGET = Field(
            sequential=True,
            tokenize=x_tokenize,
            use_vocab=False,
            batch_first=True,
            fix_length=self.fix_length,  # set fix_length for static padding; note it must be larger than the longest text
            eos_token=None,
            init_token=None,
            include_lengths=False,
            pad_token=-1)

        fields = {'source': ('source', SOURCE), 'target': ('target', TARGET)}

        train, valid = TabularDataset.splits(path=config.ROOT_DIR,
                                             train=self.train_path,
                                             validation=self.valid_path,
                                             format="json",
                                             skip_header=False,
                                             fields=fields)
        return train, valid
Example #31
def pad_under_five(toknized):
    """
    Because the model uses 5-gram filters,
    sentences shorter than 5 tokens are filled with <pad>
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"]*(5-len(toknized)))
    return toknized

TEXT = Field(tokenize=tagger.morphs, lower=True, include_lengths=False, batch_first=True, preprocessing=pad_under_five)
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

train_data, test_data = TabularDataset.splits(path=DATA_PATH + '/nsmc/',
                                               train='ratings_train.txt',
                                               test='ratings_test.txt',
                                               format='tsv',
                                               skip_header=True,
                                               fields=[('id', None), ('text', TEXT), ('label', LABEL)],
                                               filter_pred=lambda x: len(x.text) > 1)
# only keep sentences whose token-level length is greater than 1

TEXT.build_vocab(train_data,min_freq=2)
LABEL.build_vocab(train_data)

# print (TEXT.vocab)
# print (len(TEXT.vocab),len(LABEL.vocab))

# print (TEXT.vocab.itos[:5])
# print (LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits((train_data,test_data),sort_key=lambda x:len(x.text), sort_within_batch=True,