def __init__(self, root_dir='data', batch_size=64, use_vector=True):
        self.TEXT = Field(sequential=True, use_vocab=True,
                          tokenize='spacy', lower=True, batch_first=True)
        self.LABEL = LabelField(dtype=torch.float)
        vectors = Vectors(name='mr_vocab.txt', cache='./')

        dataset_path = os.path.join(root_dir, '{}.tsv')
        self.dataset = {}
        self.dataloader = {}
        for target in ['train', 'dev', 'test']:
            self.dataset[target] = TabularDataset(
                path=dataset_path.format(target),
                format='tsv',
                fields=[('text', self.TEXT), ('label', self.LABEL)]
            )
            if use_vector:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000, vectors=vectors)
            else:
                self.TEXT.build_vocab(self.dataset[target], max_size=25000)

            self.LABEL.build_vocab(self.dataset[target])
            self.dataloader[target] = Iterator(self.dataset[target],
                                               batch_size=batch_size,
                                               device=None,
                                               repeat=False,
                                               sort_key=lambda x: len(x.text),
                                               shuffle=True)
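
A minimal usage sketch for the loader above, assuming this __init__ belongs to a wrapper class (called MRDataLoader here, a hypothetical name) and that legacy torchtext (torchtext <= 0.8, or torchtext.legacy in newer releases) plus the data/{train,dev,test}.tsv files are available:

# Hypothetical driver; MRDataLoader stands in for the class that owns the __init__ above.
loader = MRDataLoader(root_dir='data', batch_size=64, use_vector=True)

for batch in loader.dataloader['train']:
    text = batch.text    # LongTensor of token ids, shape (batch, seq_len) since batch_first=True
    label = batch.label  # FloatTensor of labels
    break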
Example #2
def build_dataset(fpath, mode='train'):
    # For more info about torchtext.data,
    # turn to https://pytorch.org/text/data.html
    tokenize = lambda x: x.split()
    ID = Field(sequential=False, use_vocab=False)
    # NOTE: CATEGORY_CODE could be ignored
    CATEGORY_CODE = LabelField(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=False)
    NEWS = Field(
        sequential=True,
        use_vocab=False,
        tokenize=tokenize,
        include_lengths=True,
    )

    # Format of dataset:
    # 6552431613437805063_!_102_!_news_entertainment_!_谢娜为李浩菲澄清网络谣言,之后她的两个行为给自己加分_!_佟丽娅,网络谣言,快乐大本营,李浩菲,谢娜,观众们
    fields = [
        ('id', ID),
        ('category_code', CATEGORY_CODE),
        ('category', CATEGORY),
        ('news', NEWS),
        (None, None),
    ]

    # Since dataset is split by `_!_`.
    dataset = TabularDataset(
        fpath,
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '_!_'},
    )
    return (ID, CATEGORY, NEWS), dataset
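
A hedged usage sketch for build_dataset above (legacy torchtext assumed; the file path is a placeholder). Note that because NEWS is declared with use_vocab=False, the news column must already contain integer token ids; for raw text you would set use_vocab=True and call NEWS.build_vocab(train_set) before iterating.

# Hypothetical driver for build_dataset; 'toutiao_train.txt' is a placeholder path.
from torchtext.data import BucketIterator

(ID, CATEGORY, NEWS), train_set = build_dataset('toutiao_train.txt', mode='train')

train_iter = BucketIterator(
    train_set,
    batch_size=32,
    sort_key=lambda ex: len(ex.news),
    sort_within_batch=True,  # include_lengths=True yields (padded ids, lengths) per batch
)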
Example #3
    def load(self,
             text_label_col: str = "text",
             targets=('label', ),
             delimiter: str = ",",
             quotechar: str = '"'):
        field_headers = list(
            pd.read_csv(self.path_to_datadir + "train.csv",
                        quotechar=quotechar,
                        sep=delimiter))
        dset_row = []
        for header in field_headers:
            if header == text_label_col:
                dset_row.append((text_label_col, self.text_field))
            elif header in targets:
                dset_row.append((header, LabelField(dtype=torch.long)))
            else:
                dset_row.append((header, None))

        train, test = TabularDataset.splits(path=self.path_to_datadir,
                                            train='train.csv',
                                            test="test.csv",
                                            format='csv',
                                            skip_header=True,
                                            fields=dset_row)

        if self.stratified_sampling:
            train, test = stratified_sampler(
                train,
                test,
                targets,
                text_field=self.text_field,
                label_field=LabelField(dtype=torch.long))
        return train, test
Example #4
def load_dataset_from_csv(params, device):
    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about how a column should be preprocessed.
    fix_length : TorchText normally allows variable-length input and dynamically pads each sequence to the
                 longest sequence in its batch. Here fix_length is used instead, which pads (or truncates)
                 every sequence to a fixed length of 128.

    build_vocab : First builds a vocabulary mapping every unique word in train_data to an index, then uses
                  the GloVe word embeddings to map each index to its corresponding embedding vector.

    vocab.vectors : Returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.

    """
    # define tokenizer
    en = English()

    def tokenize(sentence):
        return [tok.text for tok in en.tokenizer(sentence)]

    TEXT = Field(sequential=True, tokenize=tokenize, lower=True, eos_token='<eos>', batch_first=True, fix_length=128)
    LABEL = LabelField()

    fields_list = [('Unnamed: 0', None),
                   ('text', TEXT),
                   ('conf', None),
                   ('label', LABEL)]
    base_path = params.DATA_PATH
    train_path = os.path.join(base_path, "filtered_train.csv")
    test_path = os.path.join(base_path, "filtered_test.csv")
    train_data = TabularDataset(path=train_path,  # the root directory where the data lies
                                format='csv',
                                skip_header=True,
                                fields=fields_list)

    test_data = TabularDataset(path=test_path,  # the root directory where the data lies
                               format='csv',
                               skip_header=True,
                               fields=fields_list)

    if params.VOCAB_USE_GLOVE:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ, vectors=GloVe(name='6B', dim=300))
        logging.info("Loaded Glove embedding, Vector size of Text Vocabulary: " + str(TEXT.vocab.vectors.size()))

    else:
        TEXT.build_vocab(train_data, test_data, min_freq=params.VOCAB_MIN_FREQ)
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    logging.info("Length of Text Vocabulary: " + str(len(TEXT.vocab)))

    train_iter, test_iter = data.BucketIterator.splits((train_data, test_data),
                                                       batch_sizes=(params.TRAIN_BATCH_SIZE, params.TRAIN_BATCH_SIZE),
                                                       sort_key=lambda x: len(x.text), repeat=False, shuffle=True,
                                                       device=device)
    # Disable shuffle
    test_iter.shuffle = False
    return TEXT, word_embeddings, train_iter, test_iter
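
A hedged sketch of how load_dataset_from_csv might be driven. The params object is whatever config class the original project uses; here a SimpleNamespace stands in, with attribute names copied from the function body and placeholder values:

# Hypothetical stand-in for the project's params/config object.
from types import SimpleNamespace
import torch

params = SimpleNamespace(
    DATA_PATH='data',        # directory containing filtered_train.csv / filtered_test.csv
    VOCAB_USE_GLOVE=False,   # set True to attach GloVe 6B-300d vectors
    VOCAB_MIN_FREQ=2,
    TRAIN_BATCH_SIZE=32,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

TEXT, word_embeddings, train_iter, test_iter = load_dataset_from_csv(params, device)
# word_embeddings is None unless VOCAB_USE_GLOVE is True.
for batch in train_iter:
    tokens, labels = batch.text, batch.label  # (batch, 128) token ids and class indices
    break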
def clean_quora(path='../data/train.csv', output='list', tokenizer = nltk.word_tokenize, device=DEVICE, batch_size=32):
    data = pd.read_csv(path)
    questions1 = data['question1'].astype('str').tolist()
    questions2 = data['question2'].astype('str').tolist()
    is_duplicates = data['is_duplicate'].tolist()
    
    if output == 'list':
        return questions1, questions2, is_duplicates
    
    elif output == 'tokenized_list':
        return [tokenizer(q) for q in questions1], [tokenizer(q) for q in questions2], is_duplicates
    
    elif output == 'iterator' or output == 'iterator_from_file':
        TEXT = Field(
                sequential=True,
                tokenize = tokenizer,
                pad_first = False,
                dtype = torch.long,
                lower = True,
                batch_first = True
                )
        TARGET = LabelField(use_vocab = False)
        
        if output == 'iterator':
            examples = [Example.fromlist((questions1[i], questions2[i], is_duplicates[i]),
                                         [('question1', TEXT),
                                          ('question2', TEXT),
                                          ('is_duplicate', TARGET)]) for i in range(len(questions1))]
            dataset = Dataset(examples, {'question1': TEXT, 'question2': TEXT, 'is_duplicate': TARGET})
    
        if output == 'iterator_from_file':
            dataset = TabularDataset(path, 'csv',
                                     [('question1', TEXT),
                                      ('question2', TEXT),
                                      ('is_duplicate', TARGET)],
                                     skip_header=True)
        
        iterator = BucketIterator(
                dataset,
                batch_size=batch_size,
                sort_key=lambda x: len(x.question1) + len(x.question2),
                sort_within_batch=False,
                repeat = False,
                device = device
                # repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
        )
        
        TEXT.build_vocab(dataset)
        TARGET.build_vocab(dataset)
        
        
        return iterator
        

        #dataset = TabularDataset(path, 'csv', [('review', TEXT), ('sentiment', TARGET)])
        
    else:
        raise ValueError('Processing type not understood')
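
A hedged sketch of the three output modes accepted by clean_quora above (the CSV path is a placeholder and must exist):

# Plain Python lists of the two questions and the duplicate flags.
q1, q2, dup = clean_quora(path='../data/train.csv', output='list')

# Same data, but with each question run through the tokenizer.
tok_q1, tok_q2, dup = clean_quora(path='../data/train.csv', output='tokenized_list')

# A BucketIterator built either in memory ('iterator') or straight from the file
# ('iterator_from_file'); both yield batches with question1/question2/is_duplicate.
train_iter = clean_quora(path='../data/train.csv', output='iterator', batch_size=64)
for batch in train_iter:
    a, b, y = batch.question1, batch.question2, batch.is_duplicate
    break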
Example #6
def load_data(preprocessing=None):
    # Fields for the dataset
    # The actual review message

    #TEXT = Field(tokenize='spacy') # -- Old way, unclear exactly what language model is used
    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 preprocessing=preprocessing)
    LABEL = LabelField(dtype=torch.float)

    # Get the entire dataset that we will then split
    data = TabularDataset(path=path,
                          format='tsv',
                          fields=[('text', TEXT), ('label', LABEL)])

    # We should probably look at the proportion of fake to non-fake in each of these
    # sets to make sure it is fairly even. Though probabilistically it should be, I suppose.
    train_data, valid_data, test_data = data.split(
        split_ratio=TRAIN_VAL_TEST_SPLIT, random_state=random.seed(SEED))
    #valid_data, test_data = test_data.split(split_ratio=VAL_TEST_SPLIT, random_state=random.seed(SEED))

    print('Size of train set: ' + str(len(train_data.examples)))
    print('Size of val / test: ' + str(len(valid_data.examples)))
    '''
    # Try loading in the IMDB dataset to label pos or negative
    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
    # Get train/valid split!!
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))
    '''

    # Now we need to build the vocab for our actual data
    # Here we will use the pre-trained word vectors from "glove.6B.100d"
    TEXT.build_vocab(train_data, max_size=25000, vectors="glove.6B.100d")
    LABEL.build_vocab(train_data)

    # Print stuff for sanity checks
    print('Size of the vocab: ' + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_itr, valid_itr, test_itr = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort_key=lambda x: len(x.text))

    return TEXT, train_itr, valid_itr, test_itr
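
A common follow-up (not shown in the snippet) is to copy the GloVe vectors that build_vocab attached into an nn.Embedding layer. A hedged sketch, assuming load_data and its module-level globals (path, tokenizer, SEED, BATCH_SIZE, ...) are defined as above:

import torch
import torch.nn as nn

TEXT, train_itr, valid_itr, test_itr = load_data()

vocab_size = len(TEXT.vocab)
embedding_dim = TEXT.vocab.vectors.size(1)        # 100 for glove.6B.100d
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
embedding.weight.data.copy_(TEXT.vocab.vectors)   # copy the pre-trained weights
embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)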
Example #7
def data_load_without_cv(fname, args, seed=1234, split_ratio=0.9):
    TEXT   = Field(sequential=True, tokenize=str.split, batch_first=True, fix_length=56, lower=True)
    LABEL  = LabelField(sequential=False, dtype=torch.float)
    FIELDS = [('label', LABEL), ('text', TEXT)]

    dataset = TabularDataset(fname, fields=FIELDS, format='csv', skip_header=True)

    train_dataset, valid_dataset  = dataset.split(random_state=random.seed(seed), split_ratio=split_ratio)

    TEXT.build_vocab(train_dataset)
    LABEL.build_vocab(train_dataset)

    train_iterator, valid_iterator = BucketIterator.splits((train_dataset, valid_dataset), batch_size=args.batch_size,
                                                           device=args.device, sort=False, shuffle=True)

    return TEXT, train_iterator, valid_iterator
def make_dataset(train_csv, val_csv, test_csv):
    '''
    Generates the training, validation and testing datasets as torchtext
    objects for easy incorporation with Pytorch (cleaning them in the process)

    Inputs:
        train_csv(str): name of training data csv
        val_csv(str): name of validation data csv
        test_csv(str): name of testing data csv

    Outputs:
        train: tabular dataset obj representing the training data
        test: tabular dataset obj representing the testing data
        val: tabular dataset obj representing the validation data
        text: torchtext field obj representing how text should be
            processed and stored
        label: torchtext labelfield obj representing labels should be
            processed and stored
    '''
    text = Field(sequential=True, tokenize=word_tokenize,
                 preprocessing=normalize_tokens)
    label = LabelField(dtype=torch.float)
    data_fields = [('dab_id', None), ('alj_id', None), ('alj_text', text),
                   ('decision_binary', label), ('dab_year', None)]
    train, val, test = TabularDataset.splits(path='', train=train_csv,
                                             validation=val_csv, test=test_csv,
                                             format='csv', fields=data_fields,
                                             skip_header=True)
    return train, test, val, text, label
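
make_dataset deliberately returns the fields without building vocabularies; a hedged sketch of the usual next steps (file names and batch size are placeholders, legacy torchtext assumed):

import torch
from torchtext.data import BucketIterator

train, test, val, text, label = make_dataset('train.csv', 'val.csv', 'test.csv')

text.build_vocab(train, max_size=25_000)
label.build_vocab(train)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = BucketIterator.splits(
    (train, val, test),
    batch_size=64,
    sort_key=lambda ex: len(ex.alj_text),  # field name defined in data_fields above
    device=device,
)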
Example #9
def build_and_cache_dataset(args, mode='train'):

    # TorchText loads data declaratively:
    # declare Field objects; each Field describes how one column should be processed.
    # sequential: treat the column as a token sequence; use_vocab: numericalize, i.e. map tokens to indices.
    ID = Field(sequential=False, use_vocab=False)  # not a sequence, not numericalized
    CATEGORY = LabelField(sequential=False, use_vocab=True,
                          is_target=True)  # not a sequence, numericalized via a vocab, marked as the target
    # tokenize takes a function that splits a text string into words or characters
    NEWS_TEXT = Field(
        sequential=True,  # a token sequence
        tokenize=jieba.lcut,  # tokenize with jieba
        include_lengths=True,  # each minibatch is returned as a (padded batch, lengths) tuple
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news_text', NEWS_TEXT),
    ]

    logger.info("Creating features from dataset file at %s", args.data_dir)

    # Since the dataset is delimited by `\t`:
    # every line of {mode}.csv is split on `\t` and turned into one Example object.
    dataset = TabularDataset(
        os.path.join(args.data_dir, f'{mode}.csv'),  # dataset path
        format='csv',  # dataset format
        fields=fields,  # how each column is processed; every Example gets 'id', 'category' and 'news_text' attributes
        csv_reader_params={'delimiter': '\t'},  # each row is split on `\t`, then processed according to the fields
    )

    features = ((ID, CATEGORY, NEWS_TEXT), dataset)
    return features
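
Because NEWS_TEXT is built with include_lengths=True, each batch yields a (padded ids, lengths) pair, which pairs naturally with pack_padded_sequence. A hedged sketch of consuming the returned (fields, dataset) tuple; args is a placeholder namespace and the embedding size is arbitrary:

import torch
from torch.nn.utils.rnn import pack_padded_sequence
from torchtext.data import BucketIterator
from types import SimpleNamespace

args = SimpleNamespace(data_dir='data')  # placeholder; train.csv must live here

(ID, CATEGORY, NEWS_TEXT), train_set = build_and_cache_dataset(args, mode='train')
NEWS_TEXT.build_vocab(train_set, max_size=50_000)
CATEGORY.build_vocab(train_set)

train_iter = BucketIterator(
    train_set,
    batch_size=64,
    sort_key=lambda ex: len(ex.news_text),
    sort_within_batch=True,  # keeps lengths sorted descending, as packing expects
)

embed = torch.nn.Embedding(len(NEWS_TEXT.vocab), 128)
for batch in train_iter:
    tokens, lengths = batch.news_text                      # include_lengths=True -> tuple
    packed = pack_padded_sequence(embed(tokens), lengths)  # (seq_len, batch, 128) packed
    break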
Example #10
    def load(self,
             text_label_col: str = "text",
             targets=('label', ),
             delimiter: str = ",",
             quotechar: str = '"'):
        field_headers = list(
            pd.read_csv(self.path_to_datadir, quotechar=quotechar))
        dset_row = []
        for header in field_headers:
            if header == text_label_col:
                dset_row.append((text_label_col, self.text_field))
            elif header in targets:
                dset_row.append((header, LabelField(dtype=torch.long)))
            else:
                dset_row.append((header, None))

        train = TabularDataset(path=self.path_to_datadir,
                               format="csv",
                               fields=dset_row,
                               skip_header=True,
                               csv_reader_params={
                                   "delimiter": delimiter,
                                   "quotechar": quotechar
                               })
        return train
Example #11
def build_and_cache_dataset(args, mode='train'):

    ID = Field(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    NEWS = Field(
        sequential=True,
        tokenize=jieba.lcut,  # a custom function (i.e. any other tokenizer) can be used here
        include_lengths=True,
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    logger.info("Creating features from dataset file at %s", args.data_dir)

    # Since dataset is split by `\t`.
    dataset = TabularDataset(
        os.path.join(args.data_dir, f'{mode}.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )

    features = ((ID, CATEGORY, NEWS), dataset)
    return features
Example #12
def build_and_cache_dataset(config: Config, mode='train'):
    """
    Return the Field for each attribute together with the full dataset:
    (id, category, news), dataset
    (Field, Field, Field), TabularDataset
    """
    # id is already numeric, so it needs neither a vocab nor tokenization
    ID = Field(sequential=False, use_vocab=False)
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    NEWS = Field(
        sequential=True,
        tokenize=jieba.lcut,
        include_lengths=True,
    )

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    logger.info("从当前目录创建特征 %s", config.dataset_dir)

    # split on `\t`
    dataset = TabularDataset(
        os.path.join(config.dataset_dir, f'{mode}.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )

    # TabularDataset.split()
    features = ((ID, CATEGORY, NEWS), dataset)
    return features
Example #13
    def __init__(self,
                 spacy_model_name,
                 question_vocab_dir,
                 answer_vocab_dir,
                 right_answer_col,
                 nn_weights_path,
                 batch_size=128,
                 device=tt.device('cpu')):
        self.parser = spacy.load(spacy_model_name)

        vocab = load_vocab(question_vocab_dir)
        preprocess = lambda x: [i if i in vocab.stoi else '<unk>' for i in x]
        self.TOKENS = Field(lower=True, preprocessing=preprocess)
        self.TOKENS.vocab = load_vocab(question_vocab_dir)

        self.ANSWER = LabelField(dtype=tt.int64,
                                 use_vocab=True,
                                 unk_token='<unk>')
        self.ANSWER.vocab = load_vocab(answer_vocab_dir)
        #self.ANSWER.vocab.vectors = Vectors(gensim_vectors_path)

        self.device = device
        self.nn_weights_path = nn_weights_path
        self.batch_size = batch_size

        self.model = None
        self.right_answer_col = right_answer_col
Exemple #14
0
 def __build_field(self):
     self.TEXT = Field(sequential=True,
                       use_vocab=True,
                       lower=True,
                       tokenize=tokenizer,
                       include_lengths=True,
                       batch_first=self._config.data.batch_first,
                       pad_token='[PAD]',
                       unk_token='[UNK]')
     # self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True,
     #                  batch_first=self._config.data.batch_first)
     self.TAG = LabelField(
         sequential=True,
         use_vocab=True,
         tokenize=tokenizer,
         is_target=True,
     )
     self._fields = [('text', self.TEXT), ('tag', self.TAG)]
     pass
Example #15
class ReviewsDataset():
    def __init__(self, data_path, train_path):

        ## write the tokenizer
        tokenize = lambda review: review.split()
        ## define your fields; for the ID field you can use the RawField class
        self.TEXT = Field(sequential=True,
                          use_vocab=True,
                          tokenize=tokenize,
                          lower=True)
        self.LABEL = LabelField()

        self.fields = [
            ("PhraseId", None
             ),  # we won't be needing the id, so we pass in None as the field
            ("SentenceId", None),
            ("Phrase", self.TEXT),
            ("Sentiment", self.LABEL)
        ]  #{ 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }
        ## set paths
        self.data_path = data_path
        self.train_path = train_path

    def load_data(self):
        self.train_data = TabularDataset.splits(
            path='{}'.format(self.data_path),
            train='{}'.format(self.train_path),
            format='tsv',
            fields=self.fields)[0]

        self.TEXT.build_vocab(self.train_data, max_size=10000, min_freq=1)
        self.LABEL.build_vocab(self.train_data)
        self.train_iterator, _ = BucketIterator.splits(
            (self.train_data, None),
            batch_sizes=(64, 64),
            sort_within_batch=True,
            sort_key=lambda x: len(x.Phrase))

    def __str__(self):
        return 'review: {} \n sentiment: {}'.format(
            self.train_data[0].Phrase,
            self.train_data[0].Sentiment)
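
A hedged usage sketch for the ReviewsDataset wrapper above (paths are placeholders; the TSV is expected to have PhraseId/SentenceId/Phrase/Sentiment columns in that order):

ds = ReviewsDataset(data_path='data', train_path='train.tsv')
ds.load_data()

print('vocab size:', len(ds.TEXT.vocab))
print('labels:', list(ds.LABEL.vocab.stoi))

for batch in ds.train_iterator:
    phrases, sentiments = batch.Phrase, batch.Sentiment
    break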
 def _build_fields(self) -> Dict[str, Field]:
     fields = {
         'syllable_contents':
         Field(sequential=True, use_vocab=True, batch_first=True),
         'label':
         LabelField(sequential=False,
                    use_vocab=False,
                    dtype=torch.float32,
                    batch_first=True)
     }
     return fields
Example #17
def construct_field(
    field_type,
    batch_first=True,
    input_lower=True,
    lemmatized=False,
    input_include_lengths=True,
    input_fix_length=None,
):
    """ Construct TorchText field.

        Note: the `input_<x>` fields are specifically parameters for
              the `input_text` field type.
    """
    if field_type == 'input_text':
        if lemmatized:
            tokenizer = tokenize_fct_lemmatize
        else:
            tokenizer = tokenize_fct
        return SplitReversibleField(sequential=True,
                                    use_vocab=True,
                                    init_token=Constants.START_TOKEN,
                                    eos_token=Constants.END_TOKEN,
                                    lower=input_lower,
                                    tokenize=tokenizer,
                                    batch_first=batch_first,
                                    pad_token=Constants.PAD_TOKEN,
                                    unk_token=Constants.UNK_TOKEN,
                                    include_lengths=input_include_lengths,
                                    fix_length=input_fix_length,
                                    preprocessing=gen_text_preprocessor())
    elif field_type == 'numeric_label':
        return LabelField(
            use_vocab=False,
            batch_first=batch_first,
        )
    elif field_type == 'bool_label':
        return LabelField(use_vocab=False,
                          batch_first=batch_first,
                          preprocessing=lambda x: (x == 'True'))
    else:
        raise Exception('Invalid Field Type')
Example #18
    def __init__(self, data_path, train_path):

        ## write the tokenizer
        tokenize = lambda review: review.split()
        ## define your fields; for the ID field you can use the RawField class
        self.TEXT = Field(sequential=True,
                          use_vocab=True,
                          tokenize=tokenize,
                          lower=True)
        self.LABEL = LabelField()

        self.fields = [
            ("PhraseId", None
             ),  # we won't be needing the id, so we pass in None as the field
            ("SentenceId", None),
            ("Phrase", self.TEXT),
            ("Sentiment", self.LABEL)
        ]  #{ 'Phrase': ('r', self.review), 'Sentiment': ('s', self.sentiment) }
        ## set paths
        self.data_path = data_path
        self.train_path = train_path
Example #19
def pre_process_text():

    ID = Field(sequential=False, use_vocab=False)
    # CATEGORY: a non-sequential label; use_vocab=True builds a vocab, is_target=True marks it as the target
    CATEGORY = LabelField(sequential=False, use_vocab=True, is_target=True)
    # NEWS: sequential text tokenized with jieba.lcut; include_lengths returns the original lengths for the RNN
    NEWS = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)

    fields = [
        ('id', ID),
        (None, None),
        ('category', CATEGORY),
        ('news', NEWS),
    ]

    # load the datasets
    train_data = TabularDataset(
        os.path.join('data', 'train.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )    
    valid_data = TabularDataset(
        os.path.join('data', 'dev.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )    
    test_data = TabularDataset(
        os.path.join('data', 'test.csv'),
        format = 'csv',
        fields = fields,
        csv_reader_params={'delimiter': '\t'}
    )
    
    # build the vocabularies
    NEWS.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    CATEGORY.build_vocab(train_data)

    return CATEGORY, NEWS, train_data, valid_data, test_data
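
A hedged sketch of wrapping the datasets returned by pre_process_text in BucketIterators (legacy torchtext and the data/*.csv files assumed):

import torch
from torchtext.data import BucketIterator

CATEGORY, NEWS, train_data, valid_data, test_data = pre_process_text()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda ex: len(ex.news),
    sort_within_batch=True,  # NEWS uses include_lengths=True
    device=device,
)

for batch in train_iter:
    (tokens, lengths), labels = batch.news, batch.category
    break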
Example #20
 def create_field(field_type, tokenizer=None):
     """Return a Field-like object using the specified config."""
     if field_type == "label":
         return LabelField(dtype=torch.long)
     elif field_type == "text":
         return Field(tokenize='spacy', lower=True)
     elif field_type == "contextual":
         return Field(batch_first=True,
                      use_vocab=False,
                      tokenize=bert_tokenize_and_cut(tokenizer),
                      preprocessing=tokenizer.convert_tokens_to_ids,
                      init_token=tokenizer.cls_token_id,
                      eos_token=tokenizer.sep_token_id,
                      pad_token=tokenizer.pad_token_id,
                      unk_token=tokenizer.unk_token_id)
     else:
         raise ValueError(f'{field_type} was not recognized')
Example #21
    def __init__(self,
                 data_path,
                 test=False,
                 stop_words_path=None,
                 bert_model_path=None,
                 batch_first=False,
                 include_lengths=False,
                 tokenizer_language='cn'):
        """
        :param data_path:
        :param test: if True this is a test set, so labels are not loaded
        :param stop_words_path:
        :param batch_first:
        :param include_lengths:
        """
        self.data = pd.read_csv(data_path)
        print('read data from {}'.format(data_path))
        self.text_field = "review"
        self.label_field = "label"
        self.test = test

        if stop_words_path:
            stop_words = read_stop_words(stop_words_path)
        else:
            stop_words = None

        self.LABEL = LabelField(sequential=False,
                                use_vocab=False,
                                dtype=torch.float)

        # lambda x: [y for y in x]
        # bert_tokenizer = BertTokenizer.from_pretrained(bert_model_path)
        # pad_index = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.pad_token)
        # unk_index = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.unk_token)
        self.TEXT = Field(
            use_vocab=True,
            sequential=True,
            stop_words=stop_words,
            tokenize=lambda x: [y for y in x],
            batch_first=batch_first,
            tokenizer_language=tokenizer_language,
            include_lengths=include_lengths)  # include_lengths=True for LSTM

        self.fields = [("text", self.TEXT), ("label", self.LABEL)]

        self.examples = self.build_examples()
    def __init__(
        self,
        data,
        text_field,
        label_field,
        test=False,
        stop_words_path=None,
        batch_first=False,
        include_lengths=False,
        tokenizer_language='cn',
    ):
        if stop_words_path:
            stop_words = read_stop_words(stop_words_path)
        else:
            stop_words = None

        self.LABEL = LabelField(sequential=False,
                                use_vocab=False,
                                dtype=torch.float)

        # lambda x: [y for y in x]
        self.TEXT = Field(
            sequential=True,
            stop_words=stop_words,
            tokenize=lambda x: [y for y in x],
            batch_first=batch_first,
            tokenizer_language=tokenizer_language,
            include_lengths=include_lengths)  # include_lengths=True for LSTM

        fields = [("text", self.TEXT), ("label", self.LABEL)]

        examples = []
        if test:
            # test set: labels are not loaded
            for text in tqdm(data[text_field]):
                examples.append(Example.fromlist([text, None], fields))
        else:
            for text, label in tqdm(zip(data[text_field], data[label_field])):
                # Example: Defines a single training or test example.
                # Stores each column of the example as an attribute.
                examples.append(Example.fromlist([text, label], fields))
        # After the preprocessing above, call the parent constructor via super() to produce a standard Dataset
        super(NLPDataset, self).__init__(examples, fields)
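
A hedged usage sketch: build an NLPDataset from a small in-memory DataFrame, then build the text vocabulary and iterate (column names are placeholders; the character-level tokenizer above works on any string):

import pandas as pd
from torchtext.data import BucketIterator

df = pd.DataFrame({'review': ['great movie', 'terrible plot'], 'label': [1.0, 0.0]})

train_set = NLPDataset(df, text_field='review', label_field='label')
train_set.TEXT.build_vocab(train_set)  # LABEL has use_vocab=False, so it needs no vocab

train_iter = BucketIterator(train_set,
                            batch_size=2,
                            sort_key=lambda ex: len(ex.text),
                            sort_within_batch=True)
for batch in train_iter:
    text, label = batch.text, batch.label
    break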
Example #23
def build_and_cache_dataset(data_path=r"E:\Workspaces\Python\KG\QA_healty39\data"):
    """
    Return the Field for each attribute together with the full dataset:
    (question, intention), dataset
    (Field, Field), TabularDataset
    """
    QUESTION = Field(sequential=True, tokenize=jieba.lcut, include_lengths=True)
    INTENTION = LabelField(sequential=False, use_vocab=True, is_target=True)
    fields = [
        ('question', QUESTION),
        ('intention',  INTENTION),
    ]

    # split on `\t`
    dataset = TabularDataset(
        os.path.join(data_path, 'qa.csv'),
        format='csv',
        fields=fields,
        csv_reader_params={'delimiter': '\t'},
    )
    features = ((QUESTION, INTENTION), dataset)
    return features
Example #24
    def _build_loader(self):
        print("Loading data...")

        TEXT = Field(batch_first=True, fix_length=self.args.max_words)
        LABEL = LabelField(sequential=False, batch_first=True, use_vocab=False)
        field = [('text', TEXT), ('label', LABEL)]

        train = get_dataset("train", field)
        test = get_dataset("test", field)
        evl = get_dataset("eval", field)
        TEXT.build_vocab(train, test, evl, min_freq=3)

        self.vocab = TEXT
        self.train_iter, self.test_iter, self.evl_iter = BucketIterator.splits(
            (train, test, evl),
            batch_sizes=(self.args.batch_size, self.args.batch_size,
                         self.args.batch_size),
            device=self.device,
            shuffle=True,
            sort=False,
            repeat=False,
        )
Example #25
    def load(self,
             delimiter: str = ",",
             quotechar: str = '"',
             text_col_name: str = 'text',
             label_col_name: str = 'label') -> TabularDataset:
        """

        This method loads the data from the csv file and converts it into a torchtext TabularDataset.
        It automatically selects only the columns from the file that are specified by the
        'text_col_name' and 'label_col_name' parameters.

        :param delimiter: string specifying the delimiter used when reading in the csv file
        :param quotechar: string specifying the quotechar used when reading in the csv file
        :param text_col_name: string specifying the name of the column in the csv file containing \
        the text of the data point
        :param label_col_name: string specifying the name of the column in the csv file containing the \
        label of the data point
        :return: torchtext.data.TabularDataset
        """
        file_headers = list(
            pd.read_csv(self.file_name, sep=delimiter, quotechar=quotechar))
        dset_row = []
        for header in file_headers:
            if header == text_col_name:
                dset_row.append((text_col_name, self.text_field))
            elif header == label_col_name:
                dset_row.append((label_col_name, LabelField(dtype=torch.long)))
            else:
                dset_row.append((header, None))

        dataset = TabularDataset(path=self.file_name,
                                 format="csv",
                                 fields=dset_row,
                                 skip_header=True,
                                 csv_reader_params={
                                     "delimiter": delimiter,
                                     "quotechar": quotechar
                                 })
        return dataset
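
A hedged sketch of driving the load() method above, assuming it belongs to a loader class (called CSVLoader here, a hypothetical name) that is constructed with the .file_name and .text_field attributes the method relies on:

from torchtext.data import Field

text_field = Field(tokenize=lambda s: s.split(), lower=True)
loader = CSVLoader(file_name='reviews.csv', text_field=text_field)  # placeholder path

dataset = loader.load(text_col_name='text', label_col_name='label')

text_field.build_vocab(dataset)
dataset.fields['label'].build_vocab(dataset)  # the LabelField created inside load()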
Example #26
 def prepare_fields_word_char(columns, word_ids_col, char_ids_col, len_col,
                              mask_col, inf_mask_col):
     long_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.long)
     float_field = Field(sequential=False,
                         use_vocab=False,
                         batch_first=True,
                         dtype=torch.float)
     label_field = LabelField(sequential=False,
                              use_vocab=True,
                              batch_first=True)
     fields = list()
     for column in columns:
         if column in (word_ids_col, char_ids_col, len_col, mask_col):
             fields.append((column, long_field))
         elif column in (inf_mask_col, ):
             fields.append((column, float_field))
         else:
             fields.append((column, label_field))
     return long_field, float_field, label_field, fields
Example #27
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--model',
        type=str,
        default='rnn',
        help=
        "Available models are: 'rnn', 'cnn', 'bilstm', 'fasttext', and 'distilbert'\nDefault is 'rnn'"
    )
    parser.add_argument('--train_data_path',
                        type=str,
                        default="./data/train_clean.csv",
                        help="Path to the training data")
    parser.add_argument('--test_data_path',
                        type=str,
                        default="./data/dev_clean.csv",
                        help="Path to the test data")
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--vectors',
                        type=str,
                        default='fasttext.simple.300d',
                        help="""
                                Pretrained vectors:
                                Visit 
                                https://github.com/pytorch/text/blob/9ce7986ddeb5b47d9767a5299954195a1a5f9043/torchtext/vocab.py#L146
                                for more 
                                """)
    parser.add_argument('--max_vocab_size', type=int, default=750)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--bidirectional', type=bool, default=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--hidden_dim', type=int, default=64)
    parser.add_argument('--output_dim', type=int, default=1)
    parser.add_argument('--n_layers', type=int, default=2)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--n_epochs', type=int, default=5)
    parser.add_argument('--n_filters', type=int, default=100)
    parser.add_argument('--filter_sizes', type=list, default=[3, 4, 5])

    args = parser.parse_args()

    torch.manual_seed(args.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ##########  BILSTM ##########

    if args.model == "bilstm":
        print('\nBiLSTM')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors,
                         unk_init=torch.Tensor.normal_)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)
        pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
        unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

        model = BiLSTM(input_dim, embedding_dim, args.hidden_dim,
                       args.output_dim, args.n_layers, args.bidirectional,
                       args.dropout, pad_idx)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)
        model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
        model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.139, Test Acc: 95.27%

    ##########  VANILLA RNN ##########

    else:
        print('\nVanilla RNN')
        TEXT = Field(tokenize='spacy')
        LABEL = LabelField(dtype=torch.float)
        data_fields = [("text", TEXT), ("label", LABEL)]

        train_data = TabularDataset(args.train_data_path,
                                    format='csv',
                                    fields=data_fields,
                                    skip_header=True,
                                    csv_reader_params={'delimiter': ","})

        test_data = TabularDataset(args.test_data_path,
                                   format='csv',
                                   fields=data_fields,
                                   skip_header=True,
                                   csv_reader_params={'delimiter': ","})

        train_data, val_data = train_data.split(split_ratio=0.8,
                                                random_state=random.seed(
                                                    args.seed))

        TEXT.build_vocab(train_data,
                         max_size=args.max_vocab_size,
                         vectors=args.vectors)
        LABEL.build_vocab(train_data)

        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, val_data, test_data),
            batch_size=args.batch_size,
            sort_key=lambda x: len(x.text),
            device=device)

        input_dim = len(TEXT.vocab)
        embedding_dim = get_embedding_dim(args.vectors)

        model = RNN(input_dim, embedding_dim, args.hidden_dim, args.output_dim)

        pretrained_embeddings = TEXT.vocab.vectors

        model.embedding.weight.data.copy_(pretrained_embeddings)

        optimizer = optim.Adam(model.parameters(), lr=args.lr)
        criterion = nn.BCEWithLogitsLoss()

        model.to(device)
        criterion.to(device)

        best_valid_loss = float('inf')

        print("\nTraining...")
        print("===========")
        for epoch in range(1, args.n_epochs + 1):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(),
                           './checkpoints/{}-model.pt'.format(args.model))

            print(
                f'[Epoch: {epoch:02}] | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            print(
                f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%'
            )
            print(
                f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%'
            )

        model.load_state_dict(
            torch.load('./checkpoints/{}-model.pt'.format(args.model)))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)

        print('\nEvaluating...')
        print("=============")
        print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%'
              )  # Test Loss: 0.138, Test Acc: 95.05%
def load_dataset(batch_size, cache_data=True, test_sen=None):

    if cache_data:
        print("Caching Data")
        office_actions = pd.read_csv(
            '../data/office_actions.csv',
            index_col='app_id',
            usecols=['app_id', 'rejection_102', 'rejection_103'],
            dtype={
                'app_id': int,
                'rejection_102': int,
                'rejection_103': int
            },
            nrows=200000)

        abstractList = []
        idList = []
        rejectionColumn = []
        obviousCount = 0
        notCount = 0
        path = "/scratch/dm4350/json_files/"
        count = 0

        for filename in os.listdir(path):

            if count % 1000 == 0:
                print(count)

            filepath = path + filename
            try:
                jfile = open(filepath, 'r')
            except FileNotFoundError:
                print("File Not Found")
                continue

            try:
                parsed_json = json.load(jfile)
                jfile.close()
            except UnicodeDecodeError:
                print("WARNING: UnicodeDecodeError")
                continue
            except json.decoder.JSONDecodeError:
                print("WARNING: JSONDecodeError")
                continue

            app_id = int(
                filename.replace("oa_", "").replace(".json",
                                                    "").replace("(1)", ""))
            try:
                row = office_actions.loc[app_id]
            except KeyError:
                print("WARNING: KeyError")
                continue

            try:
                n = int(row.rejection_102)
                o = int(row.rejection_103)
            except TypeError:
                n = int(row.rejection_102.iloc[0])
                o = int(row.rejection_103.iloc[0])

            if n == 0 and o == 0:
                rejType = 0  #neither
            elif n == 0 and o == 1:
                rejType = 1  #obvious
            elif n == 1 and o == 0:
                rejType = 0  #novelty
            elif n == 1 and o == 1:
                rejType = 1  #both
            else:
                print("Office actions dataframe error:", sys.exc_info()[0])
                raise

            if obviousCount >= notCount and rejType == 1:
                continue

            obviousCount += o
            notCount += not (o)

            # Skip any files not in the appropriate IPC class
            try:
                found_A61 = False
                for s in parsed_json[0]['ipc_classes']:
                    if (s.find("A61") != -1):
                        found_A61 = True
                if not found_A61:
                    continue
            except:
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            # Read in data from json file if it exists
            try:
                a = parsed_json[0]['abstract_full']
                i = parsed_json[0]['application_number']
            except IndexError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue
            except KeyError:
                print("WARNING: file " + filepath + " is empty!\n")
                continue

            abstractList.append(a)
            idList.append(i)
            rejectionColumn.append(rejType)

            count += 1
            #if count > 2000: break

        df = pd.DataFrame({
            'text': abstractList,
            'label': rejectionColumn
        },
                          index=idList)
        print("{} files loaded".format(count))

        df.to_pickle('./data_cache/abstracts_df_A61.pkl')
        # with open("data_cache/TEXT.Field","wb")as f:
        #     dill.dump(TEXT,f)
        # with open("data_cache/LABEL.Field","wb")as f:
        #     dill.dump(LABEL,f)

    else:
        print('Loading Dataset from Cache')
        df = pd.read_pickle('./data_cache/abstracts_df_A61.pkl')
        # with open("data_cache/TEXT.Field","rb")as f:
        #     TEXT=dill.load(f)
        # with open("data_cache/LABEL.Field","rb")as f:
        #     LABEL=dill.load(f)

    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True,
                 tokenize=tokenize,
                 lower=True,
                 include_lengths=True,
                 batch_first=True,
                 fix_length=200)
    LABEL = LabelField(sequential=False)

    fields = {'text': TEXT, 'label': LABEL}
    ds = DataFrameDataset(df, fields)

    TEXT.build_vocab(ds, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(ds)

    train_data, test_data = ds.split()
    train_data, valid_data = train_data.split(
    )  # Further splitting of training_data to create new training_data & validation_data
    word_embeddings = TEXT.vocab.vectors
    print("Length of Text Vocabulary: " + str(len(TEXT.vocab)))
    print("Vector size of Text Vocabulary: ", TEXT.vocab.vectors.size())
    print("Label Length: " + str(len(LABEL.vocab)))

    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=batch_size,
        sort_key=lambda x: len(x.text),
        repeat=False,
        shuffle=True)

    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, valid_iter, test_iter
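
DataFrameDataset is referenced above but not shown; a minimal sketch of such a wrapper, assuming it simply turns each DataFrame row into a torchtext Example (legacy torchtext API, column names fixed to 'text' and 'label' as in the fields dict above):

from torchtext.data import Dataset, Example

class DataFrameDataset(Dataset):
    """Wrap a pandas DataFrame with 'text' and 'label' columns as a torchtext Dataset."""

    def __init__(self, df, fields, **kwargs):
        # `fields` is a dict like {'text': TEXT, 'label': LABEL}.
        field_list = [('text', fields['text']), ('label', fields['label'])]
        examples = [
            Example.fromlist([row['text'], row['label']], field_list)
            for _, row in df.iterrows()
        ]
        super().__init__(examples, field_list, **kwargs)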
Example #29
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)

    # non-transferrable types
    primitive_objects = [None, {}, [], 1.0, "x", [None, 2], {"x": (1, 2), "y": None}]
    for batch in primitive_objects:
        data = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type() == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type() == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type() == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type() == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type() == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)], [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)} for _ in range(2)])
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)]
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:

        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator.batch_to_device(CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator.batch_to_device(batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
Example #30
    tensor = tensor.unsqueeze(0).to(device)
    output = model(tensor)
    output = F.softmax(output, dim=-1)
    print(output)


if __name__ == "__main__":
    text_field = Field(use_vocab=False,
                       tokenize=tokenize_and_trunc,
                       preprocessing=tokenizer.convert_tokens_to_ids,
                       batch_first=True,
                       init_token=init_token_idx,
                       eos_token=eos_token_idx,
                       pad_token=pad_token_idx,
                       unk_token=unk_token_idx)
    label_field = LabelField()

    train_data, test_data = IMDB.splits(text_field, label_field)
    train_data, valid_data = train_data.split()
    label_field.build_vocab(train_data)

    n_epochs = 5
    batch_size = 128
    rnn_hidden_size = 256
    dropout_p = 0.2
    num_classes = len(label_field.vocab)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = BertGRU(bert.config.to_dict()['dim'],
                    rnn_hidden_size, num_classes=num_classes,
                    dropout_p=dropout_p)