def _create_loaders(path, traintsv, valtsv):
    def parse_int(tok, *args):
        return int(tok)

    # These fields correspond to the columns of the .tsv file; the values can be
    # read back via quesid.data[0], ques.data[0], and so on.
    quesid = data.Field(sequential=False, use_vocab=False,
                        postprocessing=data.Pipeline(parse_int))
    ques = data.Field(include_lengths=True)
    imgid = data.Field(sequential=False, use_vocab=False,
                       postprocessing=data.Pipeline(parse_int))
    ans = data.Field(sequential=False, use_vocab=False,
                     postprocessing=data.Pipeline(parse_int))

    train_data, val_data = data.TabularDataset.splits(
        path=path, train=traintsv, validation=valtsv,
        fields=[('quesid', quesid), ('ques', ques), ('imgid', imgid), ('ans', ans)],
        format='tsv')

    batch_sizes = (1, 1)
    train_loader, val_loader = data.BucketIterator.splits(
        (train_data, val_data), batch_sizes=batch_sizes, repeat=False,
        sort_key=lambda x: len(x.ques))

    ques.build_vocab(train_data)
    print('vocabulary size: {}'.format(len(ques.vocab.stoi)))
    return ques, train_loader, val_loader
def get_dataset(lower=False, vectors=None, n_folds=10, seed=42):
    lower = True if vectors is not None else False
    # tweet = data.Field(sequential=False, tensor_type=torch.LongTensor, lower=lower)
    tweet = data.Field(sequential=True)
    label = data.Field(sequential=False)
    # label = data.Field(sequential=False, tensor_type=torch.LongTensor, preprocessing=data.Pipeline(lambda x: int(x)))
    retweet_count = data.Field(use_vocab=False, tensor_type=torch.LongTensor,
                               preprocessing=data.Pipeline(lambda x: int(x)))
    favorite_count = data.Field(use_vocab=False, tensor_type=torch.LongTensor,
                                preprocessing=data.Pipeline(lambda x: int(x)))
    user_followers_count = data.Field(
        use_vocab=False, tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    user_following_count = data.Field(
        use_vocab=False, tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))

    fields = [
        ('id', None),
        ('created_at', None),
        ('text', tweet),
        ('retweet_count', retweet_count),
        ('favorite_count', favorite_count),
        ('user_screen_name', None),
        ('user_id', None),
        ('user_followers_count', user_followers_count),
        ('user_following_count', user_following_count),
        ('hate_label', label),
    ]

    all_tweets = data.TabularDataset(path='cache/tweets_data.csv', format='csv',
                                     skip_header=True, fields=fields)
    tweet.build_vocab(all_tweets, vectors=vectors)
    label.build_vocab(all_tweets)

    tweet_exp = np.array(all_tweets.examples)
    train_val = tweet_exp
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    def iter_fold():
        train_val_arr = []
        for train_idx, val_idx in kf.split(train_val):
            train = data.Dataset(list(train_val[train_idx]), fields)
            val = data.Dataset(list(train_val[val_idx]), fields)
            train_val_arr.append((train, val))
            # yield (train, val,)
        return train_val_arr

    return iter_fold(), tweet
def read_files(lower=False, vectors=None):
    #############################
    # THIS ALL NEEDS TO BE FIXED
    lower = True if vectors is not None else False
    # tweet = data.Field(sequential=False, tensor_type=torch.LongTensor, lower=lower)
    tweet = data.Field(sequential=True)
    label = data.Field(sequential=False)
    # label = data.Field(sequential=False, tensor_type=torch.LongTensor, preprocessing=data.Pipeline(lambda x: int(x)))
    retweet_count = data.Field(use_vocab=False, tensor_type=torch.LongTensor,
                               preprocessing=data.Pipeline(lambda x: int(x)))
    favorite_count = data.Field(use_vocab=False, tensor_type=torch.LongTensor,
                                preprocessing=data.Pipeline(lambda x: int(x)))
    user_followers_count = data.Field(
        use_vocab=False, tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    user_following_count = data.Field(
        use_vocab=False, tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))

    fields = [
        ('id', None),
        ('created_at', None),
        ('text', tweet),
        ('retweet_count', retweet_count),
        ('favorite_count', favorite_count),
        ('user_screen_name', None),
        ('user_id', None),
        ('user_followers_count', user_followers_count),
        ('user_following_count', user_following_count),
        ('hate_label', label),
    ]

    train, val = data.TabularDataset.splits(path='cache/', format='csv',
                                            skip_header=True,
                                            train='tweets_train.csv',
                                            validation='tweets_val.csv',
                                            fields=fields)
    # Might need to change this later
    test = data.TabularDataset(path='cache/tweets_test.csv', format='csv',
                               skip_header=True, fields=fields)

    tweet.build_vocab(train, vectors=vectors)
    label.build_vocab(train)

    # What do these mean?
    return train, val, test, len(tweet.vocab), tweet
def test_pipeline(self):
    id_pipeline = data.Pipeline()
    assert id_pipeline("Test STring") == "Test STring"
    assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
    assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

    pipeline = data.Pipeline(six.text_type.lower)
    assert pipeline("Test STring") == "test string"
    assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
    assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

    args_pipeline = data.Pipeline(TestPipeline.repeat_n)
    assert args_pipeline("test", 5) == "testtesttesttesttest"
    assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
def _createFields(self, min_occurance_freq):
    self.CAPTION_FIELD = data.ReversibleField(
        tokenize='spacy', init_token=self.start_token,
        eos_token=self.end_token, pad_token=self.pad_token,
        lower=True, batch_first=True, is_target=True,
        unk_token=UNKNOWN_TOKEN)
    self.INDEX_FIELD = data.Field(
        sequential=False, use_vocab=False, batch_first=True)

    if self.use_yt_categories:
        # preprocessing: if there is no category, replace it with -1
        # (a unique number reserved for the dummy category)
        self.CATEGORY_FIELD = data.Field(
            sequential=False, use_vocab=False, batch_first=True,
            preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x))))
        # filter the dataset if a category is missing (31 -> 41 (count = 1 :()))
        self.filter_callback = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
    else:
        self.CATEGORY = None
        self.filter_callback = None

    if self.use_asr_subtitles:
        self.ASR_SUBTITLES_FIELD = data.ReversibleField(
            tokenize='spacy', init_token=self.start_token,
            eos_token=self.end_token, pad_token=self.pad_token,
            lower=True, batch_first=True, unk_token=UNKNOWN_TOKEN)
    else:
        self.ASR_SUBTITLES_FIELD = None
def __init__(self, text_field, input_text, path=None, examples=None, **kwargs):
    def clean_str(string):
        string = re.sub(r"[^ㄱ-ㅣ가-힣A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field)]
    examples = []
    examples += [data.Example.fromlist([input_text], fields)]
    super(NewData, self).__init__(examples, fields, **kwargs)
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    def clean_str(string):
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        examples = []
        with open(os.path.join('data', path), encoding="utf-8", errors="ignore") as f:
            for line in f.readlines():
                # print(line)
                if line[-2] == '0':
                    # print(line[:line.find('|')], '----negative')
                    examples += [
                        data.Example.fromlist([line[:line.find('|')], 'negative'], fields)]
                else:
                    # print(line[:line.find('|')], '----positive')
                    examples += [
                        data.Example.fromlist([line[:line.find('|')], 'positive'], fields)]
    super(MR, self).__init__(examples, fields, **kwargs)
def gen_text_preprocessor():
    """Text field preprocessor for TorchText."""
    def clean_str(string):
        # Replace multiple spaces with a single space.
        string = re.sub(r'\s+', ' ', string).strip()

        # Replace creature names with "creature"
        creature_regexes = [
            r'kwep(s)?',
            r'morseth(s)?',
            r'luzak(s)?',
            r'zorb(s)?',
            r'oller(s)?',
        ]
        creature_misspellings = [
            r'kweep(s)?',
            r'kewps(s)?',
            r'kweb(s)?',
            r'luzek(s)?',
            r'kewp(s)?',
            r'kewpt(s)?',
            r'kwerp(s)?',
            r'lulaz(s)?',
            r'lusak(s)?',
            r'moreseth(s)?',
            r'moresth(s)?',
            r'morthess(es)?',
            r'moseth(s)?',
        ]
        for expr in creature_regexes:
            string = re.sub(expr, 'creature', string)
        for expr in creature_misspellings:
            string = re.sub(expr, 'creature', string)

        # Replace '(' with ' '
        string = re.sub(r'\(', ' ', string)
        string = re.sub(r'"+', '', string)
        return string

    return data.Pipeline(clean_str)
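# Hedged usage sketch (added for illustration; not part of the original snippet).
# It shows how the Pipeline returned by gen_text_preprocessor() is typically
# attached to a torchtext (legacy) Field: `preprocessing` runs on each token
# after tokenization. The field name TEXT and the sample sentence are assumptions.
from torchtext import data

TEXT = data.Field(sequential=True, lower=True,
                  preprocessing=gen_text_preprocessor())
# Tokenizes, lowercases, then normalizes creature names to "creature".
print(TEXT.preprocess("Two kweps chased a zorb"))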
def test_preprocess(self):
    # Default case.
    field = data.Field()
    assert field.preprocess("Test string.") == ["Test", "string."]

    # Test that lowercase is properly applied.
    field_lower = data.Field(lower=True)
    assert field_lower.preprocess("Test string.") == ["test", "string."]

    # Test that custom preprocessing pipelines are properly applied.
    preprocess_pipeline = data.Pipeline(lambda x: x + "!")
    field_preprocessing = data.Field(preprocessing=preprocess_pipeline,
                                     lower=True)
    assert field_preprocessing.preprocess("Test string.") == [
        "test!", "string.!"
    ]

    # Test that non-sequential data is properly handled.
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess(
        "Test string.") == "test string.!"

    # Non-regression test that we do not try to decode unicode strings to unicode
    field_not_sequential = data.Field(sequential=False, lower=True,
                                      preprocessing=preprocess_pipeline)
    assert field_not_sequential.preprocess(
        "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
def __init__(self, dataset, text_fields, label_fields, examples=None, **kwargs):
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_fields.preprocessing = data.Pipeline(clean_str)
    fields = [(f, text_fields) for f in FEATURES]

    from utils.DataPrepare.scenario import scenario_choice
    for c in scenario_choice.values():
        fields.append((str(c), label_fields))

    if examples is None:
        examples = []
        for item in dataset:
            examples += [data.Example.fromlist(list(item), fields)]
    super(mydataset, self).__init__(examples, fields, **kwargs)
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]
    path = self.dirname if path is None else path

    if examples is None:
        examples = []
        class_dirs = get_file_name(path)
        for class_dir_name in class_dirs:
            class_dir_path = os.path.join(path, class_dir_name)
            file_names = get_file_name(class_dir_path, "files")
            for file in file_names:
                file_path = os.path.join(class_dir_path, file)
                with open(file_path, errors='ignore') as f:
                    raw_data = f.read()
                    if len(raw_data.split(' ')) > 100:
                        continue
                    examples += [
                        data.Example.fromlist([raw_data, class_dir_name], fields)
                    ]
    super(NewsGroup, self).__init__(examples, fields, **kwargs)
def __init__(self, text_field, label_field, examples=None, **kwargs):
    """Create a dataset instance.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        examples = []
        with open('data/testovi.csv', errors='ignore') as f:
            import csv
            reader = csv.reader(f, delimiter=',', quotechar='"')
            # CSV: grade, answer, question, filename, questionnumber
            examples += [
                data.Example.fromlist([
                    line[2] + ' <pad> ' + line[1],
                    str(round(float(line[0]) * 2) / 2)
                ], fields) for line in reader
            ]

    # under sampling:
    # classes = sorted(set(map(lambda e: e.label, examples)))
    # examples_split = [[e for e in examples if e.label == x] for x in classes]
    # min_class_count = min([len(e) for e in examples_split])
    # examples = []
    #
    # for egroup in examples_split:
    #     random.shuffle(egroup)
    #     examples.extend(egroup[:min_class_count])
    #
    # import numpy as np
    # print(np.unique([e.label for e in examples], return_counts=True))
    super(TestsDS, self).__init__(examples, fields, **kwargs)
def gen_text_preprocessor():
    """Text field preprocessor for TorchText."""
    def clean_str(string):
        misspellings = {
            r'pur ': 'purple',
            r'fea-': 'feather',
            r'wh-': 'white',
            r'whie': 'white',
            r'wh ': 'white',
            r'or ': 'orange',
            r'or-': 'orange',
            r'orge': 'orange',
            r'winngs': 'wings',
            r'feathes': 'feathers',
        }
        for expr, subst in misspellings.items():
            string = re.sub(expr, subst, string)

        # Replace '(' with ' '
        string = re.sub(r'\(', ' ', string)
        string = re.sub(r',', ' ', string)
        string = re.sub(r'-', ' ', string)
        string = re.sub(r'~+', ' ', string)

        # Replace multiple spaces with a single space.
        string = re.sub(r'\s+', ' ', string).strip()
        string = re.sub(r'"+', '', string)
        return string

    return data.Pipeline(clean_str)
def __init__(self, path, text_field, label_field, subtrees=False,
             fine_grained=True, **kwargs):
    fields = [('text', text_field), ('label', label_field)]

    def get_label_str(label):
        pre = 'very ' if fine_grained else ''
        return {
            '0': pre + 'negative',
            '1': 'negative',
            '2': 'neutral',
            '3': 'positive',
            '4': pre + 'positive',
            None: None
        }[label]

    label_field.preprocessing = data.Pipeline(get_label_str)

    with open(os.path.expanduser(path)) as f:
        if subtrees:
            examples = [
                ex for line in f
                for ex in data.Example.fromtree(line, fields, True)
            ]
        else:
            examples = [data.Example.fromtree(line, fields) for line in f]
    super(SST, self).__init__(examples, fields, **kwargs)
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'negative'], fields) for line in f
            ]
        with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'positive'], fields) for line in f
            ]
    super(MR, self).__init__(examples, fields, **kwargs)
def __init__(self, text_field, label_field, path=None, file=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = None if os.path.join(path, file) is None else os.path.join(path, file)
        print("loading {}... ".format(path))
        examples = []
        with open(path) as f:
            for line in f.readlines():
                if line[-2] == '0':
                    examples += [
                        data.Example.fromlist(
                            [line[:line.find('|')], 'negative'], fields=fields)
                    ]
                elif line[-2] == '1':
                    examples += [
                        data.Example.fromlist(
                            [line[:line.find('|')], 'positive'], fields=fields)
                    ]
    super(MR, self).__init__(examples, fields, **kwargs)
def cadec(self, opt, tag_type='ner'):
    """
    cadec: CADEC (Parser only. You must place the files)
    Extract CADEC dataset using torchtext.
    """
    logger.info('---------- CADEC = %s ---------' % (tag_type))
    train_file = mapping_files[opt.lang]

    # Setup fields with batch dimension first
    inputs_word = data.Field(
        batch_first=True, fix_length=opt.maxlen, lower=opt.lower,
        preprocessing=data.Pipeline(
            lambda w: '0' if opt.convert_digits and w.isdigit() else w))
    inputs_char_nesting = data.Field(tokenize=list, batch_first=True,
                                     fix_length=opt.maxlen)
    inputs_char = data.NestedField(inputs_char_nesting)
    inputs_case = data.Field(
        batch_first=True, fix_length=opt.maxlen,
        preprocessing=data.Pipeline(lambda w: self.getCasing(w)))
    labels = data.Field(batch_first=True, unk_token=None, fix_length=opt.maxlen)
    # pad_token=None,
    # preprocessing=data.Pipeline(lambda w: labels_map[w]))
    id = data.Field(batch_first=True, use_vocab=False)

    self.fields = ([(('inputs_word', 'inputs_char', 'inputs_case'),
                     (inputs_word, inputs_char, inputs_case))] +
                   [('labels', labels) if label == tag_type else (None, None)
                    for label in ['ner']] +
                   [('id', id)])

    # Load the data
    datafile = NERDataset.splits(path='.', train=train_file, separator='\t',
                                 encoding='utf-8',
                                 fields=tuple(self.fields))[0]
    self.train, self.val, self.test = datafile.split(
        split_ratio=[5610, 1000, 1000])

    return inputs_word, inputs_char, inputs_case, labels
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create an MR dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'negative'], fields) for line in f
            ]
        with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
            examples += [
                data.Example.fromlist([line, 'positive'], fields) for line in f
            ]
        print('examples---0')
        print(len(examples))
    super(MR, self).__init__(examples, fields, **kwargs)
def make_amazon(batch_size, device=-1, vectors=None, base_path="", suffix="",
                extrasuffix="", domain="", oodname="", topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True, use_vocab=False,
                        preprocessing=data.Pipeline(lambda x: float(x)),
                        tensor_type=torch.cuda.FloatTensor, batch_first=True)

    if not topics:
        train = data.TabularDataset(
            path=base_path + "/" + domain + ".train.lower.tok" + suffix + extrasuffix + ".txt",
            format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    else:
        train = data.TabularDataset(
            path=base_path + "/" + domain + ".train.lower.tok" + suffix + extrasuffix + ".txt",
            format="tsv", fields=[('text', TEXT), ('label', LABEL), ('topics', TOPICS)])

    val = data.TabularDataset(
        path=base_path + "/" + domain + ".valid.lower.tok" + suffix + ".txt",
        format="tsv", fields=[('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(
        path=base_path + "/" + domain + ".test.lower.tok" + suffix + ".txt",
        format="tsv", fields=[('text', TEXT), ('label', LABEL)])

    oodnames = oodname.split(",")
    outdomain_test = []
    for oodname in oodnames:
        outdomain_test.append(
            data.TabularDataset(
                path=base_path + "/" + oodname + ".test.lower.tok" + suffix + ".txt",
                format="tsv", fields=[('text', TEXT), ('label', LABEL)]))

    # train, test = datasets.REDDIT.splits(TEXT, LABEL)
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)

    all_iters = data.BucketIterator.splits(
        tuple([train, val, test] + outdomain_test),
        batch_sizes=tuple([batch_size] * (3 + len(outdomain_test))),
        device=device, repeat=False, sort_key=lambda x: len(x.text))

    # train_iter, val_iter, test_iter, outdomain_test_iters
    return all_iters, TEXT, LABEL, TOPICS
def preprocess(which_task, train_file, val_file, test_file,
               max_vocab_size=MAX_VOCAB_SIZE):
    '''
    Load data and preprocess:
        - apply tokenization
        - one hot encode labels
        - build embeddings
    Takes:
        - string denoting which field is label ("response" or "product")
        - filename of training data csv
        - filename of validation csv
        - filename of testing csv
        - max vocab size
    Returns:
        - train data, validation data, test data objects
    '''
    if which_task not in ["response", "product"]:
        print("preprocessing error: which field is the label?")
        raise ValueError

    # define text field object with tokenization
    TEXT = data.Field(sequential=True, tokenize=util.tokenize, lower=True)

    # define label field with one hot encoded labels
    if which_task == "response":
        OneHotEncoder = data.Pipeline(convert_token=util.one_hot_encode_response)
        LABEL = data.LabelField(sequential=False, use_vocab=False,
                                preprocessing=OneHotEncoder)
    else:
        OneHotEncoder = data.Pipeline(convert_token=util.one_hot_encode_product)
        LABEL = data.LabelField(sequential=False, use_vocab=False,
                                preprocessing=OneHotEncoder)

    # create dataset objects
    train_data = load_and_tokenize_data(train_file, TEXT, LABEL, which_task)
    valid_data = load_and_tokenize_data(val_file, TEXT, LABEL, which_task)
    test_data = load_and_tokenize_data(test_file, TEXT, LABEL, which_task)

    # create embeddings from training data
    TEXT.build_vocab(train_data, max_size=max_vocab_size)
    LABEL.build_vocab(train_data)
    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
    print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

    return train_data, valid_data, test_data
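# Hedged usage sketch (added for illustration; not part of the original snippet).
# The dataset objects returned by preprocess() would normally be wrapped in
# BucketIterators before training; the file names, batch size, task choice, and
# the assumption that the text field is named "text" are all illustrative.
train_data, valid_data, test_data = preprocess(
    "response", "train.csv", "val.csv", "test.csv")
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)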
def test_composition(self):
    id_pipeline = data.Pipeline()
    pipeline = data.Pipeline(TestPipeline.repeat_n)
    pipeline.add_before(id_pipeline)
    pipeline.add_after(id_pipeline)
    pipeline.add_before(six.text_type.lower)
    pipeline.add_after(six.text_type.capitalize)

    other_pipeline = data.Pipeline(six.text_type.swapcase)
    other_pipeline.add_before(pipeline)

    # Assert pipeline gives proper results after composition
    # (test that we aren't modifying the pipes member)
    assert pipeline("teST") == "Testtesttest"
    assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

    # Assert pipeline that we added to gives proper results
    assert other_pipeline("teST") == "tESTTESTTEST"
    assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]
def get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required
    during inference.

    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word
            character vocabulary
        convert_digits: If True will convert numbers to single 0's
    """
    inputs_word = data.Field(
        init_token="<bos>", eos_token="<eos>", batch_first=True, lower=True,
        preprocessing=data.Pipeline(
            lambda w: '0' if convert_digits and w.isdigit() else w))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>",
                                         eos_token="<eos>", batch_first=True)
        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>", eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char

        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]

    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]
        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(
            data=dataset, dataset=dataset,
            device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn
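# Hedged usage sketch (added for illustration; not part of the original snippet).
# At inference time the returned closure turns raw text into a single torchtext
# Batch whose attributes mirror the field names above; `vocab_word` is assumed
# to be the word vocabulary saved from training.
process = get_input_processor_words(vocab_word)
batch = process("Barack Obama visited Paris in 2016 .")
word_ids = batch.inputs_word  # tensor of word indices, batch dimension first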
def __init__(self, text_field, label_field, datas, examples=None, **kwargs):
    """Create own dataset instance given a path and fields.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        datas: Raw data.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        # string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        # string = re.sub(r"\'s", " \'s", string)
        # string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        # string = re.sub(r",", " , ", string)
        # string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        # string = re.sub(r"\s{2,}", " ", string)
        # print("in clean", string.strip())
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        examples = []
        # for weibo in datas:
        #     formdat = [str(weibo['text']), str(weibo['label'])]
        #     print("formdat", formdat)
        #     exam = list(data.Example.fromlist(formdat, fields))
        #     examples += exam
        examples += [
            data.Example.fromlist([weibo['text'], weibo['label']], fields)
            for weibo in datas
        ]
        print("in examples", len(examples))
    super(mydata, self).__init__(examples, fields, **kwargs)
def make_rt_gender(batch_size, base_path, train_file, valid_file, test_file,
                   device=-1, vectors=None, topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    if topics:
        TOPICS = data.Field(sequential=True, use_vocab=False,
                            preprocessing=data.Pipeline(lambda x: float(x)),
                            tensor_type=torch.cuda.FloatTensor, batch_first=True)
        train = data.TabularDataset(path=os.path.join(base_path, train_file),
                                    format="tsv",
                                    fields=[('index', INDEX), ('text', TEXT),
                                            ('label', LABEL), ('topics', TOPICS)])
    else:
        train = data.TabularDataset(path=os.path.join(base_path, train_file),
                                    format="tsv",
                                    fields=[('index', INDEX), ('text', TEXT),
                                            ('label', LABEL)])

    train.examples = train.examples[0:10]
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    print(LABEL.vocab.stoi)

    val = data.TabularDataset(path=os.path.join(base_path, valid_file),
                              format="tsv",
                              fields=[('index', INDEX), ('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(path=os.path.join(base_path, test_file),
                               format="tsv",
                               fields=[('index', INDEX), ('text', TEXT), ('label', LABEL)])

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), batch_sizes=(batch_size, 256, 256), device=device,
        repeat=False, sort_key=lambda x: len(x.text))

    if topics:
        return (train_iter, val_iter, test_iter), TEXT, LABEL, TOPICS, INDEX
    else:
        return (train_iter, val_iter, test_iter), TEXT, LABEL, INDEX
def imdb(text_field, label_field, **kargs):
    text_field.preprocessing = data.Pipeline(clean_str)
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)
    train_data, dev_data = train_data.split(random_state=random.seed(SEED))

    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
def get_loader(batch_size=100, max_size=20000, is_train=True, data_dir=None):
    text_field = data.Field(tokenize=tokenizer, sequential=True)
    label_field = data.Field(sequential=False, use_vocab=False,
                             postprocessing=data.Pipeline(postprocess))

    train_file_path = Path(data_dir).joinpath('naver_train.txt')
    test_file_path = Path(data_dir).joinpath('naver_test.txt')

    train_dataset = data.TabularDataset(
        path=train_file_path,
        format='tsv',
        fields=[
            ('id', None),
            ('text', text_field),
            ('label', label_field)
        ],
        filter_pred=filter_pred)

    print('Building Vocabulary \n')
    text_field.build_vocab(train_dataset, max_size=max_size - 2)

    if is_train:
        loader = data.Iterator(
            dataset=train_dataset,
            batch_size=batch_size,
            sort_key=lambda x: len(x.text),
            train=True,  # if training set => repeat and shuffle : True
            repeat=False,
            device=-1  # CPU: -1
        )
        # vocab = text_field.vocab
        # with open('./vocab.pkl', 'wb') as f:
        #     pickle.dump(vocab, f)
    else:
        test_dataset = data.TabularDataset(
            path=test_file_path,
            format='tsv',
            fields=[
                ('id', None),
                ('text', text_field),
                ('label', label_field)
            ],
            filter_pred=filter_pred)

        loader = data.Iterator(
            dataset=test_dataset,
            batch_size=batch_size,
            sort=False,
            train=False,
            device=-1)

    return loader
def __init__(self, args):
    if not args.cuda:
        args.gpu = -1
    if torch.cuda.is_available() and args.cuda:
        print("Note: You are using GPU for training")
        torch.cuda.set_device(args.gpu)
        torch.cuda.manual_seed(args.seed)
    if torch.cuda.is_available() and not args.cuda:
        print("Warning: You have Cuda but do not use it. You are using CPU for training")

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    self.QID = data.Field(sequential=False)
    self.QUESTION = data.Field(batch_first=True)
    self.ANSWER = data.Field(batch_first=True)
    self.LABEL = data.Field(sequential=False)
    self.EXTERNAL = data.Field(
        sequential=True, dtype=torch.FloatTensor, batch_first=True,
        use_vocab=False,
        postprocessing=data.Pipeline(
            lambda arr, _, train: [float(y) for y in arr]))

    if 'TrecQA' in args.dataset:
        train, dev, test = TrecDataset.splits(self.QID, self.QUESTION,
                                              self.ANSWER, self.EXTERNAL,
                                              self.LABEL)
    elif 'WikiQA' in args.dataset:
        train, dev, test = WikiDataset.splits(self.QID, self.QUESTION,
                                              self.ANSWER, self.EXTERNAL,
                                              self.LABEL)
    else:
        print("Unsupported dataset")
        exit()

    self.QID.build_vocab(train, dev, test)
    self.QUESTION.build_vocab(train, dev, test)
    self.ANSWER.build_vocab(train, dev, test)
    self.LABEL.build_vocab(train, dev, test)

    if args.cuda:
        self.model = torch.load(
            args.model,
            map_location=lambda storage, location: storage.cuda(args.gpu))
    else:
        self.model = torch.load(
            args.model, map_location=lambda storage, location: storage)

    self.gpu = args.gpu
def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
    """Create a Legal Sentences dataset.

    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        # string = re.sub(r"\'s", " \'s", string)
        # string = re.sub(r"\'ve", " \'ve", string)
        # string = re.sub(r"n\'t", " n\'t", string)
        # string = re.sub(r"\'re", " \'re", string)
        # string = re.sub(r"\'d", " \'d", string)
        # string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = self.dirname if path is None else path
        examples = []
        df = pd.read_excel(os.path.join(path, 'training_data.xlsx'))
        # df = pd.read_excel(os.path.join(path, 'Quantum1.xlsx'), nrows=6e4)
        df = df.sample(frac=1)
        # examples += [data.Example.fromlist([str(line), str(target)], fields) for line, target in zip(df.Fallos, df.Grupo)]
        examples += [
            data.Example.fromlist([str(line), str(target)], fields)
            for line, target in zip(df.fallo, df.grupo)
        ]
    super(Legal, self).__init__(examples, fields, **kwargs)
def __init__(self, dim=0, **kwargs):
    super().__init__(
        use_vocab=False,
        batch_first=True,
        tokenize=FloatVectorField._parse_vector,
        dtype=torch.float,
        # Convert each string to float. float() takes care of whitespace.
        preprocessing=textdata.Pipeline(float),
        fix_length=dim,
        # For irregular sized vectors, pad the missing units with 0s.
        pad_token=0,
    )
def __init__(self, text_field, label_field, path=None, file=None, examples=None, **kwargs):
    """
    Arguments:
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        path: Path to the data file.
        examples: The examples contain all the data.
        char_data: The char level to solve
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    def clean_str(string):
        """
        Tokenization/string cleaning for all datasets except for SST.
        Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
        """
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", " \( ", string)
        string = re.sub(r"\)", " \) ", string)
        string = re.sub(r"\?", " \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string.strip()

    text_field.preprocessing = data.Pipeline(clean_str)
    fields = [('text', text_field), ('label', label_field)]

    if examples is None:
        path = None if os.path.join(path, file) is None else os.path.join(path, file)
        examples = []
        with open(path, encoding="utf-8") as f:
            a, b = 0, 0
            for line in f:
                # sentence, flag = line.strip().split(' ||| ')
                # print(line)
                label, seq, sentence = line.partition(" ")
                # clean string in every sentence
                sentence = clean_str(sentence)
                if label == '0':
                    a += 1
                    examples += [data.Example.fromlist([sentence, 'negative'], fields=fields)]
                elif label == '1':
                    b += 1
                    examples += [data.Example.fromlist([sentence, 'positive'], fields=fields)]
            print("negative sentence a {}, positive sentence b {} ".format(a, b))
    super(CV, self).__init__(examples, fields, **kwargs)