def _create_loaders(path, traintsv, valtsv):
    def parse_int(tok, *args):
        # torchtext's postprocessing hook passes extra arguments (e.g. the vocab); they are ignored here
        return int(tok)

    # Fields for the columns of the .tsv file; the values can be read back via quesid.data[0], ques.data[0], and so on
    quesid = data.Field(sequential=False,
                        use_vocab=False,
                        postprocessing=data.Pipeline(parse_int))
    ques = data.Field(include_lengths=True)
    imgid = data.Field(sequential=False,
                       use_vocab=False,
                       postprocessing=data.Pipeline(parse_int))
    ans = data.Field(sequential=False,
                     use_vocab=False,
                     postprocessing=data.Pipeline(parse_int))

    train_data, val_data = data.TabularDataset.splits(path=path,
                                                      train=traintsv,
                                                      validation=valtsv,
                                                      fields=[('quesid',
                                                               quesid),
                                                              ('ques', ques),
                                                              ('imgid', imgid),
                                                              ('ans', ans)],
                                                      format='tsv')
    batch_sizes = (1, 1)
    train_loader, val_loader = data.BucketIterator.splits(
        (train_data, val_data),
        batch_sizes=batch_sizes,
        repeat=False,
        sort_key=lambda x: len(x.ques))

    ques.build_vocab(train_data)
    print('vocabulary size: {}'.format(len(ques.vocab.stoi)))
    return ques, train_loader, val_loader
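
A minimal usage sketch for the loader factory above, assuming a legacy torchtext version (torchtext <= 0.8 or torchtext.legacy, imported as data) and hypothetical TSV paths. Because include_lengths=True, batch.ques comes back as a (padded tensor, lengths) pair, while quesid, imgid and ans are integer tensors thanks to the parse_int postprocessing.

# Hypothetical paths; each TSV row must hold the four declared columns
# (quesid, ques, imgid, ans), tab-separated, with no header line.
ques_field, train_loader, val_loader = _create_loaders(
    path='data/vqa', traintsv='train.tsv', valtsv='val.tsv')

for batch in train_loader:
    tokens, lengths = batch.ques   # include_lengths=True -> (padded tensor, lengths)
    question_id = batch.quesid     # integer tensor via the parse_int postprocessing
    answer = batch.ans
    break                          # look at a single batch only
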
Example #2
def get_dataset(lower=False, vectors=None, n_folds=10, seed=42):
    lower = True if vectors is not None else False
    # tweet = data.Field(sequential=False, tensor_type=torch.LongTensor, lower=lower)
    tweet = data.Field(sequential=True)
    label = data.Field(sequential=False)
    # label = data.Field(sequential=False, tensor_type=torch.LongTensor, preprocessing=data.Pipeline(lambda x: int(x)))
    retweet_count = data.Field(use_vocab=False,
                               tensor_type=torch.LongTensor,
                               preprocessing=data.Pipeline(lambda x: int(x)))
    favorite_count = data.Field(use_vocab=False,
                                tensor_type=torch.LongTensor,
                                preprocessing=data.Pipeline(lambda x: int(x)))
    user_followers_count = data.Field(
        use_vocab=False,
        tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    user_following_count = data.Field(
        use_vocab=False,
        tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    fields = [
        ('id', None),
        ('created_at', None),
        ('text', tweet),
        ('retweet_count', retweet_count),
        ('favorite_count', favorite_count),
        ('user_screen_name', None),
        ('user_id', None),
        ('user_followers_count', user_followers_count),
        ('user_following_count', user_following_count),
        ('hate_label', label),
    ]

    all_tweets = data.TabularDataset(path='cache/tweets_data.csv',
                                     format='csv',
                                     skip_header=True,
                                     fields=fields)

    tweet.build_vocab(all_tweets, vectors=vectors)
    label.build_vocab(all_tweets)
    tweet_exp = np.array(all_tweets.examples)
    train_val = tweet_exp

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    def iter_fold():
        train_val_arr = []
        for train_idx, val_idx in kf.split(train_val):
            train = data.Dataset(list(train_val[train_idx]), fields)
            val = data.Dataset(list(train_val[val_idx]), fields)
            train_val_arr.append((train, val))
            # yield (train, val,)
        return train_val_arr

    return iter_fold(), tweet
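
A sketch of how the folds returned by get_dataset might be consumed, assuming legacy torchtext is imported as data; each fold is a (train, val) Dataset pair, so iterators still have to be built per fold.

folds, tweet_field = get_dataset(vectors=None, n_folds=10, seed=42)

for fold_idx, (train_ds, val_ds) in enumerate(folds):
    train_iter, val_iter = data.BucketIterator.splits(
        (train_ds, val_ds),
        batch_sizes=(32, 32),
        sort_key=lambda x: len(x.text),
        repeat=False)
    print('fold {}: {} train / {} val examples'.format(
        fold_idx, len(train_ds), len(val_ds)))
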
Example #3
def read_files(lower=False, vectors=None):
    #############################
    #  THIS ALL NEEDS TO BE FIXED
    lower = True if vectors is not None else False
    # tweet = data.Field(sequential=False, tensor_type=torch.LongTensor, lower=lower)
    tweet = data.Field(sequential=True)
    label = data.Field(sequential=False)
    # label = data.Field(sequential=False, tensor_type=torch.LongTensor, preprocessing=data.Pipeline(lambda x: int(x)))
    retweet_count = data.Field(use_vocab=False,
                               tensor_type=torch.LongTensor,
                               preprocessing=data.Pipeline(lambda x: int(x)))
    favorite_count = data.Field(use_vocab=False,
                                tensor_type=torch.LongTensor,
                                preprocessing=data.Pipeline(lambda x: int(x)))
    user_followers_count = data.Field(
        use_vocab=False,
        tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    user_following_count = data.Field(
        use_vocab=False,
        tensor_type=torch.LongTensor,
        preprocessing=data.Pipeline(lambda x: int(x)))
    fields = [
        ('id', None),
        ('created_at', None),
        ('text', tweet),
        ('retweet_count', retweet_count),
        ('favorite_count', favorite_count),
        ('user_screen_name', None),
        ('user_id', None),
        ('user_followers_count', user_followers_count),
        ('user_following_count', user_following_count),
        ('hate_label', label),
    ]

    train, val = data.TabularDataset.splits(path='cache/',
                                            format='csv',
                                            skip_header=True,
                                            train='tweets_train.csv',
                                            validation='tweets_val.csv',
                                            fields=fields)
    # Might need to change this later
    test = data.TabularDataset(path='cache/tweets_test.csv',
                               format='csv',
                               skip_header=True,
                               fields=fields)
    tweet.build_vocab(train, vectors=vectors)
    label.build_vocab(train)
    # What do these mean?

    return train, val, test, len(tweet.vocab), tweet
Example #4
    def test_pipeline(self):
        id_pipeline = data.Pipeline()
        assert id_pipeline("Test STring") == "Test STring"
        assert id_pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T"
        assert id_pipeline(["1241", "Some String"]) == ["1241", "Some String"]

        pipeline = data.Pipeline(six.text_type.lower)
        assert pipeline("Test STring") == "test string"
        assert pipeline("ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t"
        assert pipeline(["1241", "Some String"]) == ["1241", "some string"]

        args_pipeline = data.Pipeline(TestPipeline.repeat_n)
        assert args_pipeline("test", 5) == "testtesttesttesttest"
        assert args_pipeline(["ele1", "ele2"], 2) == ["ele1ele1", "ele2ele2"]
Example #5
    def _createFields(self, min_occurance_freq):
        self.CAPTION_FIELD = data.ReversibleField(
            tokenize='spacy', init_token=self.start_token, 
            eos_token=self.end_token, pad_token=self.pad_token, lower=True, 
            batch_first=True, is_target=True, unk_token=UNKNOWN_TOKEN)

        self.INDEX_FIELD = data.Field(
            sequential=False, use_vocab=False, batch_first=True)

        if self.use_yt_categories:
            # preprocessing: if there is no category replace with -1 (unique number for dummy category)
            self.CATEGORY_FIELD = data.Field(
                sequential=False, use_vocab=False, batch_first=True, 
                preprocessing=data.Pipeline(lambda x: -1 if len(x) == 0 else int(float(x))))

            # filter the dataset if a category is missing (31 -> 41 (count = 1 :()))
            self.filter_callback = lambda x: vars(x)['category_32'] != -1 and vars(x)['category_32'] != 31
        else:
            self.CATEGORY_FIELD = None
            self.filter_callback = None

        if self.use_asr_subtitles:
            self.ASR_SUBTITLES_FIELD = data.ReversibleField(
                tokenize='spacy', init_token=self.start_token, 
                eos_token=self.end_token, pad_token=self.pad_token, lower=True, 
                batch_first=True, unk_token=UNKNOWN_TOKEN)
        else:
            self.ASR_SUBTITLES_FIELD = None
Example #6
    def __init__(self,
                 text_field,
                 input_text,
                 path=None,
                 examples=None,
                 **kwargs):
        def clean_str(string):
            string = re.sub(r"[^ㄱ-ㅣ가-힣A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field)]

        examples = []
        examples += [data.Example.fromlist([input_text], fields)]

        super(NewData, self).__init__(examples, fields, **kwargs)
Example #7
    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        def clean_str(string):
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            examples = []
            with open(os.path.join('data', path), encoding="utf-8", errors="ignore") as f:
                for line in f.readlines():
                    #print(line)
                    if line[-2] == '0':
                        #print(line[:line.find('|')], '----negative')
                        examples += [
                            data.Example.fromlist([line[:line.find('|')], 'negative'], fields)]
                    else:
                        #print(line[:line.find('|')], '----positive')
                        examples += [
                            data.Example.fromlist([line[:line.find('|')], 'positive'], fields)]
        super(MR, self).__init__(examples, fields, **kwargs)
Example #8
def gen_text_preprocessor():
    """ Text field preprocessor for TorchText.
    """
    def clean_str(string):
        # Replace multiple spaces with a single space.
        string = re.sub(r'\s+', ' ', string).strip()

        # Replace creature names with "creature"
        creature_regexes = [
            r'kwep(s)?',
            r'morseth(s)?',
            r'luzak(s)?',
            r'zorb(s)?',
            r'oller(s)?',
        ]
        creature_misspellings = [
            r'kweep(s)?', r'kewps(s)?', r'kweb(s)?', r'luzek(s)?', r'kewp(s)?',
            r'kewpt(s)?', r'kwerp(s)?', r'lulaz(s)?', r'lusak(s)?',
            r'moreseth(s)?', r'moresth(s)?', r'morthess(es)?',
            r'moseth(s)?'
        ]
        for expr in creature_regexes:
            string = re.sub(expr, 'creature', string)
        for expr in creature_misspellings:
            string = re.sub(expr, 'creature', string)

        # Replace '(' with ' '
        string = re.sub(r'\(', ' ', string)
        string = re.sub(r'"+', '', string)
        return string

    return data.Pipeline(clean_str)
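
A sketch of how the returned Pipeline is typically attached to a Field (the field name below is illustrative). For a sequential Field, torchtext applies the Pipeline token by token, after tokenization and lowercasing, and the Pipeline object is also directly callable.

TEXT = data.Field(sequential=True, lower=True,
                  preprocessing=gen_text_preprocessor())

# The pipeline can also be called directly on a raw string or a list of tokens:
cleaned = gen_text_preprocessor()('two kweps sitting on a (branch')
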
Example #9
    def test_preprocess(self):
        # Default case.
        field = data.Field()
        assert field.preprocess("Test string.") == ["Test", "string."]

        # Test that lowercase is properly applied.
        field_lower = data.Field(lower=True)
        assert field_lower.preprocess("Test string.") == ["test", "string."]

        # Test that custom preprocessing pipelines are properly applied.
        preprocess_pipeline = data.Pipeline(lambda x: x + "!")
        field_preprocessing = data.Field(preprocessing=preprocess_pipeline,
                                         lower=True)
        assert field_preprocessing.preprocess("Test string.") == [
            "test!", "string.!"
        ]

        # Test that non-sequential data is properly handled.
        field_not_sequential = data.Field(sequential=False,
                                          lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess(
            "Test string.") == "test string.!"

        # Non-regression test that we do not try to decode unicode strings to unicode
        field_not_sequential = data.Field(sequential=False,
                                          lower=True,
                                          preprocessing=preprocess_pipeline)
        assert field_not_sequential.preprocess(
            "ᑌᑎIᑕOᗪᕮ_Tᕮ᙭T") == "ᑌᑎiᑕoᗪᕮ_tᕮ᙭t!"
Example #10
        def __init__(self, dataset, text_fields, label_fields, examples=None, **kwargs):
            def clean_str(string):
                """
                Tokenization/string cleaning for all datasets except for SST.
                Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
                """
                string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
                string = re.sub(r"\'s", " \'s", string)
                string = re.sub(r"\'ve", " \'ve", string)
                string = re.sub(r"n\'t", " n\'t", string)
                string = re.sub(r"\'re", " \'re", string)
                string = re.sub(r"\'d", " \'d", string)
                string = re.sub(r"\'ll", " \'ll", string)
                string = re.sub(r",", " , ", string)
                string = re.sub(r"!", " ! ", string)
                string = re.sub(r"\(", " \( ", string)
                string = re.sub(r"\)", " \) ", string)
                string = re.sub(r"\?", " \? ", string)
                string = re.sub(r"\s{2,}", " ", string)
                return string.strip()

            text_fields.preprocessing = data.Pipeline(clean_str)
            fields = [(f, text_fields) for f in FEATURES]

            from utils.DataPrepare.scenario import scenario_choice
            for c in scenario_choice.values():
                fields.append((str(c), label_fields))

            if examples is None:
                examples = []
                for item in dataset:
                    examples += [data.Example.fromlist(list(item), fields)]

            super(mydataset, self).__init__(examples, fields, **kwargs)
Example #11
    def __init__(self,
                 text_field,
                 label_field,
                 path=None,
                 examples=None,
                 **kwargs):
        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]
        path = self.dirname if path is None else path

        if examples is None:
            examples = []
            class_dirs = get_file_name(path)
            for class_dir_name in class_dirs:
                class_dir_path = os.path.join(path, class_dir_name)
                file_names = get_file_name(class_dir_path, "files")
                for file in file_names:
                    file_path = os.path.join(class_dir_path, file)
                    with open(file_path, errors='ignore') as f:
                        raw_data = f.read()
                        if len(raw_data.split(' ')) > 100:
                            continue
                        examples += [
                            data.Example.fromlist([raw_data, class_dir_name],
                                                  fields)
                        ]
        super(NewsGroup, self).__init__(examples, fields, **kwargs)
Example #12
    def __init__(self, text_field, label_field, examples=None, **kwargs):
        """Create an dataset instance.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            examples = []
            with open('data/testovi.csv', errors='ignore') as f:
                import csv
                reader = csv.reader(f, delimiter=',', quotechar='"')
                # CSV: grade, answer, question, filename, questionnumber
                examples += [
                    data.Example.fromlist([
                        line[2] + ' <pad> ' + line[1],
                        str(round(float(line[0]) * 2) / 2)
                    ], fields) for line in reader
                ]

                # under sampling:
                # classes = sorted(set(map(lambda e: e.label, examples)))
                # examples_split = [[e for e in examples if e.label == x] for x in classes]
                # min_class_count = min([len(e) for e in examples_split])
                # examples = []
                #
                # for egroup in examples_split:
                #     random.shuffle(egroup)
                #     examples.extend(egroup[:min_class_count])
                #
                # import numpy as np
                # print(np.unique([e.label for e in examples], return_counts=True))

        super(TestsDS, self).__init__(examples, fields, **kwargs)
Example #13
def gen_text_preprocessor():
    """ Text field preprocessor for TorchText.
    """
    def clean_str(string):
        misspellings = {
            r'pur ': 'purple',
            r'fea-': 'feather',
            r'wh-': 'white',
            r'whie': 'white',
            r'wh ': 'white',
            r'or ': 'orange',
            r'or-': 'orange',
            r'orge': 'orange',
            r'winngs': 'wings',
            r'feathes': 'feathers',
        }

        for expr, subst in misspellings.items():
            string = re.sub(expr, subst, string)

        # Replace '(' with ' '
        string = re.sub(r'\(', ' ', string)
        string = re.sub(r',', ' ', string)
        string = re.sub(r'-', ' ', string)
        string = re.sub(r'~+', ' ', string)

        # Replace multiple spaces with a single space.
        string = re.sub(r'\s+', ' ', string).strip()

        string = re.sub(r'"+', '', string)
        return string

    return data.Pipeline(clean_str)
Example #14
    def __init__(self,
                 path,
                 text_field,
                 label_field,
                 subtrees=False,
                 fine_grained=True,
                 **kwargs):
        fields = [('text', text_field), ('label', label_field)]

        def get_label_str(label):
            pre = 'very ' if fine_grained else ''
            return {
                '0': pre + 'negative',
                '1': 'negative',
                '2': 'neutral',
                '3': 'positive',
                '4': pre + 'positive',
                None: None
            }[label]

        label_field.preprocessing = data.Pipeline(get_label_str)
        with open(os.path.expanduser(path)) as f:
            if subtrees:
                examples = [
                    ex for line in f
                    for ex in data.Example.fromtree(line, fields, True)
                ]
            else:
                examples = [data.Example.fromtree(line, fields) for line in f]
        super(SST, self).__init__(examples, fields, **kwargs)
Example #15
    def __init__(self,
                 text_field,
                 label_field,
                 path=None,
                 examples=None,
                 **kwargs):
        """Create an MR dataset instance given a path and fields.
        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'),
                      errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields)
                    for line in f
                ]
            with open(os.path.join(path, 'rt-polarity.pos'),
                      errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields)
                    for line in f
                ]
        super(MR, self).__init__(examples, fields, **kwargs)
Example #16
    def __init__(self,
                 text_field,
                 label_field,
                 path=None,
                 file=None,
                 examples=None,
                 **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = os.path.join(path, file)
            print("loading {}... ".format(path))
            examples = []
            with open(path) as f:
                for line in f.readlines():
                    if line[-2] == '0':
                        examples += [
                            data.Example.fromlist(
                                [line[:line.find('|')], 'negative'],
                                fields=fields)
                        ]
                    elif line[-2] == '1':
                        examples += [
                            data.Example.fromlist(
                                [line[:line.find('|')], 'positive'],
                                fields=fields)
                        ]
        super(MR, self).__init__(examples, fields, **kwargs)
Example #17
    def cadec(self, opt, tag_type='ner'):
        """
           cadec: CADEC (parser only; you must place the files yourself).
           Extract the CADEC dataset using torchtext.
        """
        logger.info('---------- CADEC = %s ---------' % (tag_type))
        train_file = mapping_files[opt.lang]
        # Setup fields with batch dimension first
        inputs_word = data.Field(
            batch_first=True,
            fix_length=opt.maxlen,
            lower=opt.lower,
            preprocessing=data.Pipeline(
                lambda w: '0' if opt.convert_digits and w.isdigit() else w))

        inputs_char_nesting = data.Field(tokenize=list,
                                         batch_first=True,
                                         fix_length=opt.maxlen)
        inputs_char = data.NestedField(inputs_char_nesting)

        inputs_case = data.Field(
            batch_first=True,
            fix_length=opt.maxlen,
            preprocessing=data.Pipeline(lambda w: self.getCasing(w)))

        labels = data.Field(batch_first=True,
                            unk_token=None,
                            fix_length=opt.maxlen)  # pad_token=None,
        # preprocessing=data.Pipeline(lambda w: labels_map[w]))

        id = data.Field(batch_first=True, use_vocab=False)

        self.fields = ([(('inputs_word', 'inputs_char', 'inputs_case'),
                         (inputs_word, inputs_char, inputs_case))] +
                       [('labels', labels) if label == tag_type else
                        (None, None) for label in ['ner']] + [('id', id)])

        # Load the data
        datafile = NERDataset.splits(path='.',
                                     train=train_file,
                                     separator='\t',
                                     encoding='utf-8',
                                     fields=tuple(self.fields))[0]

        self.train, self.val, self.test = datafile.split(
            split_ratio=[5610, 1000, 1000])
        return inputs_word, inputs_char, inputs_case, labels
Example #18
    def __init__(self,
                 text_field,
                 label_field,
                 path=None,
                 examples=None,
                 **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)

        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'),
                      errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields)
                    for line in f
                ]
            with open(os.path.join(path, 'rt-polarity.pos'),
                      errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields)
                    for line in f
                ]
            print('examples---0')
            print(len(examples))
        super(MR, self).__init__(examples, fields, **kwargs)
Example #19
def make_amazon(batch_size,
                device=-1,
                vectors=None,
                base_path="",
                suffix="",
                extrasuffix="",
                domain="",
                oodname="",
                topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    TOPICS = data.Field(sequential=True,
                        use_vocab=False,
                        preprocessing=data.Pipeline(lambda x: float(x)),
                        tensor_type=torch.cuda.FloatTensor,
                        batch_first=True)
    if not topics:
        train = data.TabularDataset(path=base_path + "/" + domain +
                                    ".train.lower.tok" + suffix + extrasuffix +
                                    ".txt",
                                    format="tsv",
                                    fields=[('text', TEXT), ('label', LABEL)])
    else:
        train = data.TabularDataset(path=base_path + "/" + domain +
                                    ".train.lower.tok" + suffix + extrasuffix +
                                    ".txt",
                                    format="tsv",
                                    fields=[('text', TEXT), ('label', LABEL),
                                            ('topics', TOPICS)])
    val = data.TabularDataset(path=base_path + "/" + domain +
                              ".valid.lower.tok" + suffix + ".txt",
                              format="tsv",
                              fields=[('text', TEXT), ('label', LABEL)])
    test = data.TabularDataset(path=base_path + "/" + domain +
                               ".test.lower.tok" + suffix + ".txt",
                               format="tsv",
                               fields=[('text', TEXT), ('label', LABEL)])
    oodnames = oodname.split(",")
    outdomain_test = []
    for oodname in oodnames:
        outdomain_test.append(
            data.TabularDataset(path=base_path + "/" + oodname +
                                ".test.lower.tok" + suffix + ".txt",
                                format="tsv",
                                fields=[('text', TEXT), ('label', LABEL)]))

    # train, test = datasets.REDDIT.splits(TEXT, LABEL)
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    all_iters = data.BucketIterator.splits(
        tuple([train, val, test] + outdomain_test),
        batch_sizes=tuple([batch_size] * (3 + len(outdomain_test))),
        device=device,
        repeat=False,
        sort_key=lambda x: len(x.text))
    # train_iter, val_iter, test_iter, outdomain_test_iters
    return all_iters, TEXT, LABEL, TOPICS
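
A sketch of how the tuple returned by make_amazon might be unpacked, with placeholder paths, domain names and batch size; the first three iterators are train/valid/test, followed by one test iterator per comma-separated out-of-domain name.

all_iters, TEXT, LABEL, TOPICS = make_amazon(
    batch_size=32, device=-1, vectors=None,
    base_path='data/amazon', suffix='', extrasuffix='',
    domain='books', oodname='dvd,electronics', topics=False)

train_iter, val_iter, test_iter = all_iters[:3]
ood_test_iters = all_iters[3:]   # one iterator per out-of-domain test set
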
Example #20
def preprocess(which_task, train_file, val_file, test_file, max_vocab_size=MAX_VOCAB_SIZE):
    '''
    Load data and preprocess:
    - apply tokenization
    - one hot encode labels
    - build embeddings

    Takes:
    - string denoting which field is label ("response" or "product")
    - filename of training data csv
    - filename of validation csv
    - filename of testing csv
    - max vocab size
    Returns:
    - train, validation, and test dataset objects
    '''

    if which_task not in ["response", "product"]:
        print("preprocessing error: which field is the label?")
        raise ValueError

    # define text field objects with tokenization
    TEXT = data.Field(sequential=True, tokenize=util.tokenize, lower=True)

    # define label field with one hot encoded labels
    if which_task == "response":
        OneHotEncoder = data.Pipeline(convert_token=util.one_hot_encode_response)
        LABEL = data.LabelField(sequential=False, use_vocab=False, preprocessing=OneHotEncoder)
    else:
        OneHotEncoder = data.Pipeline(convert_token=util.one_hot_encode_product)
        LABEL = data.LabelField(sequential=False, use_vocab=False, preprocessing=OneHotEncoder)

    # create dataset objects
    train_data = load_and_tokenize_data(train_file, TEXT, LABEL, which_task)
    valid_data = load_and_tokenize_data(val_file, TEXT, LABEL, which_task)
    test_data = load_and_tokenize_data(test_file, TEXT, LABEL, which_task)

    # create embeddings from training data
    TEXT.build_vocab(train_data, max_size=max_vocab_size)
    LABEL.build_vocab(train_data)
    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
    print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

    return train_data, valid_data, test_data
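
A usage sketch for preprocess(), with placeholder CSV names; the returned objects are torchtext Datasets, so iterators still need to be built before training (the sort_key assumes load_and_tokenize_data registers the text column under the name 'text').

train_data, valid_data, test_data = preprocess(
    'response', 'train.csv', 'valid.csv', 'test.csv')

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),   # assumes the text field is named 'text'
    sort_within_batch=True)
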
Example #21
    def test_composition(self):
        id_pipeline = data.Pipeline()
        pipeline = data.Pipeline(TestPipeline.repeat_n)
        pipeline.add_before(id_pipeline)
        pipeline.add_after(id_pipeline)
        pipeline.add_before(six.text_type.lower)
        pipeline.add_after(six.text_type.capitalize)

        other_pipeline = data.Pipeline(six.text_type.swapcase)
        other_pipeline.add_before(pipeline)

        # Assert pipeline gives proper results after composition
        # (test that we aren't modifying the pipes member)
        assert pipeline("teST") == "Testtesttest"
        assert pipeline(["ElE1", "eLe2"]) == ["Ele1ele1ele1", "Ele2ele2ele2"]

        # Assert pipeline that we added to gives proper results
        assert other_pipeline("teST") == "tESTTESTTEST"
        assert other_pipeline(["ElE1", "eLe2"]) == ["eLE1ELE1ELE1", "eLE2ELE2ELE2"]
Example #22
def get_input_processor_words(vocab_word,
                              vocab_char=None,
                              convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required during
    inference.
    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word 
                              character vocabulary
        convert_digits: If True will convert numbers to single 0's
    """
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list,
                                         init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char

        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]

    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn
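
A usage sketch for get_input_processor_words, assuming a word vocabulary saved during training (the pickle path is a placeholder); the returned Batch exposes the processed inputs under the same names as the fields.

import pickle

# Placeholder path: the Vocab object would normally be restored from a checkpoint.
with open('vocab_word.pkl', 'rb') as f:
    vocab_word = pickle.load(f)

process = get_input_processor_words(vocab_word, vocab_char=None, convert_digits=True)
batch = process(["Barack Obama visited Paris in 2016",
                 "Both sentences end up in the same batch"])
word_ids = batch.inputs_word   # LongTensor of word ids, with digit tokens folded to '0'
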
Example #23
    def __init__(self,
                 text_field,
                 label_field,
                 datas,
                 examples=None,
                 **kwargs):
        """ Create own dataset instance given a path and fields.

            Arguments:
                text_field: The field that will be used for text data.
                label_field: The field that will be used for label data.
                datas: Raw data, an iterable of dicts with 'text' and 'label' keys.
                examples: The examples contain all the data.
                Remaining keyword arguments: Passed to the constructor of
                    data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """

            #string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            #string = re.sub(r"\'s", " \'s", string)
            #string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            # string = re.sub(r",", " , ", string)
            # string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            # string = re.sub(r"\s{2,}", " ", string)
            #print("in clean",string.strip())
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            examples = []
            # for weibo in datas:
            #     formdat = [str(weibo['text']),str(weibo['label'])]
            #     print("formdat",formdat)
            #     exam = list(data.Example.fromlist(formdat, fields))
            #     examples += exam

            examples += [
                data.Example.fromlist([weibo['text'], weibo['label']], fields)
                for weibo in datas
            ]
            print("in examples", len(examples))
        super(mydata, self).__init__(examples, fields, **kwargs)
Example #24
def make_rt_gender(batch_size,
                   base_path,
                   train_file,
                   valid_file,
                   test_file,
                   device=-1,
                   vectors=None,
                   topics=False):
    TEXT = data.Field(include_lengths=True, lower=True)
    LABEL = data.LabelField()
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    if topics:
        TOPICS = data.Field(sequential=True,
                            use_vocab=False,
                            preprocessing=data.Pipeline(lambda x: float(x)),
                            tensor_type=torch.cuda.FloatTensor,
                            batch_first=True)
        train = data.TabularDataset(path=os.path.join(base_path, train_file),
                                    format="tsv",
                                    fields=[('index', INDEX), ('text', TEXT),
                                            ('label', LABEL),
                                            ('topics', TOPICS)])
    else:
        train = data.TabularDataset(path=os.path.join(base_path, train_file),
                                    format="tsv",
                                    fields=[('index', INDEX), ('text', TEXT),
                                            ('label', LABEL)])
        train.examples = train.examples[0:10]  # NOTE: truncates the training set to 10 examples (likely a debugging leftover)

    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    LABEL.build_vocab(train)
    print(LABEL.vocab.stoi)

    val = data.TabularDataset(path=os.path.join(base_path, valid_file),
                              format="tsv",
                              fields=[('index', INDEX), ('text', TEXT),
                                      ('label', LABEL)])
    test = data.TabularDataset(path=os.path.join(base_path, test_file),
                               format="tsv",
                               fields=[('index', INDEX), ('text', TEXT),
                                       ('label', LABEL)])

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test),
        batch_sizes=(batch_size, 256, 256),
        device=device,
        repeat=False,
        sort_key=lambda x: len(x.text))

    if topics:
        return (train_iter, val_iter, test_iter), TEXT, LABEL, TOPICS, INDEX
    else:
        return (train_iter, val_iter, test_iter), TEXT, LABEL, INDEX
Example #25
def imdb(text_field, label_field, **kargs):
    text_field.preprocessing = data.Pipeline(clean_str)
    train_data, test_data = datasets.IMDB.splits(text_field, label_field)
    train_data, dev_data = train_data.split(random_state=random.seed(SEED))
    text_field.build_vocab(train_data, dev_data, test_data)
    label_field.build_vocab(train_data, dev_data, test_data)
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train_data, dev_data, test_data),
        batch_sizes=(args.batch_size, len(dev_data), len(test_data)),
        **kargs)
    return train_iter, dev_iter, test_iter
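
A sketch of how the imdb helper above might be driven. The original snippet relies on module-level names (args.batch_size, SEED and clean_str), which are stubbed or assumed here on the assumption that the sketch lives in the same module; extra keyword arguments are forwarded to BucketIterator.splits.

import argparse
import random
from torchtext import data   # legacy API (torchtext <= 0.8 or torchtext.legacy)

SEED = 1
random.seed(SEED)
args = argparse.Namespace(batch_size=64)   # stub for the module-level args object
# clean_str must also be defined at module level (see the MR examples above).

text_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
train_iter, dev_iter, test_iter = imdb(text_field, label_field, device=-1)
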
Example #26
def get_loader(batch_size=100, max_size=20000, is_train=True, data_dir=None):

    text_field = data.Field(tokenize=tokenizer, sequential=True)
    label_field = data.Field(sequential=False, use_vocab=False,
                             postprocessing=data.Pipeline(postprocess))

    train_file_path = Path(data_dir).joinpath('naver_train.txt')
    test_file_path = Path(data_dir).joinpath('naver_test.txt')

    train_dataset = data.TabularDataset(
        path=train_file_path,
        format='tsv',
        fields=[
            ('id', None),
            ('text', text_field),
            ('label', label_field)
        ],
        filter_pred=filter_pred)

    print('Building Vocabulary \n')
    text_field.build_vocab(train_dataset, max_size=max_size - 2)

    if is_train:
        loader = data.Iterator(
            dataset=train_dataset,
            batch_size=batch_size,
            sort_key=lambda x: len(x.text),
            train=True,  # if training set => repeat and shuffle : True
            repeat=False,
            device=-1  # CPU: -1
        )
        # vocab = text_field.vocab
        # with open('./vocab.pkl', 'wb') as f:
        #     pickle.dump(vocab, f)

    else:
        test_dataset = data.TabularDataset(
            path=test_file_path,
            format='tsv',
            fields=[
                ('id', None),
                ('text', text_field),
                ('label', label_field)
            ],
            filter_pred=filter_pred)

        loader = data.Iterator(
            dataset=test_dataset,
            batch_size=batch_size,
            sort=False,
            train=False,
            device=-1)

    return loader
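
A usage sketch for get_loader, assuming the tokenizer, postprocess and filter_pred helpers referenced above exist at module level and that data_dir contains the Naver train/test TSV files.

train_loader = get_loader(batch_size=100, max_size=20000,
                          is_train=True, data_dir='data/naver')

for batch in train_loader:
    tokens = batch.text    # (seq_len, batch) LongTensor over the built vocabulary
    labels = batch.label   # integers produced by the postprocess pipeline (use_vocab=False)
    break
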
Example #27
    def __init__(self, args):
        if not args.cuda:
            args.gpu = -1
        if torch.cuda.is_available() and args.cuda:
            print("Note: You are using GPU for training")
            torch.cuda.set_device(args.gpu)
            torch.cuda.manual_seed(args.seed)
        if torch.cuda.is_available() and not args.cuda:
            print(
                "Warning: You have Cuda but do not use it. You are using CPU for training"
            )

        torch.manual_seed(args.seed)
        np.random.seed(args.seed)
        random.seed(args.seed)

        self.QID = data.Field(sequential=False)
        self.QUESTION = data.Field(batch_first=True)
        self.ANSWER = data.Field(batch_first=True)
        self.LABEL = data.Field(sequential=False)
        self.EXTERNAL = data.Field(
            sequential=True,
            dtype=torch.FloatTensor,
            batch_first=True,
            use_vocab=False,
            postprocessing=data.Pipeline(
                lambda arr, _, train: [float(y) for y in arr]))

        if 'TrecQA' in args.dataset:
            train, dev, test = TrecDataset.splits(self.QID, self.QUESTION,
                                                  self.ANSWER, self.EXTERNAL,
                                                  self.LABEL)
        elif 'WikiQA' in args.dataset:
            train, dev, test = WikiDataset.splits(self.QID, self.QUESTION,
                                                  self.ANSWER, self.EXTERNAL,
                                                  self.LABEL)
        else:
            print("Unsupported dataset")
            exit()

        self.QID.build_vocab(train, dev, test)
        self.QUESTION.build_vocab(train, dev, test)
        self.ANSWER.build_vocab(train, dev, test)
        self.LABEL.build_vocab(train, dev, test)

        if args.cuda:
            self.model = torch.load(
                args.model,
                map_location=lambda storage, location: storage.cuda(args.gpu))
        else:
            self.model = torch.load(
                args.model, map_location=lambda storage, location: storage)

        self.gpu = args.gpu
Example #28
    def __init__(self,
                 text_field,
                 label_field,
                 path=None,
                 examples=None,
                 **kwargs):
        """Create a Legal Sentences dataset.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            #string = re.sub(r"\'s", " \'s", string)
            #string = re.sub(r"\'ve", " \'ve", string)
            #string = re.sub(r"n\'t", " n\'t", string)
            #string = re.sub(r"\'re", " \'re", string)
            #string = re.sub(r"\'d", " \'d", string)
            #string = re.sub(r"\'ll", " \'ll", string)

            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []

            df = pd.read_excel(os.path.join(path, 'training_data.xlsx'))
            #df = pd.read_excel(os.path.join(path, 'Quantum1.xlsx'), nrows=6e4)
            df = df.sample(frac=1)
            #examples += [data.Example.fromlist([str(line), str(target)], fields) for line,target in zip(df.Fallos,df.Grupo)]
            examples += [
                data.Example.fromlist([str(line), str(target)], fields)
                for line, target in zip(df.fallo, df.grupo)
            ]

        super(Legal, self).__init__(examples, fields, **kwargs)
Example #29
 def __init__(self, dim=0, **kwargs):
     super().__init__(
         use_vocab=False,
         batch_first=True,
         tokenize=FloatVectorField._parse_vector,
         dtype=torch.float,
         preprocessing=textdata.Pipeline(
             float
         ),  # Convert each string to float. float() takes care of whitespace.
         fix_length=dim,
         pad_token=0,  # For irregular sized vectors, pad the missing units with 0s.
     )
Example #30
    def __init__(self, text_field, label_field, path=None, file=None, examples=None, **kwargs):
        """
        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            char_data: The char level to solve
            Remaining keyword arguments: Passed to the constructor of data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)

            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = os.path.join(path, file)
            examples = []
            with open(path, encoding="utf-8") as f:
                a, b = 0, 0
                for line in f:
                    # sentence, flag = line.strip().split(' ||| ')
                    # print(line)
                    label, seq, sentence = line.partition(" ")
                    # clear string in every sentence
                    sentence = clean_str(sentence)
                    if label == '0':
                        a += 1
                        examples += [data.Example.fromlist([sentence, 'negative'], fields=fields)]
                    elif label == '1':
                        b += 1
                        examples += [data.Example.fromlist([sentence, 'positive'], fields=fields)]
                print("negative sentence a {}, positive sentence b {} ".format(a, b))
        super(CV, self).__init__(examples, fields, **kwargs)