Example no. 1
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.grouped_data = grouped_data
        self.sampler = KeySampler(self.grouped_data)
Example no. 2
    def __init__(self, topic, max_sequence, vocab_filename):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.topic = topic

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        self.encoder = SubwordTextEncoder(voca_path)
        self.max_sequence = max_sequence
Example no. 3
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15
        self.grouped_data = grouped_data
        self.train_group = None
        self.test_group = None
        self.test_sampler = None
        self.train_sampler = None
Example no. 4
    def __init__(self, topic, max_sequence, vocab_filename):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.topic = topic

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        if "ST" in vocab_filename:
            self.encoder = TokenTextEncoder(voca_path, replace_oov=SPEC_4)
        else:
            self.encoder = SubwordTextEncoder(voca_path)
        self.max_sequence = max_sequence
Example no. 5
class AuxPairLoader:
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.grouped_data = grouped_data
        self.sampler = KeySampler(self.grouped_data)

    def encode(self, sent):
        tokens = self.encoder.encode(sent)
        pad_len = self.seq_length - len(tokens)
        return tokens + pad_len * [PAD_ID]

    def case_encoder(self, pair):
        # pair : (sent1, sent2), both raw text strings
        sent1, sent2 = pair
        sent1_enc = slice_n_pad(self.encode(sent1), self.seq_length, PAD_ID)
        sent2_enc = slice_n_pad(self.encode(sent2), self.seq_length, PAD_ID)
        return [(sent1_enc, sent2_enc)]

    def get_insts(self, data_size):
        sent_pairs = pos_sampling(self.grouped_data, self.sampler, data_size)
        return [self.case_encoder(p) for p in sent_pairs]
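
A minimal usage sketch for AuxPairLoader follows. Everything here is hypothetical: SimpleNamespace stands in for the (unshown) shared-setting object, which only needs vocab_filename and vocab_size, the vocab file is assumed to exist under data_path, and pos_sampling/KeySampler come from the same unshown module.

from types import SimpleNamespace

# Hypothetical driver; the vocab filename and grouped data are toy
# placeholders, not values from the original project.
setting = SimpleNamespace(vocab_filename="shared.subword", vocab_size=32000)
grouped = {
    "author_a": ["first sentence by a", "second sentence by a"],
    "author_b": ["a sentence by b", "another sentence by b"],
}

loader = AuxPairLoader(seq_length=64, shared_setting=setting,
                       grouped_data=grouped)
insts = loader.get_insts(data_size=100)  # 100 encoded positive pairs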
Example no. 6
    def __init__(self, max_sequence, vocab_filename, voca_size, is_span):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.max_seq = max_sequence

        self.question = [
            "What is title of the controversy?",
            "What is the controversy about?"
        ]
        self.q_id = 1 if is_span else 0
        self.is_span = is_span
        # +2 presumably accounts for the special tokens around the question.
        self.text_offset = len(self.encoder.encode(
            self.question[self.q_id])) + 2

        data = load_annotation()
        self.all_data = self.generate_data(data)
        self.train_data, self.dev_data = self.held_out(self.all_data)
Example no. 7
    def __init__(self,
                 max_sequence,
                 vocab_filename,
                 voca_size,
                 using_alt_tokenizer=False):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.dev_file = os.path.join(corpus_dir, "dev.txt")
        self.test_file = os.path.join(corpus_dir, "test.txt")
        self.max_seq = max_sequence
        self.voca_size = voca_size
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        if not using_alt_tokenizer:
            self.encoder = SubwordTextEncoder(voca_path)
            self.sep_char = "_"
            self.lower_case = False
        else:
            self.lower_case = True
            self.sep_char = "#"
            self.encoder = FullTokenizerWarpper(voca_path)
Example no. 8
def avg_token_length():
    s = "atheism"
    cont_list = tweet_reader.load_as_text_chunk(s)
    voca_path = os.path.join(data_path, Tweets2Stance.vocab_filename)

    encoder = SubwordTextEncoder(voca_path)

    n = 0
    histogram = Counter()
    for sent in cont_list:
        tokens = encoder.encode(sent)
        histogram[len(tokens)] += 1

        n += 1
        if n > 1000:
            break

    # Print the cumulative count of sentences with at most i tokens.
    accum = 0
    for i in range(100):
        accum += histogram[i]
        print("{} : {}".format(i, accum))
Example no. 9
class DataLoader:
    def __init__(self, seq_length, shared_setting):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15

    def token_generator(self, reader):
        buf = []
        for line in reader:
            tokens = self.encoder.encode(line)

            buf.extend(tokens)

            # Use a while-loop so a long line can emit every full-length
            # chunk it contains, not just the first.
            while len(buf) > self.seq_length:
                yield buf[:self.seq_length]
                buf = buf[self.seq_length:]

    def case_generator(self, reader):
        sents = self.token_generator(reader)
        random.seed(0)

        n_delete = int(self.seq_length * self.mask_rate)
        for sent in sents:
            delete_indice = random.sample(range(self.seq_length), n_delete)
            x = list(sent)
            for idx in delete_indice:
                action = random.randrange(0, 10)
                if action < 8:
                    # 80% of the time: replace with the mask token
                    x[idx] = C_MASK_ID
                elif action == 8:
                    # 10%: replace with a random vocabulary token
                    rand_char = random.randrange(0, self.voca_size)
                    x[idx] = rand_char
                else:
                    # 10%: keep the original token
                    pass
            y = list(sent)
            yield x, y

    # A child class will feed its own text to case_generator
    # and return a generator of (x, y) tuples.
    def get_train_generator(self):
        raise NotImplementedError()

    def get_test_generator(self):
        raise NotImplementedError()
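
The corruption loop above implements the BERT-style 80/10/10 rule: of the selected positions, 80% become the mask token, 10% become a random token, and 10% stay unchanged. A self-contained sketch of just that rule, with placeholder ids standing in for the project's C_MASK_ID and vocabulary size:

import random

MASK_ID = 4          # placeholder for C_MASK_ID
VOCAB_SIZE = 32000   # placeholder vocabulary size

def corrupt(tokens, mask_rate=0.15):
    # Returns (corrupted input, original target).
    x = list(tokens)
    n_delete = int(len(tokens) * mask_rate)
    for idx in random.sample(range(len(tokens)), n_delete):
        action = random.randrange(10)
        if action < 8:
            x[idx] = MASK_ID                       # 80%: mask
        elif action == 8:
            x[idx] = random.randrange(VOCAB_SIZE)  # 10%: random token
        # else: 10%: keep the original token
    return x, list(tokens)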
Example no. 10
class DataLoader:
    def __init__(self, topic, max_sequence, vocab_filename):
        self.train_data = None
        self.dev_data = None
        self.test_data = None
        self.topic = topic

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        if "ST" in vocab_filename:
            self.encoder = TokenTextEncoder(voca_path, replace_oov=SPEC_4)
        else:
            self.encoder = SubwordTextEncoder(voca_path)
        self.max_sequence = max_sequence

    def example_generator(self, corpus_path, topic):
        select_target = dict_topic2full_desc[topic]
        label_list = stance_label
        with open(corpus_path, "r", encoding="utf-8", errors="ignore") as f:
            reader = csv.reader(f, delimiter=',')
            for idx, row in enumerate(reader):
                if idx == 0:
                    continue  # skip header
                # Works for both splits even though dev has some extra human labels.
                sent = row[0]
                target = row[1]
                label = label_list.index(row[2])
                if select_target is None or target in select_target:
                    yield {"inputs": sent, "label": label}

    def load_train_data(self):
        path = os.path.join(corpus_dir, "train.csv")
        plain_data = list(self.example_generator(path, self.topic))
        random.shuffle(plain_data)

        # 90/10 split into train and dev
        train_size = int(0.9 * len(plain_data))
        self.train_data_raw = plain_data[:train_size]
        self.dev_data_raw = plain_data[train_size:]

        # Materialize the encode() generators so repeated get_* calls do
        # not consume an already-exhausted iterator.
        self.train_data = list(self.encode(self.train_data_raw))
        self.dev_data = list(self.encode(self.dev_data_raw))

    def load_test_data(self):
        path = os.path.join(corpus_dir, "test.csv")
        self.test_data_raw = list(self.example_generator(path, self.topic))
        self.test_data = list(self.encode(self.test_data_raw))

    @classmethod
    def dict2tuple(cls, data):
        X = []
        Y = []
        for entry in data:
            X.append(entry["inputs"])
            Y.append(entry["label"])

        return X, Y

    def get_train_data(self):
        if self.train_data is None:
            self.load_train_data()

        return self.dict2tuple(self.train_data)

    def get_dev_data(self):
        if self.dev_data is None:
            self.load_train_data()

        return self.dict2tuple(self.dev_data)

    def get_test_data(self):
        if self.test_data is None:
            self.load_test_data()

        return self.dict2tuple(self.test_data)

    def encode(self, plain_data):
        for entry in plain_data:
            key = "inputs"
            coded_text = [CLS_ID] + self.encoder.encode(
                entry[key])[:self.max_sequence - 1]
            pad = (self.max_sequence - len(coded_text)) * [text_encoder.PAD_ID]
            entry[key] = coded_text + pad
            yield entry
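
A hypothetical call sequence for this DataLoader; the topic and vocab filename are placeholder values, and corpus_dir/data_path must already point at the expected files.

# Hypothetical usage; "atheism" and "shared_voca.txt" are placeholders.
loader = DataLoader(topic="atheism", max_sequence=128,
                    vocab_filename="shared_voca.txt")

train_x, train_y = loader.get_train_data()  # triggers load_train_data()
dev_x, dev_y = loader.get_dev_data()        # reuses the same 90/10 split
test_x, test_y = loader.get_test_data()     # reads test.csv on demand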
Example no. 11
    def __init__(self, seq_length, shared_setting):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15
Example no. 12
class PairDataLoader:
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.mask_rate = 0.15
        self.grouped_data = grouped_data
        self.train_group = None
        self.test_group = None
        self.test_sampler = None
        self.train_sampler = None

    @classmethod
    def load_from_pickle(cls, id):
        pickle_name = "PairDataLoader_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        with open(path, "rb") as f:
            return pickle.load(f)

    def save_to_pickle(self, id):
        pickle_name = "PairDataLoader_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def encode(self, sent):
        tokens = self.encoder.encode(sent)
        pad_len = self.seq_length - len(tokens)
        return tokens + pad_len * [PAD_ID]

    def delete(self, sent):
        n_delete = int(self.seq_length * self.mask_rate)
        delete_indice = random.sample(range(self.seq_length), n_delete)
        x = list(sent)
        y = list(sent)
        for idx in delete_indice:
            action = random.randrange(0, 10)
            if action < 8:
                # 80% of the time: replace with the mask token
                x[idx] = C_MASK_ID
            elif action == 8:
                # 10%: replace with a random vocabulary token
                rand_char = random.randrange(0, self.voca_size)
                x[idx] = rand_char
            else:
                # 10%: keep the original token
                pass
        return x, y

    def case_encoder(self, plain_insts):
        # plain_insts : iterable of (sent1, sent2, label) triples,
        # where sent1/sent2 are raw text and label is an int
        for sent1, sent2, label in plain_insts:
            sent1_enc = slice_n_pad(self.encode(sent1), self.seq_length,
                                    PAD_ID)
            sent2_enc = slice_n_pad(self.encode(sent2), self.seq_length,
                                    PAD_ID)

            sent1_del, y_1 = self.delete(sent1_enc)
            sent2_del, y_2 = self.delete(sent2_enc)
            x = sent1_del + [SEP_ID] + sent2_del
            y_seq = y_1 + [0] + y_2
            y_cls = label
            yield x, y_seq, y_cls

    @staticmethod
    def split_dict(d, held_out_size):
        keys = list(d.keys())
        indice = random.sample(range(0, len(keys)), held_out_size)
        # Use a set for O(1) membership tests in the loop below.
        held_out_keys = {keys[i] for i in indice}

        train_d = {}
        test_d = {}
        for key, items in d.items():
            if key in held_out_keys:
                test_d[key] = items
            else:
                train_d[key] = items
        return train_d, test_d

    def index_data(self):
        if self.test_group is None:
            self.split_train_test()

    def split_train_test(self):
        print("split_train_test 1")
        held_out_group = 4000
        self.train_group, self.test_group = self.split_dict(
            self.grouped_data, held_out_group)
        print("split_train_test 2")

        self.test_sampler = KeySampler(self.test_group)
        print("split_train_test 3")
        self.train_sampler = KeySampler(self.train_group)
        print("split_train_test 4")

    # A child class will feed its own text to the case encoder
    # and return a generator of (x, y) tuples.
    def get_train_batch(self, data_size):
        if self.train_group is None:
            self.split_train_test()
        train_generator = self.case_encoder(
            pos_neg_pair_sampling(self.train_group, self.train_sampler,
                                  data_size))
        return train_generator

    def get_test_generator(self, data_size):
        if self.test_group is None:
            self.split_train_test()
        test_generator = self.case_encoder(
            pos_neg_pair_sampling(self.test_group, self.test_sampler,
                                  data_size))
        return test_generator
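
A sketch of how PairDataLoader might be driven, under the same kind of assumptions as above: build_grouped_data is a hypothetical helper returning a dict of key -> list of raw sentences, and split_train_test holds out 4000 keys, so the real grouped data must contain more groups than that.

from types import SimpleNamespace

setting = SimpleNamespace(vocab_filename="shared.subword", vocab_size=32000)
grouped = build_grouped_data()  # hypothetical helper, not in the snippet

loader = PairDataLoader(seq_length=100, shared_setting=setting,
                        grouped_data=grouped)
for x, y_seq, y_cls in loader.get_train_batch(data_size=8):
    # x     : corrupted sent1 + [SEP] + corrupted sent2
    # y_seq : the uncorrupted token sequence (reconstruction target)
    # y_cls : the pair label from pos_neg_pair_sampling
    pass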
Example no. 13
class AuthorAsDoc:
    def __init__(self, seq_length, shared_setting, grouped_data):
        voca_path = os.path.join(data_path, shared_setting.vocab_filename)
        self.voca_size = shared_setting.vocab_size
        self.encoder = SubwordTextEncoder(voca_path)
        self.seq_length = seq_length
        self.grouped_data = grouped_data
        self.train_group = None
        self.test_group = None
        self.test_sampler = None
        self.train_sampler = None
        self.mask_rate = 0.15

    @staticmethod
    def split_dict(d, held_out_size):
        keys = list(d.keys())
        indice = random.sample(range(0, len(keys)), held_out_size)
        # Use a set for O(1) membership tests in the loop below.
        held_out_keys = {keys[i] for i in indice}

        train_d = {}
        test_d = {}
        for key, items in d.items():
            if key in held_out_keys:
                test_d[key] = items
            else:
                train_d[key] = items
        return train_d, test_d

    def index_data(self):
        if self.test_group is None:
            self.split_train_test()

    def split_train_test(self):
        print("split_train_test 1")
        held_out_group = 4000
        self.train_group, self.test_group = self.split_dict(
            self.grouped_data, held_out_group)
        print("split_train_test 2")

        self.test_sampler = KeySampler(self.test_group)
        print("split_train_test 3")
        self.train_sampler = KeySampler(self.train_group)
        print("split_train_test 4")

    @classmethod
    def load_from_pickle(cls, id):
        pickle_name = "AuthorAsDoc_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        with open(path, "rb") as f:
            return pickle.load(f)

    def save_to_pickle(self, id):
        pickle_name = "AuthorAsDoc_{}".format(id)
        path = os.path.join(cache_path, pickle_name)
        with open(path, "wb") as f:
            pickle.dump(self, f)

    def encode(self, sent):
        tokens = self.encoder.encode(sent)
        return tokens + [SEP_ID]

    def delete_alter(self, sent):
        n_delete = int(self.seq_length * self.mask_rate)
        delete_indice = random.sample(range(self.seq_length), n_delete)
        x = list(sent)
        for idx in delete_indice:
            action = random.randrange(0, 10)
            if action < 8:
                # 80% of the time: replace with the mask token
                x[idx] = C_MASK_ID
            elif action == 8:
                # 10%: replace with a random vocabulary token
                rand_char = random.randrange(0, self.voca_size)
                x[idx] = rand_char
            else:
                # 10%: keep the original token
                pass
        y = list(sent)
        return x, y

    def get_instances(self, grouped_dict, key_sampler, data_size):
        data = []
        for i in range(data_size):
            key = key_sampler.sample2()
            items = grouped_dict[key]
            seq = []
            # Start the circular walk at the sampled offset; the loop below
            # stops after wrapping back around to j_init.
            j_init = random.randint(0, len(items) - 1)
            j = j_init
            while len(seq) < self.seq_length:
                sent = self.encode(items[j])
                if len(seq) + len(sent) > self.seq_length:
                    break
                seq += sent
                j = increment_circular(j, len(items))
                if j == j_init:
                    break

            seq = slice_n_pad(seq, self.seq_length, PAD_ID)
            data.append(self.delete_alter(seq))
        return data

    def get_train_instances(self, data_size):
        return self.get_instances(self.train_group, self.train_sampler,
                                  data_size)

    def get_test_instances(self, data_size):
        return self.get_instances(self.test_group, self.test_sampler,
                                  data_size)
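
For completeness, a hypothetical driver for AuthorAsDoc in the same style; setting and build_grouped_data are the assumed stand-ins from the previous sketch, and each returned instance is an (x, y) pair produced by delete_alter.

loader = AuthorAsDoc(seq_length=256, shared_setting=setting,
                     grouped_data=build_grouped_data())
loader.index_data()  # performs the 4000-key train/test split
train_insts = loader.get_train_instances(1000)
test_insts = loader.get_test_instances(100)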