Example #1
    def __init__(self,
                 max_sequence,
                 vocab_filename,
                 voca_size,
                 using_alt_tokenizer=False):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.dev_file = os.path.join(corpus_dir, "dev.txt")
        self.test_file = os.path.join(corpus_dir, "test.txt")
        self.max_seq = max_sequence
        self.voca_size = voca_size
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        if not using_alt_tokenizer:
            self.encoder = SubwordTextEncoder(voca_path)
            self.sep_char = "_"
            self.lower_case = False
        else:
            self.lower_case = True
            self.sep_char = "#"
            self.encoder = FullTokenizerWarpper(voca_path)
Example #2
    def __init__(self, max_sequence, vocab_filename, voca_size, is_span):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        self.encoder = SubwordTextEncoder(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.max_seq = max_sequence

        self.question = [
            "What is title of the controversy?",
            "What is the controversy about?"
        ]
        if not is_span:
            self.q_id = 0
        else:
            self.q_id = 1
        self.is_span = is_span
        self.text_offset = len(self.encoder.encode(
            self.question[self.q_id])) + 2

        data = load_annotation()
        self.all_data = self.generate_data(data)
        self.train_data, self.dev_data = self.held_out(self.all_data)
Example #3
def dataset_stat():
    ams_X, ams_Y = amsterdam.get_dev_data(False)
    clue_X, clue_Y = controversy.load_clueweb_testset()
    guardian_X, guardian_Y = controversy.load_guardian()

    vocab_size = 30522
    vocab_filename = "bert_voca.txt"
    voca_path = os.path.join(data_path, vocab_filename)
    encoder = FullTokenizerWarpper(voca_path)
    test_sets = []
    test_sets.append(("Ams18", [ams_X, ams_Y]))
    test_sets.append(("Clueweb", [clue_X, clue_Y]))
    test_sets.append(("Guardian", [guardian_X, guardian_Y]))

    for set_name, test_set in test_sets:
        dev_X, dev_Y = test_set
        num_over_size = 0
        length_list = []
        for doc in dev_X:
            tokens = encoder.encode(doc)
            if len(tokens) > 200:
                num_over_size += 1
            length_list.append(len(tokens))

        print("{0} {1:.03f} {2:.03f}".format(set_name,
                                             num_over_size / len(dev_X),
                                             average(length_list)))
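
dataset_stat() above also calls an average() helper that none of these examples define. A minimal sketch of what it might look like (the definition below is an assumption, not taken from the source):

def average(values):
    # Hypothetical helper: arithmetic mean of a list of token lengths.
    return sum(values) / len(values) if values else 0.0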
Example #4
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.mscore = read_mscore_valid()
        self.mscore_dict = dict(self.mscore)
        self.train_topics, self.dev_topics = self.held_out(left(self.mscore))

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.client = TextReaderClient()

        class UniformSampler:
            def __init__(self, topics):
                self.sample_space = topics

            def sample(self):
                return random.sample(self.sample_space, 2)


        class BiasSampler:
            def __init__(self, topics, score_dict):
                self.sample_space = []
                self.sample_group = dict()

                def score2key(score):
                    return int(math.log(score+1, 1.1))

                for topic in topics:
                    key = score2key(score_dict[topic])
                    if key not in self.sample_group:
                        self.sample_group[key] = []
                    self.sample_group[key].append(topic)

                self.sample_space = list(self.sample_group.keys())


            # Sample one topic from each of two distinct score groups
            def sample(self):
                def pick1(l):
                    return l[random.randrange(len(l))]

                g1, g2 = random.sample(self.sample_space, 2)
                t1 = pick1(self.sample_group[g1])
                t2 = pick1(self.sample_group[g2])
                return t1, t2

        self.train_sampler = BiasSampler(self.train_topics, self.mscore_dict)
        self.dev_sampler = BiasSampler(self.dev_topics, self.mscore_dict)
Example #5
    def __init__(self, max_sequence, vocab_filename):
        self.max_seq = max_sequence
        voca_path = os.path.join(data_path, vocab_filename)
        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.all_data = list(agree.load_agree_data())
        self.dev_data = self.all_data[:200]
        self.train_data = self.all_data[200:]
Example #6
class DataLoader:
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        inst_per_query = 30
        self.generator = gen_trainable_iterator(inst_per_query)
        self.iter = iter(self.generator)
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)

    def get_train_data(self, data_size):
        assert data_size % 2 == 0
        result = []
        while len(result) < data_size:
            raw_inst = self.iter.__next__()
            result += list(self.encode_pair(raw_inst))

        return result

    def get_dev_data(self):
        result = []
        for i in range(160):
            raw_inst = self.iter.__next__()
            result += list(self.encode_pair(raw_inst))

        return result

    def encode_pair(self, instance):
        query, case1, case2 = instance

        for y, sent in [case1, case2]:
            entry = self.encode(query, sent)
            yield entry["input_ids"], entry["input_mask"], entry[
                "segment_ids"], y

    def encode(self, query, text):
        tokens_a = self.encoder.encode(query)
        tokens_b = self.encoder.encode(text)
        return self.encoder_unit.encode_inner(tokens_a, tokens_b)
Example #7
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
Example #8
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        inst_per_query = 30
        self.generator = gen_trainable_iterator(inst_per_query)
        self.iter = iter(self.generator)
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
Example #9
class DataLoader:
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)

    def get_train_data(self):
        if self.train_data is None:
            self.train_data = list(self.example_generator("train"))
        return self.train_data

    def get_dev_data(self):
        if self.dev_data is None:
            self.dev_data = list(self.example_generator("dev"))
        return self.dev_data

    def example_generator(self, split_name):
        X, Y = load_protest.load_data(split_name)
        for idx, x in enumerate(X):
            name, text = x
            l = Y[name]
            entry = self.encode(text)
            yield entry["input_ids"], entry["input_mask"], entry[
                "segment_ids"], l

    def encode(self, text):
        tokens_a = self.encoder.encode(text)
        return self.encoder_unit.encode_inner(tokens_a, [])
Example #10
class DataLoader:
    def __init__(self,
                 max_sequence,
                 vocab_filename,
                 using_alt_tokenizer=False):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.train_file = os.path.join(corpus_dir, "train.tsv")
        self.dev_file = os.path.join(corpus_dir, "dev.tsv")
        self.max_seq = max_sequence
        self.voca_size = 30522

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        self.name = "rte"
        if not using_alt_tokenizer:
            self.encoder = SubwordTextEncoder(voca_path)
            self.sep_char = "_"
            self.lower_case = False
        else:
            self.lower_case = True
            self.sep_char = "#"
            self.encoder = FullTokenizerWarpper(voca_path)

    def get_train_data(self):
        if self.train_data is None:
            self.train_data = load_cache("rte_train_cache")

        if self.train_data is None:
            self.train_data = list(self.example_generator(self.train_file))
        save_to_pickle(self.train_data, "rte_train_cache")
        return self.train_data

    def get_dev_data(self):
        if self.dev_data is None:
            self.dev_data = load_cache("rte_dev_cache")

        if self.dev_data is None:
            self.dev_data = list(self.example_generator(self.dev_file))
        save_to_pickle(self.dev_data, "rte_dev_cache")
        return self.dev_data

    def class_labels(self):
        return [
            "entailment",
            "not_entailment",
        ]

    def example_generator(self, filename):
        label_list = self.class_labels()
        for idx, line in enumerate(tf.gfile.Open(filename, "rb")):
            if idx == 0: continue  # skip header
            line = line.strip().decode("utf-8")
            split_line = line.split("\t")
            # Works for both splits even though dev has some extra human labels.
            s1, s2 = split_line[1:3]
            l = label_list.index(split_line[-1])
            entry = self.encode(s1, s2)

            yield entry["input_ids"], entry["input_mask"], entry[
                "segment_ids"], l

    def encode(self, s1, s2):
        return self.encode_pair(s1, s2)

    def encode_pair(self, text_a, text_b):
        tokens_a = self.encoder.encode(text_a)
        tokens_b = self.encoder.encode(text_b)

        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.max_seq - 3)

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append(CLS_ID)
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append(SEP_ID)
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append(SEP_ID)
            segment_ids.append(1)

        input_ids = tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_seq
        assert len(input_mask) == self.max_seq
        assert len(segment_ids) == self.max_seq

        return {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids
        }
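
encode_pair() above relies on _truncate_seq_pair(), which these examples reference but never define. A minimal sketch consistent with the in-code comment ("modifies tokens_a and tokens_b in place"), following the reference BERT implementation:

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    # Pop tokens from the longer list, one at a time, until the pair fits.
    while len(tokens_a) + len(tokens_b) > max_length:
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()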
Example #11
class DataLoader:
    def __init__(self, max_sequence, vocab_filename):
        self.max_seq = max_sequence
        voca_path = os.path.join(data_path, vocab_filename)
        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.all_data = list(agree.load_agree_data())
        self.dev_data = self.all_data[:200]
        self.train_data = self.all_data[200:]

    def gen_data(self, data):
        result = []
        for x, y in data:
            entry = self.encode(x)
            e = entry["input_ids"], entry["input_mask"], entry["segment_ids"], y
            result.append(e)
        return result

    def get_train_data(self):
        return self.gen_data(self.train_data)

    def get_dev_data(self):
        return self.gen_data(self.dev_data)

    def encode(self, text_a):
        tokens_a = self.encoder.encode(text_a)
        tokens_b = []

        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.max_seq - 3)

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append(CLS_ID)
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append(SEP_ID)
        segment_ids.append(0)

        input_ids = tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_seq
        assert len(input_mask) == self.max_seq
        assert len(segment_ids) == self.max_seq

        return {
            "input_ids": input_ids,
            "input_mask":input_mask,
            "segment_ids": segment_ids
        }
Example #12
def get_encoder():
    voca_path = os.path.join(data_path, "bert_voca.txt")
    return FullTokenizerWarpper(voca_path)
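
A possible way to use get_encoder() above, assuming data_path points at a directory containing "bert_voca.txt" and that FullTokenizerWarpper exposes the encode()/decode_list() methods seen in the other examples:

encoder = get_encoder()
token_ids = encoder.encode("Is this topic controversial?")
print(len(token_ids))                   # number of wordpiece ids
print(encoder.decode_list(token_ids))   # back to subword strings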
Example #13
class DataLoader:
    def __init__(self, max_sequence, vocab_filename, voca_size, is_span):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        self.encoder = SubwordTextEncoder(voca_path)

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.max_seq = max_sequence

        self.question = [
            "What is title of the controversy?",
            "What is the controversy about?"
        ]
        if not is_span:
            self.q_id = 0
        else:
            self.q_id = 1
        self.is_span = is_span
        self.text_offset = len(self.encoder.encode(
            self.question[self.q_id])) + 2

        data = load_annotation()
        self.all_data = self.generate_data(data)
        self.train_data, self.dev_data = self.held_out(self.all_data)

    def get_train_data(self):
        return self.train_data

    def get_dev_data(self):
        return self.dev_data

    def generate_data(self, data):
        result = []
        for entry in data:
            desc, title_indice, desc_indice = entry
            enc_entry = self.encode(desc, self.question[self.q_id])
            indice = [title_indice, desc_indice][self.q_id]

            new_indice = self.translate(desc, indice)
            if self.is_span:
                begin = np.zeros([self.max_seq], dtype=np.int32)
                end = np.zeros([self.max_seq], dtype=np.int32)
                if len(new_indice) > 0:
                    begin_idx = new_indice[0] + self.text_offset
                    end_idx = new_indice[-1] + self.text_offset
                    if begin_idx < self.max_seq:
                        begin[begin_idx] = 1
                        end[min(end_idx, self.max_seq - 1)] = 1
                        line = enc_entry['input_ids'], enc_entry[
                            'input_mask'], enc_entry['segment_ids'], begin, end
                        result.append(line)
                else:
                    begin[0] = 1
                    end[0] = 1
                    line = enc_entry['input_ids'], enc_entry[
                        'input_mask'], enc_entry['segment_ids'], begin, end
                    result.append(line)
            else:
                y = np.zeros([self.max_seq], dtype=np.int32)
                for idx in new_indice:
                    if idx + self.text_offset < self.max_seq:
                        y[idx + self.text_offset] = 1

                line = enc_entry['input_ids'], enc_entry[
                    'input_mask'], enc_entry['segment_ids'], y
                if sum(y) > 0:
                    result.append(line)
        return result

    def translate(self, text, indice):
        sw_tokens = self.encoder.decode_list(self.encoder.encode(text))
        parse_tokens = text.split()
        return nli.translate_index(parse_tokens, sw_tokens, indice)

    def encode(self, text_a, text_b):
        tokens_a = self.encoder.encode(text_a)
        tokens_b = self.encoder.encode(text_b)

        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.max_seq - 3)

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append(CLS_ID)
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append(SEP_ID)
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append(SEP_ID)
            segment_ids.append(1)

        input_ids = tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_seq
        assert len(input_mask) == self.max_seq
        assert len(segment_ids) == self.max_seq

        return {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids
        }

    def held_out(self, data):
        heldout_size = int(len(data) * 0.1)
        dev_indice = set(random.sample(range(0, len(data)), heldout_size))

        train_data = []
        dev_data = []
        for idx, entry in enumerate(data):
            if idx not in dev_indice:
                train_data.append(entry)
            else:
                dev_data.append(entry)

        return train_data, dev_data
Example #14
class DataLoader:
    def __init__(self,
                 max_sequence,
                 vocab_filename,
                 voca_size,
                 using_alt_tokenizer=False):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        self.dev_file = os.path.join(corpus_dir, "dev.txt")
        self.test_file = os.path.join(corpus_dir, "test.txt")
        self.max_seq = max_sequence
        self.voca_size = voca_size
        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        if not using_alt_tokenizer:
            self.encoder = SubwordTextEncoder(voca_path)
            self.sep_char = "_"
            self.lower_case = False
        else:
            self.lower_case = True
            self.sep_char = "#"
            self.encoder = FullTokenizerWarpper(voca_path)

    def get_train_data(self):
        use_pickle = True
        if use_pickle:
            data = load_from_pickle("ubuntu_train")
        else:
            data = list(self.generate_train_data())
            save_to_pickle(data, "ubuntu_train")
        return data

    def to_triple(self, entry):
        return entry["input_ids"], entry["input_mask"], entry["segment_ids"]

    def generate_train_data(self, interval=None):
        train_label = self.read_train_label()
        text = self.text_reader()

        def get_comb_text(q_id):
            title, body = text[q_id]
            return title + " " + body

        if interval is not None:
            st, ed = interval
            train_label = train_label[st:ed]

        print("train data ", len(train_label))
        timer = TimeEstimator(len(train_label))
        for label_entry in train_label:
            q_id, pos_list, rand_list = label_entry
            q_str = get_comb_text(q_id)
            for pos_id in pos_list:
                pos_text = get_comb_text(pos_id)
                pos_entry = self.encode(q_str, pos_text)
                for neg_id in rand_list:
                    neg_text = get_comb_text(neg_id)
                    neg_entry = self.encode(q_str, neg_text)

                    yield self.to_triple(pos_entry)
                    yield self.to_triple(neg_entry)
            timer.tick()

    def get_dev_data(self):
        if self.dev_data is None:
            self.dev_data = list(self.eval_generator(self.dev_file))
        return self.dev_data

    def flatten_payload(self, data):
        result = []
        golds = []
        for query_set in data:
            payload, label = query_set

            golds.append(label)
            for p in payload:
                a, b, c = self.to_triple(p)
                result.append((a, b, c))
        return result, golds

    def get_test_data(self):
        if self.test_data is None:
            self.test_data = list(self.eval_generator(self.test_file))
        return self.test_data

    def eval_generator(self, file_path):
        text = self.text_reader()

        def get_comb_text(q_id):
            title, body = text[q_id]
            return title + " " + body

        def parse_list(list_str):
            return list([int(inst) for inst in list_str.split()])

        def get_text(list_str):
            return list([get_comb_text(id) for id in parse_list(list_str)])

        for idx, line in enumerate(tf.gfile.Open(file_path, "rb")):
            line = line.strip().decode("utf-8")
            split_line = line.split("\t")
            q_id, pos_list, neg_list, bm25_scores = split_line

            q_text = get_comb_text(int(q_id))
            pos_text_list = get_text(pos_list)
            neg_text_list = get_text(neg_list)
            label = [1] * len(pos_text_list) + [0] * len(neg_text_list)
            payload = list([
                self.encode(q_text, t) for t in pos_text_list + neg_text_list
            ])

            yield payload, label

    def convert_index_out(self, raw_sentence, subtoken_ids, target_idx):
        if self.lower_case:
            raw_sentence = raw_sentence.lower()
        #print("-------")
        #print("raw_sentence", raw_sentence)
        #print("subtoken_ids", subtoken_ids)
        #print("target_idx", target_idx)
        tokens = raw_sentence.split()
        subword_tokens = self.encoder.decode_list(subtoken_ids)
        #print("subword_tokens", subword_tokens)
        #print("target subword", subword_tokens[target_idx])
        if subword_tokens[target_idx].replace("_", "").replace(" ", "") == "":
            target_idx = target_idx - 1
            #print("Replace target_idx to previous", subword_tokens[target_idx])
        prev_text = "".join(subword_tokens[:target_idx])
        text_idx = 0

        #print("prev text", prev_text)
        # Now find the token in raw_sentence that begins right after the characters covered by prev_text.

        def update_text_idx(target_char, text_idx):
            while prev_text[text_idx] in [self.sep_char, " "]:
                text_idx += 1
            if target_char == prev_text[text_idx]:
                text_idx += 1
            return text_idx

        try:
            for t_idx, token in enumerate(tokens):
                for c in token:
                    # Here, previous char should equal prev_text[text_idx]
                    text_idx = update_text_idx(c, text_idx)
                    # Here, c should equal prev_text[text_idx-1]
                    assert c == prev_text[text_idx - 1]

        except IndexError:
            #print("target_token", tokens[t_idx])
            #print("t_idx", t_idx)
            return t_idx
        raise Exception

    def convert_indice_in(self, tokens, input_x, indice, seg_idx):
        sub_tokens = self.split_p_h(input_x[0], input_x)
        subword_tokens = self.encoder.decode_list(sub_tokens[seg_idx])
        start_idx = [1, 1 + len(sub_tokens[0]) + 1][seg_idx]
        in_segment_indice = translate_index(tokens, subword_tokens, indice)
        return list([start_idx + idx for idx in in_segment_indice])

    def class_labels(self):
        return ["similar", "not-similar"]

    def text_reader(self):
        filename = "text_tokenized.txt"
        file_path = os.path.join(corpus_dir, filename)
        data = dict()
        for idx, line in enumerate(tf.gfile.Open(file_path, "rb")):
            line = line.strip().decode("utf-8")
            split_line = line.split("\t")
            if len(split_line) == 3:
                id, title, body = split_line
            else:
                id, title = split_line
                body = ""

            data[int(id)] = (title, body)
        return data

    def read_train_label(self):
        filename = "train_random.txt"
        file_path = os.path.join(corpus_dir, filename)

        def parse_list(list_str):
            return list([int(inst) for inst in list_str.split()])

        data = []
        for idx, line in enumerate(tf.gfile.Open(file_path, "rb")):
            line = line.strip().decode("utf-8")
            split_line = line.split("\t")
            q_id, pos_list, rand_list = split_line

            entry = int(q_id), parse_list(pos_list), parse_list(rand_list)
            data.append(entry)
        return data

    # Split np_arr (an array of attribution scores) into premise and hypothesis segments.
    @staticmethod
    def split_p_h(np_arr, input_x):
        input_ids, _, seg_idx = input_x
        return DataLoader.split_p_h_with_input_ids(np_arr, input_ids)

    @staticmethod
    def split_p_h_with_input_ids(np_arr, input_ids):

        for i in range(len(input_ids)):
            if input_ids[i] == SEP_ID:
                idx_sep1 = i
                break

        p = np_arr[1:idx_sep1]
        for i in range(idx_sep1 + 1, len(input_ids)):
            if input_ids[i] == SEP_ID:
                idx_sep2 = i
        h = np_arr[idx_sep1 + 1:idx_sep2]
        return p, h

    def encode(self, text_a, text_b):
        tokens_a = self.encoder.encode(text_a)
        tokens_b = self.encoder.encode(text_b)

        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, self.max_seq - 3)

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        segment_ids = []
        tokens.append(CLS_ID)
        segment_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            segment_ids.append(0)
        tokens.append(SEP_ID)
        segment_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                segment_ids.append(1)
            tokens.append(SEP_ID)
            segment_ids.append(1)

        input_ids = tokens

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        while len(input_ids) < self.max_seq:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)

        assert len(input_ids) == self.max_seq
        assert len(input_mask) == self.max_seq
        assert len(segment_ids) == self.max_seq

        return {
            "input_ids": input_ids,
            "input_mask": input_mask,
            "segment_ids": segment_ids
        }
Example #15
class DataLoader:
    def __init__(self, max_sequence, vocab_filename, voca_size):
        self.train_data = None
        self.dev_data = None
        self.test_data = None

        voca_path = os.path.join(data_path, vocab_filename)
        assert os.path.exists(voca_path)
        print(voca_path)

        self.mscore = read_mscore_valid()
        self.mscore_dict = dict(self.mscore)
        self.train_topics, self.dev_topics = self.held_out(left(self.mscore))

        self.lower_case = True
        self.sep_char = "#"
        self.encoder = FullTokenizerWarpper(voca_path)
        self.voca_size = voca_size
        self.dev_explain = None
        self.encoder_unit = EncoderUnit(max_sequence, voca_path)
        self.client = TextReaderClient()

        class UniformSampler:
            def __init__(self, topics):
                self.sample_space = topics

            def sample(self):
                return random.sample(self.sample_space, 2)


        class BiasSampler:
            def __init__(self, topics, score_dict):
                self.sample_space = []
                self.sample_group = dict()

                def score2key(score):
                    return int(math.log(score+1, 1.1))

                for topic in topics:
                    key = score2key(score_dict[topic])
                    if key not in self.sample_group:
                        self.sample_group[key] = []
                    self.sample_group[key].append(topic)

                self.sample_space = list(self.sample_group.keys())


            # Sample one topic from each of two distinct score groups
            def sample(self):
                def pick1(l):
                    return l[random.randrange(len(l))]

                g1, g2 = random.sample(self.sample_space, 2)
                t1 = pick1(self.sample_group[g1])
                t2 = pick1(self.sample_group[g2])
                return t1, t2

        self.train_sampler = BiasSampler(self.train_topics, self.mscore_dict)
        self.dev_sampler = BiasSampler(self.dev_topics, self.mscore_dict)


    def get_train_data(self, size):
        return self.generate_data(self.train_sampler.sample, size)

    def get_dev_data(self, size):
        return self.generate_data(self.dev_sampler.sample, size)

    def generate_data(self, sample_fn, size):
        pair_n = int(size / 2)
        assert pair_n * 2 == size
        topic_pairs = self.sample_pairs(sample_fn, pair_n)
        result = []
        for topic_pair in topic_pairs:
            t1, t2 = topic_pair
            inst = (self.retrieve(t1),  self.retrieve(t2))
            result += list(self.encode_pair(inst))

        return result

    def retrieve(self, topic):
        r = self.client.retrieve(topic)
        if not r:
            print(topic)
        return r

    def encode_pair(self, sent_pair):
        for sent in sent_pair:
            entry = self.encode(sent)
            yield entry["input_ids"], entry["input_mask"], entry["segment_ids"]

    def encode(self, text):
        tokens_a = self.encoder.encode(text)
        return self.encoder_unit.encode_inner(tokens_a, [])

    def sample_pairs(self, sample_fn, n_pairs):
        result = []
        for i in range(n_pairs):
            selected = sample_fn()
            t1 = selected[0]
            t2 = selected[1]
            score1 = self.mscore_dict[t1]
            score2 = self.mscore_dict[t2]
            if score1 < score2:
                result.append((t1, t2))
            else:
                result.append((t2, t1))
        return result

    def held_out(self,topics):
        heldout_size = int(len(topics) * 0.1)
        dev_topics = set(random.sample(topics, heldout_size))
        train_topics = set(topics) - dev_topics
        return train_topics, dev_topics
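
A hypothetical call pattern for this DataLoader, assuming the BERT vocab file and the annotation helpers (read_mscore_valid, TextReaderClient) are available; the constants mirror values used elsewhere in these examples:

loader = DataLoader(max_sequence=200, vocab_filename="bert_voca.txt", voca_size=30522)
train_batch = loader.get_train_data(16)  # 8 ordered topic pairs -> 16 encoded instances
dev_batch = loader.get_dev_data(8)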