Example #1
    def __init__(self):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.stemmer = CacheStemmer()
        self.stopword = load_stopwords()
        self.df = self.load_galgo_df_stat()
Example #2
    def __init__(self, window_size):
        self.stemmer = CacheStemmer()
        self.window_size = window_size
        self.doc_posting = None
        self.stopword = load_stopwords()

        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)


        def load_pickle(name):
            p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
            return pickle.load(open(p, "rb"))

        self.doc_len_dict = load_pickle("doc_len")
        self.qdf = load_pickle("robust_qdf_ex")
        self.meta = load_pickle("robust_meta")
        self.head_tokens = load_pickle("robust_title_tokens")
        self.seg_info = load_pickle("robust_seg_info")
        self.not_found = set()

        self.total_doc_n = len(self.doc_len_dict)
        self.avdl = sum(self.doc_len_dict.values()) / len(self.doc_len_dict)
        tprint("Init PassageRanker")
Example #3
def dev():
    train_data_feeder = load_cache("train_data_feeder")
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    html_writer = HtmlVisualizer("nli_w_dict.html", dark_mode=False)

    for _ in range(100):
        batch = train_data_feeder.get_random_batch(1)

        input_ids, input_mask, segment_ids, d_input_ids, d_input_mask, d_location_ids, y = batch

        tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

        for i in range(len(tokens)):
            if i != 0 and i in d_location_ids:
                tokens[i] = "<b>{}</b>".format(tokens[i])
            if tokens[i] == "[unused3]":
                tokens[i] = "[SEP]\n"

        s = tokenizer_wo_tf.pretty_tokens(tokens)
        html_writer.write_headline("Input")
        html_writer.write_paragraph(s)

        d_tokens = tokenizer.convert_ids_to_tokens(d_input_ids[0])
        # Replace [unused5] markers in the dictionary-definition tokens with line breaks.
        for i in range(len(d_tokens)):
            if d_tokens[i] == "[unused5]":
                d_tokens[i] = "<br>\n"

        s = tokenizer_wo_tf.pretty_tokens(d_tokens)
        html_writer.write_headline("Dict def")
        html_writer.write_paragraph(s)

    html_writer.close()
Example #4
def main():
    mark_path = os.path.join(working_path, "wiki_eval_token")
    mtm = MTM(100, mark_path)
    vocab_file = os.path.join(data_path, "bert_voca.txt")

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    docs_dict = {}
    job_id = mtm.pool_job()
    print("Job id : ", job_id)
    todo = "dev"
    if todo == "train":
        out_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_train_tokens.{}"
        in_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_train.txt.line.{}"
    elif todo == "dev":
        out_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_eval_tokens.{}"
        in_path_format = "/mnt/nfs/work3/youngwookim/data/enwiki4bert/enwiki_eval.txt.line.{}"
    else:
        assert False

    while job_id is not None:
        i = int(job_id / 100)
        if i not in docs_dict:
            file_path = in_path_format.format(i)
            docs_dict[i] = parse_wiki(file_path)
        work(job_id, docs_dict[i], tokenizer, out_path_format)
        job_id = mtm.pool_job()
        print("Job id : ", job_id)
Example #5
def worker_p(job_id):
    max_seq = 512
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    p = os.path.join(cpath.data_path, "tlm", "instances",
                     "inst_{}.pickle".format(job_id))
    if not os.path.exists(p):
        return
    output_path = os.path.join(cpath.data_path, "tlm", "tf_record_pred",
                               "tf_{}.pickle".format(job_id))
    #if os.path.exists(output_path):
    #    return
    inst_list, info_list = filter_instances(pickle.load(open(p, "rb")))

    uid_list = []
    info_d = {}
    for inst, info in zip(inst_list, info_list):
        a, b, c = info.split("_")
        unique_id = int(a) * 1000 * 1000 + int(b) * 10 + int(c)
        uid_list.append(unique_id)
        info_d[unique_id] = info

    max_pred = 20
    data = zip(inst_list, uid_list)

    p = os.path.join(cpath.data_path, "tlm", "pred",
                     "info_d_{}.pickle".format(job_id))
    pickle.dump(info_d, open(p, "wb"))
    write_predict_instance(data, tokenizer, max_seq, max_pred, [output_path])
Example #6
def load_and_analyze_gradient():
    p = os.path.join(output_path, "dict_grad1.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    analyze_gradient(data, tokenizer)
Example #7
    def __init__(self):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.masked_lm_prob = 0.15
        self.max_seq_length = 512
        self.dupe_factor = 1
        self.rng = random.Random(time.time())
Example #8
    def __init__(self, number_of_pairs=10000):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)

        self.number_of_pairs = number_of_pairs
        self.max_seq_length = 200
        self.rng = random.Random(time.time())
Example #9
    def __init__(self):
        self.token_reader = load_seg_token_readers()
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        vocab = list(self.tokenizer.vocab.keys())

        self.tf_inst_maker = TFInstanceMakerPair(vocab)
Example #10
    def __init__(self, dictionary_pickle, max_word_tokens, max_seq_length, out_dir):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=True)
        self.d = dictionary_pickle
        self.max_word_tokens = max_word_tokens
        self.max_seq_length = max_seq_length
        self.out_dir = out_dir
Example #11
def read(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    for feature in examples:
        print(inst2str(feature, tokenizer))
        print()
        print()
Example #12
    def __init__(self):
        super(LMTrainGen, self).__init__()
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)

        self.short_seq_prob = 0.1
        self.problem_per_job = 100 * 1000
        self.max_predictions_per_seq = int(self.max_seq_length *
                                           self.masked_lm_prob)
Example #13
def pritn_token_id():
    vocab_file = os.path.join(data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    tokens = ["[CLS]", "[SEP]", "[MASK]", "[PAD]"]

    ids = tokenizer.convert_tokens_to_ids(tokens)
    for token, token_id in zip(tokens, ids):
        print(token, token_id)
Example #14
def dump_robust_cap_tokens():
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    cap_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                               do_lower_case=False)

    c = trec.load_robust_ingham()
    d = {}
    for key in c:
        doc = c[key]
        d[key] = cap_tokenizer.basic_tokenizer.tokenize(doc)
    dump_dict(d, "robust_token_cap")
Example #15
    def __init__(self, window_size):
        super().__init__(window_size)

        self.date_dict = load_from_pickle("robust_date")
        #self.token_reader = get_token_reader()
        self.token_dump = DumpAccess("robust_token")
        self.text_dump = DumpAccess("robust")
        c_path = os.path.join(data_path, "stream_pickled", "CandiSet_{}_0")
        self.ll = LazyLoader(c_path)
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.cap_tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                        do_lower_case=False)
Example #16
def load_and_analyze_hv():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    p = os.path.join(output_path, "hv_tt.pickle")
    hv_tt = pickle.load(open(p, "rb"))

    p = os.path.join(output_path, "hv_lm.pickle")
    hv_lm = pickle.load(open(p, "rb"))

    p = os.path.join(output_path, "grad.pickle")
    tt_grad = pickle.load(open(p, "rb"))

    analyze_hv(hv_tt, hv_lm, tt_grad, tokenizer)
Example #17
    def __init__(self, max_seq):
        print("TFRecordMaker Init")
        self.max_seq = max_seq
        self.robust_tokens = load_robust_token()
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)
        self.vocab_words = list(self.tokenizer.vocab.keys())
        self.rng = random.Random(0)

        def load_pickle(name):
            p = os.path.join(cpath.data_path, "adhoc", name + ".pickle")
            return pickle.load(open(p, "rb"))

        self.seg_info = load_pickle("robust_seg_info")
        print("TFRecordMaker Init Done")
Example #18
    def __init__(self, data, data_info):
        super(DictAuxDataFeeder, self).__init__(data)
        self.stopword = load_stopwords()
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)

        self.dict = self.encode_dict_as_feature(self.raw_dictionary)

        # data is already truncated and padded
        self.data = data

        self.data_len = len(self.data)
        if data_info is not None:
            self.data_info = data_info
        else:
            self.data_info = self.nli_data_indexing(data)
Example #19
def print_as_html(fn):
    examples = load_record(fn)
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    html_output = HtmlVisualizer("out_name.html")

    for feature in examples:
        masked_inputs = feature["input_ids"].int64_list.value
        idx = 0
        step = 512
        while idx < len(masked_inputs):
            chunk = masked_inputs[idx:idx + step]
            tokens = tokenizer.convert_ids_to_tokens(chunk)
            idx += step
            cells = cells_from_tokens(tokens)
            html_output.multirow_print(cells)
        html_output.write_paragraph("----------")
Example #20
    def __init__(self, out_path):
        vocab_file = os.path.join(data_path, "bert_voca.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                                    do_lower_case=True)

        self.masked_lm_prob = 0.15
        self.short_seq_prob = 0.1
        self.problem_per_job = 100 * 1000
        self.max_seq_length = 512
        self.max_predictions_per_seq = 20
        self.dupe_factor = 1
        self.out_dir = out_path

        seed = time.time()
        self.rng = random.Random(seed)
        print("Loading documents")
        self.documents = self.load_documents_from_pickle()
        print("Loading documents Done : ", len(self.documents))
Example #21
def tokenize_stream(in_file, out_path):
    dp = DumpPickle(out_path)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    in_f = open(in_file, "r")

    def read_doc(f):
        line = f.readline()
        if not line:
            raise EndofDocument()

        assert "<DOC>" in line
        line = f.readline()
        assert "<DOCNO>" in line
        pre_n = len("<DOCNO>")
        ed_n = len("</DOCNO>") + 1
        title = line[pre_n:-ed_n].strip()

        line = f.readline()
        assert "<TEXT>" in line
        content = []
        line = f.readline()
        while line.strip() != "</TEXT>":
            content.append(line)
            line = f.readline()
        line = f.readline()
        assert "</DOC>" in line
        return title, content

    try:
        ticker = TimeEstimator(1285381, "reader", 100)
        while True:
            title, content = read_doc(in_f)
            tokens = flatten(lmap(tokenizer.tokenize, content))
            dp.dump(title, tokens)
            ticker.tick()
    except EndofDocument as e:
        pass
    dp.close()
Example #22
def worker(job_id):
    max_seq = 512
    print("TF_record_writer")
    rng = random.Random(0)
    vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)

    p = os.path.join(cpath.data_path, "tlm", "instances_local",
                     "inst_{}.pickle".format(job_id))
    if not os.path.exists(p):
        return
    output_path = os.path.join(cpath.data_path, "tlm", "tf_record_local",
                               "tf_rand_{}.pickle".format(job_id))
    if os.path.exists(output_path):
        return
    inst_list, info_list = filter_instances(pickle.load(open(p, "rb")))

    rng.shuffle(inst_list)
    max_pred = 20
    print(inst_list[0])
    write_instance_to_example_files(inst_list, tokenizer, max_seq, max_pred,
                                    [output_path])
Example #23
def load_and_visualize():
    tokenizer = tokenizer_wo_tf.FullTokenizer(
        os.path.join(data_path, "bert_voca.txt"))

    data_id = "1"

    n_list = open(os.path.join(output_path, "lookup_n", data_id),
                  "r").readlines()
    p = os.path.join(output_path, "example_loss.pickle")
    data = pickle.load(open(p, "rb"))
    data = data[0]["masked_lm_example_loss"]

    feature_itr = load_record_v1(
        os.path.join(output_path, "lookup_example", data_id))

    n = len(n_list)
    feature_idx = 0
    html_writer = HtmlVisualizer("lookup_loss2.html", dark_mode=False)

    for i in range(n):
        n_sample = int(n_list[i])
        rows = []
        assert n_sample > 0
        for j in range(n_sample):
            feature = next(feature_itr)

            input_ids = take(feature["input_ids"])
            masked_lm_ids = take(feature["masked_lm_ids"])
            masked_lm_positions = take(feature["masked_lm_positions"])
            input_mask = take(feature["input_mask"])
            selected_word = take(feature["selected_word"])
            d_input_ids = take(feature["d_input_ids"])
            d_location_ids = take(feature["d_location_ids"])

            word_tokens = tokenizer.convert_ids_to_tokens(selected_word)
            word = tokenizer_wo_tf.pretty_tokens(word_tokens)

            emph_word = "<b>" + word + "</b>"

            if j == 0:
                mask_ans = {}
                masked_terms = tokenizer.convert_ids_to_tokens(masked_lm_ids)
                for pos, token in zip(list(masked_lm_positions), masked_terms):
                    mask_ans[pos] = token

                tokens = tokenizer.convert_ids_to_tokens(input_ids)

            for i in range(len(tokens)):
                if tokens[i] == "[MASK]":
                    tokens[i] = "[MASK_{}: {}]".format(i, mask_ans[i])
                if i in d_location_ids and i != 0:
                    if tokens[i - 1] != emph_word:
                        tokens[i] = emph_word
                    else:
                        tokens[i] = "-"

            def_str = tokenizer_wo_tf.pretty_tokens(
                tokenizer.convert_ids_to_tokens(d_input_ids), True)
            row = list()
            row.append(Cell(word))
            row.append(Cell(data[feature_idx]))
            row.append(Cell(def_str))
            rows.append(row)

            feature_idx += 1

        s = tokenizer_wo_tf.pretty_tokens(tokens, True)
        html_writer.write_paragraph(s)

        html_writer.write_table(rows)

    html_writer.close()
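Every example above rebuilds the same FullTokenizer from bert_voca.txt. The sketch below factors that repetition into a small cached helper. It is not from the source project: the helper name get_tokenizer, the module-level cache, and the reliance on the same os, cpath, and tokenization imports used in the examples are all assumptions.

_TOKENIZER_CACHE = {}


def get_tokenizer(do_lower_case=True):
    # Hypothetical helper: build the FullTokenizer once per casing mode
    # and reuse it, instead of re-reading bert_voca.txt in every caller.
    if do_lower_case not in _TOKENIZER_CACHE:
        vocab_file = os.path.join(cpath.data_path, "bert_voca.txt")
        _TOKENIZER_CACHE[do_lower_case] = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=do_lower_case)
    return _TOKENIZER_CACHE[do_lower_case]

A caller such as worker_p in Example #5 could then obtain its tokenizer with tokenizer = get_tokenizer(), and the cased tokenizer in Example #14 with get_tokenizer(do_lower_case=False).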