Example #1
 def __init__(self, output_folder, threads, language, split_size, data_dbs,
              info_dbs, custom_unfilled, custom_filled, min_count):
     self.output_folder = output_folder
     self.threads = threads
     self.split_size = split_size
     self.encoder = TextEncoder(language)
     self.min_count = min_count
     self.custom_unfilled = custom_unfilled
     if data_dbs == None:
         self.custom_data_DBs = None
     else:
         self.custom_data_DBs = data_dbs.split(";")
     if info_dbs == None:
         self.custom_info_DBs = None
     else:
         self.custom_info_DBs = info_dbs.split(";")
     if custom_unfilled == None:
         self.cluster_folder = "{}/clusters/unfilled".format(
             self.output_folder)
     else:
         self.cluster_folder = custom_unfilled
         if not os.path.exists(self.cluster_folder):
             os.makedirs(self.cluster_folder)
     if custom_filled == None:
         self.save_folder = "{}/clusters/filled".format(self.output_folder)
     else:
         self.save_folder = custom_filled
         if not os.path.exists(self.save_folder):
             os.makedirs(self.save_folder)
Example #2
 def encode_text(self, keys, file_index, name=None):
     if name:
         db = self.open_database("original_data_{}_DB".format(name))
     else:
         db = self.open_database("original_data_DB")
     encoder = TextEncoder(self.language)
     with db.begin() as txn, gzip.open(
             self.output_folder + "/encoded/f_{}.gz".format(file_index),
             "wt") as gzip_file:
         for key in keys:
             d = {}
             text = txn.get(key).decode("unicode-escape")
             d["id"] = key.decode("utf-8")
             d["text"] = encoder.encode_text(text)
             if len(d["text"]) == 0: continue
             gzip_file.write(json.dumps(d) + "\n")
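A note on the format produced above: encode_text writes one JSON object per line into a gzip-compressed file. A minimal sketch of reading such a file back, assuming a hypothetical path that follows the same f_{index}.gz naming scheme:

import gzip
import json

# Hypothetical path matching the "/encoded/f_{index}.gz" naming used above.
with gzip.open("output/encoded/f_0.gz", "rt") as gzip_file:
    for line in gzip_file:
        record = json.loads(line)
        # Each record carries the LMDB key ("id") and the encoded text ("text").
        print(record["id"], len(record["text"]))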
Example #3
 def get_blastpair_data(self, item):
     orig_db = lmdb.open(self.db_loc, readonly=True, lock=False)
     o_db = orig_db.begin()
     textenc = TextEncoder("eng")
     blastpair = BlastPairDB(item, textenc)
     blastpair.set_correct_indices_and_texts(o_db)
     return blastpair.get_outdict()
Example #4
def test_text_dis_end_to_end():
    d_batch = 2
    d_max_seq_len = 26
    d_vocab = 27699
    d_dis_hidden = 256
    d_text_feature = 512
    text_enc_dropout = 0.5
    d_text_enc_cnn = 512

    text_enc = TextEncoder(d_vocab=d_vocab, d_text_feature=d_text_feature, text_enc_dropout=text_enc_dropout, d_text_enc_cnn=d_text_enc_cnn)
    lat_dis = LatentDiscriminator(d_text_feature=d_text_feature, d_dis_hidden=d_dis_hidden)
    text_enc.load_state_dict(torch.load('new_text_enc.pth'))

    texts = torch.randint(low=0, high=d_vocab, size=(d_batch, d_max_seq_len))
    text_features = text_enc(to_one_hot(texts, d_vocab))
    assert text_features.size() == (d_batch, d_text_feature) and text_features.dtype == torch.float

    valids = lat_dis(text_features)
    assert valids.size() == (d_batch,) and valids.dtype == torch.float
Example #5
 def get_blastpair_group_data(self, blastpair_group):
     orig_db = lmdb.open(self.db_loc, readonly=True, lock=False)
     o_db = orig_db.begin()
     textenc = TextEncoder("eng")
     group_outdata = []
     for item in blastpair_group:
         blastpair = BlastPairDB(item, textenc)
         blastpair.set_correct_indices_and_texts(o_db)
         group_outdata.append(blastpair.get_outdict())
     return group_outdata
Example #6
def fill_line(from_id, line, output_location, lang):
    splits = line.split()
    curr_id = splits[0].replace("\\", "")
    print(curr_id)
    orig_text_from = lmdb.open(
        output_location + "db/original_data_DB", readonly=True).begin().get(
            from_id.encode("ascii")).decode("unicode-escape")
    orig_text_curr = lmdb.open(
        output_location + "db/original_data_DB", readonly=True).begin().get(
            curr_id.encode("ascii")).decode("unicode-escape")

    #print(orig_text)
    indexes = splits[3:5]
    print(indexes)
    encoder = TextEncoder(lang)
    from_enc = encoder.decode_text(orig_text_from, splits[1], splits[2])
    curr_enc = encoder.decode_text(orig_text_curr, splits[3], splits[4])

    return from_enc, curr_enc
Example #7
def get_batch_jsondata_inmem(input_data, inmemtxtdata):
    textenc = TextEncoder("eng")
    batchdata = blastdr.read_blast_cluster_csv_inmem(input_data)
    i = 0
    max_i = len(batchdata)
    outdata = []
    for item in batchdata:
        i += 1
        if i % 100 == 0:
            print("   --- " + str(i) + "/" + str(max_i))
        blastpair = BlastPairMulti(item, textenc)
        blastpair.set_correct_indices_and_texts(inmemtxtdata)
        outdata.append(blastpair.get_outdict())
    return outdata
Example #8
 def __init__(self, blastdata, textenc=TextEncoder("eng")):
     self.source_id = blastdata['source_id']
     self.source_text_start = None
     self.source_text_end = None
     self.source_start_blast = blastdata['source_start_blast']
     self.source_end_blast = blastdata['source_end_blast']
     self.source_text = None
     self.target_id = blastdata['target_id']
     self.target_text_start = None
     self.target_text_end = None
     self.target_start_blast = blastdata['target_start_blast']
     self.target_end_blast = blastdata['target_end_blast']
     self.target_text = None
     self.align_length = blastdata['align_length']
     self.positives_percent = blastdata['positives_percent']
     self.textenc = textenc
Example #9
def write_batch_json(input_fname,
                     fname_prefix,
                     ecco_id_dict,
                     eebo_id_dict,
                     outputdir="../output/blast_batches/"):
    textenc = TextEncoder("eng")
    outjson = get_out_json_fname(input_fname, fname_prefix)
    batchdata = blastdr.read_blast_cluster_csv(input_fname)
    i = 0
    max_i = len(batchdata)
    outdata = []
    for item in batchdata:
        i += 1
        if i % 100 == 0:
            print(outjson + " " + str(i) + "/" + str(max_i))
        blastpair = BlastPair(item, ecco_id_dict, eebo_id_dict, textenc)
        blastpair.set_correct_indices_and_texts()
        outdata.append(blastpair.get_outdict())
    outfile = outputdir + outjson
    with open(outfile, 'w', encoding='utf-8') as jsonout:
        json.dump(outdata, jsonout, indent=2, ensure_ascii=False)
    return outfile
Example #10
    def add_logit_op(self):
        """Adds the unrolled RNN:
            h_0 = 0
            for t in 1 to T:
                o_t, h_t = cell(x_t, h_{t-1})
                o_drop_t = Dropout(o_t, dropout_rate)
                y_t = o_drop_t U + b_2

        TODO: There are quite a few things you'll need to do in this function:
            - Define the variables U, b_2.
            - Define the vector h as a constant and initialize it with
              zeros. See tf.zeros and tf.shape for information on how
              to initialize this variable to be of the right shape.
              https://www.tensorflow.org/api_docs/python/constant_op/constant_value_tensors#zeros
              https://www.tensorflow.org/api_docs/python/array_ops/shapes_and_shaping#shape
            - In a for loop, begin to unroll the RNN sequence. Collect
              the predictions in a list.
            - When unrolling the loop, from the second iteration
              onwards, you will HAVE to call
              tf.get_variable_scope().reuse_variables() so that you do
              not create new variables in the RNN cell.
              See https://www.tensorflow.org/versions/master/how_tos/variable_scope/
            - Concatenate and reshape the predictions into a predictions
              tensor.
        Hint: You will find the function tf.pack (similar to np.asarray)
              useful to assemble a list of tensors into a larger tensor.
              https://www.tensorflow.org/api_docs/python/array_ops/slicing_and_joining#pack
        Hint: You will find the function tf.transpose and the perm
              argument useful to shuffle the indices of the tensor.
              https://www.tensorflow.org/api_docs/python/array_ops/slicing_and_joining#transpose

        Remember:
            * Use the Xavier initialization for matrices.
            * Note that tf.nn.dropout takes the keep probability (1 - p_drop) as an argument.
            The keep probability should be set to the value of self.dropout_placeholder.

        Returns:
            pred: tf.Tensor of shape (batch_size, max_length, n_classes)
        """

        with tf.variable_scope("QuoraModel"):

            text1 = self.add_embedding(self.input_placeholder1)
            text2 = self.add_embedding(self.input_placeholder2)
            dropout_rate = self.dropout_placeholder

            # step 1: Text encoder
            textEncoder = TextEncoder(self.config)
            with tf.variable_scope("TextEncoder"):
                text1_preds = textEncoder.add_prediction_op(text1)
                tf.get_variable_scope().reuse_variables()
                text2_preds = textEncoder.add_prediction_op(text2)

            # apply mask to each encoding output
            #text1_encoding = tf.boolean_mask(text1_preds, self.mask_placeholder1)
            #text2_encoding = tf.boolean_mask(text2_preds, self.mask_placeholder2)
            text1_encoding = text1_preds * tf.expand_dims(
                tf.to_float(self.mask_placeholder1), -1)
            text2_encoding = text2_preds * tf.expand_dims(
                tf.to_float(self.mask_placeholder2), -1)

            # take last timestep as encoding
            #text1_encoding = text1_preds[:, -1]
            #text2_encoding = text2_preds[:, -1]

            # # # step 2: concatenate two encodings
            # text1_encoding = tf.reduce_sum(text1_encoding, 1)
            # text2_encoding = tf.reduce_sum(text2_encoding, 1)
            # encodings = tf.concat(1, [text1_encoding, text2_encoding])

            # more complex alternative
            # step 2: Co-attention layer
            # non-linearity on one question encoding
            question_encoder = QuestionEncoder(self.config)
            text1_encoding = question_encoder.add_prediction_op(text1_encoding)
            encodings = coattention_layer.encode(text1_encoding,
                                                 text2_encoding)

            # step 3: Bi-LSTM
            biLSTMEncoder = BidirectionalLSTMEncoder(self.config)
            encodings = biLSTMEncoder.add_prediction_op(encodings)
            _shape = encodings.get_shape()[1] * encodings.get_shape()[2].value
            encodings = tf.reshape(encodings, [-1, _shape.value])  # flatten

            # step 4: Decoder/Quora classifier
            classifier = QuoraClassifier(self.config)
            logits = classifier.add_prediction_op(encodings)

        assert logits.get_shape().as_list() == [None, self.config.n_classes_quora], \
            "predictions are not of the right shape. Expected {}, got {}"\
                .format([None, self.config.n_classes_quora],
                        logits.get_shape().as_list())
        return logits
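The docstring above spells out a manual RNN unrolling recipe (from the original assignment) that the body of add_logit_op does not itself show. Below is a minimal, self-contained sketch of that recipe in TensorFlow 1.x style; the function name, shapes, and the generic RNN cell are assumptions for illustration, not part of the QuoraModel code:

import tensorflow as tf  # TensorFlow 1.x API

def unroll_rnn(x, cell, n_classes, dropout_keep_prob):
    # x: (batch_size, max_length, n_features); returns (batch_size, max_length, n_classes).
    max_length = x.get_shape()[1].value
    xavier = tf.contrib.layers.xavier_initializer()
    U = tf.get_variable("U", shape=(cell.output_size, n_classes), initializer=xavier)
    b_2 = tf.get_variable("b_2", shape=(n_classes,), initializer=tf.zeros_initializer())

    # h_0 = 0, shaped from the dynamic batch dimension.
    h = tf.zeros([tf.shape(x)[0], cell.state_size])

    preds = []
    with tf.variable_scope("RNN"):
        for t in range(max_length):
            if t > 0:
                # Reuse the cell's variables from the second timestep onwards.
                tf.get_variable_scope().reuse_variables()
            o_t, h = cell(x[:, t, :], h)
            o_drop_t = tf.nn.dropout(o_t, keep_prob=dropout_keep_prob)
            preds.append(tf.matmul(o_drop_t, U) + b_2)

    # Stack per-step predictions (tf.pack in very old TF) and move time behind batch:
    # (max_length, batch, n_classes) -> (batch, max_length, n_classes).
    return tf.transpose(tf.stack(preds), perm=[1, 0, 2])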
Example #11
class ClusterFiller:
    def __init__(self, output_folder, threads, language, split_size, data_dbs,
                 info_dbs, custom_unfilled, custom_filled, min_count):
        self.output_folder = output_folder
        self.threads = threads
        self.split_size = split_size
        self.encoder = TextEncoder(language)
        self.min_count = min_count
        self.custom_unfilled = custom_unfilled
        if data_dbs == None:
            self.custom_data_DBs = None
        else:
            self.custom_data_DBs = data_dbs.split(";")
        if info_dbs == None:
            self.custom_info_DBs = None
        else:
            self.custom_info_DBs = info_dbs.split(";")
        if custom_unfilled == None:
            self.cluster_folder = "{}/clusters/unfilled".format(
                self.output_folder)
        else:
            self.cluster_folder = custom_unfilled
            if not os.path.exists(self.cluster_folder):
                os.makedirs(self.cluster_folder)
        if custom_filled == None:
            self.save_folder = "{}/clusters/filled".format(self.output_folder)
        else:
            self.save_folder = custom_filled
            if not os.path.exists(self.save_folder):
                os.makedirs(self.save_folder)

    def fill_clusters(self):
        if self.custom_unfilled == None:
            folders = natsorted(os.listdir(self.cluster_folder))
            if len(folders) == 1:
                right_folders = folders
            else:
                highest_round = folders[-1].split("_")[1]
                right_folders = [
                    f for f in folders if "round_{}".format(highest_round) in f
                ]
                assert len(right_folders) == 1
            files = []
            for right_folder in right_folders:
                right_files = os.listdir("{}/{}".format(
                    self.cluster_folder, right_folder))
                for right_file in right_files:
                    files.append((right_folder, right_file))
            files = natsorted(files)
        else:
            files = []
            cfiles = natsorted(os.listdir(self.cluster_folder))
            for cfile in cfiles:
                files.append(("", cfile))
        Parallel(n_jobs=self.threads)(
            delayed(self.fill_cluster)(folder, filename, index)
            for index, (folder, filename) in enumerate(files))

    def fill_cluster(self, folder, filename, file_index):
        if self.custom_data_DBs == None:
            use_custom_dbs = False
            orig_db = lmdb.open(self.output_folder + "/db/original_data_DB",
                                readonly=True)
            info_db = lmdb.open(self.output_folder + "/db/info_DB",
                                readonly=True)
            o_db = orig_db.begin()
            i_db = [info_db.begin()]
        else:
            o_db = []
            i_db = []
            use_custom_dbs = True
            for db_path in self.custom_data_DBs:
                db = lmdb.open(db_path, readonly=True)
                o_db.append(db.begin())
            for db_path in self.custom_info_DBs:
                db = lmdb.open(db_path, readonly=True)
                i_db.append(db.begin())

        with gzip.open(self.cluster_folder + "/" + folder + "/" + filename,
                       "rt") as gzip_file:
            data = json.loads(gzip_file.read())
        filled_clusters = {}
        for cluster_key, cluster_data in data.items():
            if len(cluster_data[0]) > self.min_count:
                filled_clusters[cluster_key] = self.fill(
                    cluster_data, o_db, i_db, use_custom_dbs)
            else:
                print()
                print()
                print("FOUND EMPTY CLUSTER!")
        self.save_clusters(filled_clusters, file_index)

    def generate_split_indexes(self, indexes):
        self.split_size = int(self.split_size)
        end = (int(int(indexes[0]) / self.split_size) + 1) * self.split_size
        start = int(int(indexes[1]) / self.split_size) * self.split_size
        return start, end

    def fill(self, data, o_db, i_db, use_custom_dbs):
        cluster = {}
        length = 0
        hits = []
        skips = []
        for node in data[0]:
            text_id = node.split("___")[0]
            indexes = node.split("___")[1].split("_")
            if self.split_size != None and self.split_size > 0:
                doc_start, doc_end = self.generate_split_indexes(indexes)
                text_id = "{}__{}_{}".format(text_id, doc_start, doc_end)

            orig_text = self.get_original_text(text_id, o_db, use_custom_dbs)
            try:
                text, indices = self.encoder.decode_text(
                    orig_text, indexes[0], indexes[1])
            except IndexError:
                print()
                print("\n\n", "Index Error", node, indexes)
                skips.append(node)
                continue
            hit_data = {}
            length += len(text)
            doc_key = node.split("___")[0]
            for info_db in i_db:
                info = info_db.get(doc_key.encode("ascii"))
                if info == None:  ##go through all DBs, skip the wrong one
                    continue
                info = json.loads(info.decode("unicode-escape"))

                for key, value in info.items():
                    hit_data[key] = value

            hit_data["text"] = text
            hit_data["node"] = node
            hit_data["doc_id"] = doc_key
            hit_data["original_indices"] = indices
            hit_data["encoded_indices"] = indexes
            hits.append(hit_data)

        cluster["length"] = int(length / len(hits))
        cluster["hits"] = hits
        cluster["skips"] = skips
        return cluster

    def get_original_text(self, text_id, db, use_custom_dbs):
        if use_custom_dbs:
            text = None
            for custom_db in db:
                text = custom_db.get(text_id.encode("ascii"))
                if text != None: break
                text = custom_db.get(
                    text_id.replace("__", "_").encode("ascii"))
                if text != None: break
        else:
            text = db.get(text_id.encode("ascii"))
        if text == None:
            return None
        else:
            return text.decode("unicode-escape")

    def save_clusters(self, clusters, file_index):
        if len(clusters) != 0:
            with gzip.open(
                    "{}/clusters_{}.gz".format(self.save_folder, file_index),
                    "wt") as gzf:
                gzf.write(json.dumps(clusters))
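The fan-out in fill_clusters above uses joblib's Parallel/delayed idiom: each delayed(...) call captures a function and its arguments as a task, and Parallel executes the tasks across workers. A minimal standalone sketch of that pattern, independent of the cluster code:

from joblib import Parallel, delayed

def square(n):
    return n * n

# Run eight small tasks on two workers; results come back in task order.
results = Parallel(n_jobs=2)(delayed(square)(i) for i in range(8))
print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]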
Example #12
    def seperate_levenshtein(self, key, value):
        encoder = TextEncoder(self.language)
        new_clusters = []

        ## Get texts, sort by text length (longest first), encode
        texts = sorted(value["hits"], key=lambda hit: len(hit["text"]), reverse=True)
        encoded = [encoder.encode_text(t["text"]) for t in texts]
        dones = set()

        ## Go through texts from longest to shortest. Add indexes to their own clusters if the Levenshtein distance is low enough
        for start_text_index in range(len(texts)):
            if start_text_index in dones:
                continue
            new_cluster = [start_text_index]
            dones.add(start_text_index)
            for comp_text_index in range(start_text_index + 1, len(texts)):
                if comp_text_index in dones or len(
                        texts[start_text_index]["text"]
                ) * self.max_distance > len(texts[comp_text_index]["text"]):
                    continue

                distance = Levenshtein.distance(encoded[start_text_index],
                                                encoded[comp_text_index])
                if (len(encoded[start_text_index]) - distance) / len(
                        encoded[start_text_index]) >= self.max_distance:
                    new_cluster.append(comp_text_index)
                    dones.add(comp_text_index)
            new_clusters.append(new_cluster)

        ## Find single clusters
        single_clusters = []
        single_clusters = [
            index for index, hits in enumerate(new_clusters) if len(hits) == 1
        ]
        single_clusters = [
            new_clusters.pop(index) for index in reversed(single_clusters)
        ]

        ## Combine single clusters into some other cluster
        for single_cluster in single_clusters:
            ## Only comparing against one node in each cluster
            current_to_add = 0
            current_distance = 0
            for cluster_i, cluster in enumerate(new_clusters):
                hit_to_compare = cluster[0]
                distance = Levenshtein.distance(encoded[single_cluster[0]],
                                                encoded[hit_to_compare])
                if distance > current_distance:
                    current_to_add = cluster_i
                    current_distance = distance
            new_clusters[current_to_add].append(single_cluster[0])

        clusters = {}
        for cluster_i, cluster in enumerate(new_clusters):
            new_key = "{}_{}".format(key, cluster_i)
            clusters[new_key] = {}
            clusters[new_key]["length"] = 0
            clusters[new_key]["hits"] = []
            for text_index in cluster:
                clusters[new_key]["hits"].append(texts[text_index])

        return clusters
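The grouping test in seperate_levenshtein keeps two hits together when the normalized similarity (len(a) - distance) / len(a) reaches the threshold stored in self.max_distance. A small standalone illustration of that criterion with toy strings and a hypothetical threshold:

import Levenshtein  # python-Levenshtein, the same distance function used above

def similar_enough(a, b, threshold=0.8):
    # Same criterion as above: the fraction of `a` left intact after the edits needed to reach `b`.
    distance = Levenshtein.distance(a, b)
    return (len(a) - distance) / len(a) >= threshold

print(similar_enough("the quick brown fox", "the quick brown foxes"))  # True
print(similar_enough("the quick brown fox", "lorem ipsum dolor sit"))  # False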
Example #13
    def seperate_blast(self, key, value, filename):
        print()
        print("Filename: {}\tCluster size: {}".format(filename,
                                                      len(value["hits"])))

        self.blast_folder = self.save_folder + "/blast"
        self.clean_blast_folder()
        encoder = TextEncoder(self.language)

        hits = value["hits"]
        hits.sort(key=lambda k: len(k["text"]), reverse=False)

        texts = [v["text"] for v in value["hits"]]
        encoded = [encoder.encode_text(t) for t in texts]

        self.make_db(encoded)
        results = self.blast_data()
        hit_results = self.extract_hit_results(results)
        clusters = []
        cluster_map = {}
        done_i = set()

        for i in range(len(hit_results)):
            if i in done_i: continue
            curr = hit_results[i]
            hit_length = curr[0][2]
            cluster = [i]
            done_i.add(i)
            for hsp in curr:
                align_text_i = hsp[0]
                align_length = hsp[1]
                align_text_full_length = len(encoded[align_text_i])
                if align_text_i in done_i: continue
                if hit_length > align_text_full_length:
                    longer = hit_length
                else:
                    longer = align_text_full_length
                if longer * self.max_distance < align_length:
                    cluster.append(align_text_i)
                    done_i.add(align_text_i)
                    cluster_map[align_text_i] = len(clusters)

            cluster_map[i] = len(clusters)
            clusters.append(cluster)

        ## FIND LEN == 1:
        top = []
        for cluster_i, cluster in enumerate(clusters):
            if len(cluster) == 1:
                hit_index = cluster[0]
                res = hit_results[hit_index]
                best = (None, 10000)
                for v in res:
                    align_text_i = v[0]
                    align_length = v[1]
                    hit_length = v[2]
                    align_text_full_length = len(encoded[align_text_i])
                    diff = abs(hit_length - align_text_full_length)
                    if diff < best[1]:
                        best = (align_text_i, diff)
                res.sort(key=itemgetter(1), reverse=True)
                #print(res[0][0], cluster_map[res[0][0]])
                try:
                    clusters[cluster_map[res[0][0]]].append(hit_index)
                except KeyError:
                    pass
                top.append(cluster_i)

        top.sort(reverse=True)
        for i in top:
            clusters.pop(i)

        new_clusters = {}
        for cluster_i, cluster in enumerate(clusters):
            l = 0
            cluster_hits = []
            for hit_index in cluster:
                cluster_hits.append(hits[hit_index])
                l += len(texts[hit_index])
            d = {"hits": cluster_hits, "length": int(l / len(cluster_hits))}
            new_clusters[key + "_" + str(cluster_i)] = d
        print("Extracted {} clusters.".format(len(new_clusters)))
        return new_clusters
Example #14
def run(config, output_dir):
    config = read_and_update_config(config)
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/config.json', 'w') as f:
        json.dump(config, f)
    print('running with config:')
    print(config)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('running on device:', device)

    print('loading dataset:')
    #     dataset, loader = get_emnlp_2017_news_pretrained_vocab(**config)
    dataset, loader = get_quora_texts_pretrained_vocab(
        split='train', **config
    )  # d_batch=d_batch, should_pad=True, pad_to_length=d_max_seq_len)
    print('d_max_seq_len:', dataset.d_max_seq_len)
    references = torch.load(QUORA_TEXT_PRETRAINED_VOCAB_VALID_SET_PATH
                            )[:config['num_fast_bleu_references']]
    print('num validation set BLEU references:', len(references))

    print('constructing models:')
    d_batch = 512
    d_noise = 100
    d_vocab = 27699
    num_epochs = 50
    start_epoch = 1
    d_gen_layers = 1
    gen_dropout = 0.5
    d_max_seq_len = 26
    d_gen_hidden = 512
    d_dis_hidden = 512
    d_text_feature = 512
    d_text_enc_cnn = 512
    text_enc_dropout = 0.5
    text_enc_weights_path = 'new_text_enc.pth'
    text_dec_weights_path = 'faster_text_gen_v1.pth'
    lat_gen_weights_path = 'run_12_all_fixed/epoch_46_lat_gen.pth'
    lat_dis_weights_path = 'run_12_all_fixed/epoch_46_lat_dis.pth'
    references_path = QUORA_TEXT_PRETRAINED_VOCAB_VALID_SET_PATH

    text_enc = TextEncoder(d_vocab=d_vocab,
                           d_text_feature=d_text_feature,
                           text_enc_dropout=text_enc_dropout,
                           d_text_enc_cnn=d_text_enc_cnn).to(device)
    text_enc.load_state_dict(torch.load(text_enc_weights_path))
    text_dec = TextDecoder(d_vocab=d_vocab,
                           d_text_feature=d_text_feature,
                           d_gen_hidden=d_gen_hidden,
                           d_max_seq_len=d_max_seq_len,
                           d_gen_layers=d_gen_layers,
                           gen_dropout=gen_dropout,
                           pad_token=dataset.pad_token,
                           start_token=dataset.start_token,
                           end_token=dataset.end_token).to(device)
    text_dec.load_state_dict(torch.load(text_dec_weights_path))

    lat_dis = LatentDiscriminator(d_text_feature=d_text_feature,
                                  d_dis_hidden=d_dis_hidden).to(device)
    lat_dis.load_state_dict(torch.load(lat_dis_weights_path))
    lat_gen = LatentGenerator(d_noise=d_noise,
                              d_text_feature=d_text_feature,
                              d_gen_hidden=d_gen_hidden).to(device)
    lat_gen.load_state_dict(torch.load(lat_gen_weights_path))

    print('training:')
    train(lat_gen,
          text_dec,
          text_enc,
          lat_dis,
          dataset,
          loader,
          device,
          output_dir=output_dir,
          references=references,
          **config)
    print('finished training')
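run() above restores every module with the standard PyTorch pattern: construct the module with the same architecture, then call load_state_dict(torch.load(path)). A toy sketch of that pattern with a stand-in module and a hypothetical file name:

import torch
import torch.nn as nn

# Toy module standing in for TextEncoder, TextDecoder, etc.
model = nn.Linear(4, 2)
torch.save(model.state_dict(), "toy_weights.pth")

restored = nn.Linear(4, 2)  # must match the saved architecture
restored.load_state_dict(torch.load("toy_weights.pth"))
restored.eval()  # disable dropout and similar training-only behaviour before inference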