def encode_text(self, keys, file_index, name=None):
    if name:
        db = self.open_database("original_data_{}_DB".format(name))
    else:
        db = self.open_database("original_data_DB")
    encoder = TextEncoder(self.language)
    with db.begin() as txn, gzip.open(
            self.output_folder + "/encoded/f_{}.gz".format(file_index),
            "wt") as gzip_file:
        for key in keys:
            d = {}
            text = txn.get(key).decode("unicode-escape")
            d["id"] = key.decode("utf-8")
            d["text"] = encoder.encode_text(text)
            if len(d["text"]) == 0:
                continue
            gzip_file.write(json.dumps(d) + "\n")
def get_blastpair_data(self, item):
    orig_db = lmdb.open(self.db_loc, readonly=True, lock=False)
    o_db = orig_db.begin()
    textenc = TextEncoder("eng")
    blastpair = BlastPairDB(item, textenc)
    blastpair.set_correct_indices_and_texts(o_db)
    return blastpair.get_outdict()
def test_text_dis_end_to_end():
    d_batch = 2
    d_max_seq_len = 26
    d_vocab = 27699
    d_dis_hidden = 256
    d_text_feature = 512
    text_enc_dropout = 0.5
    d_text_enc_cnn = 512
    text_enc = TextEncoder(d_vocab=d_vocab,
                           d_text_feature=d_text_feature,
                           text_enc_dropout=text_enc_dropout,
                           d_text_enc_cnn=d_text_enc_cnn)
    lat_dis = LatentDiscriminator(d_text_feature=d_text_feature,
                                  d_dis_hidden=d_dis_hidden)
    text_enc.load_state_dict(torch.load('new_text_enc.pth'))
    texts = torch.randint(low=0, high=d_vocab, size=(d_batch, d_max_seq_len))
    text_features = text_enc(to_one_hot(texts, d_vocab))
    assert text_features.size() == (d_batch, d_text_feature) and \
        text_features.dtype == torch.float
    valids = lat_dis(text_features)
    assert valids.size() == (d_batch,) and valids.dtype == torch.float
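# The to_one_hot helper used in the test above is not shown in this snippet.
# A minimal sketch of such a helper (an assumption about its behavior, not the
# project's actual implementation), using torch.nn.functional.one_hot:
import torch
import torch.nn.functional as F

def to_one_hot(token_ids, d_vocab):
    # token_ids: LongTensor of shape (d_batch, d_max_seq_len)
    # returns a float tensor of shape (d_batch, d_max_seq_len, d_vocab)
    return F.one_hot(token_ids, num_classes=d_vocab).float()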
def get_blastpair_group_data(self, blastpair_group):
    orig_db = lmdb.open(self.db_loc, readonly=True, lock=False)
    o_db = orig_db.begin()
    textenc = TextEncoder("eng")
    group_outdata = []
    for item in blastpair_group:
        blastpair = BlastPairDB(item, textenc)
        blastpair.set_correct_indices_and_texts(o_db)
        group_outdata.append(blastpair.get_outdict())
    return group_outdata
def fill_line(from_id, line, output_location, lang):
    splits = line.split()
    curr_id = splits[0].replace("\\", "")
    print(curr_id)
    orig_text_from = lmdb.open(
        output_location + "db/original_data_DB", readonly=True).begin().get(
            from_id.encode("ascii")).decode("unicode-escape")
    orig_text_curr = lmdb.open(
        output_location + "db/original_data_DB", readonly=True).begin().get(
            curr_id.encode("ascii")).decode("unicode-escape")
    #print(orig_text)
    indexes = splits[3:5]
    print(indexes)
    encoder = TextEncoder(lang)
    from_enc = encoder.decode_text(orig_text_from, splits[1], splits[2])
    curr_enc = encoder.decode_text(orig_text_curr, splits[3], splits[4])
    return from_enc, curr_enc
def get_batch_jsondata_inmem(input_data, inmemtxtdata):
    textenc = TextEncoder("eng")
    batchdata = blastdr.read_blast_cluster_csv_inmem(input_data)
    i = 0
    max_i = len(batchdata)
    outdata = []
    for item in batchdata:
        i += 1
        if i % 100 == 0:
            print(" --- " + str(i) + "/" + str(max_i))
        blastpair = BlastPairMulti(item, textenc)
        blastpair.set_correct_indices_and_texts(inmemtxtdata)
        outdata.append(blastpair.get_outdict())
    return outdata
def __init__(self, blastdata, textenc=TextEncoder("eng")):
    self.source_id = blastdata['source_id']
    self.source_text_start = None
    self.source_text_end = None
    self.source_start_blast = blastdata['source_start_blast']
    self.source_end_blast = blastdata['source_end_blast']
    self.source_text = None
    self.target_id = blastdata['target_id']
    self.target_text_start = None
    self.target_text_end = None
    self.target_start_blast = blastdata['target_start_blast']
    self.target_end_blast = blastdata['target_end_blast']
    self.target_text = None
    self.align_length = blastdata['align_length']
    self.positives_percent = blastdata['positives_percent']
    self.textenc = textenc
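# Note: the textenc=TextEncoder("eng") default above is evaluated once when
# the function is defined, so every instance created without an explicit
# encoder shares that single TextEncoder object. If a fresh encoder per
# instance is wanted, a common alternative looks like the sketch below
# (hypothetical class name, not the original code):
class BlastPairSketch:
    def __init__(self, blastdata, textenc=None):
        self.textenc = textenc if textenc is not None else TextEncoder("eng")
        self.blastdata = blastdata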
def write_batch_json(input_fname, fname_prefix, ecco_id_dict, eebo_id_dict,
                     outputdir="../output/blast_batches/"):
    textenc = TextEncoder("eng")
    outjson = get_out_json_fname(input_fname, fname_prefix)
    batchdata = blastdr.read_blast_cluster_csv(input_fname)
    i = 0
    max_i = len(batchdata)
    outdata = []
    for item in batchdata:
        i += 1
        if i % 100 == 0:
            print(outjson + " " + str(i) + "/" + str(max_i))
        blastpair = BlastPair(item, ecco_id_dict, eebo_id_dict, textenc)
        blastpair.set_correct_indices_and_texts()
        outdata.append(blastpair.get_outdict())
    outfile = outputdir + outjson
    with open(outfile, 'w', encoding='utf-8') as jsonout:
        json.dump(outdata, jsonout, indent=2, ensure_ascii=False)
    return outfile
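# Hypothetical usage sketch for write_batch_json (the CSV file name, prefix
# and the empty ID dictionaries are placeholders, not project values):
outfile = write_batch_json("blast_batch_0.csv", "ecco",
                           ecco_id_dict={}, eebo_id_dict={})
with open(outfile, encoding="utf-8") as jsonin:
    pairs = json.load(jsonin)
print("{} aligned pairs written to {}".format(len(pairs), outfile))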
def add_logit_op(self):
    """Adds the unrolled RNN:
        h_0 = 0
        for t in 1 to T:
            o_t, h_t = cell(x_t, h_{t-1})
            o_drop_t = Dropout(o_t, dropout_rate)
            y_t = o_drop_t U + b_2

    TODO: There are quite a few things you'll need to do in this function:
        - Define the variables U, b_2.
        - Define the vector h as a constant and initialize it with zeros.
          See tf.zeros and tf.shape for information on how to initialize this
          variable to be of the right shape.
          https://www.tensorflow.org/api_docs/python/constant_op/constant_value_tensors#zeros
          https://www.tensorflow.org/api_docs/python/array_ops/shapes_and_shaping#shape
        - In a for loop, begin to unroll the RNN sequence. Collect the
          predictions in a list.
        - When unrolling the loop, from the second iteration onwards, you will
          HAVE to call tf.get_variable_scope().reuse_variables() so that you
          do not create new variables in the RNN cell.
          See https://www.tensorflow.org/versions/master/how_tos/variable_scope/
        - Concatenate and reshape the predictions into a predictions tensor.
    Hint: You will find the function tf.pack (similar to np.asarray) useful to
          assemble a list of tensors into a larger tensor.
          https://www.tensorflow.org/api_docs/python/array_ops/slicing_and_joining#pack
    Hint: You will find the function tf.transpose and the perms argument
          useful to shuffle the indices of the tensor.
          https://www.tensorflow.org/api_docs/python/array_ops/slicing_and_joining#transpose

    Remember:
        * Use the xavier initialization for matrices.
        * Note that tf.nn.dropout takes the keep probability (1 - p_drop) as
          an argument. The keep probability should be set to the value of
          self.dropout_placeholder.

    Returns:
        pred: tf.Tensor of shape (batch_size, max_length, n_classes)
    """
    with tf.variable_scope("QuoraModel"):
        text1 = self.add_embedding(self.input_placeholder1)
        text2 = self.add_embedding(self.input_placeholder2)
        dropout_rate = self.dropout_placeholder

        # step 1: Text encoder
        textEncoder = TextEncoder(self.config)
        with tf.variable_scope("TextEncoder"):
            text1_preds = textEncoder.add_prediction_op(text1)
            tf.get_variable_scope().reuse_variables()
            text2_preds = textEncoder.add_prediction_op(text2)

        # apply mask to each encoding output
        #text1_encoding = tf.boolean_mask(text1_preds, self.mask_placeholder1)
        #text2_encoding = tf.boolean_mask(text2_preds, self.mask_placeholder2)
        text1_encoding = text1_preds * tf.expand_dims(
            tf.to_float(self.mask_placeholder1), -1)
        text2_encoding = text2_preds * tf.expand_dims(
            tf.to_float(self.mask_placeholder2), -1)

        # take last timestep as encoding
        #text1_encoding = text1_preds[:, -1]
        #text2_encoding = text2_preds[:, -1]

        # # step 2: concatenate two encodings
        # text1_encoding = tf.reduce_sum(text1_encoding, 1)
        # text2_encoding = tf.reduce_sum(text2_encoding, 1)
        # encodings = tf.concat(1, [text1_encoding, text2_encoding])

        # more complex alternative
        # step 2: Co-attention layer
        # non-linearity on one question encoding
        question_encoder = QuestionEncoder(self.config)
        text1_encoding = question_encoder.add_prediction_op(text1_encoding)
        encodings = coattention_layer.encode(text1_encoding, text2_encoding)

        # step 3: Bi-LSTM
        biLSTMEncoder = BidirectionalLSTMEncoder(self.config)
        encodings = biLSTMEncoder.add_prediction_op(encodings)
        _shape = encodings.get_shape()[1] * encodings.get_shape()[2].value
        encodings = tf.reshape(encodings, [-1, _shape.value])  # flatten

        # step 4: Decoder/Quora classifier
        classifier = QuoraClassifier(self.config)
        logits = classifier.add_prediction_op(encodings)

    assert logits.get_shape().as_list() == [None, self.config.n_classes_quora], \
        "predictions are not of the right shape. Expected {}, got {}".format(
            [None, self.config.n_classes_quora], logits.get_shape().as_list())
    return logits
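# The docstring above describes manual RNN unrolling with variable reuse.
# A minimal TF1-era sketch of that pattern (hypothetical helper, written with
# a GRU cell and tf.stack in place of the older tf.pack; not the assignment's
# starter code):
import tensorflow as tf  # assumes TensorFlow 1.x

def unroll_rnn(x, d_hidden, n_classes, keep_prob):
    """x: float tensor (batch, T, d_in); returns preds (batch, T, n_classes)."""
    cell = tf.nn.rnn_cell.GRUCell(d_hidden)
    batch_size = tf.shape(x)[0]
    h = tf.zeros([batch_size, d_hidden])  # h_0 = 0
    U = tf.get_variable("U", shape=(d_hidden, n_classes),
                        initializer=tf.contrib.layers.xavier_initializer())
    b2 = tf.get_variable("b2", shape=(n_classes,),
                         initializer=tf.zeros_initializer())
    preds = []
    with tf.variable_scope("RNN"):
        for t in range(x.get_shape()[1].value):
            if t > 0:
                # reuse the cell's weights from the second step onwards
                tf.get_variable_scope().reuse_variables()
            o_t, h = cell(x[:, t, :], h)
            o_drop_t = tf.nn.dropout(o_t, keep_prob)
            preds.append(tf.matmul(o_drop_t, U) + b2)
    return tf.stack(preds, axis=1)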
class ClusterFiller:

    def __init__(self, output_folder, threads, language, split_size, data_dbs,
                 info_dbs, custom_unfilled, custom_filled, min_count):
        self.output_folder = output_folder
        self.threads = threads
        self.split_size = split_size
        self.encoder = TextEncoder(language)
        self.min_count = min_count
        self.custom_unfilled = custom_unfilled
        if data_dbs == None:
            self.custom_data_DBs = None
        else:
            self.custom_data_DBs = data_dbs.split(";")
        if info_dbs == None:
            self.custom_info_DBs = None
        else:
            self.custom_info_DBs = info_dbs.split(";")
        if custom_unfilled == None:
            self.cluster_folder = "{}/clusters/unfilled".format(
                self.output_folder)
        else:
            self.cluster_folder = custom_unfilled
        if not os.path.exists(self.cluster_folder):
            os.makedirs(self.cluster_folder)
        if custom_filled == None:
            self.save_folder = "{}/clusters/filled".format(self.output_folder)
        else:
            self.save_folder = custom_filled
        if not os.path.exists(self.save_folder):
            os.makedirs(self.save_folder)

    def fill_clusters(self):
        if self.custom_unfilled == None:
            folders = natsorted(os.listdir(self.cluster_folder))
            if len(folders) == 1:
                right_folders = folders
            else:
                highest_round = folders[-1].split("_")[1]
                right_folders = [
                    f for f in folders
                    if "round_{}".format(highest_round) in f
                ]
                assert len(right_folders) == 1
            files = []
            for right_folder in right_folders:
                right_files = os.listdir("{}/{}".format(
                    self.cluster_folder, right_folder))
                for right_file in right_files:
                    files.append((right_folder, right_file))
            files = natsorted(files)
        else:
            files = []
            cfiles = natsorted(os.listdir(self.cluster_folder))
            for cfile in cfiles:
                files.append(("", cfile))

        Parallel(n_jobs=self.threads)(
            delayed(self.fill_cluster)(folder, filename, index)
            for index, (folder, filename) in enumerate(files))

    def fill_cluster(self, folder, filename, file_index):
        if self.custom_data_DBs == None:
            use_custom_dbs = False
            orig_db = lmdb.open(self.output_folder + "/db/original_data_DB",
                                readonly=True)
            info_db = lmdb.open(self.output_folder + "/db/info_DB",
                                readonly=True)
            o_db = orig_db.begin()
            i_db = [info_db.begin()]
        else:
            o_db = []
            i_db = []
            use_custom_dbs = True
            for db_path in self.custom_data_DBs:
                db = lmdb.open(db_path, readonly=True)
                o_db.append(db.begin())
            for db_path in self.custom_info_DBs:
                db = lmdb.open(db_path, readonly=True)
                i_db.append(db.begin())

        with gzip.open(self.cluster_folder + "/" + folder + "/" + filename,
                       "rt") as gzip_file:
            data = json.loads(gzip_file.read())

        filled_clusters = {}
        for cluster_key, cluster_data in data.items():
            if len(cluster_data[0]) > self.min_count:
                filled_clusters[cluster_key] = self.fill(
                    cluster_data, o_db, i_db, use_custom_dbs)
            else:
                print()
                print()
                print("FOUND EMPTY CLUSTER!")
        self.save_clusters(filled_clusters, file_index)

    def generate_split_indexes(self, indexes):
        self.split_size = int(self.split_size)
        end = (int(int(indexes[0]) / self.split_size) + 1) * self.split_size
        start = int(int(indexes[1]) / self.split_size) * self.split_size
        return start, end

    def fill(self, data, o_db, i_db, use_custom_dbs):
        cluster = {}
        length = 0
        hits = []
        skips = []
        for node in data[0]:
            text_id = node.split("___")[0]
            indexes = node.split("___")[1].split("_")
            if self.split_size != None and self.split_size > 0:
                doc_start, doc_end = self.generate_split_indexes(indexes)
                text_id = "{}__{}_{}".format(text_id, doc_start, doc_end)
            orig_text = self.get_original_text(text_id, o_db, use_custom_dbs)
            try:
                text, indices = self.encoder.decode_text(
                    orig_text, indexes[0], indexes[1])
            except IndexError:
                print()
                print("\n\n", "Index Error", node, indexes)
                skips.append(node)
                continue
            hit_data = {}
            length += len(text)
            doc_key = node.split("___")[0]
            for info_db in i_db:
                info = info_db.get(doc_key.encode("ascii"))
                if info == None:  ## go through all DBs, skip the wrong one
                    continue
                info = json.loads(info.decode("unicode-escape"))
                for key, value in info.items():
                    hit_data[key] = value
            hit_data["text"] = text
            hit_data["node"] = node
            hit_data["doc_id"] = doc_key
            hit_data["original_indices"] = indices
            hit_data["encoded_indices"] = indexes
            hits.append(hit_data)
        cluster["length"] = int(length / len(hits))
        cluster["hits"] = hits
        cluster["skips"] = skips
        return cluster

    def get_original_text(self, text_id, db, use_custom_dbs):
        if use_custom_dbs:
            text = None
            for custom_db in db:
                text = custom_db.get(text_id.encode("ascii"))
                if text != None:
                    break
                text = custom_db.get(
                    text_id.replace("__", "_").encode("ascii"))
                if text != None:
                    break
        else:
            text = db.get(text_id.encode("ascii"))
        if text == None:
            return None
        else:
            return text.decode("unicode-escape")

    def save_clusters(self, clusters, file_index):
        if len(clusters) != 0:
            with gzip.open(
                    "{}/clusters_{}.gz".format(self.save_folder, file_index),
                    "wt") as gzf:
                gzf.write(json.dumps(clusters))
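# Hypothetical usage sketch for ClusterFiller (the folder name, language code
# and parameter values below are assumptions, not project defaults):
filler = ClusterFiller(output_folder="output", threads=4, language="eng",
                       split_size=None, data_dbs=None, info_dbs=None,
                       custom_unfilled=None, custom_filled=None, min_count=1)
# reads output/clusters/unfilled/*, fills each cluster from the LMDB data and
# info databases, and writes gzipped JSON into output/clusters/filled/
filler.fill_clusters()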
def seperate_levenshtein(self, key, value):
    encoder = TextEncoder(self.language)
    new_clusters = []
    ## Get texts, sort, encode
    texts = sorted(value["hits"], key=len, reverse=True)
    encoded = [encoder.encode_text(t["text"]) for t in texts]
    dones = set()
    ## Go through texts from longest to shortest. Add indexes to their own
    ## clusters if the Levenshtein distance is low enough
    for start_text_index in range(len(texts)):
        if start_text_index in dones:
            continue
        new_cluster = [start_text_index]
        dones.add(start_text_index)
        for comp_text_index in range(start_text_index + 1, len(texts)):
            if comp_text_index in dones or len(
                    texts[start_text_index]["text"]
            ) * self.max_distance > len(texts[comp_text_index]["text"]):
                continue
            distance = Levenshtein.distance(encoded[start_text_index],
                                            encoded[comp_text_index])
            if (len(encoded[start_text_index]) - distance) / len(
                    encoded[start_text_index]) >= self.max_distance:
                new_cluster.append(comp_text_index)
                dones.add(comp_text_index)
        new_clusters.append(new_cluster)

    ## Find single clusters
    single_clusters = [
        index for index, hits in enumerate(new_clusters) if len(hits) == 1
    ]
    single_clusters = [
        new_clusters.pop(index) for index in reversed(single_clusters)
    ]
    ## Combine single clusters into some other cluster
    for single_cluster in single_clusters:
        ## Only comparing against one node in each cluster
        current_to_add = 0
        current_distance = 0
        for cluster_i, cluster in enumerate(new_clusters):
            hit_to_compare = cluster[0]
            distance = Levenshtein.distance(encoded[single_cluster[0]],
                                            encoded[hit_to_compare])
            if distance > current_distance:
                current_to_add = cluster_i
                current_distance = distance
        new_clusters[current_to_add].append(single_cluster[0])

    clusters = {}
    for cluster_i, cluster in enumerate(new_clusters):
        new_key = "{}_{}".format(key, cluster_i)
        clusters[new_key] = {}
        clusters[new_key]["length"] = 0
        clusters[new_key]["hits"] = []
        for text_index in cluster:
            clusters[new_key]["hits"].append(texts[text_index])
    return clusters
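# The clustering criterion above keeps a hit when
# (len(a) - Levenshtein.distance(a, b)) / len(a) >= self.max_distance,
# i.e. max_distance acts as a similarity threshold. A small illustrative
# check on toy strings (not project data):
import Levenshtein

a, b = "kitten", "sitting"
similarity = (len(a) - Levenshtein.distance(a, b)) / len(a)
print(similarity)  # distance is 3, so similarity is 0.5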
def seperate_blast(self, key, value, filename):
    print()
    print("Filename: {}\tCluster size: {}".format(filename,
                                                  len(value["hits"])))
    self.blast_folder = self.save_folder + "/blast"
    self.clean_blast_folder()
    encoder = TextEncoder(self.language)
    hits = value["hits"]
    hits.sort(key=lambda k: len(k["text"]), reverse=False)
    texts = [v["text"] for v in value["hits"]]
    encoded = [encoder.encode_text(t) for t in texts]
    self.make_db(encoded)
    results = self.blast_data()
    hit_results = self.extract_hit_results(results)
    clusters = []
    cluster_map = {}
    done_i = set()
    for i in range(len(hit_results)):
        if i in done_i:
            continue
        curr = hit_results[i]
        hit_length = curr[0][2]
        cluster = [i]
        done_i.add(i)
        for hsp in curr:
            align_text_i = hsp[0]
            align_length = hsp[1]
            align_text_full_length = len(encoded[align_text_i])
            if align_text_i in done_i:
                continue
            if hit_length > align_text_full_length:
                longer = hit_length
            else:
                longer = align_text_full_length
            if longer * self.max_distance < align_length:
                cluster.append(align_text_i)
                done_i.add(align_text_i)
                cluster_map[align_text_i] = len(clusters)
        cluster_map[i] = len(clusters)
        clusters.append(cluster)

    ## FIND LEN == 1:
    top = []
    for cluster_i, cluster in enumerate(clusters):
        if len(cluster) == 1:
            hit_index = cluster[0]
            res = hit_results[hit_index]
            best = (None, 10000)
            for v in res:
                align_text_i = v[0]
                align_length = v[1]
                hit_length = v[2]
                align_text_full_length = len(encoded[align_text_i])
                diff = abs(hit_length - align_text_full_length)
                if diff < best[1]:
                    best = (align_text_i, diff)
            res.sort(key=itemgetter(1), reverse=True)
            #print(res[0][0], cluster_map[res[0][0]])
            try:
                clusters[cluster_map[res[0][0]]].append(hit_index)
            except KeyError:
                pass
            top.append(cluster_i)
    top.sort(reverse=True)
    for i in top:
        clusters.pop(i)

    new_clusters = {}
    for cluster_i, cluster in enumerate(clusters):
        l = 0
        cluster_hits = []
        for hit_index in cluster:
            cluster_hits.append(hits[hit_index])
            l += len(texts[hit_index])
        d = {"hits": cluster_hits, "length": int(l / len(cluster_hits))}
        new_clusters[key + "_" + str(cluster_i)] = d
    print("Extracted {} clusters.".format(len(new_clusters)))
    return new_clusters
def run(config, output_dir):
    config = read_and_update_config(config)
    os.makedirs(output_dir, exist_ok=True)
    with open(f'{output_dir}/config.json', 'w') as f:
        json.dump(config, f)
    print('running with config:')
    print(config)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('running on device:', device)

    print('loading dataset:')
    # dataset, loader = get_emnlp_2017_news_pretrained_vocab(**config)
    dataset, loader = get_quora_texts_pretrained_vocab(
        split='train', **config
    )  # d_batch=d_batch, should_pad=True, pad_to_length=d_max_seq_len)
    print('d_max_seq_len:', dataset.d_max_seq_len)
    references = torch.load(QUORA_TEXT_PRETRAINED_VOCAB_VALID_SET_PATH
                            )[:config['num_fast_bleu_references']]
    print('num validation set BLEU references:', len(references))

    print('constructing models:')
    d_batch = 512
    d_noise = 100
    d_vocab = 27699
    num_epochs = 50
    start_epoch = 1
    d_gen_layers = 1
    gen_dropout = 0.5
    d_max_seq_len = 26
    d_gen_hidden = 512
    d_dis_hidden = 512
    d_text_feature = 512
    d_text_enc_cnn = 512
    text_enc_dropout = 0.5
    text_enc_weights_path = 'new_text_enc.pth'
    text_dec_weights_path = 'faster_text_gen_v1.pth'
    lat_gen_weights_path = 'run_12_all_fixed/epoch_46_lat_gen.pth'
    lat_dis_weights_path = 'run_12_all_fixed/epoch_46_lat_dis.pth'
    references_path = QUORA_TEXT_PRETRAINED_VOCAB_VALID_SET_PATH

    text_enc = TextEncoder(d_vocab=d_vocab,
                           d_text_feature=d_text_feature,
                           text_enc_dropout=text_enc_dropout,
                           d_text_enc_cnn=d_text_enc_cnn).to(device)
    text_enc.load_state_dict(torch.load(text_enc_weights_path))
    text_dec = TextDecoder(d_vocab=d_vocab,
                           d_text_feature=d_text_feature,
                           d_gen_hidden=d_gen_hidden,
                           d_max_seq_len=d_max_seq_len,
                           d_gen_layers=d_gen_layers,
                           gen_dropout=gen_dropout,
                           pad_token=dataset.pad_token,
                           start_token=dataset.start_token,
                           end_token=dataset.end_token).to(device)
    text_dec.load_state_dict(torch.load(text_dec_weights_path))
    lat_dis = LatentDiscriminator(d_text_feature=d_text_feature,
                                  d_dis_hidden=d_dis_hidden).to(device)
    lat_dis.load_state_dict(torch.load(lat_dis_weights_path))
    lat_gen = LatentGenerator(d_noise=d_noise,
                              d_text_feature=d_text_feature,
                              d_gen_hidden=d_gen_hidden).to(device)
    lat_gen.load_state_dict(torch.load(lat_gen_weights_path))

    print('training:')
    train(lat_gen, text_dec, text_enc, lat_dis, dataset, loader, device,
          output_dir=output_dir, references=references, **config)
    print('finished training')
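# Hypothetical command-line entry point for run() (the argument names and
# default paths below are assumptions, not part of the original script):
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--config', default='config.json')
    parser.add_argument('--output-dir', default='runs/latest')
    args = parser.parse_args()
    run(args.config, args.output_dir)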