def labeled(srf, orig_best=None, orig_docset_id=None):
    output = []
    n = 0
    curr = []
    last_docset_id = None
    for source in os.listdir(srf):
        base = ".".join(source.split(".")[:-1])
        embedding = os.path.join(srf, base + ".emd")
        if source.endswith(".nrm") and os.path.exists(embedding):
            doc_id = source.split(".")[0]
            sentences = utils.fileaslist(os.path.join(srf, source))
            embeddings = [[float(y) for y in x.split(" ")]
                          for x in utils.fileaslist(embedding)]
            if parse_rouge(os.path.join(srf, base + ".rge"), 2) < min_rge:
                continue
            best = orig_best if orig_best else set(
                utils.fileaslist(os.path.join(srf, "%s.best%d" % (base, ver))))
            docset_id = orig_docset_id if orig_docset_id else doc_id
            if docset_id != last_docset_id:
                if len(curr) > 0:
                    output.append(curr)
                curr = []
                last_docset_id = docset_id
            for i, sen in enumerate(sentences):
                d = dict()
                n += 1
                d["docset_id"] = docset_id
                d["doc_id"] = doc_id
                d["sentence_id"] = n if orig_best else str(i + 1)
                d["embedding"] = embeddings[i]
                d["label"] = 1 if sen in best else 0
                d["text"] = sentences[i]
                if len(d) > 0:
                    curr.append(d)
    if len(curr) > 0:
        output.append(curr)
    return output
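
# Example usage (illustrative sketch only): the path below is hypothetical, and
# the module-level globals `min_rge` and `ver` that `labeled` reads must already
# be defined.
if __name__ == "__main__":
    docsets = labeled("/path/to/datapoint/sources")
    print("loaded %d docsets, %d labeled sentences" %
          (len(docsets), sum(len(ds) for ds in docsets)))
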
def get_input_paths(folder, qResults, language):
    paths = []
    if os.path.isfile(qResults):
        with open(qResults) as r:
            results = json.load(r)
        for res in results["document info"]["results"]:
            index = res["index"]
            index_toks = index.replace("index_store", "mt_store").split("/")
            filename = res["filename"]
            ep = "%s/%s/%s/%s.txt" % (folder, "/".join(index_toks[:5]),
                                      index_toks[-2], filename)
            if language == "en":
                paths.append((ep, ep))
            else:
                # check that the correct language was selected in the server
                assert (language == "sw" and "1A/" in index
                        or language == "tl" and "1B/" in index)
                input_name = tempfile.NamedTemporaryFile().name
                index_toks = index.replace("index_store",
                                           "morphology_store").split("/")
                morpho_store = "%s/%s" % (folder, "/".join(index_toks[:5]))
                if DEBUG:
                    print("looking in morpho store: %s" % morpho_store)
                morpho_ver = list(
                    filter(
                        lambda x: "morph-v3.0" in x.name and
                        ("v4.0" in x.name or "audio" not in ep),
                        sorted(Path(morpho_store).iterdir(),
                               key=lambda f: f.stat().st_mtime)))[-1].name
                # list(filter(lambda x: "morph-v3.0" in x, os.listdir(morpho_store)))[0]
                input_file = "%s/%s/%s.txt" % (morpho_store, morpho_ver, filename)
                with open(input_name, "w") as w:
                    with open(input_file) as r:
                        for line in r:
                            d = json.loads(line)
                            if len(d) > 0:
                                w.write(" ".join(
                                    map(lambda x: x["word"], d[0])) + "\n")
                            else:
                                w.write("empty.\n")
                paths.append((input_name, ep))
                if (len(fileaslist(input_name)) != len(
                        list(filter(lambda x: len(x) > 0, fileaslist(ep))))):
                    if DEBUG:
                        print("DEBUG: diff sizes %s vs %s" % (input_file, ep))
    else:
        for path in os.listdir(folder):
            p = "%s/%s" % (folder, path)
            paths.append((p, p))
    return paths
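
# Example usage (illustrative sketch; the folder, results file and language code
# below are hypothetical): each returned pair is (model_input_path, english_text_path).
if __name__ == "__main__":
    for input_path, english_path in get_input_paths("/data/material",
                                                    "/tmp/query_results.json",
                                                    "sw"):
        print("%s -> %s" % (input_path, english_path))
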
def kl(source_path):
    def dist(text):
        words = text.split(" ")
        d = defaultdict(int)
        for w in words:
            d[w] += 1.0 / len(words)
        return d

    sentences = utils.fileaslist(source_path)
    D_dist = dist(" ".join(sentences))
    best = []
    for j in range(FIRST_N_LINES):
        min_dist = 100000
        best_sen = None
        for s in sentences:
            if s in best:
                continue  # skip sentences that were already selected
            candidate = best + [s]
            S_dist = dist(" ".join(candidate))
            distance = 0.0
            for w in s.split(" "):
                distance += -S_dist[w] * math.log(
                    D_dist[w] / (S_dist[w] + 0.00000000001), 2.0)
            if distance < min_dist:
                min_dist = distance
                best_sen = s
        if best_sen:
            best.append(best_sen)
    return "\n".join(best)
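
# Example usage (illustrative sketch): greedily pick up to FIRST_N_LINES sentences
# whose addition keeps the summary's word distribution closest (in KL terms) to the
# whole document. The input path is hypothetical; a normalized sentence-per-line
# file is assumed.
if __name__ == "__main__":
    print(kl("/path/to/source.txt.nrm"))
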
def sum2img(self, summary_dir, query_path, highlight):
    # get weights
    query_embd, _ = self.get_query_embd(query_path)
    weights_dir = tempfile.mkdtemp()
    try:
        for summary_fn in os.listdir(summary_dir):
            weights = []
            summary_path = os.path.join(summary_dir, summary_fn)
            for sen in fileaslist(summary_path):
                sen_weights = []
                for word in sen.split(" "):
                    word_embd = self.embed_word(word)
                    weight = (0.0 if word.lower() in self.stopwords
                              else cossim_weight(word_embd, query_embd))
                    assert (weight >= 0.0 and weight <= 1.0)
                    sen_weights.append(weight)
                weights.append([str(w) for w in sen_weights])
            write2file(
                "\n".join([" ".join([str(w) for w in ws]) for ws in weights]),
                os.path.join(weights_dir, summary_fn))
        # gen image
        os.system("./gen_images.sh %s %s %s %s" %
                  (summary_dir, weights_dir, summary_dir, highlight))
    finally:
        os.system("rm -r %s" % weights_dir)
def get_embds(self, norm_text_path, query_path):
    # deal with the text
    out_f = tempfile.NamedTemporaryFile()
    em.print_embeddings(
        em.get_embeddings(self.em[0], self.em[1], self.em[2], self.em[3],
                          norm_text_path, self.em[4]), out_f.name)
    sen_embds = [[float(x) for x in line.split(" ")]
                 for line in fileaslist(out_f.name)]
    qry_embds, query = self.get_query_embd(query_path)
    return sen_embds, qry_embds, query
def centroidemd(source_path):
    get_embds = lambda path: [[float(y) for y in x.split(" ")]
                              for x in utils.fileaslist(path[:-3] + "emd")]
    # always load the sentence embeddings; only the centroid is cached
    source_embds = get_embds(source_path)
    if source_path not in cache:
        cache[source_path] = utils.average(source_embds)
    source_sens = utils.fileaslist(source_path)
    centroid = cache[source_path]
    assert len(source_sens) == len(source_embds)
    best = set()
    for j in range(FIRST_N_LINES):
        try:
            best.add(
                max(set(range(len(source_embds))) - best,
                    key=lambda i: utils.cosine_similarity(source_embds[i],
                                                          centroid)))
        except ValueError:
            print("too small text")
    # emit the selected sentences in document order
    return "\n".join([source_sens[i] for i in sorted(best)])
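
# Example usage (illustrative sketch): centroidemd expects the sentence file plus a
# sibling embeddings file with the same basename and an "emd" suffix (e.g. doc.nrm
# and doc.emd); the path is hypothetical and `cache` is the module-level dict used
# to memoize centroids.
if __name__ == "__main__":
    print(centroidemd("/path/to/doc.nrm"))
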
def split2sens(self, raw_text_path):
    out_file_name = tempfile.NamedTemporaryFile().name
    if self.segment:
        with open(out_file_name, "w", encoding="utf-8") as out_f:
            test = sbd.get_data(raw_text_path, tokenize=True)
            test.featurize(self.splitta_model, verbose=False)
            self.splitta_model.classify(test, verbose=False)
            test.segment(use_preds=True, tokenize=False, output=out_f)
    else:
        with open(out_file_name, "w") as w:
            for line in fileaslist(raw_text_path):
                line = line.strip()
                if len(line) > 0:
                    w.write(line + "\n")
    return out_file_name
def get_query_embd(self, query_path):
    # extract the query from the query_path
    with open(query_path, encoding="utf-8") as qr:
        query_dict = json.load(qr)
    query = (query_dict["parsed_query"][0]["content"]
             if not self.translate_query else get_translated_query(query_dict))
    # deal with query
    qin_f = tempfile.NamedTemporaryFile()
    write2file(wt.normalize(query), qin_f.name)
    qout_f = tempfile.NamedTemporaryFile()
    em.print_embeddings(
        em.get_embeddings(self.em[0], self.em[1], self.em[2], self.em[3],
                          qin_f.name, self.em[4]), qout_f.name)
    qry_embds = [float(x) for x in fileaslist(qout_f.name)[0].split(" ")]
    return qry_embds, query
def runformds(datapoint_folder):
    dsf = dataset_folder
    dpf = datapoint_folder
    srf = os.path.join(dsf, dpf, "sources")
    docset_id = datapoint_folder
    max_rouge = -1.0
    best_source = None
    for source in os.listdir(srf):
        if source.endswith(".rge"):
            score = parse_rouge(os.path.join(srf, source), 1)
            if score > max_rouge:
                max_rouge = score
                best_source = ".".join(source.split(".")[:-1])
    if not best_source:
        return []
    best = set(
        utils.fileaslist(os.path.join(srf, "%s.best%d" % (best_source, ver))))
    return labeled(srf, orig_best=best, orig_docset_id=docset_id)
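
# Example usage (illustrative sketch): run runformds over every datapoint folder in
# the dataset and flatten the results. `dataset_folder` is the module-level global
# that runformds reads; the value here is hypothetical.
if __name__ == "__main__":
    dataset_folder = "/path/to/dataset"
    all_docsets = []
    for dp in os.listdir(dataset_folder):
        all_docsets.extend(runformds(dp))
    print("collected %d docsets" % len(all_docsets))
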
def ingest_text(self, raw_text_path, out_text_path, query_path):
    sens_text_path = self.split2sens(raw_text_path)
    sens_text_path2 = self.split2sens(out_text_path)
    norm_text_path = self.normalize(sens_text_path)
    sen_embds, qry_embds, query = self.get_embds(norm_text_path, query_path)
    assert (len(fileaslist(sens_text_path)) == len(fileaslist(norm_text_path)))
    if DEBUG:
        print("compare sizes: %d - %d" %
              (len(fileaslist(sens_text_path2)),
               len(fileaslist(norm_text_path))))
    assert (len(fileaslist(sens_text_path2)) == len(fileaslist(norm_text_path)))
    clean_texts = fileaslist(sens_text_path2)
    sent_tokens = [sen.split(" ") for sen in fileaslist(norm_text_path)]
    return get_inputs_metadata(sent_tokens, clean_texts, sen_embds, qry_embds,
                               query=query)
def get_embeddings(json_input_file):
    code = abs(hash(json_input_file))
    em.create_embeddings(json_input_file, "%s/mds/%d.norm" % (TMP, code),
                         "%s/mds/%d.emb" % (TMP, code), models)
    return [[float(n) for n in l.split(" ")]
            for l in utils.fileaslist("%s/mds/%d.emb" % (TMP, code))]
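
# Example usage (illustrative sketch): embed the sentences of one JSON input file.
# TMP, `models` and `em` come from the module context; the input path is hypothetical.
if __name__ == "__main__":
    embds = get_embeddings("/path/to/input.json")
    print("%d sentences, embedding dim %d" %
          (len(embds), len(embds[0]) if embds else 0))
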
dps = []
# get a squad document
sens = []
embeddings = []
for p in d["paragraphs"][:30]:
    write2file(p["context"], raw_text_path)
    sens_text_path = featurizer.split2sens(raw_text_path)
    norm_text_path = featurizer.normalize(sens_text_path)
    q = random.choice(p["qas"])
    query = q["question"]
    write2file(query, query_path)
    sen_embds, qry_embds, _ = featurizer.get_embds(norm_text_path, query_path)
    dps.append((query, len(sens), len(sen_embds), qry_embds))
    embeddings.extend(sen_embds)
    sens.extend(fileaslist(sens_text_path))
# write dps for the document
inputs = []
for i, sen in enumerate(sens):
    inpt = dict()
    inpt["sentence_id"] = i
    inpt["text"] = sen
    inpt["embedding"] = embeddings[i]
    inpt["word_count"] = len(sen.split(" "))
    inputs.append(inpt)
for query, st, cnt, qry_embds in dps:
    dp = dict()
    dp["inputs"] = inputs
    dp["qembedding"] = qry_embds
def first3(text_path):
    return "\n".join(utils.fileaslist(text_path)[:FIRST_N_LINES])
def rand3(text_path):
    l = utils.fileaslist(text_path)
    return "\n".join(random.sample(l, min(FIRST_N_LINES, len(l))))
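
# Example usage (illustrative sketch): the lead (first3) and random (rand3) baselines
# on a hypothetical normalized source file.
if __name__ == "__main__":
    print(first3("/path/to/source.txt.nrm"))
    print(rand3("/path/to/source.txt.nrm"))
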
import sys, os
import utils
sys.path.append("../rouge-scripts")
import rouge as rge

dataset = sys.argv[1]
sample = int(sys.argv[2])

for d in os.listdir(dataset)[:sample]:
    ref = utils.fileaslist(os.path.join(dataset, d, "content.txt.nrm"))
    ref = [x for x in ref if len(x.split(" ")) > 3][:3]
    best_score = 0.0
    best_f = None
    for f in os.listdir(os.path.join(dataset, d, "sources")):
        if f.endswith(".rge"):
            score = rge.parse_rouge(os.path.join(dataset, d, "sources", f), 1)
            if score > best_score:
                best_score = score
                best_f = f
    best = utils.fileaslist(
        os.path.join(dataset, d, "sources", best_f[:-3] + "best2"))
    print("==================================================")
    print("\n".join(ref))
    print("--------------------------------------------------")
    print("\n".join(best))
    print("==================================================")
        if score > max_score:
            max_score = score
            best = s
    if best:
        base = ".".join(best.split(".")[:-1])
        s = base + ".best" + ver
        text_path = os.path.join(sources, base + ".nrm")
        cont_path = os.path.join(dataset, dp, "content.txt.nrm")
        if os.path.exists(os.path.join(sources, s)) and os.path.exists(text_path):
            if rge.parse_rouge(os.path.join(sources, base + ".rge"), 2) < min_rge:
                continue
            can_text = candidate(text_path)
            ref_text = "\n".join([
                x for x in utils.fileaslist(cont_path) if len(x.split(" ")) > 3
            ][:FIRST_N_LINES])
            can_path = "/tmp/mds/%s.can.txt" % base
            ref_path = "/tmp/mds/%s.ref.txt" % base
            utils.write2file(can_text, can_path)
            utils.write2file(ref_text, ref_path)
            eval_writer.write("%s %s\n" % (can_path, ref_path))

eval_writer.close()
print("created the evaluation file, running rouge...")
os.chdir(rouge_dir)
rge.rouge(1000, eval_path, eval_out)
print("done.")
import os, sys, utils

input_dir = sys.argv[1]
for f in os.listdir(input_dir):
    text = utils.fileaslist("%s/%s" % (input_dir, f))
    clean = []
    for sen in text:
        if len(sen.split(" ")) >= 4:
            clean.append(sen)
    if len(text) > len(clean):
        print("%d => %d" % (len(text), len(clean)))
    utils.write2file("\n".join(clean), "%s/%s" % (input_dir, f))
outputs_dir = sys.argv[5]
os.system("mkdir -p %s" % (outputs_dir))
n = 1000
print("number of shards: %d, start doc %d, end doc %d" % (n, start, end))
data_path = "%s/data%d-%d.txt" % (outputs_dir, start, end)
os.system("> %s" % data_path)
doc = start
d = start // n
if doc != 0:
    d += 1
with open(data_path, "w") as w:
    while doc <= end:
        if doc % n == 0:
            d += 1
        os.system("cp %s/%d/%d.query /tmp/yan/query/queries.txt" %
                  (inputs_path, d, doc))
        os.system("cp %s/%d/%d.txt /tmp/yan/inputs/input.txt" %
                  (inputs_path, d, doc))
        run(port)
        summary_sens = fileaslist("/tmp/yan/outputs/input.txt")
        newdoc = " ".join(summary_sens)
        if "\n" in newdoc:
            raise Exception("new line in the summary!")
        if len(summary_sens) > 0:
            w.write(newdoc + "\n")
        else:
            w.write("\n")
        print("done with document %d" % doc)
        doc += 1
def normalize(self, sens_path):
    out_file_name = tempfile.NamedTemporaryFile().name
    write2file(
        "\n".join([wt.normalize(line) for line in fileaslist(sens_path)]),
        out_file_name)
    return out_file_name
import sys, os
sys.path.append("../training")
from utils import fileaslist

d1 = sys.argv[1]
d2 = sys.argv[2]

s = 0.0
n = 0
for f1 in os.listdir(d1):
    pred1 = set(fileaslist("%s/%s" % (d1, f1)))
    pred2 = set(fileaslist("%s/%s" % (d2, f1)))
    # Jaccard similarity between the two prediction sets
    score = float(len(pred1.intersection(pred2))) / len(pred1.union(pred2))
    s += score
    n += 1
print("avg similarity: %f" % (s / n))