def topicalchat(
    file_name="train.json",
    data_path=os.environ["HOME"]
    + "/DIALOGUE/alexa-prize-topical-chat-dataset/conversations",
    hist_len=3,
):
    file = os.path.join(data_path, file_name)
    data = list(data_io.read_json(file).values())
    Utt = namedtuple(
        "Utterance",
        "message agent sentiment knowledge_source turn_rating",
        defaults=[SILENCE] + [None] * 4,
    )

    def build_turn(req: Utt, res: Utt):
        assert req.agent != res.agent
        return Turn(req.message, res.message)

    def build_dialogues(utts):
        # len(utts) here, not len(utterances): the role-switched variant below
        # prepends a silence utterance, so the two lengths differ
        turns = [
            build_turn(utts[k], utts[k + 1])
            for k in range(0, len(utts) // 2 * 2, 2)
        ]
        background = ""
        for k in range(len(turns)):
            some_turns = get_limited_history(turns, k, hist_len)
            yield build_input_target(background, some_turns, SEP)

    for datum in data:
        utterances = [Utt(**d) for d in datum["content"]]
        yield from build_dialogues(utterances)
        # prepend a silence utterance to switch roles
        yield from build_dialogues([Utt()] + utterances)
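# SILENCE, SEP, Turn, get_limited_history and build_input_target are defined
# elsewhere in the repo; the following is a minimal sketch of plausible
# implementations, consistent with their call sites here -- an assumption,
# not the actual code.
from collections import namedtuple

SILENCE = "<silence>"
SEP = "<sep>"
Turn = namedtuple("Turn", "request response")


def get_limited_history(l, k, hist_len):
    # last hist_len elements before index k, plus element k itself
    # (mirrors the inline get_history in generate_coqa_seq2seq below)
    return l[max(0, k - hist_len) : k + 1]


def build_input_target(background, turns, sep, use_danqi=False):
    # earlier turns plus the final request form the input; the final
    # response is the target (use_danqi is accepted but ignored here)
    *hist, last = turns
    history = sep.join(t.request + sep + t.response for t in hist)
    dialogue = sep.join(s for s in (background, history, last.request) if s)
    return dialogue, last.response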
def __init__(self, state_file, write_interval=1_000_000):
    self.state_file = state_file
    self.write_interval = write_interval
    if os.path.isfile(state_file):
        self.state = data_io.read_json(state_file)
    else:
        self.state = {}
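# A companion method such a state store would plausibly offer -- persist to
# disk every write_interval-th entry (the method itself is an assumption;
# data_io.write_json is used with this signature elsewhere in this repo):
def update(self, key, value):
    self.state[key] = value
    if len(self.state) % self.write_interval == 0:
        data_io.write_json(self.state_file, self.state)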
def update_store_data(video_file, _, model_name):
    print(f"DEBUG: update_store_data with video_file={video_file}")
    if video_file is not None and os.path.isfile(
        build_json_name(video_file, model_name)
    ):
        return json.dumps(
            data_io.read_json(build_json_name(video_file, model_name))
        )
    else:
        raise PreventUpdate
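# This reads like a dash callback body (PreventUpdate aborts the update);
# a hedged sketch of how it might be wired up -- `app` and all component
# ids below are assumptions, not taken from this repo:
from dash.dependencies import Input, Output

app.callback(
    Output("store-data", "data"),
    [
        Input("video-file-dropdown", "value"),
        Input("refresh-interval", "n_intervals"),
        Input("model-name-dropdown", "value"),
    ],
)(update_store_data)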
def read_prediction(in_file, format):
    if format == "json":
        data = numpy.array(dio.read_json(in_file))
        return data[:, 5], data[:, 6], data[:, 7]
    elif format == "csv":
        data = dio.read_csv(in_file)
        return data[:, 1], data[:, 2], data[:, 3]
    else:
        print("Unsupported input format")
        return None
def generate_personachat_seq2seq(file_name):
    data = data_io.read_json(os.environ["HOME"] + "/data/QA/" + file_name)["train"]
    for datum in data:
        background = " ".join(datum["personality"])
        for d in datum["utterances"]:
            x = d["history"] + d["candidates"][-1:]
            qas, aas = list(zip(*[x[k : k + 2] for k in range(0, len(x), 2)]))
            dialogue, target = build_input_target(background, qas, aas, SEP)
            yield dialogue, target
def calc_distances(tati_data: List[Dict], tilo_data: List[Dict]) -> Dict[str, Dict]:
    distances = defaultdict(dict)
    distances_json = "/tmp/distances.json"
    if not os.path.isfile(distances_json):
        for i, tilo in tqdm(enumerate(tilo_data)):
            for ii, tati in enumerate(tati_data):
                distances[str(i)][str(ii)] = Levenshtein.distance(str(tilo), str(tati))
        data_io.write_json(distances_json, distances)
    else:
        distances = data_io.read_json(distances_json)
    return distances
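# Possible downstream use (an assumption, given tati_data and tilo_data):
# pick, for each tilo record, the index of the closest tati record:
distances = calc_distances(tati_data, tilo_data)
closest = {i: min(dd, key=dd.get) for i, dd in distances.items()}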
def squad20(file_name):
    file = os.environ["HOME"] + "/data/QA/SQUAD20/" + file_name
    data = data_io.read_json(file)["data"]
    for datum in data:
        for p in datum["paragraphs"]:
            background = p["context"]
            for qa in p["qas"]:
                if not qa["is_impossible"]:
                    q = qa["question"]
                    for a in qa["answers"]:
                        turns = [Turn(q, a["text"])]
                        yield build_input_target(background, turns, SEP)
def build_schema_and_corpus():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang="en"),
    )
    file = os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    data = (
        {"id": d["id"], "filename": d["filename"], "story": d["story"]}
        for d in data_io.read_json(file)["data"]
    )
    return schema, data
def generate_squad20_seq2seq(file_name):
    data = data_io.read_json(os.environ["HOME"] + "/data/QA/SQUAD20/" + file_name)["data"]
    for datum in data:
        for p in datum["paragraphs"]:
            background = p["context"]
            for qa in p["qas"]:
                if not qa["is_impossible"]:
                    q = qa["question"]
                    for a in qa["answers"]:
                        dialogue, target = build_input_target(
                            background, [q], [a["text"]], SEP
                        )
                        yield dialogue, target
def generate_coqa_seq2seq(file_name, hist_len=3):
    data = data_io.read_json(os.environ["HOME"] + "/data/QA/coqa/" + file_name)["data"]

    def get_history(l: List, k, hist_len):
        return [d["input_text"] for d in l[max(0, k - hist_len) : (k + 1)]]

    for datum in data:
        dialogue_len = len(datum["questions"])
        for k in range(dialogue_len):
            q_hist = get_history(datum["questions"], k, hist_len)
            a_hist = get_history(datum["answers"], k, hist_len)
            dialogue, target = build_input_target(datum["story"], q_hist, a_hist, SEP)
            yield dialogue, target
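# The *_seq2seq generators call a build_input_target variant that takes
# separate question/answer lists; a hedged sketch of that variant (the real
# implementation is not in this file):
def build_input_target(background, qas, aas, sep):
    # all earlier question/answer pairs are history; the last answer is the
    # target and the last question closes the input
    history = [x for q, a in zip(qas[:-1], aas[:-1]) for x in (q, a)]
    dialogue = sep.join([background] + history + [qas[-1]])
    return dialogue, aas[-1]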
def merge_edictos_proceso_tables(
    edictos: List,
    data_path=f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables",
) -> List:
    raw_data = list(
        data_io.read_json(str(file)) for file in tqdm(Path(data_path).glob("*.json"))
    )
    print("parse tables")
    table_data = (parse_table(d) for d in raw_data)
    exp2table = {t.expediente: t for t in tqdm(table_data)}
    g = (
        merge_dicts(
            [asdict(e), {"tables": [asdict(exp2table[exp]) for exp in e.expedientes]}]
        )
        for e in edictos
    )
    merged_data = list(g)
    return merged_data
def build_index():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang="en"),
    )
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
    data = data_io.read_json(
        os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    )["data"]
    writer = ix.writer()
    for d in tqdm(data):
        writer.add_document(id=d["id"], filename=d["filename"], story=d["story"])
    writer.commit()
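# Hedged example of querying the index built above (standard whoosh API;
# the query string is only an illustration):
from whoosh import index
from whoosh.qparser import QueryParser

ix = index.open_dir("indexdir")
with ix.searcher() as searcher:
    query = QueryParser("story", ix.schema).parse("white kitten")
    for hit in searcher.search(query, limit=3):
        print(hit["id"], hit["filename"])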
def coqa(file_name, hist_len=3, use_danqi=False):
    file = os.environ["HOME"] + "/data/QA/coqa/" + file_name
    data = data_io.read_json(file)["data"]

    def get_history(l: List, k):
        return [
            fix_brackets(d["input_text"])
            for d in get_limited_history(l, k, hist_len)
        ]

    for datum in data:
        dialogue_len = len(datum["questions"])
        for k in range(dialogue_len):
            q_hist = get_history(datum["questions"], k)
            a_hist = get_history(datum["answers"], k)
            turns = [Turn(req, res) for req, res in zip(q_hist, a_hist)]
            yield build_input_target(
                fix_brackets(datum["story"]), turns, SEP, use_danqi=use_danqi
            )
def personachat(data_path=os.environ["HOME"] + "/data/QA", hist_len=3):
    file_name = "personachat_self_original.json"
    file = os.path.join(data_path, file_name)
    data = data_io.read_json(file)["train"]

    def build_dialogues(background, utt):
        num_utt = len(utt)
        assert num_utt % 2 == 0
        turns = [
            Turn(request=utt[k], response=utt[k + 1]) for k in range(0, num_utt, 2)
        ]
        some_turns = turns[-hist_len:]
        yield build_input_target(background, some_turns, SEP)

    for datum in data:
        background = " ".join(datum["personality"])
        for d in datum["utterances"]:
            response = d["candidates"][-1]
            yield from build_dialogues(background, d["history"] + [response])
            # prepend silence to switch roles, as in topicalchat
            yield from build_dialogues(background, [SILENCE] + d["history"])
def plot_learncurve(paths, split_name, save_dir="images"):
    def build_method_name(path):
        return path.split("/")[-1]

    methods = [build_method_name(f) for f in paths]
    fig, ax = plt.subplots(figsize=(5, 10))
    sns.set(style="ticks", palette="pastel")
    data = [
        {
            "train_size": 100 * round(float(train_size), 2),
            "f1-micro-spanlevel": score[split_name]["f1-micro-spanlevel"],
            "method": build_method_name(path),
        }
        for path in paths
        for train_size, scores in data_io.read_json(
            path + "/learning_curve.json"
        ).items()
        for score in scores
    ]
    num_cross_val = len(data) / len(set(d["train_size"] for d in data)) / len(methods)
    df = pd.DataFrame(data=data)
    ax = sns.boxplot(
        ax=ax, x="train_size", y="f1-micro-spanlevel", hue="method", data=df
    )
    # sns.despine(offset=10, trim=True)
    ax.set_title(
        "evaluated on %s-set with %d-fold-crossval" % (split_name, num_cross_val)
    )
    ax.set_xlabel("subset of train-dataset in %")
    plt.tight_layout()
    ax.figure.savefig(
        save_dir + "/learning_curve_%s_%s.png" % (split_name, "-".join(methods))
    )
    plt.close()
def fit_fasttextt(
    training_data_json,
    validation_data_json,
    class_lvl: int,
    tmp_folder: str,
    embedding_file: str,
):
    # map the class level to the column holding its labels
    if class_lvl == 1:
        class_col = 5
    elif class_lvl == 2:
        class_col = 6
    elif class_lvl == 3:
        class_col = 7
    else:
        print("Unsupported class level")
        exit(1)

    # load data and apply light normalisation of the text before using embeddings
    def normalize(rows):
        for row in rows:
            words = tokenize(row[1])
            row[1] = " ".join(words).strip()

    train = numpy.array(dio.read_json(training_data_json))
    normalize(train)
    val = numpy.array(dio.read_json(validation_data_json))
    normalize(val)

    X_train = train[:, 1]  # use product name only
    y_train = train[:, class_col]
    X_test = val[:, 1]
    y_test = val[:, class_col]
    for i in range(len(y_test)):
        y_test[i] = "__label__" + y_test[i].replace(" ", "|")

    # prepare fasttext training data
    fasttext_train = tmp_folder + "/fasttext_train.tsv"
    with open(fasttext_train, mode="w") as outfile:
        csvwriter = csv.writer(
            outfile, delimiter="\t", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        for label, text in zip(y_train, X_train):
            csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

    params = dict(
        input=fasttext_train,
        minn=4,
        maxn=10,
        wordNgrams=3,
        neg=10,
        loss="ns",
        epoch=3000,
        thread=30,
        dim=300,
    )
    if embedding_file is not None:
        params["pretrainedVectors"] = embedding_file
    model = fasttext.train_supervised(**params)

    # evaluate the model
    predictions = model.predict(list(X_test))[0]
    with open(tmp_folder + "/" + str(class_lvl) + "_predictions.txt", "w") as f:
        for p in predictions:
            f.write(p[0][9:].replace("|", " ") + "\n")  # strip "__label__" prefix
    return scorer.score(predictions, list(y_test))
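# Hypothetical invocation -- paths are placeholders; the JSON files are
# expected to hold rows where column 1 is the product text and columns 5-7
# hold the level-1..3 class labels (as read above):
score = fit_fasttextt(
    "train.json",
    "valid.json",
    class_lvl=1,
    tmp_folder="/tmp/fasttext",
    embedding_file=None,  # or a path to pretrained .vec embeddings
)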
        cmap=cmap,
        marker="o",
        # norm=norm,
        linewidths=0.0,
    )
    for txt, i in ent2id.items():
        ax.annotate(txt, (X[i][0], X[i][1]))
    plt.savefig("scatterplot.png")
    # plt.show()


if __name__ == "__main__":
    tsne = TSNE(n_components=2, n_jobs=4, n_iter=1000)
    X = torch.load("entity_embeddings.pt")
    ent2id = data_io.read_json("ent2id.json")
    some_entities = {
        k: v
        for k, v in ent2id.items()
        if k in [
            "human",
            "animal",
            "organism",
            "vertebrate",
            "bacterium",
            "plant",
            "fungus",
            "virus",
            "mammal",
        ]
    }
def read_gold_standard(in_file):
    data = numpy.array(dio.read_json(in_file))
    return data[:, 5], data[:, 6], data[:, 7]
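# Hedged usage sketch -- pairing read_prediction with read_gold_standard to
# compute per-level accuracy (file names are placeholders):
pred_cols = read_prediction("predictions.json", "json")
gold_cols = read_gold_standard("gold.json")
for level, (pred, gold) in enumerate(zip(pred_cols, gold_cols), start=1):
    acc = float(numpy.mean(pred == gold))
    print(f"level {level}: accuracy={acc:.3f}")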
def coqa_to_batches(
    data, batch_size=3
) -> Generator[List[DialogRequest], None, None]:
    dialog_it = iter(data)

    def get_id_questions(d):
        return [(d["id"], d["story"], q) for q in d["questions"]]

    gs = [utt_generator(dialog_it, get_id_questions) for _ in range(batch_size)]
    while True:
        batch = list(filter(None, [next(g) for g in gs]))
        batch = [
            DialogRequest(
                dialogue_id, q["turn_id"], is_start, background, q["input_text"]
            )
            for is_start, (dialogue_id, background, q) in batch
        ]
        if len(batch) > 0:
            yield batch
        else:
            break


if __name__ == "__main__":
    data_file = os.environ["HOME"] + "/data/QA/coqa/coqa-dev-v1.0.json"
    data = data_io.read_json(data_file)["data"][:5]
    for batch in coqa_to_batches(data):
        print(batch)