Code example #1
def topicalchat(
    file_name="train.json",
    data_path=os.environ["HOME"] +
    "/DIALOGUE/alexa-prize-topical-chat-dataset/conversations",
    hist_len=3,
):

    file = os.path.join(data_path, file_name)
    data = list(data_io.read_json(file).values())
    Utt = namedtuple(
        "Utterance",
        "message agent sentiment knowledge_source turn_rating",
        defaults=[SILENCE] + [None] * 4,
    )

    def build_turn(req: Utt, res: Utt):
        assert req.agent != res.agent
        return Turn(req.message, res.message)

    def build_dialogues(utts):
        turns = [
            # pair utterances as (request, response); a trailing unpaired utterance is dropped
            build_turn(utts[k], utts[k + 1])
            for k in range(0, len(utts) // 2 * 2, 2)
        ]
        background = ""
        for k in range(len(turns)):
            some_turns = get_limited_history(turns, k, hist_len)
            yield build_input_target(background, some_turns, SEP)

    for datum in data:
        utterances = [Utt(**d) for d in datum["content"]]
        yield from build_dialogues(utterances)
        # insert silence utter to switch roles
        yield from build_dialogues([Utt()] + utterances)
Code example #2
    def __init__(self, state_file, write_interval=1_000_000):
        self.state_file = state_file
        self.write_interval = write_interval

        if os.path.isfile(state_file):
            self.state = data_io.read_json(state_file)
        else:
            self.state = {}
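
Code example #2 shows only the constructor of a state-caching class. A minimal sketch of how such a class might be completed is given below; the class name, the update method, and the use of data_io.write_json (seen in code example #6) to flush the state every write_interval updates are assumptions, not the project's actual code.

class StateCache:
    # hypothetical completion of the constructor above, for illustration only
    def __init__(self, state_file, write_interval=1_000_000):
        self.state_file = state_file
        self.write_interval = write_interval
        self.state = data_io.read_json(state_file) if os.path.isfile(state_file) else {}
        self._num_updates = 0

    def update(self, key, value):
        # record a value and flush the whole state to disk every write_interval updates
        self.state[key] = value
        self._num_updates += 1
        if self._num_updates % self.write_interval == 0:
            data_io.write_json(self.state_file, self.state)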
Code example #3
def update_store_data(video_file, _, model_name):
    print(f"DEBUG: update_store_data with video_file={video_file}")
    if video_file is not None and os.path.isfile(
            build_json_name(video_file, model_name)):
        return json.dumps(
            data_io.read_json(build_json_name(video_file, model_name)))
    else:
        raise PreventUpdate
Code example #4
File: scorer.py Project: ir-ischool-uos/mwpd
def read_prediction(in_file, format):
    if format == 'json':
        data = numpy.array(dio.read_json(in_file))
        return data[:, 5], data[:, 6], data[:, 7]
    elif format == 'csv':
        data = dio.read_csv(in_file)
        return data[:, 1], data[:, 2], data[:, 3]
    else:
        print("Unsupported input format")
        return None
Code example #5
def generate_personachat_seq2seq(file_name):

    data = data_io.read_json(os.environ["HOME"] + "/data/QA/" +
                             file_name)["train"]
    for datum in data:
        background = ' '.join(datum['personality'])
        for d in datum['utterances']:
            x = d['history'] + d['candidates'][-1:]
            qas, aas = list(zip(*[x[k:k + 2] for k in range(0, len(x), 2)]))
            dialogue, target = build_input_target(background, qas, aas, SEP)
            yield dialogue, target
Code example #6
def calc_distances(tati_data: List[Dict],
                   tilo_data: List[Dict]) -> Dict[str, Dict]:
    distances = defaultdict(dict)
    distances_json = "/tmp/distances.json"
    if not os.path.isfile(distances_json):
        for i, tilo in tqdm(enumerate(tilo_data)):
            for ii, tati in enumerate(tati_data):
                distances[str(i)][str(ii)] = Levenshtein.distance(
                    str(tilo), str(tati))
        data_io.write_json(distances_json, distances)
    else:
        distances = data_io.read_json(distances_json)
    return distances
Code example #7
def squad20(file_name):

    file = os.environ["HOME"] + "/data/QA/SQUAD20/" + file_name
    data = data_io.read_json(file)["data"]
    for datum in data:
        for p in datum["paragraphs"]:
            background = p["context"]
            for qa in p["qas"]:
                if not qa["is_impossible"]:
                    q = qa["question"]
                    for a in qa["answers"]:
                        turns = [Turn(q, a["text"])]
                        yield build_input_target(background, turns, SEP)
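
For reference, the nested layout that squad20 walks, as implied by the keys accessed above, looks roughly like this after data_io.read_json (values are illustrative placeholders):

{
    "data": [
        {
            "paragraphs": [
                {
                    "context": "Background passage ...",
                    "qas": [
                        {
                            "question": "Who ...?",
                            "is_impossible": False,
                            "answers": [{"text": "someone"}],
                        }
                    ],
                }
            ]
        }
    ]
}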
Code example #8
def build_schema_and_corpus():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang="en"),
    )
    file = os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    data = ({
        "id": d["id"],
        "filename": d["filename"],
        "story": d["story"]
    } for d in data_io.read_json(file)["data"])
    return schema, data
Code example #9
def generate_squad20_seq2seq(file_name):

    data = data_io.read_json(os.environ["HOME"] + "/data/QA/SQUAD20/" +
                             file_name)["data"]
    for datum in data:
        for p in datum['paragraphs']:
            background = p['context']
            for qa in p['qas']:
                if not qa['is_impossible']:
                    q = qa['question']
                    for a in qa['answers']:
                        dialogue, target = build_input_target(
                            background, [q], [a['text']], SEP)
                        yield dialogue, target
Code example #10
def generate_coqa_seq2seq(file_name, hist_len=3):

    data = data_io.read_json(os.environ["HOME"] + "/data/QA/coqa/" +
                             file_name)["data"]

    def get_history(l: List, k, hist_len):
        return [d["input_text"] for d in l[max(0, k - hist_len):(k + 1)]]

    for datum in data:
        dialogue_len = len(datum["questions"])
        for k in range(dialogue_len):
            q_hist = get_history(datum["questions"], k, hist_len)
            a_hist = get_history(datum["answers"], k, hist_len)
            dialogue, target = build_input_target(datum["story"], q_hist,
                                                  a_hist, SEP)
            yield dialogue, target
Code example #11
def merge_edictos_proceso_tables(
    edictos: List,
    data_path=f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
) -> List:
    raw_data = list(
        data_io.read_json(str(file))
        for file in tqdm(Path(data_path).glob("*.json")))
    print("parse tables")
    table_data = (parse_table(d) for d in raw_data)
    exp2table = {t.expediente: t for t in tqdm(table_data)}
    g = (merge_dicts([
        asdict(e), {
            "tables": [asdict(exp2table[exp]) for exp in e.expedientes]
        }
    ]) for e in edictos)
    merged_data = list(g)
    return merged_data
Code example #12
def build_index():
    schema = Schema(
        id=ID(stored=True),
        filename=ID(stored=True),
        story=TEXT(analyzer=StemmingAnalyzer(), stored=True, lang='en'),
    )
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    ix = index.create_in("indexdir", schema)
    data = data_io.read_json(
        os.environ["HOME"] + "/data/QA/coqa/" + "coqa-train-v1.0.json"
    )["data"]

    writer = ix.writer()
    for d in tqdm(data):
        writer.add_document(id=d["id"], filename=d["filename"], story=d["story"])
    writer.commit()
Code example #13
def coqa(file_name, hist_len=3, use_danqi=False):

    file = os.environ["HOME"] + "/data/QA/coqa/" + file_name
    data = data_io.read_json(file)["data"]

    def get_history(l: List, k):
        return [
            fix_brackets(d["input_text"])
            for d in get_limited_history(l, k, hist_len)
        ]

    for datum in data:
        dialogue_len = len(datum["questions"])
        for k in range(dialogue_len):
            q_hist = get_history(datum["questions"], k)
            a_hist = get_history(datum["answers"], k)
            turns = [Turn(req, res) for req, res in zip(q_hist, a_hist)]
            yield build_input_target(fix_brackets(datum["story"]),
                                     turns,
                                     SEP,
                                     use_danqi=use_danqi)
Code example #14
def personachat(data_path=os.environ["HOME"] + "/data/QA", hist_len=3):
    file_name = "personachat_self_original.json"
    file = os.path.join(data_path, file_name)
    data = data_io.read_json(file)["train"]

    def build_dialogues(background, utt):
        num_utt = len(utt)
        assert num_utt % 2 == 0
        turns = [
            Turn(request=utt[k], response=utt[k + 1])
            for k in range(0, num_utt, 2)
        ]
        some_turns = turns[-hist_len:]
        yield build_input_target(background, some_turns, SEP)

    for datum in data:
        background = " ".join(datum["personality"])
        for d in datum["utterances"]:
            response = d["candidates"][-1]
            yield from build_dialogues(background, d["history"] + [response])
            yield from build_dialogues(background, [SILENCE] + d["history"])
Code example #15
def plot_learncurve(paths, split_name, save_dir="images"):
    def build_method_name(path):
        return path.split("/")[-1]

    methods = [build_method_name(f) for f in paths]
    fig, ax = plt.subplots(figsize=(5, 10))
    sns.set(style="ticks", palette="pastel")
    data = [
        {
            "train_size": 100*round(float(train_size), 2),
            "f1-micro-spanlevel": score[split_name]["f1-micro-spanlevel"],
            "method": build_method_name(path),
        }
        for path in paths
        for train_size, scores in data_io.read_json(
            path + "/learning_curve.json"
        ).items()
        for score in scores
    ]
    num_cross_val = len(data) / len(set([d["train_size"] for d in data])) / len(methods)
    df = pd.DataFrame(data=data)
    ax = sns.boxplot(
        ax=ax,
        x="train_size",
        y="f1-micro-spanlevel",
        hue="method",
        data=df,
    )
    # sns.despine(offset=10, trim=True)
    ax.set_title(
        "evaluated on %s-set with %d-fold-crossval" % (split_name, num_cross_val)
    )
    ax.set_xlabel("subset of train-dataset in %")
    plt.tight_layout()
    ax.figure.savefig(
        save_dir + "/learning_curve_%s_%s.png" % (split_name, "-".join(methods))
    )

    plt.close()
Code example #16
def fit_fasttextt(training_data_json, validation_data_json, class_lvl: int,
                  tmp_folder: str, embedding_file: str):
    class_level = class_lvl
    if class_lvl == 1:
        class_lvl = 5
    elif class_lvl == 2:
        class_lvl = 6
    elif class_lvl == 3:
        class_lvl = 7
    else:
        print("Not supported")
        exit(1)

    # load data and apply light normalisation of the text before using embeddings
    train = numpy.array(dio.read_json(training_data_json))
    for row in train:
        text = row[1]
        words = tokenize(text)
        text = " ".join(words).strip()
        row[1] = text

    val = numpy.array(dio.read_json(validation_data_json))
    for row in val:
        text = row[1]
        words = tokenize(text)
        text = " ".join(words).strip()
        row[1] = text

    X_train = train[:, 1]  # use product name only
    y_train = train[:, class_lvl]

    X_test = val[:, 1]
    y_test = val[:, class_lvl]
    for i in range(len(y_test)):
        label = y_test[i]
        y_test[i] = "__label__" + label.replace(" ", "|")

    # prepare fasttext data
    fasttext_train = tmp_folder + "/fasttext_train.tsv"
    with open(fasttext_train, mode='w') as outfile:
        csvwriter = csv.writer(outfile,
                               delimiter='\t',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        for i in range(len(X_train)):
            label = y_train[i]
            text = X_train[i]
            csvwriter.writerow(["__label__" + label.replace(" ", "|"), text])

    # the two training configurations differ only in whether pretrained vectors are used
    train_kwargs = dict(input=fasttext_train,
                        minn=4,
                        maxn=10,
                        wordNgrams=3,
                        neg=10,
                        loss='ns',
                        epoch=3000,
                        thread=30,
                        dim=300)
    if embedding_file is not None:
        train_kwargs["pretrainedVectors"] = embedding_file
    model = fasttext.train_supervised(**train_kwargs)
    # evaluate the model
    predictions = model.predict(list(X_test))[0]
    f = open(tmp_folder + "/" + str(class_level) + "_predictions.txt", 'w')
    for p in predictions:
        pred = p[0][9:].replace("|", " ")
        f.write(pred + "\n")
    f.close()

    return scorer.score(predictions, list(y_test))
Code example #17
        cmap=cmap,
        marker="o",
        # norm=norm,
        linewidths=0.0,
    )

    for txt, i in ent2id.items():
        ax.annotate(txt, (X[i][0], X[i][1]))
    plt.savefig("scatterplot.png")
    # plt.show()


if __name__ == "__main__":
    tsne = TSNE(n_components=2, n_jobs=4, n_iter=1000)
    X = torch.load("entity_embeddings.pt")
    ent2id = data_io.read_json("ent2id.json")
    some_entities = {
        k: v
        for k, v in ent2id.items()
        if k
        in [
            "human",
            "animal",
            "organism",
            "vertebrate",
            "bacterium",
            "plant",
            "fungus",
            "virus",
            "mammal",
        ]
Code example #18
File: scorer.py Project: ir-ischool-uos/mwpd
def read_gold_standard(in_file):
    data = numpy.array(dio.read_json(in_file))
    return data[:, 5], data[:, 6], data[:, 7]
Code example #19
def coqa_to_batches(data,
                    batch_size=3
                    ) -> Generator[List[DialogRequest], None, None]:
    dialog_it = iter(data)

    def get_id_questions(d):
        return [(d["id"], d["story"], q) for q in d["questions"]]

    gs = [
        utt_generator(dialog_it, get_id_questions) for _ in range(batch_size)
    ]
    while True:
        batch = list(filter(None, [next(g) for g in gs]))
        batch = [
            DialogRequest(dialogue_id, q["turn_id"], is_start, background,
                          q["input_text"])
            for is_start, (dialogue_id, background, q) in batch
        ]
        if len(batch) > 0:
            yield batch
        else:
            break


if __name__ == "__main__":
    data_file = os.environ["HOME"] + "/data/QA/coqa/coqa-dev-v1.0.json"

    data = data_io.read_json(data_file)["data"][:5]

    for batch in coqa_to_batches(data):
        print(batch)
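
Code example #19 depends on a helper utt_generator that is not shown on this page. Inferring only from the way it is used (all generators share dialog_it, next(g) must keep returning a falsy value once the data runs out, and each batch item unpacks as (is_start, (dialogue_id, background, q))), a rough sketch could look like the following; it is an assumption, not the project's actual implementation.

def utt_generator(dialog_it, build_items):
    # hypothetical reconstruction based on usage in code example #19:
    # pull dialogues from the shared iterator, yield (is_start, item) for
    # every item of a dialogue, then yield None forever so that next(g)
    # never raises StopIteration after the data is exhausted
    for d in dialog_it:
        for i, item in enumerate(build_items(d)):
            yield (i == 0, item)
    while True:
        yield None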