Example #1
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        dest='inputs',
                        type=str,
                        nargs="+",
                        help="Input files (JSON) for SPR1 splits.")
    parser.add_argument('-o',
                        dest='output_dir',
                        type=str,
                        required=True,
                        help="Output directory.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    pd.options.display.float_format = '{:.2f}'.format
    for fname in args.inputs:
        log.info("Converting %s", fname)
        source_records = list(utils.load_json_data(fname))
        converted_records = (convert_record(r) for r in tqdm(source_records))
        stats = utils.EdgeProbingDatasetStats()
        converted_records = stats.passthrough(converted_records)
        target_fname = os.path.join(args.output_dir, os.path.basename(fname))
        utils.write_json_data(target_fname, converted_records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())
def _vi_to_zalo():
    squad_dir = "squad_data"
    zalo_samples = []
    _id = 0
    for file_name in ["vi_train-v2.0.json", "vi_dev-v2.0.json"]:
        file_path = "{}/{}".format(squad_dir, file_name)
        samples = read_json_data(file_path)
        for sample in samples["data"]:
            title = sample["title"]
            for p in sample["paragraphs"]:
                context = p["context"]
                for qa in p["qas"]:
                    zalo_sample = {
                        "id": "squad-{}".format(_id),
                        "title": title,
                        "question": qa["question"],
                        "text": context,
                        "label": not qa["is_impossible"],
                    }
                    zalo_samples.append(zalo_sample)
                    _id += 1

    out_path = "qna_data/squad.json"
    write_json_data(out_path, zalo_samples)
    print ("Write file {}".format(out_path))
Example #3
def convert_with_stats(source_records, target_fname, convert_fn):
    converted_records = (convert_fn(r) for r in tqdm(source_records))
    stats = utils.EdgeProbingDatasetStats()
    converted_records = stats.passthrough(converted_records)
    utils.write_json_data(target_fname, converted_records)
    log.info("Wrote examples to %s", target_fname)
    log.info(stats.format())
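For illustration, convert_with_stats above could be driven like this; the input record, output path, and identity converter are placeholders (a real caller would pass a converter such as convert_record and records loaded via utils.load_json_data), and the call assumes the same imports as the snippet above:

def convert_record_identity(record):
    # Placeholder converter: returns the record unchanged.
    return record


example_records = [{"text": "Hello world .", "targets": []}]
convert_with_stats(example_records, "/tmp/example.json", convert_record_identity)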
def main(args):
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ontonotes",
        type=str,
        required=True,
        help="Path to OntoNotes, e.g. /path/to/conll-formatted-ontonotes-5.0",
    )
    parser.add_argument("--tasks",
                        type=str,
                        nargs="+",
                        help="Tasks, one or more of {const, coref, ner, srl}.")
    parser.add_argument(
        "--splits",
        type=str,
        nargs="+",
        default=["train", "development", "test", "conll-2012-test"],
        help="Splits, one or more of {train, development, test, conll-2012-test}.",
    )
    parser.add_argument("-o",
                        dest="output_dir",
                        type=str,
                        default=".",
                        help="Output directory for JSON files.")
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    import pandas as pd

    pd.options.display.float_format = "{:.2f}".format

    # Load OntoNotes reader.
    ontonotes = Ontonotes()
    for split in args.splits:
        for task in args.tasks:
            source_path = os.path.join(args.ontonotes, "data", split)
            print("Reading OntoNotes split from", source_path)
            ontonotes_reader = ontonotes.dataset_iterator(
                file_path=source_path)

            log.info("Processing split '%s' for task '%s'", split, task)
            task_dir = os.path.join(args.output_dir, task)
            if not os.path.isdir(task_dir):
                os.mkdir(task_dir)
            target_fname = os.path.join(task_dir, f"{split}.json")
            ontonotes_stats = collections.Counter()
            converted_records = process_task_split(tqdm(ontonotes_reader),
                                                   task, ontonotes_stats)

            stats = utils.EdgeProbingDatasetStats()
            converted_records = stats.passthrough(converted_records)
            utils.write_json_data(target_fname, converted_records)
            log.info("Wrote examples to %s", target_fname)
            log.info(stats.format())
            log.info(str(pd.Series(ontonotes_stats, dtype=object)))
Example #5
def main(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("-i",
                        dest="input_files",
                        type=str,
                        nargs="+",
                        help="Input file(s), e.g. en_ewt-ud-*.conllu")
    parser.add_argument(
        "-o",
        dest="output_dir",
        type=str,
        required=True,
        help="Output directory, e.g. /path/to/edges/data/ud_ewt",
    )
    args = parser.parse_args(args)

    if not os.path.isdir(args.output_dir):
        os.mkdir(args.output_dir)

    for filename in args.input_files:
        with open(filename) as fd:
            records = convert_ud_file(fd)
        stats = utils.EdgeProbingDatasetStats()
        records = stats.passthrough(records)
        target_basename = os.path.basename(filename).replace(
            ".conllu", ".json")
        target_fname = os.path.join(args.output_dir, target_basename)
        utils.write_json_data(target_fname, records)
        log.info("Wrote examples to %s", target_fname)
        log.info(stats.format())

    log.info("Done!")
Example #6
def do_notes(in_dir):
    """
        Read tracks info inside the in_dir folder and write
        tracks info to the tracks.json file

        :param in_dir: Input dir

        :return None
    """
    # create or read tracks json
    out_file = f"{in_dir}/tracks.json"
    if os.path.exists(out_file):
        out_json = read_json_data(out_file)
    else:
        out_json = {"instruments": {}}

    # we update this object
    instruments_json = out_json["instruments"]

    # get instruments info and update tracks json
    for path in glob.glob(f"{in_dir}/scores/*.mid"):
        instrument_name = os.path.basename(path).replace(".mid", "")
        instrument_info = get_instrument_info(path)

        instruments_json[instrument_name] = instrument_info

    # write tracks json
    write_json_data(out_file, out_json)
    print(f"Write file {out_file}")
Example #7
    def save_json(self, file_path):
        save_data = {
            "unk_id": self.unk_id,
            "max_sent_len": self.max_sent_len,
            "word2id": self.word2id,
            "id2word": self.id2word,
        }

        write_json_data(file_path, save_data)
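Later snippets restore this vocabulary with VocabEntry.from_json (see the Vietnamese preprocessing example further down). A minimal sketch of such a loader, assuming it simply mirrors save_json above (the real class may restore more state), could be:

    @classmethod
    def from_json(cls, file_path):
        # Hypothetical counterpart to save_json above: rebuild the object
        # from the saved fields. The real implementation may differ.
        data = read_json_data(file_path)
        vocab = cls.__new__(cls)
        vocab.unk_id = data["unk_id"]
        vocab.max_sent_len = data["max_sent_len"]
        vocab.word2id = data["word2id"]
        # JSON object keys are always strings, so restore integer ids.
        vocab.id2word = {int(k): v for k, v in data["id2word"].items()}
        return vocab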
def _translate_to_qnli(dataset):
    # vi_train_questions.json
    # vi_train_sentences.json
    # vi_train.tsv
    src_folder = "glue_data/qnli/en"
    trg_folder = "glue_data/qnli/vi"

    json_files = ["questions", "sentences"]
    tran_ids_to_questions_dict = _from_translate_to_json(
        dataset, src_folder, trg_folder, "questions")
    tran_ids_to_sentences_dict = _from_translate_to_json(
        dataset, src_folder, trg_folder, "sentences")

    # build translated table
    table_path = "glue_data/qnli/{}_table.tsv".format(dataset)
    df_table = pd.read_csv(table_path,
                           encoding="utf-8",
                           quoting=csv.QUOTE_NONE,
                           sep="\t")
    tran_dict = {
        "index": [],
        "question": [],
        "sentence": [],
        "label": [],
    }
    for i in range(0, df_table.shape[0]):
        index_id = df_table["index"][i]
        label = df_table["label"][i]
        question = tran_ids_to_questions_dict[str(df_table["question"][i])]
        sentence = tran_ids_to_sentences_dict[str(df_table["sentence"][i])]

        # remove end question mark from question
        if question[-1] == "?":
            question = question[:-1].strip()

        tran_dict["index"].append(index_id)
        tran_dict["question"].append(question)
        tran_dict["sentence"].append(sentence)
        tran_dict["label"].append(label)

    tran_df = pd.DataFrame(tran_dict)
    tran_dataset_path = "glue_data/qnli/vi_{}.tsv".format(dataset)
    _write_tsv(tran_df, tran_dataset_path)
    print("Write file {}".format(tran_dataset_path))

    # write translated questions, sentences json
    tran_ids_to_questions_path = "glue_data/qnli/vi_{}_questions.json".format(
        dataset)
    tran_ids_to_sentences_path = "glue_data/qnli/vi_{}_sentences.json".format(
        dataset)

    write_json_data(tran_ids_to_questions_path, tran_ids_to_questions_dict)
    print("Write file {}".format(tran_ids_to_questions_path))
    write_json_data(tran_ids_to_sentences_path, tran_ids_to_sentences_dict)
    print("Write file {}".format(tran_ids_to_sentences_path))
def _build_data(dataset):
    # train_questions.json
    # train_sentences.json
    # train_table.tsv
    tsv_path = "glue_data/qnli/{}.tsv".format(dataset)
    table_path = "glue_data/qnli/{}_table.tsv".format(dataset)
    df = pd.read_csv(tsv_path,
                     encoding="utf-8",
                     quoting=csv.QUOTE_NONE,
                     sep="\t")

    table = {
        "index": [],
        "question": [],
        "sentence": [],
        "label": [],
    }
    questions_to_id_dict = {}
    sentences_to_id_dict = {}
    curr_q_id = 0
    curr_s_id = 0
    for i in range(0, df.shape[0]):
        index = df["index"][i]
        question = df["question"][i]
        sentence = df["sentence"][i]
        label = df["label"][i]

        if question not in questions_to_id_dict:
            questions_to_id_dict[question] = curr_q_id
            curr_q_id += 1
        if sentence not in sentences_to_id_dict:
            sentences_to_id_dict[sentence] = curr_s_id
            curr_s_id += 1

        q_id = questions_to_id_dict[question]
        s_id = sentences_to_id_dict[sentence]

        table["index"].append(index)
        table["label"].append(label)
        table["question"].append(q_id)
        table["sentence"].append(s_id)

    id_to_qs_dict = {v: k for k, v in questions_to_id_dict.items()}
    id_to_ss_dict = {v: k for k, v in sentences_to_id_dict.items()}
    id_to_qs_path = "glue_data/qnli/{}_questions.json".format(dataset)
    id_to_ss_path = "glue_data/qnli/{}_sentences.json".format(dataset)

    write_json_data(id_to_qs_path, id_to_qs_dict)
    print("Write file {}".format(id_to_qs_path))
    write_json_data(id_to_ss_path, id_to_ss_dict)
    print("Write file {}".format(id_to_ss_path))

    table_df = pd.DataFrame(table)
    _write_tsv(table_df, table_path)
    print("Write file {}".format(table_path))
def _to_zalo():
    split_folder = "glue_data\qnli\split_tsv"
    en_folder_zalo = "glue_data/qnli/en"

    for tsv_path in glob.glob("{}/*.tsv".format(split_folder)):
        df = pd.read_csv(tsv_path,
                         encoding="utf-8",
                         quoting=csv.QUOTE_NONE,
                         sep="\t")
        json_examples = _tsv_to_zalo(df)
        json_file_name = os.path.basename(tsv_path).replace(".tsv", ".json")
        json_path = "{}/{}".format(en_folder_zalo, json_file_name)
        write_json_data(json_path, json_examples)
        print("Write to file {}".format(json_path))
    def preprocess_qna_data(
        self, method, bert_type, dataset_types,
    ):
        for dataset_type in dataset_types:
            data_file = "qna_data/en_{}.json".format(dataset_type)

            # Init features columns
            if self.for_train:
                features_columns = {
                    "id": [],
                    "question": [],
                    "text": [],
                    "label": [],
                    "pid": [],
                }

            json_samples = read_json_data(data_file)

            for json_sample in json_samples:
                if self.for_train:
                    features_columns["id"].append(json_sample["id"])
                    features_columns["label"].append(1 if json_sample["label"] else 0)
                    features_columns["pid"].append(json_sample["pid"])

                for key in ["question", "text"]:
                    pre_key = "{}_{}_{}".format(
                        method, bert_type, key
                    )
                    pre_text, tokens_id = self.pre_process_text(
                        json_sample[key], method, self.for_train
                    )
                    json_sample[pre_key] = pre_text

                    if self.for_train:
                        features_columns[key].append(tokens_id)

            # samples with preprocessed keys
            write_json_data(data_file, json_samples)
            print ("{}. Length {}. Done write to file {}".format(
                dataset_type, len(json_samples), data_file
            ))

            # generate featured dataset
            if self.for_train:
                folder_name = "{}_{}".format(method, bert_type)
                self.write_features_columns(
                    features_columns, folder_name, dataset_type
                )
def _summary(tsv_dir):
    summary_dict = {}
    for tsv_path in glob.glob("{}/*.tsv".format(tsv_dir)):
        file_name = os.path.basename(tsv_path).replace(".tsv", "")
        res_dict, q_len_df, s_len_df, outlier_df = _get_tsv_summary(tsv_path)

        if res_dict:
            summary_dict[file_name] = res_dict
        if q_len_df is not None and s_len_df is not None:
            q_len_df.to_csv("{}/{}_question_length_dist.csv".format(tsv_dir, file_name), header=True, index=False)
            s_len_df.to_csv("{}/{}_sentence_length_dist.csv".format(tsv_dir, file_name), header=True, index=False)
        if outlier_df is not None:
            outlier_df.to_csv(
                "{}/{}_outlier.csv".format(tsv_dir, file_name),
                header=False, encoding="utf-8", sep="\t", quoting=csv.QUOTE_NONE)

    summary_path = "{}/summary.json".format(tsv_dir)
    write_json_data(summary_path, summary_dict)
    print ("Write file {}".format(summary_path))
Example #13
def convert_data(dataset_type, include_txt=True):
    file_path = "qna_data/{}.json".format(dataset_type)

    data_json = read_json_data(file_path)

    converted_samples = []

    for sample_json in data_json:
        converted_sample = None

        if dataset_type in ["train", "squad"]:
            converted_sample = sample_json
            converted_sample["pid"] = "p1"
            converted_samples.append(converted_sample)

        elif dataset_type in ["test", "private", "ltest"]:
            for p in sample_json["paragraphs"]:
                if "label" in p:
                    label = p["label"] == "1"
                else:
                    label = False
                converted_sample = {
                    "id": sample_json["__id__"],
                    "title": sample_json["title"],
                    "question": sample_json["question"],
                    "text": p["text"],
                    "label": label,
                    "pid": p["id"]
                }
                converted_samples.append(converted_sample)

    new_file_path = "qna_data/vi_{}.json".format(dataset_type)

    write_json_data(new_file_path, converted_samples)
    print ("Length {}. Done write to file {}".format(len(converted_samples), new_file_path))

    write_txt_for_translation(converted_samples, dataset_type) # write only vi files
    print ("Done write raw files for translation")
Example #14
    def build_data(self):
        corpus = self.train_paragraph_texts + \
                 self.train_question_texts + \
                 self.test_question_texts + \
                 self.test_paragraph_texts

        vocab = VocabEntry.from_corpus(corpus, freq_cutoff=1)

        vocab_file = "qna_data/{}_vocab.json".format(self.method)
        vocab.save_json(vocab_file)

        self.train_questions = vocab.padd_sents(self.train_question_texts,
                                                start_end=False)
        self.train_paragraphs = vocab.padd_sents(self.train_paragraph_texts,
                                                 start_end=False)
        self.test_questions = vocab.padd_sents(self.test_question_texts,
                                               start_end=False)
        self.test_paragraphs = vocab.padd_sents(self.test_paragraph_texts,
                                                start_end=False)

        save_data = {
            "train_questions": self.train_questions,
            "train_paragraphs": self.train_paragraphs,
            "test_questions": self.test_questions,
            "test_paragraphs": self.test_paragraphs,
        }

        save_file = "qna_data/{}_dataset.json".format(self.method)
        write_json_data(save_file, save_data)

        self.vocab = vocab

        self._to_numpy()

        print("corpus len: ", len(corpus))
        print(corpus[0])
        print("max length: ", vocab.max_sent_len)
Example #15
def convert_raw_en_to_json(dataset_type):
    raw_id_type_file = "qna_data/back_tran/raw_id_type_{}.txt".format(dataset_type)
    raw_en_file = "qna_data/back_tran/raw_vi_{}.txt".format(dataset_type)
    en_file = "qna_data/vi_{}.json".format(dataset_type)
    # for getting the title only
    vi_json_file = "qna_data/vi_{}.json".format(dataset_type[1:])
    vi_json_samples = read_json_data(vi_json_file)

    en_json_samples = []
    with open(raw_id_type_file, "r", encoding="utf-8") as fd:
        id_lines = [line.strip() for line in fd]
    with open(raw_en_file, "r", encoding="utf-8") as fd:
        en_lines = [line.strip() for line in fd]

    current_question = None
    text_idx = 0
    for i, id_line in enumerate(id_lines):
        parts = id_line.split("\t")

        if parts[1] == "question":
            current_question = {
                "id": parts[0],
                "question": en_lines[i],
            }

        elif parts[1] == "text":
            en_json_sample = copy.deepcopy(current_question)
            en_json_sample["title"] = vi_json_samples[text_idx]["title"]
            en_json_sample["text"] = en_lines[i]
            en_json_sample["label"] = True if parts[3] == "True" else False
            en_json_sample["pid"] = parts[2]

            en_json_samples.append(en_json_sample)
            text_idx += 1

    write_json_data(en_file, en_json_samples)
    print ("{}. Length {}. Done write to file {}".format(
        dataset_type, len(en_json_samples), en_file
    ))
Example #16
def do_mappings(in_dir):
    """
        Update the mappings likes convert bbt to second

        :param in_dir: The input project dir

        :return: None
    """
    tracks_file = f"{in_dir}/tracks.json"

    tracks_json = read_json_data(tracks_file)
    tracks_data = tracks_json["tracks_data"]

    bpm = tracks_data["bpm"]
    ppq = tracks_data["ppq"]
    time_signature = tracks_data["time_signature"]
    mappings = tracks_data["mappings"]

    for mapping in mappings:
        loops_data = mapping.get("loops_data")
        if loops_data:
            between_first_s = bbt_to_second(
                bpm, ppq, loops_data.get("between_first_bbt", "1:01:00"),
                time_signature)
            between_second_s = bbt_to_second(
                bpm, ppq, loops_data.get("between_second_bbt", "1:01:00"),
                time_signature)
            loops_data["between"] = between_second_s - between_first_s

            for loop in loops_data["loops"]:
                print(loop)
                loop["start"] = bbt_to_second(bpm, ppq, loop["start_bbt"],
                                              time_signature)

    write_json_data(tracks_file, tracks_json)
    print(f"Write file {tracks_file}")
    def preprocess_qna_data(
        self, method, cased, dataset_types,
    ):
        folder_name = "{}_{}".format(method, cased)
        folder_path = "qna_data/pre_data/vi_{}".format(folder_name)
        create_folder(folder_path)

        # preprocess fields
        dataset_features_columns = {}
        for dataset_type in dataset_types:
            data_file = "qna_data/vi_{}.json".format(dataset_type)

            # Init features columns
            if self.for_train:
                features_columns = {
                    "id": [],
                    "question": [],
                    "text": [],
                    "label": [],
                    "pid": [],
                }

            json_samples = read_json_data(data_file)

            for json_sample in json_samples:

                if self.for_train:
                    features_columns["id"].append(json_sample["id"])
                    features_columns["label"].append(1 if json_sample["label"] else 0)
                    features_columns["pid"].append(json_sample["pid"])

                for key in ["question", "text"]:
                    pre_key = "{}_{}_{}".format(
                        method, cased, key
                    )
                    pre_text, tokens = self.pre_process_text(
                        json_sample[key], method, cased, self.for_train, key
                    )
                    json_sample[pre_key] = pre_text

                    if self.for_train:
                        features_columns[key].append(tokens)

            # samples with preprocessed keys
            write_json_data(data_file, json_samples)
            print ("{}. Length {}. Done write to file {}".format(
                dataset_type, len(json_samples), data_file
            ))

            # save for writing later when we have vocab
            if self.for_train:
                dataset_features_columns[dataset_type] = features_columns

        # build vocab
        vocab_file = "{}/vocab.json".format(folder_path, dataset_type)
        if self.build_vocab:
            self._build_vocab(vocab_file, method, cased)
        else:
            self.vocab = VocabEntry.from_json(vocab_file)

        # write configs
        configs = {
            "vocab_size": len(self.vocab),
            "question_size": self.question_size,
            "text_size": self.text_size,
        }
        configs_file = "{}/configs.json".format(folder_path)
        write_json_data(configs_file, configs)
        print ("Done wirte config file {}".format(configs_file))

        # write features columns
        # generate featured dataset
        if self.for_train:
            for dataset_type, features_columns in dataset_features_columns.items():
                self.write_features_columns(
                    features_columns, folder_name, dataset_type
                )
Example #18
    scratch = ScratchCode()

    if mode == '1':
        filename = input(
            'Enter the name of the file where the scratch codes will be written: ')
        directory, file = os.path.split(filename)

        if not os.path.isdir(filename) and not directory or os.path.exists(
                directory):
            scratch_codes_count = get_value('scratch_codes_count')
            serial_number_length = len(
                str(int(first_serial_number) +
                    scratch_codes_count).zfill(first_serial_number_length))
            scratch_codes = scratch.generate(serial_number_length, hash_type,
                                             hash_length, scratch_codes_count)
            write_json_data(scratch_codes, filename)

        else:
            print('Invalid file name.')

    if mode == '2':
        filename = input(
            'Enter the name of the file from which the scratch codes will be taken for checking: ')

        if os.path.isfile(filename):
            scratch_codes = get_json_data(filename)
            serial_number_length = len(
                str(int(first_serial_number) +
                    len(scratch_codes)).zfill(first_serial_number_length))
            scratch.create_activated_codes_table()
            checked_codes, right_codes = scratch.check(scratch_codes,
Example #19
    def create_match_details_directory(self, directory):
        for index in range(self.start, self.end):
            url = self.get_url(index)
            if not os.path.exists(directory):
                # create the output directory itself, including any parents
                os.makedirs(directory, exist_ok=True)
            write_json_data(os.path.join(directory, f"{self.get_file_name(url)}.json"),
                            get_detail_json(url))