Ejemplo n.º 1
0
def calc_distances(tati_data: List[Dict],
                   tilo_data: List[Dict]) -> Dict[str, Dict]:
    """Return pairwise Levenshtein distances between tilo and tati entries.

    Keys are stringified indices: result[tilo_idx][tati_idx] = distance of the
    two entries' string representations. The full matrix is cached in
    /tmp/distances.json — computed once, re-read on subsequent calls.
    """
    cache_file = "/tmp/distances.json"
    if os.path.isfile(cache_file):
        # cache hit: skip the (expensive) O(len(tilo) * len(tati)) computation
        return data_io.read_json(cache_file)

    distances = defaultdict(dict)
    for tilo_idx, tilo in tqdm(enumerate(tilo_data)):
        for tati_idx, tati in enumerate(tati_data):
            distances[str(tilo_idx)][str(tati_idx)] = Levenshtein.distance(
                str(tilo), str(tati))
    data_io.write_json(cache_file, distances)
    return distances
Ejemplo n.º 2
0
 def read_lines_from_files(
     self, path, mode="b", encoding="utf-8", limit=sys.maxsize
 ):
     """Yield lines from every file under *path*, resuming from self.state.

     self.state maps filename -> last yielded line index, or "all" once a
     file has been fully consumed; it is checkpointed to self.state_file
     every self.write_interval lines so an interrupted run can resume.

     NOTE(review): on resume, the line whose index equals the saved state is
     yielded again (the skip condition is `<`, not `<=`) — confirm whether a
     one-line duplicate is acceptable to callers.

     :param path: directory whose files are read (non-recursive)
     :param mode: passed through to data_io.read_lines
     :param encoding: passed through to data_io.read_lines
     :param limit: stop after this many lines counted across all files
     """
     c = 0  # global line counter across all files (includes skipped lines)
     for file in os.listdir(path):
         if self.state.get(file, 0) == "all":
             continue  # fully consumed in an earlier run
         for line_idx, line in enumerate(
             data_io.read_lines(path + "/" + file, mode, encoding)
         ):
             c += 1
             if line_idx < self.state.get(file, 0):
                 continue  # already yielded in a previous run
             if c > limit:
                 # BUGFIX: stop the generator entirely. Previously this was
                 # a bare `break`, which only exited the inner loop and then
                 # fell through to `self.state[file] = "all"`, wrongly
                 # marking this partially-read file — and, one probe line
                 # later, every remaining file — as fully consumed.
                 return
             yield line
             self.state[file] = line_idx
             if c % self.write_interval == 0:
                 data_io.write_json(self.state_file, self.state)
         self.state[file] = "all"  # inner loop exhausted: file is done
Ejemplo n.º 3
0
def calc_write_learning_curve(exp: Experiment, max_num_workers=40):
    """Score all jobs of *exp* in parallel and write learning-curve JSONs.

    Writes three artifacts under results_folder/<exp.name>/:
    meta_datas.json (duration, worker count, experiment repr),
    learning_curve.json (scores grouped and sorted by train size) and
    learning_curve_meanstd.json (per-train-size mean/std of the scores).

    :param exp: experiment providing name, jobs, score_task and num_folds
    :param max_num_workers: upper bound on parallel workers
    """
    # never use more workers than folds, and leave one CPU core free
    n_jobs = min(max_num_workers, multiprocessing.cpu_count() - 1,
                 exp.num_folds)

    print("got %d evaluations to calculate" % len(exp.jobs))
    out_dir = results_folder + "/" + exp.name
    os.makedirs(out_dir, exist_ok=True)

    t0 = time()
    splits = [split for _, split in exp.jobs]
    scores = calc_scores(exp.score_task, splits, n_jobs=n_jobs)
    elapsed = time() - t0

    data_io.write_json(
        out_dir + "/meta_datas.json",
        {
            "duration": elapsed,
            "num-workers": n_jobs,
            "experiment": str(exp),
        },
    )
    print("calculating learning-curve for %s took %0.2f seconds" %
          (exp.name, elapsed))
    pprint(scores)

    train_sizes = [train_size for train_size, _ in exp.jobs]
    results = groupandsort_by_first(zip(train_sizes, scores))
    data_io.write_json(out_dir + "/learning_curve.json", results)

    # per train-size aggregate statistics over the grouped scores
    meanstd = {
        size: tuple_2_dict(calc_mean_and_std(grouped_scores))
        for size, grouped_scores in results.items()
    }
    data_io.write_json(out_dir + "/learning_curve_meanstd.json", meanstd)
Ejemplo n.º 4
0
def batch_inference(args: argparse.Namespace):
    """Transcribe a corpus with a CTC ASR model and report the WER.

    Loads the model from a local .nemo checkpoint or from the NGC cloud,
    runs inference over the prepared manifest, writes references and
    hypotheses to gzipped text files and a stats JSON to args.results_dir.

    :return: dict with the WER value and the CLI arguments used
    """
    # inference only — no gradients needed
    torch.set_grad_enabled(False)

    if args.asr_model.endswith(".nemo"):
        # local checkpoint on disk
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        # named model fetched from NGC
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    test_config = {
        "sample_rate": 16000,
        "manifest_filepath": manifest,
        "labels": asr_model.decoder.vocabulary,
        "batch_size": args.batch_size,
        "normalize_transcripts": args.normalize_text,
    }
    asr_model.setup_test_data(test_data_config=test_config)

    ref_hyp_pairs = [
        pair
        for pair in tqdm(generate_ref_hyps(asr_model, args.search, args.arpa))
    ]
    # transpose list of (ref, hyp) pairs into two parallel lists
    references, hypotheses = (list(column) for column in zip(*ref_hyp_pairs))

    os.makedirs(args.results_dir, exist_ok=True)
    data_io.write_lines(f"{args.results_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{args.results_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()
    stats = {
        "wer": wer_value,
        "args": args.__dict__,
    }
    data_io.write_json(f"{args.results_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
Ejemplo n.º 5
0
def scrape_proceso_tables(search_ids: List):
    """Scrape one proceso table per search id from the Corte Constitucional.

    Each scraped table is written to ~/data/corteconstitucional/procesos_tables
    as <id>.json; ids already present on disk are skipped, and ids that fail
    are appended to could_not_scrape.txt so the run can continue.

    :param search_ids: expediente ids to look up
    """
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    data_path = f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=True)

    # skip ids whose result file already exists (resumable scraping)
    ids_files = ((eid, f"{data_path}/{eid}.json") for eid in search_ids)
    to_be_scraped = [(eid, file) for eid, file in ids_files
                     if not os.path.isfile(file)]
    print(f"already got {len(search_ids)-len(to_be_scraped)}")

    for search_id, file in tqdm(to_be_scraped):
        try:
            fire_search(base_url, search_id, wd)
            datum = dump_proceso_table(wd)
            datum["id"] = search_id
            data_io.write_json(file, datum)
        except Exception:
            # BUGFIX: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the loop un-interruptible.
            # Failures are logged best-effort and the loop moves on.
            data_io.write_lines(f"{data_path}/could_not_scrape.txt",
                                [search_id])
            print(f"{search_id} f****d it up!")
Ejemplo n.º 6
0
        model.eval()
        pred_scores = []
        dev_data = []
        for mini_batch in eval_loader:
            dev_data.extend(mini_batch)
            e1, r = convert_tuples_to_tensors(mini_batch)
            scores = model.forward(e1.to(device), r.to(device)).cpu()
            pred_scores.append(scores)
        dev_scores = torch.cat(pred_scores)
        return hits_and_ranks(dev_data, dev_scores, data.dataset2trees)

    pbar = tqdm(range(100))
    model.to(device)
    for epoch in pbar:
        model.train()
        epoch_loss = numpy.mean([
            train_one_batch(model, optimizer, raw_batch)
            for raw_batch in train_loader
        ])
        if epoch % 10 == 0:
            mrr = run_evaluation(eval_loader, model)["mrr"]
            named_params = {n: v for n, v in model.named_parameters()}
            data_io.write_json('ent2id.json', data.ent2id)
            torch.save(named_params['entity_embeddings.weight'].data,
                       "entity_embeddings.pt")
        pbar.set_description(
            "Epoch: {}; mean-loss: {:.4f}; MRR: {:.3f}".format(
                epoch + 1, epoch_loss, mrr))
'''
Epoch: 100; mean-loss: 0.0891; MRR: 0.947: 100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
'''
Ejemplo n.º 7
0
 def __exit__(self, exc_type, exc_val, exc_tb):
     """On context exit, persist self.state to the state file and print it.

     Returns None implicitly, so exceptions raised inside the `with` block
     are not suppressed.
     """
     current_state = self.state
     data_io.write_json(self.state_file, current_state)
     pprint(current_state)
Ejemplo n.º 8
0
def dump_to_disk_process_subtitles(n_clicks, video_file, texts, titles,
                                   model_name):
    """Dash callback: persist translated transcripts and build a subtitle table.

    On a button click (n_clicks > 0) with a selected video, writes the
    transcripts as JSON next to the video, segments them into subtitle
    blocks aligned to the letters CSV, and returns a 3-tuple of
    (dummy trigger string, [Dash DataTable row], JSON-serialized subtitle
    blocks). Otherwise raises PreventUpdate so Dash leaves outputs untouched.

    NOTE(review): `assert` is stripped under `python -O`; the texts type
    check would then silently disappear — consider raising instead.
    """
    print(f"video_file:{video_file}")
    assert all((isinstance(s, str) for s in texts))
    if n_clicks > 0 and video_file is not None:
        # one TranslatedTranscript per language/title, keyed by title;
        # k preserves the original column order
        data = {
            title: TranslatedTranscript(title, k, text)
            for k, (title, text) in enumerate(zip(titles, texts))
        }
        # persist all transcripts as plain dicts for later reloading
        data_io.write_json(
            build_json_name(video_file, model_name),
            {name: asdict(v)
             for name, v in data.items()},
        )

        # align transcripts against the per-letter timing CSV to get
        # time-stamped subtitle blocks
        named_blocks = segment_transcript_to_subtitle_blocks(
            get_letters_csv(video_file, model_name), list(data.values()))
        subtitles = dbc.Row(
            [
                dash_table.DataTable(
                    # one column per transcript title, plus a start-time column
                    columns=[{
                        "id": cn,
                        "name": cn
                    } for cn in ["start-time"] + titles],
                    # each row: the block's text per title, plus the block's
                    # start time derived from the first letter's sample index
                    data=[{
                        **{
                            name: "".join([l.letter for l in b[name]])
                            for name in titles
                        },
                        **{
                            "start-time":
                            str(
                                timedelta(milliseconds=round(1000 * b[titles[0]][0].index / TARGET_SAMPLE_RATE)))
                        },
                    } for b in named_blocks],
                    style_table={
                        "height": 200 * len(titles),
                        "overflowY": "scroll",
                        "width": "100%",
                        "font-size": 9,
                    },
                    style_cell={
                        # "overflow": "hidden",
                        # "textOverflow": "ellipsis",
                        # "maxWidth": 0,
                        "textAlign": "left",
                        "height": "auto",
                    },
                ),
            ],
            style={"width": "100%"},
        )
        # first output only triggers downstream callbacks; its content is unused
        return (
            "content-of-this-string-does-not-matter",
            [subtitles],
            json.dumps([
                asdict(SubtitleBlock.from_dict_letters(dl))
                for dl in named_blocks
            ]),
        )
    else:
        print(f"DEBUG: prevented to update dump_to_disk_process_subtitles")
        raise PreventUpdate
Ejemplo n.º 9
0
    data_supplier, splits = build_data_supplier_splits_trainset_only(
        raw_data_supplier, num_folds, 0.1
    )

    start = time()
    task = SpacyCrfScorer(
        params=Params(c1=0.5, c2=0.0, max_it=2), data_supplier=data_supplier
    )
    num_workers = 0  # min(multiprocessing.cpu_count() - 1, num_folds)
    m_scores_std_scores = calc_mean_std_scores(task, splits, n_jobs=num_workers)
    print(
        "spacy+crfsuite-tagger %d folds %d workers took: %0.2f seconds"
        % (num_folds, num_workers, time() - start)
    )
    pprint(m_scores_std_scores)
    data_io.write_json("spacy-crf-scores.json", m_scores_std_scores)

"""
#############################################################################
on x1-carbon scierc-data

spacy+crfsuite-tagger 3 folds-PARALLEL took: 74.86 seconds
{'m_scores': {'dev': {'f1-macro': 0.8822625032484681,
                      'f1-micro': 0.9528343173272004,
                      'f1-spanwise': 0.8470436086284675},
              'test': {'f1-macro': 0.5742946309433821,
                       'f1-micro': 0.832899550463387,
                       'f1-spanwise': 0.5345123493111902},
              'train': {'f1-macro': 0.8844589822247658,
                        'f1-micro': 0.9522832740014087,
                        'f1-spanwise': 0.842115934181045}},