Example #1
def dump_mappings(base_dir: str, mapping: List[Dict]):
    def build_line(d: Dict[str, Any], k: str) -> str:
        return "\t".join([str(x) for x in d.get(k, {}).values()])

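    # write one tab-separated line per mapping entry; write_lines accepts any iterable of
    # strings (a list comprehension here, a generator expression for the second file)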
    data_io.write_lines(f"{base_dir}/tilo_mapped.csv",
                        [build_line(m, "tilo") for m in mapping])
    data_io.write_lines(f"{base_dir}/tati_mapped.csv",
                        (build_line(m, "tati") for m in mapping))
Example #2
def build_vocabulary(
    text_g, vocab_file="fairseq_dict.txt", min_freq=1000,
):
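    # count character frequencies over the text generator (spaces mapped to "_") and write those of
    # the 200 most common characters that occur more than min_freq times, one "<char> <count>" per line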
    counter = Counter((c for t in tqdm(text_g) for c in t.replace(" ", "_")))
    vocab = counter.most_common(200)
    assert len(vocab) > 0
    data_io.write_lines(
        vocab_file, ["%s %d" % (c, f) for c, f in vocab if f > min_freq],
    )
Example #3
def parse_pdf(pdf_file) -> str:
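    # extract the raw text from the PDF, append its lines to a debug file, then flatten linebreaks into spaces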
    pdf_bytes = textract.process(pdf_file)
    text_with_linebreaks = pdf_bytes.decode("utf-8")
    data_io.write_lines(DEBUG_RAW_TEXT,
                        text_with_linebreaks.split("\n"),
                        mode="ab")

    text = text_with_linebreaks.replace("\n", " ")

    # if len(matches) == 0 and pdf_file.split("/")[-1] not in KNOWN_TO_HAVE_NO_EXPEDIENTE:
    #     assert False

    # html_lines = exec_command(f"pdftohtml -noframes -stdout '{pdf_file}'")["stdout"]
    # html = "\n".join([l.decode("utf-8") for l in html_lines])
    # soup = BeautifulSoup(html, features="html.parser")
    return text
Example #4
def create_or_load_raw_transcript(video_file, model_name) -> str:
    file = Path(f"{APP_DATA_DIR}/{video_file}")
    raw_transcript_file = (
        f"{SUBTITLES_DIR}/{file.stem}_{raw_transcript_name(model_name)}.txt")
    if not os.path.isfile(raw_transcript_file):
        asr = SpeechToText(model_name=model_name).init()
        transcript = convert_to_wav_transcribe(asr, str(file))
        data_io.write_lines(
            get_letters_csv(video_file, model_name),
            [f"{l.letter}\t{l.index}" for l in transcript.letters],
        )

        raw_transcript = "".join([l.letter for l in transcript.letters])
        data_io.write_lines(
            raw_transcript_file,
            [raw_transcript],
        )
    else:
        raw_transcript = list(data_io.read_lines(raw_transcript_file))[0]
    return raw_transcript
Example #5
def extract_date(string: str):
    dates_numeric = date_numeric_pattern.findall(string)
    dates_nonnum = date_nonnum_pattern.findall(string)

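    # prefer a numeric date match, fall back to a spelled-out date, otherwise log the string and return None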
    if len(dates_numeric) >= 1:
        # take the very last match, which is closest to the "sentencia" mention
        date_string = dates_numeric[-1]
        # return date_string
        mes = meses_pattern.search(date_string).group()
        mes_i = MESES.get(mes)
        day, year = [
            int(regex.sub(CIRCLE, "", s[1:-1]))
            for s in number_in_brackets_pattern.findall(date_string)
        ]
        date_s = reformat_date(
            f"{mes_i:02d}/{day:02d}/{year}")  # just for validation
        return date_s
    elif len(dates_nonnum) >= 1:

        date_nonnum = dates_nonnum[-1]
        mes = meses_pattern.search(date_nonnum).group()
        mes_i = MESES.get(mes)

        day = None
        for k in range(31, 1, -1):
            if num2name[k] in date_nonnum:
                day = k
                break
        assert day is not None
        year = [
            int(s[1:-1].replace("º", ""))
            for s in number_in_brackets_pattern.findall(date_nonnum)
        ][0]
        if year == 200:  # HACK!: see 2010 Mayo, No. 64
            year = 2009
        date_s = reformat_date(f"{mes_i:02d}/{day:02d}/{year}")
        return date_s
    else:
        data_io.write_lines(DEBUG_NO_DATE, [string], "ab")
        return None
Example #6
def extract_from_edicto(source, string, edicto_num: int, edicto_year):
    edicto_date = parse_edicto_date(string)
    if edicto_date is None:
        data_io.write_jsonl(DEBUG_EDICTO_DATE, [{
            "source": source,
            "text": string
        }],
                            mode="ab")
        return []
    spans = [get_sentencia_span(m) for m in sentencia_pattern.finditer(string)]
    edictos = []
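    # for each "sentencia" match: expedientes are taken from the text up to the next match,
    # the date from the text between the previous match and this one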
    for k, (start, end, sentencia) in enumerate(spans):
        next_start, _, _ = (spans[k + 1] if k + 1 < len(spans) else
                            (len(string), None, None))
        _, previous_end, _ = spans[k - 1] if k > 0 else (None, 0, None)
        behind_sentencia = string[end:next_start]
        expedientes = extract_expedientes(behind_sentencia)
        if len(expedientes) > 0:
            before_sentencia = string[previous_end:start]
            data_io.write_lines(DEBUG_BEFORE_SENTENCIA,
                                [before_sentencia.replace("\n", "€")],
                                mode="ab")
            date = extract_date(before_sentencia)
            if date is not None:
                edictos.append(
                    Edicto(
                        sentencia,
                        date,
                        edicto_date,
                        edicto_year,
                        expedientes,
                        source,
                        edicto_num,
                    ))
    if len(edictos) != 1:
        data_io.write_jsonl(DEBUG_NO_EDICTO, [{
            "source": source,
            "string": string
        }], "ab")
    return edictos
Example #7
def batch_inference(args: argparse.Namespace):

    torch.set_grad_enabled(False)

    if args.asr_model.endswith(".nemo"):
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    asr_model.setup_test_data(
        test_data_config={
            "sample_rate": 16000,
            "manifest_filepath": manifest,
            "labels": asr_model.decoder.vocabulary,
            "batch_size": args.batch_size,
            "normalize_transcripts": args.normalize_text,
        })

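    # run inference, collect (reference, hypothesis) pairs and unzip them into two lists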
    refs_hyps = list(tqdm(generate_ref_hyps(asr_model, args.search,
                                            args.arpa)))
    references, hypotheses = [list(k) for k in zip(*refs_hyps)]

    os.makedirs(args.results_dir, exist_ok=True)
    data_io.write_lines(f"{args.results_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{args.results_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()
    stats = {
        "wer": wer_value,
        "args": args.__dict__,
    }
    data_io.write_json(f"{args.results_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
Example #8
def scrape_proceso_tables(search_ids: List):
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    data_path = f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=True)

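    # only scrape ids for which no JSON result file exists yet; failures are logged to could_not_scrape.txt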
    ids_files = ((eid, f"{data_path}/{eid}.json") for eid in search_ids)
    to_be_scraped = [(eid, file) for eid, file in ids_files
                     if not os.path.isfile(file)]
    print(f"already got {len(search_ids)-len(to_be_scraped)}")

    for search_id, file in tqdm(to_be_scraped):
        try:
            fire_search(base_url, search_id, wd)
            datum = dump_proceso_table(wd)
            datum["id"] = search_id
            data_io.write_json(file, datum)
        except BaseException as e:
            # traceback.print_stack()
            # raise e
            data_io.write_lines(f"{data_path}/could_not_scrape.txt",
                                [search_id])
            print(f"{search_id} f****d it up!")
Example #9
    def answer(input_ids):
        # print(input_ids.shape)
        with torch.no_grad():
            chat_history_ids = model.generate(
                input_ids,
                max_length=1000,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                temperature=0.7,
                # num_beams=3
            )

        # decode only the newly generated tokens, i.e. everything after the input prompt
        output = tokenizer.decode(
            chat_history_ids[:, input_ids.shape[-1]:][0],
            skip_special_tokens=True,
        )
        # print("OUTPUT: %s"%output)
        return output

    file_name = "valid_freq"
    dialogues_g = topicalchat(
        file_name=file_name,
        data_path=os.environ["HOME"] +
        "/Response-Generation-Baselines/processed_output",
        limit=None)
    g = (answer(build_gpt2_input(utts)) for utts in dialogues_g)
    data_io.write_lines("microsoft-gpt2-%s.pred" % file_name, g)

    # dialogue_test()
Example #10
import os

from util import data_io

if __name__ == "__main__":
    base_path = os.environ["HOME"] + "/hpc/data/parallel_text_corpora/wmt_en_ro"
    files = [
        f for f in os.listdir(base_path)
        if f.endswith(".source") or f.endswith(".target")
    ]
    some_data = "some_data"
    os.makedirs(some_data, exist_ok=True)
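    # copy the first 1000 lines of every .source/.target file into the local "some_data" folder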
    for f in files:
        data_io.write_lines(
            "%s/%s" % (some_data, f),
            data_io.read_lines(base_path + "/%s" % f, limit=1000),
        )
Example #11
    g = (
        " ".join(ngram)
        for ref, hyp in tqdm(zip(refs, hyps))
        for ngram in calc_corrected_ngrams(tokenize(ref), tokenize(hyp), order)
    )
    return list(g)


if __name__ == "__main__":

    refs_hyps_dir = "/tmp/train_kenlm_3_089_mp3"
    # ngrams = corrected_ngrams(refs_hyps_dir)

    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")

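    # align reference and hypothesis n-grams and keep only the trigrams where they differ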
    tuples = (
        (h, r)
        for ref, hyp in tqdm(zip(refs, hyps))
        for h, r in calc_aligned_ngram_tuples(tokenize(ref), tokenize(hyp), 3)
    )
    error_tuples = ((r, h) for h, r in tuples if h != r and len(r) == 3)

    data_io.write_lines(
        "erroneous_3grams.tsv",
        (f"{' '.join(r)}\t{' '.join(h)}" for h, r in error_tuples),
    )

    # data_io.write_lines("ngrams.txt.gz", ngrams)
    # data_io.write_lines("unique_ngrams.txt.gz", list(set(ngrams)))
Example #12
import os
import shutil
from tqdm import tqdm
from util import data_io

from erroneous_ngrams import corrected_ngrams
from kenlm_arpa import convert_and_filter_topk, build_lm, ArpaArgs

if __name__ == "__main__":
    colab_asr_data = f"{os.environ['HOME']}/googledrive/data/asr_data"
    ngrams_file = "/tmp/ngrams.txt.gz"

    if not os.path.isfile(ngrams_file):
        ngrams = corrected_ngrams(
            f"{colab_asr_data}/results/train_kenlm_3_089_mp3")
        data_io.write_lines(ngrams_file, ngrams)
        data_io.write_lines("unique_ngrams.txt.gz", list(set(ngrams)))

    librispeech_lm_data = f"{os.environ['HOME']}/data/asr_data/ENGLISH/librispeech-lm-norm.txt.gz"

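    # prepare two LM corpora: "vanilla" uses the librispeech text only, "tedlium" additionally
    # repeats the corrected-n-grams file 10x to upweight them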
    for name, files in [
        ("vanilla", [librispeech_lm_data]),
        ("tedlium", [librispeech_lm_data] + 10 * [ngrams_file]),
    ]:
        cache_dir = f"kenlm_cache_{name}"

        if os.path.isdir(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)
        data_lower, vocab_str = convert_and_filter_topk(
            files, cache_dir, 200_000)
Example #13
    "--target_file",
    default=os.environ["HOME"] +
    "/data/QA/topical-chat/processed_output/test_rare.tgt",
    type=str,
)
parser.add_argument(
    "--pred_file",
    default="test_rare.pred",
    type=str,
)

if __name__ == "__main__":
    args = parser.parse_args()

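    # read the sources (prefixed with a space, see the linked huggingface example), generate
    # predictions batchwise, write them to pred_file and report ROUGE scores against the targets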
    rouge = Rouge()
    sources = [
        " "  # beginning with space? see: https://github.com/huggingface/transformers/blob/5ddd8d6531c8c49fdd281b55b93f6c81c9826f4b/examples/summarization/bart/evaluate_cnn.py#L66
        + x.rstrip() for x in data_io.read_lines(args.source_file)
    ]
    targets = list(data_io.read_lines(args.target_file))
    hyps = list(
        generate_summaries_or_translations(
            sources,
            args.model_path,
            batch_size=8,
            fp16=True,
        ))
    data_io.write_lines(args.pred_file, hyps)

    pprint(calc_rouge_scores(hyps, targets))
Example #14
if __name__ == "__main__":

    import subprocess

    model = sys.argv[1]
    input_dir = sys.argv[2]
    output_dir = sys.argv[3]

    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    asr = SpeechToText(
        model_name=model,
    ).init()

    files = list(Path(input_dir).glob("*.*"))
    assert len(files) > 0
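    # for every input file (e.g. mp4/m4a), write a per-letter CSV and a plain-text transcript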
    for file in files:  # mp4, m4a
        # transcript = transcribe_audio_file(asr, file)
        transcript = convert_to_wav_transcribe(asr, file)
        data_io.write_lines(
            f"{output_dir}/{file.stem}.csv",
            [f"{l.letter}\t{l.r_idx}" for l in transcript.letters],
        )

        data_io.write_lines(
            f"{output_dir}/{file.stem}.txt",
            ["".join([l.letter for l in transcript.letters])],
        )
Example #15
                                             corpus_embeddings, "cosine")[0]
    similarities = [1 - dist for dist in distances]
    return [(s, float(sim)) for s, sim in zip(corpus, similarities)]


if __name__ == "__main__":

    import spacy

    # nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

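    # stream PubTator documents, split them into sentences with spaCy, embed them batchwise and
    # keep sentences whose similarity to the query exceeds 0.9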
    file_path = (os.environ["HOME"] +
                 "/pubtator/download/bioconcepts2pubtatorcentral.offset.gz")
    g = (pubtator_parser(content)
         for content in doc_generator(file_path, limit=100_000))
    texts_g = (sent.text for d in g for sent in nlp(d["text"]).sents)

    embedder = SentenceTransformer("bert-base-nli-mean-tokens")
    query = "Clinical characteristics of novel coronavirus disease 2019 (COVID-19) in newborns, infants and children"
    g = util_methods.process_batchwise(
        partial(calc_similarities, embedder=embedder, query=query),
        texts_g,
        batch_size=1024,
    )
    data_io.write_lines(
        "results.csv",
        ("\t".join([s, str(sim)]) for s, sim in tqdm(g) if sim > 0.9),
    )