def dump_mappings(base_dir: str, mapping: List[Dict]):
    def build_line(d: Dict[str, Any], k: str) -> str:
        return "\t".join([str(x) for x in d.get(k, {}).values()])

    data_io.write_lines(
        f"{base_dir}/tilo_mapped.csv", [build_line(m, "tilo") for m in mapping]
    )
    data_io.write_lines(
        f"{base_dir}/tati_mapped.csv", (build_line(m, "tati") for m in mapping)
    )
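# Usage sketch (assumption, not from the original code): dump_mappings expects each
# mapping entry to hold one dict under "tilo" and one under "tati"; the field names
# below are purely illustrative.
example_mapping = [
    {
        "tilo": {"start": 0.0, "end": 1.2, "text": "hola"},
        "tati": {"start": 0.1, "end": 1.3, "text": "hola"},
    }
]
dump_mappings("/tmp", example_mapping)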
def build_vocabulary(
    text_g,
    vocab_file="fairseq_dict.txt",
    min_freq=1000,
):
    counter = Counter((c for t in tqdm(text_g) for c in t.replace(" ", "_")))
    vocab = counter.most_common(200)
    assert len(vocab) > 0
    data_io.write_lines(
        vocab_file,
        ["%s %d" % (c, f) for c, f in vocab if f > min_freq],
    )
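# Usage sketch (assumption): build_vocabulary accepts any iterable of strings and
# counts characters, with spaces replaced by "_"; the tiny corpus, output path, and
# min_freq below are placeholders only.
example_texts = ["hola mundo", "hola corte constitucional"]
build_vocabulary(iter(example_texts), vocab_file="/tmp/fairseq_dict.txt", min_freq=0)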
def parse_pdf(pdf_file) -> str:
    raw_bytes = textract.process(pdf_file)  # renamed from "bytes" to avoid shadowing the builtin
    text_with_linebreaks = raw_bytes.decode("utf-8")
    data_io.write_lines(DEBUG_RAW_TEXT, text_with_linebreaks.split("\n"), mode="ab")
    text = text_with_linebreaks.replace("\n", " ")
    # if len(matches) == 0 and pdf_file.split("/")[-1] not in KNOWN_TO_HAVE_NO_EXPEDIENTE:
    #     assert False
    # html_lines = exec_command(f"pdftohtml -noframes -stdout '{pdf_file}'")["stdout"]
    # html = "\n".join([l.decode("utf-8") for l in html_lines])
    # soup = BeautifulSoup(html, features="html.parser")
    return text
def create_or_load_raw_transcript(video_file, model_name) -> str:
    file = Path(f"{APP_DATA_DIR}/{video_file}")
    raw_transcript_file = (
        f"{SUBTITLES_DIR}/{file.stem}_{raw_transcript_name(model_name)}.txt"
    )
    if not os.path.isfile(raw_transcript_file):
        asr = SpeechToText(
            model_name=model_name,
        ).init()
        transcript = convert_to_wav_transcribe(asr, str(file))
        data_io.write_lines(
            get_letters_csv(video_file, model_name),
            [f"{l.letter}\t{l.index}" for l in transcript.letters],
        )
        raw_transcript = "".join([l.letter for l in transcript.letters])
        data_io.write_lines(
            raw_transcript_file,
            [raw_transcript],
        )
    else:
        raw_transcript = list(data_io.read_lines(raw_transcript_file))[0]
    return raw_transcript
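# Usage sketch (assumption): the video is expected to live under APP_DATA_DIR and the
# transcript is cached in SUBTITLES_DIR; the file name and model name below are
# placeholders, not values from the original project.
raw_text = create_or_load_raw_transcript("some_video.mp4", "some_asr_model")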
def extract_date(string: str):
    dates_numeric = date_numeric_pattern.findall(string)
    dates_nonnum = date_nonnum_pattern.findall(string)
    if len(dates_numeric) >= 1:
        date_string = dates_numeric[-1]  # take very last which is closest to sentencia mention!
        mes = meses_pattern.search(date_string).group()
        mes_i = MESES.get(mes)
        day, year = [
            int(regex.sub(CIRCLE, "", s[1:-1]))
            for s in number_in_brackets_pattern.findall(date_string)
        ]
        date_s = reformat_date(f"{mes_i:02d}/{day:02d}/{year}")  # just for validation
        return date_s
    elif len(dates_nonnum) >= 1:
        date_nonnum = dates_nonnum[-1]
        mes = meses_pattern.search(date_nonnum).group()
        mes_i = MESES.get(mes)
        day = None
        for k in range(31, 1, -1):
            if num2name[k] in date_nonnum:
                day = k
                break
        assert day is not None
        year = [
            int(s[1:-1].replace("º", ""))
            for s in number_in_brackets_pattern.findall(date_nonnum)
        ][0]
        if year == 200:  # HACK!: see 2010 Mayo, No. 64
            year = 2009
        date_s = reformat_date(f"{mes_i:02d}/{day:02d}/{year}")
        return date_s
    else:
        data_io.write_lines(DEBUG_NO_DATE, [string], "ab")
        return None
def extract_from_edicto(source, string, edicto_num: int, edicto_year):
    edicto_date = parse_edicto_date(string)
    if edicto_date is None:
        data_io.write_jsonl(
            DEBUG_EDICTO_DATE, [{"source": source, "text": string}], mode="ab"
        )
        return []

    spans = [get_sentencia_span(m) for m in sentencia_pattern.finditer(string)]
    edictos = []
    for k, (start, end, sentencia) in enumerate(spans):
        next_start, _, _ = (
            spans[k + 1] if k + 1 < len(spans) else (len(string), None, None)
        )
        _, previous_end, _ = spans[k - 1] if k > 0 else (None, 0, None)
        behind_sentencia = string[end:next_start]
        expedientes = extract_expedientes(behind_sentencia)
        if len(expedientes) > 0:
            before_sentencia = string[previous_end:start]
            data_io.write_lines(
                DEBUG_BEFORE_SENTENCIA,
                [before_sentencia.replace("\n", "€")],
                mode="ab",
            )
            date = extract_date(before_sentencia)
            if date is not None:
                edictos.append(
                    Edicto(
                        sentencia,
                        date,
                        edicto_date,
                        edicto_year,
                        expedientes,
                        source,
                        edicto_num,
                    )
                )
    if len(edictos) != 1:
        data_io.write_jsonl(
            DEBUG_NO_EDICTO, [{"source": source, "string": string}], "ab"
        )
    return edictos
def batch_inference(args: argparse.Namespace):
    torch.set_grad_enabled(False)

    if args.asr_model.endswith(".nemo"):
        print(f"Using local ASR model from {args.asr_model}")
        asr_model = EncDecCTCModel.restore_from(restore_path=args.asr_model)
    else:
        print(f"Using NGC cloud ASR model {args.asr_model}")
        asr_model = EncDecCTCModel.from_pretrained(model_name=args.asr_model)

    manifest = prepare_manifest(args.corpora_dir, args.limit)
    asr_model.setup_test_data(
        test_data_config={
            "sample_rate": 16000,
            "manifest_filepath": manifest,
            "labels": asr_model.decoder.vocabulary,
            "batch_size": args.batch_size,
            "normalize_transcripts": args.normalize_text,
        }
    )

    refs_hyps = list(tqdm(generate_ref_hyps(asr_model, args.search, args.arpa)))
    references, hypotheses = [list(k) for k in zip(*refs_hyps)]

    os.makedirs(args.results_dir, exist_ok=True)
    data_io.write_lines(f"{args.results_dir}/refs.txt.gz", references)
    data_io.write_lines(f"{args.results_dir}/hyps.txt.gz", hypotheses)

    wer_value = word_error_rate(hypotheses=hypotheses, references=references)
    sys.stdout.flush()
    stats = {
        "wer": wer_value,
        "args": args.__dict__,
    }
    data_io.write_json(f"{args.results_dir}/stats.txt", stats)
    print(f"Got WER of {wer_value}")
    return stats
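# Invocation sketch (assumption): the Namespace fields mirror the attributes that
# batch_inference reads; the values are placeholders, not the defaults of the
# original argument parser.
example_args = argparse.Namespace(
    asr_model="QuartzNet15x5Base-En",
    corpora_dir="/tmp/corpora",
    limit=None,
    batch_size=8,
    normalize_text=True,
    search="greedy",
    arpa=None,
    results_dir="/tmp/results",
)
# stats = batch_inference(example_args)  # needs the corpora on disk and a working NeMo install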
def scrape_proceso_tables(search_ids: List):
    base_url = "https://www.corteconstitucional.gov.co/secretaria/"
    data_path = f"{os.environ['HOME']}/data/corteconstitucional/procesos_tables"
    os.makedirs(data_path, exist_ok=True)
    download_path = f"{data_path}/downloads"
    wd = build_chrome_driver(download_path, headless=True)

    ids_files = ((eid, f"{data_path}/{eid}.json") for eid in search_ids)
    to_be_scraped = [
        (eid, file) for eid, file in ids_files if not os.path.isfile(file)
    ]
    print(f"already got {len(search_ids) - len(to_be_scraped)}")

    for search_id, file in tqdm(to_be_scraped):
        try:
            fire_search(base_url, search_id, wd)
            datum = dump_proceso_table(wd)
            datum["id"] = search_id
            data_io.write_json(file, datum)
        except BaseException:
            # append instead of overwrite, so earlier failures are not lost
            data_io.write_lines(
                f"{data_path}/could_not_scrape.txt", [search_id], mode="ab"
            )
            print(f"failed to scrape {search_id}")
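# Usage sketch (assumption): search_ids appear to be expediente-style identifiers;
# the values below are made-up placeholders.
example_ids = ["T-1234567", "D-0012345"]
scrape_proceso_tables(example_ids)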
def answer(input_ids):  # renamed from "input" to avoid shadowing the builtin
    with torch.no_grad():
        chat_history_ids = model.generate(
            input_ids,
            max_length=1000,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            temperature=0.7,
            # num_beams=3
        )
    output = tokenizer.decode(
        chat_history_ids[:, input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )
    return output


file_name = "valid_freq"
dialogues_g = topicalchat(
    file_name=file_name,
    data_path=os.environ["HOME"] + "/Response-Generation-Baselines/processed_output",
    limit=None,
)
g = (answer(build_gpt2_input(utts)) for utts in dialogues_g)
data_io.write_lines("microsoft-gpt2-%s.pred" % file_name, g)
import os

from util import data_io

if __name__ == "__main__":
    base_path = os.environ["HOME"] + "/hpc/data/parallel_text_corpora/wmt_en_ro"
    files = [
        f
        for f in os.listdir(base_path)
        if f.endswith(".source") or f.endswith(".target")
    ]
    some_data = "some_data"
    os.makedirs(some_data, exist_ok=True)
    for f in files:
        data_io.write_lines(
            "%s/%s" % (some_data, f),
            data_io.read_lines(base_path + "/%s" % f, limit=1000),
        )
# NOTE: this excerpt starts mid-function; judging by the import elsewhere in this
# repo, the generator and return below are most likely the tail of
# corrected_ngrams(refs_hyps_dir, order), whose header is not shown here.
    g = (
        " ".join(ngram)
        for ref, hyp in tqdm(zip(refs, hyps))
        for ngram in calc_corrected_ngrams(tokenize(ref), tokenize(hyp), order)
    )
    return list(g)


if __name__ == "__main__":
    refs_hyps_dir = "/tmp/train_kenlm_3_089_mp3"
    # ngrams = corrected_ngrams(refs_hyps_dir)
    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")
    tuples = (
        (h, r)
        for ref, hyp in tqdm(zip(refs, hyps))
        for h, r in calc_aligned_ngram_tuples(tokenize(ref), tokenize(hyp), 3)
    )
    error_tuples = ((r, h) for h, r in tuples if h != r and len(r) == 3)
    data_io.write_lines(
        "erroneous_3grams.tsv",
        (f"{' '.join(r)}\t{' '.join(h)}" for h, r in error_tuples),
    )
    # data_io.write_lines("ngrams.txt.gz", ngrams)
    # data_io.write_lines("unique_ngrams.txt.gz", list(set(ngrams)))
import os
import shutil  # added: used below for rmtree but missing from the original imports

from tqdm import tqdm
from util import data_io

from erroneous_ngrams import corrected_ngrams
from kenlm_arpa import convert_and_filter_topk, build_lm, ArpaArgs

if __name__ == "__main__":
    colab_asr_data = f"{os.environ['HOME']}/googledrive/data/asr_data"
    ngrams_file = "/tmp/ngrams.txt.gz"
    if not os.path.isfile(ngrams_file):
        ngrams = corrected_ngrams(f"{colab_asr_data}/results/train_kenlm_3_089_mp3")
        data_io.write_lines(ngrams_file, ngrams)
        data_io.write_lines("unique_ngrams.txt.gz", list(set(ngrams)))

    librispeech_lm_data = (
        f"{os.environ['HOME']}/data/asr_data/ENGLISH/librispeech-lm-norm.txt.gz"
    )
    for name, files in [
        ("vanilla", [librispeech_lm_data]),
        ("tedlium", [librispeech_lm_data] + 10 * [ngrams_file]),
    ]:
        cache_dir = f"kenlm_cache_{name}"
        if os.path.isdir(cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir)
        data_lower, vocab_str = convert_and_filter_topk(files, cache_dir, 200_000)
"--target_file", default=os.environ["HOME"] + "/data/QA/topical-chat/processed_output/test_rare.tgt", type=str, ) parser.add_argument( "--pred_file", default="test_rare.pred", type=str, ) if __name__ == "__main__": args = parser.parse_args() rouge = Rouge() sources = [ " " # beginning with space? see: https://github.com/huggingface/transformers/blob/5ddd8d6531c8c49fdd281b55b93f6c81c9826f4b/examples/summarization/bart/evaluate_cnn.py#L66 + x.rstrip() for x in data_io.read_lines(args.source_file) ] targets = list(data_io.read_lines(args.target_file)) hyps = list( generate_summaries_or_translations( sources, args.model_path, batch_size=8, fp16=True, )) data_io.write_lines(args.pred_file, hyps) pprint(calc_rouge_scores(hyps, targets))
if __name__ == "__main__":
    import subprocess

    model = sys.argv[1]
    input_dir = sys.argv[2]
    output_dir = sys.argv[3]

    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)
    os.makedirs(output_dir)

    asr = SpeechToText(
        model_name=model,
    ).init()

    files = list(Path(input_dir).glob("*.*"))
    assert len(files) > 0
    for file in files:  # mp4, m4a
        # transcript = transcribe_audio_file(asr, file)
        transcript = convert_to_wav_transcribe(asr, file)
        data_io.write_lines(
            f"{output_dir}/{file.stem}.csv",
            [f"{l.letter}\t{l.r_idx}" for l in transcript.letters],
        )
        data_io.write_lines(
            f"{output_dir}/{file.stem}.txt",
            ["".join([l.letter for l in transcript.letters])],
        )
# NOTE: this excerpt starts mid-function; judging by the partial(...) call below,
# the truncated lines are most likely the tail of calc_similarities(corpus, embedder, query).
        corpus_embeddings, "cosine")[0]
    similarities = [1 - dist for dist in distances]
    return [(s, float(sim)) for s, sim in zip(corpus, similarities)]


if __name__ == "__main__":
    import spacy

    # nlp = spacy.load("en_core_web_sm")
    nlp = spacy.blank("en")
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

    file_path = (
        os.environ["HOME"] + "/pubtator/download/bioconcepts2pubtatorcentral.offset.gz"
    )
    g = (
        pubtator_parser(content)
        for content in doc_generator(file_path, limit=100_000)
    )
    texts_g = (sent.text for d in g for sent in nlp(d["text"]).sents)

    embedder = SentenceTransformer("bert-base-nli-mean-tokens")
    query = "Clinical characteristics of novel coronavirus disease 2019 (COVID-19) in newborns, infants and children"

    g = util_methods.process_batchwise(
        partial(calc_similarities, embedder=embedder, query=query),
        texts_g,
        batch_size=1024,
    )
    data_io.write_lines(
        "results.csv",
        ("\t".join([s, str(sim)]) for s, sim in tqdm(g) if sim > 0.9),
    )