def corrected_ngrams(
    refs_hyps_dir=f"{os.environ['HOME']}/googledrive/data/asr_data/results/kenlm_3_089_mp3",
    order=3,
):
    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")
    g = (
        " ".join(ngram)
        for ref, hyp in tqdm(zip(refs, hyps))
        for ngram in calc_corrected_ngrams(tokenize(ref), tokenize(hyp), order)
    )
    return list(g)
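# Hypothetical usage sketch (not part of the original code): count how often each
# corrected n-gram occurs; assumes the default refs/hyps files above exist.
from collections import Counter

ngram_counts = Counter(corrected_ngrams(order=3))
print(ngram_counts.most_common(10))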
def topicalchat(
    file_name="train",
    data_path=os.environ["HOME"] + "/data/QA/topical-chat/processed_output",
    limit=None,
):
    backgrounds = data_io.read_lines(os.path.join(data_path, file_name) + ".fct", limit=limit)
    dialogs = data_io.read_lines(os.path.join(data_path, file_name) + ".src", limit=limit)
    targets = data_io.read_lines(os.path.join(data_path, file_name) + ".tgt", limit=limit)

    for b, d, t in tqdm(zip(backgrounds, dialogs, targets)):
        turns = d.split("_eos")[:-1] + [t.strip("_go").strip("_eos")]
        yield turns[-3:]
def doc_generator(file_path, limit=None):
    """
    PubTator docs are of the form:

        UID|TITLE|TEXT
        UID|ABSTRACT|TEXT
        UID  SPAN  MENTION  ENTITY_TYPE  MESH_ID
        ...

    See -- data/bioconcepts2pubtator_offsets.sample
    """
    k = 0
    lines = []
    for line in data_io.read_lines(file_path):
        if len(line.rstrip()) == 0:
            if len(lines) > 0:
                # filter docs to target set
                doc_id = re.split(r'\|', lines[0].rstrip(), maxsplit=2)
                yield lines
                lines = []
                k += 1
                if limit is not None and k > limit:
                    break
        else:
            lines.append(line)
def preprocessed_topicalchat(
    file_name="train",
    data_path=os.environ["HOME"] + "/data/QA/topical-chat/processed_output",
    use_danqi=False,
):
    backgrounds = data_io.read_lines(os.path.join(data_path, file_name) + ".fct")
    dialogs = data_io.read_lines(os.path.join(data_path, file_name) + ".src")
    targets = data_io.read_lines(os.path.join(data_path, file_name) + ".tgt")

    for b, d, t in tqdm(zip(backgrounds, dialogs, targets)):
        turns = d.split("_eos")[:-1] + [t.strip("_go").strip("_eos")]
        if len(turns) % 2 != 0:
            turns = [SILENCE] + turns
        turns = [Turn(turns[k], turns[k + 1]) for k in range(0, len(turns), 2)]
        yield build_input_target(b, turns, SEP, use_danqi=use_danqi)
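# Minimal usage sketch (assumes the processed_output files and the SILENCE, SEP, Turn and
# build_input_target helpers referenced above are available in this module):
from itertools import islice

for example in islice(preprocessed_topicalchat(file_name="train"), 3):
    print(example)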
def parse_csv_to_examples_build_fields(dataset_name="AG_NEWS"):
    csv_path = DATA_PATH + "/" + dataset_name.lower() + "_csv"
    if not os.path.isdir(csv_path):  # download data
        _, _ = text_classification.DATASETS[dataset_name](root=DATA_PATH)

    train_csv = csv_path + "/train.csv"
    test_csv = csv_path + "/test.csv"

    def regex_tokenizer(text, pattern=r"(?u)\b\w\w+\b") -> List[str]:  # pattern stolen from scikit-learn
        return [m.group() for m in re.finditer(pattern, text)]

    def parse_line(line):
        splits = line.split(",")
        text = " ".join(splits[1:])
        label = splits[0]
        return text, label

    text_field = Field(include_lengths=False, batch_first=True, tokenize=regex_tokenizer)
    label_field = Field(batch_first=True, sequential=False, unk_token=None)
    fields = [("text", text_field), ("label", label_field)]

    g = (parse_line(l) for l in tqdm(data_io.read_lines(train_csv)))
    train_examples = [Example.fromlist([text, label], fields) for text, label in g]

    text_field.build_vocab([example.text for example in train_examples])
    label_field.build_vocab([example.label for example in train_examples])

    g = (parse_line(l) for l in tqdm(data_io.read_lines(test_csv)))
    test_examples = [Example.fromlist([text, label], fields) for text, label in g]

    return train_examples, test_examples, fields
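# Hypothetical follow-up sketch (not part of the original code): wrap the returned examples
# in torchtext Datasets and batch them. Assumes the same legacy torchtext.data API (<= 0.8)
# that the Field/Example objects above come from.
from torchtext.data import BucketIterator, Dataset

train_examples, test_examples, fields = parse_csv_to_examples_build_fields("AG_NEWS")
train_ds = Dataset(train_examples, fields)
train_iter = BucketIterator(
    train_ds, batch_size=32, sort_key=lambda ex: len(ex.text), shuffle=True
)
for batch in train_iter:
    print(batch.text.shape, batch.label.shape)
    break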
def build_trees(file, dataset_name) -> Trees:
    trees = {}

    def add_triple(subj, predi, obje, trees: Trees):
        if subj not in trees:
            trees[subj] = {}
        if predi not in trees[subj]:
            trees[subj][predi] = set()
        trees[subj][predi].add(obje)

    for line in data_io.read_lines(file):
        s, o, p = line.strip().split("\t")
        s_id, o_id, p_id = get_id(ent2id, s), get_id(ent2id, o), get_id(rel2id, p)
        add_triple(s_id, p_id, o_id, trees)
        if 'train' in dataset_name:
            p_inv_id = get_id(rel2id, p + "_inv")
            add_triple(o_id, p_inv_id, s_id, trees)

    return trees
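# Hypothetical sketch of the helpers build_trees relies on; the real Trees, ent2id, rel2id
# and get_id are defined elsewhere, this is just one plausible shape (assign ids on first sight).
from typing import Dict, Set

Trees = Dict[int, Dict[int, Set[int]]]
ent2id: Dict[str, int] = {}
rel2id: Dict[str, int] = {}

def get_id(vocab: Dict[str, int], key: str) -> int:
    if key not in vocab:
        vocab[key] = len(vocab)
    return vocab[key]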
def create_or_load_raw_transcript(video_file, model_name) -> str:
    file = Path(f"{APP_DATA_DIR}/{video_file}")
    raw_transcript_file = f"{SUBTITLES_DIR}/{file.stem}_{raw_transcript_name(model_name)}.txt"

    if not os.path.isfile(raw_transcript_file):
        asr = SpeechToText(model_name=model_name).init()
        transcript = convert_to_wav_transcribe(asr, str(file))
        data_io.write_lines(
            get_letters_csv(video_file, model_name),
            [f"{l.letter}\t{l.index}" for l in transcript.letters],
        )
        raw_transcript = "".join([l.letter for l in transcript.letters])
        data_io.write_lines(raw_transcript_file, [raw_transcript])
    else:
        raw_transcript = list(data_io.read_lines(raw_transcript_file))[0]

    return raw_transcript
def read_lines_from_files(self, path, mode="b", encoding="utf-8", limit=sys.maxsize):
    c = 0
    for file in os.listdir(path):
        if self.state.get(file, 0) == "all":
            continue
        for line_idx, line in enumerate(
            data_io.read_lines(path + "/" + file, mode, encoding)
        ):
            c += 1
            if line_idx < self.state.get(file, 0):
                continue
            if c > limit:
                break
            yield line
            self.state[file] = line_idx
            if c % self.write_interval == 0:
                data_io.write_json(self.state_file, self.state)
        self.state[file] = "all"
def segment_transcript_to_subtitle_blocks(
    transcript_letters_csv, translated_transcript: List[TranslatedTranscript]
) -> List[Dict[str, List[LetterIdx]]]:
    g = (line.split("\t") for line in data_io.read_lines(str(transcript_letters_csv)))
    raw_letters = [LetterIdx(l, int(i)) for l, i in g]
    assert all(
        raw_letters[k].r_idx > raw_letters[k - 1].r_idx for k in range(1, len(raw_letters))
    )

    subtitles = []
    letters = raw_letters
    for tt in sorted(translated_transcript, key=lambda x: x.order):
        aligned_letters = temporal_align_text_to_letters(tt.text, letters)
        subtitles.append((tt.name, aligned_letters))
        # align the next transcript on the already aligned one; the heuristic is to order
        # by language similarity: native -> spanish -> english -> german
        letters = aligned_letters

    _, first_aligned = subtitles[0]
    named_blocks = [
        cut_block_out_of_transcript(subtitles, s, e)
        for s, e in generate_block_start_ends(first_aligned)
    ]
    return named_blocks
import os
from pprint import pprint
import json

from rouge import Rouge
from util import data_io

#
# with open('./tests/data.json') as f:
#     data = json.load(f)
#
# hyps, refs = map(list, zip(*[[d['hyp'], d['ref']] for d in data]))

rouge = Rouge()
# scores = rouge.get_scores(hyps, refs)
hyps = list(
    data_io.read_lines(
        os.environ['HOME'] + '/hpc/transformers/examples/summarization/bart/cnn_predicted_summaries.txt',
        limit=1000))
refs = list(
    data_io.read_lines(os.environ['HOME'] + '/hpc/data/cnn_dm/test.target', limit=1000))

scores = rouge.get_scores(hyps, refs, avg=True)
pprint(scores)
'''
{'rouge-1': {'f': 0.2923597368088391,
             'p': 0.2430148556164662,
             'r': 0.38675644246961155},
 'rouge-2': {'f': 0.11613720277501925,
             'p': 0.09577817141610762,
             'r': 0.15578198219928469},
 'rouge-l': {'f': 0.28175826380865326,
             'p': 0.23867975605955946,
             ...
'''
import os

from util import data_io

if __name__ == "__main__":
    base_path = os.environ["HOME"] + "/hpc/data/parallel_text_corpora/wmt_en_ro"
    files = [
        f
        for f in os.listdir(base_path)
        if f.endswith(".source") or f.endswith(".target")
    ]

    some_data = "some_data"
    os.makedirs(some_data, exist_ok=True)
    for f in files:
        data_io.write_lines(
            "%s/%s" % (some_data, f),
            data_io.read_lines(base_path + "/%s" % f, limit=1000),
        )
from collections import Counter
from time import time

import dask.bag as db
from dask.distributed import Client, progress
from tqdm import tqdm
from util import data_io

from alignment import calc_aligned_ngram_tuples

if __name__ == "__main__":
    client = Client()
    print(client)

    refs_hyps_dir = "/tmp/train_kenlm_3_089_mp3"
    refs = data_io.read_lines(f"{refs_hyps_dir}/refs.txt.gz")
    hyps = data_io.read_lines(f"{refs_hyps_dir}/hyps.txt.gz")
    refs_hyps = list((ref, hyp) for ref, hyp in zip(refs, hyps))

    tokenize = lambda s: s.split(" ")
    order = 3
    start = time()
    aligned_ngrams = (
        db.from_sequence(refs_hyps, npartitions=4 * 4)
        .map(lambda rh: calc_aligned_ngram_tuples(tokenize(rh[0]), tokenize(rh[1]), order))
        .flatten()
        .map(lambda rh: (" ".join(rh[0]), " ".join(rh[1])))
    )

    def error_rate(ref, hyp_counts: Counter):
        overall_num_errors = sum(v for k, v in hyp_counts.items() if ref != k)
        num_correct = hyp_counts[ref]
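    # Hypothetical continuation sketch (not in the original snippet): count how often each
    # (ref_ngram, hyp_ngram) pair occurs and materialize the result on the dask cluster.
    pair_counts = aligned_ngrams.frequencies().compute()
    print(f"{len(pair_counts)} distinct ngram pairs in {time() - start:.1f}s")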
"--target_file", default=os.environ["HOME"] + "/data/QA/topical-chat/processed_output/test_rare.tgt", type=str, ) parser.add_argument( "--pred_file", default="test_rare.pred", type=str, ) if __name__ == "__main__": args = parser.parse_args() rouge = Rouge() sources = [ " " # beginning with space? see: https://github.com/huggingface/transformers/blob/5ddd8d6531c8c49fdd281b55b93f6c81c9826f4b/examples/summarization/bart/evaluate_cnn.py#L66 + x.rstrip() for x in data_io.read_lines(args.source_file) ] targets = list(data_io.read_lines(args.target_file)) hyps = list( generate_summaries_or_translations( sources, args.model_path, batch_size=8, fp16=True, )) data_io.write_lines(args.pred_file, hyps) pprint(calc_rouge_scores(hyps, targets))
    type=str,
)
parser.add_argument(
    "--target_file",
    default=os.environ["HOME"] + "/gunther/Response-Generation-Baselines/processed_output/test_rare.tgt",
    type=str,
)

def calc_rouge_scores(pred: List[str], tgt: List[str]):
    rouge = Rouge()
    scores = rouge.get_scores(pred, tgt, avg=True)
    scores = {
        "f1-scores": {s: v for s, d in scores.items() for k, v in d.items() if k == "f"},
        "huggingface-rouge": calculate_rouge(pred, tgt),
    }
    return scores

if __name__ == "__main__":
    args = parser.parse_args()
    pred = list(data_io.read_lines(args.pred_file))
    tgt = list(data_io.read_lines(args.target_file))
    pprint(calc_rouge_scores(pred, tgt))
def build_id_text_generator(paths: List[str]):
    for p in paths:
        for file in Path(p).rglob("*.trans.txt"):
            for line in data_io.read_lines(str(file)):
                eid, text = line.split(" ", 1)
                yield eid, text
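# Minimal usage sketch (assumes a LibriSpeech-style layout with *.trans.txt files under
# the placeholder path below, and that os is imported in this module):
if __name__ == "__main__":
    corpus_dir = os.environ["HOME"] + "/data/LibriSpeech/dev-clean"
    for eid, text in build_id_text_generator([corpus_dir]):
        print(eid, text)
        break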