def __init__(self, vocabs, lex_map, filename, batch_size, for_train):
    """Eagerly load and preprocess AMR data from *filename*.

    Args:
        vocabs: dict of vocabularies; 'predictable_concept' is passed to
            the lexical mapper.
        lex_map: lexical mapper providing ``get_concepts(lemma, token, vocab)``.
        filename: path consumed by ``read_file()``, which yields parallel
            (amr, token, lemma, pos, ner) streams.
        batch_size: stored for later batching; not used during loading.
        for_train: when True, AMRs whose root-centered sort reports a
            problem are dropped.
    """
    self.data = []
    for amr, token, lemma, pos, ner in zip(*read_file(filename)):
        if for_train:
            _, _, not_ok = amr.root_centered_sort()
            if not_ok:
                continue
        # HACK: skip one known-bad sentence present in the corpus.
        if ' '.join(token) == "https://www.com.html https://www.com.html </a>":
            continue
        cp_seq, mp_seq, token2idx, idx2token = lex_map.get_concepts(
            lemma, token, vocabs['predictable_concept'])
        # Corrupt datum: POS and token streams must be parallel. Raise with
        # a diagnostic instead of silently exiting the interpreter (the
        # original called a bare exit() with no message).
        if len(pos) != len(token):
            raise ValueError(
                "POS/token length mismatch in %s: %d vs %d"
                % (filename, len(pos), len(token)))
        datum = {'amr': amr, 'tok': token, 'lem': lemma, 'pos': pos,
                 'ner': ner, 'cp_seq': cp_seq, 'mp_seq': mp_seq,
                 'token2idx': token2idx, 'idx2token': idx2token}
        self.data.append(datum)
    print("Get %d AMRs from %s" % (len(self.data), filename))
    self.vocabs = vocabs
    self.batch_size = batch_size
    self.train = for_train
    self.unk_rate = 0.
def __init__(self, vocabs, lex_map, filename, batch_size, for_train):
    """Lazy data loader: raw records are read up front, while per-datum
    concept mapping is deferred (lex_map is kept for later use)."""
    # Resources needed for the deferred preprocessing step.
    self.vocabs = vocabs
    self.lex_map = lex_map
    # Batching / training configuration.
    self.batch_size = batch_size
    self.train = for_train
    self.unk_rate = 0.
    self.nprocessors = 8
    self.record_flag = False
    # Read the raw parallel streams now; heavy work happens at batch time.
    self.data = read_file(filename)
def validate(model, test_data, golden_file, beam_size=8, alpha=0.6, max_time_step=100):
    """For development Only.

    Decode *test_data* with *model* and score the output against the
    reference side of *golden_file* with corpus BLEU and chrF.

    Args:
        model: generation model consumed by ``generate_batch()``.
        test_data: iterable of decodable batches.
        golden_file: path prefix; ``<golden_file>.input_clean`` holds the
            tokenized references, ``<golden_file>.preproc`` the graphs and
            abstraction maps used for post-processing.
        beam_size: beam width for decoding.
        alpha: length-normalization exponent.
        max_time_step: cap on decoding steps.

    Returns:
        Tuple ``(bleu, chrf)`` of corpus-level scores.
    """
    pp = PostProcess()

    # Collect the lowercased reference token streams.
    ref_stream = []
    prefix = '# ::tokens '
    # Use a context manager: the original open() leaked the file handle.
    with open(golden_file + '.input_clean') as ref_file:
        for line in ref_file:
            if line.startswith(prefix):
                o = json.loads(line[len(prefix):].strip())
                ref_stream.append(' '.join(o).lower())

    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(golden_file + '.preproc')
    ref_streams = [ref_stream]

    sys_stream = []
    for batch in test_data:
        res = generate_batch(model, batch, beam_size, alpha, max_time_step)
        sys_stream.extend(res['token'])
    assert len(sys_stream) == len(ref_stream)

    # Restore abstracted entities / graph info in each hypothesis.
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i]) for i, o in enumerate(sys_stream)
    ]

    bleu = sacrebleu.corpus_bleu(sys_stream, ref_streams,
                                 force=True, lowercase=True,
                                 tokenize='none').score
    chrf = sacrebleu.corpus_chrf(sys_stream, ref_stream)
    return bleu, chrf
if __name__ == '__main__':
    import json
    from extract import read_file
    import sacrebleu

    # Parse CLI options; post-processor is configured from them.
    args = parse_config()
    pp = PostProcess(retokenize=args.retokenize, span=args.span, compound_map_file=args.compound_map_file)

    # Collect lowercased reference sentences from '# ::original' lines.
    ref_stream = []
    for line in open(args.golden_file):
        if line.startswith('# ::original '):
            o = json.loads(line[len('# ::original '):].strip())
            ref_stream.append(' '.join(o).lower())
    # gold model output
    graph, gold_sys_stream, _, abstract = read_file(args.golden_file + '.preproc')
    ref_streams = [ref_stream]

    # Collect predicted token sequences from '#model output:' lines.
    pred_sys_stream = []
    for line in open(args.pred_file):
        if line.startswith('#model output:'):
            ans = line[len('#model output:'):].strip().split()
            pred_sys_stream.append(ans)
    # Keep the raw (pre-post-processing) hypotheses for comparison.
    prev = [' '.join(o) for o in pred_sys_stream]

    # choose one (gold or pred) and postprocess
    sys_stream = pred_sys_stream
    sys_stream = [
        pp.post_process(o, abstract[i], graph[i]) for i, o in enumerate(sys_stream)
        # NOTE(review): this snippet is truncated here — the closing ']' of
        # the comprehension (and any scoring that follows) lies outside the
        # visible region of the file.
def setUp(self):
    """Load the shared change-log fixture used by the tests in this case."""
    # NOTE(review): read_file is presumably the loader under test; the
    # fixture path is resolved relative to the test working directory —
    # confirm against the test runner's configuration.
    self.data = read_file('changes_python.txt')
import matplotlib.pyplot as plt
import extract, statistics, csv

DATA_FILE = "data.txt"
OUTPUT_FILE = "processed_data.csv"


def read_data_file(file_name):
    """Parse *file_name* as CSV and return its rows as a list of dicts,
    one per data row, keyed by the header line."""
    with open(file_name, 'r') as csv_file:
        # DictReader is fully materialized before the file closes.
        return list(csv.DictReader(csv_file))


# Convert the raw data file using the project's extract helpers.
data = extract.read_file(DATA_FILE)
extract.write_file(data, OUTPUT_FILE)