def main():
    """Compare a hypothesis splitter table against a reference one.

    Command-line arguments (``sys.argv``):
        1. reference splitter file
        2. hypothesis splitter file

    Every row yielded by ``dlmread`` contributes the integer pair formed
    from its first two fields (presumably line number and word number --
    confirm against the code that writes the table). Duplicated rows
    collapse because the pairs are kept in sets.

    Prints three lines:
        SPLIT-R: matches / number of reference pairs
        SPLIT-P: matches / number of hypothesis pairs
        SPLIT-F: 2 * matches / (reference pairs + hypothesis pairs)

    A ratio whose denominator is zero is reported as 0 rather than raising
    ZeroDivisionError.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalsplit.py \\')
        print(' <in-file: reference splitter>')
        print(' <in-file: hypothesis splitter>')
        return

    fname_ref = sys.argv[1]
    fname_hyp = sys.argv[2]

    sps_ref = {(int(x[0]), int(x[1])) for x in dlmread(fname_ref)}
    sps_hyp = {(int(x[0]), int(x[1])) for x in dlmread(fname_hyp)}

    # Pairs present in both tables.
    match = len(sps_ref & sps_hyp)
    len_r = len(sps_ref)
    len_h = len(sps_hyp)

    # An empty table would otherwise crash the evaluation; report 0 instead.
    recall = match / len_r if len_r else 0
    precision = match / len_h if len_h else 0
    f_measure = 2 * match / (len_r + len_h) if len_r + len_h else 0

    print('SPLIT-R: %f [%d/%d]' % (recall, match, len_r))
    print('SPLIT-P: %f [%d/%d]' % (precision, match, len_h))
    print('SPLIT-F: %f [2*%d/(%d+%d)]' % (f_measure, match, len_r, len_h))
def main():
    """Score a hypothesis splitter table against a reference splitter table.

    Expects exactly two command-line arguments: the reference splitter file
    and the hypothesis splitter file. Both are read with ``dlmread``; the
    first two fields of each row are converted to an ``(int, int)`` pair
    and collected into a set (so repeated rows count once).

    Output (stdout):
        SPLIT-R -- shared pairs divided by the reference set size
        SPLIT-P -- shared pairs divided by the hypothesis set size
        SPLIT-F -- twice the shared pairs divided by the sum of both sizes

    Any ratio with a zero denominator is printed as 0 instead of raising
    ZeroDivisionError.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalsplit.py \\')
        print(' <in-file: reference splitter>')
        print(' <in-file: hypothesis splitter>')
        return

    fname_ref, fname_hyp = sys.argv[1], sys.argv[2]

    sps_ref = {(int(x[0]), int(x[1])) for x in dlmread(fname_ref)}
    sps_hyp = {(int(x[0]), int(x[1])) for x in dlmread(fname_hyp)}

    len_r = len(sps_ref)
    len_h = len(sps_hyp)
    # Set intersection counts the hypothesis pairs that are also reference pairs.
    match = len(sps_hyp & sps_ref)

    # Guard each division: either table may legitimately be empty.
    recall = 0 if len_r == 0 else match / len_r
    precision = 0 if len_h == 0 else match / len_h
    f_measure = 0 if len_r + len_h == 0 else 2 * match / (len_r + len_h)

    print('SPLIT-R: %f [%d/%d]' % (recall, match, len_r))
    print('SPLIT-P: %f [%d/%d]' % (precision, match, len_h))
    print('SPLIT-F: %f [2*%d/(%d+%d)]' % (f_measure, match, len_r, len_h))
def load(fnames):
    """Load the input corpus, the part-id -> part-MT table and the split table.

    Args:
        fnames: mapping providing the file names under the keys
            'IN', 'PID', 'PMT' and 'SP'.

    Returns:
        A tuple ``(corpus_in, tab_pid_pmt, tab_sp)`` where
        * ``corpus_in`` is the list of rows yielded by
          ``dlmread(fnames['IN'], ' ')``;
        * ``tab_pid_pmt`` maps a tuple of ints (one line of the PID file)
          to the space-split tokens of the line at the same position in
          the PMT file;
        * ``tab_sp`` is a ``defaultdict`` mapping a line number to the list
          of word numbers read from the SP file.

    Progress and table sizes are printed to stdout.
    """
    print('--------')
    print('loading data ...')

    corpus_in = list(dlmread(fnames['IN'], ' '))

    # PID and PMT are parallel files: line k of one belongs to line k of the other.
    tab_pid_pmt = {}
    with \
            codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        for id_line, mt_line in zip(fp_pid, fp_pmt):
            part_id = tuple(map(int, id_line.strip().split(' ')))
            tab_pid_pmt[part_id] = mt_line.strip().split(' ')

    # Each SP line holds exactly two integers: line number, then word number.
    tab_sp = defaultdict(list)
    with codecs.open(fnames['SP'], 'r', 'utf-8') as fp:
        for row in fp:
            lineno, wordno = map(int, row.strip().split(' '))
            tab_sp[lineno].append(wordno)

    n_splits = sum(len(words) for words in tab_sp.values())
    print(' # of input sentence : %d' % len(corpus_in))
    print(' # of part id-mt table : %d' % len(tab_pid_pmt))
    print(' # of splitting table : %d' % n_splits)

    return corpus_in, tab_pid_pmt, tab_sp
def main():
    """Convert a LIBLINEAR prediction file into a splitter table.

    Command-line arguments (``sys.argv``):
        1. LIBLINEAR prediction file (one integer label per line)
        2. input corpus (read with ``dlmread(..., ' ')``)
        3. output splitter table

    Predictions are consumed in order and mapped onto (line, position)
    slots of the corpus. Empty corpus rows are skipped, and the last
    position of every row has no slot. For each prediction equal to 1 the
    line ``"<line> <position>"`` is written to the output file.
    """
    if len(sys.argv) != 4:
        print('USAGE: python3 predicttosplit.py \\')
        print(' <in-file: LIBLINEAR prediction> \\')
        print(' <in-file: input corpus> \\')
        print(' <out-file: splitter table>')
        return

    _, fname_predict, fname_in, fname_splitter = sys.argv

    corpus_in = list(dlmread(fname_in, ' '))

    with \
            codecs.open(fname_predict, 'r', 'utf-8') as fp_pred, \
            codecs.open(fname_splitter, 'w', 'utf-8') as fp_sp:
        sent_idx = 0
        word_idx = 0
        for label in fp_pred:
            # Advance to the next sentence that still has an unused slot:
            # empty sentences have none, and the final word never gets one.
            while True:
                sentence = corpus_in[sent_idx]
                if sentence and word_idx != len(sentence) - 1:
                    break
                sent_idx += 1
                word_idx = 0

            if int(label.strip()) == 1:
                fp_sp.write('%d %d\n' % (sent_idx, word_idx))
            word_idx += 1
def main():
    """Write a splitter table by looking up adjacent POS pairs in a model.

    Command-line arguments (``sys.argv``):
        1. GreedyDP model file
        2. input corpus whose tokens have the form ``word_POS``
        3. output splitter table
        4. float passed to ``loadmodel`` as ``mu`` (the usage text calls it
           the mean of #words)

    For sentence ``i`` and every pair of adjacent tokens ``j``, ``j + 1``,
    the line ``"<i> <j>"`` is written when the pair of their second
    ``_``-separated fields is contained in the model.

    Returns:
        ``None`` on success or after printing the usage text; ``1`` when
        ``loadmodel`` raises.
    """
    if len(sys.argv) != 5:
        print('USAGE: python3 gdptosplit.py')
        print(' <[1] in-file: GreedyDP model>')
        print(' <[2] in-file: input corpus with POS>')
        print(' <[3] out-file: splitter table>')
        print(' <[4] float: mean of #words>')
        return

    fname_model = sys.argv[1]
    fname_in = sys.argv[2]
    fname_sp = sys.argv[3]
    mu = float(sys.argv[4])

    try:
        model = loadmodel(fname_model, mu)
    except Exception as ex:
        # NOTE(review): every failure of loadmodel is reported as "mu is too
        # small"; confirm which exceptions it can actually raise.
        # The message is newline-terminated so it does not run into whatever
        # is printed next on the terminal.
        sys.stderr.write('ERROR: mu is too small %s\n' % ex)
        return 1

    print(model)

    corpus_in = [
        [tuple(x.split('_')) for x in inp]
        for inp in dlmread(fname_in, ' ')
    ]

    with open(fname_sp, 'w', encoding='utf-8') as fp:
        for i, inp in enumerate(corpus_in):
            # Walk adjacent token pairs; j is the index of the left token.
            for j, (left, right) in enumerate(zip(inp, inp[1:])):
                if (left[1], right[1]) in model:
                    fp.write('%d %d\n' % (i, j))
def main():
    """Turn LIBLINEAR predictions into "<line> <position>" splitter rows.

    Command-line arguments (``sys.argv``):
        1. LIBLINEAR prediction file, one integer label per line
        2. input corpus, read through ``dlmread`` with a space delimiter
        3. splitter table to write

    The k-th prediction belongs to the k-th slot of the corpus, where the
    slots of a row are all of its positions except the last one; empty rows
    contribute no slots. A row is written for every prediction equal to 1.
    """
    if len(sys.argv) != 4:
        print('USAGE: python3 predicttosplit.py \\')
        print(' <in-file: LIBLINEAR prediction> \\')
        print(' <in-file: input corpus> \\')
        print(' <out-file: splitter table>')
        return

    fname_predict, fname_in, fname_splitter = sys.argv[1:4]

    corpus_in = list(dlmread(fname_in, ' '))

    with \
            codecs.open(fname_predict, 'r', 'utf-8') as fp_pred, \
            codecs.open(fname_splitter, 'w', 'utf-8') as fp_sp:
        row_no = 0
        col_no = 0
        for prediction in fp_pred:
            # Skip rows that are empty or whose slots are all used up
            # (the last position of a row is never a slot).
            while True:
                row = corpus_in[row_no]
                if row and col_no != len(row) - 1:
                    break
                row_no += 1
                col_no = 0

            status = int(prediction.strip())
            if status == 1:
                fp_sp.write('%d %d\n' % (row_no, col_no))
            col_no += 1
def main():
    """Evaluate LIBLINEAR predictions against the labels of its input file.

    Command-line arguments (``sys.argv``):
        1. LIBLINEAR input file (reference labels)
        2. LIBLINEAR prediction result (hypothesis labels)

    Both files are read with ``dlmread`` and paired row by row; the first
    field of each row is taken as an integer label, and label 1 is the
    class being scored.

    Prints LIBLIN-R (matches / reference count), LIBLIN-P (matches /
    hypothesis count) and LIBLIN-F (2 * matches / sum of both counts);
    a ratio with a zero denominator is printed as 0.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalliblin.py \\')
        print(' <in-file: LIBLINEAR input file>')
        print(' <in-file: LIBLINEAR prediction result>')
        return

    _, fname_ref, fname_hyp = sys.argv

    sp_ref = 0    # rows whose reference label is 1
    sp_hyp = 0    # rows whose predicted label is 1
    sp_match = 0  # rows where both labels are 1

    for ls_ref, ls_hyp in zip(dlmread(fname_ref), dlmread(fname_hyp)):
        ref_is_split = int(ls_ref[0]) == 1
        hyp_is_split = int(ls_hyp[0]) == 1
        if ref_is_split:
            sp_ref += 1
        if hyp_is_split:
            sp_hyp += 1
        if ref_is_split and hyp_is_split:
            sp_match += 1

    recall = sp_match / sp_ref if sp_ref != 0 else 0
    precision = sp_match / sp_hyp if sp_hyp != 0 else 0
    total = sp_ref + sp_hyp
    f_measure = 2 * sp_match / total if total != 0 else 0

    print('LIBLIN-R: %f [%d/%d]' % (recall, sp_match, sp_ref))
    print('LIBLIN-P: %f [%d/%d]' % (precision, sp_match, sp_hyp))
    print('LIBLIN-F: %f [2*%d/(%d+%d)]' % (f_measure, sp_match, sp_ref, sp_hyp))
def load(fnames):
    """Load the input corpus, the reference corpus and the part-id -> MT table.

    Args:
        fnames: mapping providing the file names under the keys
            'IN', 'REF', 'PID' and 'PMT'.

    Returns:
        A tuple ``(corpus_in, corpus_ref, tab_pid_pmt)`` where
        * ``corpus_in`` / ``corpus_ref`` are the lists of rows yielded by
          ``dlmread`` (space-delimited) for the 'IN' and 'REF' files;
        * ``tab_pid_pmt`` maps a tuple of ints (one line of the PID file)
          to the space-split tokens of the line at the same position in
          the PMT file.

    Progress and the size of each loaded object are printed to stdout.
    """
    print('--------')
    print('loading data ...')

    corpus_in = list(dlmread(fnames['IN'], ' '))
    corpus_ref = list(dlmread(fnames['REF'], ' '))

    # PID and PMT are parallel files: line k of one belongs to line k of the other.
    tab_pid_pmt = {}
    with \
            codecs.open(fnames['PID'], 'r', 'utf-8') as fp_pid, \
            codecs.open(fnames['PMT'], 'r', 'utf-8') as fp_pmt:
        for lid, lmt in zip(fp_pid, fp_pmt):
            key = tuple(int(x) for x in lid.strip().split(' '))
            tab_pid_pmt[key] = lmt.strip().split(' ')

    print(' # of input sentence : %d' % len(corpus_in))
    # The label below used to be misspelled "reterence".
    print(' # of reference sentence : %d' % len(corpus_ref))
    print(' # of part id-mt table : %d' % len(tab_pid_pmt))

    return corpus_in, corpus_ref, tab_pid_pmt
def main():
    """Print recall, precision and F-measure of LIBLINEAR predictions.

    Command-line arguments (``sys.argv``):
        1. LIBLINEAR input file, whose first field per row is the
           reference label
        2. LIBLINEAR prediction result, whose first field per row is the
           predicted label

    Rows of the two files (as yielded by ``dlmread``) are compared pairwise
    and only label 1 is scored. Ratios with a zero denominator are printed
    as 0.
    """
    if len(sys.argv) != 3:
        print('USAGE: python3 evalliblin.py \\')
        print(' <in-file: LIBLINEAR input file>')
        print(' <in-file: LIBLINEAR prediction result>')
        return

    fname_ref, fname_hyp = sys.argv[1:3]

    # Counts of label-1 rows in the reference, in the hypothesis, and in both.
    sp_ref = sp_hyp = sp_match = 0

    for ls_ref, ls_hyp in zip(dlmread(fname_ref), dlmread(fname_hyp)):
        ref = int(ls_ref[0])
        hyp = int(ls_hyp[0])
        sp_ref += 1 if ref == 1 else 0
        sp_hyp += 1 if hyp == 1 else 0
        sp_match += 1 if ref == 1 and hyp == 1 else 0

    if sp_ref == 0:
        recall = 0
    else:
        recall = sp_match / sp_ref

    if sp_hyp == 0:
        precision = 0
    else:
        precision = sp_match / sp_hyp

    if sp_ref + sp_hyp == 0:
        f_measure = 0
    else:
        f_measure = 2 * sp_match / (sp_ref + sp_hyp)

    print('LIBLIN-R: %f [%d/%d]' % (recall, sp_match, sp_ref))
    print('LIBLIN-P: %f [%d/%d]' % (precision, sp_match, sp_hyp))
    print('LIBLIN-F: %f [2*%d/(%d+%d)]' % (f_measure, sp_match, sp_ref, sp_hyp))
def main():
    """Generate LIBLINEAR input data from a POS-tagged corpus and a split table.

    Command-line arguments (``sys.argv``):
        1. mode: "dev" or "test"
        2. input sentences whose tokens have the form ``word_POS``
        3. splitter table ("<line> <word>" per row)
        4. feature ID table -- written in "dev" mode, read in "test" mode
        5. LIBLINEAR input data to write

    One output row is produced for every word of every sentence except the
    last word. The row's label is 1 when the word index is listed for that
    sentence in the splitter table, otherwise 2, followed by the sorted
    feature IDs as ``id:1``. Features are word and POS uni-, bi- and
    trigrams over the window of relative offsets -1 .. +2.
    """
    if len(sys.argv) != 6:
        print('USAGE: python3 makeliblin_greedy.py \\')
        print(' <str: mode ("dev" or "test")>')
        print(' <in-file: input sentence with POS> \\')
        print(' <in-file: splitter table> \\')
        print(' <(dev)out-file, (test)in-file: feature ID table> \\')
        print(' <out-file: LIBLINEAR input data>')
        return

    _, mode, fname_pos, fname_splitter, fname_fid, fname_liblin = sys.argv

    if mode not in ['dev', 'test']:
        sys.stderr.write('ERROR: unknown mode.\n')
        return

    # load word and pos: every token becomes the list of its '_'-separated fields
    corpus_in_pos = [
        [token.split('_') for token in sentence]
        for sentence in dlmread(fname_pos, ' ')
    ]

    # load splitter: sentence index -> word indices listed in the table
    tab_sp = defaultdict(list)
    with open(fname_splitter, 'r', encoding='utf-8') as fp:
        for row in fp:
            lineno, wordno = map(int, row.strip().split(' '))
            tab_sp[lineno].append(wordno)

    # load or new feature id table; an unseen key gets the next free ID
    fid = defaultdict(lambda: len(fid) + 1)
    if mode == 'test':
        with open(fname_fid, 'r', encoding='utf-8') as fp:
            for row in fp:
                fields = row.split()
                fid[fields[0]] = int(fields[1])

    def offset_label(k):
        # Relative offsets are spelled '-1', '0', '+1', '+2' in feature names.
        return '0' if k == 0 else '%+d' % k

    # Feature templates as (name prefix, field index, relative offsets).
    # Their order is significant: feature IDs are handed out on first use,
    # so it must stay: word unigrams, POS unigrams, word bigrams,
    # POS bigrams, word trigrams, POS trigrams.
    # Offset -2 is deliberately unused (those features were disabled in the
    # original code), although two padding tokens exist on the left.
    templates = []
    for width in (1, 2, 3):
        for tag, field in (('WORD', 0), ('POS', 1)):
            for first in range(-1, 4 - width):
                last = first + width - 1
                if width == 1:
                    span = offset_label(first)
                else:
                    span = offset_label(first) + ':' + offset_label(last)
                templates.append(
                    ('%s[%s]=' % (tag, span), field, range(first, last + 1)))

    left_pad = [['<s>', '<s>']] * 2
    right_pad = [['</s>', '</s>']] * 2

    # make/save training data
    with open(fname_liblin, 'w', encoding='utf-8') as fp:
        for i, sentence in enumerate(corpus_in_pos):
            data = left_pad + sentence + right_pad
            # ignore end of sentence: the last word gets no row
            for j in range(len(sentence) - 1):
                jj = j + 2  # index of word j inside the padded sentence
                features = []
                for prefix, field, offsets in templates:
                    value = '_'.join(data[jj + k][field] for k in offsets)
                    addfeature(features, fid, prefix + value, mode)

                label = '1 ' if j in tab_sp[i] else '2 '
                body = ' '.join('%d:1' % f for f in sorted(features))
                fp.write(label + body + '\n')

    # save feature id table
    if mode == 'dev':
        with open(fname_fid, 'w', encoding='utf-8') as fp:
            for name, number in fid.items():
                fp.write('%s\t%d\n' % (name, number))