### Load annotations
all_anno = dict()
for i, pair in enumerate(g_n()):
    n, anno = pair
    all_anno[n] = anno
    print('Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n), end='')
    sys.stdout.flush()
print('\nAnnotations loaded')

#~ for a in all_anno.values():
#~     print(len(a.units))
#~     print(len(set(u.id for u in a.units if u.type=='Commitment')))
#~ sys.exit()

### Train on data !
c_t = cs.TabData(ffinal)
c_t.new_class('is_commitment')
#~ end_c = cs.Trainer(c_t, 10, 'dialogue', learner='logreg')
end_c = cs.Trainer(c_t, 10, 'dialogue')
end_c.evaluate()

### Create reports...
fnl, fpl = [], []
for pred, row in end_c.pred_rows():
    # False negative case
    if pred.value == 'False' and row.getclass().value == 'True':
        fnl.append(row)
    # False positive case
    if pred.value == 'True' and row.getclass().value == 'False':
        fpl.append(row)
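# A minimal sketch of one way to turn the fnl/fpl lists above into summary
# numbers; it assumes end_c.pred_rows() can be iterated a second time, and
# is not part of the original script.
tp = sum(1 for pred, row in end_c.pred_rows()
         if pred.value == 'True' and row.getclass().value == 'True')
fp, fn = len(fpl), len(fnl)
prec = float(tp) / (tp + fp) if tp + fp else 0.0
rec = float(tp) / (tp + fn) if tp + fn else 0.0
print('TP={0} FP={1} FN={2} precision={3:.3f} recall={4:.3f}'.format(
    tp, fp, fn, prec, rec))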
all_anno = dict()
for i, pair in enumerate(g_n()):
    n, anno = pair
    all_anno[n] = anno
    print('Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n), end='')
    sys.stdout.flush()
print('\nAnnotations loaded')


def nsplit(id):
    """ Returns (game section name, local suffix) parts of an id """
    l = id.split('_')
    return tuple('_'.join(pl) for pl in (l[:2], l[-2:]))


ques = defaultdict(list)
x_t = cs.TabData(fxqap)
for row in x_t:
    gi, si = nsplit(row['q_id'].value)
    if gi in all_anno:
        ques[nsplit(row['a_id'].value)].append((gi, si))
#~ print(ques)
#~ sys.exit()

c_t = cs.TabData(ffinal)
c_t.new_class('is_commitment')
#~ end_c = cs.Trainer(c_t, 10, 'dialogue', learner='logreg')
end_c = cs.Trainer(c_t, 10, 'dialogue')
end_c.evaluate()

tot_ok = 0
tot_in = 0
    break

# Without counter
#~ all_anno = dict(g_n())
# With counter
all_anno = dict()
for i, pair in enumerate(g_n()):
    n, anno = pair
    all_anno[n] = anno
    print('Loading ({0}/{1}) : {2}\r'.format(i + 1, oracle_nc, n), end='')
    sys.stdout.flush()
print('\nAnnotations loaded')

#~ e_t = cs.TabData(fsing)
e_t = cs.TabData(fnsing)


def gsname(row):
    """ Returns game section name from id """
    return '_'.join(row['id'].value.split('_', 2)[:2])


snames = set(map(gsname, e_t))
# Only keep rows with Commitment annotations
e_t.sel_row_by(lambda r: gsname(r) in all_anno)

######### Stat
#~ count = defaultdict(lambda:0)
#~ for n, a in all_anno.items():
#~     if n not in snames:
#~         continue
#~     print(n)
# Quicker script with already-built data !
from __future__ import print_function

import sys
import os

import annodata as ad
import classify as cs

from collections import defaultdict

fcomm = '/home/arthur/These/Data/socl-season1.custom-edus.tab'
fmerge = '/home/arthur/These/Data/socl-season1.merged.tab'
fturns = '/home/arthur/These/Data/socl-season1.turns2.tab'
fqap = '/home/arthur/These/Data/socl-season1.qap.tab'
ffinal = '/home/arthur/These/Data/socl-season1.final.tab'

c_t = cs.TabData(fmerge)
pc_t = cs.TabData(fqap)
c_t.merge(pc_t)
c_t.fuse_rows('turn_id')
c_t.save(ffinal)

# Number of distinct truth values seen in 'is_commitment' (2 if both classes are present)
print(len(set(row['is_commitment'].value == 'True' for row in c_t)))
#~ sys.exit()

#~ ddd = defaultdict(list)
#~ for row in c_t:
#~     ddd[row['dialogue'].value].append(row['turn_id'].value)
#~ count = defaultdict(int)
#~ for k,v in ddd.items():
#~     count[len(v)] += 1
#~
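# A compact, runnable equivalent of the commented-out statistic above
# (how many dialogues contain a given number of fused rows); it only uses
# columns already referenced in this script, and is a sketch rather than
# part of the original code.
from collections import Counter

rows_per_dialogue = Counter(row['dialogue'].value for row in c_t)
size_hist = Counter(rows_per_dialogue.values())
for size, freq in sorted(size_hist.items()):
    print('{0} row(s): {1} dialogue(s)'.format(size, freq))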
    'position_in_dialogue_DU1', 'position_in_game_DU1',
    'edu_position_in_turn_DU1', 'has_correction_star_DU1',
    'ends_with_bang_DU1', 'ends_with_qmark_DU1',
    'has_FOR_np_DU1', 'is_question_DU1',
    'num_tokens_DU2', 'has_player_name_exact_DU2',
    'has_player_name_fuzzy_DU2', 'has_emoticons_DU2',
    'is_emoticon_only_DU2', 'speaker_started_the_dialogue_DU2',
    'speaker_already_spoken_in_dialogue_DU2',
    'speakers_first_turn_in_dialogue_DU2', 'turn_follows_gap_DU2',
    'position_in_dialogue_DU2', 'position_in_game_DU2',
    'edu_position_in_turn_DU2', 'has_correction_star_DU2',
    'ends_with_bang_DU2', 'ends_with_qmark_DU2',
    'has_FOR_np_DU2', 'is_question_DU2'
]
meta_sel = ['dialogue', 'id_DU1', 'id_DU2']

t_r = cs.TabData(fpairs)
#~ t_r.sel_row({'CLASS':'UNRELATED'}, negate=1)
t_r.sel_col(feat_sel, meta_sel, 'CLASS')
#~ t_r.save('res/cut.tab')

#~ c_r = cs.Trainer(t_r, grouper='dialogue')
c_r = cs.Trainer(t_r, learner='logreg', grouper='dialogue')
c_r.evaluate()

sys.exit()

with open('../res/gpred.tab', 'w') as f:
    for pred, row in c_r.pred_rows():
        line = '\t'.join([
            k.value for k in (pred, row.getclass(), row['id_DU1'], row['id_DU2'])
        ])
    'speaker_already_spoken_in_dialogue_DU2',
    'speakers_first_turn_in_dialogue_DU2', 'turn_follows_gap_DU2',
    'position_in_dialogue_DU2', 'position_in_game_DU2',
    'edu_position_in_turn_DU2', 'has_correction_star_DU2',
    'ends_with_bang_DU2', 'ends_with_qmark_DU2',
    'lemma_subject_DU2', 'has_FOR_np_DU2', 'is_question_DU2'
]
meta_sel = ['dialogue']

step_size = 10
if len(sys.argv) >= 2:
    step_size = int(sys.argv[1])

# Step 1 : master data table
if False:
    t_full = cs.TabData(fpairs)
    t_full.sel_col(feat_sel, meta_sel, 'CLASS')
    t_full.save(fsrc)

# Step 2 : set of all dialogues
t_master = cs.TabData(fsrc)
dials = list(set(l['dialogue'].value for l in t_master))
random.shuffle(dials)
print('Data loaded')

# Step 3 : the curve loop
all_scores = list()
n = len(dials)
n_steps = int(n / step_size)
for m in range(n_steps):
    t_size = step_size * (m + 1)
# Merging custom and attelo sources
# For great justice
# Python 2
import classify as cs

fcustom = '../res/custom.tab'
fmerge = '../res/merge.tab'
frel = '/home/arthur/These/Master/Stac/data/SNAPSHOTS/2014-06-04/socl-season1.relations.csv'

t_c, t_r = (cs.TabData(f) for f in (fcustom, frel))
t_c.newmerge(t_r, ('id_DU1', 'id_DU2'))
t_c.save(fmerge)