def get_instance(ti_line):
    """Parse one JSON-serialized TrainingInstance line and collect tokens.

    Returns (ti, obs_flat, guesses_flat):
      obs_flat     -- whitespace tokens of non-empty German ('de') words in
                      the current sentence; each token is also registered
                      via add_to_obs.
      guesses_flat -- tokens pooled from the current guesses, the English
                      ('en') words of the current sentence, past correct
                      guesses, and past guesses for the current sentence.
    """
    ti = TrainingInstance.from_dict(json.loads(ti_line))
    observations = [tok.l2_word for tok in ti.current_sent
                    if tok.lang == 'de' and tok.l2_word.strip() != '']
    pooled = [g.guess for g in ti.current_guesses if g.guess.strip() != '']
    # English words of the current sentence count as guesses too.  # wink! ;)
    pooled += [tok.l2_word for tok in ti.current_sent
               if tok.lang == 'en' and tok.l2_word.strip() != '']
    pooled += [g.guess for g in ti.past_correct_guesses
               if g.guess.strip() != '']
    pooled += [g.guess for g in ti.past_guesses_for_current_sent
               if g.guess.strip() != '']
    # Split multi-word entries on whitespace and flatten.
    guesses_flat = [w for entry in pooled for w in entry.split()]
    obs_flat = [w for entry in observations for w in entry.split()]
    for of in obs_flat:
        add_to_obs(of)
    return ti, obs_flat, guesses_flat
def batch_predictions(training_instance, theta_en_en_names, theta_en_de_names, theta_en_en, theta_en_de, phi_wapper, lr, en_domain, de2id, en2id, d2t, qp=False): j_ti = json.loads(training_instance) ti = TrainingInstance.from_dict(j_ti) sent_id = ti.current_sent[0].sent_id fg = create_factor_graph(ti=ti, learning_rate=lr, theta_en_en_names=theta_en_en_names, theta_en_de_names=theta_en_de_names, theta_en_de=theta_en_de, theta_en_en=theta_en_en, phi_wrapper=phi_wapper, en_domain=en_domain, de2id=de2id, en2id=en2id, d2t=d2t) fg.initialize() fg.treelike_inference(3) p = fg.get_posterior_probs() if qp: factor_dist = None fgs = None else: fgs = '\n'.join(['*SENT_ID:' + str(sent_id)] + fg.to_string()) factor_dist = fg.to_dist() p0,p25, p50, t = fg.get_precision_counts() return [p, fgs, factor_dist, (p0, p25, p50,t)]
def batch_predictions(training_instance, f_en_en_theta, f_en_de_theta, adapt_phi_en_en, adapt_phi_en_de, lr, en_domain, de2id, en2id, basic_f_en_en, basic_f_en_de, domain2theta): j_ti = json.loads(training_instance) ti = TrainingInstance.from_dict(j_ti) sent_id = ti.current_sent[0].sent_id fg = create_factor_graph(ti=ti, learning_rate=lr, theta_en_de=f_en_de_theta, theta_en_en=f_en_en_theta, phi_en_en=adapt_phi_en_en, phi_en_de=adapt_phi_en_de, basic_f_en_en=basic_f_en_en, basic_f_en_de=basic_f_en_de, en_domain=en_domain, de2id=de2id, en2id=en2id, domain2theta=domain2theta) fg.initialize() fg.treelike_inference(3) return fg.get_posterior_probs()
def batch_sgd(training_instance, theta_en_en, theta_en_de, phi_en_en,
              phi_en_de, lr, en_domain, de2id, en2id, basic_f_en_en,
              basic_f_en_de, domain2theta):
    """One gradient step for a single serialized TrainingInstance.

    Builds a factor graph, runs tree-like inference (3 rounds), then
    regularizes the gradients.  Returns
    [sent_id, g_en_en, g_en_de, sample_ag] where sample_ag maps each
    active (factor_type, domain) pair to its adapted, lightly-regularized
    gradient.
    """
    inst = TrainingInstance.from_dict(json.loads(training_instance))
    sent_id = inst.current_sent[0].sent_id
    fg = create_factor_graph(ti=inst, learning_rate=lr,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_en_en=phi_en_en,
                             phi_en_de=phi_en_de,
                             basic_f_en_en=basic_f_en_en,
                             basic_f_en_de=basic_f_en_de,
                             en_domain=en_domain,
                             de2id=de2id, en2id=en2id,
                             domain2theta=domain2theta)
    fg.initialize()
    fg.treelike_inference(3)
    g_en_en, g_en_de = fg.get_unregularized_gradeint()  # (sic: API name)
    sample_ag = {}
    for f_type, dom in fg.active_domains:
        grad = (g_en_en if f_type == 'en_en' else g_en_de).copy()
        r = fg.regularization_param
        l = fg.learning_rate
        # adapted per-domain theta gets a much smaller regularization term
        sample_ag[f_type, dom] = apply_regularization(
            r * 0.001, grad, l, domain2theta[f_type, dom])
    g_en_en = apply_regularization(r, g_en_en, l, fg.theta_en_en)
    g_en_de = apply_regularization(r, g_en_de, l, fg.theta_en_de)
    return [sent_id, g_en_en, g_en_de, sample_ag]
def get_instance(ti_line):
    """Deserialize one TrainingInstance JSON line.

    Returns (ti, obs_flat, guesses_flat): obs_flat holds the whitespace
    tokens of non-empty German words in the current sentence (each also
    registered via add_to_obs); guesses_flat pools tokens from the current
    guesses, the English words of the current sentence, past correct
    guesses, and past guesses for the current sentence.
    """
    ti = TrainingInstance.from_dict(json.loads(ti_line))

    def keep(text):
        # drop entries that are empty after stripping whitespace
        return text.strip() != ''

    obs = []
    for tok in ti.current_sent:
        if tok.lang == 'de' and keep(tok.l2_word):
            obs.append(tok.l2_word)
    guesses = []
    for g in ti.current_guesses:
        if keep(g.guess):
            guesses.append(g.guess)
    for tok in ti.current_sent:  # wink! ;)
        if tok.lang == 'en' and keep(tok.l2_word):
            guesses.append(tok.l2_word)
    for g in ti.past_correct_guesses:
        if keep(g.guess):
            guesses.append(g.guess)
    for g in ti.past_guesses_for_current_sent:
        if keep(g.guess):
            guesses.append(g.guess)
    guesses_flat = []
    for entry in guesses:
        guesses_flat.extend(entry.split())
    obs_flat = []
    for entry in obs:
        obs_flat.extend(entry.split())
    for of in obs_flat:
        add_to_obs(of)
    return ti, obs_flat, guesses_flat
def batch_sgd(training_instance, theta_en_en_names, theta_en_de_names,
              theta_en_en, theta_en_de, phi_wrapper, lr, en_domain, de2id,
              en2id, d2t):
    """One SGD step over a single serialized TrainingInstance.

    Builds a factor graph, runs tree-like inference (3 rounds), and returns
    [sent_id, posterior_probs, g_en_en, g_en_de, sample_ag].  When user or
    experience adaptation is enabled (read from the module-level `options`
    object), gradients are taken unregularized and then regularized here,
    and sample_ag maps each active (factor_type, domain) pair to its
    adapted gradient regularized with a scaled (typically smaller) term;
    otherwise sample_ag is None and fg.return_gradient() is used.
    """
    j_ti = json.loads(training_instance)
    ti = TrainingInstance.from_dict(j_ti)
    sent_id = ti.current_sent[0].sent_id
    fg = create_factor_graph(ti=ti, learning_rate=lr,
                             theta_en_en_names=theta_en_en_names,
                             theta_en_de_names=theta_en_de_names,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_wrapper=phi_wrapper,
                             en_domain=en_domain,
                             de2id=de2id, en2id=en2id, d2t=d2t)
    fg.initialize()
    fg.treelike_inference(3)
    if options.user_adapt or options.experience_adapt:
        g_en_en, g_en_de = fg.get_unregularized_gradeint()  # (sic: API name)
        # Hoisted out of the loop: the originals were first assigned inside
        # the loop body yet used after it, raising NameError whenever
        # fg.active_domains was empty.
        r = fg.regularization_param
        l = fg.learning_rate
        scale_reg = float(options.reg_param_ua_scale)
        sample_ag = {}
        for f_type, d in fg.active_domains:
            g = g_en_en.copy() if f_type == 'en_en' else g_en_de.copy()
            # BUG FIX: was `domain2theta[f_type, d]`, but this function only
            # receives the domain->theta map as `d2t` (no `domain2theta` in
            # its scope), so this path raised NameError.  (If a module-level
            # `domain2theta` global was intended instead, revert — TODO
            # confirm against callers.)
            t = d2t[f_type, d]
            # use a scaled (smaller) regularization term for the adapted thetas
            sample_ag[f_type, d] = apply_regularization(r * scale_reg, g, l, t)
        g_en_en = apply_regularization(r, g_en_en, l, fg.theta_en_en)
        g_en_de = apply_regularization(r, g_en_de, l, fg.theta_en_de)
    else:
        sample_ag = None
        g_en_en, g_en_de = fg.return_gradient()
    fg.display_timing_info()
    p = fg.get_posterior_probs()
    return [sent_id, p, g_en_en, g_en_de, sample_ag]
def batch_sgd(training_instance, theta_en_en, theta_en_de, phi_en_en,
              phi_en_de, lr, en_domain, de2id, en2id, basic_f_en_en,
              basic_f_en_de, domain2theta):
    """Compute regularized gradients for one serialized TrainingInstance.

    Builds a factor graph, runs tree-like inference (3 rounds), and returns
    [sent_id, g_en_en, g_en_de, sample_ag], where sample_ag holds the
    adapted gradient for every active (factor_type, domain) pair.
    """
    ti = TrainingInstance.from_dict(json.loads(training_instance))
    sid = ti.current_sent[0].sent_id
    fg = create_factor_graph(ti=ti, learning_rate=lr,
                             theta_en_de=theta_en_de,
                             theta_en_en=theta_en_en,
                             phi_en_en=phi_en_en,
                             phi_en_de=phi_en_de,
                             basic_f_en_en=basic_f_en_en,
                             basic_f_en_de=basic_f_en_de,
                             en_domain=en_domain,
                             de2id=de2id, en2id=en2id,
                             domain2theta=domain2theta)
    fg.initialize()
    fg.treelike_inference(3)
    grad_en_en, grad_en_de = fg.get_unregularized_gradeint()  # (sic: API name)
    adapted = {}
    for key in fg.active_domains:
        f_type, u = key
        raw = (grad_en_en if f_type == 'en_en' else grad_en_de).copy()
        r = fg.regularization_param
        l = fg.learning_rate
        # adapted per-domain theta gets a much smaller regularization term
        adapted[key] = apply_regularization(r * 0.001, raw, l,
                                            domain2theta[key])
    grad_en_en = apply_regularization(r, grad_en_en, l, fg.theta_en_en)
    grad_en_de = apply_regularization(r, grad_en_de, l, fg.theta_en_de)
    return [sid, grad_en_en, grad_en_de, adapted]
# pre_fire_en_en = sparse.csr_matrix(pre_fire_en_en) f_en_de_theta = np.zeros((1, len(f_en_de))) phi_en_de = np.random.rand(len(en_domain) * len(de_domain), len(f_en_de)) phi_en_de[phi_en_de > 0.5] = 1.0 phi_en_de[phi_en_de < 0.2] = 0.0 load_times = [] grad_times = [] inference_times = [] mp_times = [] fg = None split_ratio = int(len(training_instances) * 0.33) test_instances = training_instances[:split_ratio] all_training_instances = training_instances[split_ratio:] lr = 0.1 for t_idx, training_instance in enumerate(training_instances): print t_idx j_ti = json.loads(training_instance) ti = TrainingInstance.from_dict(j_ti) lt = time.time() fg = FactorGraph( theta_en_en=f_en_en_theta if fg is None else fg.theta_en_en, theta_en_de=f_en_de_theta if fg is None else fg.theta_en_de, phi_en_en=phi_en_en, phi_en_de=phi_en_de) fg.learning_rate = lr fg = load_fg(fg, ti, en_domain, de2id=de2id, en2id=en2id) print 'done checking', len(training_instances)
tag_list = list(set(to.split('\n')[0].split())) obs_list = list(set(to.split('\n')[1].split())) fac_cell_2feat = {} feat2id = {} for features_fired_in_factor in factors.strip().split('FACTOR:'): feature_lines = features_fired_in_factor.strip().split('\n') fac_type = feature_lines[0].strip() for fl in feature_lines[1:]: items = fl.split() label1, label2 = items[0], items[1] fac_cell_2feat[fac_type, label1, label2] = [(f_name, 1.0) for f_name in items[2:]] for f_fired in items[2:]: feat2id[f_fired] = feat2id.get(f_fired, len(feat2id)) return tag_list, obs_list, fac_cell_2feat, feat2id if __name__ == '__main__': opt = OptionParser() # insert options here opt.add_option('--ti', dest='training_instances', default='') (options, _) = opt.parse_args() if options.training_instances == '': sys.stderr.write("Usage: jython macaronic-tagger.py --ti [training instances file]\n") exit(1) for line in codecs.open(options.training_instances, 'r', 'utf8').readlines(): jti = json.loads(line) ti = TrainingInstance.from_dict(jti) pdb.set_trace()