import time
import traceback

from scipy.io import arff

import evaluation  # project-local module providing the Evaluation class


def evaluate_method(classifier, stream_name, method_name, initial_size,
                    step_size):
    print(stream_name, method_name)
    try:
        start = time.time()
        data, meta = arff.loadarff("streams/%s.arff" % stream_name)
        if data is None:
            print("Empty data")
            raise Exception("Empty data")
        # The last ARFF attribute holds the class labels
        classes = meta[meta.names()[-1]][1]
        evl = evaluation.Evaluation(classifier=classifier,
                                    stream_name=stream_name,
                                    method_name=method_name,
                                    tqdm=False)
        evl.test_and_train(data=data, classes=classes,
                           initial_size=initial_size, step_size=step_size)
        evl.compute_metrics()
        evl.save_to_csv_metrics()
        print("End", stream_name, method_name, time.time() - start)
    except Exception as ex:
        print(str(ex))
        traceback.print_exc()
        print("Exception in", stream_name, method_name)
import logging


# Variant of evaluate_method that also logs progress to a file and saves
# confusion matrices instead of the computed metrics.
def evaluate_method(classifier, stream_name, method_name, initial_size,
                    step_size):
    # basicConfig is a no-op after the first call, so repeated invocations
    # are harmless.
    logging.basicConfig(filename='realStreams.log', filemode="a",
                        format='%(asctime)s - %(levelname)s: %(message)s',
                        level=logging.DEBUG)
    try:
        logging.info("Start %s %s", stream_name, method_name)
        print(stream_name, method_name)
        start = time.time()
        data, meta = arff.loadarff("streams/%s.arff" % stream_name)
        if data is None:
            print("Empty data")
            raise Exception("Empty data")
        classes = meta[meta.names()[-1]][1]
        evl = evaluation.Evaluation(classifier=classifier,
                                    stream_name=stream_name,
                                    method_name=method_name,
                                    experiment_name=experiment_name,  # module-level setting
                                    tqdm=False)
        evl.test_and_train(data=data, classes=classes,
                           initial_size=initial_size, step_size=step_size)
        evl.save_to_csv_confmat()
        logging.info("End %s %s %f", stream_name, method_name,
                     time.time() - start)
        print("End", stream_name, method_name, time.time() - start)
    except Exception as ex:
        logging.exception("Exception in %s %s", stream_name, method_name)
        print(str(ex))
        traceback.print_exc()
        print("Exception in %s %s" % (stream_name, method_name))
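
# A minimal driver sketch (not part of the original sources): the stream
# names and chunk sizes below are hypothetical placeholders, and the
# scikit-learn classifiers are stand-ins for whatever incremental learners
# the experiments actually used. It fans evaluate_method out over a process
# pool so each stream/method pair runs, and logs, independently.
from multiprocessing import Pool

from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier


def run_all():
    methods = {"GNB": GaussianNB(), "MLP": MLPClassifier()}
    streams = ["electricity", "covtype"]  # hypothetical .arff files in streams/
    jobs = [(clf, stream, name, 1000, 500)
            for stream in streams
            for name, clf in methods.items()]
    with Pool() as pool:
        pool.starmap(evaluate_method, jobs)


if __name__ == "__main__":
    run_all()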
# NOTE: this method relies on module-level imports (sys, evaluation, timer,
# annotations, jsonrpc) and an avg() helper from the surrounding codebase.
def evaluate(self, learner, partition='test', debug_idxs=None, skip_idxs=(),
             decoder='ilp', n_eval=(1, 2, 3, 4), streaming=True,
             overwritten_params=(), eval_path=None, output_path=None,
             lm_proxy=None, **kwargs):
    """Run the transduction model on designated test instances and report
    performance metrics.
    """
    # When evaluating multiple iterations of the same model over a fixed
    # partition, decoding should ensure that initialization isn't
    # unnecessarily repeated.
    if partition == 'test' and kwargs.get('subcorpus') == 'final':
        print "FINAL"
        eval_instances = self.get_instances(partition=partition,
                                            debug_idxs=debug_idxs,
                                            skip_idxs=skip_idxs)
        system_name = learner.name
    elif learner is not None:
        eval_instances = self.decode_instances(
                learner,
                partition=partition,
                debug_idxs=debug_idxs,
                skip_idxs=skip_idxs,
                decoder=decoder,
                streaming=streaming,
                overwritten_params=overwritten_params,
                **kwargs)
        system_name = learner.name
    else:
        eval_instances = self.get_instances(partition=partition,
                                            debug_idxs=debug_idxs,
                                            skip_idxs=skip_idxs)
        system_name = 'baseline'
    num_instances = len(eval_instances)

    # Record overwritten parameters in the filenames; overwritten_params is
    # expected to be a dict when non-empty.
    overwriting_str = None
    if len(overwritten_params) > 0:
        overwriting_str = '_OW-'
        i = 0
        for param_name, value in overwritten_params.iteritems():
            if isinstance(value, (list, tuple)):
                overwriting_str += '+'.join(str(v) for v in sorted(value))
            else:
                overwriting_str += str(value)
            i += 1
            if i < len(overwritten_params):
                overwriting_str += '-'

    if output_path is not None:
        output_filename = ''.join(
                (output_path, '/', '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '',
                 '_', decoder, '.out'))
        outf = open(output_filename, 'wb')

    # Determine the evaluations to run by looking at a representative
    # instance that actually produced output
    i = 0
    while i < len(eval_instances) and \
            not hasattr(eval_instances[i], 'output_sent'):
        i += 1
    if i == len(eval_instances):
        print "WARNING: all instances failed; skipping evaluation"
        sys.exit()
    some_instance = eval_instances[i]
    has_labels = hasattr(some_instance, 'label_sentences')
    has_rasp = hasattr(some_instance.gold_sentences[0], 'relgraph')
    has_outtrees = hasattr(some_instance.output_sent, 'outtree')
    has_outframes = hasattr(some_instance.output_sent, 'outframes')

    # FIXME TEMPORARY! MUST MAKE "False" FOR TEST!
    skip_failed = False

    # Initialize the evaluations
    eval_obj = evaluation.Evaluation(title='TRANSDUCTION_EVAL')
    output_sents = []
    with timer.AvgTimer(num_instances):
        for i, instance in enumerate(eval_instances):
            sys.stdout.write("Evaluating " + str(num_instances) +
                             (" " + partition if partition is not None
                              else "") +
                             " instances: " + str(i + 1) + '\r')

            # Duration and failure status
            eval_obj.include(
                    system=system_name,
                    corpus='other',
                    decode_time=instance.decode_times[-1],
                    solution_time=instance.solution_times[-1]
                            if len(instance.solution_times) > 0 else 0,
                    inputs=len(instance.input_sents),
                    _failed=int(not hasattr(instance, 'output_sent')),
                    )
            if skip_failed and not hasattr(instance, 'output_sent'):
                print "WARNING: Skipping failed instance", instance.idx
                continue

            # POS tag recall over content words (NN* and VB*)
            for use_labels in set([False]) | set([has_labels]):
                p, r, f = instance.score_content_words(
                        use_labels=use_labels, prefixes=('NN', 'VB'))
                eval_obj.add_metrics(
                        precision=p,
                        recall=r,
                        system=system_name,
                        corpus='LBLs NN+VB' if use_labels else 'GOLD NN+VB',
                        )

            try:
                if lm_proxy is not None:
                    output_tokens = instance.output_sent.tokens \
                            if hasattr(instance, 'output_sent') else []
                    eval_obj.include(system=system_name,
                                     corpus='other',
                                     lm=lm_proxy.score_sent(output_tokens))
            except jsonrpc.RPCTransportError:
                print "ERROR: JSON-RPC hiccups; skipping LM scoring"

            if decoder.startswith('dp+'):
                # Record convergence of dual decomposition or bisection.
                # Will be 0 if neither are used.
                eval_obj.include(
                        system=system_name,
                        corpus='other',
                        convergence_=int(instance.converged),
                        iterations=instance.num_iterations,
                        )

            if len(instance.sentences) == 1:
                # Paraphrasing or compression-specific metrics
                eval_obj.include(
                        system=system_name,
                        corpus='STATS gold',
                        comp_=instance.get_gold_compression_rate(),
                        length=instance.avg_gold_len,
                        proj_=avg(int(gold_sent.dparse.is_projective())
                                  for gold_sent in instance.gold_sentences),
                        overlap_=avg(instance.get_overlap(gold_sent)
                                     for gold_sent in instance.gold_sentences),
                        )
                eval_obj.include(
                        system=system_name,
                        corpus='STATS input',
                        comp_=1.0,
                        length=instance.avg_len,
                        proj_=int(instance.sentences[0].dparse.is_projective()),
                        overlap_=instance.get_overlap(instance.sentences[0]),
                        )
                eval_obj.include(
                        system=system_name,
                        corpus='STATS output',
                        comp_=instance.get_compression_rate(),
                        length=len(instance.output_sent.tokens)
                                if hasattr(instance, 'output_sent') else 0,
                        )
                if hasattr(instance, 'output_sent') and has_outtrees:
                    eval_obj.include(
                            system=system_name,
                            corpus='STATS output',
                            proj_=int(instance.output_sent.outtree
                                      .is_projective())
                                    if hasattr(instance.output_sent.outtree,
                                               'is_projective') else 0,
                            overlap_=instance.get_overlap(
                                    instance.output_sent,
                                    parse_type='outtree'),
                            )
            # n-gram precision and recall
            for use_labels in set([False]) | set([has_labels]):
                for n in n_eval:
                    p, r, f = instance.score_ngrams(n=n,
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus='LBLs n=' + str(n) if use_labels
                                    else 'GOLD n=' + str(n),
                            )

                if hasattr(instance, 'output_sent') and has_outframes:
                    # Precision and recall for frames
                    p, r, f = instance.score_frames(fes=False,
                                                    frames_type='outframes',
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus="GOLD frames",
                            )

                    # Precision and recall for frame elements
                    p, r, f = instance.score_frames(fes=True,
                                                    frames_type='outframes',
                                                    use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus="GOLD fes",
                            )

            # Parse output sentences for syntactic evaluation. The 100-token
            # limit is intended for the Stanford parser.
            if hasattr(instance, 'output_sent') and \
                    len(instance.output_sent.tokens) <= 100:
                output_sents.append(instance.output_sent)

            # Write the output to a file
            if output_path is not None:
                outf.write(instance.get_display_string())

    print  # terminate the '\r' progress line
    if output_path is not None:
        outf.close()

    # Parse-based evaluations
    try:
        parse_types = ['dparse']
        if has_outtrees:
            parse_types.append('outtree')

        # Get annotations. Only run RASP if the inputs have RASP annotations
        # since it's slow.
        annotations.annotate(output_sents, 'Stanford')
        if has_rasp:
            annotations.annotate(output_sents, 'Rasp')
            parse_types.append('relgraph')

        # Add dependency results to evaluations
        for i, instance in enumerate(eval_instances):
            if skip_failed and not hasattr(instance, 'output_sent'):
                print "WARNING: Skipping failed instance", instance.idx, \
                        "again"
                continue
            for parse_type in parse_types:
                for use_labels in set([False]) | set([has_labels]):
                    name = ('LBLs ' if use_labels else 'GOLD ') + parse_type
                    p, r, f = instance.score_dependencies(
                            parse_type=parse_type, use_labels=use_labels)
                    eval_obj.add_metrics(
                            precision=p,
                            recall=r,
                            system=system_name,
                            corpus=name,
                            _failed=int(not instance.has_output_parses(
                                    parse_type=parse_type)))
    except OSError:
        print "Skipping parser evaluations"

    print eval_obj.title
    print eval_obj.table(skip_single_keys=True)

    if eval_path is not None and debug_idxs is None:
        eval_filename = ''.join(
                (eval_path, '/', '_'.join((partition, 'under', system_name)),
                 overwriting_str if overwriting_str is not None else '',
                 '_', decoder, '.eval'))
        eval_obj.save(eval_filename, append=False)
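
# Illustrative sketch (not from the original codebase) of the kind of n-gram
# precision/recall that score_ngrams() above is assumed to compute: output
# n-grams are matched against each gold sentence and the best-F1 pairing is
# kept. All names here (ngram_prf, output_tokens, gold_token_lists) are
# hypothetical.
from collections import Counter

def ngram_prf(output_tokens, gold_token_lists, n=2):
    def ngrams(tokens):
        return Counter(tuple(tokens[i:i + n])
                       for i in range(len(tokens) - n + 1))

    out_counts = ngrams(output_tokens)
    best = (0.0, 0.0, 0.0)
    for gold_tokens in gold_token_lists:
        gold_counts = ngrams(gold_tokens)
        # Multiset intersection counts each shared n-gram at most as often
        # as it appears in both sentences.
        overlap = sum((out_counts & gold_counts).values())
        p = float(overlap) / max(sum(out_counts.values()), 1)
        r = float(overlap) / max(sum(gold_counts.values()), 1)
        f = 2 * p * r / (p + r) if p + r > 0 else 0.0
        if f > best[2]:
            best = (p, r, f)
    return best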
# Inside the logistic-regression grid search: p is the retained PCA variance
# and c the inverse regularization strength of the current iteration;
# X_scaled_pca_val, y_val, ks_val, result_val_truth, tops, models, f, and
# path are defined by the enclosing script.
pred_pca_val = logreg.predict_proba(X_scaled_pca_val)
# pred_pca_val[:, 0] is P(class 0) (assuming labels {0, 1}), so smaller
# values mean "more likely positive"; it is used as the ranking score here
# and below.
fpr, tpr, thresholds = metrics.roc_curve(y_val, pred_pca_val[:, 0])
roc_auc = metrics.auc(fpr, tpr)
models.append([p, c, X_scaled_pca_val.shape, scaler, pca, logreg,
               pred_pca_val[:, 0]])

for top in tops:
    # Force the `top` examples most likely to be positive into class 1
    t1 = np.argsort(pred_pca_val[:, 0])[0:top]
    y_pred_val = logreg.predict(X_scaled_pca_val)
    y_pred_val[t1] = 1

    result_val_pred = []
    for i in xrange(len(ks_val)):
        if y_pred_val[i] == 1:
            result_val_pred.append(ks_val[i])

    E1 = evaluation.Evaluation(result_val_pred, result_val_truth)
    f1 = E1.F1()
    i1 = E1.intersection()
    p1 = E1.precision()
    r1 = E1.recall()
    line = 'pca: %f, c: %e, top: %d, auc: %f, f1: %f, i1: %d, p1: %f, r1: %f' % (
            p, c, top, roc_auc, f1, i1, p1, r1)
    print line
    f.write(line + '\n')

models_file = path + '/models/logreg1.pkl'
with open(models_file, 'wb') as fp:
    pickle.dump(models, fp, protocol=2)
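
# Sketch (not in the original script) of loading the pickled grid-search
# results back for scoring new data; it assumes the list layout saved above:
# [p, c, shape, scaler, pca, logreg, val_scores]. X_new is a hypothetical
# feature matrix with the same columns used at training time.
with open(models_file, 'rb') as fp:
    saved_models = pickle.load(fp)

p, c, shape, scaler, pca, logreg, val_scores = saved_models[0]
# Reapply the exact preprocessing chain before predicting.
scores_new = logreg.predict_proba(pca.transform(scaler.transform(X_new)))[:, 0]

### Tuning GBDT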