import codecs
import pickle
import re
import time

import nltk

# get_annotation_units, get_annotation_units_from_txt, UnifiedReader,
# instance_filter, FeatureGenerator, fold_divider and one_fold_test are
# defined elsewhere in this module.


def train(xml_file, con_file, dep_file, alg, concept, classifier_pickle):
    # read the annotated corpus and attach concept/dependency information
    aus = get_annotation_units(xml_file)
    aus = UnifiedReader(aus, con_file, dep_file)
    aus = instance_filter(aus, None, True, concept)
    # convert annotation units into (featureset, label) pairs
    fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])
    # debug: label of the first training instance
    print fss_n_lists[0][0][1]
    classifier = nltk.MaxentClassifier.train(fss_n_lists[0], alg, trace=0, max_iter=1000)
    print len(classifier.labels()), classifier.labels()
    # persist the trained classifier
    pickle_out = open(classifier_pickle, 'wb')
    pickle.dump(classifier, pickle_out)
    pickle_out.close()
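
# Hypothetical usage sketch: the file names below are placeholders, not values
# from this module. 'IIS' is one of the algorithm names accepted by NLTK's
# MaxentClassifier ('GIS' also works); the concept string should match what
# FeatureGenerator and ARFFPrinter expect (e.g. 'CCS' or 'PT').
def _train_demo():
    train('corpus.xml', 'corpus.con', 'corpus.dep', 'IIS', 'CCS',
          'ccs_maxent.pickle')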
def classify(txt_file, con_file, dep_file, concept, classifier_pickle, output_file):
    # read raw text and attach concept/dependency information
    aus = get_annotation_units_from_txt(txt_file)
    aus = UnifiedReader(aus, con_file, dep_file)
    fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])
    # load the pickled classifier produced by train()
    pickle_in = open(classifier_pickle, 'rb')
    classifier = pickle.load(pickle_in)
    pickle_in.close()
    # write one "label<TAB>probability" line per instance
    fout = codecs.open(output_file, mode='w', encoding='utf-8')
    for fs, l in fss_n_lists[0]:
        prob_dist = classifier.prob_classify(fs)
        label = prob_dist.max()
        # alternative: dump the probability of every label
        # print >> fout, '\t'.join(['%s\t%f' % (x, prob_dist.prob(x)) for x in classifier.labels()])
        print >> fout, '%s\t%f' % (label, prob_dist.prob(label))
    fout.close()
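
# Companion sketch for consuming classify()'s output: each line holds the
# predicted label and its probability, tab-separated (format taken from the
# print statement above). The function name is illustrative, not part of the
# original module.
def _read_predictions(path):
    predictions = []
    for line in codecs.open(path, encoding='utf-8'):
        label, prob = line.rstrip('\n').split('\t')
        predictions.append((label, float(prob)))
    return predictions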
def ARFFPrinter(aus, concept, outFile):
    featuresets = FeatureGenerator.get_featuresets(aus, concept)

    # collect every attribute name and its observed values for the header
    attDict = dict()
    for featureset in featuresets:
        for key, value in featureset[0].items():
            try:
                attDict[key].add(value)
            except KeyError:
                attDict[key] = set([value])
    attributes = attDict.keys()

    fout = open(outFile, 'w')

    # print header
    print >> fout, "@relation %s" % concept
    for attribute in attributes:
        if attribute.startswith('contain-'):
            dataType = '{True, False}'
        else:
            dataType = 'string'
        print >> fout, '@attribute "%s" %s' % (re.sub('"', '\\"', attribute), dataType)
    if concept == 'CCS':
        classes = 'unidentifiable, normalTOcancer, cancerTOnormal'
    elif concept == 'PT':
        classes = 'observation, causality'
    else:
        raise ValueError('unknown concept: %s' % concept)
    print >> fout, '@attribute %s {%s}' % (concept, classes)

    # print data
    print >> fout, "@data"
    for featureset in featuresets:
        dataLine = ""
        for attribute in attributes:
            try:
                dataLine += '"' + re.sub('"', '\\"', unicode(featureset[0][attribute]).encode('ascii', 'ignore')) + '"' + ','
            except KeyError:
                # attribute absent from this instance; default to False
                dataLine += 'False' + ','
        dataLine += featureset[1]
        print >> fout, dataLine
    fout.close()
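
# Illustrative shape of the ARFF file ARFFPrinter emits (attribute names here
# are hypothetical; the real ones come from FeatureGenerator's featuresets):
#
#   @relation CCS
#   @attribute "contain-mutation" {True, False}
#   @attribute "head-word" string
#   @attribute CCS {unidentifiable, normalTOcancer, cancerTOnormal}
#   @data
#   "True","p53",normalTOcancer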
def n_fold_test(n_folds, xml_file, con_file, dep_file, alg, concept,
                classification_method, multiple_cancer_terms, unique_pmids,
                dup_pmids_in_one_fold, classifier_pickle):
    # instance filtering according to the options
    aus = get_annotation_units(xml_file)
    aus = UnifiedReader(aus, con_file, dep_file)
    aus = instance_filter(aus, classification_method, multiple_cancer_terms, concept)

    # divide into n folds
    n_lists = fold_divider(n_folds, aus, unique_pmids, dup_pmids_in_one_fold)

    # convert annotation units into feature sets
    fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), n_lists)
    # debug: label of the first instance in the first fold
    print fss_n_lists[0][0][1]

    # N-fold cross validation
    results = [0] * n_folds
    classifiers = [0] * n_folds
    start = time.time()
    for i in range(len(fss_n_lists)):
        one_fold_test(i, fss_n_lists, results, classifiers, alg)
    # threaded variant kept for reference:
    # threads = []
    # for i in range(len(fss_n_lists)):
    #     threads.append(threading.Thread(target=one_fold_test,
    #                                     args=(i, fss_n_lists, results, classifiers, alg)))
    #     threads[i].start()
    # for i in range(len(fss_n_lists)):
    #     threads[i].join()

    print '#fold\taccuracy\ttrain_time\ttest_time'
    for i in range(len(fss_n_lists)):
        print 'fold_%s\t%s\t%s\t%s' % (i, results[i][0], results[i][1], results[i][2]), results[i][3]
    acc_sum, t_train_sum, t_test_sum = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]), results)
    print 'average\t%s\t%s\t%s\t' % (acc_sum / float(n_folds),
                                     t_train_sum / float(n_folds),
                                     t_test_sum / float(n_folds))
    print 'total elapsed time: %d' % (time.time() - start)

    # per-fold and per-class numbers in a layout convenient for Excel
    print 'accuracy'
    for i in range(len(fss_n_lists)):
        print results[i][0]
    print acc_sum / float(n_folds)
    classes = [stats[0] for stats in results[0][3]]
    for clas in classes:
        print clas
        print 'precision'
        for i in range(len(fss_n_lists)):
            for numbers in results[i][3]:
                if numbers[0] == clas:
                    print numbers[1]
        print 'recall'
        for i in range(len(fss_n_lists)):
            for numbers in results[i][3]:
                if numbers[0] == clas:
                    print numbers[2]
        print 'f'
        for i in range(len(fss_n_lists)):
            for numbers in results[i][3]:
                if numbers[0] == clas:
                    print numbers[3]

    # persist the per-fold classifiers
    pickle_out = open(classifier_pickle, 'wb')
    pickle.dump(classifiers, pickle_out)
    pickle_out.close()
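
# A minimal sketch of what one_fold_test is assumed to do (the real helper is
# defined elsewhere in this module): train a maxent model on every fold except
# fold i, evaluate on fold i, and fill results[i] with
# (accuracy, train_time, test_time, per_class_stats) and classifiers[i] with
# the trained model. The tuple layout is inferred from how results[i] is
# consumed above; the per-class (label, precision, recall, F) computation is
# omitted here.
def _one_fold_test_sketch(i, fss_n_lists, results, classifiers, alg):
    train_set = [fs for j, fold in enumerate(fss_n_lists) if j != i for fs in fold]
    test_set = fss_n_lists[i]
    t0 = time.time()
    classifier = nltk.MaxentClassifier.train(train_set, alg, trace=0, max_iter=1000)
    t_train = time.time() - t0
    t0 = time.time()
    accuracy = nltk.classify.accuracy(classifier, test_set)
    t_test = time.time() - t0
    results[i] = (accuracy, t_train, t_test, [])
    classifiers[i] = classifier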