Example #1
def train(xml_file, con_file, dep_file, alg, concept, classifier_pickle):
	# read annotation units and attach concept/dependency information
	aus = get_annotation_units(xml_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	aus = instance_filter(aus, None, True, concept)
	# convert annotation units into (featureset, label) pairs
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])
	print fss_n_lists[0][0][1]  # sanity check: label of the first training instance
	classifier = nltk.MaxentClassifier.train(fss_n_lists[0], alg, trace=0, max_iter=1000)
	print len(classifier.labels()), classifier.labels()
	# persist the trained classifier
	pickle_out = open(classifier_pickle, 'wb')
	pickle.dump(classifier, pickle_out)
	pickle_out.close()
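A minimal sketch of how train() might be invoked. Every path below is a placeholder, and 'IIS' is just one of the algorithm names NLTK's MaxentClassifier.train() accepts ('GIS' is another); the 'CCS' concept value is taken from Example #3.

train('corpus/annotations.xml',    # annotated XML training corpus (hypothetical path)
      'corpus/records.con',        # concept annotations (hypothetical path)
      'corpus/records.dep',        # dependency parses (hypothetical path)
      'IIS',                       # maxent training algorithm, passed through to NLTK
      'CCS',                       # target concept; Example #3 expects 'CCS' or 'PT'
      'models/ccs_maxent.pickle')  # output file for the pickled classifier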
Example #2
def classify(txt_file, con_file, dep_file, concept, classifier_pickle, output_file):
	# read plain-text input and attach concept/dependency information
	aus = get_annotation_units_from_txt(txt_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), [aus])

	# load the classifier pickled by train()
	pickle_in = open(classifier_pickle, 'rb')
	classifier = pickle.load(pickle_in)
	pickle_in.close()

	# write the most probable label and its probability, one instance per line
	fout = codecs.open(output_file, mode='w', encoding='utf-8')
	for fs, l in fss_n_lists[0]:
		prob_dist = classifier.prob_classify(fs)
		label = prob_dist.max()
		# to dump the full distribution instead:
		#print >> fout, '\t'.join(['%s\t%f' % (x, prob_dist.prob(x)) for x in classifier.labels()])
		print >> fout, '%s\t%f' % (label, prob_dist.prob(label))
	fout.close()
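A matching sketch for classify(). The paths are placeholders, and the pickle is assumed to come from train() in Example #1 with the same concept; the output file ends up with one label/probability pair per line.

classify('corpus/new_records.txt',      # raw text to classify (hypothetical path)
         'corpus/new_records.con',      # concept annotations (hypothetical path)
         'corpus/new_records.dep',      # dependency parses (hypothetical path)
         'CCS',                         # same concept the classifier was trained on
         'models/ccs_maxent.pickle',    # pickle written by train()
         'output/ccs_predictions.tsv')  # tab-separated label/probability output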
Example #3
def ARFFPrinter(aus, concept, outFile):
    featuresets = FeatureGenerator.get_featuresets(aus, concept)

    # collect every attribute name (and its observed values) for the header
    attDict = dict()
    for featureset in featuresets:
        for key, value in featureset[0].items():
            try: attDict[key].add(value)
            except KeyError: attDict[key] = set([value])

    attributes = attDict.keys()

    fout = open(outFile, 'w')

    # print header
    print >> fout, "@relation %s" % concept
    for attribute in attributes:
        if attribute.startswith('contain-'): dataType = '{True, False}'
        else: dataType = 'string'
        print >> fout, '@attribute "%s" %s' % (re.sub('"', '\\"', attribute), dataType)
    # ARFF nominal values must be comma-separated
    if concept == 'CCS': classes = 'unidentifiable,normalTOcancer,cancerTOnormal'
    elif concept == 'PT': classes = 'observation,causality'
    else: raise ValueError('unknown concept: %s' % concept)
    print >> fout, '@attribute %s {%s}' % (concept, classes)

    # print data: one quoted value per attribute, then the class label
    print >> fout, "@data"
    for featureset in featuresets:
        dataLine = ""
        for attribute in attributes:
            try: dataLine += '"' + re.sub('"', '\\"', unicode(featureset[0][attribute]).encode('ascii', 'ignore')) + '"' + ','
            except KeyError: dataLine += 'False' + ','
        dataLine += featureset[1]
        print >> fout, dataLine

    fout.close()
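For reference, a file written by ARFFPrinter for concept 'CCS' has roughly this shape. The two feature names are invented for illustration; only @relation, the class attribute, and the nominal value sets are fixed by the code above.

@relation CCS
@attribute "contain-negation" {True, False}
@attribute "head-word" string
@attribute CCS {unidentifiable,normalTOcancer,cancerTOnormal}
@data
"True","increase",normalTOcancer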
Example #4
def n_fold_test(n_folds, xml_file, con_file, dep_file, alg, concept, classification_method, multiple_cancer_terms, unique_pmids, dup_pmids_in_one_fold, classifier_pickle):
	# instance filtering according to the options
	aus = get_annotation_units(xml_file)
	aus = UnifiedReader(aus, con_file, dep_file)
	aus = instance_filter(aus, classification_method, multiple_cancer_terms, concept)

	# divide into n sets
	n_lists = fold_divider(n_folds, aus, unique_pmids, dup_pmids_in_one_fold)

	# convert annotation units into feature sets
	fss_n_lists = map(lambda x: FeatureGenerator.get_featuresets(x, concept), n_lists)
	print fss_n_lists[0][0][1]

	# N-fold cross validation; one_fold_test() fills results[i] and classifiers[i]
	results = [0] * n_folds
	classifiers = [0] * n_folds

	start = time.time()
	for i in range(len(fss_n_lists)):
		# the folds are independent, so these calls could also run in parallel
		# (e.g. one threading.Thread per fold)
		one_fold_test(i, fss_n_lists, results, classifiers, alg)
	
	print '#fold\taccuracy\ttrain_time\ttest_time'
	for i in range(len(fss_n_lists)):
		print 'fold_%s\t%s\t%s\t%s' % (i, results[i][0], results[i][1], results[i][2]), results[i][3]

	# sum accuracy and timings across folds
	acc_sum = sum(r[0] for r in results)
	t_train_sum = sum(r[1] for r in results)
	t_test_sum = sum(r[2] for r in results)
	print 'average\t%s\t%s\t%s' % (acc_sum / n_folds, t_train_sum / n_folds, t_test_sum / n_folds)
	print 'total elapsed time: %d' % (time.time() - start)

	# per-fold accuracies in a column, for pasting into a spreadsheet
	print 'accuracy'
	for i in range(len(fss_n_lists)):
		print results[i][0]
	print acc_sum / n_folds

	# per-class precision/recall/F columns, again for a spreadsheet
	classes = [numbers[0] for numbers in results[0][3]]
	for clas in classes:
		print clas
		for metric_name, metric_idx in (('precision', 1), ('recall', 2), ('f', 3)):
			print metric_name
			for i in range(len(fss_n_lists)):
				for numbers in results[i][3]:
					if numbers[0] == clas: print numbers[metric_idx]

	# persist the per-fold classifiers
	pickle_out = open(classifier_pickle, 'wb')
	pickle.dump(classifiers, pickle_out)
	pickle_out.close()
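one_fold_test() is called above but not included in this listing. A plausible reconstruction, assuming it trains on every fold except fold i, tests on fold i, and stores (accuracy, train_time, test_time, per_class) in results[i], where per_class is a list of (label, precision, recall, f) tuples; that is the shape the reporting code in n_fold_test() reads.

def one_fold_test(i, fss_n_lists, results, classifiers, alg):
	# hypothetical reconstruction; the real function is not shown in this listing
	test_set = fss_n_lists[i]
	train_set = [pair for j, fold in enumerate(fss_n_lists) if j != i for pair in fold]

	t0 = time.time()
	classifier = nltk.MaxentClassifier.train(train_set, alg, trace=0, max_iter=1000)
	t_train = time.time() - t0

	t0 = time.time()
	guesses = [classifier.classify(fs) for fs, gold in test_set]
	t_test = time.time() - t0

	golds = [gold for fs, gold in test_set]
	accuracy = float(sum(1 for g, h in zip(golds, guesses) if g == h)) / len(test_set)

	# per-class precision/recall/F1 from the gold/guess pairs
	per_class = []
	for label in classifier.labels():
		tp = sum(1 for g, h in zip(golds, guesses) if h == label and g == label)
		fp = sum(1 for g, h in zip(golds, guesses) if h == label and g != label)
		fn = sum(1 for g, h in zip(golds, guesses) if h != label and g == label)
		p = float(tp) / (tp + fp) if tp + fp else 0.0
		r = float(tp) / (tp + fn) if tp + fn else 0.0
		f = 2 * p * r / (p + r) if p + r else 0.0
		per_class.append((label, p, r, f))

	results[i] = (accuracy, t_train, t_test, per_class)
	classifiers[i] = classifier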