def __init__(self, string, type_of_sentence=None):
    """Wrap a raw tagged sentence and derive its classification features.

    The incoming string carries two ``<e>...</e>``-tagged nominals; the
    pair is extracted first, then the tags are stripped, and finally a
    FeatureSet is built from the cleaned sentence.
    """
    self.sentence = string
    self.type_of_sentence = type_of_sentence
    # Filled in by get_nominal_pair() when tagged spans are present.
    self.nominal1 = None
    self.nominal2 = None
    # Order matters: the pair must be read before the tags are removed.
    self.get_nominal_pair()
    self.remove_tags()
    self.feature_set = FeatureSet(self)
def __init__(self, samplerate, featureList, mp3dirs, k, times, run_before, euclidean):
    """Set up a clustering run over mp3-derived feature vectors.

    Parameters
    ----------
    samplerate : audio sample rate used during feature extraction
    featureList : names of the features to use
    mp3dirs : directories containing the source mp3 files
    k : number of clusters
    times : number of clustering repetitions
    run_before : True when this dataset/feature combination was already
        converted once, so the cached FeatureSet can be loaded from file
    euclidean : True to use Euclidean distance
    """
    self.cluster = []
    self.SAMPLERATE = samplerate
    self.featureList = featureList
    self.num_features = len(featureList)
    self.mp3dirs = mp3dirs
    self.k = k
    self.times = times
    self.euclidean = euclidean
    # Fresh dataset/feature combination: run mp3_to_feature_vectors and
    # build a brand-new FeatureSet.  (Idiom fix: truth-test the flag
    # instead of comparing `== False`.)
    if not run_before:
        self.mp3_to_feature_vectors()
        self.f = FeatureSet(self.featureList, False)
    else:
        # Already converted: skip extraction and load from file.
        self.f = FeatureSet(self.featureList, True)
class Sentence:
    """A relation-classification sentence holding a tagged nominal pair.

    The raw string arrives with two ``<e>...</e>``-tagged nominals; on
    construction the pair is extracted, the tags are stripped, and a
    FeatureSet is derived for causal-relation classification.
    """

    # Connectives that may signal causality in the sentence text.
    RelatorList = ["because", "after", "since", " as "]

    def __init__(self, string, type_of_sentence=None):
        self.type_of_sentence = type_of_sentence
        self.sentence = string
        self.nominal1 = None
        self.nominal2 = None
        # Have to extract the pair before tokenizing/removing the tags.
        self.get_nominal_pair()
        self.remove_tags()
        self.feature_set = FeatureSet(self)

    def get_nominal_pair(self):
        """Find the tagged spans and store their inner text.

        Fix: the original only checked ``if match:`` and then indexed
        ``match[1]`` unconditionally, raising IndexError when exactly one
        tagged span was present.  Both nominals now stay None unless at
        least two spans are found.
        """
        match = re.findall(r"(<.*?>(.*?)</.*?>)", self.sentence)
        if len(match) >= 2:
            self.nominal1 = match[0][1]
            self.nominal2 = match[1][1]

    def remove_tags(self):
        """Strip the <e>/</e> markers, leaving the plain sentence text."""
        self.sentence = self.sentence.replace("<e>", "")
        self.sentence = self.sentence.replace("</e>", "")

    def is_causal_from_dataset(self):
        """True when the dataset label marks this as a Cause-Effect pair.

        Fix: tolerates an unlabeled sentence (type_of_sentence left at
        its default None) instead of raising TypeError on ``in None``.
        """
        return (self.type_of_sentence is not None
                and "Cause-Effect" in self.type_of_sentence)

    def causal_features(self):
        """Assemble the boolean feature dict consumed by the classifier."""
        features = {"HasRelator": self.feature_set.has_relator(),
                    "DiscreditRelator": self.feature_set.discredit_relator(),
                    "HasCausalVerb": self.feature_set.check_for_causal_verb(),
                    "HasCause": self.feature_set.check_for_cause_in_sentence()}
        return features
from accurancyTools import *
from sklearn import ensemble
import itertools
import matplotlib.pyplot as plt

# Names of the HDF5 feature datasets used for training.
trainingDataSetNames = [
    'Volume', 'CentroidNorm', 'Centroid', 'Perimeter', 'PseudoRadius',
    'Complexity', 'BoundingBox2Volume', 'BoundingBoxAspectRatio',
    'IntensityMax', 'IntensityMean', 'IntensityMin', 'IntensityStd',
    'CloseMassRatio', 'IntensityHist', 'gaussianCoefficients',
    'gaussianGOV', 'Gradient', 'GradientOfMag',
]

# --- Scenario 1: full feature/label files plus one held-out batch ---
senaryo1File = '../../noduledetectordata/ilastikoutput3/s1/s1.h5'
senaryo1LabelFile = '../../noduledetectordata/ilastikoutput3/s1/s1_labels.h5'
senaryo1BatchFile = '../../noduledetectordata/ilastikoutput3/s1/example_05.h5'
senaryo1BatchLabelFile = '../../noduledetectordata/ilastikoutput3/s1/labels_example_05.h5'
senaryo1 = {
    'all_features': senaryo1File,
    'all_features_labels': senaryo1LabelFile,
    'batch_features': senaryo1BatchFile,
    'batch_features_labels': senaryo1BatchLabelFile,
}
senaryo1_train_sets = FeatureSet.readFromFile(
    senaryo1File, trainingDataSetNames, senaryo1LabelFile, 'labels')
senaryo1_batch_sets = FeatureSet.readFromFile(
    senaryo1BatchFile, trainingDataSetNames, senaryo1BatchLabelFile, 'labels')
senaryo1_sets = {
    'all_features_set': senaryo1_train_sets,
    'batch_features_set': senaryo1_batch_sets,
    'fileName': 'example05',
}

# --- Scenario 2 ---
senaryo2File = '../../noduledetectordata/ilastikoutput3/s2/s2.h5'
senaryo2LabelFile = '../../noduledetectordata/ilastikoutput3/s2/s2_labels.h5'
senaryo2BatchFile = '../../noduledetectordata/ilastikoutput3/s2/example_01.h5'
senaryo2BatchLabelFile = '../../noduledetectordata/ilastikoutput3/s2/labels_example_01.h5'
senaryo2 = {
    'all_features': senaryo2File,
    'all_features_labels': senaryo2LabelFile,
    'batch_features': senaryo2BatchFile,
    'batch_features_labels': senaryo2BatchLabelFile,
}
senaryo2_train_sets = FeatureSet.readFromFile(
    senaryo2File, trainingDataSetNames, senaryo2LabelFile, 'labels')
senaryo2_batch_sets = FeatureSet.readFromFile(
    senaryo2BatchFile, trainingDataSetNames, senaryo2BatchLabelFile, 'labels')
senaryo2_sets = {
    'all_features_set': senaryo2_train_sets,
    'batch_features_set': senaryo2_batch_sets,
    'fileName': 'example01',
}

# --- Scenario 3 (file paths only; its sets are built below this chunk) ---
senaryo3File = '../../noduledetectordata/ilastikoutput3/s3/s3.h5'
senaryo3LabelFile = '../../noduledetectordata/ilastikoutput3/s3/s3_labels.h5'
senaryo3BatchFile = '../../noduledetectordata/ilastikoutput3/s3/example_03.h5'
#'skewness'] #this is vigra computeed #trainingDataSetNames = ['count', 'regionCenter', 'regionRadii', 'histogram'] # this is matlab trainingDataSetNames = ['Volume', 'CentroidNorm', 'Perimeter','Complexity', 'BoundingBox2Volume','BoundingBoxAspectRatio', #, 'BoundingBoxAspectRatio', 'IntensityHist']#'BoundingBoxAspectRatio']#, 'IntensityMax','IntensityMean', random_seed= 100 numpy.random.seed(random_seed) print numpy.random.rand(1,1) repeatN = 100 acc = numpy.zeros([repeatN, 3]) allFeatures = FeatureSet.readFromFile(featureFile, trainingDataSetNames,labelFile, 'labels') coordinateFeatures = FeatureSet.readFromFile(featureFile,['Centroid'] ,labelFile, 'labels') predThreshold = 0.5 thresholdRange = numpy.linspace(0.1, 0.999, 10) roc_mn_acc=numpy.zeros([len(thresholdRange),3]) roc_std_acc=numpy.zeros([len(thresholdRange),3]) partreeCount = 50 counter = 0 for predThreshold in thresholdRange: for ite in range(0,repeatN): trn, val = allFeatures.divideSetRandom(1,1,True) #trn, val = allFeatures.divideSetByZ(coordinateFeatures.data[:,2]) rf = vigra.learning.RandomForest(treeCount=partreeCount) rf.learnRF(trn.data, trn.labels,randomSeed = ite*10)
# NOTE(review): this chunk opens mid-list -- the names below close a
# training-feature list whose opening bracket lies above this view.
'CloseMassRatio', 'IntensityHist', 'gaussianCoefficients', 'gaussianGOV',
'Gradient', 'GradientOfMag' ]

# --- Scenario 1: full feature/label HDF5 files plus one held-out batch ---
senaryo1File = '../../noduledetectordata/ilastikoutput3/s1/s1.h5'
senaryo1LabelFile = '../../noduledetectordata/ilastikoutput3/s1/s1_labels.h5'
senaryo1BatchFile = '../../noduledetectordata/ilastikoutput3/s1/example_05.h5'
senaryo1BatchLabelFile = '../../noduledetectordata/ilastikoutput3/s1/labels_example_05.h5'
senaryo1 = {
    'all_features': senaryo1File,
    'all_features_labels': senaryo1LabelFile,
    'batch_features': senaryo1BatchFile,
    'batch_features_labels': senaryo1BatchLabelFile
}
# Load labeled feature sets for training and for the held-out batch.
senaryo1_train_sets = FeatureSet.readFromFile(senaryo1File, trainingDataSetNames, senaryo1LabelFile, 'labels')
senaryo1_batch_sets = FeatureSet.readFromFile(senaryo1BatchFile, trainingDataSetNames, senaryo1BatchLabelFile, 'labels')
senaryo1_sets = {
    'all_features_set': senaryo1_train_sets,
    'batch_features_set': senaryo1_batch_sets,
    'fileName': 'example05'
}

# --- Scenario 2 (its dict literal continues past this chunk) ---
senaryo2File = '../../noduledetectordata/ilastikoutput3/s2/s2.h5'
senaryo2LabelFile = '../../noduledetectordata/ilastikoutput3/s2/s2_labels.h5'
senaryo2BatchFile = '../../noduledetectordata/ilastikoutput3/s2/example_01.h5'
senaryo2BatchLabelFile = '../../noduledetectordata/ilastikoutput3/s2/labels_example_01.h5'
senaryo2 = {
# NOTE(review): Python-2-era demo (xrange).  Track and the later use of
# gdt/gdgs presumably come from the surrounding module -- the script
# appears to continue past this chunk; verify before restructuring.
if __name__ == '__main__': # test code
    from Bio import GenBank
    from Bio.SeqFeature import SeqFeature
    from FeatureSet import FeatureSet
    from GraphSet import GraphSet
    from random import normalvariate

    # Parse one GenBank record from disk.
    parser = GenBank.FeatureParser()
    fhandle = open( '/data/genomes/Bacteria/Nanoarchaeum_equitans/NC_005213.gbk', 'r')
    genbank_entry = parser.parse(fhandle)
    fhandle.close()

    # One feature set per feature type: CDS features and gene features.
    gdfs1 = FeatureSet(0, 'Nanoarchaeum equitans CDS - CDS')
    gdfs2 = FeatureSet(1, 'Nanoarchaeum equitans CDS - gene')
    for feature in genbank_entry.features:
        if feature.type == 'CDS':
            gdfs1.add_feature(feature)
        if feature.type == 'gene':
            gdfs2.add_feature(feature)

    # Collect both feature sets on a single track.
    gdt = Track()
    gdt.add_set(gdfs1)
    gdt.add_set(gdfs2)

    # Synthetic graph data: one noisy value every 1000 bases.
    graphdata = []
    for pos in xrange(1, len(genbank_entry.seq), 1000):
        graphdata.append((pos, normalvariate(0.5, 0.1)))
    gdgs = GraphSet(2, 'test data')
if __name__ == "__main__": # test code from Bio import GenBank from Bio.SeqFeature import SeqFeature from FeatureSet import FeatureSet from GraphSet import GraphSet from random import normalvariate parser = GenBank.FeatureParser() fhandle = open("/data/genomes/Bacteria/Nanoarchaeum_equitans/NC_005213.gbk", "r") genbank_entry = parser.parse(fhandle) fhandle.close() gdfs1 = FeatureSet(0, "Nanoarchaeum equitans CDS - CDS") gdfs2 = FeatureSet(1, "Nanoarchaeum equitans CDS - gene") for feature in genbank_entry.features: if feature.type == "CDS": gdfs1.add_feature(feature) if feature.type == "gene": gdfs2.add_feature(feature) gdt = Track() gdt.add_set(gdfs1) gdt.add_set(gdfs2) graphdata = [] for pos in xrange(1, len(genbank_entry.seq), 1000): graphdata.append((pos, normalvariate(0.5, 0.1))) gdgs = GraphSet(2, "test data")