def computeBaseLine(self, baselineType=0):
    iterWeek, endWeek = datetime(2013, 3, 23), datetime(2013, 4, 20)
    db = DBController()
    fg = FeatureGenerator()
    baselineScore = 0
    while iterWeek <= endWeek:
        lastWeek = iterWeek - timedelta(weeks=1)
        featureList = db.getFeatureListByWeek(iterWeek)
        y_pred, y_test = [], []
        for featureVector in featureList:
            songId = featureVector["id"]
            lastWeekRank = db.getTop50Rank(lastWeek, songId)
            if lastWeekRank is None:
                lastWeekScore = 0
            else:
                lastWeekScore = fg.rankToPopScore(lastWeekRank)
            currentWeekRank = featureVector["rank"]
            currentWeekScore = fg.rankToPopScore(currentWeekRank) if currentWeekRank is not None else lastWeekScore
            # baseline prediction: this week's score equals last week's score
            y_pred.append(lastWeekScore)
            y_test.append(currentWeekScore)
        y_pred, y_test = self.getRankArray(numpy.asarray(y_pred)), self.getRankArray(numpy.asarray(y_test))
        if baselineType == 0:
            baselineScore += self.getRankEvalationScore(y_pred, y_test)
        elif baselineType == 1:
            baselineScore += metrics.r2_score(y_pred, y_test)
        else:
            baselineScore += metrics.mean_squared_error(y_pred, y_test)
        iterWeek += timedelta(weeks=1)
    baselineScore = baselineScore / 5  # average over the 5 weeks iterated above
    print baselineScore
def main(argv):
    del argv
    # read in configuration
    config = get_model_config('config/basic.yml')
    # read in data
    MyData = DataWarehouse(config)
    MyData.read_data()
    # preprocessing
    if FLAGS.pre_process:
        MyPreprocess = DataPreProcess(MyData, config)
        MyPreprocess.process()
    # generate features
    if FLAGS.gen_features:
        MyFeature = FeatureGenerator(config)
        MyFeature.process()
    # train & predict
    if FLAGS.train:
        model = Model(MyData, config)
        # --------- Generating outputs ---------------
        model.process()
        # ypred = model.predict()
        # MyData.generate_submission(ypred)
    try:
        # logging.info('clear work_dir')
        shutil.rmtree(WORK_DIR)
    except Exception:
        pass
def genFeatures(filename):
    fgen = FeatureGenerator(PhonemeDataFile(filename))
    features = list(fgen.features_vector())
    shuffle(features)
    pivot = int(len(features) * 0.8)  # 80/20 train/test split
    test, train = features[pivot:], features[:pivot]
    return test, train
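A minimal invocation sketch for genFeatures; the data file path below is an illustrative placeholder, not a path from the original project.

# Illustrative call; "data/phonemes.dat" is a placeholder path.
test, train = genFeatures("data/phonemes.dat")
print "train examples: %d, test examples: %d" % (len(train), len(test))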
def print_train_feature_file(options):
    feature_config = setPrintFeatureConfig(options.config_file)
    printer = FeatureGenerator(feature_config)
    feature_strs = printer.get_train_features(options.key)
    with open(options.output_file, 'w') as output_file:
        output_file.write('\n'.join(feature_strs))
def print_test_feature_file(options):
    feature_config = setPrintFeatureConfig(options.config_file)
    printer = FeatureGenerator(feature_config)
    feature_strs = printer.get_test_features(options.id, only_gold=options.only_gold_align)
    with open(options.output_file, 'w') as output_file:
        output_file.write('\n'.join(feature_strs))
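Both printers expect an `options` object carrying config_file, output_file, and a key or id. A sketch of how that might be wired with optparse; all flag names are assumptions, not the original CLI definition.

# Hedged sketch: flag names are assumptions, not the original CLI.
from optparse import OptionParser

parser = OptionParser()
parser.add_option("--config", dest="config_file")
parser.add_option("--output", dest="output_file")
parser.add_option("--key", dest="key")
parser.add_option("--id", dest="id")
parser.add_option("--only-gold", dest="only_gold_align", action="store_true", default=False)
options, _ = parser.parse_args()
print_train_feature_file(options)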
def classifyData(model, sourceDocuments, systemSummaries, referenceSummaries):
    featureGenerator = FeatureGenerator()
    featureData = featureGenerator.generateFeatureScores(
        sourceDocuments, systemSummaries, referenceSummaries, None)
    vectorGenerator = VectorGenerator()
    featureVector = vectorGenerator.generateFeatureVector(featureData, 0, 0, 1)
    return model.predict([featureVector])
def testWord(nnfile, word):
    pcas, phones, nn = loadNN(nnfile)
    fgen = FeatureGenerator(None)
    wordc = [c for c in word.lower()]
    vectors = fgen.word_vectors(wordc)
    print "For word: %s" % word
    # print "Phonemes: %d, %s" % (len(phones), str(phones))
    for c, v in vectors:
        pp = zip(nn.run(v), phones)
        # print sorted(pp, reverse=True)
        (_, best_pronunciation) = sorted(pp, reverse=True)[0]
        print "char: %s, pronunciation: %s" % (c, best_pronunciation)
def train(self, beginWeek, endWeek, featureMode=0, regressionModelType=0):
    if beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today():
        raise Exception("Invalid input date!")
    beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
    endWeek = endWeek - timedelta(days=7) if endWeek > datetime.today() else endWeek
    iterWeek = beginWeek
    fg = FeatureGenerator()
    regression = self.getRegressionModel(regressionModelType)
    while iterWeek <= endWeek:
        matrix_train = fg.getFeatureMatrix(iterWeek, iterWeek, featureMode)
        X_train, y_train = matrix_train[:, 0:-1], matrix_train[:, -1]
        # note: for standard sklearn estimators fit() re-trains from scratch,
        # so each weekly fit replaces the previous one
        regression.fit(X_train, y_train)
        iterWeek += timedelta(weeks=1)
    return regression
def get_lexicon_features(self, inst_tokens, pos_lexicon, neg_lexicon):
    lemmatizer = WordNetLemmatizer()
    pos_words = []
    neg_words = []
    # skip lexicon comment lines (starting with ';') and blanks
    with open(pos_lexicon) as pf:
        for line in pf:
            if re.match("[^;]", line) and len(line) > 1:
                pos_words.append(line.strip())
    with open(neg_lexicon) as nf:
        for line in nf:
            if re.match("[^;]", line) and len(line) > 1:
                neg_words.append(line.strip())
    pos_neg_list = []
    pos_len = 0
    neg_len = 0
    inst_len = 0
    for token in inst_tokens:
        word = token.get_text()
        tag = token.get_tag()
        wordnet_tag = FeatureGenerator.get_wordnet_pos(tag)
        if wordnet_tag:
            word = lemmatizer.lemmatize(word, wordnet_tag).encode('utf-8')
        else:
            word = lemmatizer.lemmatize(word).encode('utf-8')
        if word in pos_words:
            pos_len += 1
        elif word in neg_words:
            neg_len += 1
        inst_len += 1
    # cast to float to avoid Python 2 integer division (which would always
    # yield 0 here), and guard against empty input
    inst_len = float(inst_len) if inst_len else 1.0
    pos_neg_list.append(pos_len / inst_len)
    pos_neg_list.append(neg_len / inst_len)
    return pos_neg_list
class QueryClassifier:
    def __init__(self, cl, lm):
        self.cl = cl
        self.fg = FeatureGenerator(lm)

    def is_correct(self, query, words):
        x = [self.fg.generate_features(query, words)]
        return self.cl.predict(x)[0]
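A minimal usage sketch for QueryClassifier, assuming a fitted classifier with a predict() method and a language model accepted by FeatureGenerator; both loaders and the example query are hypothetical.

# Minimal sketch: both loaders are hypothetical placeholders.
cl = load_fitted_classifier()       # any estimator exposing predict()
lm = load_language_model("lm.bin")  # whatever FeatureGenerator(lm) expects
qc = QueryClassifier(cl, lm)
print qc.is_correct("informtion retrieval", ["information", "retrieval"])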
def makeNN(filename, outputfile, hidden, pca, layers):
    fgen = FeatureGenerator(PhonemeDataFile(filename))
    features, pcas = list(fgen.features_vector(pca))
    shuffle(features)
    split = int(len(features) * 0.8)
    train = features[:split]  # the larger set, for training
    test = features[split:]
    num_input = len(train[0][0])
    num_output = len(train[0][1])
    inputVars = tuple([num_input] + [hidden] * layers + [num_output])
    print "Making NN with: %s" % str(inputVars)
    print "len(train)=%d, len(test)=%d" % (len(train), len(test))
    network = NeuralNet(inputVars)
    network.train(train, test, debug=True)
    if network.save(pcas, list(fgen.phones), outputfile):
        print "Saved nn successfully"
    else:
        print "Error while saving nn"
def test(self, model, week, featureMode=0, outputSong=False, x=10):
    fg = FeatureGenerator()
    matrix_test = fg.getFeatureMatrix(week, week, featureMode, True)
    X_test, y_test = matrix_test[:, 1:-1], matrix_test[:, -1]
    songIdList = matrix_test[:, 0].tolist()
    songIdList = list(itertools.chain(*songIdList))
    y_pred = model.predict(X_test)
    y_pred = self.getRankArray(y_pred)
    y_test = self.getRankArray(y_test)
    r2Score = metrics.r2_score(y_test, y_pred)
    meanSquare = metrics.mean_squared_error(y_test, y_pred)
    rankEvalScore = self.getRankEvalationScore(y_pred, y_test)
    print "r2 score: ", r2Score, "mean square error: ", meanSquare, " rank score: ", rankEvalScore
    predTopX, realTopX = self.outputTopX(songIdList, y_pred.tolist()), self.outputTopX(songIdList, y_test.tolist())
    inclusiveAccuracy, rankMatchAccuracy = self.computTopXAccuracy(predTopX, realTopX)
    print "Top", x, "inclusive accuracy: ", inclusiveAccuracy, ", Top", x, "rank match accuracy: ", rankMatchAccuracy
    if outputSong:
        print self.outputTopXSongNames(predTopX), "\n", self.outputTopXSongNames(realTopX)
    return r2Score, meanSquare, rankEvalScore
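train and test above form a weekly pipeline. A minimal sketch of running them end to end, assuming `predictor` is an instance of the class that owns both methods; the dates are illustrative.

# Minimal sketch: `predictor` stands in for the instance owning train()/test().
from datetime import datetime

model = predictor.train(datetime(2013, 1, 5), datetime(2013, 3, 16),
                        featureMode=0, regressionModelType=0)
predictor.test(model, datetime(2013, 3, 23), featureMode=0, outputSong=True)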
def trainModel(fileName, sourceDocuments, systemSummaries, referenceSummaries, systemSummaryScores):
    featureGenerator = FeatureGenerator()
    featureData = featureGenerator.generateFeatureScores(
        sourceDocuments, systemSummaries, referenceSummaries,
        "Feature Data/" + fileName + ".json")
    vectorGenerator = VectorGenerator()
    featureVectors = vectorGenerator.generateFeatureVectors(featureData)
    targetVector = vectorGenerator.generateTargetVector(systemSummaryScores)
    [filteredFeatureVectors, filteredTargetVector] = vectorGenerator.filterVectors(featureVectors, targetVector)
    classifier = RandomForestClassifier(n_jobs=-1, n_estimators=100,
                                        min_samples_split=12, min_samples_leaf=25)
    classifier.fit(filteredFeatureVectors, filteredTargetVector)
    return classifier
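trainModel pairs with classifyData above: one fits the RandomForest on scored reference summaries, the other scores a new system summary with it. A minimal sketch; every corpus variable below is a hypothetical placeholder.

# Minimal sketch: all corpus variables are hypothetical placeholders.
model = trainModel("duc2004", sourceDocuments, systemSummaries,
                   referenceSummaries, systemSummaryScores)
score = classifyData(model, sourceDocuments, newSystemSummaries, referenceSummaries)
print score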
def calculateSetTrainingFeatures(self, myPath, level):
    dfr = DicomFolderReader(myPath)
    dfr.compress()
    setID = dfr.getSetID()
    print("Processing training set {}: '{}'".format(setID, myPath))
    print datetime.datetime.now()
    cc = dfr.getCoordinateConverter()
    finder = PixelFinder(myPath, cc)
    data = dfr.getVolumeData()
    shape = dfr.getVolumeShape()
    vshape = dfr.getVoxelShape()
    fgen = FeatureGenerator(setID, data, vshape, level)
    nbNodules = len(finder.Reader.Nodules)
    print("\tFound {} nodule(s).".format(nbNodules))
    assert nbNodules > 0
    maskP, maskN, nbNodulePixels = finder.getMasks(shape, radiusFactor=RADIUS_FACTOR)
    featuresP = fgen.getAllFeaturesByMask(maskP)
    print("\tProcessed {} nodules pixels.".format(nbNodulePixels))
    featuresN = fgen.getAllFeaturesByMask(maskN)
    print("\tProcessed {} random non-nodules pixels.".format(nbNodulePixels))
    setFeatures = np.vstack([featuresP, featuresN])

    # Create classification vector
    setClasses = np.zeros(setFeatures.shape[0], dtype=np.bool)
    setClasses[0:nbNodulePixels] = True

    # Let's try not to use too much memory
    del finder
    del fgen
    del data
    del cc
    del dfr
    return setFeatures, setClasses
def generateFolds(self, outdir, lemmatizer=None, POS_tagging=False,
                  weightScheme=FeatureWeight.PRESENCE, includeRating=False,
                  includeDocLength=False):
    if self.reviews is None or len(self.reviews) == 0:
        print 'No data to work on'
        return
    trainingData = {}
    validationData = {}
    self.generateKFolds(outdir, trainingData, validationData)
    for i in range(1, self.k + 1):
        print "generating features for fold " + str(i)
        trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging)
        # this dictionary will be used for both training and validation data
        dictionary = Dictionary(trainCorpus)
        generator = FeatureGenerator(trainCorpus, dictionary,
                                     outdir + '/train' + str(i) + '.csv',
                                     weightScheme, includeRating, includeDocLength)
        generator.generateFeatures()
        validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging)
        generator = FeatureGenerator(validCorpus, dictionary,
                                     outdir + '/valid' + str(i) + '.csv',
                                     weightScheme, includeRating, includeDocLength)
        generator.generateFeatures()
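A minimal sketch of calling generateFolds, assuming `folds` is an instance already constructed with its reviews and k set; the output directory and option choices are illustrative.

# Minimal sketch: `folds` is assumed to expose generateFolds as defined above.
import nltk
from FeatureWeight import FeatureWeight

folds.generateFolds('output/folds', lemmatizer=nltk.WordNetLemmatizer(),
                    POS_tagging=True, weightScheme=FeatureWeight.TFIDF,
                    includeRating=True, includeDocLength=False)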
def __init__(self, weights='imagenet'):
    FeatureGenerator.__init__(self, VGG16, (224, 224), preprocess_input, weights)
class DeepModel:
    def __init__(self, config, load_path=None, w2v=None, db=None, stats=None, dmodel=None):
        '''
        Creates a new NN model configured by a json. config is either a dict or a path to a json file
        json structure:
        {
            strip_stop_words=[boolean]
            context_window_size=[int]
            max_mention_words=[int]
            dropout=[0.0 .. 1.0]
            feature_generator={mention_features={feature names...}, entity_features={feature names...}}
            finetune_embd=[boolean]
            pairwise=[boolean]
            inputs=[list out of ['candidates', 'context', 'mention', 'extra_features']]
        }
        '''
        if type(config) in {unicode, str}:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config

        self._stopwords = stopwords.words('english') if self._config['strip_stop_words'] else None
        self._word_dict = None
        self._concept_dict = None
        self._db = db

        self._batch_left_X = []
        self._batch_right_X = []
        self._batch_candidate1_X = []
        self._batch_candidate2_X = []
        self._batch_mention_X = []
        self._batch_extra_features_X = []
        self._batchY = []
        self.train_loss = []
        self._batch_size = 128

        self.inputs = {x for x in self._config['inputs']}

        if 'feature_generator' in self._config:
            self._feature_generator = FeatureGenerator(
                mention_features=self._config['feature_generator']['mention_features'],
                entity_features=self._config['feature_generator']['entity_features'],
                stats=stats, db=db, dmodel=dmodel)

        self.model = None
        self.get_attn_model = None
        if load_path is None:
            self.compileModel(w2v)
        else:
            self.loadModel(load_path)

    def getPredictor(self):
        if self._config['pairwise']:
            return PairwisePredict(self)
        else:
            return PointwisePredict(self)

    def compileModel(self, w2v):
        self._word_dict = w2v.wordDict
        self._concept_dict = w2v.conceptDict

        model_builder = ModelBuilder(self._config, w2v)

        # use candidates input if they were specifically specified, or if we are
        # using an attention network to process the context.
        candidate1, candidate2 = None, None  # avoid NameError when branch below is skipped
        if 'candidates' in self.inputs or \
                ('context' in self.inputs and self._config['context_network'] == 'attention'):
            candidate1 = model_builder.addCandidateInput('candidate1_input',
                                                         to_join='candidates' in self.inputs)
            if self._config['pairwise']:
                candidate2 = model_builder.addCandidateInput('candidate2_input',
                                                             to_join='candidates' in self.inputs)

        if 'context' in self.inputs:
            model_builder.addContextInput(controller1=candidate1, controller2=candidate2)
        if 'mention' in self.inputs:
            model_builder.addMentionInput()

        inputs = model_builder.inputs
        to_join = model_builder.to_join
        attn = model_builder.attn

        if 'extra_features' in self.inputs:
            n_extra_features = self._feature_generator.numPairwiseFeatures() if self._config['pairwise'] \
                else self._feature_generator.numPointwiseFeatures()
            extra_features_input = Input(shape=(n_extra_features,), name='extra_features_input')
            inputs.append(extra_features_input)
            to_join.append(extra_features_input)

        # join all inputs
        x = merge(to_join, mode='concat') if len(to_join) > 1 else to_join[0]

        # build classifier model
        for c in self._config['classifier_layers']:
            x = Dense(c, activation='relu')(x)
        if 'dropout' in self._config:
            x = Dropout(float(self._config['dropout']))(x)
        out = Dense(2, activation='softmax', name='main_output')(x)

        model = Model(input=inputs, output=[out])
        model.compile(optimizer='adagrad', loss='binary_crossentropy')
        self.model = model
        self.get_attn_model = Model(input=inputs, output=attn)
        print "model compiled!"

    def _2vec(self, mention, candidate1, candidate2):
        """
        Transforms input to w2v vectors
        returns a tuple: (wikilink vec, candidate1 vec, candidate2 vec)

        if cannot produce wikilink vec or vectors for both candidates then returns None
        if cannot produce vector for only one of the candidates then returns the id of the other
        """
        if (candidate1 is None or candidate1 not in self._concept_dict) and \
                (candidate2 is None or candidate2 not in self._concept_dict):
            return None
        if self._config['pairwise']:
            if candidate1 is None or candidate1 not in self._concept_dict:
                return candidate2
            if candidate2 is None or candidate2 not in self._concept_dict:
                return candidate1

        candidate1_X = None
        candidate2_X = None
        left_context_X = None
        right_context_X = None
        mention_X = None
        extra_features_X = None

        # get candidate inputs
        if 'candidates' in self.inputs:
            candidate1_X = np.array([self._concept_dict[candidate1]]) if candidate1 is not None else None
            candidate2_X = np.array([self._concept_dict[candidate2]]) if candidate2 is not None else None

        # get context input
        if 'context' in self.inputs:
            left_context_X = self.wordIteratorToIndices(mention.left_context_iter(),
                                                        self._config['context_window_size'])
            right_context_X = self.wordIteratorToIndices(mention.right_context_iter(),
                                                         self._config['context_window_size'])

        # get mention input
        if 'mention' in self.inputs:
            mention_X = self.wordIteratorToIndices(mention.mention_text_tokenized(),
                                                   self._config['max_mention_words'])

        if 'extra_features' in self.inputs:
            if self._config['pairwise']:
                extra_features_X = \
                    np.array(self._feature_generator.getPairwiseFeatures(mention, candidate1, candidate2))
            else:
                extra_features_X = \
                    np.array(self._feature_generator.getPointwiseFeatures(mention, candidate1))

        return left_context_X, right_context_X, mention_X, candidate1_X, candidate2_X, extra_features_X

    def wordIteratorToIndices(self, it, output_len):
        o = []
        for w in it:
            w = w.lower()
            if len(o) >= output_len:
                break
            if w in self._word_dict and (self._stopwords is None or w not in self._stopwords):
                o.append(self._word_dict[w])
        if len(o) == 0:
            o.append(self._word_dict[DUMMY_KEY])
        o = o[::-1]
        arr = np.zeros((output_len,))
        n = len(o) if len(o) <= output_len else output_len
        arr[:n] = np.array(o)[:n]
        return arr

    def get_context_indices(self, it, output_len):
        words = []
        indices = []
        for i, w in enumerate(it):
            w = w.lower()
            words.append(w)
            if len(indices) >= output_len:
                break
            if w in self._word_dict and (self._stopwords is None or w not in self._stopwords):
                indices.append(i)
        return words, indices

    def train(self, mention, candidate1, candidate2, correct):
        """
        Takes a single example to train
        :param mention:    The mention to train on
        :param candidate1: the first candidate
        :param candidate2: the second candidate
        :param correct:    which of the two is correct (expected output)
        """
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return  # nothing to train on
        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X) = vecs
        Y = np.array([1, 0] if candidate1 == correct else [0, 1])
        self._trainXY(left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X, Y)

    def _trainXY(self, left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X, Y):
        self._batch_left_X.append(left_X)
        self._batch_right_X.append(right_X)
        self._batch_mention_X.append(mention_X)
        self._batch_candidate1_X.append(candidate1_X)
        self._batch_candidate2_X.append(candidate2_X)
        self._batch_extra_features_X.append(extra_features_X)
        self._batchY.append(Y)

        if len(self._batchY) >= self._batch_size:
            batchX = {}
            if 'candidates' in self.inputs:
                batchX['candidate1_input'] = np.array(self._batch_candidate1_X)
                if self._config['pairwise']:
                    batchX['candidate2_input'] = np.array(self._batch_candidate2_X)
            if 'context' in self.inputs:
                batchX['left_context_input'] = np.array(self._batch_left_X)
                batchX['right_context_input'] = np.array(self._batch_right_X)
            if 'mention' in self.inputs:
                batchX['mention_input'] = np.array(self._batch_mention_X)
            if 'extra_features' in self.inputs:
                batchX['extra_features_input'] = np.array(self._batch_extra_features_X)
            batchY = np.array(self._batchY)

            loss = self.model.train_on_batch(batchX, batchY)
            self.train_loss.append(loss)
            print 'Done batch. Size of batch - ', batchY.shape, '; loss: ', loss

            self._batch_left_X = []
            self._batch_right_X = []
            self._batch_mention_X = []
            self._batch_candidate1_X = []
            self._batch_candidate2_X = []
            self._batch_extra_features_X = []
            self._batchY = []

    def plotTrainLoss(self, fname, st=0):
        plt.plot(self.train_loss[st:])
        plt.ylabel('Loss')
        plt.xlabel('Batch')
        plt.savefig(fname)

    def finalize(self):
        pass

    def saveModel(self, fname):
        with open(fname + ".model", 'w') as model_file:
            model_file.write(self.model.to_json())
        self.model.save_weights(fname + ".weights", overwrite=True)
        with open(fname + ".w2v.def", 'w') as f:
            f.write(json.dumps(self._word_dict) + '\n')
            f.write(json.dumps(self._concept_dict) + '\n')

    def loadModel(self, fname):
        with open(fname + ".model", 'r') as model_file:
            self.model = model_from_json(model_file.read())
        self.model.load_weights(fname + ".weights")
        with open(fname + ".w2v.def", 'r') as f:
            l = f.readlines()
            self._word_dict = {str(x): int(y) for x, y in json.loads(l[0]).iteritems()}
            self._concept_dict = {int(x) if str(x) != DUMMY_KEY else DUMMY_KEY: int(y)
                                  for x, y in json.loads(l[1]).iteritems()}
        self.model.compile(optimizer='adagrad', loss='binary_crossentropy')

    def predict(self, mention, candidate1, candidate2):
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return vecs
        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extraFeatures_X) = vecs
        X = {}
        if 'candidates' in self.inputs:
            X['candidate1_input'] = candidate1_X.reshape((1, candidate1_X.shape[0],))
            if self._config['pairwise']:
                X['candidate2_input'] = candidate2_X.reshape((1, candidate2_X.shape[0],))
        if 'context' in self.inputs:
            X['left_context_input'] = left_X.reshape((1, left_X.shape[0],))
            X['right_context_input'] = right_X.reshape((1, right_X.shape[0],))
        if 'mention' in self.inputs:
            X['mention_input'] = mention_X.reshape((1, mention_X.shape[0],))
        if 'extra_features' in self.inputs:
            X['extra_features_input'] = np.array(extraFeatures_X.reshape(1, extraFeatures_X.shape[0]))
        Y = self.model.predict(X, batch_size=1)
        return Y[0][0]

    def get_attn(self, mention, candidate1, candidate2):
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return None
        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extraFeatures_X) = vecs
        X = {}
        if 'candidates' in self.inputs:
            X['candidate1_input'] = candidate1_X.reshape((1, candidate1_X.shape[0],))
            if self._config['pairwise']:
                X['candidate2_input'] = candidate2_X.reshape((1, candidate2_X.shape[0],))
        if 'context' in self.inputs:
            X['left_context_input'] = left_X.reshape((1, left_X.shape[0],))
            X['right_context_input'] = right_X.reshape((1, right_X.shape[0],))
        if 'mention' in self.inputs:
            X['mention_input'] = mention_X.reshape((1, mention_X.shape[0],))
        if 'extra_features' in self.inputs:
            X['extra_features_input'] = np.array(extraFeatures_X.reshape(1, extraFeatures_X.shape[0]))

        attn_out = self.get_attn_model.predict(X, batch_size=1)
        left_context, left_indices = self.get_context_indices(mention.left_context_iter(),
                                                              self._config['context_window_size'])
        right_context, right_indices = self.get_context_indices(mention.right_context_iter(),
                                                                self._config['context_window_size'])
        left_attn = [0 for i in xrange(len(left_context))]
        right_attn = [0 for i in xrange(len(right_context))]
        for i in xrange(self._config['context_window_size']):
            if i < len(left_indices):
                left_attn[left_indices[i]] = attn_out[0][0, i]
            if i < len(right_indices):
                right_attn[right_indices[i]] = attn_out[1][0, i]
        return left_context, left_attn, right_context, right_attn
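The JSON structure described in the constructor docstring maps to a concrete dict like the one below. Every key mirrors either the docstring or a lookup made in compileModel; the values and feature names are illustrative assumptions, not tuned settings.

# Illustrative DeepModel config; values and feature names are assumptions.
example_config = {
    'strip_stop_words': True,
    'context_window_size': 20,
    'max_mention_words': 5,
    'dropout': 0.5,
    'feature_generator': {'mention_features': {'prior'},        # placeholder names
                          'entity_features': {'relatedness'}},  # placeholder names
    'finetune_embd': False,
    'pairwise': True,
    'context_network': 'attention',
    'classifier_layers': [64, 64],
    'inputs': ['candidates', 'context', 'mention', 'extra_features'],
}
# model = DeepModel(example_config, w2v=w2v, db=db, stats=stats)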
def __init__(self, weights='imagenet'):
    FeatureGenerator.__init__(self, ResNet50, (224, 224), preprocess_input, weights, pooling='avg')
import numpy as np
import os
import cv2
import csv
import warnings
from sklearn.preprocessing import MinMaxScaler
from FeatureGenerator import FeatureGenerator

warnings.filterwarnings('ignore')

feature_generator = FeatureGenerator()
train_path = "../train"
test_path = "../test"
train_images_path = []
test_images_path = []
labels = []
breed_dic = {}
fixed_size = (80, 80)


def generator_train_npy():
    with open(os.path.join("./", "train.csv")) as f:
        reader = csv.reader(f)
        for row in reader:
            img_file_name, breed, label = row
            train_images_path.append(os.path.join(train_path, img_file_name + ".jpg"))
            label = int(label)
            labels.append(label)
            breed_dic[label] = breed
def __init__(self, weights='imagenet'):
    FeatureGenerator.__init__(self, Xception, (299, 299), preprocess_input, weights, pooling='avg')
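The VGG16, ResNet50, and Xception wrappers above differ only in backbone, input size, and pooling; the ResNet50 and Xception variants pass pooling='avg' so the backbone emits a flat feature vector directly. A construction sketch follows; the enclosing class names are assumptions, since only the __init__ bodies appear above.

# Class names here are assumptions; only the __init__ bodies are shown above.
vgg_features = VGG16FeatureGenerator(weights='imagenet')
resnet_features = ResNet50FeatureGenerator(weights='imagenet')
xception_features = XceptionFeatureGenerator(weights='imagenet')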
import pylab as pl
import numpy as np

from DicomFolderReader import DicomFolderReader
from FeatureGenerator import FeatureGenerator

dfr = DicomFolderReader.create("../data/LIDC-IDRI", 50)
data = dfr.getVolumeData()
h, w, d = data.shape
mySlice = 93
mask = np.zeros_like(data, dtype=np.bool)
mask[:, :, mySlice] = 1
vshape = dfr.getVoxelShape()
fgen = FeatureGenerator(50, data, vshape, 1)
result = fgen.averaging3DByMask(mask, windowSize=3, vesselSize=7.5)
result = result.reshape((512, 512))

pl.subplot(121)
pl.imshow(data[:, :, mySlice], cmap=pl.cm.bone)  # @UndefinedVariable
pl.subplot(122)
pl.imshow(result, cmap=pl.cm.bone)  # @UndefinedVariable
pl.show()
class Classifier:
    def __init__(self, setID, data, vshape):
        self.SetID = setID
        self.Data = data
        self.VoxelShape = vshape
        self.model = None
        self.fgen = None

    def __del__(self):
        del self.SetID
        del self.Data
        del self.VoxelShape
        del self.model
        del self.fgen

    def isLevelset(self):
        return self.fgen is not None and self.model is not None

    def setLevel(self, level, model):
        self.fgen = FeatureGenerator(self.SetID, self.Data, self.VoxelShape, level)
        self.model = model

    # @staticmethod
    # def generatePixelList2D((h, w)):
    #     x, y = np.meshgrid(np.arange(h), np.arange(w))
    #     x, y = x.flatten(), y.flatten()
    #     points = np.vstack((x, y)).T
    #     assert points.shape == (h*w, 2)
    #     del x, y
    #     return points
    #
    # @staticmethod
    # def generatePixelList3D((h, w, d)):
    #     x, y, z = np.meshgrid(np.arange(h), np.arange(w), np.arange(d))
    #     x, y, z = x.flatten(), y.flatten(), z.flatten()
    #     points = np.vstack((x, y, z)).T
    #     assert points.shape == (h*w*d, 3)
    #     del x, y, z
    #     return points

    @staticmethod
    def pruneFeatures(allFeatures, allClasses, oldMask, newMask):
        """Selects current level features out of previous level features based on masks."""
        oldIndices = np.where(oldMask.ravel())[0]
        newIndices = np.where(newMask.ravel())[0]
        indices = np.searchsorted(oldIndices, newIndices)
        return allFeatures[indices, :], allClasses[indices, :]

    def generateProbabilityVolume(self, mask3D, threshold=0.01):
        if not self.isLevelset():
            raise ValueError("Level not set")

        testFeatures = self.fgen.getAllFeaturesByMask(mask3D)  # reusing previous features might be possible
        m, n = testFeatures.shape
        maxChunk = 1000000
        nbRows = maxChunk // n
        result = np.empty((m, 2))
        # predict in chunks to bound memory usage
        for r in np.arange(0, m, nbRows):
            # print "[{}, {}[".format(r, r + nbRows)
            chunk = testFeatures[r:r + nbRows, :]
            result[r:r + nbRows, :] = self.model.predict_proba(chunk)
        # result = self.model.predict_proba(testFeatures)

        probImg = np.zeros(mask3D.shape, dtype=np.float32)
        probImg[mask3D] = result[:, 1]
        mask = probImg > threshold
        return probImg, mask

    # def generateProbabilityImage(self, mask2D, mySlice, threshold=0.01):
    #     if not self.isLevelset():
    #         raise ValueError("Level not set")
    #     testFeatures = deque()
    #     for px, py in zip(np.where(mask2D)):
    #         pixelFeatures = self.fgen.calculatePixelFeatures(px, py, mySlice)
    #         testFeatures.append(pixelFeatures)
    #     testFeatures = np.array(testFeatures)
    #     result = self.model.predict_proba(testFeatures)
    #     probImg = np.zeros(mask2D.shape)
    #     probImg[mask2D] = result[:, 1]
    #     mask = ma.masked_greater(probImg, threshold).mask
    #     return probImg, mask
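A sketch of driving Classifier as a per-level cascade, which is what setLevel and pruneFeatures suggest; `trained_models`, the starting mask, and the threshold are assumptions.

# Sketch of a level-by-level cascade; `trained_models` is a hypothetical list
# of fitted estimators, one per level.
clf = Classifier(setID, data, vshape)
mask = np.ones(data.shape, dtype=np.bool)  # begin with every voxel as a candidate
for level, trained_model in enumerate(trained_models, start=1):
    clf.setLevel(level, trained_model)
    probImg, mask = clf.generateProbabilityVolume(mask, threshold=0.01)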
def getConfusionMatrix(self, y_true, y_pred):
    return confusion_matrix(y_true, y_pred)

def getPrecisionRecallF1Score(self, y_true, y_pred):
    return precision_recall_fscore_support(y_true, y_pred, average='micro')

def getAccuracy(self, y_true, y_pred):
    return accuracy_score(y_true, y_pred)


if __name__ == '__main__':
    # locate corpus data
    xmlcorpora = "../data/corpora/AZ_distribution/"
    # generate gold-standard feature vectors
    featureGen = FeatureGenerator(xmlcorpora)
    # initialise and train NB model
    classifier = NaiveBayes(features=featureGen.features, distribution="Bernoulli", train_split=0.8)
    classifier.train()
    # analysis
    confusionMatrix, precisionRecallF1, accuracy = classifier.test()
    classifier.plotConfusionMatrix(confusionMatrix, range(8))
    print("=== Accuracy: %f ===" % accuracy)
    # generate summary
    classifier.getSummary('9405001.az-scixml')
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    trainingreviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    testReviews = Review.readReviewsFromXML("../old-test-data.xml")

    trainCorpus = Corpus(trainingreviews, lemmatizer, POS_tagging=True)
    # this dictionary will be used for both training and test data
    dictionary = Dictionary(trainCorpus)
    generator = FeatureGenerator(trainCorpus, dictionary, '../train.csv', weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()

    testCorpus = Corpus(testReviews, lemmatizer, POS_tagging=True)
    generator = FeatureGenerator(testCorpus, dictionary, '../test.csv', weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()