    def computeBaseLine(self, baselineType=0):
        # naive baseline: predict this week's popularity with last week's score
        iterWeek, endWeek = datetime(2013, 3, 23), datetime(2013, 4, 20)
        db = DBController()
        fg = FeatureGenerator()
        baselineScore = 0
        while iterWeek <= endWeek:
            lastWeek = iterWeek - timedelta(weeks=1)
            featureList = db.getFeatureListByWeek(iterWeek)
            y_pred, y_test = [], []
            for featureVector in featureList:
                songId = featureVector["id"]
                lastWeekRank = db.getTop50Rank(lastWeek, songId)
                if lastWeekRank is None:
                    lastWeekScore = 0
                else:
                    lastWeekScore = fg.rankToPopScore(lastWeekRank)
                currentWeekRank = featureVector["rank"]
                currentWeekScore = fg.rankToPopScore(currentWeekRank) if currentWeekRank is not None else lastWeekScore
                y_pred.append(lastWeekScore)
                y_test.append(currentWeekScore)
            y_pred, y_test = self.getRankArray(numpy.asarray(y_pred)), self.getRankArray(numpy.asarray(y_test))
            if baselineType == 0:
                baselineScore += self.getRankEvalationScore(y_pred, y_test)
            elif baselineType == 1:
                # sklearn metrics expect (y_true, y_pred)
                baselineScore += metrics.r2_score(y_test, y_pred)
            else:
                baselineScore += metrics.mean_squared_error(y_test, y_pred)
            iterWeek += timedelta(weeks=1)
        baselineScore = baselineScore / 5  # five weekly iterations in the window above
        print baselineScore
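A hedged usage sketch: `Predictor` below is a stand-in name for the (unshown) class that defines computeBaseLine, getRankArray, and getRankEvalationScore.

predictor = Predictor()  # hypothetical class holding this method
predictor.computeBaseLine(baselineType=1)  # prints the mean weekly R^2 over the 5-week window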
Example #2
def main(argv):
    del argv

    # read in configuration
    config = get_model_config('config/basic.yml')

    # read in data
    MyData = DataWarehouse(config)
    MyData.read_data()

    # preprocessing
    if FLAGS.pre_process:
        MyPreprocess = DataPreProcess(MyData, config)
        MyPreprocess.process()

    # generate feature
    if FLAGS.gen_features:
        MyFeature = FeatureGenerator(config)
        MyFeature.process()

    # train & predict
    if FLAGS.train:
        model = Model(MyData, config)

        #---------Generating outputs ---------------
        model.process()
        # ypred = model.predict()
        # MyData.generate_submission(ypred)

    try:
        # logging.info('clear work_dir')
        shutil.rmtree(WORK_DIR)
    except Exception:
        # best-effort cleanup; ignore a missing or locked work dir
        pass
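The `del argv`/`FLAGS` pattern suggests this entry point runs under absl; a minimal launcher sketch, assuming absl-py, with the three boolean flag names inferred from the code above:

from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_boolean('pre_process', False, 'Run the preprocessing step.')
flags.DEFINE_boolean('gen_features', False, 'Generate features.')
flags.DEFINE_boolean('train', True, 'Train the model and predict.')

if __name__ == '__main__':
    app.run(main)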
Example #3
def genFeatures(filename):
    fgen = FeatureGenerator(PhonemeDataFile(filename))
    features = list(fgen.features_vector())
    shuffle(features)
    pivot = int(len(features) * 0.8)
    test, train = features[pivot:], features[:pivot]  # 20% test, 80% train
    return test, train
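A short usage sketch (the data file path is a placeholder; `genFeatures` shuffles, then returns the 20% test and 80% train halves):

test, train = genFeatures("phonemes.data")  # hypothetical path
print "len(train)=%d, len(test)=%d" % (len(train), len(test))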
def print_train_feature_file(options):
    feature_config = setPrintFeatureConfig(options.config_file)
    printer = FeatureGenerator(feature_config)
    feature_strs = printer.get_train_features(options.key)

    with open(options.output_file, 'w') as output_file:
        output_file.write('\n'.join(feature_strs))
def print_test_feature_file(options):
    feature_config = setPrintFeatureConfig(options.config_file)
    printer = FeatureGenerator(feature_config)
    feature_strs = printer.get_test_features(options.id,
                                             only_gold=options.only_gold_align)

    with open(options.output_file, 'w') as output_file:
        output_file.write('\n'.join(feature_strs))
def classifyData(model, sourceDocuments, systemSummaries, referenceSummaries):
    featureGenerator = FeatureGenerator()
    featureData = featureGenerator.generateFeatureScores(
        sourceDocuments, systemSummaries, referenceSummaries, None)

    vectorGenerator = VectorGenerator()
    featureVector = vectorGenerator.generateFeatureVector(featureData, 0, 0, 1)

    return model.predict([featureVector])
Exemple #7
0
def testWord(nnfile, word):
    pcas, phones, nn = loadNN(nnfile)
    fgen = FeatureGenerator(None)
    wordc = list(word.lower())
    vectors = fgen.word_vectors(wordc)
    print "For word: %s" % word
#    print "Phonemes: %d, %s" % (len(phones), str(phones))
    for c, v in vectors:
        pp = zip(nn.run(v), phones)
#        print sorted(pp, reverse=True)
        (_, best_pronunciation) = max(pp)  # the phone with the highest activation
        print "char: %s, pronunciation: %s" % (c, best_pronunciation)
    def train(self, beginWeek, endWeek, featureMode=0, regressionModelType=0):
        if beginWeek < datetime(2007, 1, 1) or endWeek > datetime.today():
            raise Exception("Invalid input date!")
        beginWeek, endWeek = dateToSaturday(beginWeek), dateToSaturday(endWeek)
        # dateToSaturday may round endWeek past today; step back one week if so
        endWeek = endWeek - timedelta(days=7) if endWeek > datetime.today() else endWeek
        iterWeek = beginWeek
        fg = FeatureGenerator()
        regression = self.getRegressionModel(regressionModelType)
        while iterWeek <= endWeek:
            matrix_train = fg.getFeatureMatrix(iterWeek, iterWeek, featureMode)
            X_train, y_train = matrix_train[:, 0:-1], matrix_train[:, -1]
            # note: fit() retrains from scratch each call, so only the last
            # week's fit is kept unless the model supports incremental fitting
            regression.fit(X_train, y_train)
            iterWeek += timedelta(weeks=1)
        return regression
    def get_lexicon_features(self, inst_tokens, pos_lexicon, neg_lexicon):
        lemmatizer = WordNetLemmatizer()
        pos_words = []
        neg_words = []
        with open(pos_lexicon) as pf:
            for line in pf:
                # keep non-empty lines that do not start with a ';' comment
                if re.match("[^;]", line) and len(line) > 1:
                    pos_words.append(line.strip())
        with open(neg_lexicon) as nf:
            for line in nf:
                if re.match("[^;]", line) and len(line) > 1:
                    neg_words.append(line.strip())
        pos_neg_list = []
        pos_len = 0
        neg_len = 0
        inst_len = 0
        for token in inst_tokens:
            word = token.get_text()
            tag = token.get_tag()
            wordnet_tag = FeatureGenerator.get_wordnet_pos(tag)
            if wordnet_tag:
                word = lemmatizer.lemmatize(word, wordnet_tag).encode('utf-8')
                # word = str(lemmatizer.lemmatize(word, wordnet_tag))
            else:
                word = lemmatizer.lemmatize(word).encode('utf-8')
                # word = str(lemmatizer.lemmatize(word))
            if word in pos_words:
                pos_len += 1
            elif word in neg_words:
                neg_len += 1
            inst_len += 1
        # float division: pos_len / inst_len truncates to 0 under Python 2
        inst_len = max(inst_len, 1)  # guard against an empty token list
        pos_neg_list.append(float(pos_len) / inst_len)
        pos_neg_list.append(float(neg_len) / inst_len)

        return pos_neg_list
class QueryClassifier:

    def __init__(self, cl, lm):
        self.cl = cl
        self.fg = FeatureGenerator(lm)

    def is_correct(self, query, words):
        x = [self.fg.generate_features(query, words)]
        return self.cl.predict(x)[0]
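A usage sketch, assuming `clf` is any trained classifier exposing `predict()` (e.g. a scikit-learn estimator) and `lm` is the language model that `FeatureGenerator(lm)` expects:

qc = QueryClassifier(clf, lm)  # clf and lm are hypothetical, supplied by the caller
if qc.is_correct("new yrok times", ["new", "york", "times"]):
    print "query judged correct"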
Example #11
def makeNN(filename, outputfile, hidden, pca, layers):
    fgen = FeatureGenerator(PhonemeDataFile(filename))
    features, pcas = list(fgen.features_vector(pca))
    shuffle(features)
    split = int(len(features) * 0.8)
    train = features[:split]  # the larger set, for training
    test = features[split:]

    num_input = len(train[0][0])
    num_output = len(train[0][1])
    inputVars = tuple([num_input] + [hidden] * layers + [num_output])
    print "Making NN with: %s" % str(inputVars)
    print "len(train)=%d, len(test)=%d" % (len(train), len(test))

    network = NeuralNet(inputVars)
    network.train(train, test, debug=True)
    if network.save(pcas, list(fgen.phones), outputfile):
        print "Saved nn successfully"
    else:
        print "Error while saving nn"
    def test(self, model, week, featureMode=0, outputSong=False, x=10):
        fg = FeatureGenerator()
        matrix_test = fg.getFeatureMatrix(week, week, featureMode, True)
        # column 0 holds the song ids, the last column the target score
        X_test, y_test = matrix_test[:, 1:-1], matrix_test[:, -1]
        songIdList = matrix_test[:, 0].tolist()
        songIdList = list(itertools.chain(*songIdList))
        y_pred = model.predict(X_test)
        y_pred = self.getRankArray(y_pred)
        y_test = self.getRankArray(y_test)
        r2Score = metrics.r2_score(y_test, y_pred)
        meanSquare = metrics.mean_squared_error(y_test, y_pred)
        rankEvalScore = self.getRankEvalationScore(y_pred, y_test)
        print "r2 score: ", r2Score, ", mean square error: ", meanSquare, ", rank score: ", rankEvalScore
        predTopX, realTopX = self.outputTopX(songIdList, y_pred.tolist()), self.outputTopX(songIdList, y_test.tolist())
        inclusiveAccuracy, rankMatchAccuracy = self.computTopXAccuracy(predTopX, realTopX)
        print "Top", x, "inclusive accuracy: ", inclusiveAccuracy, ", Top", x, "rank match accuracy: ", rankMatchAccuracy
        if outputSong:
            print self.outputTopXSongNames(predTopX), "\n", self.outputTopXSongNames(realTopX)
        return r2Score, meanSquare, rankEvalScore
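A sketch tying `train` and `test` together (dates are illustrative; `Predictor` is a stand-in name for the class that defines both methods):

predictor = Predictor()
model = predictor.train(datetime(2013, 1, 5), datetime(2013, 3, 16))
r2, mse, rankScore = predictor.test(model, datetime(2013, 3, 23), outputSong=False)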
Example #14
def trainModel(fileName, sourceDocuments, systemSummaries, referenceSummaries,
               systemSummaryScores):
    featureGenerator = FeatureGenerator()
    featureData = featureGenerator.generateFeatureScores(
        sourceDocuments, systemSummaries, referenceSummaries,
        "Feature Data/" + fileName + ".json")

    vectorGenerator = VectorGenerator()
    featureVectors = vectorGenerator.generateFeatureVectors(featureData)
    targetVector = vectorGenerator.generateTargetVector(systemSummaryScores)
    [filteredFeatureVectors, filteredTargetVector
     ] = vectorGenerator.filterVectors(featureVectors, targetVector)

    classifier = RandomForestClassifier(n_jobs=-1,
                                        n_estimators=100,
                                        min_samples_split=12,
                                        min_samples_leaf=25)
    classifier.fit(filteredFeatureVectors, filteredTargetVector)

    return classifier
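A hypothetical end-to-end flow combining `trainModel` with the `classifyData` helper shown earlier (the file name and data variables are placeholders supplied by the caller):

model = trainModel("duc2004", sourceDocuments, systemSummaries,
                   referenceSummaries, systemSummaryScores)
label = classifyData(model, sourceDocuments, newSystemSummaries, referenceSummaries)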
    def calculateSetTrainingFeatures(self, myPath, level):
        dfr = DicomFolderReader(myPath)
        dfr.compress()
        setID = dfr.getSetID()
        print("Processing training set {}: '{}'".format(setID, myPath))
        print(datetime.datetime.now())
        cc = dfr.getCoordinateConverter()
        finder = PixelFinder(myPath, cc)
        data = dfr.getVolumeData()
        shape = dfr.getVolumeShape()
        vshape = dfr.getVoxelShape()
        fgen = FeatureGenerator(setID, data, vshape, level)
        nbNodules = len(finder.Reader.Nodules)
        print("\tFound {} nodule(s).".format(nbNodules))
        assert nbNodules > 0

        maskP, maskN, nbNodulePixels = finder.getMasks(
            shape, radiusFactor=RADIUS_FACTOR)
        featuresP = fgen.getAllFeaturesByMask(maskP)
        print("\tProcessed {} nodule pixels.".format(nbNodulePixels))
        featuresN = fgen.getAllFeaturesByMask(maskN)
        print("\tProcessed {} random non-nodule pixels.".format(nbNodulePixels))
        setFeatures = np.vstack([featuresP, featuresN])

        #Create classification vector
        setClasses = np.zeros(setFeatures.shape[0], dtype=np.bool)
        setClasses[0:nbNodulePixels] = True

        #Let's try not to use too much memory
        del finder
        del fgen
        del data
        del cc
        del dfr

        return setFeatures, setClasses
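A sketch of how these per-set results might be aggregated for training (the path list and `trainer` instance are placeholders, not shown on this page):

allFeatures, allClasses = [], []
for myPath in trainingSetPaths:  # hypothetical list of DICOM folders
    f, c = trainer.calculateSetTrainingFeatures(myPath, level=1)
    allFeatures.append(f)
    allClasses.append(c)
X = np.vstack(allFeatures)
y = np.concatenate(allClasses)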
Example #16
    def generateFolds(self, outdir, lemmatizer=None, POS_tagging=False,
                      weightScheme=FeatureWeight.PRESENCE, includeRating=False, includeDocLength=False):
        if self.reviews is None or len(self.reviews) == 0:
            print 'No data to work on'
            return

        trainingData = {}
        validationData = {}
        self.generateKFolds(outdir, trainingData, validationData)

        for i in range(1, self.k + 1):
            print "generating features for fold " + str(i)

            trainCorpus = Corpus(trainingData[str(i)], lemmatizer, POS_tagging)
            # this dictionary is shared by the training and validation data
            dictionary = Dictionary(trainCorpus)
            generator = FeatureGenerator(trainCorpus, dictionary, outdir + '/train' + str(i) + '.csv',
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()

            validCorpus = Corpus(validationData[str(i)], lemmatizer, POS_tagging)
            generator = FeatureGenerator(validCorpus, dictionary, outdir + '/valid' + str(i) + '.csv',
                                         weightScheme, includeRating, includeDocLength)
            generator.generateFeatures()
Example #17
    def __init__(self, weights='imagenet'):
        FeatureGenerator.__init__(self, VGG16, (224, 224), preprocess_input,
                                  weights)
Example #18
class DeepModel:
    def __init__(self, config, load_path=None, w2v=None, db=None, stats=None, dmodel=None):
        '''
        Creates a new NN model configured by a json.

        config is either a dict or a path to a json file

        json structure:
        {
            strip_stop_words=[boolean]
            context_window_size=[int]
            max_mention_words=[int]
            dropout=[0.0 .. 1.0]
            feature_generator={mention_features={feature names...}, entity_features={feature names...}}

            finetune_embd=[boolean]
            pairwise=[boolean]
            inputs=[list out of ['candidates', 'context', 'mention', 'extra_features']]
        }
        '''

        if type(config) in {unicode, str}:
            with open(config) as data_file:
                self._config = json.load(data_file)
        else:
            self._config = config
        self._stopwords = stopwords.words('english') if self._config['strip_stop_words'] else None

        self._word_dict = None
        self._concept_dict = None

        self._db = db
        self._batch_left_X = []
        self._batch_right_X = []
        self._batch_candidate1_X = []
        self._batch_candidate2_X = []
        self._batch_mention_X = []
        self._batch_extra_features_X = []
        self._batchY = []
        self.train_loss = []
        self._batch_size = 128
        self.inputs = {x for x in self._config['inputs']}

        if 'feature_generator' in self._config:
            self._feature_generator = FeatureGenerator(mention_features=
                                                       self._config['feature_generator']['mention_features'],
                                                       entity_features=
                                                       self._config['feature_generator']['entity_features'],
                                                       stats=stats, db=db, dmodel=dmodel)
        self.model = None
        self.get_attn_model = None

        if load_path is None:
            self.compileModel(w2v)
        else:
            self.loadModel(load_path)

    def getPredictor(self):
        if self._config['pairwise']:
            return PairwisePredict(self)
        else:
            return PointwisePredict(self)

    def compileModel(self, w2v):
        self._word_dict = w2v.wordDict
        self._concept_dict = w2v.conceptDict

        model_builder = ModelBuilder(self._config, w2v)

        # use the candidates input if it was explicitly requested, or if an
        # attention network processes the context (it needs the candidates as
        # controllers). Both are initialised up front so addContextInput
        # always receives defined arguments.
        candidate1 = None
        candidate2 = None
        if 'candidates' in self.inputs or \
                ('context' in self.inputs and self._config['context_network'] == 'attention'):
            candidate1 = model_builder.addCandidateInput('candidate1_input', to_join='candidates' in self.inputs)
            if self._config['pairwise']:
                candidate2 = model_builder.addCandidateInput('candidate2_input', to_join='candidates' in self.inputs)

        if 'context' in self.inputs:
            model_builder.addContextInput(controller1=candidate1, controller2=candidate2)

        if 'mention' in self.inputs:
            model_builder.addMentionInput()

        inputs = model_builder.inputs
        to_join = model_builder.to_join
        attn = model_builder.attn

        if 'extra_features' in self.inputs:
            n_extra_features = self._feature_generator.numPairwiseFeatures() if self._config['pairwise'] \
                else self._feature_generator.numPointwiseFeatures()
            extra_features_input = Input(shape=(n_extra_features,), name='extra_features_input')
            inputs.append(extra_features_input)
            to_join.append(extra_features_input)

        # join all inputs
        x = merge(to_join, mode='concat') if len(to_join) > 1 else to_join[0]

        # build classifier model
        for c in self._config['classifier_layers']:
            x = Dense(c, activation='relu')(x)
        if 'dropout' in self._config:
            x = Dropout(float(self._config['dropout']))(x)
        out = Dense(2, activation='softmax', name='main_output')(x)

        model = Model(input=inputs, output=[out])
        model.compile(optimizer='adagrad', loss='binary_crossentropy')
        self.model = model
        self.get_attn_model = Model(input=inputs, output=attn)
        print "model compiled!"

    def _2vec(self, mention, candidate1, candidate2):
        """
        Transforms input to w2v vectors
        returns a tuple: (wikilink vec, candidate1 vec, candidate2 vec)

        if cannot produce wikilink vec or vectors for both candidates then returns None
        if cannot produce vector to only one of the candidates then returns the id of the other
        """
        if (candidate1 is None or candidate1 not in self._concept_dict) and \
                (candidate2 is None or candidate2 not in self._concept_dict):

            return None
        if self._config['pairwise']:
            if candidate1 is None or candidate1 not in self._concept_dict:
                # only candidate2 has an embedding; fall back to it
                return candidate2
            if candidate2 is None or candidate2 not in self._concept_dict:
                return candidate1

        candidate1_X = None
        candidate2_X = None
        left_context_X = None
        right_context_X = None
        mention_X = None
        extra_features_X = None

        # get candidate inputs
        if 'candidates' in self.inputs:
            candidate1_X = np.array([self._concept_dict[candidate1]]) if candidate1 is not None else None
            candidate2_X = np.array([self._concept_dict[candidate2]]) if candidate2 is not None else None

        # get context input
        if 'context' in self.inputs:
            left_context_X = self.wordIteratorToIndices(mention.left_context_iter(),
                                                        self._config['context_window_size'])
            right_context_X = self.wordIteratorToIndices(mention.right_context_iter(),
                                                         self._config['context_window_size'])

        # get mention input
        if 'mention' in self.inputs:
            mention_X = self.wordIteratorToIndices(mention.mention_text_tokenized(),
                                                   self._config['max_mention_words'])

        if 'extra_features' in self.inputs:
            if self._config['pairwise']:
                extra_features_X = \
                    np.array(self._feature_generator.getPairwiseFeatures(mention, candidate1, candidate2))
            else:
                extra_features_X = \
                    np.array(self._feature_generator.getPointwiseFeatures(mention, candidate1))

        return left_context_X, right_context_X, mention_X, candidate1_X, candidate2_X, extra_features_X

    def wordIteratorToIndices(self, it, output_len):
        o = []
        for w in it:
            w = w.lower()
            if len(o) >= output_len:
                break
            if w in self._word_dict and (self._stopwords is None or w not in self._stopwords):
                o.append(self._word_dict[w])
        if len(o) == 0:
            o.append(self._word_dict[DUMMY_KEY])
        o = o[::-1]
        arr = np.zeros((output_len,))
        n = len(o) if len(o) <= output_len else output_len
        arr[:n] = np.array(o)[:n]
        return arr

    def get_context_indices(self, it, output_len):
        words = []
        indices = []
        for i, w in enumerate(it):
            w = w.lower()
            words.append(w)
            if len(indices) >= output_len:
                break
            if w in self._word_dict and (self._stopwords is None or w not in self._stopwords):
                indices.append(i)
        return words, indices

    def train(self, mention, candidate1, candidate2, correct):
        """
        Takes a single example to train
        :param mention:    The mention to train on
        :param candidate1:  the first candidate
        :param candidate2:  the second candidate
        :param correct:     which of the two is correct (expected output)
        """
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return # nothing to train on

        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X) = vecs
        Y = np.array([1,0] if candidate1 == correct else [0,1])
        self._trainXY(left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X, Y)

    def _trainXY(self, left_X, right_X, mention_X, candidate1_X, candidate2_X, extra_features_X, Y):
        self._batch_left_X.append(left_X)
        self._batch_right_X.append(right_X)
        self._batch_mention_X.append(mention_X)
        self._batch_candidate1_X.append(candidate1_X)
        self._batch_candidate2_X.append(candidate2_X)
        self._batch_extra_features_X.append(extra_features_X)
        self._batchY.append(Y)

        if len(self._batchY) >= self._batch_size:
            batchX = {}
            if 'candidates' in self.inputs:
                batchX['candidate1_input'] = np.array(self._batch_candidate1_X)
                if self._config['pairwise']:
                    batchX['candidate2_input'] = np.array(self._batch_candidate2_X)
            if 'context' in self.inputs:
                batchX['left_context_input'] = np.array(self._batch_left_X)
                batchX['right_context_input'] = np.array(self._batch_right_X)
            if 'mention' in self.inputs:
                batchX['mention_input'] = np.array(self._batch_mention_X)
            if 'extra_features' in self.inputs:
                batchX['extra_features_input'] = np.array(self._batch_extra_features_X)
            batchY = np.array(self._batchY)

            loss = self.model.train_on_batch(batchX, batchY)
            self.train_loss.append(loss)
            print 'Done batch. Size of batch - ', batchY.shape, '; loss: ', loss

            self._batch_left_X = []
            self._batch_right_X = []
            self._batch_mention_X = []
            self._batch_candidate1_X = []
            self._batch_candidate2_X = []
            self._batch_extra_features_X = []
            self._batchY = []

    def plotTrainLoss(self, fname, st=0):
        plt.plot(self.train_loss[st:])
        plt.ylabel('Loss')
        plt.xlabel('Batch')
        plt.savefig(fname)

    def finalize(self):
        pass

    def saveModel(self, fname):
        with open(fname+".model", 'w') as model_file:
            model_file.write(self.model.to_json())
        self.model.save_weights(fname + ".weights", overwrite=True)

        with open(fname+".w2v.def", 'w') as f:
            f.write(json.dumps(self._word_dict)+'\n')
            f.write(json.dumps(self._concept_dict)+'\n')
        return

    def loadModel(self, fname):
        with open(fname+".model", 'r') as model_file:
            self.model = model_from_json(model_file.read())
        self.model.load_weights(fname + ".weights")

        with open(fname+".w2v.def", 'r') as f:
            l = f.readlines()
            self._word_dict = {str(x): int(y) for x,y in json.loads(l[0]).iteritems()}
            self._concept_dict = {int(x) if str(x) != DUMMY_KEY else DUMMY_KEY: int(y) for x, y in json.loads(l[1]).iteritems()}


        self.model.compile(optimizer='adagrad', loss='binary_crossentropy')

    def predict(self, mention, candidate1, candidate2):
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return vecs
        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extraFeatures_X) = vecs

        X = {}
        if 'candidates' in self.inputs:
            X['candidate1_input'] = candidate1_X.reshape((1, candidate1_X.shape[0],))
            if self._config['pairwise']:
                X['candidate2_input'] = candidate2_X.reshape((1, candidate2_X.shape[0],))
        if 'context' in self.inputs:
            X['left_context_input'] = left_X.reshape((1, left_X.shape[0],))
            X['right_context_input'] = right_X.reshape((1, right_X.shape[0],))
        if 'mention' in self.inputs:
            X['mention_input'] = mention_X.reshape((1, mention_X.shape[0],))
        if 'extra_features' in self.inputs:
            X['extra_features_input'] = np.array(extraFeatures_X.reshape(1,extraFeatures_X.shape[0]))

        Y = self.model.predict(X, batch_size=1)
        return Y[0][0]

    def get_attn(self, mention, candidate1, candidate2):
        vecs = self._2vec(mention, candidate1, candidate2)
        if not isinstance(vecs, tuple):
            return None
        (left_X, right_X, mention_X, candidate1_X, candidate2_X, extraFeatures_X) = vecs

        X = {}
        if 'candidates' in self.inputs:
            X['candidate1_input'] = candidate1_X.reshape((1, candidate1_X.shape[0],))
            if self._config['pairwise']:
                X['candidate2_input'] = candidate2_X.reshape((1, candidate2_X.shape[0],))
        if 'context' in self.inputs:
            X['left_context_input'] = left_X.reshape((1, left_X.shape[0],))
            X['right_context_input'] = right_X.reshape((1, right_X.shape[0],))
        if 'mention' in self.inputs:
            X['mention_input'] = mention_X.reshape((1, mention_X.shape[0],))
        if 'extra_features' in self.inputs:
            X['extra_features_input'] = np.array(extraFeatures_X.reshape(1,extraFeatures_X.shape[0]))

        attn_out = self.get_attn_model.predict(X, batch_size=1)

        left_context, left_indices = self.get_context_indices(mention.left_context_iter(),
                                                              self._config['context_window_size'])
        right_context, right_indices = self.get_context_indices(mention.right_context_iter(),
                                                                self._config['context_window_size'])
        left_attn = [0 for i in xrange(len(left_context))]
        right_attn = [0 for i in xrange(len(right_context))]
        for i in xrange(self._config['context_window_size']):
            if i < len(left_indices):
                left_attn[left_indices[i]] = attn_out[0][0, i]
            if i < len(right_indices):
                right_attn[right_indices[i]] = attn_out[1][0, i]
        return left_context, left_attn, right_context, right_attn
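A sample config matching the documented JSON structure (values are illustrative; `classifier_layers` and `context_network` appear in `compileModel` even though the docstring omits them, and the feature names are placeholders):

config = {
    "strip_stop_words": True,
    "context_window_size": 20,
    "max_mention_words": 5,
    "dropout": 0.5,
    # feature names here are illustrative placeholders
    "feature_generator": {"mention_features": ["prior"],
                          "entity_features": ["relatedness"]},
    "finetune_embd": False,
    "pairwise": True,
    "inputs": ["candidates", "context", "mention", "extra_features"],
    "classifier_layers": [64, 64],
    "context_network": "attention",
}
model = DeepModel(config, w2v=w2v, db=db, stats=stats)  # collaborators supplied by the caller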
Example #19
    def __init__(self, weights='imagenet'):
        FeatureGenerator.__init__(self,
                                  ResNet50, (224, 224),
                                  preprocess_input,
                                  weights,
                                  pooling='avg')
import numpy as np
import os
import cv2
import csv
import warnings
from sklearn.preprocessing import MinMaxScaler
from FeatureGenerator import FeatureGenerator

warnings.filterwarnings('ignore')

feature_generator = FeatureGenerator()
train_path = "../train"
test_path = "../test"
train_images_path = []
test_images_path = []
labels = []
breed_dic = {}

fixed_size = (80, 80)


def generator_train_npy():
    with open(os.path.join("./", "train.csv")) as f:
        reader = csv.reader(f)
        for row in reader:
            img_file_name, breed, label = row
            train_images_path.append(
                os.path.join(train_path, img_file_name + ".jpg"))
            label = int(label)
            labels.append(label)
            breed_dic[label] = breed
    def __init__(self, weights='imagenet'):
        FeatureGenerator.__init__(self, Xception, (299, 299), preprocess_input,
                                  weights, pooling='avg')
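Taken together, these constructors imply a base signature along the lines of FeatureGenerator.__init__(self, model_cls, input_size, preprocess_fn, weights, pooling=None). A sketch of the same pattern for one more Keras backbone (this subclass does not appear on this page):

from keras.applications.inception_v3 import InceptionV3, preprocess_input

class InceptionV3FeatureGenerator(FeatureGenerator):
    # hypothetical subclass following the pattern above;
    # InceptionV3 also expects 299x299 inputs
    def __init__(self, weights='imagenet'):
        FeatureGenerator.__init__(self, InceptionV3, (299, 299),
                                  preprocess_input, weights, pooling='avg')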
Example #22
import pylab as pl
import numpy as np
from DicomFolderReader import DicomFolderReader
from FeatureGenerator import FeatureGenerator
dfr = DicomFolderReader.create("../data/LIDC-IDRI", 50)
data = dfr.getVolumeData()
h, w, d = data.shape
mySlice = 93
mask = np.zeros_like(data, dtype=np.bool)
mask[:, :, mySlice] = True
vshape = dfr.getVoxelShape()

fgen = FeatureGenerator(50, data, vshape, 1)
result = fgen.averaging3DByMask(mask, windowSize=3, vesselSize=7.5)

result = result.reshape((h, w))  # back to a single 2-D slice

pl.subplot(121)
pl.imshow(data[:, :, mySlice], cmap=pl.cm.bone)  # @UndefinedVariable
pl.subplot(122)
pl.imshow(result, cmap=pl.cm.bone)  # @UndefinedVariable
pl.show()
class Classifier:
    def __init__(self, setID, data, vshape):
        self.SetID = setID
        self.Data = data
        self.VoxelShape = vshape
        self.model = None
        self.fgen = None

    def __del__(self):
        del self.SetID
        del self.Data
        del self.VoxelShape
        del self.model
        del self.fgen

    def isLevelset(self):
        return self.fgen is not None and self.model is not None

    def setLevel(self, level, model):
        self.fgen = FeatureGenerator(self.SetID, self.Data, self.VoxelShape,
                                     level)
        self.model = model

#     @staticmethod
#     def generatePixelList2D((h,w)):
#         x, y = np.meshgrid(np.arange(h), np.arange(w))
#         x, y = x.flatten(), y.flatten()
#         points = np.vstack((x,y)).T
#         assert points.shape == (h*w,2)
#
#         del x,y
#         return points
#
#     @staticmethod
#     def generatePixelList3D((h,w,d)):
#         x, y, z = np.meshgrid(np.arange(h), np.arange(w), np.arange(d))
#         x, y, z = x.flatten(), y.flatten(), z.flatten()
#         points = np.vstack((x,y,z)).T
#         assert points.shape == (h*w*d,3)
#
#         del x,y,z
#         return points

    @staticmethod
    def pruneFeatures(allFeatures, allClasses, oldMask, newMask):
        """Selects current level features out of previous level features based on masks."""
        oldIndices = np.where(oldMask.ravel())[0]
        newIndices = np.where(newMask.ravel())[0]
        indices = np.searchsorted(oldIndices, newIndices)
        # allClasses is a 1-D label vector, so index it with a single axis
        return allFeatures[indices, :], allClasses[indices]

    def generateProbabilityVolume(self, mask3D, threshold=0.01):
        if not self.isLevelset():
            raise ValueError("Level not set")

        # reusing the previous level's features might be possible here
        testFeatures = self.fgen.getAllFeaturesByMask(mask3D)

        m, n = testFeatures.shape
        maxChunk = 1000000
        nbRows = maxChunk // n

        # predict in chunks of nbRows rows to bound memory use
        result = np.empty((m, 2))
        for r in np.arange(0, m, nbRows):
            #print "[{}, {}[".format(r, r+nbRows)
            chunk = testFeatures[r:r + nbRows, :]
            result[r:r + nbRows, :] = self.model.predict_proba(chunk)

        #result = self.model.predict_proba(testFeatures)

        probImg = np.zeros(mask3D.shape, dtype=np.float32)
        probImg[mask3D] = result[:, 1]

        mask = probImg > threshold

        return probImg, mask


#     def generateProbabilityImage(self, mask2D, mySlice, threshold=0.01):
#         if not self.isLevelset():
#             raise ValueError("Level not set")
#
#         testFeatures = deque()
#         for px,py in zip(np.where(mask2D)):
#             pixelFeatures = self.fgen.calculatePixelFeatures(px, py, mySlice)
#             testFeatures.append(pixelFeatures)
#
#         testFeatures = np.array(testFeatures)
#         result = self.model.predict_proba(testFeatures)
#
#         probImg = np.zeros(mask2D.shape)
#         probImg[mask2D] = result[:, 1]
#
#         mask = ma.masked_greater(probImg, threshold).mask
#
#         return probImg, mask
    def getConfusionMatrix(self, y_true, y_pred):
        return confusion_matrix(y_true, y_pred)

    def getPrecisionRecallF1Score(self, y_true, y_pred):
        return precision_recall_fscore_support(y_true, y_pred, average='micro')

    def getAccuracy(self, y_true, y_pred):
        return accuracy_score(y_true, y_pred)


if __name__ == '__main__':

    # locate corpus data
    xmlcorpora = "../data/corpora/AZ_distribution/"

    # generate gold-standard feature vectors
    featureGen = FeatureGenerator(xmlcorpora)

    # initialise and train NB model
    classifier = NaiveBayes(features=featureGen.features,
                            distribution="Bernoulli",
                            train_split=0.8)
    classifier.train()

    # Analysis
    confusionMatrix, precisionRecallF1, accuracy = classifier.test()
    classifier.plotConfusionMatrix(confusionMatrix, range(8))
    print("=== Accuracy: %f ===" % (accuracy))

    # generate summary
    classifier.getSummary('9405001.az-scixml')
Example #27
'''
Created on Apr 15, 2013

This is where we invoke modules to generate features for training and test data

@author: naresh
'''
from Review import Review
import nltk
from Corpus import Corpus
from Dictionary import Dictionary
from FeatureGenerator import FeatureGenerator
from FeatureWeight import FeatureWeight

if __name__ == '__main__':
    trainingReviews = Review.readReviewsFromXML("../old-training-shuffled.xml")
    lemmatizer = nltk.WordNetLemmatizer()
    testReviews = Review.readReviewsFromXML("../old-test-data.xml")

    trainCorpus = Corpus(trainingReviews, lemmatizer, POS_tagging=True)
    # this dictionary is shared by the training and test data
    dictionary = Dictionary(trainCorpus)
    generator = FeatureGenerator(trainCorpus, dictionary, '../train.csv', weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()

    testCorpus = Corpus(testReviews, lemmatizer, POS_tagging=True)
    generator = FeatureGenerator(testCorpus, dictionary, '../test.csv', weightScheme=FeatureWeight.TFIDF)
    generator.generateFeatures()