Example #1
    def create_descriptors_pca(self, dim=90):
        '''
        Compute the PCA projection of the stored descriptors.
        :param dim: number of principal components to keep
        :return: None; the projections and PCA parameters are saved to the database
        '''
        print("start create_descriptors_pca ...")
        query = DB.DescriptorModel.select(
            DB.DescriptorModel.id,
            DB.DescriptorModel.descriptor).tuples().iterator()
        # Each row is [feature id, descriptor...]; a list comprehension keeps
        # this working on Python 3, where map() returns a lazy iterator.
        features = numpy.array([[x[0]] + list(x[1]) for x in query])
        print("create_descriptors_pca,count=%d,dim=%d" % (len(features), dim))
        start = time()
        print("build eigenvectors start time %s" % start)

        # Fit PCA on the descriptor columns (column 0 is the feature id)
        # and project every descriptor into the reduced dim-dimensional space.
        mean, eigenvectors = cv2.PCACompute(features[:, 1:],
                                            None,
                                            maxComponents=dim)
        fitted = cv2.PCAProject(features[:, 1:], mean, eigenvectors)
        #pca = PCA(n_components=dim)
        #fitted = pca.fit_transform(features[:,1:])
        print("build eigenvectors cost time %s" % (time() - start))
        print("saving data ...")

        #scaler = preprocessing.MinMaxScaler()
        #pca = scaler.fit_transform(pca)
        DB.db.connect()
        with DB.db.transaction():
            # Rebuild the PCA table from scratch inside a single transaction.
            DB.PcaModel.drop_table(fail_silently=True)
            DB.PcaModel.create_table()

            #res = DB.TrainingResult()
            #res.name = "daisy_pca"
            #res.data = pca
            #res.save()

            # Persist one projected row per original descriptor.
            for i in range(len(fitted)):
                model = DB.PcaModel()
                model.pca = fitted[i]
                model.feature = features[i][0]
                model.save()

            # Replace any previously stored PCA parameters.
            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == "pca_mean").execute()
            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == "pca_eigenvectors").execute()
            tr = DB.TrainingResult()
            tr.name = "pca_mean"
            tr.data = mean
            tr.save()

            tr = DB.TrainingResult()
            tr.name = "pca_eigenvectors"
            tr.data = eigenvectors
            tr.save()

        print("create_descriptors_pca done")
Example #2
    def create_classifier(self):
        DB.db.connect()
        clf = SGDClassifier(loss="modified_huber")
        labs_map = NameToIndex()

        with DB.db.transaction():
            offset = 0
            words_count = self.get_words_count()
            classes = numpy.arange(0, words_count)
            x_all = []
            y_all = []
            while True:
                print(' %d partial_fit %d' % (time(), offset))
                query = DB.Vocabulary\
                    .select(DB.Vocabulary.lv1, DB.Vocabulary.lv2)\
                    .join(DB.PcaModel, on=(DB.Vocabulary.feature == DB.PcaModel.feature))\
                    .order_by(DB.Vocabulary.feature)\
                    .offset(offset).limit(1000)\
                    .tuples().iterator()
                features = numpy.array([[x[0]] + list(x[1]) for x in query])
                offset += len(features)
                if len(features) == 0:
                    break

                Y = features[:, 0]
                X = features[:, 1:]

                # Map raw labels to consecutive integer indices for the classifier.
                labs = [labs_map.map(lab) for lab in Y]

                # Keep a capped sample of seen batches to track training accuracy.
                if len(x_all) < 10000:
                    x_all += X.tolist()
                    y_all += labs
                labs = numpy.array(labs)

                #clf = LinearSVC()
                #clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
                #clf.fit(X,labs)
                clf.partial_fit(X, labs, classes)
                print(clf.score(x_all, y_all))

            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == self.__class__.__name__ +
                "_clf").execute()
            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == self.__class__.__name__ +
                "_labs_map").execute()

            tr = DB.TrainingResult()
            tr.name = self.__class__.__name__ + "_clf"
            tr.data = clf
            tr.save()

            tr = DB.TrainingResult()
            tr.name = self.__class__.__name__ + "_labs_map"
            tr.data = labs_map
            tr.save()
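
Because create_classifier only ever sees one batch at a time, the same pattern can be mirrored on synthetic data. A minimal sketch, assuming scikit-learn; the class count, batch size, and feature width are made up. The key constraint is that partial_fit must be told the full label set up front, since any single batch may contain only a few of the classes:

    from sklearn.linear_model import SGDClassifier
    import numpy

    rng = numpy.random.RandomState(0)
    clf = SGDClassifier(loss="modified_huber")
    classes = numpy.arange(3)  # every label the stream can ever produce

    # Train incrementally, one mini-batch at a time.
    for _ in range(10):
        X = rng.rand(100, 20)
        y = rng.randint(0, 3, size=100)
        clf.partial_fit(X, y, classes=classes)

    print(clf.predict(rng.rand(5, 20)))

The "modified_huber" loss is what makes predict_proba available on an SGDClassifier, which is presumably why it was chosen over the default hinge loss.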
Example #3
    def cluster_words_all(self):
        '''
        Cluster all samples.
        '''

        print "start cluster_words_all ..."
        offset = 0
        limit = 300
        cluster = MiniBatchKMeans(n_clusters=100, verbose=1)
        # First pass: stream the PCA rows and update the centroids batch by batch.
        while True:
            print(' %d partial_fit %d' % (time(), offset))

            query = DB.PcaModel.select(DB.PcaModel.feature,DB.PcaModel.pca)\
                .offset(offset).limit(limit).tuples().iterator()

            features = numpy.array([[x[0]] + list(x[1]) for x in query])
            if len(features) == 0:
                break
            offset += len(features)
            X = features[:, 1:]
            cluster.partial_fit(X)

        DB.db.connect()
        with DB.db.transaction():
            DB.Vocabulary.drop_table(fail_silently=True)
            DB.Vocabulary.create_table()
            DB.Words.drop_table(fail_silently=True)
            DB.Words.create_table()

            # Second pass: re-stream the data and assign each sample to a word.
            offset = 0
            while True:
                query = DB.PcaModel.select(
                    DB.PcaModel.feature, DB.PcaModel.pca).offset(offset).limit(
                        1000).tuples().iterator()
                features = numpy.array(
                    [[x[0]] + list(x[1]) for x in query])
                if len(features) == 0:
                    break
                offset += len(features)
                X = features[:, 1:]
                Y = features[:, 0]
                res = cluster.predict(X)

                for i in range(len(res)):
                    DB.Words.insert(id=res[i]).upsert().execute()
                    DB.Vocabulary.insert(word=res[i], feature=Y[i]).execute()

            # Store the fitted cluster model once, after all batches are written.
            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == self.__class__.__name__ +
                "_clf").execute()

            tr = DB.TrainingResult()
            tr.name = self.__class__.__name__ + "_clf"
            tr.data = cluster
            tr.save()

        #print "%d words, %d core samples, %d noise"%(len(types.keys()),len(res.core_sample_indices_), len(types[-1]) )

        print "done cluster_words_all"
        #self.display_words()
        return cluster
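
The two-pass structure of cluster_words_all (fit centroids incrementally, then re-stream the data to assign labels) reduces to a few lines without the database. A minimal sketch, assuming scikit-learn; the cluster count, batch size, and dimensionality are illustrative only:

    from sklearn.cluster import MiniBatchKMeans
    import numpy

    rng = numpy.random.RandomState(0)
    cluster = MiniBatchKMeans(n_clusters=5)

    # Pass 1: update the centroids one mini-batch at a time, so the
    # full data set never needs to fit in memory.
    for _ in range(20):
        cluster.partial_fit(rng.rand(300, 16))

    # Pass 2: assign samples to the learned centroids (the "words").
    print(cluster.predict(rng.rand(10, 16)))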