Beispiel #1
0
    def pretreatment(self):
        """Derive all training artefacts from the raw Excel file and save them.

        Reads three parallel sequences (titles, contents, result values) from
        ``self.origin_data_file``, maps every negative result value to ``-1``,
        and then builds four feature sets with a ``PreTreater``:

        * content features against the word dictionary,
        * title features against the word dictionary,
        * content features against the word-id (score) dictionary,
        * title features against the word-id (score) dictionary.

        The dictionaries and feature sets are written with ``np.save`` to the
        paths held in the corresponding ``self.*_file`` attributes, and
        ``self.create_random_seed`` is finally called with the sample count.

        NOTE(review): the ``PreTreater`` calls are kept in their original
        order because the helper may carry state between calls (e.g.
        ``getdict`` right after ``get_keywords``) -- confirm before reordering.
        """
        titles, contents, labels = self.DT.read_excel(self.origin_data_file)

        # Collapse every negative result value onto -1, in place.
        for pos, label in enumerate(labels):
            if label < 0:
                labels[pos] = -1

        treater = PreTreater()
        content_keys = treater.get_keywords(contents)

        word_dict = treater.getdict()
        content_features = treater.create_train_data_dict(word_dict,
                                                          content_keys)

        # Title features; computed unconditionally (a model_dict['lrTmodel']
        # guard used to sit here but is disabled).
        title_keys = treater.get_keywords(titles, all_tag=True)
        title_features = treater.create_train_data_dict(word_dict, title_keys)
        np.save(self.wd_dict_file, [word_dict])
        np.save(self.data_title_file, [title_features])

        # Score-dictionary features; also unconditional (the
        # model_dict['scoreModel'] guard is disabled as well).
        word_id_dict, id_score_dict = treater.get_score_dict()
        content_score_features = treater.create_train_data_dict(word_id_dict,
                                                                content_keys)
        np.save(self.wd_id_dict_file, [word_id_dict])
        np.save(self.id_score_dict_file, [id_score_dict])
        np.save(self.data_score_file, [content_score_features])

        title_score_features = treater.create_train_data_dict(word_id_dict,
                                                              title_keys)
        np.save(self.data_score_title_file, [title_score_features])

        # Content features are stored together with the result values.
        np.save(self.data_file, [content_features, np.array(labels)])
        self.create_random_seed(len(labels))
Beispiel #2
0
    def pre_data_treate(self, filename):
        """Convert the Excel file ``filename`` into the feature sets used for prediction.

        Args:
            filename: path handed to ``self.DT.read_excel``; the first two
                returned values are used as titles and contents, the third is
                ignored.

        Returns:
            A list ``[testdata, testdata_title, test_score_data,
            test_score_data_title]``: content and title features built against
            the word dictionary, followed by content and title features built
            against the word-id (score) dictionary.

        Raises:
            Whatever ``np.load`` raises when a dictionary file is missing or
            unreadable, and ``ValueError`` if a loaded array does not hold
            exactly one element.
        """
        test_title, test_content, _unused = self.DT.read_excel(filename)

        # Each file is expected to hold a one-element object array wrapping a
        # dict (the single-name unpacking below requires exactly one element).
        # Object arrays are pickled, and NumPy >= 1.16.3 refuses to load them
        # unless allow_pickle=True is passed explicitly.
        # NOTE(security): unpickling can execute arbitrary code -- only point
        # these attributes at files this program wrote itself.
        [wd_dict] = np.load(self.wd_dict_file, allow_pickle=True)
        [wd_score_dict] = np.load(self.wd_id_dict_file, allow_pickle=True)

        PT = PreTreater()
        keydata = PT.get_keywords(test_content)
        testdata = PT.create_train_data_dict(wd_dict, keydata)
        test_score_data = PT.create_train_data_dict(wd_score_dict, keydata)

        # NOTE(review): pretreatment() extracts title keywords with
        # all_tag=True, while this call uses the default -- confirm that the
        # difference between training and prediction is intended.
        keydata_title = PT.get_keywords(test_title)
        testdata_title = PT.create_train_data_dict(wd_dict, keydata_title)
        test_score_data_title = PT.create_train_data_dict(wd_score_dict,
                                                          keydata_title)

        return [testdata, testdata_title, test_score_data,
                test_score_data_title]
Beispiel #3
0
    def cacul(self, x_te, id_score_dict):
        """Score each row of ``x_te`` by summing the scores of its non-zero columns.

        Args:
            x_te: 2-D feature matrix, either a scipy sparse matrix or a dense
                array. Only the positions of the non-zero entries are used,
                not their values.
            id_score_dict: mapping whose values are equal-length sequences
                with the score at index 1. The i-th value in iteration order
                is taken as the score of column i.
                NOTE(review): this relies on the iteration order matching the
                column ids -- confirm against the code that builds the dict.

        Returns:
            A 1-D float array with one entry per row: the raw sums divided by
            ``max - min`` of the raw sums. When there are no rows, or all raw
            sums are equal (zero range), the raw sums are returned unscaled
            instead of dividing by zero.
        """
        # list(...) is needed on Python 3, where dict.values() is a view that
        # NumPy would wrap in a 0-d object array instead of a 2-D table.
        score_table = np.array(list(id_score_dict.values()))
        id_score_vector = np.array(score_table[:, 1], dtype=float)

        n_rows = x_te.shape[0]
        sum_score = np.zeros(n_rows, dtype=float)
        for row_idx in range(n_rows):
            # nonzero() gives (rows, cols) for a sparse / 2-D row and (cols,)
            # for a 1-D dense row; the last item is the column indices in
            # both cases.
            col = x_te[row_idx].nonzero()[-1]
            sum_score[row_idx] = np.sum(id_score_vector[col])

        if n_rows == 0:
            return sum_score

        score_range = sum_score.max() - sum_score.min()
        if score_range == 0:
            # All rows scored the same: scaling is undefined, keep raw sums.
            return sum_score
        return sum_score / score_range

if __name__ == '__main__':
    # Manual smoke test: rebuild the id->score dictionary file from the score
    # list and run the model on a tiny hand-made sparse matrix.
    DSM = DictScoreModel()
    PT = PreTreater()
    wd_score_idx, id_score_idx = PT.get_score_dict('../data/score.txt')
    np.save('../data/id_score_dict.npy', [id_score_idx])
    from scipy.sparse import csr_matrix
    # CSR triple (data, indices, indptr): row 0 has ones in columns 5 and 10,
    # row 1 has a one in column 22.
    content = csr_matrix(([1, 1, 1], [5, 10, 22], [0, 2, 3]),
                         shape=(2, 100),
                         dtype=float)
    # A parenthesised single-argument print behaves identically on Python 2
    # and is valid syntax on Python 3.
    print(DSM.predict(content))
Beispiel #4
0
        else:
            print 'failed to load the file of wd_score_dict'
            sys.exit(2)
            
    def cacul(self, x_te, id_score_dict):
        """Score each row of ``x_te`` by summing the scores of its non-zero columns.

        Args:
            x_te: 2-D feature matrix, either a scipy sparse matrix or a dense
                array. Only the positions of the non-zero entries are used,
                not their values.
            id_score_dict: mapping whose values are equal-length sequences
                with the score at index 1. The i-th value in iteration order
                is taken as the score of column i.
                NOTE(review): this relies on the iteration order matching the
                column ids -- confirm against the code that builds the dict.

        Returns:
            A 1-D float array with one entry per row: the raw sums divided by
            ``max - min`` of the raw sums. When there are no rows, or all raw
            sums are equal (zero range), the raw sums are returned unscaled
            instead of dividing by zero.
        """
        # list(...) is needed on Python 3, where dict.values() is a view that
        # NumPy would wrap in a 0-d object array instead of a 2-D table.
        score_table = np.array(list(id_score_dict.values()))
        id_score_vector = np.array(score_table[:, 1], dtype=float)

        n_rows = x_te.shape[0]
        sum_score = np.zeros(n_rows, dtype=float)
        for row_idx in range(n_rows):
            # nonzero() gives (rows, cols) for a sparse / 2-D row and (cols,)
            # for a 1-D dense row; the last item is the column indices in
            # both cases.
            col = x_te[row_idx].nonzero()[-1]
            sum_score[row_idx] = np.sum(id_score_vector[col])

        if n_rows == 0:
            return sum_score

        score_range = sum_score.max() - sum_score.min()
        if score_range == 0:
            # All rows scored the same: scaling is undefined, keep raw sums.
            return sum_score
        return sum_score / score_range
    
if __name__ == '__main__':
    # Manual smoke test: rebuild the id->score dictionary file from the score
    # list and run the model on a tiny hand-made sparse matrix.
    DSM = DictScoreModel()
    PT = PreTreater()
    wd_score_idx, id_score_idx = PT.get_score_dict('../data/score.txt')
    np.save('../data/id_score_dict.npy', [id_score_idx])
    from scipy.sparse import csr_matrix
    # CSR triple (data, indices, indptr): row 0 has ones in columns 5 and 10,
    # row 1 has a one in column 22.
    content = csr_matrix(([1, 1, 1], [5, 10, 22], [0, 2, 3]),
                         shape=(2, 100), dtype=float)
    # A parenthesised single-argument print behaves identically on Python 2
    # and is valid syntax on Python 3.
    print(DSM.predict(content))