Example #1
def test_normalize():
    # Test normalize function
    # Only tests functionality not used by the tests for Normalizer.
    X = np.random.RandomState(37).randn(3, 2)
    assert_array_equal(normalize(X, copy=False),
                       normalize(X.T, axis=0, copy=False).T)
    assert_raises(ValueError, normalize, [[0]], axis=2)
    assert_raises(ValueError, normalize, [[0]], norm='l3')
def test_normalize():
    """Test normalize function"""
    # Only tests functionality not used by the tests for Normalizer.
    X = np.random.RandomState(37).randn(3, 2)
    assert_array_equal(normalize(X, copy=False),
                       normalize(X.T, axis=0, copy=False).T)
    assert_raises(ValueError, normalize, [[0]], axis=2)
    assert_raises(ValueError, normalize, [[0]], norm='l3')
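A quick sketch of the property these tests rely on (assuming only NumPy and sklearn's public normalize): L2-normalizing the rows of X is the same as L2-normalizing the columns of X.T and transposing back, and every resulting row has unit norm.

import numpy as np
from sklearn.preprocessing import normalize

X = np.random.RandomState(37).randn(3, 2)
rows = normalize(X)              # default: norm='l2', axis=1 (row-wise)
cols = normalize(X.T, axis=0).T  # column-wise on the transpose, then back
assert np.allclose(rows, cols)
assert np.allclose(np.linalg.norm(rows, axis=1), 1.0)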
def build_inputs(files_list, accel_labels, file_label_dict):
    X_seq = []
    y_seq = []
    labels = []
    if os.path.isfile(rootFolder + "experim.file"):
        with open(rootFolder + "experim.file", "rb") as f:
            dump = pickle.load(f)
            return dump[0], dump[1], dump[2]
    else:
        for path in files_list:
            raw_data, target, target_label = get_row_data(path, accel_labels, file_label_dict)
            raw_data, indx = get_features(raw_data, path)
            tmp = pd.DataFrame(normalize(raw_data, axis=0, norm='max'))
            tmp.columns = raw_data.columns
            tmp = tmp[['mean', 'skew', 'standard deviation']]
            processedFeatures = np.array(tmp)
            
            for inputs in range(len(processedFeatures)):
                X_seq.append(processedFeatures[inputs])
                y_seq.append(list(target))
                labels.append(target_label)
        X_ = pd.DataFrame(X_seq)
        y_ = pd.DataFrame(y_seq)
        labels = pd.DataFrame(labels)
    with open(rootFolder + "experim.file", "wb") as f:
        pickle.dump([X_, y_, labels], f, pickle.HIGHEST_PROTOCOL)
    return X_, y_, labels
def build_inputs(files_list, accel_labels, file_label_dict):
    X_seq = []
    y_seq = []
    labels = []
    if os.path.isfile(rootFolder + "experim.file"):
        with open(rootFolder + "experim.file", "rb") as f:
            dump = pickle.load(f)
            return dump[0], dump[1], dump[2]
    else:
        for path in files_list:
            raw_data, target, target_label = get_row_data(
                path, accel_labels, file_label_dict)
            raw_data, indx = get_features(raw_data, path)
            tmp = pd.DataFrame(normalize(raw_data, axis=0, norm='max'))
            tmp.columns = raw_data.columns
            # (optional) tmp.to_csv(path + "Normalized.csv") can be used here
            # to dump the normalized feature frame for inspection.
            tmp = tmp[['mean', 'skew', 'standard deviation']]
            processedFeatures = vectorize(tmp)
            for inputs in range(len(processedFeatures)):
                X_seq.append(np.array(processedFeatures[inputs]))
                y_seq.append(list(target))
                labels.append(target_label)
        X_ = np.array(X_seq)
        y_ = np.array(y_seq)
        labels = np.array(labels)
        with open(rootFolder + "experim.file", "wb") as f:
            pickle.dump([X_, y_, labels], f, pickle.HIGHEST_PROTOCOL)
        return X_, y_, labels
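For reference, a minimal sketch (with a toy frame, not the project's real feature columns) of what normalize(..., axis=0, norm='max') does in build_inputs: each column is divided by its largest absolute value, so the column maxima land at 1 (or -1 for columns dominated by negative values).

import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

raw = pd.DataFrame({'mean': [1.0, 2.0, 4.0], 'skew': [-3.0, 1.5, 0.5]})
scaled = pd.DataFrame(normalize(raw, axis=0, norm='max'), columns=raw.columns)
print(scaled)  # 'mean' divided by 4.0, 'skew' divided by 3.0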
    def query_tf_idf(self,
                     query,
                     do_idf=True,
                     force=False,
                     smooth=False,
                     proba=False,
                     base=10,
                     norm=None,
                     type_tf=1):
        ''' Convert the query to tf.idf values.
            do_idf: if False, use tf weights only
            The IDF variant is selected by the flags:
                default     -- inverse document frequency
                smooth=True -- smooth inverse document frequency
                proba=True  -- probabilistic inverse document frequency
        '''
        N = len(self.documents)
        M = len(self.index)

        matrix = np.zeros((1, M))

        if len(query.query_vector) == 0:
            raise ValueError("Query has not been preprocessed")
        if len(query.query_score) == 0:
            terms = nltk.FreqDist(query.query_vector)
            do_idf = True
            type_tf = 1
        else:
            #do_idf = True
            #type_tf = 1
            terms = dict(zip(query.query_vector, query.query_score))
            #norm = None

        print(dict(terms))
        for (term, tf) in terms.items():

            j = self.get_feature_id(term)
            if (j):
                if do_idf:
                    ni = len(self.index[term])
                    tf = self.tf(tf, base=base, type=type_tf)
                    idf = self.idf(N,
                                   ni,
                                   unary=not do_idf,
                                   smooth=smooth,
                                   proba=proba,
                                   base=base)
                    matrix[0][j] = tf * idf
                else:
                    matrix[0][j] = tf

            # else:
            #     print(term)
            #     matrix[0][j] = 0

        if norm:
            matrix = normalize(matrix, norm=norm, copy=False)

        matrix = csr_matrix(matrix)
        return matrix
    def tf_idf(self,
               do_idf=True,
               force=False,
               smooth=False,
               proba=False,
               base=10,
               norm='l2',
               type_tf=1):
        ''' Convert the index to tf.idf values.
            do_idf: if False, use tf weights only
            The IDF variant is selected by the flags:
                default     -- inverse document frequency
                smooth=True -- smooth inverse document frequency
                proba=True  -- probabilistic inverse document frequency
        '''
        if self.matrix is None or force:
            N = len(self.documents)
            M = len(self.index)

            matrix = np.zeros((N, M))
            for j, term in self.feature_names.items():
                docs = self.index[term]
                ni = len(docs)
                for doc in docs:
                    i = doc.uid
                    tf = self.tf(doc.tf, base=base, type=type_tf)
                    idf = self.idf(N,
                                   ni,
                                   unary=not do_idf,
                                   smooth=smooth,
                                   proba=proba,
                                   base=base)
                    matrix[i][j] = tf * idf

            if norm and len(matrix) > 0:
                matrix = normalize(matrix, norm=norm, copy=False)

            self.matrix = csr_matrix(matrix)
            #self.comatrix()
        return self.matrix
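The self.tf and self.idf helpers are not shown in this example. Below is a plausible sketch of the variants the docstrings name, using standard textbook formulas; the signatures are assumptions, not the original implementation.

import math

def tf(freq, base=10, type=1):
    # type 1: raw term frequency; otherwise log-scaled term frequency
    return freq if type == 1 else (1 + math.log(freq, base) if freq > 0 else 0)

def idf(N, ni, unary=False, smooth=False, proba=False, base=10):
    if unary:                          # tf-only mode: constant idf factor
        return 1.0
    if proba:                          # probabilistic inverse document frequency
        return math.log(max((N - ni) / ni, 1e-12), base)
    if smooth:                         # smooth inverse document frequency
        return math.log(1 + N / ni, base)
    return math.log(N / ni, base)      # plain inverse document frequency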
trainNewsItems = SenetimentNewsItemCollection("data/train-json.txt")

trainingData = numpy.asarray([
    doc2vecModel.infer_vector(newsItem.getWords())
    for newsItem in trainNewsItems.get_news_items()
])
trainingLabels = [
    newsItem.getSentiment() for newsItem in trainNewsItems.get_news_items()
]

multilabelbinarizer = MultiLabelBinarizer()
multilabelbinarizer.fit([['positive', 'negative']])

testData = numpy.asarray([
    doc2vecModel.infer_vector(newsItem.getWords())
    for newsItem in testNewsItems.get_news_items()
])
testLabels = [
    newsItem.getSentiment() for newsItem in testNewsItems.get_news_items()
]

trainingData = normalize(trainingData)
testData = normalize(testData)

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(trainingData, trainingLabels)
lr.predict(testData)
with open('model/sentiment-classifier', 'wb') as fid:
    pickle.dump(lr, fid)

print('Test Accuracy: %.2f' % lr.score(testData, testLabels))
Example #8
    csrtrain = parsetrain2sparse(train, dictlist)
    print('parsetrain2sparse finish %s' % datetime.now())
    csrtest = parsetest2sparse(test, dictlist)
    print('parsetest2sparse finish %s' % datetime.now())
    dump('csrtrain', csrtrain)
    dump('csrtest', csrtest)
    ''' ------------------------------ sparse section divider -----------------------------------'''

    # Use this code instead of the code above to load the sparse matrices directly
    # with open('csrtrain_pickle', 'rb') as f: # float16
    #     csrtrain = pickle.load(f)
    # with open('csrtest_pickle', 'rb') as f:
    #     csrtest = pickle.load(f)
    # csrtrain= csrtrain.astype(np.float64)
    ''' ------------------------------ sparse section divider -----------------------------------'''
    csctrainnor = normalize(csrtrain, norm='l2', axis=0)
    csrtrain = csctrainnor.tocsr()
    # csrtrain = csrtrain.astype(np.float32)
    # csrtest = csrtest.astype(np.float32)

    csrcosine = csrtest.dot(csrtrain)  # matrix multiplication
    print('dot finish %s' % datetime.now())
    # sim_mat = [list(row) for row in csrcosine.toarray()]  # this uses too much memory and hangs the machine

    # sim_mat = similar_mat(test.userlist, train.userlist)  # compute the similarity matrix
    # with open(os.path.join(BasePath, 'temp/sim_mat'), 'wb') as file:  # save the similarity matrix, since this data is important
    #     pickle.dump(sim_mat, file)
    print('start knn')
    knn_mat = similar2knn(csrcosine, 50)  # use the similarity matrix to find the matched users
    step4 = datetime.now()
    print(step4)
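A minimal sketch (with toy matrices, not the project's data) of the usual pattern behind the normalize-then-dot step above: after L2-normalizing rows, A.dot(B.T) gives the cosine similarity between every row of A and every row of B.

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

A = csr_matrix(np.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]))
B = csr_matrix(np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 1.0]]))
cos = normalize(A, norm='l2', axis=1).dot(normalize(B, norm='l2', axis=1).T)
print(cos.toarray())  # entry (i, j) is the cosine of A row i and B row j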
Example #9
    def fit(self,
            k=250,
            shrink=100,
            alpha=None,
            beta=None,
            gamma=None,
            omega=None):
        self.k = k
        self.shrink = shrink
        # Check the parameters for the tuning scenario
        if alpha is not None:
            self.alpha = alpha
        if beta is not None:
            self.beta = beta
        if gamma is not None:
            self.gamma = gamma
        if omega is not None:
            self.omega = omega
        print(
            "Sequential Random Hybrid Recommender mark 2: Model fitting begins"
        )
        # Calculate all the Similarity Matrices One by one
        # URM tfidf --> 50446 x 50446
        self.sim_URM_tfidf = Similarity_old(self.URM_train_tfidf.T,
                                            shrink=0,
                                            verbose=self.verbose,
                                            neighbourhood=200,
                                            mode=self.similarity_mode,
                                            normalize=self.normalize)
        # ICM tfidf --> 20635 x 20635
        self.sim_ICM_tfidf = Similarity_old(self.ICM.T,
                                            shrink=0,
                                            verbose=self.verbose,
                                            neighbourhood=25,
                                            mode=self.similarity_mode,
                                            normalize=self.normalize)
        # URM.T tfidf --> 20635 x 20635
        self.sim_URM_T_tfidf = Similarity_old(self.URM_train_tfidf,
                                              shrink=10,
                                              verbose=self.verbose,
                                              neighbourhood=350,
                                              mode=self.similarity_mode,
                                              normalize=self.normalize)
        # Slim --> 20635 x 20635
        self.sim_Slim_item = Slim_BPR_Recommender_Cython(self.URM_train)
        self.sim_Slim_user = Slim_BPR_Recommender_Cython(self.URM_train.T)

        if self.sparse_weights:
            # URM
            self.W_sparse_URM = normalize(
                self.sim_URM_tfidf.compute_similarity(), axis=1, norm="l2")
            # ICM
            self.W_sparse_ICM = normalize(
                self.sim_ICM_tfidf.compute_similarity(), axis=1, norm="l2")
            # URM_T
            self.W_sparse_URM_T = normalize(
                self.sim_URM_T_tfidf.compute_similarity(), axis=1, norm="l2")
            # Slim
            self.W_sparse_Slim_item = normalize(self.sim_Slim_item.fit(
                lambda_i=0.37142857,
                lambda_j=0.97857143,
                learning_rate=0.001,
                epochs=30),
                                                axis=1,
                                                norm="l2")

            # Slim_T
            self.W_sparse_Slim_user = normalize(self.sim_Slim_user.fit(
                lambda_i=1, lambda_j=1, learning_rate=0.001, epochs=30),
                                                axis=1,
                                                norm="l2")

        # add the parameters for the logging
        self.parameters = "sparse_weights= {0}, verbose= {1}, similarity= {2},shrink= {3}, neighbourhood={4},normalize= {5}, alpha= {6}, beta={7}, gamma={8}, omega={9}".format(
            self.sparse_weights, self.verbose, self.similarity_mode,
            self.shrink, self.k, self.normalize, self.alpha, self.beta,
            self.gamma, self.omega)
Example #10
def Normalize(matrix: csr_matrix):
    return normalize(matrix, axis=1)
Example #11
        #gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        vals = img.mean(axis=1).flatten()
        hist = np.histogram(vals, range(40, 121))
        dataset_features[idx, :] = hist[0]

        outputs[idx] = class_label
        idx += 1
    class_label += 1

TRAIN_SIZE = 0.8  # Split the data into training and test parts (80/20)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

y = outputs
X = dataset_features
X = normalize(X)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=TRAIN_SIZE,
                                                    random_state=0,
                                                    shuffle=True)

HIDDEN_NEURONS_NUM = 100  # Number of neurons in the network's hidden layer
MAX_EPOCHS = 100  # Maximum number of iterations of the network training algorithm

np.random.seed(0)
# Convert the data into a ClassificationDataSet structure
# Training set
ds_train = ClassificationDataSet(np.shape(X)[1],
                                 nb_classes=len(np.unique(y_train)))
Example #12
topic_sim3 = base_net(sim3_in)
topic_nonsim1 = base_net(nonsim1_in)

dist1 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim1])
dist2 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim2])
dist3 = Lambda(euclidean_distance,
               output_shape=eucl_dist_output_shape)([topic_main, topic_sim3])
dist_non = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(
    [topic_main, topic_nonsim1])

decoder = get_decoder_net(hidden_dim, doc_dim)
reconstruction = decoder(topic_main)

model = Model(input=[main_in, sim1_in, sim2_in, sim3_in, nonsim1_in],
              output=[reconstruction, dist1, dist2, dist3, dist_non])

model.compile('sgd', loss=['mse'] + [contrastive_loss] * 4)

tf_idf = tf_idf.toarray()
tf_idf = normalize(tf_idf, copy=False) + 1e-5
x_train = [
    tf_idf[main_keys], tf_idf[sim1_data], tf_idf[sim2_data], tf_idf[sim3_data],
    tf_idf[non_sim_data]
]
z = np.zeros_like(main_keys)

output = [tf_idf[main_keys], z, z, z, np.ones_like(main_keys)]
model.fit(x_train, output)
topic_sim2 = base_net(sim2_in)
topic_sim3 = base_net(sim3_in)
topic_nonsim1 = base_net(nonsim1_in)

dist1 = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(
    [topic_main, topic_sim1])
dist2 = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(
    [topic_main, topic_sim2])
dist3 = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(
    [topic_main, topic_sim3])
dist_non = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)(
    [topic_main, topic_nonsim1])

decoder = get_decoder_net(hidden_dim, doc_dim)
reconstruction = decoder(topic_main)

model = Model(input=[main_in, sim1_in, sim2_in, sim3_in, nonsim1_in],
              output=[reconstruction, dist1, dist2, dist3, dist_non])

model.compile('sgd', loss=['mse'] + [contrastive_loss]*4)

tf_idf = tf_idf.toarray()
tf_idf = normalize(tf_idf, copy=False) + 1e-5
x_train = [tf_idf[main_keys], tf_idf[sim1_data],
           tf_idf[sim2_data], tf_idf[sim3_data], tf_idf[non_sim_data]]
z = np.zeros_like(main_keys)

output = [tf_idf[main_keys], z, z, z, np.ones_like(main_keys)]
model.fit(x_train, output)
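The helpers euclidean_distance, eucl_dist_output_shape and contrastive_loss are used above but not defined in these snippets. Below is a common sketch for this style of Keras siamese setup; it is assumed code, not the original, with the sign convention matched to the zero targets used for similar pairs above.

from keras import backend as K

def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True),
                            K.epsilon()))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def contrastive_loss(y_true, y_pred, margin=1.0):
    # y_true == 0 for similar pairs (pull together), 1 for dissimilar pairs
    # (push apart up to the margin), as in Hadsell et al. (2006).
    return K.mean((1 - y_true) * K.square(y_pred) +
                  y_true * K.square(K.maximum(margin - y_pred, 0)))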