Example 1
    def test_constructor_twoline(self):
        pm_inst = ParseAndModel(
            feature_list=["screen"],
            filename='../tests/data/parse_and_model/twoLineTest.txt',
            log_base=2)
        em = EmVectorByFeature(explicit_model=pm_inst)

        expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                               [1, 0, 0, 1, 1]]
        expected_model_background_matrix = np.array(
            [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
        expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282],
                                                  [0.109], [0.109]])

        self.assertEqual(True,
                         np.array_equiv(expected_section_word_counts_matrix,
                                        csr_matrix.toarray(em.reviews_matrix)),
                         msg="section counts do not match")
        self.assertEqual(True,
                         np.array_equiv(
                             expected_model_background_matrix,
                             csr_matrix.toarray(em.background_probability)),
                         msg="background model does not match")
        self.assertEqual(True,
                         np.array_equiv(expected_model_feature_matrix,
                                        np.round(em.topic_model, 3)),
                         msg="topic models do not match")

        print("testing")
Example 2
    def test_bem_two_section(self):
        pm = ParseAndModel()

        section_list = pd.DataFrame([[0, 0, "large clear screen", True]
                                        , [0, 1, "large broken bad", True]
                                     ], columns=["doc_id", "section_id", "section_text", "title"])

        pm.feature_list = ["screen"]
        pm.formatted_feature_list = pm.format_feature_list()

        pm.parsed_text = dict(section_list=section_list)
        pm.model_results = pm.build_explicit_models(lemmatize_words=False, log_base=2)

        expected_model_background = [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6]
        expected_model_feature = [[0.218, 0.282, 0.282, 0.109, 0.109]]
        expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1})
            , 1: Counter({"large": 1, "broken": 1, "bad": 1})}
        expected_section_word_counts_matrix = [[1, 1, 1, 0, 0]
            , [1, 0, 0, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
        expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen', 3: 'broken', 4: 'bad'}

        self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
        self.assertEqual(True, expected_model_feature == [[round(val, 3) for val in feature_model] for feature_model in
                                                          pm.model_results["model_feature"]])
        # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
        self.assertEqual(True,
                         np.array_equiv(expected_section_word_counts_matrix,
                                        csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                              csr_matrix.toarray(pm.model_results["model_background_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_feature_matrix,
                                              np.round(pm.model_results["model_feature_matrix"], 3)))
        self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
Example 3
    def test_bem_one_section(self):
        pm = ParseAndModel()

        section_list = pd.DataFrame([[0, 0, "large clear screen", True]
                                     ], columns=["doc_id", "section_id", "section_text", "title"])

        pm.feature_list = ["screen"]
        pm.formatted_feature_list = pm.format_feature_list()

        pm.parsed_text = dict(section_list=section_list)
        pm.model_results = pm.build_explicit_models(log_base=2)

        expected_model_background = [1 / 3, 1 / 3, 1 / 3]
        expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
        expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1})}
        expected_section_word_counts_matrix = [[1, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
        expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

        self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
        self.assertEqual(True, expected_model_feature == pm.model_results["model_feature"])
        # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
        self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                              csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                              csr_matrix.toarray(pm.model_results["model_background_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_feature_matrix, pm.model_results["model_feature_matrix"]))
        self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
Example 4
    def fit(self, X, y):
        """Fit KFDA model.

        Parameters
        ----------
        X: numpy array of shape [n_samples, n_features]
           Training set.
        y: numpy array of shape [n_samples]
           Target values. Only works for 2 classes.

        Returns
        -------
        self

        """
        n = len(X)
        self._X = X
        self._H = np.identity(n) - np.ones((n, n)) / n  # centering matrix: I - (1/n) * ones * ones^T
        self._E = OneHotEncoder().fit_transform(y.reshape(n, 1))
        _, counts = np.unique(y, return_counts=True)
        K = self._kernel(X)
        C = np.matmul(np.matmul(self._H, K), self._H)
        self._Delta = np.linalg.inv(C + self.lmb * np.identity(n))
        A = np.matmul(csc_matrix.toarray(self._E.T), C)
        B = np.matmul(self._Delta, csr_matrix.toarray(self._E))
        self._Pi_12 = np.diag(np.sqrt(1.0 / counts))
        P = np.matmul(self._Pi_12, A)
        Q = np.matmul(B, self._Pi_12)
        R = np.matmul(P, Q)
        V, self._Gamma, self._U = np.linalg.svd(R, full_matrices=False)
        return self
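A quick standalone check (not part of the KFDA class above) of the centering identity the fit relies on: with H = I - (1/n) * 1 1^T, the doubly-centered kernel H K H has zero row and column means.

import numpy as np

n = 5
K = np.random.rand(n, n)
K = K @ K.T                                   # symmetric PSD stand-in for a kernel matrix
H = np.identity(n) - np.ones((n, n)) / n      # centering matrix
Kc = H @ K @ H
print(np.allclose(Kc.mean(axis=0), 0))        # True: column means are zero
print(np.allclose(Kc.mean(axis=1), 0))        # True: row means are zero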
Example 5
    def normalize_transform(self, mode='clr'):
        """
        Some operations may require transformed data.
        This function performs normalization and
        a clr transform on all OTU tables in a Batch object.
        It returns a deep copy of the original Batch object,
        so the original file is not modified.

        :param mode: transformation mode; clr (centered log-ratio) or ilr (isometric log-ratio)
        :return: Transformed copy of Batch object.
        """
        batchcopy = copy.deepcopy(self)
        try:
            for x in list(self.otu):
                # normalizes the data by samples
                normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
                mat = csr_matrix.toarray(normbiom.matrix_data)
                # replaces all zeros with a small value
                # multiplicative replacement preserves ratios between values
                mat = multiplicative_replacement(mat)
                if mode == 'clr':
                    mat = clr(mat)
                elif mode == 'ilr':
                    mat = ilr(mat)
                else:
                    raise ValueError("Only CLR and ILR transformations are currently supported.")
                normbiom._data = csc_matrix(mat)
                batchcopy.otu[x] = normbiom
        except Exception:
            logger.error("Failed to normalize data", exc_info=True)
        return batchcopy
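The core of the transform above on a toy count table, as a minimal sketch (assuming scikit-bio, which provides multiplicative_replacement and clr, matching the functions used in the method):

import numpy as np
from skbio.stats.composition import multiplicative_replacement, clr

counts = np.array([[10., 0., 5.],
                   [3., 7., 0.]])
props = counts / counts.sum(axis=1, keepdims=True)   # per-sample proportions
props = multiplicative_replacement(props)            # replace zeros with small values, preserving ratios
print(clr(props))                                    # centered log-ratio transformed table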
Example 6
def load_train(filename, i, nb_timesteps, output_dim):
    x_train = pkl.load(open('db/serialized/' + filename + '_x_train' + str(i+1) + '.np', 'rb'))
    y_train = pkl.load(open('db/serialized/' + filename + '_y_train' + str(i+1) + '.np', 'rb'))
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    return x_train, y_train
Example 7
 def predict(self, X):
     X = csr_matrix.toarray(self._fix_test_feats(X))
     W = np.transpose(self.W)
     yhat = np.matmul(X, W)
     predictions = np.zeros(len(yhat), dtype=int)
     for i in range(len(predictions)):
         predictions[i] = np.argmax(yhat[i])
     return predictions
Example 8
def cos_similarity(X, df, your_pick):
    # Compute cosine similarity between the selected movie (your_pick) and every movie in X
    index = df[df['Title'] == your_pick].index[0]
    d1 = csr_matrix.toarray(X[index]).ravel()
    mag_d1 = np.linalg.norm(d1)
    dist = []
    for i in range(X.shape[0]):
        row = csr_matrix.toarray(X[i]).ravel()
        dot_product_xy = np.dot(d1, row)
        mag_row = np.linalg.norm(row)
        x_times_y = mag_d1 * mag_row
        dist.append(dot_product_xy / x_times_y)
    dist_series = pd.Series(dist)
    dist_series = dist_series.sort_values(ascending=False)
    # dist_series.iloc[1:6] holds the five most similar titles (index 0 is the movie itself)
    dist_series = pd.DataFrame(dist_series)
    
    return dist_series
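For reference, the same ranking can be sketched with scikit-learn's cosine_similarity, which works directly on sparse rows; X and df are assumed to be the same TF-IDF matrix and movie table used above, and cos_similarity_sklearn is just an illustrative name:

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def cos_similarity_sklearn(X, df, your_pick):
    index = df[df['Title'] == your_pick].index[0]
    sims = cosine_similarity(X[index], X).ravel()    # 1 x n_movies similarity row
    return pd.DataFrame(pd.Series(sims).sort_values(ascending=False))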
Example 9
 def fit(self, *, X, y, lr):
     W = self.W
     for obs in range(X.shape[0]):  # once for each observation
         x = csr_matrix.toarray(X[obs])
         yhat = np.argmax(np.dot(x, np.transpose(W)))  # predicted class for this observation
         if yhat != y[obs]:
             W[yhat] = W[yhat] - lr * X[obs]
             W[y[obs]] = W[y[obs]] + lr * X[obs]
     self.W = W
Example 10
def train_model(data_cleaned,vocab,num_featuers):
#This code was adapted from session 2 posted by Dr Jose Camacho Collados Oct-2019
#accessed Nov-2019
#https://learningcentral.cf.ac.uk/webapps/blackboard/content/listContent.jsp?course_id=_393342_1&content_id=_5178506_1
    #Apply the most-frequent-words technique:
    #extract the 1st dimension of features (most frequent words) and split the features from the target column (both are stored as lists)
    X_train=[]
    Y_train=[]  
    for i,review in data_cleaned.iterrows():
        vector_review=get_vector_text(vocab,data_cleaned.at[i,'token'])
        X_train.append(vector_review)
        Y_train.append(data_cleaned.at[i,'label'])
    #Convert them to arrays (NumPy libraries)   
    X_train_sentanalysis=np.asarray(X_train)
    Y_train_sentanalysis=np.asarray(Y_train)
    #End adapted code

    #extract the 2nd dimension of features (TF-IDF), then convert it to an array with SciPy (the returned type is a SciPy csr_matrix)
    X_tfId=get_tf_idf(data_cleaned,num_featuers,stop_words)
    X_tfId=sc.toarray(X_tfId)
    #extract the 3rd dimension of features (HashingVectorizer), then convert it to an array with SciPy
    X_hash= get_Hashing(data_cleaned,num_featuers)
    X_hash=sc.toarray(X_hash)
    
    #Concatenate all 3 dimensions to one matrix 
    X_tfId=np.concatenate((X_tfId,X_hash), axis=1)
    X_train_sentanalysis = np.concatenate((X_train_sentanalysis,X_tfId), axis=1)  
    
    #Define a pipeline containing the feature selection step and the model.
    #Feature selection uses SelectKBest with chi2; k is set to keep half of the concatenated features.
    #In the first training/validation iteration each feature type generates 1000 columns (3000 in total),
    #so feature selection reduces this to the 1500 most heavily weighted features.
    #The model is LogisticRegression; its solver is set to 'sag' because of the large data size.
    #Putting both steps in one pipeline minimises the fit/transform bookkeeping and lets .predict be applied
    #to the dev/test sets (in their stages) without running feature selection separately first.
    model_pipline = Pipeline(steps=[("dimension_reduction", SelectKBest(chi2, k=(int(num_featuers*.5)))),
    ("classifiers", LogisticRegression(solver='sag', max_iter=2000))])#edit the default value of max_iter(100) 
    model_pipline.fit(X_train_sentanalysis,Y_train_sentanalysis)    
    #return the trained model
    return model_pipline
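A minimal, self-contained sketch of the same SelectKBest + LogisticRegression pipeline on random toy data (names, shapes and values here are illustrative only, not the notebook's real features):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

X_toy = np.abs(np.random.rand(100, 40))    # chi2 requires non-negative features
y_toy = np.random.randint(0, 2, size=100)
pipe = Pipeline(steps=[("dimension_reduction", SelectKBest(chi2, k=20)),
                       ("classifiers", LogisticRegression(solver='sag', max_iter=2000))])
pipe.fit(X_toy, y_toy)
print(pipe.predict(X_toy[:5]))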
Example 11
 def fit(self, *, X, y, lr):
     W = self.W
     for obs in range(len(y)):  # once for each observation
         x = csr_matrix.toarray(X[obs])
         g = np.dot(W, np.transpose(x))
         for k in range(len(W)):
             p = self.softmax(g)
             correction = p[k] * x
             if k == y[obs]:
                 W[k:k + 1] += lr * (x - correction)
             else:
                 W[k:k + 1] -= lr * correction
     self.W = W
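The snippet calls self.softmax(g) without showing it; a numerically stable version might look like the sketch below (an assumption for illustration, not the class's actual implementation):

import numpy as np

def softmax(g):
    g = g - np.max(g)            # shift by the max for numerical stability
    e = np.exp(g)
    return e / np.sum(e)

print(softmax(np.array([1.0, 2.0, 3.0])))   # probabilities that sum to 1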
Example 12
def write_csr_to_csv(csr_matrix, name):
    graph = csr_matrix.toarray()
    filename = name
    #print("length of graph array:", len(graph))
    #print("csr matrix array:", graph)
    #print("length of one row:", len(graph[0]))
    with open(filename, "w") as writefile:
        writer = csv.writer(writefile)
        writer.writerow(["From", "To", "Weight"])
        #print("WRITING CSR MATRIX TO CSV AT: ", filename)
        for (m, n), value in np.ndenumerate(graph):
            if value != 0:
                writer.writerow([m, n, value])
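A usage sketch on a toy weighted graph (the adjacency values and output filename are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

adjacency = csr_matrix(np.array([[0, 2, 0],
                                 [0, 0, 1],
                                 [3, 0, 0]]))
write_csr_to_csv(adjacency, "toy_graph.csv")   # writes one From,To,Weight row per non-zero entry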
Example 13
def reshape_input_data(x_ro, x_md):
    """
    Concatenates the input data into shape (num_samples, sample_size, 2).


    Parameters
    ----------
    x_ro: sparse matrix
        TF-IDF encoding of Romanian input samples.
    x_md: sparse matrix
        TF-IDF encoding of Moldavian input samples.

    Returns
    -------
    result
        Numpy ndarray representing the concatenated data.
    """
    assert x_ro.shape == x_md.shape
    num_samples, sample_size = x_ro.shape
    result = np.stack([csr_matrix.toarray(x_ro),
                       csr_matrix.toarray(x_md)],
                      axis=-1)
    return result
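A usage sketch with two toy sparse matrices of matching shape:

import numpy as np
from scipy.sparse import csr_matrix

x_ro = csr_matrix(np.random.rand(4, 6))
x_md = csr_matrix(np.random.rand(4, 6))
print(reshape_input_data(x_ro, x_md).shape)   # (4, 6, 2)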
Example 14
    def test_constructor_one_section(self):
        pm = ParseAndModel(feature_list=["screen"], filename='data/parse_and_model/twoLineTest.txt',
                           lemmatize_words=False, nlines=1)

        section_list = pd.DataFrame([[0, 0, "large clear screen", True]
                                     ], columns=["doc_id", "section_id", "section_text", "title"])

        expected_model_background = [1 / 3, 1 / 3, 1 / 3]
        expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
        expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1})}
        expected_section_word_counts_matrix = [[1, 1, 1]]
        expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
        expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
        expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

        self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
        self.assertEqual(True, expected_model_feature == pm.model_results["model_feature"])
        # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
        self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                              csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                              csr_matrix.toarray(pm.model_results["model_background_matrix"])))
        self.assertEqual(True, np.array_equiv(expected_model_feature_matrix, pm.model_results["model_feature_matrix"]))
        self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
Example 15
def sampling(adata, axis = 0, nsamples=500, method = "sps", optm_parameters=True, pinit=0.195, pfin = 0.9, K=500):
    
    ob = adata.X
    ob = csr_matrix.toarray(ob)
    #sampling rows
    if(axis == 0):
        ob = ob.T
    # print(ob.shape)
    if(nsamples>=ob.shape[1]):
        print("Number of samples are greater than number of columns. Sampling cant be done")
        exit(0)
    
    no_samples = ob.shape[1]
    init = no_samples if no_samples < 20000 else min(20000,round(no_samples/3))

    # random sample of ids from sample = 0 to no_samples - 1 of size init
    sample_ids = np.random.choice(list(range(0, no_samples,1)), init) 

    data = normalize(ob)
    data = np.take(data, sample_ids, axis = 1)  # subsample columns from the normalized data
    
    partition = annPartition(data)

    if(optm_parameters==True):
        param = optimized_param(partition, nsamples)
        pinit = param[0]
        pfin = param[1]
        K = param[2]
        print("Optimized parameters: ", param,"\n")

    unique_elements, counts_elements = np.unique(partition[:,1], return_counts=True)
    cluster_freq = np.asarray((counts_elements), dtype = int)
    # print(cluster_freq.shape)
    prop = np.round((pinit - np.exp(-cluster_freq/K) * (pinit - pfin) )* cluster_freq)
    cluster_freq = np.vstack((cluster_freq,prop)).T
    subsamples = np.empty((0))
    
    for i in range(len(prop)):
        subsamples = np.concatenate((subsamples, np.random.choice(partition[partition[:,1]==i,0], size = int(prop[i]), replace = False)), axis = None)

    subsamples = np.asarray(subsamples, dtype = int)

    print(len(subsamples), "Samples extracted. Returning indices of samples")

    # Returning indices of selected samples
    return subsamples
    
Example 16
    def run(self, filter_xtrim, by_group):

        if len(self.id2doc) == 0:
            if by_group:
                os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_model/"), exist_ok=True)
                documents = self.prepare_documents_by_group(filter_xtrim)
                self.transform_into_featuresets(documents)
            else:
                os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_model/"), exist_ok=True)
                documents = self.prepare_documents(filter_xtrim)
                self.transform_into_featuresets(documents)

        os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_results/local_v_foreign/"), exist_ok=True)

        doc_pairs = {}
        doc_count = 0
        for doc_num in self.id2doc:
            doc_count += 1
            doc = self.id2doc[doc_num]

            # get word probability distribution - it is l2 normalized
            prob_dist = csr_matrix.toarray(self.tfidf_matrix[doc_num, :])[0]
            # print(np.sum(np.square(prob_dist)))

            # get term (features) and its probability in descending format
            sorted_indices = np.argsort(prob_dist)[::-1]
            sorted_features = np.array(self.tfidf_vectorizer.get_feature_names())[sorted_indices]

            temp = [i for i in prob_dist]
            temp.sort()
            sorted_prob = temp[::-1]

            word_prob = list(zip(sorted_features, sorted_prob))

            # keep words with probability above 0 whose sentiment probability exceeds 0.6 (the threshold used below)
            rep_words = []
            for w in [w for w in word_prob if w[1] > 0]:
                sent = " ".join(w[0].split("_"))
                sent_prob = self.sentiment_classifier.sentiment(sent)

                if sent_prob[0] == self.sentiment and sent_prob[1] > 0.6:
                    rep_words.append(w)

            doc_pairs.setdefault(doc.name,[]).append((doc.location, rep_words))

            print("\r",end="")
            print("Getting relevant sentimental words", int(doc_count/len(self.id2doc) * 100), "percent", end="", flush=True)

        # local-foreign review difference
        doc_count = 0
        for doc_name in doc_pairs:
            doc_count += 1

            local_pdist = []
            foreign_pdist = []

            # find unique words
            for loc_prob_tuple in doc_pairs[doc_name]:
                if loc_prob_tuple[0] == "sgp":
                    local_pdist = loc_prob_tuple[1]
                else:
                    foreign_pdist = loc_prob_tuple[1]

            local_dict = {k: v for (k, v) in local_pdist}
            foreign_dict = {k: v for (k, v) in foreign_pdist}

            wdiff = self.rank_words(local_dict)
            filename = doc_name.replace(".csv", "") + "_sgp.csv"
            with open(f"{self.dir}/{self.sentiment}_results/local_v_foreign/{filename}","w",
                      encoding="utf8") as writer:
                writer.writelines([f"{w[0]},{w[1]}\n" for w in wdiff])


            wdiff = self.rank_words(foreign_dict)
            filename = doc_name.replace(".csv", "") + "_ovs.csv"
            with open(f"{self.dir}/{self.sentiment}_results/local_v_foreign/{filename}", "w",
                      encoding="utf8") as writer:
                writer.writelines([f"{w[0]},{w[1]}\n" for w in wdiff])
Example 17
Nz, Dz = X.shape
s = (np.ones((Nz, 1))) * 0.2

#10,60
Knum = 5
W = kneighbors_graph(X, Knum, mode='distance', include_self=True)
hidden_size = 30

maps = spectral_embedding(W, n_components=hidden_size)
######################################################################
W = sparse.csr_matrix(W)

W1 = W.toarray()

W = csr_matrix.toarray(W)
params, gammasC = lr_init(maps, K, K)
print(maps.dtype)
model = GAN(hidden_size, batchSize, 1e-1, maps)
numIter = 0
loss_value = 0
training_loss = 0
training_loss1 = 0
while numIter < 100:
    gammasC, params, P = bayesianLowrankModel(maps, params, gammasC, K, K, W)
    for i in range(batch):
        images = X[batchSize * i:batchSize * (i + 1), :] / 255
        maps1 = maps[batchSize * i:batchSize * (i + 1), :]

        R_loss, loss_value, loss_value1 = model.update_params1(
            images, images, maps1)
Example 18
import numpy as np
import os
import time
import pandas as pd
import joblib
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC
from scipy.sparse import csr_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt


original_set_train = csr_matrix.toarray(original_set_train)
original_set_test = csr_matrix.toarray(original_set_test)

def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
Example 19
dropout_rate = 0.4

# TF-IDF / fitting and transforming train data (node embedding)
vect = TfidfVectorizer(
    decode_error="ignore",
    sublinear_tf=True,
    ngram_range=(1, 1),
    min_df=0.0149,
    max_df=0.9,
    binary=False,
    smooth_idf=True,
)
X_embed = vect.fit_transform(cleaned_train_data + cleaned_test_data)

# Setting the feature of all nodes
features_matrix = csr_matrix.toarray(X_embed)

# Creating indices to split data into training and test sets
idx = np.random.RandomState(seed=42).permutation(n_hosts)
index_train = idx[: int(0.8 * n_hosts)]
index_test = idx[int(0.8 * n_hosts) :]

# Transforming the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features_matrix)
y = torch.LongTensor(y)
adj = torch.FloatTensor(adj)
index_train = torch.LongTensor(index_train)
index_test = torch.LongTensor(index_test)

# Applying the GNN model on the subgraph H
model = GNN(features.shape[1], n_hidden_1, n_hidden_2, n_class, dropout_rate)
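A minimal sketch of the TF-IDF-to-tensor step above on a two-document toy corpus (the corpus and shapes are illustrative):

import torch
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["large clear screen", "large broken bad"]
X_embed = TfidfVectorizer().fit_transform(docs)
features = torch.FloatTensor(csr_matrix.toarray(X_embed))
print(features.shape)   # torch.Size([2, 5]) for this 5-word vocabulary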
Example 20
def VB_Decomp_Gen(M: Union[csr_matrix, np.ndarray],
                  rank: int,
                  maxiter: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    # init
    I = M.shape[0]
    J = M.shape[1]
    n = rank
    sigma_sq = np.ones(n)
    rho_sq = np.ones(n) / n
    tau_sq = 1
    u_bar = []
    v_bar = []
    t = []
    S, Phi, Psi = [], [], []

    for i in range(0, I):
        Phi.append(np.eye(n))
        u_bar.append(np.random.normal(0, 1, n))
    for j in range(0, J):
        Psi.append(np.eye(n))
        S.append(np.diag(1 / rho_sq))
        t.append(np.zeros(n))
        v_bar.append(np.random.normal(0, 1, n))

    Phi = np.array(Phi)
    Psi = np.array(Psi)
    S = np.array(S)
    t = np.array(t)
    u_bar = np.array(u_bar)
    v_bar = np.array(v_bar)

    norm_u = 0
    norm_v = 0

    N = []
    for i in range(0, I):
        N.append(scipy.sparse.find(M[i])[1])
    ob = scipy.sparse.find(M)

    # EM iteration
    for iter in range(0, maxiter):
        # E step
        # update Q(u_i)
        for i in range(0, I):
            outer = np.zeros((n, n))
            N_i = N[i]
            for j in N_i:
                outer += np.outer(v_bar[j], v_bar[j])
            Phi[i] = np.linalg.inv(
                np.diag(1 / sigma_sq) + (Psi[N_i].sum(0) + outer) / tau_sq)
            mtplr = ((M[i, N_i] * (v_bar[N_i])) / tau_sq).sum(0)
            u_bar[i] = Phi[i].dot(mtplr)
            S[N_i] += (Phi[i] + np.outer(u_bar[i], u_bar[i])) / tau_sq
            t[N_i] += (np.outer(csr_matrix.toarray(M[i, N_i]),
                                (u_bar[i])) / tau_sq)

        #update Q(v_j)
        Psi = np.linalg.inv(S)
        for j in range(0, J):
            v_bar[j] = Psi[j].dot(t[j])

    # M step
        for l in range(0, n):
            sigma_sq[l] = ((Phi[:, l, l] + u_bar[:, l]**2).sum()) / (I - 1)

        K = len(ob[1])
        Tr = 0
        for i, j in np.array([ob[0], ob[1]]).T:
            A = Phi[i] + np.outer(u_bar[i], u_bar[i])
            B = Psi[j] + np.outer(v_bar[j], v_bar[j])
            Tr += np.trace(A.dot(B))
        tau_sq = (((ob[2]**2) - (2 * ob[2] * np.einsum(
            'ij,ij->i', u_bar[ob[0]], v_bar[ob[1]]))).sum() + Tr) / (K - 1)

        cur_norm_u = np.linalg.norm(u_bar)
        cur_norm_v = np.linalg.norm(v_bar)
        if (abs(cur_norm_u - norm_u) < 0.01
                or abs(cur_norm_v - norm_v) < 0.01):
            break
        else:
            norm_u, norm_v = cur_norm_u, cur_norm_v
        yield np.array(u_bar), np.array(v_bar)
Example 21
def GDS_model(train):
    x_train, x_test, y_train, y_test = load(train)

    ## Trim data
    l = int(len(y_train)/16)*16
    x_train = x_train[0:l]
    y_train = y_train[0:l]

    x_train = x_train[0:16]  # NOTE: this keeps only the first 16 samples (a single batch)
    y_train = y_train[0:16]

    l = int(len(y_test)/16)*16
    x_test = x_test[0:l]
    y_test = y_test[0:l]

    ## Network structure
    nb_timesteps = 1
    nb_features = x_train.shape[1]
    output_dim = 1

    ## cross-validated model parameters
    batch_size = 16
    dropout = 0.25
    activation = 'sigmoid'
    nb_hidden = 128
    initialization = 'glorot_normal'

    ## reshaping X to three dimensions
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))

    x_test = csr_matrix.toarray(x_test)
    x_test = np.resize(x_test, (x_test.shape[0], nb_timesteps, x_test.shape[1]))

    ##  reshape Y to appropriate dimensions
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    y_test = np.resize(y_test, (y_test.shape[0], output_dim))

    ## Initialize model
    model = Sequential()

    model.add(Masking(mask_value=0., batch_input_shape=(batch_size, nb_timesteps, nb_features), name='Mask')) # embedding for variable input lengths
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU01',
                   batch_input_shape=(batch_size, nb_timesteps, nb_features)))
    model.add(Dropout(dropout, name='DO_01'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU02'))
    model.add(Dropout(dropout, name='DO_02'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU03'))
    model.add(Dropout(dropout, name='DO_03'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU04'))
    model.add(Dropout(dropout, name='DO_04'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU05'))
    model.add(Dropout(dropout, name='DO_05'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU06'))
    model.add(Dropout(dropout, name='DO_06'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU07'))
    model.add(Dropout(dropout, name='DO_07'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU08'))
    model.add(Dropout(dropout, name='DO_08'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU09'))
    model.add(Dropout(dropout, name='DO_09'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU10'))
    model.add(Dropout(dropout, name='DO_10'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU11'))
    model.add(Dropout(dropout, name='DO_11'))
    model.add(GRU(nb_hidden, stateful=True, init=initialization, name='GRU12'))
    model.add(Dropout(dropout, name='DO_12'))
    model.add(Dense(output_dim, activation=activation, name='Output'))

    # Configure learning process

    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Prepare model checkpoints and callbacks
    filepath="db/results/"+ train +"_best_weights.h5"
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=False)
    csv_logger = CSVLogger('db/results/training_log.csv', separator=',', append=True)

    # Training
    print('Training')
    history = model.fit(x_train,
                        y_train,
                        batch_size=batch_size,
                        verbose=1,
                        epochs=1,
                        shuffle=False, # turn off shuffle to ensure training data patterns remain sequential
                        callbacks=[checkpointer,csv_logger],
                        validation_data=(x_test, y_test))

    ## Evaluating on best results
    model.load_weights(filepath=filepath)
    score = model.evaluate(x_test, y_test, batch_size=16, verbose=1)
    score = dict(zip(model.metrics_names, score))

    summary = model.summary()
    model.save('db/results/model_'+ train + '.h5')
    return history, score, summary
Example 22
train = pd.read_csv("training.csv", low_memory=False, index_col="article_number")
test = pd.read_csv("test.csv", low_memory=False, index_col="article_number")

# Create Ordinal Encoding
le = LabelEncoder().fit(train.topic)
train["label"] = le.transform(train.topic)
test["label"] = le.transform(test.topic)

# Split into x and y
train_x = train.drop(["label", "topic"], axis=1)
test_x = test.drop(["label", "topic"], axis=1)
train_y = train["label"]
test_y = test["label"]

#create TFIDF features from article words
tfidf = TfidfVectorizer(max_features=500).fit(train_x.article_words)

# Transform words and convert from sparse matrix to array
train_tfidf = csr_matrix.toarray(tfidf.transform(train_x.article_words))
test_tfidf = csr_matrix.toarray(tfidf.transform(test_x.article_words))

words = train_tfidf
test_words = test_tfidf

#Final model -- for how parameters were found see logistic_regression_hpo.py
model=LogisticRegression(C= 16, class_weight='balanced',penalty='l2',max_iter=700)
model.fit(words, train_y)
print(
     classification_report(test_y, model.predict(test_words))
)
Example 23
 def score(self, X):
     X = csr_matrix.toarray(self._fix_test_feats(X))
     W = self.W
     yhat = np.matmul(W, np.transpose(
         X))  # yhat[k, i] is the raw score for sample x_i under class k (not a calibrated probability)
     return yhat[1]
Example 24
 def IKPP_model(test):
    ## set time
    time1 = datetime.datetime.today()

    ## Trim data
    x_train, x_test, y_train, y_test = load(test)
    l = int(len(y_train)/16)*16
    x_train = x_train[0:l]
    y_train = y_train[0:l]
    l = int(len(y_test)/16)*16
    x_test = x_test[0:l]
    y_test = y_test[0:l]

    ## Network structure
    nb_timesteps = 1
    nb_features = x_train.shape[1]
    output_dim = 1

    ## cross-validated model parameters
    batch_size = 16
    dropout = 0.25
    activation = 'sigmoid'
    nb_hidden = 128
    initialization = 'glorot_normal'

    ## reshaping X to three dimensions
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))

    x_test = csr_matrix.toarray(x_test)
    x_test = np.resize(x_test, (x_test.shape[0], nb_timesteps, x_test.shape[1]))

    ##  reshape Y to appropriate dimensions
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    y_test = np.resize(y_test, (y_test.shape[0], output_dim))

    ## Load the pre-trained model ('train' is the source dataset name, assumed to be defined in the enclosing scope)
    IKPP = load_model('db/results/model_' + train + '.h5')
    IKPP.load_weights('db/results/' + train + '_best_weights.h5')

    ## Freeze layers
    for layer in IKPP.layers[:20]:
        layer.trainable = False

    ## Reset weights
    reset = 0
    if reset == 1:
        for layer in IKPP.layers[-6:]:
            layer.reset_states()

    ## Decoder
    decoder = Sequential()
    decoder.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name="Encoder",
                    batch_input_shape=(batch_size, nb_timesteps, nb_features)))
    decoder.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name="Decoder"))
    decoder.add(Dense(IKPP.layers[0].input_shape[2], activation="linear"))
    # plot_model(decoder, 'db/models/decoder.png')

    ## Combine models
    #merged = Sequential()
    #merged.add(decoder)
    #merged.add(IKPP)
    merged = Model(inputs=decoder.input, outputs=IKPP(decoder.output))
    merged.layers[-1].get_input_at(-2)
    merged.layers[-1].get_input_mask_at(-3)
    merged.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Prepare model checkpoints and callbacks
    filepath="db/results/" + test + "_best_weights.h5"
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=False)
    csv_logger = CSVLogger('db/results/model_' +  test + '.csv', separator=',', append=True)

    ## Predict un-tuned model
    score_UT = merged.evaluate(x_test, y_test, verbose=1, batch_size=16)
    score_UT = dict(zip(merged.metrics_names, score_UT))

    # Training
    print('Training')
    while (True):
        time2 = datetime.datetime.today()
        history = merged.fit(x_train,
                             y_train,
                             batch_size=batch_size,
                             verbose=1,
                             nb_epoch=1,
                             shuffle=False,  # turn off shuffle to ensure training data patterns remain sequential
                             callbacks=[checkpointer, csv_logger],
                             validation_data=(x_test, y_test))
        time3 = datetime.datetime.today()
        if ((time3 - time2).seconds*2 + (time2-time1).seconds >= 600):
            break

    ## Evaluating on best results
    merged.load_weights(filepath=filepath)
    score = merged.evaluate(x_test, y_test, batch_size=16, verbose=1)
    score = dict(zip(merged.metrics_names, score))

    summary = merged.summary()
    merged.save('db/results/model_'+ test + '.h5')

    return history, score, summary
Example 25
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out


def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    return xx, yy


if __name__ == '__main__':
    train_features, train_labels = load_svmlight_file('twofeature.txt')
    x0, x1 = csr_matrix.toarray(train_features).T
    colors = ['red' if label == 1 else 'green' for label in train_labels]
    xx, yy = make_meshgrid(x0, x1)
    fig, ax = plt.subplots(1, 2)
    indC = 0
    for C in [1, 100]:
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(np.array([x0, x1]).T, train_labels)
        plot_contours(ax[indC], clf, xx, yy)
        ax[indC].scatter(x0,
                         x1,
                         c=train_labels,
                         cmap=plt.cm.coolwarm,
                         edgecolors='k')
        ax[indC].set_title(f'C={C}')
        indC += 1
Example 26
def rmse(y, pred):
    sub = [(a_i - b_i)**2 for a_i, b_i in zip(csr_matrix.toarray(y), pred)]
    sum_sqr = sum(sum(sub))
    size = y.shape[0] * y.shape[1]
    return sqrt(sum_sqr / size)
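A quick check of rmse on a toy sparse target (values are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

y_toy = csr_matrix(np.array([[1.0, 2.0], [3.0, 4.0]]))
pred_toy = np.array([[1.0, 2.0], [3.0, 6.0]])
print(rmse(y_toy, pred_toy))   # sqrt(4 / 4) = 1.0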
Example 27
def load_test(filename):
    x_test = pkl.load(open('db/serialized/'+filename+'_x_test.np', 'rb'))
    y_test = pkl.load(open('db/serialized/'+filename+'_y_test.np', 'rb'))
    x_test = csr_matrix.toarray(x_test)
    return (x_test, y_test)
Example 28
    modelo = naive_bayes.fit(X_, y_)
    print("Modelo creado: " + str(modelo))
    return modelo


def crearModelo_RForest(X_, y_):
    modelo_ranForest = RandomForest.fit(X_, y_)
    print("Modelo creado: " + str(modelo_ranForest))
    return modelo_ranForest


###########################################################

support_vectors = modelo_svm.support_vectors_

X_train_ = csr_matrix.toarray(X_train)
support_vectors_ = csr_matrix.toarray(support_vectors)
plt.scatter(X_train_[:, 0], X_train_[:, 1])
plt.scatter(support_vectors_[:, 0], support_vectors_[:, 1], color='red')
plt.title('Support vectors over the training data')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

X_test_ = csr_matrix.toarray(X_test)
y_test_ = pd.Series.to_numpy(y_test)
value = 1.5
width = 0.75
plot_decision_regions(X_test_,
                      y_test_,
                      clf=modelo_svm,
Example 29
nb_classes = 2
nb_features = X_train.shape[1]
output_dim = 1

# Define cross-validated model parameters

batch_size = 14
dropout = 0.25
activation = 'sigmoid'
nb_hidden = 128
initialization = 'glorot_normal'

# # Reshape X to three dimensions
# # Should have shape (batch_size, nb_timesteps, nb_features)

X_train = csr_matrix.toarray(X_train) # convert from sparse matrix to N dimensional array

X_train = np.resize(X_train, (X_train.shape[0], nb_timesteps, X_train.shape[1]))

print('X_train shape:', X_train.shape)

X_test = csr_matrix.toarray(X_test) # convert from sparse matrix to N dimensional array

X_test = np.resize(X_test, (X_test.shape[0], nb_timesteps, X_test.shape[1]))

print('X_test shape:', X_test.shape)

# Reshape y to two dimensions
# Should have shape (batch_size, output_dim)

y_train = np.resize(y_train, (X_train.shape[0], output_dim))
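A toy illustration of the (N, D) to (N, nb_timesteps, D) reshape performed above (shapes are illustrative):

import numpy as np
from scipy.sparse import csr_matrix

nb_timesteps = 1
X_toy = csr_matrix(np.random.rand(8, 5))
X_toy = csr_matrix.toarray(X_toy)      # sparse -> dense
X_toy = np.resize(X_toy, (X_toy.shape[0], nb_timesteps, X_toy.shape[1]))
print(X_toy.shape)                     # (8, 1, 5)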
Example 30
  print('Validating model with '+str(num_features)+' features...')
  X_dev_sentanalysis=[]
  Y_dev= []
  for i,review in dev_cleaned.iterrows():
    #extracting the 1st dimension of features (most frequent words), also splitting the features from the target column (both are stored as lists)
    vector_instance=get_vector_text(vocabulary[:num_features],dev_cleaned.at[i,'token'])
    X_dev_sentanalysis.append(vector_instance)
    Y_dev.append(dev_cleaned.at[i,'label'])
  
  #convert the previous lists to arrays (NumPy) for prediction with the model
  X_dev_sentanalysis=np.asarray(X_dev_sentanalysis)
  Y_dev_gold=np.asarray(Y_dev)  
  
  #extracting the 2nd dimension of features (TF-IDF), then converting it to an array (SciPy)
  X_dev_TF1=get_tf_idf(dev_cleaned, num_features,stop_words)
  X_dev_TF=sc.toarray(X_dev_TF1)
 
  #extracting the 3rd dimension of features (HashingVectorizer), then converting it to an array (SciPy)
  X_dev_hash= get_Hashing(dev_cleaned,num_features)
  X_dev_hash=sc.toarray(X_dev_hash)
  
  #Concatenate all 3 dimensions to one matrix 
  X_dev_TF=np.concatenate((X_dev_TF,X_dev_hash), axis=1)
  X_dev = np.concatenate((X_dev_sentanalysis,X_dev_TF), axis=1)

  #######

  #Predicting on the dev-set features, then calculating the performance measures
  Y_dev_predictions=model.predict(X_dev)
  print('Done')
#  print('\n')