Example #1
def build_vectors(articles, weights):
    """
    Build weighted vector representations for a list of articles.
    """
    pub_vecs, bow_vecs, con_vecs = [], [], []
    for a in articles:
        pub_vecs.append(np.array([a.published]))
        bow_vecs.append(vectorize(a.text))
        con_vecs.append(concept_vectorize([c.slug for c in a.concepts]))

    pub_vecs = normalize(csr_matrix(pub_vecs), copy=False)
    bow_vecs = normalize(csr_matrix(bow_vecs), copy=False)
    con_vecs = normalize(csr_matrix(con_vecs), copy=False)

    # Merge vectors.
    vecs = hstack([pub_vecs, bow_vecs, con_vecs])

    # Convert to a scipy.sparse.lil_matrix because it is subscriptable.
    vecs = vecs.tolil()

    # Apply weights to the proper columns:
    # col 0 = pub, cols 1-100 = bow, 101+ = concepts
    # weights = [pub, bow, concept]
    vecs[:,0]     *= weights[0]
    vecs[:,1:101] *= weights[1]
    vecs[:,101:]  *= weights[2]

    return vecs.toarray()
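Example #1 hinges on a common pattern: L2-normalize each feature block, stack the blocks column-wise, then re-weight column ranges. A minimal self-contained sketch of that pattern on toy blocks (the shapes and weights below are invented for illustration):

import numpy as np
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import normalize

# Toy feature blocks: 4 samples, one "publication" column and three "bag-of-words" columns.
pub = normalize(csr_matrix(np.random.rand(4, 1)))
bow = normalize(csr_matrix(np.random.rand(4, 3)))

merged = hstack([pub, bow]).tolil()   # lil_matrix supports slice assignment
weights = [0.5, 2.0]                  # hypothetical block weights
merged[:, 0] *= weights[0]
merged[:, 1:] *= weights[1]
print(merged.toarray().shape)         # (4, 4)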
Example #2
def create_word_count(node_info):
    '''
    Create word_mat matrix (num_nodes x num_unique_words) that contains the
    number of occurrences of each word in each abstract
    '''

    all_abstract = np.array(' '.join(node_info.abstract.values).split())
    
    unique_words = np.unique(all_abstract)
    ind_to_words_dict = dict(zip(range(len(unique_words)), unique_words))
    words_to_ind_dict = dict(zip(unique_words, range(len(unique_words))))
    
    
    word_mat = lil_matrix((len(node_info), len(unique_words)), dtype=np.int32)
    assert all(node_info.index == range(len(node_info)))
    # Fill matrix iteratively by looping on abstracts
    for (ind, abstract) in node_info.abstract.items():
        if ind % 200 == 0:
            print('[Creating Word Matrix] ind={ind}'.format(ind=ind))
        for word in abstract.split():
            word_mat[ind, words_to_ind_dict[word]] += 1
    
    
    # Normalise word_mat (axis 0: occurrences of each word in the corpus; axis 1: number of words in each abstract)
    word_count = word_mat.sum(0)
    # (by the total number of occurrences of each word)
    word_mat_norm = normalize(normalize(word_mat, norm='l1', axis=0))
    # (both)
    #word_mat_norm = normalize(normalize(word_mat, norm='l1', axis=0), norm='l1', axis=1) # 0: word occurence; 1: number of words
    return word_mat, word_mat_norm
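For a rough cross-check of the counting step above, CountVectorizer builds the same kind of document-term count matrix (tokenization details differ slightly); a toy sketch with an invented two-abstract corpus:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

abstracts = ["graph neural networks", "neural networks for graphs"]  # toy corpus
counts = CountVectorizer(token_pattern=r"\S+").fit_transform(abstracts)
# Same two-step normalization as above: columns to unit l1 sum, then rows to unit l2 norm.
counts_norm = normalize(normalize(counts, norm='l1', axis=0))
print(counts.toarray())
print(counts_norm.toarray())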
Example #3
def load():
    iris = load_iris()
    n_samples, n_features = iris.data.shape
    indices = np.arange(n_samples)
    np.random.shuffle(indices)
    X = iris.data[indices]
    Y = iris.target[indices]
    split = (n_samples * 4) // 5
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = Y[:split], Y[split:]
    X_train = preprocessing.normalize(X_train)
    X_test = preprocessing.normalize(X_test)

    Y_train = []
    for label in y_train:
        if label == 0: Y_train.append([1, 0, 0])
        if label == 1: Y_train.append([0, 1, 0])
        if label == 2: Y_train.append([0, 0, 1])

    Y_test = []
    for label in y_test:
        if label == 0: Y_test.append([1, 0, 0])
        if label == 1: Y_test.append([0, 1, 0])
        if label == 2: Y_test.append([0, 0, 1])

    return X_train, Y_train, X_test, Y_test
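The two one-hot loops above can be collapsed into a single indexing step; a small sketch, assuming integer labels in {0, 1, 2}:

import numpy as np

y = np.array([0, 2, 1, 2])            # toy integer labels
one_hot = np.eye(3, dtype=int)[y]     # row i is the one-hot encoding of y[i]
print(one_hot)
# [[1 0 0]
#  [0 0 1]
#  [0 1 0]
#  [0 0 1]]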
Example #4
  def __init__(self, hps, example_list, dqn_batch_size, use_state_prime = False, max_art_oovs = 0):
    """
      Args:
       hps: seq2seq model parameters
       example_list: list of experiences
       dqn_batch_size: DDQN batch size
       use_state_prime: whether to use the next decoder state to make the batch or the current one
       max_art_oovs: number of OOV tokens in current batch

      Properties:
        _x: The input to DDQN model for training, this is basically the decoder output (dqn_batch_size, dqn_input_feature_len)
        _y: The Q-estimation (dqn_batch_size, vocab_size)
        _y_extended: The Q-estimation (dqn_batch_size, vocab_size + max_art_oovs)
    """
    self._x = np.zeros((dqn_batch_size, hps.dqn_input_feature_len))
    self._y = np.zeros((dqn_batch_size, hps.vocab_size))
    self._y_extended = np.zeros((dqn_batch_size, hps.vocab_size + max_art_oovs))
    for i,e in enumerate(example_list):
      if use_state_prime:
        self._x[i,:] = e.state_prime
      else:
        self._x[i,:] = e.state
      # Q-estimates are l1-normalized over the vocabulary for every example
      self._y[i,:] = normalize([e.q_value[0:hps.vocab_size]], axis=1, norm='l1')
      if max_art_oovs == 0:
        self._y_extended[i,:] = normalize([e.q_value[0:hps.vocab_size]], axis=1, norm='l1')
      else:
        self._y_extended[i,:] = e.q_value
Example #5
def func(A, B):  # comparematrices(A,B):
    colA = A.shape[1]
    colB = B.shape[1]

    # method 1 - n is small dim, m is larger, matnew is new comparison matrix
    if colA == colB:
        Aprime = normalize(A, axis=1, norm='l2')
        Bprime = normalize(B, axis=1, norm='l2')
        if colA == 1:
            dist = np.linalg.norm(Aprime - Bprime)  # L2 norm (vectors)
        else:
            dist = np.linalg.norm(Aprime - Bprime, 2)  # matrix 2-norm (largest singular value)
    else:
        if colA < colB:
            n = colA
            m = colB
            big = B
            small = A
        else:
            n = colB
            m = colA
            big = A
            small = B
        # embed the smaller matrix in an m x m identity so the shapes match
        matnew = np.identity(m)
        matnew[0:n, 0:n] = small
        bigprime = normalize(big, axis=1, norm='l2')
        matnewprime = normalize(matnew, axis=1, norm='l2')
        dist = np.linalg.norm(matnewprime - bigprime, 2)

    print(dist)
    return dist
Example #6
def classify(dummy_train,dummy_test,feature_pkl,output_file):
    # Train classifier, iterating over subsets
    # Load Features
    print('Loading features...')
    featureIndex, trainFeatures, trainTargets, trainItemIds, testFeatures, testItemIds = joblib.load(feature_pkl)
    trainTargets = np.array(trainTargets)
    testItemIds = np.array(testItemIds)
    predicted_ids = []
    predicted_scores = []
    # SGD Logistic Regression per sample 
    clf = SGDClassifier(alpha=3.16227766017e-08, class_weight='auto', epsilon=0.1,
          eta0=0.0, fit_intercept=True, l1_ratio=0.15,
          learning_rate='optimal', loss='log', n_iter=5, n_jobs=1,
          penalty='elasticnet', power_t=0.5, random_state=None, shuffle=False,
          verbose=0, warm_start=False)
    for col in range(np.shape(dummy_train)[1]):
        # Get nonzero dummy indices as array
        idx_train = dummy_train[:,col].astype('bool').T.toarray()[0]
        print('Training subset {} of {}...'.format(col, np.shape(dummy_train)[1]))
        sub_train = normalize(trainFeatures.tocsr()[idx_train,:], norm='l2', axis=0)
        clf.fit(sub_train,trainTargets[idx_train])
        # Use probabilities instead of binary class prediction in order to generate a ranking
        idx_test = dummy_test[:,col].astype('bool').T.toarray()[0]
        sub_test = normalize(testFeatures.tocsr()[idx_test,:], norm='l2', axis=0)
        predicted_scores += clf.predict_proba(sub_test).T[1].tolist()
        predicted_ids += testItemIds[idx_test].tolist()
    
    with open(os.path.splitext(feature_pkl)[0]+'_'+output_file,'w') as out_fid:
        out_fid.write("id\n")
        for pred_score, item_id in sorted(zip(predicted_scores, predicted_ids), reverse = True):
            # only writes item_id per output spec, but may want to look at predicted_scores
            out_fid.write("%d\n" % (item_id))
def split_and_build_class(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    print(X_train.shape)
    print(X_test.shape)

    # Impute missing values, then normalize the input data.
    imp = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    fixed_X_train = X_train[:, 1:]
    imp.fit(fixed_X_train)
    fixed_X_train = imp.transform(fixed_X_train)
    preprocessing.normalize(fixed_X_train, copy=False)
    X_train[:, 1:] = fixed_X_train

    fixed_X_test = X_test[:, 1:]
    imp.fit(fixed_X_test)
    fixed_X_test = imp.transform(fixed_X_test)
    preprocessing.normalize(fixed_X_test, copy=False)
    X_test[:, 1:] = fixed_X_test

    train_data = read_dataset.microData()
    train_data.get_data(X_train)
    y_train = train_data.set_output(y_train)
    test_data = read_dataset.microData()
    test_data.get_data(X_test)
    y_test = test_data.set_output(y_test)

    return [X_train, X_test, y_train, y_test, train_data, test_data]
    def __init__(self, epsilon_init, userNum, itemNum,k, feature_dim, tau=0.1, lambda_=0.1, init='zero', learning_rate='decay'):
        self.reward = 0
        self.userNum = userNum
        self.itemNum = itemNum
        # self.R = np.zeros((userNum, itemNum))
        self.S = np.zeros((userNum, itemNum))
        self.time = 1
        self.tau = tau #SGD Learning rate
        self.tau_init = 1 # decay
        self.learning_rate = learning_rate
        self.epsilon_init = epsilon_init
        
        if (init == 'random'):
            self.U = np.random.rand(userNum,k)
            self.V = np.random.rand(itemNum,k)
        else:   
            self.U = np.zeros((userNum,k))         
            self.V = np.zeros((itemNum,k))
        
        self.lambda_ = lambda_ #
        self.feature_dim = feature_dim

        #add normalization
        self.U = normalize(self.U, axis=1, norm='l1')
        self.V = normalize(self.V, axis=1, norm='l1')
        self.k = k
        # print k

        self.CanEstimateUserPreference = False
        self.CanEstimateCoUserPreference = True
        self.CanEstimateW = False
Example #9
def read_dataset(train_size, scale=False, normalize=False):
    logging.info('fetching the dataset')
    #
    d = sklearn.datasets.load_diabetes() # diabetes dataset
    #d = sklearn.datasets.load_boston() # Boston housing prices
    #
    data = d['data'].astype(np.float32)
    target = d['target'].astype(np.float32).reshape(len(d['target']), 1)
    #"Chainerのmnist.pyだと下記ののような書き方になっているが、ミニバッチの数が2以上だと動かない"らしい 
    #target = diabetes['target'].astype(np.float32) 
    # 本来訓練データで標準化・正規化して、そのパラメータをテストデータに適用すべき
    if normalize and scale:
        raise Exception('both normalize and scale can not be True')
    if normalize:
        data = preprocessing.normalize(data)
        target = preprocessing.normalize(target)
    if scale:
        data = preprocessing.scale(data)
        target = preprocessing.scale(target)
    # split into train and test
    x_train, x_test = np.split(data, [train_size])
    y_train, y_test = np.split(target, [train_size])
    assert len(x_train)==len(y_train)
    assert len(x_test)==len(y_test)
    return  ((x_train, y_train), (x_test, y_test), 
        {"SHAPE_TRAIN_X":x_train.shape,
          "SHAPE_TRAIN_Y":y_train.shape,
          "SHAPE_TEST_X":x_test.shape,
          "SHAPE_TEST_Y":y_test.shape,
          })
Example #10
 def find_most_similar(self, bids, K_sim=14):
     """Return the bid of the most similar book to parameter bid except the given bid."""
     termv = sparse.csc_matrix((self.M, 1), dtype=int)
     for bid in bids:
         col_num = self.bid_to_col.get(str(bid))
         if col_num is not None:
             termv = termv + self.term_bid_matrix.getcol(col_num)
     if termv.nnz == 0:
         return ()
     termva = termv.toarray()    # Generate a vector for terms
     stop_words_removed = np.logical_and(termva, self.stop_words)
     nonzero = stop_words_removed.nonzero()[0]    # Nonzero indices
     rest_term_rows = self.term_bid_matrix_csr[nonzero]
     docs = np.zeros(self.N, dtype=bool)
     for row in rest_term_rows:
         np.logical_or(docs, row.toarray()[0], docs)
     cols = docs.nonzero()[0]
     matched_matrix = self.term_bid_matrix[:,cols]
     termv.data = self.tf(termv.data) * np.array([self.idf(self.row_to_term[row])
                                                  for row in termv.indices])
     termv = normalize(termv.T, axis=1, copy=False)
     matched_matrix.data = self.tf(matched_matrix.data)
     matched_matrix = normalize(matched_matrix.T, axis=1, copy=False).T
     cos_sims = termv.dot(matched_matrix).toarray()[0]
     found_bids = (self.col_to_bid[col] for col in cols)
     return islice((int(r[1])
                    for r in heapq.nlargest(K_sim, zip(cos_sims, found_bids))
                    if int(r[1]) not in bids),
                    9)
Example #11
 def normalize(self):
     """
     impute
     """
     print('Normalization')
     self.tr = normalize(self.tr)
     self.te = normalize(self.te)
Example #12
def make_clouds(files, n_words=20):
    # set locations
    base_model_name = os.path.splitext(os.path.basename(files.model))[0]
    output_d = '../browser/clouds/' + base_model_name + '/'
    if not os.path.exists(output_d):
        os.makedirs(output_d)
    # create wordcloud generator
    wc = WordCloud(width=1000, height=500, background_color='white')

    print('Loading model')
    model = LdaModel.load(files.model)
    beta = model.expElogbeta

    print('Normalizing by topics, and by words')
    pTW = normalize(beta, axis=0)
    pWT = normalize(beta, axis=1)

    # load bug<->id map, then invert to id<-> bug
    bug_to_id = json.loads(open(files.replacements).read())
    id_to_bug = {v: k for k, v in bug_to_id.items() if "." not in k}

    for i in range(len(beta)):
        # compute RAR
        t_rar = np.sqrt(pTW[i] * pWT[i])
        top_word_ids = t_rar.argsort()[:-1 - n_words:-1]
        top_words = [model.id2word.id2token[wordid] for wordid in top_word_ids]
        top_words = [id_to_bug[word] if word in id_to_bug else word for word in top_words]
        wc.fit_words(dict(zip(top_words, t_rar[top_word_ids])))
        wc.to_file(output_d + str(i) + '.png')
Example #13
def normaliser(x, option):
    # normalize each row by its l2 norm
    if option == 'norm':
        # print('normalize by the norm of the row')
        from sklearn.preprocessing import normalize
        x_norma = normalize(x, norm='l2')

    # normalize by the sum of the row (rows of the normalized matrix sum to 1)
    elif option == 'sum':
        # print('normalize by the sum of the row')
        from sklearn.preprocessing import normalize
        x_norma = normalize(x, norm='l1')

    # normalize each row by z-score: (x - mean) / std
    elif option == 'zscore':
        import numpy as np
        from scipy import stats
        x_norma = stats.zscore(x, axis=1)
        # set the NaNs (from zero-variance rows) to 0
        x_norma[np.isnan(x_norma)] = 0

    elif option == 'none':
        # print('no normalization')
        x_norma = x

    return x_norma
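A quick usage sketch for the function above on a toy matrix (values chosen arbitrarily):

import numpy as np

x = np.array([[1.0, 2.0, 2.0],
              [3.0, 0.0, 4.0]])
print(normaliser(x, 'norm'))    # rows have unit l2 norm
print(normaliser(x, 'sum'))     # rows sum to 1
print(normaliser(x, 'zscore'))  # rows have zero mean and unit std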
def relational_retrofit(word_vecs, sparse_relations, iterations=5, verbose=True, orig_weight=1):
    orig_vecs = normalize(word_vecs, norm='l2', copy=False)
    orig_vecs *= orig_weight

    arbitrary_value = next(iter(sparse_relations.values()))
    M, k = orig_vecs.shape
    N = arbitrary_value.shape[0]

    vecs = np.zeros(shape=(N, k), dtype='f')
    vecs[:M] = orig_vecs
    sparse_list = sorted(sparse_relations.items(), key=itemgetter(0))

    for iteration in range(iterations):
        rel_array = dense_relation_array(word_vecs, sparse_relations)
        next_vecs = np.zeros(shape=vecs.shape, dtype='f')
        for i in range(len(sparse_list)):
            name = sparse_list[i][0]
            if verbose:
                print('Iteration %d of %d: %s' % (iteration + 1, iterations, name))
            sparse = sparse_list[i][1]
            dense = rel_array[i]
            next_vecs += sparse.dot(vecs.dot(dense.T))

        normalize(next_vecs, norm='l2', copy=False)
        next_vecs[:M] += orig_vecs
        next_vecs[:M] /= 1+orig_weight
        vecs = next_vecs
        del next_vecs

    return vecs
def remove_outliers(image, mask):
    # take the masked part of the image to check for the presence of a bee
    im = cv2.bitwise_and(image, image, mask=mask)
    ldp_image, _, _ = ldp.ldp(im)
    test_Y = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]))
    test_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]))
    test = np.concatenate((test_Y, test_rgb), axis=1)
    mask_not = cv2.bitwise_not(mask)
    ret1, mask_not = cv2.threshold(mask_not, np.mean(mask_not), 255, cv2.THRESH_BINARY)
    im = cv2.bitwise_and(image, image, mask=mask_not)
    ldp_image, _, _ = ldp.ldp(im)
    data_ldp = ldp_image.reshape((ldp_image.shape[0] * ldp_image.shape[1], ldp_image.shape[2]))
    data_rgb = im.reshape((im.shape[0] * im.shape[1], im.shape[2]))
    data = np.concatenate((data_rgb, data_ldp), axis=1)
    data = data[np.any(data != 0, axis=1)]
    print(data.shape)
    data = data.astype('float64')
    data = preprocessing.normalize(data, axis=0)
    ss = StandardScaler()
    data = ss.fit_transform(data)
    clf = svm.OneClassSVM(nu=0.8, kernel="rbf", gamma=0.1)
    clf.fit(data)
    test = test.astype('float64')
    test = preprocessing.normalize(test, axis=0)
    print(test.shape)
    test = ss.fit_transform(test)
    test = clf.predict(test)
    test = test.reshape((image.shape[0], image.shape[1]))
    test[test == -1] = 0
    test[test == 1] = 255
    test = test.astype('uint8')
    im = cv2.bitwise_and(image, image, mask=test)
    im = cv2.bitwise_and(im, im, mask=mask)
    # print(test[:, 0], test[:, 1])
    return (im, test)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("bof_histogram")
    args = parser.parse_args(sys.argv[1:])

    print("loading bof histogram")
    with gzip.open(args.bof_histogram, "rb") as f:
        obj_hists = pickle.load(f)

    target_names = jsk_apc2015_common.get_object_list()

    # create train and test data
    X, y = [], []
    for i, obj_name in enumerate(target_names):
        X.append(obj_hists[obj_name])
        y += [i] * len(obj_hists[obj_name])
    X = np.vstack(X)
    normalize(X, copy=False)
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=np.random.randint(1234))

    # train and test
    lgr = LogisticRegression()
    print("fitting LogisticRegression")
    lgr.fit(X_train, y_train)
    with gzip.open("lgr.pkl.gz", "wb") as f:
        pickle.dump(lgr, f)
    y_pred = lgr.predict(X_test)
    print("score lgr: {}".format(accuracy_score(y_test, y_pred)))
    print(classification_report(y_test, y_pred, target_names=target_names))
def rw_overlap_kernel(C1, C2):
    
    #l = 1.0/np.exp(1.0)
    l = 0.5    
    
    
    k = 0
    c = 0
    
    # reshape rows into kernel matrices
    M1 = np.reshape(C1, (90, 90))
    M2 = np.reshape(C2, (90, 90))
    
    # normalise so rows sum to 1
    M1_norm = normalize(M1, axis=1, norm='l1')
    M2_norm = normalize(M2, axis=1, norm='l1')
    
    for i in range(1, 101) :
        
        M1_exp = np.linalg.matrix_power(M1_norm, i)
        M2_exp = np.linalg.matrix_power(M2_norm, i)
        
        #overlap = np.sum(np.minimum(M1_exp, M2_exp))
        overlap = np.sum(np.sqrt(np.multiply(M1_exp, M2_exp)))
    
        #k = k + ((np.exp(-i) ) * overlap)
        #c = c + ((np.exp(-i)) * 90)
        k = k + ((l ** i) * overlap)
        c = c + ((l ** i) * 90)
    
    return  k/c
Example #18
def pagerank_undirected(graph,d):
    """
    Args:
        graph - Unnormalized(or normalized) transition matrix in csr format
        d - page rank probability of taking the edge vs jumping to a new node

    Returns:
        page rank of each node in the graph
    """
    normalize(graph,norm='l1',axis=1,copy=False) #normalize as transition probability per row(node)

    #create matrix for which to find principal eigenvector
    M = d*graph #weighted transition probability
    try:
        jmp_temp = (1-d)/graph.shape[0] #teleportation weighting
        J = np.ones(graph.shape) 
        J = jmp_temp*J #teleportation probability matrix
        M = M+J
    except ValueError:
        pass

    #initialize normalized rankings as a column vector
    R = np.random.random((graph.shape[0], 1))
    R = normalize(R, norm='l1', axis=0)

    Rp = R * np.inf

    #iterate until convergence
    while (np.square(R - Rp).sum() > 0.001):
        Rp = R
        R = M.dot(R)

    #convert R to a flat numpy array before returning.
    return np.array(R).reshape(len(R),)
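A self-contained toy run of the same construction (row-stochastic transitions plus a uniform teleportation term, then power iteration) on a 3-node triangle graph; d is chosen arbitrarily:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

d = 0.85
adj = csr_matrix(np.ones((3, 3)) - np.eye(3))          # toy undirected triangle graph
trans = normalize(adj, norm='l1', axis=1)              # row-stochastic transition matrix
M = d * trans.toarray() + (1 - d) / adj.shape[0]       # add teleportation probability
R = normalize(np.random.random((adj.shape[0], 1)), norm='l1', axis=0)
for _ in range(50):
    R = M.dot(R)
print(R.ravel())   # ~[1/3, 1/3, 1/3] for this symmetric graph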
def compute_vector_features(name, series1, series2, tfidf_transform, svd, top_n_diff, results):
    X1 = tfidf_transform(series1)
    X2 = tfidf_transform(series2)

    dot = X1.multiply(X2).sum(axis=1)
    results['text_%s_dot' % name] = np.asarray(dot).reshape(-1)

    X1_norm = normalize(X1)
    X2_norm = normalize(X2)

    cosine = X1_norm.multiply(X2_norm).sum(axis=1)
    results['text_%s_cosine' % name] = np.asarray(cosine).reshape(-1)

    X1_svd = svd.transform(X1)
    X2_svd = svd.transform(X2)

    results['text_%s_dot_svd' % name] = (X1_svd * X2_svd).sum(axis=1)

    X_diff = X1_svd - X2_svd
    results['text_%s_euclidean_svd' % name] = (X_diff ** 2).sum(axis=1)
    results['text_%s_manhattan_svd' % name] = np.abs(X_diff).sum(axis=1)

    X1_svd = normalize(X1_svd)
    X2_svd = normalize(X2_svd)
    results['text_%s_cosine_svd' % name] = (X1_svd * X2_svd).sum(axis=1)
    
    for i in range(top_n_diff):
        results['text_%s_svd_diff_%d' % (name, i)] = np.abs(X_diff[:, i])
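The cosine features above use the identity that the cosine similarity of two rows equals the dot product of their L2-normalized versions; a small check on toy vectors:

import numpy as np
from sklearn.preprocessing import normalize

X1 = np.array([[1.0, 2.0, 0.0]])
X2 = np.array([[2.0, 4.0, 1.0]])
cos = (normalize(X1) * normalize(X2)).sum(axis=1)
ref = X1.dot(X2.T) / (np.linalg.norm(X1) * np.linalg.norm(X2))
print(cos, ref.ravel())   # both ~0.976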
Example #20
def _construct_edge_pairs(data, norm='l2', power_norm=True, dtype=np.float32):
    """
    A tree is a list of edges, with each edge as the concatenation of the repr. of parent and child nodes.
    :param data:
    :return root, edges:
    """

    r = data['tree'][1].astype(dtype=dtype)
    if power_norm:
        r = np.sign(r) * np.sqrt(np.abs(r))
    r = preprocessing.normalize(r[np.newaxis,:], norm=norm)
    root = [np.squeeze(r), ]

    edges = []
    for id in data['tree'].keys():
        if id > 1:
            x_left = data['tree'][id].astype('float32')
            x_right = data['tree'][int(id/2.)].astype('float32')

            e = np.concatenate([x_left,x_right])
            if power_norm:
                e = np.sign(e) * np.sqrt(np.abs(e))
            e = preprocessing.normalize(e[np.newaxis,:], norm=norm)

            # if power_norm:
            #     x_left = np.sign(x_left) * np.sqrt(np.abs(x_left))
            #     x_right = np.sign(x_right) * np.sqrt(np.abs(x_right))
            #
            # e = (preprocessing.normalize(x_left[np.newaxis,:], norm=norm), preprocessing.normalize(x_right[np.newaxis,:], norm=norm) )
            e = np.hstack(e)

            edges.append([np.squeeze(e),])

    return root, edges
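The per-node transform above is power normalization (signed square root) followed by L2 normalization; a minimal sketch of that transform on a toy descriptor:

import numpy as np
from sklearn import preprocessing

x = np.array([4.0, -9.0, 0.0, 1.0])                    # toy descriptor
x_pn = np.sign(x) * np.sqrt(np.abs(x))                 # power normalization
x_out = preprocessing.normalize(x_pn[np.newaxis, :])   # l2 normalization, shape (1, 4)
print(np.squeeze(x_out), np.linalg.norm(x_out))        # unit-norm result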
Example #21
def nearest_binary_landmark(R, X):
    y = np.dot(R, X)
    y = np.transpose(y)
    normalize(y, copy=False)

    n = y.shape[0]
    c = y.shape[1]
   
    idx = np.argsort(y, axis=1)
    bs = []
    ms = []

    for i in range(n):
        b = np.zeros(c)
        s = 0
        max_psi = 0
        max_b = b
        m=1
        for k in range(c-1, -1, -1):
            if y[i][idx[i][k]] <= 0:
                break
            b[idx[i][k]] = 1
            s = s + y[i][idx[i][k]]
            psi = s / math.sqrt(c-k)

            if psi > max_psi:
                max_psi = psi
                max_b = np.copy(b)
                m = float(c-k)

        bs.append(max_b)
        ms.append(m)
    bs = np.array(bs)
    return bs, np.mean(ms), y
Example #22
def difference_vectors(X, cluster_predictions, clusters_centers):
	PC = X.shape[1]
	K = len(clusters_centers)
	u_k = {}
	num_frms = X.shape[0]
	for frm in range(num_frms):
		frm_NN_cluster = cluster_predictions[frm]
		c = clusters_centers[frm_NN_cluster]
		diff = X[frm] - c
		if frm_NN_cluster not in u_k:
			u_k[frm_NN_cluster] = diff
		else:
			sum_k = u_k[frm_NN_cluster]
			sum_k += diff
			u_k[frm_NN_cluster] = sum_k
	vlad = u_k[0]
	vlad = vlad.reshape(1, vlad.shape[0])
	for k in range(1, K):
		K_cluster = u_k[k]
		K_cluster = K_cluster.reshape(1, K_cluster.shape[0])

		# Intra Normalization
		K_cluster = preprocessing.normalize(K_cluster, norm = 'l2')
		vlad = np.concatenate((vlad, K_cluster), axis = 1)

	# L2 Normalization
	vlad = preprocessing.normalize(vlad, norm = 'l2')
	return vlad
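A hedged end-to-end toy of the VLAD encoding above, with cluster centers taken from a small KMeans fit on random descriptors (sizes are arbitrary):

import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing

rng = np.random.RandomState(0)
X = rng.rand(50, 8)                                   # 50 toy descriptors, 8 dims
km = KMeans(n_clusters=4, n_init=10, random_state=0).fit(X)
vlad = difference_vectors(X, km.predict(X), km.cluster_centers_)
print(vlad.shape)                                     # (1, 4 * 8)
print(np.linalg.norm(vlad))                           # ~1.0 after the final l2 step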
def getData_NEC():
    reader = JobDBCorpus()
    data = reader.read_sequence_list(target='TODO')
    np.seterr(all='ignore')

    train, test = reader.train_test_data(test_size=0.2)

    print("Reading chunks...")
    chunks_train = ChunkSet(dataset=train)
    chunks_test = ChunkSet(dataset=test)

    print("Building features...")
    idf = featNEC.IDFeatures(dataset=train, chunkset=chunks_train)
    idf.build_features()

    ###############################################################################
    print("Standarizing dataset...")
    X_train, Y_train = getStandart(chunks_train, idf)
    X_test, Y_test = getStandart(chunks_test, idf)

    # sparse representation and normalize
    X_train = sparse.csr_matrix(X_train)
    X_train = normalize(X_train, copy=False)

    X_test = sparse.csr_matrix(X_test)
    X_test = normalize(X_test, copy=False)

    return X_train, Y_train, X_test, Y_test, chunks_train
Example #24
def SVM_vary_train(train_images, train_labels, test_images, test_labels, kernel_type, tune1, tune2, tune3, train_amm):
    # reshape the training/testing data into a classifiable form
    train_data_mat = np.reshape(train_images, (train_images.shape[0]*train_images.shape[1],train_images.shape[3]))
    test_data_mat = np.reshape(test_images, (test_images.shape[0]*test_images.shape[1],test_images.shape[3]))

    train_data_mat = 1.0*np.array(np.mat(train_data_mat).transpose())
    test_data_mat = 1.0*np.array(np.mat(test_data_mat).transpose())

    # normalize the data
    train_data_mat = preprocessing.normalize(train_data_mat, norm='l2')
    test_data_mat = preprocessing.normalize(test_data_mat, norm='l2')
     
    
    if kernel_type == "linear":
        classif_1vr = svm.SVC(kernel=kernel_type, C=tune1)
    elif kernel_type == "rbf":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1)
    elif kernel_type == "sigmoid":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1, coef0=tune2)
    elif kernel_type == "poly":
        classif_1vr = svm.SVC(kernel=kernel_type, gamma=tune1, coef0=tune2, degree=tune3)
    
    # fit the SVM to the training set
    classif_1vr.fit(train_data_mat[0:train_amm,:], train_labels[0][0:train_amm])
    
    targets = test_labels[0]
    
    # make prediction on the test data set
    predict = classif_1vr.predict(test_data_mat)
 
    # calculate the accuracy 
    acc = calc_acc(targets, predict) 

    return "kernel=" + str(kernel_type)  + ", tune1=" + str(tune1)  + ", tune2=" + str(tune2) + ", tune3=" + str(tune3) + ", train_amm=" + str(train_amm) + ", acc: " + str(acc) + "\n"
Example #25
def classify(data_trn, lbl_trn, data_vld, lbl_vld, data_tst, lbl_tst):

    data_trn = normalize(data_trn, copy=False)
    data_vld = normalize(data_vld, copy=False)
    data_tst = normalize(data_tst, copy=False)

    # accuracy metric
    metric_obj = mean_squared_error
    '''
    Train our model to predict labels for the dataset #1
    '''
    # (SVR has no 'probability' parameter, so it is not set here)
    parameters = {'svr__gamma': 1.5, 'svr__epsilon': 0.4, 'svr__C': 1, 'svr__kernel': 'rbf'}
    cls = Pipeline([
            #('feature_selection', LinearSVC()),
            ('svr', SVR())
            ])
    cls.set_params(**parameters)

    cls.fit(data_trn, lbl_trn)

    pred_vld = cls.predict(data_vld)
    pred_tst = cls.predict(data_tst)

    print("Score for vld: %.6f" % (metric_obj(lbl_vld, pred_vld),))
    print("Score for tst: %.6f" % (metric_obj(lbl_tst, pred_tst),))

    return pred_vld, pred_tst
def get_vector_sp(model, x_data, x_test):
    tmp_x = x_data[:, x_data.shape[1]-16:x_data.shape[1]-1]
    tmp_x = tmp_x.reshape(tmp_x.shape[0], tmp_x.shape[1])
    tmp_x = tmp_x/tmp_x[:,0].reshape(tmp_x.shape[0],1)
    #tmp_x[:,1:] = tmp_x[:,1:] / tmp_x[:,0:tmp_x.shape[1]-1]
    tmp_x = preprocessing.normalize(tmp_x, norm='l2')
    preds = model.predict_proba(tmp_x.reshape(x_data.shape[0],15,1))


    test_x = x_test[:, x_test.shape[1]-16:x_test.shape[1]-1]
    test_x = test_x.reshape(test_x.shape[0], test_x.shape[1])
    test_x = test_x/test_x[0][0]
    test_x = preprocessing.normalize(test_x, norm='l2')
    pred_test = model.predict_proba(test_x.reshape(test_x.shape[0],15,1))
    x_result = numpy.hstack((x_data.reshape(x_data.shape[0], x_data.shape[1]), preds))
    x_result_test = numpy.hstack((x_test.reshape(x_test.shape[0], x_test.shape[1]), pred_test))
    test_bdate = x_test[0][1]
    tmp_list = []
    tmp_vec = x_result_test[0][x_result_test.shape[1]-20:]
    i  = 0
    for m in x_result:
        i = i + 1
        dist = numpy.sqrt(numpy.sum(numpy.square(m[m.shape[0]-20:]- pred_test[0])))
        
        tmp_list.append((m[1], dist))
    sort_list = sorted(tmp_list, key = lambda x:x[1], reverse =False)
    return sort_list, test_bdate
Example #27
def PCA_SVM(train_images, train_labels, test_images, test_labels, kernel_type, do_PCA, comps):
    # reshape the training/testing data into a classifiable form
    train_data_mat = np.reshape(train_images, (train_images.shape[0]*train_images.shape[1],train_images.shape[3]))
    test_data_mat = np.reshape(test_images, (test_images.shape[0]*test_images.shape[1],test_images.shape[3]))

    train_data_mat = np.array(np.mat(train_data_mat).transpose())
    test_data_mat = np.array(np.mat(test_data_mat).transpose())

    # normalize the data
    train_data_mat = preprocessing.normalize(train_data_mat, norm='l2')
    test_data_mat = preprocessing.normalize(test_data_mat, norm='l2')
    
    # do PCA if necessary
    if do_PCA:
        # learn the covariance 
        pca = PCA(n_components=comps, whiten=True)
        pca.fit(train_data_mat)
    
        # use pca to reduce dimensionality of training data
        train_data_mat = pca.transform(train_data_mat)
        test_data_mat = pca.transform(test_data_mat)
    
    # fit svm to pca-reduced
    classif_1vr = svm.SVC(kernel=kernel_type)
    classif_1vr.fit(train_data_mat, train_labels[0])

    targets = test_labels[0]
    
    # make prediction on the test data set
    predict = classif_1vr.predict(test_data_mat)
 
    # calculate the accuracy 
    acc = calc_acc(targets, predict) 

    return "PCA=" + str(do_PCA) + ", num_comps= " + str(comps) + ", kernel=" + str(kernel_type)  + ", acc: " + str(acc) + "\n"
def rwr(transition,PT,r=0.7):
    """

    :param transition: the sparse transition matrix
    :param PT: initialization vector
    :param r: restart probability
    :return: Numpy Matrix of predicted scores
    """
    #Stop criteria
    stop = 1e-07
    PO = PT
    Tr  =  transition

    while True:

        PX = (1-r)* Tr.T * PT + (r * PO)
        delta =  spnorm(PX) - spnorm(PT)

        if delta < stop :
            break

        PT = PX
    #fMat = normalize(PT, norm='l1', axis=0)
    OM = PT[0:5080]
    OM  = normalize(OM, norm='l1', axis=0)
    PP = PT[5080:15078]
    PP = normalize(PP, norm='l1', axis=0)
    CP = PT[15078:16904]
    CP  = normalize(CP, norm='l1', axis=0)
    PAT = PT[16904:19435]
    PAT  = normalize(PAT, norm='l1', axis=0)
    P = np.concatenate((OM,PP,CP,PAT),axis=0)

    return P
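The core loop above is the standard random walk with restart, PX = (1 - r) * T.T * PT + r * P0; the node-type slicing is specific to that dataset, so here is a self-contained toy sketch of just the iteration (3-node graph, simpler convergence check):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

r = 0.7
T = normalize(csr_matrix(np.array([[0., 1., 1.],
                                   [1., 0., 0.],
                                   [1., 0., 0.]])), norm='l1', axis=1)
P0 = np.array([[1.0], [0.0], [0.0]])   # restart distribution concentrated on node 0
PT = P0.copy()
for _ in range(100):
    PX = (1 - r) * T.T.dot(PT) + r * P0
    if np.abs(PX - PT).sum() < 1e-7:
        break
    PT = PX
print(PT.ravel())   # steady-state scores, highest for node 0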
def getData_NEC(test=0.2, val=0.2, mode='by_sent',target='TODO'):
    print("Reading data...")
    train,test,val = getData(test=test, val=val, mode=mode,target=target)

    print("Reading chunks...")
    chunks_train = ChunkSet(dataset=train)
    chunks_test = ChunkSet(dataset=test)

    print("Building features...")
    idf = featNEC.IDFeatures(dataset = train, chunkset = chunks_train)
    idf.build_features()

    ###############################################################################
    print("Standarizing dataset...")
    X_train,Y_train = getStandart(chunks_train, idf)
    X_test,Y_test = getStandart(chunks_test, idf)

    # sparse representation and normalize
    X_train = sparse.csr_matrix(X_train)
    X_train = normalize(X_train, copy = False)

    X_test = sparse.csr_matrix(X_test)
    X_test = normalize(X_test, copy = False)

    return X_train,Y_train,X_test,Y_test, chunks_train
Example #30
def classify(df):
    print(type(df))
    df = preprocessing.normalize(df.iloc[:, [1, 2, 3]], norm="l2")
    print(df)
    print(type(df))
    # load dataset
    dataset_train = pd.read_csv("training.csv", delimiter=",")
    X = dataset_train.iloc[:, [0, 1, 2]]
    X = X.to_numpy()
    X = preprocessing.normalize(X, norm="l2")
    print(X)
    print(type(X))
    #X_normalised = preprocessing.normalize(X,norm="l2")
    # ground truth
    y = dataset_train.iloc[:, [3]]
    y = y.to_numpy()

    y = y.ravel()
    print(y)
    print(type(y))
    #y = y.reshape((len(y),))
    n_neighbors = 3
    # create an instance of Neighbours Classifier and fit the data.
    knn = neighbors.KNeighborsClassifier(n_neighbors)
    knn.fit(X, y)
    # print(df[:, 0:4])
    return knn.predict(df[:, 0:4])
    def frame_callback(vis, frame_idx, seq_info, viewer,angle):
        print("Processing frame %05d" % frame_idx)

        # Load image and generate detections.
        detections = create_detections(
            seq_info["detections"], frame_idx, min_detection_height)
        detections = [d for d in detections if d.confidence >= min_confidence]

        # Run non-maxima suppression.
        boxes = np.array([d.tlwh for d in detections])
        scores = np.array([d.confidence for d in detections])
        indices = preprocessing.non_max_suppression(
            boxes, nms_max_overlap, scores)
        detections = [detections[i] for i in indices]

        image = cv2.imread(
            seq_info["image_filenames"][frame_idx], cv2.IMREAD_COLOR)
        # Update tracker.
        tracker[angle-1].predict()
        tracker[angle-1].update(detections, image,  myexactor, dic_feature, frame_idx, global_next_id, angle)
        id_dic = {}
        for index, track in enumerate(tracker[angle-1].tracks):
            str_id = str(track.track_id)
            if str_id in id_dic.keys():
                track.track_id = global_next_id[0]
                global_next_id[0] += 1
            else:
                id_dic[str_id] = (index, track.state)

        # Update visualization.
        if display:
            vis.set_image(image.copy(),viewer,angle)
            #print("deep_sort angle: "+str(angle))
            vis.draw_detections(detections,viewer,angle)
            vis.draw_trackers(tracker[angle-1].tracks,viewer,angle)

        # Store results.
        for track in tracker[angle-1].tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlwh()
            print(angle)
            if angle == 1:
                print("angle1")
                results1.append([
                    frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age])
            if angle == 2:
                print("angle2")
                results2.append([
                    frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age])
            if angle == 3:
                print("angle3")
                results3.append([
                    frame_idx, track.track_id, bbox[0], bbox[1], bbox[2], bbox[3], track.sex, track.person_age])
            ### save gallery
            if (frame_idx) % 4 == 0:
                for i in range(4):
                    if bbox[i] < 0 :
                        bbox[i] = 0
                img = image[int(bbox[1]):int(bbox[1] + bbox[3]), int(bbox[0]):int(bbox[0] + bbox[2])]
                img = cv2.resize(img, (128, 256), interpolation=cv2.INTER_CUBIC)
                temp = img.copy()
                transform_test = T.Compose([
                    T.ToTensor(),
                    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ])

                img = transform_test(img)
                img = torch.unsqueeze(img, 0)
##sex age forward
                track_id = track.track_id
                img = img.cuda()
                sex_output, age_output = model_sex_age(img)
                pred1 = sex_output.data.max(1)[1]
                pred2 = age_output.data.max(1)[1]
                age = age_dict[str(int(pred2))]
                sex = sex_dict[str(int(pred1))]
                track.person_age = age
                track.sex = sex
##end
                f1 = myexactor(img)
                a1 = normalize(pool2d(f1[0], type='max'))
                if str(track_id) not in dic_feature.keys():
                    dic_feature[str(track_id)] = []
                    dic_feature[str(track_id)].append((a1, angle, (bbox[0]+bbox[2]*0.5, bbox[1]+bbox[3])))
                else:
                    if len(dic_feature[str(track_id)]) > 100:
                        del(dic_feature[str(track_id)][0])
                    dic_feature[str(track_id)].append((a1, angle, (bbox[0]+bbox[2]*0.5, bbox[1]+bbox[3])))
Example #32
# In[32]:

from sklearn import preprocessing

sim_user = []

#add the given user vector to a frequency list of all users
fVectorOfGivenUser_listForm = fVectorOfGivenUser.tolist()
sim_user.append(fVectorOfGivenUser_listForm)

for first_user in most_sim_users:
    sim_user.append(freqList[userIds.index(first_user[1])])

#normalise the vectors using l2 norm
normalizedSimilarUsers = preprocessing.normalize(sim_user, norm='l2')

# # contributing term calculation

# In[33]:

top3Terms = []

print("The top 3 contributing terms for each match are: \n")
for each_k in range(1, int(k) + 1):
    print("For match " + str(each_k) + ":")
    diffVector = []
    for j in range(len(vocab)):
        if (normalizedSimilarUsers[0][j] != 0
                and normalizedSimilarUsers[each_k][j] != 0):
            diffVector.append([
Example #33
    feature_file = sys.argv[1]
    output_file = sys.argv[3]
    cluster_num = int(sys.argv[2])

    if feature_file.split('.')[1] == 'surf':
        feature_name = 'surf'
        print('SURF features')
    elif feature_file.split('.')[1] == 'cnn':
        feature_name = 'cnn'
        print('CNN features')
    else:
        raise (ValueError('Invalid data'))

    # Read data
    X = np.loadtxt(feature_file, delimiter=';')
    #X = np.genfromtxt(mfcc_csv_file, delimiter=";")
    print(X.shape)

    X = normalize(X, axis=1)
    # Model fit to data
    #kmeans = KMeans(n_clusters=cluster_num)
    kmeans = MiniBatchKMeans(
        n_clusters=cluster_num, batch_size=50, random_state=0,
        init_size=500)  #Convential KMeans is too slow #Potential Memory Error
    kmeans.fit(X)

    # Save model
    pickle.dump(kmeans, open(output_file, 'wb'))

    print("K-means trained successfully!")
def csr_load(filename):
    loader=np.load(filename)
    data=loader['data']
    indptr=loader['indptr']
    indices=loader['indices']
    shape=loader['shape']
    return csr_matrix((data,indices,indptr),shape)

tf_idf=csr_load(filename)
print(tf_idf[0, :])
tf_idf.shape

fl=open('E:/Clustering and Retreival/Week 6/people_wiki_map_index_to_word.json')
map_index_to_word=json.load(fl)
fl.close()

tf_idf=normalize(tf_idf)
type(wiki.name)


def bipartition(cluster,maxiter=400,num_runs=4,seed=None):
    dataframe=cluster['dataframe']
    data_matrix=cluster['data_matrix']
    km=KMeans(n_clusters=2,n_init=num_runs,max_iter=maxiter,random_state=seed)
    km.fit(data_matrix)
    centroids,cluster_ass=km.cluster_centers_,km.labels_
    data_matrix_left_child,data_matrix_right_child=data_matrix[cluster_ass==0,:],data_matrix[cluster_ass==1,:]
    cluster_ass=pd.Series(cluster_ass)
    data_frame_left,data_frame_right=dataframe.loc[cluster_ass==0,:],dataframe.loc[cluster_ass==1,:]
    cluster_left={'dataframe':data_frame_left,
                  'data_matrix':data_matrix_left_child,
                  'centroids':centroids[0]}
Example #35
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None,
                                   sample_weights=None,
                                   random_state=None, tol=1e-4,
                                   precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit normalized
        #       this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, sample_weights, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
    
        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weights, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, sample_weights, labels, n_clusters, distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weights, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
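Because both samples and centers are kept on the unit sphere, squared Euclidean distance and cosine distance give the same ranking (||a - b||^2 = 2(1 - cos(a, b)) for unit vectors, as noted in the comment above). A hedged sketch of the common shortcut of simply L2-normalizing the data before ordinary KMeans:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

rng = np.random.RandomState(0)
X = normalize(rng.rand(100, 5))                 # unit-norm samples
km = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)
centers = normalize(km.cluster_centers_)        # project centers back onto the sphere
cos_labels = np.argmax(X @ centers.T, axis=1)   # nearest center by cosine similarity
print((cos_labels == km.labels_).mean())        # mostly agree on this toy data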
Example #36
X[neg_ind] = 0.
spec_err[neg_ind] = 0.

#%% Set zero fluxes to NaN

X_nonan = X.copy()
zero_ind = X == 0.
X[zero_ind] = np.NaN

#%% Set all zero and negative flux errors to NaN

zero_err_ind = spec_err <= 0.
spec_err[zero_err_ind] = np.NaN

#%% Normalise spectrum
X_normal, norm = preprocessing.normalize(X_nonan, return_norm=True)
X_norm_zeros = np.copy(X_normal)

#%% Plot an example spectrum in the data
plt.figure()
plt.plot(wavelengths, X_normal[4])
plt.show()

#%% Set all zero normalised fluxes to nan
zero_norm_ind = X_normal == 0.
X_normal[zero_norm_ind] = np.NaN

#%% Transform errors due to corresponding normalisation
spec_err_T = np.transpose(spec_err)
spec_err_norm_T = np.divide(spec_err_T, norm)
spec_err_norm = np.transpose(spec_err_norm_T)
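The error propagation above relies on normalize(..., return_norm=True), which returns the per-row norms so other arrays can be rescaled consistently; a minimal sketch on toy fluxes and errors:

import numpy as np
from sklearn import preprocessing

flux = np.array([[3.0, 4.0],
                 [6.0, 8.0]])
err = np.array([[0.3, 0.4],
                [0.6, 0.8]])
flux_n, norms = preprocessing.normalize(flux, return_norm=True)   # norms = [5., 10.]
err_n = err / norms[:, np.newaxis]                                # scale each error row by its flux norm
print(flux_n)
print(err_n)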
Example #37
def nrmse(y_pred, y_actual):
    return rmse(y_pred, y_actual) / (max(y_actual) - min(y_actual))


with open("1_train.txt", "r") as file:
    for line in file:
        newline = line.rstrip('\n')
        X_train.append([int(i) for i in newline.split(' ')])
        Y_train += [X_train[len(X_train) - 1].pop()]

with open("1_test.txt", "r") as file:
    for line in file:
        newline = line.rstrip('\n')
        X_test.append([int(i) for i in newline.split(' ')])
        Y_test += [X_test[len(X_test) - 1].pop()]

X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)
Y_train, Y_test = np.array(Y_train), np.array(Y_test)
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))
X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))
# modelLS = Ridge(alpha=0.5, solver='svd')
# modelLS.fit(X_train, Y_train)
# y = modelLS.predict(X_train)
# error1 = nrmse(y, Y_train)
# print(error1)
# y_pred = modelLS.predict(X_test)
# error2 = nrmse(y_pred, Y_test)
# print(error2)

# for i in range(100, 300, 50):
#     modelGD = SGDRegressor(shuffle=True, max_iter=i, penalty="elasticnet", alpha=0.01, learning_rate="invscaling", eta0=0.001, l1_ratio=0.6, power_t=0.3)
## TSNE
from sklearn.manifold import TSNE
ts = TSNE(n_components=2).fit_transform(data1)
plt.scatter(ts[:, 0], ts[:, 1], c=kmeans_model.fit_predict(data1))
plt.show()

## percent
labels = kmeans.labels_
percent_clas = []
for i in range(kmeans.n_clusters):
    percent_clas.append(sum(labels == i) / len(labels))
print(percent_clas)

## Hierachical Clustering
from sklearn import preprocessing
data1 = preprocessing.normalize(data1)

from sklearn.cluster import AgglomerativeClustering
import time
start = time.time()
hpre = AgglomerativeClustering(n_clusters=3).fit_predict(data1)
end = time.time()
print(end - start)
hfit = AgglomerativeClustering(n_clusters=3).fit(data1[0:1000, :])

from sklearn.manifold import TSNE
ts = TSNE(n_components=2).fit_transform(data1)
plt.scatter(ts[:, 0], ts[:, 1], c=hpre)
plt.show()

from sklearn.manifold import MDS
Example #39
  116117885893.23831,
  92005035565.86392,
  48313784253.98602,
  4578303196.450315,
  -28703495471.202255,
  -57784000679.98717,
  -78326916097.10924,
  -91205023418.17476
]



final = np.convolve(signal,a, 'same')
print(final.shape)

final = normalize([final],axis=1)
print(final.shape)
ax1.plot(signal)

ax2.plot(final)
print(np.max(final))


sp = np.fft.fft(final)
freq = np.fft.fftfreq(final.shape[-1])

#ax2.plot(freq, sp.real)



#wavfile.write('final.wav',44100,final)
Example #40
def extract_features():

    directories = os.listdir(path="dataset_splitted")

    dataset = []

    kernels = []
    for theta in range(4):
        theta = theta / 4. * np.pi
        for sigma in (1, 3):
            for frequency in (0.05, 0.25):
                kernel = np.real(
                    gabor_kernel(frequency,
                                 theta=theta,
                                 sigma_x=sigma,
                                 sigma_y=sigma))
                kernels.append(kernel)

    load_full_features = True

    if not load_full_features:

        for type_name in ('/train/', '/validation/', '/test/'):
            i = 1
            for dir in directories:

                for file in os.listdir("dataset_splitted/" + dir + type_name):

                    descriptors = []

                    img = cv2.imread(
                        "dataset_splitted/" + dir + type_name + file,
                        cv2.IMREAD_GRAYSCALE)
                    img_color = cv2.imread("dataset_splitted/" + dir +
                                           type_name + file)

                    feature_extractor = LocalBinaryPatterns(128, 1)
                    descriptor_template = feature_extractor.describe(img)
                    descriptor_template_norm = preprocessing.normalize(
                        np.array(descriptor_template).reshape(1, -1))[0]
                    descriptors.append(descriptor_template_norm)

                    feature_extractor_2 = Gradient_histogram(128)
                    descriptor_template_2 = feature_extractor_2.describe(img)
                    descriptor_template_2_norm = preprocessing.normalize(
                        np.array(descriptor_template_2).reshape(1, -1))[0]
                    descriptors.append(descriptor_template_2_norm)

                    descriptor_template_3 = compute_feats(img, kernels)
                    descriptor_template_3_norm = preprocessing.normalize(
                        np.array(descriptor_template_3).reshape(1, -1))[0]
                    descriptors.append(descriptor_template_3_norm)

                    feature_extractor_4 = LocalBinaryPatterns(128, 2)
                    descriptor_template_4 = feature_extractor_4.describe(img)
                    descriptor_template_4_norm = preprocessing.normalize(
                        np.array(descriptor_template_4).reshape(1, -1))[0]
                    descriptors.append(descriptor_template_4_norm)

                    for method in ('RGB', 'HSV', 'LAB'):
                        for hist in hist_describe(img_color, method):
                            descriptor_template_5_norm = preprocessing.normalize(
                                np.array(hist).reshape(1, -1))[0]
                            descriptors.append(descriptor_template_5_norm)

                    d_filename = file

                    descriptors.append([d_filename])

                    descriptors.append([i])

                    flat_descriptors = []
                    for sublist in descriptors:
                        for item in sublist:
                            flat_descriptors.append(item)

                    dataset.append(flat_descriptors)

                    print(i)

                i += 1

        dataset_arr = np.array([np.array(xi) for xi in dataset], dtype=object)

        x = dataset_arr[:, :-2]
        y = dataset_arr[:, -1]
        # print(np.unique(y, return_counts=True))
        names = dataset_arr[:, -2]

        np.savetxt('extracted_x.csv', x, fmt='%s', delimiter=',')
        np.savetxt('extracted_y.csv', y, fmt='%s', delimiter=',')
        np.savetxt('extracted_names.csv', names, fmt='%s', delimiter=',')

    else:
        x = np.loadtxt('extracted_x.csv', dtype='float', delimiter=',')
        y = np.loadtxt('extracted_y.csv', dtype='str', delimiter=',')
        names = np.loadtxt('extracted_names.csv', dtype='str', delimiter=',')
    # dtype = 'float'

    # df_X = pd.DataFrame.from_records(X)  # Shape (1600, 546)

    # y = dataset_arr[:, -1]
    # x = dataset_arr[:, :-2]
    # names = dataset_arr[:, -2]

    print(x.shape)

    # pca = PCA(n_components=600).fit(x)
    # print(pca.explained_variance_ratio_)
    # x = pca.transform(x)
    # print(x)

    x_train = x[0:800, :]
    x_validation = x[800:1120, :]
    x_test = x[1120:1600, :]

    print(x_train.shape)

    y_train = y[0:800]
    y_validation = y[800:1120]
    y_test = y[1120:1600]

    classify = True

    if classify:

        predictions_val_arr = []
        predictions_test_arr = []

        # test and validation are swapped

        from sklearn.neighbors import KNeighborsClassifier
        clf1 = KNeighborsClassifier(n_neighbors=13)

        from sklearn.tree import DecisionTreeClassifier
        clf2 = DecisionTreeClassifier(criterion='entropy',
                                      min_samples_leaf=1,
                                      min_samples_split=2,
                                      max_leaf_nodes=150,
                                      random_state=0)

        from sklearn.ensemble import RandomForestClassifier
        clf3 = RandomForestClassifier(criterion='entropy',
                                      max_depth=20,
                                      min_samples_leaf=2,
                                      min_samples_split=4,
                                      random_state=0)

        from sklearn.ensemble import GradientBoostingClassifier
        clf4 = GradientBoostingClassifier(random_state=0)

        from sklearn.neural_network import MLPClassifier
        clf5 = MLPClassifier(solver='adam',
                             alpha=1e-5,
                             hidden_layer_sizes=(128, ),
                             random_state=1)

        from sklearn.ensemble import AdaBoostClassifier
        clf6 = AdaBoostClassifier(n_estimators=2000, random_state=0)

        classifiers = (clf1, clf2, clf3, clf4, clf5, clf6)

        # for clf in classifiers:
        #
        #     clf.fit(x_train, y_train)
        #
        #     predictions_val = clf.predict(x_test)
        #     predictions_val_arr.append(predictions_val)
        #
        #     predictions_test = clf.predict(x_validation)
        #     predictions_test_arr.append(predictions_test)
        #
        #     predictions_train = clf.predict(x_train)
        #
        #     print(classification_report(y_train, predictions_train))
        #     print(classification_report(y_test, predictions_val))

        from sklearn.ensemble import VotingClassifier

        voting_clf = VotingClassifier(estimators=[('1', clf1), ('2', clf2),
                                                  ('3', clf3), ('4', clf4),
                                                  ('5', clf5), ('6', clf6)],
                                      voting='hard')
        print("Created voting classifier")
        voting_clf.fit(x_train, y_train)

        predictions_val = voting_clf.predict(x_test)
        predictions_val_arr.append(predictions_val)

        predictions_test = voting_clf.predict(x_validation)
        predictions_test_arr.append(predictions_test)

        predictions_train = voting_clf.predict(x_train)

        # print(classification_report(y_train, predictions_train))
        print(classification_report(y_test, predictions_val))
        print(confusion_matrix(y_test, predictions_val))

        print(predictions_val_arr)

        ml_preprocessed_val = []
        ml_preprocessed_test = []

        i = 0
        for j in predictions_val_arr:
            for a in range(16):
                ml_preprocessed_val.append([])
            for prediction in j:
                for a in range(16):
                    ml_preprocessed_val[i * 16 + a].append(
                        int(int(prediction) == a))
            i += 1

        i = 0
        for j in predictions_test_arr:
            for a in range(16):
                ml_preprocessed_test.append([])
            for prediction in j:
                for a in range(16):
                    ml_preprocessed_test[i * 16 + a].append(
                        int(int(prediction) == a))
            i += 1

        ml_preprocessed_val = np.array(ml_preprocessed_val)
        ml_preprocessed_test = np.array(ml_preprocessed_test)

        # print(ml_preprocessed_val)
        # print(ml_preprocessed_val.shape)
        #
        # print(ml_preprocessed_test)
        # print(ml_preprocessed_test.shape)

        np.savetxt('ml_preprocessed_val.csv',
                   ml_preprocessed_val,
                   fmt='%s',
                   delimiter=',')
        np.savetxt('ml_preprocessed_test.csv',
                   ml_preprocessed_test,
                   fmt='%s',
                   delimiter=',')

    else:

        ml_preprocessed_val = np.loadtxt('ml_preprocessed_val.csv',
                                         dtype='float',
                                         delimiter=',')
        ml_preprocessed_test = np.loadtxt('ml_preprocessed_test.csv',
                                          dtype='float',
                                          delimiter=',')
Example #41
import string

import matplotlib.pyplot as plt

plt.style.use('fivethirtyeight')
print(os.getcwd())
os.chdir(".\\data")
df = pd.read_json("tweeps.json", "r", lines=True)

data = df["text"]

tf_idf_vectorizor = TfidfVectorizer(
    stop_words='english',  #tokenizer = tokenize_and_stem,
    max_features=20000)
tf_idf = tf_idf_vectorizor.fit_transform(data)
tf_idf_norm = normalize(tf_idf)
tf_idf_array = tf_idf_norm.toarray()

pd.DataFrame(tf_idf_array,
             columns=tf_idf_vectorizor.get_feature_names_out()).head()


class Kmeans:
    """ K Means Clustering
    
    Parameters
    -----------
        k: int , number of clusters
        
        seed: int, will be randomly set if None
        
Example #42
def fisher_vector(xx, gmm, normalization=True):
    """
    Computes the Fisher vector on a set of descriptors.
    code from: https://gist.github.com/danoneata/9927923
    Parameters
    ----------
    xx: array_like, shape (N, D) or (D, )
        The set of descriptors

    gmm: instance of sklearn mixture.GMM object
        Gaussian mixture model of the descriptors.

    Returns
    -------
    fv: array_like, shape (K + 2 * D * K, )
        Fisher vector (derivatives with respect to the mixing weights, means
        and variances) of the given descriptors.

    Reference
    ---------
    Sanchez, J., Perronnin, F., Mensink, T., & Verbeek, J. (2013).
    Image classification with the Fisher vector: Theory and practice. International Journal of Computer Vision, 105(3), 222-245.
    https://hal.inria.fr/hal-00830491/file/journal.pdf

    """
    xx = np.atleast_2d(xx)
    n_points = xx.shape[0]
    D = gmm.means_.shape[1]
    tiled_weights = np.tile(np.expand_dims(gmm.weights_, axis=-1), [1, D])

    #start = time.time()
    # Compute posterior probabilities.
    Q = gmm.predict_proba(xx)  # NxK
    #mid = time.time()
    #print("Computing the probabilities took ", str(mid-start))
    #Compute Derivatives

    # Compute the sufficient statistics of descriptors.
    s0 = np.sum(Q, 0)[:, np.newaxis] / n_points
    s1 = np.dot(Q.T, xx) / n_points
    s2 = np.dot(Q.T, xx**2) / n_points

    d_pi = (s0.squeeze() - n_points * gmm.weights_) / np.sqrt(gmm.weights_)
    d_mu = (s1 - gmm.means_ * s0) / np.sqrt(tiled_weights * gmm.covariances_)
    d_sigma = (+s2 - 2 * s1 * gmm.means_ + s0 * gmm.means_**2 -
               s0 * gmm.covariances_) / (np.sqrt(2 * tiled_weights) *
                                         gmm.covariances_)

    #Power normalization
    alpha = 0.5
    d_pi = np.sign(d_pi) * np.power(np.absolute(d_pi), alpha)
    d_mu = np.sign(d_mu) * np.power(np.absolute(d_mu), alpha)
    d_sigma = np.sign(d_sigma) * np.power(np.absolute(d_sigma), alpha)

    if normalization == True:
        d_pi = normalize(d_pi[:, np.newaxis], axis=0).ravel()
        d_mu = normalize(d_mu, axis=0)
        d_sigma = normalize(d_sigma, axis=0)
    # Merge derivatives into a vector.

    #print("computing the derivatives took ", str(time.time()-mid))

    return np.hstack((d_pi, d_mu.flatten(), d_sigma.flatten()))
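
# Minimal usage sketch for fisher_vector() above (an added illustration, not part
# of the original snippet). It assumes sklearn's GaussianMixture with diagonal
# covariances as a stand-in for the older mixture.GMM, so that gmm.covariances_
# has shape (K, D) as the code expects, and that numpy (np) and
# sklearn.preprocessing.normalize are already imported as used above.
from sklearn.mixture import GaussianMixture

rng = np.random.RandomState(0)
descriptors = rng.rand(500, 64)  # N=500 local descriptors of dimension D=64

gmm_demo = GaussianMixture(n_components=8, covariance_type='diag', random_state=0)
gmm_demo.fit(descriptors)

fv = fisher_vector(descriptors, gmm_demo)
print(fv.shape)  # (K + 2*K*D,) = (8 + 2*8*64,) = (1032,)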
from sklearn.decomposition import PCA as PCA
import scipy.io
import numpy as np
import inspect

import sys
sys.path.append('/home/ov/python/py_utils')
import utils

patient_nr = 10
scid = 'nsc'

pwd = '/home/ov/preprocessed_waves/extracted/'
pwd_means = pwd + 'nfpat' + str(patient_nr) + '.icp.' + scid + '_means'
cluster_means = np.loadtxt(pwd_means)

print(cluster_means.shape)

# Normalization:
from sklearn.preprocessing import normalize
cluster_means = normalize(cluster_means, norm='max')
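# With norm='max', each row (cluster mean) is divided by its maximum absolute
# value, so every waveform is rescaled to the range [-1, 1] before plotting.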

fig = utils.plot_means_a4(cluster_means,
                          title='PAT {} {} (pa)'.format(
                              patient_nr, scid))  # pa := postapocalyptic
Example #44
0
drop_col_names = []

vifs = list(vif_df.VIF)
predictors = list(vif_df.Ind_Var)

for i in range(len(predictors)):
    if vifs[i] >= 10:
        drop_col_names.append(predictors[i])

df = df.drop(drop_col_names,
             axis=1)  # this is the data frame with high VIF variables removed
X = df.drop('G3', axis=1)  # this is the design matrix
y = list(df.G3)  # this is the discrete response vector
y_new = response_conv(y)  # this is the multinomial response vector
X_scale = preprocessing.scale(X)
X_norm = preprocessing.normalize(X)
########################################################################################################################
X1_train, X1_test, y1_train, y1_test = train_test_split(X,
                                                        y_new,
                                                        test_size=0.33,
                                                        random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X_scale,
                                                        y_new,
                                                        test_size=0.33,
                                                        random_state=42)
X3_train, X3_test, y3_train, y3_test = train_test_split(X_norm,
                                                        y_new,
                                                        test_size=0.33,
                                                        random_state=42)

log_reg1 = LogisticRegressionCV(cv=10, scoring='neg_log_loss',
Example #45
0
    def net(self,
            x,
            classify=False,
            miecorr=False,
            predict=False,
            train=False,
            show=False):
        import keras
        from keras.layers import Input, Dense
        from keras.optimizers import RMSprop, Adam, SGD
        from keras.models import model_from_json
        from keras.callbacks import ModelCheckpoint
        #import tensorflow as tf
        #import tensorflow.compat.v1 as tf
        #tf.disable_v2_behavior()
        """
		TODO: UPDATE TO TENSORFLOW VERSION 2 
		"""

        ####################################################################################################
        #	DETERMINE WHICH MODEL PARAMETERS YOU WANT TO USE
        #		CLASSIFY == TRUE GIVES THE MODEL TRAINED TO CLASSIFY ALL CELLULAR COMPONENTS BASED ON SPECTRA
        #					BETWEEN 950-1800 WVN
        #
        #		MIECORR == TRUE GIVES THE CORRESPONDING NEURAL NETWORK FOR PERFORMING EFFICIENT RMIE-CORRECTION
        #				   ON FFPE-BASED TISSUE SPECTRA
        #
        ####################################################################################################

        if classify == True:

            if x.shape[1] != 450:

                raise ValueError(
                    'This is a classification problem: Your spectral data needs 450 datapoints in WVN range of 950-1800 1/cm'
                )

            json_file = open(
                os.path.join(
                    str(MODELPATH) + '/model_weights_classification.json'),
                'r')

            loaded_model_json = json_file.read()

            loaded_model = model_from_json(loaded_model_json)

            if show == True:
                print(loaded_model.summary())

            loaded_model.load_weights(
                os.path.join(
                    str(MODELPATH) +
                    "/model_weights_classification.best.hdf5"))

            print("Loaded model from disk")

            loaded_model.compile(loss='categorical_crossentropy',
                                 optimizer='rmsprop',
                                 metrics=['accuracy'])

            from sklearn.preprocessing import normalize

            trX = normalize(x, axis=1, norm='l2')

            return loaded_model.predict(trX), loaded_model

        if miecorr == True:

            if x.shape[1] != 909:

                raise ValueError(
                    'This is a regression problem: Your spectral data needs 909 datapoints in WVN range of 950-2300 1/cm'
                )
            ####################################################################################################
            #	THIS MODEL NEEDS THE FIRST 909 WVN. RANGE FROM 950-2300 WVN 1/cm
            #
            #
            #
            ####################################################################################################x

            json_file = open(
                os.path.join(
                    str(MODELPATH) + '/model_weights_regression.json'), 'r')

            loaded_model_json = json_file.read()

            loaded_model = model_from_json(loaded_model_json)

            if show == True:
                print(loaded_model.summary())

            loaded_model.load_weights(
                os.path.join(
                    str(MODELPATH) + "/model_weights_regression.best.hdf5"))

            print("Loaded model from disk")

            loaded_model.compile(loss='mean_squared_error', optimizer='adam')

            from sklearn.preprocessing import normalize

            trX = normalize(x, axis=1, norm='l2')
            return loaded_model.predict(trX), loaded_model
Example #46
0
counts = np.zeros((len(words), len(words)))

file2 = nltk.data.path[0] + '/corpora/brown/brown_100.txt'
with open(file2) as g:
#	starting_word = '<s>'
	for line in g:
		wdsEachLine = line[:-1].lower().split()
#		wdsEachLine = [starting_word] + wdsEachLine
		wdsEachLine.append('</s>')
		for idx, wd in enumerate(wdsEachLine):
			if idx != 0:
				counts[words[wdsEachLine[idx]]][words[wdsEachLine[idx - 1]]] += 1

counts += 0.1
from sklearn.preprocessing import normalize
probs = normalize(counts, norm='l1', axis=0)
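# Note: axis=0 L1-normalization makes every column sum to 1, so with the add-0.1
# smoothing above probs[i][j] estimates P(word_i | word_j) -- the lookups below
# rely on this "row = next word, column = previous word" convention.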
						
# Write them into a file
target = open('smooth_probs.txt', 'w')
target.write('p(the|all) = ' + str(probs[words['the']][words['all']]) + '\n')
target.write('p(jury|the) = ' + str(probs[words['jury']][words['the']]) + '\n')
target.write('p(campaign|the) = ' + str(probs[words['campaign']][words['the']]) + '\n')
target.write('p(calls|anonymous) = ' + str(probs[words['calls']][words['anonymous']]) + '\n')
target.close()

file3 = nltk.data.path[0] + '/corpora/brown/toy_corpus.txt'
target1 = open('smoothed_eval.txt', 'w')
with open(file3) as h:
	for line in h:
		wdsInSentence = line[:-1].lower().split()
#		wdsInSentence = ['<s>'] + wdsInSentence
Example #47
0
#Visualising the distributions of the different variables
sns.pairplot(df_vpp_sd)

# From the pair plot above, we can clearly see that many of the variables have either positively or negatively skewed distributions. There also appear to be many outliers in each distribution, and the different parameters don't share a common scale.
#
# To remedy this, and to ensure sound clustering analysis, we are going to **standardise** the dataset. We may also perform principal component analysis on the data, which requires standardisation anyway.

# #### Standardising the features

# In[39]:

#Scaling the data
array_vpp_scaled = StandardScaler().fit_transform(df_vpp_sd)

#Normalizing the data
array_vpp_norm = normalize(array_vpp_scaled)
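#normalize() works row-wise with the L2 norm by default, so after scaling each
#observation is rescaled to unit length before clustering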

#Converting the standardised array back to a DataFrame
df_vpp_sd = pd.DataFrame(array_vpp_norm, columns=df_vpp_sd.columns)

# In[40]:

df_vpp_sd.describe()

# #### Add the group feature back in

# Since the data is now fully processed and ready for clustering, we can add the *'group'* column back in as it will be needed later on.

# In[41]:

df_vpp_sd = pd.concat([df_vpp.iloc[:, 0], df_vpp_sd], axis=1)
Example #48
0
    def transfer(self,
                 x,
                 y,
                 batch,
                 train_epochs,
                 add_l=[],
                 classify=False,
                 miecorr=False,
                 trainable=False):
        import keras
        from keras.models import Model
        from keras.optimizers import RMSprop, Adam, SGD
        from keras.models import model_from_json
        from keras.callbacks import ModelCheckpoint
        from keras.models import Sequential
        from datetime import datetime
        from sklearn.preprocessing import normalize
        """
		ALL PARTS OF THE TRANSFER-LEARNING NETWORKS ON FTIR SPECTROSCOPIC DATA

		"""
        trX = normalize(x, axis=1, norm='l2')

        def onehot(y):
            import keras
            from keras.utils import np_utils

            c = np.max(y) + 1

            y1hot = np_utils.to_categorical(y, num_classes=c)

            return (y1hot)

        def add_layer():
            from keras.utils import np_utils
            from keras.layers import Input, Dense
            from keras.models import Model
            from keras import models

            yoh = onehot(y)
            sm = int(yoh.shape[1])

            print("training on", sm, "classes")
            json_file = open(
                os.path.join(
                    str(MODELPATH) + '/model_weights_classification.json'),
                'r')

            loaded_model_json = json_file.read()

            loaded_model = model_from_json(loaded_model_json)

            loaded_model.load_weights(
                os.path.join(
                    str(MODELPATH) +
                    "/model_weights_classification.best.hdf5"))

            if trainable == False:
                for layer in loaded_model.layers:
                    layer.trainable = False
            else:
                for layer in loaded_model.layers:
                    layer.trainable = True

            if not add_l:
                preds = Dense(sm, name='newlast', activation='softmax')(
                    loaded_model.layers[-3].output)

                model = Model(inputs=loaded_model.input, outputs=preds)

                model.compile(loss='categorical_crossentropy',
                              optimizer='rmsprop',
                              metrics=['accuracy'])

                history = model.fit(trX,
                                    yoh,
                                    batch_size=batch,
                                    epochs=train_epochs)

                print(model.summary())

            if add_l:

                def add_2_model(add_l):

                    base = Model(inputs=loaded_model.input,
                                 outputs=loaded_model.layers[-3].output)

                    model = Sequential()
                    model.add(base)
                    model.add(Dense(add_l[0], input_dim=450,
                                    activation='relu'))

                    for layer_size in add_l[1:]:
                        model.add(Dense(layer_size, activation='relu'))

                    model.add(Dense(sm, activation='softmax'))

                    return model

                model = add_2_model(add_l)

                model.compile(loss='categorical_crossentropy',
                              optimizer='rmsprop',
                              metrics=['accuracy'])

                history = model.fit(trX,
                                    yoh,
                                    batch_size=batch,
                                    epochs=train_epochs)

                print(model.summary())

            dtstr = datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")

            model_json = model.to_json()

            with open("model_ptMLP_class_" + dtstr + ".json",
                      "w") as json_file:
                json_file.write(model_json)

            model.save_weights("model_model_ptMLP_class_" + dtstr + ".h5")

            print("Saved model to disk to",
                  "model_ptMLP_class_" + dtstr + ".json")
            print("and weights to",
                  "model_model_ptMLP_class_" + dtstr + ".h5")

            ###########################PLOTTING##########################
            history_dict = history.history

            history_dict.keys()

            a = np.array(history_dict['acc'])

            print(a.shape)

            l = np.array(history_dict['loss'])

            e = range(1, len(a) + 1)

            plt.plot(e, a, 'bo', color='red', label='Acc Training')

            plt.plot(e, l, 'b', label='Loss Training')

            plt.xlabel('Epochs')

            plt.legend()

            plt.savefig('model.pdf')

            return (model, history_dict)

        def simple_val_of_data(x, y):
            from sklearn.model_selection import train_test_split
            from random import randrange
            from sklearn.preprocessing import normalize

            trX = normalize(x, axis=1, norm='l2')

            seed = randrange(999)

            print('used random seed was', seed)

            x_train, x_test, y_train, y_test = train_test_split(
                trX, y, test_size=0.4, random_state=seed)

            return x_train, x_test, y_train, y_test

        def train_layer():
            from keras.utils import np_utils
            from keras.layers import Input, Dense
            from keras.models import Model
            from keras import models

            sm = int(y.shape[1])

            json_filer = open(
                os.path.join(
                    str(MODELPATH) + '/model_weights_regression.json'), 'r')

            loaded_model_jsonr = json_filer.read()

            loaded_modelr = model_from_json(loaded_model_jsonr)

            loaded_modelr.load_weights(
                os.path.join(
                    str(MODELPATH) + "/model_weights_regression.best.hdf5"))

            if trainable == False:
                for layer in loaded_modelr.layers:
                    layer.trainable = False
            else:
                for layer in loaded_modelr.layers:
                    layer.trainable = True

            loaded_modelr.compile(loss='mean_squared_error', optimizer='adam')

            history = loaded_modelr.fit(x,
                                        y,
                                        batch_size=batch,
                                        epochs=train_epochs)

            dtstr = datetime.now().strftime("%d-%m-%Y_%I-%M-%S_%p")

            print(loaded_modelr.summary())

            model_json = loaded_modelr.to_json()
            with open("model_ptMLP_MieReg_" + dtstr + ".json",
                      "w") as json_file:
                json_file.write(model_json)

            loaded_modelr.save_weights("model_model_ptMLP_MieReg_" + dtstr +
                                       ".h5")

            print("Saved model to disk to",
                  "model_ptMLP_MieReg_" + dtstr + ".json")
            print("and weights to",
                  "model_model_ptMLP_MieReg_" + dtstr + ".h5")

            return

        if classify == True:
            if x.shape[1] != 450:
                raise ValueError(
                    'This is a classification problem: x needs to be 450 datapoints in WVN range of 950-1800 1/cm'
                )

            mod, h = add_layer()

        if miecorr == True:
            if y.shape[1] != x.shape[1]:
                raise ValueError(
                    'This is a regression problem: x and y need 909 datapoints in WVN range of 950-2300 1/cm'
                )

            train_layer()
Example #49
0
        predicted = clf.predict(X_test)

        precision, recall, fscore, support = score(Y_test, predicted)

        p = p + precision
        r = r + recall
        f = f + fscore
        s = s + support
        ft = ft + np.array([fit_time])

    scores['window'] = windows
    scores['precision'] = p / cv_sets
    scores['recall'] = r / cv_sets
    scores['fscore'] = f / cv_sets
    scores['support'] = normalize(s / cv_sets).reshape(3, )
    scores['fit_time'] = normalize(ft / cv_sets).reshape(3, )

    print("Done window %s..." % (str(windows)))

    scores_table = pd.DataFrame.from_dict(scores, orient='columns')

    final_list.append(scores_table)

final_table = pd.concat(final_list, ignore_index=True)
final_table.to_csv(output + str(kernel) + '.csv')

for clas in set(final_table['labels']):
    temp = final_table.loc[final_table['labels'] == clas]
    temp.plot(x='window',
              y=['precision', 'recall', 'fscore', 'support', 'fit_time'],
np.random.seed(42)
nrange = [100000000]
nnsize = [6, 110, 2]
num_data = 1000
for l in range(100):
    torch.manual_seed(l)

    def create_rand_params(h):
        if type(h) == nn.Linear:
            h.weight.data.uniform_(0, 1)

    model = ANN(size=nnsize)
    model.apply(create_rand_params)
    x, Y = make_blobs(num_data, n_features=6, centers=2, random_state=42)
    x = normalize(x)
    x_train, x_test, y_train, y_test = train_test_split(x,
                                                        Y,
                                                        test_size=0.4,
                                                        random_state=0)
    np.save('x.npy', x_train)
    np.save('Y.npy', y_train)
    # randomise the labels
    for i in range(int(0.2 * len(y_train))):
        y_train[i] = np.random.randint(0, 2)
    x_train, y_train = shuffle(x_train, y_train)
    x_train = torch.Tensor(x_train).double()
    y_train = torch.Tensor(y_train).long()
    x_test = torch.Tensor(x_test).double()
    y_test = torch.Tensor(y_test).long()
    criterion = nn.CrossEntropyLoss()
Example #51
0
def normalize_l1(x):
    return preprocessing.normalize(x, norm='l1')
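# Note: with norm='l1' each row of x is divided by the sum of its absolute
# values, so rows of non-negative data sum to 1.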
def get_spatial_pyramid_feats(image_paths, max_level, feature):
    """
    This function assumes that 'vocab_hog.npy' (for HoG) or 'vocab_sift.npy' (for SIFT)
    exists and contains an N x feature vector length matrix 'vocab' where each row
    is a kmeans centroid or visual word. This matrix is saved to disk rather than passed
    in a parameter to avoid recomputing the vocabulary every run.

    :param image_paths: a N array of string where each string is an image path,
    :param max_level: level of pyramid,
    :param feature: name of image feature representation.

    :return: an N x d matrix, where d is the dimensionality of the
        feature representation. In this case, d will equal the number
        of clusters or equivalently the number of entries in each
        image's histogram ('vocab_size'), multiplied by
        (1 / 3) * (4 ^ (max_level + 1) - 1); for example, max_level = 2 gives a
        factor of (4 ** 3 - 1) / 3 = 21.
    """
    def _get_histogram_for_feature(img, vocab, feature, bins):
        features = feature_extraction(img, feature)
        try:
            dist = pdist(vocab, features)
            min_dist_index = dist.argmin(axis=0)
            hist, _ = np.histogram(min_dist_index, bins=bins)
            return hist
        except:
            hist, _ = np.histogram([], bins=bins)
            return hist

    def _spatial_pyramid_recursion(img, max_level, current_level, vocab, feature):
        if current_level > max_level:
            return np.zeros(vocab.shape[0])
        else:
            img1 = img[0:int(img.shape[0]/2), 0:int(img.shape[1]/2),:]
            img2 = img[0:int(img.shape[0]/2), int(img.shape[1] / 2):int(img.shape[1]), :]
            img3 = img[int(img.shape[0]/2):int(img.shape[0]), 0:int(img.shape[1]/2),:]
            img4 = img[int(img.shape[0]/2):int(img.shape[0]), int(img.shape[1] / 2):int(img.shape[1]), :]

            #NOTE: visual check of division
            #cv2.imshow('img1',img1)
            #cv2.imshow('img2', img2)
            #cv2.imshow('img3', img3)
            #cv2.imshow('img4', img4)

            hist_img = _get_histogram_for_feature(img, vocab, feature, bins)

            current_weight = pow(2,current_level-max_level)

            all_histograms = np.array([current_weight*hist_img,
                          _spatial_pyramid_recursion(img1, max_level, current_level + 1, vocab, feature),
                          _spatial_pyramid_recursion(img2, max_level, current_level + 1, vocab, feature),
                          _spatial_pyramid_recursion(img3, max_level, current_level + 1, vocab, feature),
                          _spatial_pyramid_recursion(img4, max_level, current_level + 1, vocab, feature)])

            return all_histograms.sum(axis=0)

    if feature == 'HoG':
        vocab = np.load('vocab_hog.npy')
    elif feature == 'SIFT':
        vocab = np.load('vocab_sift.npy')

    vocab_size = vocab.shape[0]

    # Your code here. You should also change the return value.

    bins = range(-1,vocab_size)

    all_histograms = np.empty((0, vocab_size))

    i = 0
    for path in image_paths:
        img = cv2.imread(path)[:, :, ::-1]
        print('iter: ' + str(i))
        sp_histograms = _spatial_pyramid_recursion(img, max_level, 1, vocab, feature)
        sp_histograms = normalize(sp_histograms.reshape(1, -1), norm="l2")

        all_histograms = np.vstack((all_histograms, sp_histograms))
        i += 1

    return all_histograms
    print('Computing gmm with ' + str(k) + ' centroids')
    gmm = ynumpy.gmm_learn(np.float32(Desc), k)
    io.save_object(gmm, 'gmm_NN_agg_features_max')


    # Compute the fisher vectors of the training images
    print('Computing fisher vectors')
    fisher = np.zeros((len(Train_descriptors), k * 1 * 2), dtype=np.float32)
    for i in xrange(len(Train_descriptors)):
        descriptor = Train_descriptors[i]
       # descriptor = np.float32(pca.transform(descriptor))
        aux=ynumpy.fisher(gmm, descriptor, include=['mu', 'sigma'])
        fisher[i, :] = np.reshape(aux, [1, aux.shape[0]])
        # L2 normalization - reshape to avoid deprecation warning, checked that the result is the same
        fisher[i, :] = preprocessing.normalize(fisher[i, :].reshape(1,-1), norm='l2')


    # Train an SVM classifier
    stdSlr = StandardScaler().fit(fisher)
    D_scaled = stdSlr.transform(fisher)
    print 'Training the SVM classifier...'
    clf = svm.SVC(kernel=kernels.intersection_kernel, C=C, probability=True).fit(D_scaled, train_labels)
    io.save_object(clf, 'clf_NN_pca256')
    #clf = io.load_object('clf_NN',ignore=False)

    # get all the test data and predict their labels
    fisher_test = np.zeros((len(test_images_filenames), k * 1* 2), dtype=np.float32)
    for i in range(len(test_images_filenames)):
        img = image.load_img(test_images_filenames[i], target_size=(224, 224))
        x = image.img_to_array(img)
Example #54
0
            cell_types.append(id_to_type[cell_id])
            ages.append(14.5)

    tprint('Found {} valid cells among all datasets'.format(len(valid_idx)))

    return valid_idx, np.array(cell_types), np.array(ages)


datasets, genes_list, n_cells = load_names(data_names, norm=False)
qc_idx, cell_types, ages = keep_valid(datasets)
datasets, genes = merge_datasets(datasets, genes_list)

X = vstack(datasets)
X = X[qc_idx]

if not os.path.isfile('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE)):
    mkdir_p('data/dimred')
    tprint('Dimension reduction with {}...'.format(DR_METHOD))
    X_dimred = reduce_dimensionality(normalize(X), dim_red_k=DIMRED)
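    # normalize() defaults to row-wise L2 scaling, so each cell's expression
    # profile is compared by shape rather than by total counts before the
    # dimensionality reduction.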
    tprint('Dimensionality = {}'.format(X_dimred.shape[1]))
    np.savetxt('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE), X_dimred)
else:
    X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(DR_METHOD, NAMESPACE))

dataset = AnnData(X)
dataset.var['gene_symbols'] = genes
dataset.obs['cell_types'] = ['mca_han_etal_fetal_' + l for l in cell_types]
dataset.obs['ages'] = ages
datasets = [dataset]
namespaces = [NAMESPACE]
Example #55
0
# problem 2.6
x1 = [22, 1, 42, 10]
x2 = [20, 0, 36, 8]
euclidean = distance.euclidean(x1, x2)
manhattan = distance.cityblock(x1, x2)
minkowski = distance.minkowski(x1, x2, p=3)
supremum = distance.chebyshev(x1, x2)

# problem 2.8
x_exist = [[1.5, 1.7], [2, 1.9], [1.6, 1.8], [1.2, 1.5], [1.5, 1.0]]
x = [1.4, 1.6]

sim = np.zeros((5, 4))
for i in range(0, 5):
    sim[i, 0] = distance.euclidean(x_exist[i], x)
    sim[i, 1] = distance.cityblock(x_exist[i], x)
    sim[i, 2] = distance.chebyshev(x_exist[i], x)
    sim[i, 3] = 1 - distance.cosine(x_exist[i], x)  # cosine similarity

print(sim)

x = [[1.4, 1.6]]
x_norm = preprocessing.normalize(x, norm='l2', axis=1)
x_exist_norm = preprocessing.normalize(x_exist, norm='l2', axis=1)
print(x_norm)
print(x_exist_norm)

sim_norm = np.zeros((5, 1))
for i in range(0, 5):
    sim_norm[i, 0] = distance.euclidean(x_exist_norm[i], x_norm[0])
print(sim_norm)
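
# Sanity check (an added illustration, not part of the original snippet): for
# unit-length vectors, squared Euclidean distance equals twice the cosine
# distance, i.e. ||a - b||^2 = 2 * (1 - cos(a, b)).
for i in range(0, 5):
    assert np.isclose(sim_norm[i, 0] ** 2,
                      2 * distance.cosine(x_exist_norm[i], x_norm[0]))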
Example #56
0
def normalize_l2(x):
    return preprocessing.normalize(x)
    def naive_bayes_algo(self):
        X = []
        Y = []


        with open('../Data/full_table.csv', 'r') as file:
            for line in csv.reader(file, delimiter = ','):
                if len(line) == 13:
                    try:
                        zhvi = float(line[5])
                        property_type = line[6]
                        room_type = line[7]
                        accommodates = int(line[8])
                        bathrooms = float(line[9])
                        beds = int(line[10])
                        bed_type = line[11]
                        price = float(line[12])

                        x = {
                            'zhvi': zhvi,
                            'property_type': property_type,
                            'room_type': room_type,
                            'accommodates': accommodates,
                            'bathrooms': bathrooms,
                            'beds': beds,
                            'bed_type': bed_type
                        }

                        y = price

                        X.append(x)
                        Y.append(y)


                    except:
                        pass


        # The DictVectorizer converts data from a dictionary to an array
        vec = DictVectorizer()

        # Convert X to Array
        X = vec.fit_transform(X).toarray()

        # Normalize Data
        X = preprocessing.normalize(X)

        # Split X and Y into training and testing sets
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

        # Naive Bayes Regression
        model = GaussianNB()
        model.fit(X_train, Y_train)
        Y_pred = model.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Naive Bayes')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Boosting
        model_boost = AdaBoostRegressor(GaussianNB())
        model_boost.fit(X_train, Y_train)
        Y_pred = model_boost.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Naive Bayes (with AdaBoost)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))

        # With Bagging
        model_bag = BaggingRegressor(GaussianNB())
        model_bag.fit(X_train, Y_train)
        Y_pred = model_bag.predict(X_test)
        mse = mean_squared_error(Y_test, Y_pred)
        mae = mean_absolute_error(Y_test, Y_pred)
        r2 = r2_score(Y_test, Y_pred)

        print('Naive Bayes (with Bagging)')
        print('Mean Squared Error: {0}'.format(mse))
        print('Mean Absolute Error: {0}'.format(mae))
        print('R2 Score: {0}'.format(r2))
Example #58
0
def gen_vectors(nb, dim):
    vectors = [[random.random() for _ in range(dim)] for _ in range(nb)]
    vectors = preprocessing.normalize(vectors, axis=1, norm='l2')
    return vectors.tolist()
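
# Minimal usage sketch for gen_vectors() above (an added illustration; it assumes
# the `random` and sklearn `preprocessing` imports that the function relies on).
vecs = gen_vectors(3, 4)  # 3 random vectors of dimension 4
print([round(sum(v * v for v in vec), 6) for vec in vecs])  # each ~1.0 (unit L2 norm)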
Example #59
0
def main(args):

    np.seterr(divide="ignore")  # POT has issues with divide by zero errors
    source_lang = args.source_lang
    target_lang = args.target_lang

    source_vectors_filename = args.source_vector
    target_vectors_filename = args.target_vector
    vectors_source = load_embeddings(source_vectors_filename)
    vectors_target = load_embeddings(target_vectors_filename)

    source_defs_filename = args.source_defs
    target_defs_filename = args.target_defs

    batch = args.batch
    input_mode = args.mode
    input_paradigm = args.paradigm

    run_method = list()
    run_paradigm = list()

    if input_paradigm == "all":
        run_paradigm.extend(("matching", "retrieval"))
    else:
        run_paradigm.append(input_paradigm)

    if input_mode == "all":
        run_method.extend(["wmd", "snk"])
    else:
        run_method.append(input_mode)

    defs_source = [
        line.rstrip("\n")
        for line in open(source_defs_filename, encoding="utf8")
    ]
    defs_target = [
        line.rstrip("\n")
        for line in open(target_defs_filename, encoding="utf8")
    ]

    clean_src_corpus, clean_src_vectors, src_keys = process_corpus(
        set(vectors_source.keys()), defs_source, vectors_source, source_lang)

    clean_target_corpus, clean_target_vectors, target_keys = process_corpus(
        set(vectors_target.keys()), defs_target, vectors_target, target_lang)

    take = args.instances

    common_keys = set(src_keys).intersection(set(target_keys))
    take = min(len(common_keys), take)  # you can't sample more than length
    experiment_keys = random.sample(common_keys, take)

    instances = len(experiment_keys)

    clean_src_corpus = list(clean_src_corpus[experiment_keys])
    clean_target_corpus = list(clean_target_corpus[experiment_keys])

    del vectors_source, vectors_target, defs_source, defs_target

    vec = CountVectorizer().fit(clean_src_corpus + clean_target_corpus)
    common = [
        word for word in vec.get_feature_names()
        if word in clean_src_vectors or word in clean_target_vectors
    ]
    W_common = []
    for w in common:
        if w in clean_src_vectors:
            W_common.append(np.array(clean_src_vectors[w]))
        else:
            W_common.append(np.array(clean_target_vectors[w]))

    if not batch:
        print(
            f"{source_lang} - {target_lang}\n" +
            f" document sizes: {len(clean_src_corpus)}, {len(clean_target_corpus)}\n"
            + f" vocabulary size: {len(W_common)}")

    W_common = np.array(W_common)
    W_common = normalize(W_common)
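    # Row-wise L2 normalization puts each word embedding on the unit sphere, so
    # distances between embeddings reflect direction (cosine-like) rather than
    # vector magnitude.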
    vect = TfidfVectorizer(vocabulary=common, dtype=np.double, norm=None)
    vect.fit(clean_src_corpus + clean_target_corpus)
    X_train_idf = vect.transform(clean_src_corpus)
    X_test_idf = vect.transform(clean_target_corpus)

    for paradigm in run_paradigm:
        WassersteinDriver = None
        if paradigm == "matching":
            WassersteinDriver = WassersteinMatcher
        else:
            WassersteinDriver = WassersteinRetriever

        for metric in run_method:
            if not batch:
                print(
                    f"{paradigm} - {metric} on {source_lang} - {target_lang}")

            clf = WassersteinDriver(W_embed=W_common,
                                    n_neighbors=5,
                                    n_jobs=14,
                                    sinkhorn=(metric == "snk"))
            clf.fit(X_train_idf[:instances], np.ones(instances))
            p_at_one, percentage = clf.align(X_test_idf[:instances],
                                             n_neighbors=instances)

            if not batch:
                print(
                    f"P @ 1: {p_at_one}\n{percentage}% {instances} definitions\n"
                )
            else:
                fields = [
                    f"{source_lang}",
                    f"{target_lang}",
                    f"{instances}",
                    f"{p_at_one}",
                    f"{percentage}",
                ]
                with open(f"{metric}_{paradigm}_results.csv", "a") as f:
                    writer = csv.writer(f)
                    writer.writerow(fields)
Example #60
0
y = y.drop(y.columns[0], axis=1)

print('Done.')

##########################################
# split data into training and testing set
##########################################

print('Reducing and splitting..')

#PCA on x
pca = decomposition.PCA(n_components=700)
x = pca.fit_transform(x)

# normalization
x = preprocessing.normalize(x)

# label encoding
le = preprocessing.LabelEncoder()
Y1 = y.apply(le.fit_transform)
y = le.fit_transform(Y1)  # complete label encoded array

#splitting
x_train, x_val, y_train, y_val \
    = train_test_split(x, y, test_size=0.15, random_state=42 , shuffle=True)
#
# For example, if y were a binary categorical variable with 25% zeros and 75% ones,
# passing stratify=y would make sure the random split keeps 25% 0's and 75% 1's.
# feature selection through PCA

print('ready.')