def _bootstrap_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, boot_i):
    """Single bootstrap iteration for pool-based parallel bootstrapping.

    Resamples rows of X and Y with replacement, refits the truncated SVD of
    the cross-covariance, and Procrustes-aligns the bootstrap saliences to
    the reference saliences.  Returns the rotated (X, Y) salience pair.
    """
    # Re-seed per worker so parallel processes draw different samples.
    np.random.seed(int(time()) + boot_i)
    # Resample row indices with replacement, keeping the sample size.
    n_rows = X.shape[0]
    resample_idx = np.random.choice(range(n_rows), size=n_rows, replace=True)
    X_resampled = scale(X[resample_idx, :])
    Y_resampled = scale(Y[resample_idx, :])
    boot_cov = np.dot(Y_resampled.T, X_resampled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_sal_boot, _, X_sal_boot = svd._fit(boot_cov)
    X_sal_boot = X_sal_boot.T
    # Rotating either side gives equivalent results; rotate via the smaller
    # side and apply the same rotation matrix to the other.
    if len(X_sal_boot) > len(Y_sal_boot):
        Y_bootstraps, rotation = _procrustes_rotation(Y_saliences, Y_sal_boot)
        X_bootstraps = np.dot(X_sal_boot, rotation)
    else:
        X_bootstraps, rotation = _procrustes_rotation(X_saliences, X_sal_boot)
        Y_bootstraps = np.dot(Y_sal_boot, rotation)
    return X_bootstraps, Y_bootstraps
class TruncatedSVDImpl():
    """Thin adapter exposing fit/transform over an underlying `Op` model."""

    def __init__(self, n_components=2, algorithm='randomized', n_iter=5, random_state=None, tol=0.0):
        # Keep the hyperparameters around for later inspection.
        self._hyperparams = {
            'n_components': n_components,
            'algorithm': algorithm,
            'n_iter': n_iter,
            'random_state': random_state,
            'tol': tol
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; `y` is forwarded only when provided."""
        if y is None:
            self._wrapped_model.fit(X)
        else:
            self._wrapped_model.fit(X, y)
        return self

    def transform(self, X):
        """Project `X` with the fitted wrapped model."""
        return self._wrapped_model.transform(X)
def _permute_and_calc_singular_values_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, perm_i):
    """One permutation-test iteration for pool-based parallelism.

    Permutes the rows of the smaller matrix, refits the truncated SVD of the
    permuted cross-covariance, optionally Procrustes-aligns to the reference
    saliences, and returns the permuted singular values.
    """
    # Distinct seed per worker process.
    np.random.seed(int(time()) + perm_i)
    # Permute whichever side has fewer rows; the covariance has the same
    # shape either way.
    if len(X) < len(Y):
        permuted_cov = np.dot(Y.T, np.random.permutation(X))
    else:
        permuted_cov = np.dot(np.random.permutation(Y).T, X)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_sal_perm, sv_perm, X_sal_perm = svd._fit(permuted_cov)
    if procrustes:
        # Either side gives the same rotated singular values; pick the
        # smaller one for speed.
        if len(X_sal_perm) > len(Y_sal_perm):
            _, _, sv_perm = _procrustes_rotation(Y_saliences, Y_sal_perm, sv_perm)
        else:
            X_sal_perm = X_sal_perm.T
            _, _, sv_perm = _procrustes_rotation(X_saliences, X_sal_perm, sv_perm)
    return sv_perm
class RawModel:
    """TruncatedSVD dimensionality reduction followed by a k-NN classifier."""

    def __init__(self):
        # 2015-05-15 GEL n_components=20 balances speed (substantial
        # improvement), accuracy, and memory usage (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)
        # 2015-05-15 GEL ball_tree uses less memory on average than kd_tree.
        # Metric accuracy on 8000 training examples:
        #   euclidean 0.950025, manhattan 0.933533, chebyshev 0.675662,
        #   hamming 0.708646, canberra 0.934033, braycurtis 0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):
        """Fit the decomposer and classifier on flattened example images."""
        flattened = vstack([reshape(ex.X, (1, ex.WIDTH * ex.HEIGHT)) for ex in trainExamples])
        features = self.decomposer.fit_transform(flattened)
        labels = [ex.Y for ex in trainExamples]
        self.model.fit(features, labels)
        return self

    def predict(self, examples):
        """Predict labels for flattened example images."""
        flattened = vstack([reshape(ex.X, (1, ex.WIDTH * ex.HEIGHT)) for ex in examples])
        return self.model.predict(self.decomposer.transform(flattened))
def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: Dict[str, DockerContainer] = None) -> None:
    """Build the wrapped TruncatedSVD estimator and reset training state."""
    super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
    # Algorithm-specific settings live inside the 'algorithm' hyperparam dict.
    algo = self.hyperparams['algorithm']
    self._clf = TruncatedSVD(
        n_components=self.hyperparams['n_components'],
        algorithm=algo['choice'],
        n_iter=algo.get('n_iter', 5),
        tol=algo.get('tol', 0),
        random_state=self.random_seed,
    )
    # Give each primitive instance a unique sequential id.
    self.primitiveNo = PrimitiveCount.primitive_no
    PrimitiveCount.primitive_no += 1
    # Training/bookkeeping state, populated later by set_training_data/fit.
    self._inputs = None
    self._outputs = None
    self._training_inputs = None
    self._training_outputs = None
    self._target_names = None
    self._training_indices = None
    self._target_column_indices = None
    self._target_columns_metadata: List[OrderedDict] = None
    self._input_column_names = None
    self._fitted = False
def _permute_and_calc_singular_values(X, Y, X_saliences, Y_saliences, singular_values_samples, perm_i, n_components, procrustes=False, algorithm="randomized"):
    """Run one permutation and store its singular values in column `perm_i`
    of `singular_values_samples` (mutated in place)."""
    # Permute the smaller matrix; the cross-covariance shape is unchanged.
    if len(X) < len(Y):
        cov_perm = np.dot(Y.T, np.random.permutation(X))
    else:
        cov_perm = np.dot(np.random.permutation(Y).T, X)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_sal_perm, sv_perm, X_sal_perm = svd._fit(cov_perm)
    if not procrustes:
        singular_values_samples[:, perm_i] = sv_perm
        return
    # Either side yields the same rotated singular values; use the smaller one.
    if len(X_sal_perm) > len(Y_sal_perm):
        _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
            Y_saliences, Y_sal_perm, sv_perm)
    else:
        X_sal_perm = X_sal_perm.T
        _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
            X_saliences, X_sal_perm, sv_perm)
def _permute_and_calc_singular_values_process(X, Y, a, b, n_components, algorithm, output, x):
    """Permutation worker for process-based parallelism; puts the permuted
    singular values onto the `output` queue.

    `a` is only used for a debug print of how permutation behaves (`b` is
    unused); `x` offsets the seed so each process draws different permutations.
    """
    # Re-seed so each process does not reuse the parent's RNG state.
    np.random.seed(int(time()) + x + 50)
    # Debug check of np.random.permutation.
    # Fix: converted Python 2 `print` statements to print() calls, consistent
    # with the py3 syntax used elsewhere in this file.
    c = np.random.permutation(a)
    print(a)
    print(c)
    # Permute the smaller matrix and rebuild the cross-covariance.
    if len(X) < len(Y):
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(covariance_perm)
    output.put(singular_values_perm)
def outliersSvdReduction(self):
    """Project the training features onto one SVD component against each
    target, then hand both projections to the outlier analysis."""
    svd = TruncatedSVD(n_components=1)
    features = self.training_order_start_end_districts_and_time
    orders_projection = svd.fit_transform(features, self.training_number_of_orders)
    price_projection = svd.fit_transform(features, self.training_order_median_price)
    self.outliersPriceOrders(orders_projection, price_projection)
def write_spacy_vocab(output_dirpath, vocab_size, embedding_dim):
    """Extract a cleaned spaCy vocabulary, write it to `output_dirpath/vocab`,
    and save SVD-reduced pretrained embeddings for the kept words.

    Fixes: Python 2 `print` statements converted to print() calls; bare
    `except:` narrowed to `except Exception:`; local `ascii` renamed so it no
    longer shadows the builtin.
    """
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)
    allowed_chars = set(string.ascii_letters + string.punctuation)
    ascii_set = set(string.ascii_letters)
    ascii_plus_period = set(string.ascii_letters + '.')
    word_set = set()
    spacy_vocab = spacy.load('en').vocab
    top_words = []
    for w in spacy_vocab:
        # Only consider reasonably frequent lexemes.
        if w.rank > 2 * vocab_size:
            continue
        try:
            word_string = str(w.lower_).strip()
            if not word_string:
                continue
            if word_string in word_set:
                continue
            if any(bad_char in word_string for bad_char in ('[', ']', '<', '>', '{', '}')):
                # these are used to mark word types and person ids.
                continue
            if any(c not in allowed_chars for c in word_string):
                continue
            if sum(1 for c in word_string if c not in ascii_plus_period) > 2:
                continue
            if word_string[-1] == '.' and sum(
                    1 for c in word_string if c in ascii_set) > 2:
                continue
            top_words.append(w)
            word_set.add(word_string)
        except Exception:
            # Best-effort skip of lexemes that fail string conversion.
            pass
    top_words.sort(key=lambda w: w.rank)
    top_words = top_words[:vocab_size]
    with open(os.path.join(output_dirpath, 'vocab'), 'w') as f:
        for word in top_words:
            f.write('%s\n' % word.lower_.strip())
    vectors = np.array([w.vector for w in top_words])
    # Reduce embedding width; report cumulative explained variance.
    svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
    embeddings = svd.fit_transform(vectors)
    print(embeddings.shape)
    print([
        sum(svd.explained_variance_ratio_[:i])
        for i in range(1, embedding_dim + 1)
    ])
    np.save(os.path.join(output_dirpath, 'pretrained_embeddings.npy'), embeddings)
def fit_transform(self, X, Y):
    """Fit a truncated SVD of K = X·Yᵀ; return unit-normalised left factors U
    and right factors V, recording per-component Pearson correlations."""
    if self.standardize:
        X, Y = _standardize(X), _standardize(Y)
    kernel = X.dot(Y.T)
    model = TruncatedSVD(n_components=self.n_components, random_state=self.random_state)
    U = model.fit_transform(kernel)
    U /= np.linalg.norm(U, axis=0)
    V = model.components_.T
    # Correlation of each projected component pair.
    self.correlations = np.array(
        [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
    return U, V
class RegressionRecommender(object):
    """Recommender that regresses ratings from concatenated SVD features of
    items and users."""

    def __init__(self, feature_size=10, regressor=None):
        self.feature_size = feature_size
        self.user_svd = TruncatedSVD(n_components=feature_size)
        self.item_svd = TruncatedSVD(n_components=feature_size)
        # Bug fix: a caller-supplied regressor was previously discarded
        # (only the `regressor is None` branch assigned self.regressor).
        self.regressor = LinearRegression() if regressor is None else regressor

    def fit(self, rating):
        """Fit SVD features and the regressor from a sparse (item x user)
        rating matrix."""
        item_features = self.item_svd.fit_transform(rating)
        user_features = self.user_svd.fit_transform(rating.T)
        self.item_features = item_features
        self.user_features = user_features
        n_item, n_user = rating.shape
        n_examples = rating.count_nonzero()
        X = zeros((n_examples, self.feature_size + self.feature_size))
        y = zeros((n_examples, 1))
        # One training row per observed rating: [item features | user features].
        for i, (item, user) in enumerate(zip(*rating.nonzero())):
            X[i] = concatenate([item_features[item], user_features[user]], axis=0)
            y[i] = rating[item, user]
        self.regressor.fit(X, y)
        return self

    def predict(self, item, user):
        """Predict the rating for one (item, user) pair.

        Bug fixes: concatenate() was called with two positional arrays (the
        second was silently interpreted as `axis`), the order was
        user-then-item while fit() trains on item-then-user, and the input
        was 1-D where sklearn regressors expect a (1, n_features) row.
        """
        user_features = self.user_features[user]
        item_features = self.item_features[item]
        input_features = concatenate([item_features, user_features]).reshape(1, -1)
        return self.regressor.predict(input_features)

    def save(self, filepath):
        """Pickle the regressor and both SVD models to `filepath`."""
        to_save = {
            'regressor': self.regressor,
            'user_svd': self.user_svd,
            'item_svd': self.item_svd
        }
        with open(filepath, 'wb') as handle:
            saver = Pickler(handle, protocol=HIGHEST_PROTOCOL)
            # Bug fix: the public API is dump(); Pickler.save() is internal
            # (and absent on the C implementation), so the written file could
            # not be read back by load().
            saver.dump(to_save)

    def load(self, filepath):
        """Restore the regressor and SVD models written by save()."""
        with open(filepath, 'rb') as handle:
            loader = Unpickler(handle)
            state = loader.load()
        self.regressor = state['regressor']
        self.user_svd = state['user_svd']
        self.item_svd = state['item_svd']
def fit(self, X, y=None):
    """Instantiate the underlying sklearn model from the stored
    hyperparameters and fit it, forwarding `y` only when given."""
    self._sklearn_model = SKLModel(**self._hyperparams)
    if y is None:
        self._sklearn_model.fit(X)
    else:
        self._sklearn_model.fit(X, y)
    return self
def create_union_transf(_):
    """Assemble the feature union: imputed/scaled 2-component PCA features,
    GMM cluster ids over that PCA space, CountVectorizer+SVD text features
    from `os`, and one-hot encoded cpu architecture."""
    pca_two_components = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
    )
    os_text_features = make_pipeline(
        FunctionTransformer(lambda frame: frame.os, validate=False),
        CountVectorizer(),
        TruncatedSVD(n_components=10),
    )
    arch_one_hot = FunctionTransformer(lambda frame: pd.get_dummies(frame.cpuArch),
                                       validate=False)
    # Cluster the 2-component PCA projection into 3 Gaussian mixture groups
    # and expose the labels as a single column.
    gmm_cluster_ids = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
        FunctionTransformer(
            lambda z: GaussianMixture(n_components=3).fit_predict(z)[np.newaxis].T),
    )
    return make_union(
        drop_transform,
        gmm_cluster_ids,
        os_text_features,
        arch_one_hot,
        pca_two_components,
    )
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    """Partial least squares via truncated SVD of the cross-covariance.

    Returns (X_saliences, Y_saliences, singular_values, inertia); when
    `scale` is true the z-scored X and Y used to build the covariance are
    appended to the returned tuple.
    """
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        cross_cov = np.dot(Y_scaled.T, X_scaled)
    else:
        cross_cov = np.dot(Y.T, X)
    svd = TruncatedSVD(n_components, algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(cross_cov)
    X_saliences = X_saliences.T
    # Total retained "inertia" = sum of kept singular values.
    inertia = singular_values.sum()
    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled
    return X_saliences, Y_saliences, singular_values, inertia
def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Performs PCA on feature pandas dataframe and reduces number of principal
    components to those which explain a defined variance.  Falls back to a
    3-component truncated SVD when MLE-based PCA raises ValueError.

    Fix: Python 2 `print` statement converted to a print() call.
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca:')
    print(dataframe_without_columns.columns)
    # Tiny noise keeps degenerate (constant) columns from breaking PCA.
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19
    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
    except ValueError:
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_
    # Keep adding components until their aggregated variance ratio reaches
    # maxvariance.
    scomp = 0
    ncomp = 0
    while scomp < maxvariance:
        scomp = scomp + list_variance[ncomp] / sum_variance
        ncomp = ncomp + 1
    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info("Number of features after PCA transformation %s" % samples.shape[1])
    return samples
def fit_transform(self, Xs):
    """
    Optimize each CC components and return per-data set projections.
    :param Xs: List of matrices with the same number of columns.
    :return: CCA subspace.
    """
    p = len(Xs)
    # One projection matrix (samples x components) per data set.
    Ws = [np.zeros((X.shape[0], self.n_components)) for X in Xs]
    if self.standardize:
        Xs = list(map(_standardize, Xs))
    # Warm-start the weight vectors from a truncated SVD of each data set.
    Ws_init = [
        TruncatedSVD(n_components=self.n_components,
                     random_state=self.random_state).fit_transform(X)
        for X in Xs
    ]
    correlations = np.zeros((self.n_components, ))
    # Optimize each CC component individually
    for cc in range(self.n_components):
        # Unit-normalise the cc-th initial weight vector of every data set.
        w_cur = [Wi[:, cc] / np.linalg.norm(Wi[:, cc]) for Wi in Ws_init]
        for itr in range(self.max_iter):
            o1 = self._objective(Xs, w_cur)
            # Block-coordinate step: refresh each data set's weight from all
            # the others, subtracting the contribution of already-fixed
            # components (the Dj term).
            for i in range(p):
                wi = 0
                for j in range(p):
                    if i == j:
                        continue
                    wj = w_cur[j]
                    Dj = np.diag(
                        np.diagonal(Ws[i].T.dot(Xs[i]).dot(Xs[j].T.dot(
                            Ws[j]))))
                    wi += Xs[i].dot((Xs[j].T.dot(wj))) - Ws[i].dot(Dj).dot(
                        Ws[j].T).dot(wj)
                w_cur[i] = wi / np.linalg.norm(wi)
            o2 = self._objective(Xs, w_cur)
            # Stop when the relative objective change falls below tolerance.
            if abs(o2 - o1) / abs(o1) < self.tol:
                break
        for i in range(p):
            Ws[i][:, cc] = w_cur[i]
        # Compute average correlations
        n_pairs = p * (p - 1) / 2
        for i, j in it.combinations(range(p), 2):
            wi = Ws[i][:, cc].T.dot(Xs[i])
            wj = Ws[j][:, cc].T.dot(Xs[j])
            correlations[cc] += pearsonr(wi, wj)[0] / n_pairs
    # Orientate vectors
    # Flip signs so the first data set's first-row loadings are positive.
    s = np.sign(Ws[0][0, :])
    for i in range(p):
        Ws[i] = Ws[i] * s
    self.correlations = correlations
    return Ws
def compute_reduced_embeddings_original_vocab(output_vocab_filepath, output_embeddings_filepath, input_vocab_filepath, vocab_size, embedding_dim):
    """Build a vocab restricted to words with spaCy vectors and save
    (optionally SVD-reduced) embeddings for it.

    NOTE: Python 2 code — uses `print` statements, `unicode`, and
    dict.iteritems(); left as-is to avoid breaking the py2 runtime.
    """
    print N_FREE_TOKENS
    # Load extra candidates so enough survive the spaCy-membership filter.
    vocab = Vocab(input_vocab_filepath, 1.5 * vocab_size)
    spacy_vocab = spacy.load('en').vocab
    matrix = np.zeros((vocab_size, spacy_vocab.vectors_length),
                      dtype=np.float32)
    new_i = 0
    final_vocab = []
    # Walk the original vocab in id order; reserved ids (< N_FREE_TOKENS)
    # are always kept, other words only if spaCy knows them.
    for i, word in vocab._id_to_word.iteritems():
        if new_i == vocab_size:
            break
        if i >= N_FREE_TOKENS and unicode(word) not in spacy_vocab:
            continue
        if i >= N_FREE_TOKENS:
            final_vocab.append(word)
        matrix[new_i] = spacy_vocab[unicode(word)].vector
        new_i += 1
    print 'Last word added:', final_vocab[-1]
    if embedding_dim < spacy_vocab.vectors_length:
        # Reduce embedding width with an exact (ARPACK) truncated SVD and
        # report cumulative explained variance per component.
        svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
        embeddings = svd.fit_transform(matrix)
        print embeddings.shape
        print[
            sum(svd.explained_variance_ratio_[:i])
            for i in range(1, embedding_dim + 1)
        ]
    else:
        embeddings = matrix
    with open(output_vocab_filepath, 'w') as output:
        for word in final_vocab:
            output.write('%s\n' % word)
    np.save(output_embeddings_filepath, embeddings)
def __init__(self, path, corpusName, query=None):
    """Build TF-IDF, LSA (300-component SVD) and LDA models over the corpus
    in `corpusName`.txt; `query` tokens, if given, are appended to the corpus
    so the query lives in the same vector spaces.  `path` is currently unused.

    Fix: Python 2 `print` statements converted to print() calls (the block
    already mixed in print() calls).
    """
    self.query = query
    documents = (line.lower().split() for line in codecs.open(
        corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
    self.corpus = [' '.join(i) for i in documents]
    if self.query is not None:
        self.corpus.append(' '.join(query.getTokens()))
    # Make models
    t0 = time()
    print("Creating SciKit TF-IDF Model")
    self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))
    print("Creating SciKit LSA Model")
    t0 = time()
    lsa = TruncatedSVD(n_components=300)
    self.lsaModel = lsa.fit_transform(self.tfidfModel)
    self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
    print("Done in %0.3fs." % (time() - t0))
    print("Creating SciKit LDA Model")
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA")
    tf_vectorizer = CountVectorizer(max_features=2000)
    t0 = time()
    tf = tf_vectorizer.fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))
    print("Fitting LDA model")
    # NOTE(review): `n_topics` was renamed `n_components` in newer
    # scikit-learn; confirm the pinned version before upgrading.
    lda = LatentDirichletAllocation(n_topics=300,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    self.ldaModel = lda.fit_transform(tf)
    self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
    print("Done in %0.3fs." % (time() - t0))
def _boostrap(X, Y, X_saliences, Y_saliences, X_saliences_bootstraps, Y_saliences_bootstraps, bootstrap_i, n_components, algorithm="randomized"):
    """One bootstrap iteration; writes the Procrustes-rotated saliences into
    slice `bootstrap_i` of the output arrays (mutated in place)."""
    n_rows = X.shape[0]
    resample = np.random.choice(list(range(n_rows)), size=n_rows, replace=True)
    X_scaled = scale(X[resample, :])
    Y_scaled = scale(Y[resample, :])
    boot_cov = np.dot(Y_scaled.T, X_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_sal_boot, _, X_sal_boot = svd._fit(boot_cov)
    X_sal_boot = X_sal_boot.T
    # Rotate via the smaller side; apply the same rotation to the other side.
    if len(X_sal_boot) > len(Y_sal_boot):
        Y_saliences_bootstraps[:, :, bootstrap_i], rotation = _procrustes_rotation(
            Y_saliences, Y_sal_boot)
        X_saliences_bootstraps[:, :, bootstrap_i] = np.dot(X_sal_boot, rotation)
    else:
        X_saliences_bootstraps[:, :, bootstrap_i], rotation = _procrustes_rotation(
            X_saliences, X_sal_boot)
        Y_saliences_bootstraps[:, :, bootstrap_i] = np.dot(Y_sal_boot, rotation)
def __init__(self, n_components=2, algorithm='randomized', n_iter=5, random_state=None, tol=0.0):
    """Record hyperparameters and construct the wrapped `Op` model."""
    self._hyperparams = dict(
        n_components=n_components,
        algorithm=algorithm,
        n_iter=n_iter,
        random_state=random_state,
        tol=tol,
    )
    self._wrapped_model = Op(**self._hyperparams)
def __init__(self, path, corpusName, query=None):
    """Build TF-IDF, LSA (300-component SVD) and LDA models over the corpus
    in `corpusName`.txt; `query` tokens, if given, are appended to the corpus
    so the query lives in the same vector spaces.  `path` is currently unused.

    Fix: Python 2 `print` statements converted to print() calls (the block
    already mixed in print() calls).
    """
    self.query = query
    documents = (line.lower().split() for line in codecs.open(
        corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
    self.corpus = [' '.join(i) for i in documents]
    if self.query is not None:
        self.corpus.append(' '.join(query.getTokens()))
    # Make models
    t0 = time()
    print("Creating SciKit TF-IDF Model")
    self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))
    print("Creating SciKit LSA Model")
    t0 = time()
    lsa = TruncatedSVD(n_components=300)
    self.lsaModel = lsa.fit_transform(self.tfidfModel)
    self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
    print("Done in %0.3fs." % (time() - t0))
    print("Creating SciKit LDA Model")
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA")
    tf_vectorizer = CountVectorizer(max_features=2000)
    t0 = time()
    tf = tf_vectorizer.fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))
    print("Fitting LDA model")
    # NOTE(review): `n_topics` was renamed `n_components` in newer
    # scikit-learn; confirm the pinned version before upgrading.
    lda = LatentDirichletAllocation(n_topics=300,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    self.ldaModel = lda.fit_transform(tf)
    self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
    print("Done in %0.3fs." % (time() - t0))
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    """PLS via truncated SVD of the (optionally z-scored) cross-covariance.

    Returns saliences, singular values and inertia; with `scale` also the
    z-scored matrices and the covariance itself (`sum_var`).

    Fix: Python 2 `print` statements converted to print() calls.
    """
    print("calculating SVD")
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)
    print(np.shape(covariance))
    sum_var = covariance
    # Computes only the first n_components largest singular values,
    # producing a low-rank approximation of the covariance matrix.
    svd = TruncatedSVD(n_components, algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()
    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled, sum_var
    else:
        return X_saliences, Y_saliences, singular_values, inertia
def __init__(self):
    """Set up the SVD decomposer and the k-NN classifier."""
    # 2015-05-15 GEL n_components=20 balances speed (substantial
    # improvement), accuracy, and memory usage (25% decrease).
    self.decomposer = TruncatedSVD(n_components=20)
    # 2015-05-15 GEL ball_tree uses less memory on average than kd_tree.
    # Metric accuracy (8000 training examples): euclidean 0.950025,
    # manhattan 0.933533, chebyshev 0.675662, hamming 0.708646,
    # canberra 0.934033, braycurtis 0.940530.
    self.model = KNeighborsClassifier(n_neighbors=5,
                                      algorithm='ball_tree',
                                      metric='euclidean')
def build_accesson(options):
    """Cluster peaks into `options.ngroup` "accessons" and write per-cell
    aggregated read counts.

    Reads the filtered cell/peak matrix under `options.s`, normalises and
    log-transforms it, projects peaks into PCA space, ward-clusters them,
    and writes Accesson_peaks.csv (peak -> group) and Accesson_reads.csv
    (cell x group summed reads).
    """
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
    # Convert to CSR float so column scaling / slicing is efficient.
    reads = scipy.sparse.csr_matrix(reads) * 1.0
    cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv',
                            sep='\t',
                            index_col=0,
                            engine='c',
                            na_filter=False,
                            low_memory=False)
    cells = cells.index.values
    peaks = ['peak' + str(x) for x in range(0, reads.shape[0])]
    # Per-column (cell) normalisation to 10k reads, then log2(1 + x).
    scale = numpy.array(10000.0 / reads.sum(axis=0))[0]
    sklearn.utils.sparsefuncs.inplace_column_scale(reads, scale)
    reads.data = numpy.log2(reads.data + 1)
    # Cap the component count by the matrix dimensions.
    npc = min(int(options.npc), reads.shape[0], reads.shape[1])
    # For large data use sparse TruncatedSVD; otherwise exact dense PCA.
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc,
                                  algorithm='arpack',
                                  random_state=0).fit_transform(reads)
    else:
        pca_result = PCA(n_components=npc,
                         svd_solver='full').fit_transform(reads.A)
    # Symmetrised kNN graph constrains the ward clustering to local structure.
    connectivity = kneighbors_graph(pca_result, n_neighbors=10, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups,
                                                   linkage='ward',
                                                   connectivity=connectivity)
    # ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward')
    y_predict = ward_linkage.fit_predict(pca_result)
    peak_labels_df = pandas.DataFrame(y_predict, index=peaks, columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    # Sum reads of all peaks in each group -> (group x 1 x cell), then squeeze.
    coAccess_matrix = numpy.array(
        [reads[numpy.where(y_predict == x)[0], :].sum(axis=0) for x in groups])
    coAccess_matrix = coAccess_matrix[:, 0, :].T
    coAccess_df = pandas.DataFrame(coAccess_matrix, index=cells, columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
class SimilarityRecommender(object):
    """Item-item similarity recommender built on truncated-SVD item features."""

    def __init__(self, feature_size=10):
        self.feature_size = feature_size
        self.svd = TruncatedSVD(n_components=feature_size)
        self.rating = None

    def fit(self, rating):
        """Compute pairwise cosine similarities between SVD item features.

        `rating` is an (item x user) matrix.
        """
        self.rating = rating
        item = self.svd.fit_transform(rating)
        similarity = defaultdict(lambda: dict())
        n_item, n_user = rating.shape
        # Only the lower triangle is stored; get_similarity() orders the keys.
        for first in tqdm(range(n_item)):
            for second in range(first):
                first_item = item[first].reshape(1, -1)
                second_item = item[second].reshape(1, -1)
                similarity[first][second] = float(
                    cosine_similarity(first_item, second_item)[0, 0])
        self.similarity = dict(similarity)
        return self

    def predict(self, user, item):
        """Similarity-weighted average of the user's observed ratings.

        Bug fix: the original iterated `rating[:, user].nonzero()` directly —
        that yields index *arrays*, not (item, rating) pairs — and shadowed
        `item` inside the denominator generator, so it could never compute a
        meaningful score.
        """
        rated_items = self.rating[:, user].nonzero()[0]
        absolute_score = sum(
            self.get_similarity(item, rated) * self.rating[rated, user]
            for rated in rated_items)
        total_rating = sum(self.rating[rated, user] for rated in rated_items)
        return float(absolute_score) / total_rating

    def similar_to(self, item, n=5):
        # TODO: not implemented; intentionally returns None.
        return

    def get_similarity(self, item, target):
        """Lower-triangle lookup: the pair is stored under the larger index."""
        return self.similarity[item][
            target] if item > target else self.similarity[target][item]

    def save(self, filepath):
        """Write the similarity table as JSON (int keys become strings)."""
        with open(filepath, 'w') as handle:
            json.dump(self.similarity, handle)
names=header, engine='python')
# NOTE(review): this chunk begins mid-statement — the keyword arguments above
# close a call started before this view (presumably a pandas read_csv that
# builds `active_time_data`); confirm upstream.

# Number of users in current set
print('Number of unique users in current data-set',
      active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set',
      active_time_data.item_id.unique().shape[0])

# SVD allows us to look at our input matrix as a product of three smaller matrices; U, Z and V.
# In short this will help us discover concepts from the original input matrix,
# (subsets of users that like subsets of items)
# Note that use of SVD is not strictly restricted to user-item matrices
# https://www.youtube.com/watch?v=P5mlg91as1c
algorithm = TruncatedSVD()

# Finally we run our cross validation in n folds, where n is denoted by the cv parameter.
# Verbose can be adjusted by an integer to determine level of verbosity.
# We pass in our SVD algorithm as the estimator used to fit the data.
# X is our data set that we want to fit.
# Since our estimator (The SVD algorithm), We must either define our own estimator, or we can simply define how it
# score the fitting.
# Since we currently evaluate the enjoyment of our users per article highly binary, (Please see the rate_article fn in
# the filter script), we can easily decide our precision and recall based on whether or not our prediction exactly
# matches the binary rating field in the test set.
# This, the F1 scoring metric seems an intuitive choice for measuring our success, as it provides a balanced score
# based on the two.
# NOTE(review): `cv` is presumably an alias for a cross-validation helper
# imported above; TruncatedSVD has no predict/score method, so scoring='f1'
# with it looks suspect — verify this call actually runs.
cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
def __init__(self):
    """Load (or generate and cache) the train/test regression inputs.

    Sets up a 2-component TruncatedSVD, then for each of the training and
    testing data sets either loads the cached .npz arrays or regenerates
    them via RegressionInput and saves them for next time.
    """
    self.components = 2
    self.svd = TruncatedSVD(n_components=self.components)
    self.reductCount = 0
    for file_name, data_set in [
        (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME, FileIo.TRAINING_DATA_SET),
        (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME, FileIo.TEST_DATA_SET)
    ]:
        # Check and see if the data has already been saved
        try:
            logging.info("RunRegression: Trying to load " + data_set + " data")
            # mmap avoids pulling the whole cached array into memory.
            saved_data = numpy.load(file_name, mmap_mode='r')
        # If the data is not found, generate and cache it
        except IOError:
            logging.info(
                "RunRegression: Saved data not found. Generating " + data_set + " data")
            # Generate inputs
            poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
            order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                poi_district_lookup)
            regression_input = RegressionInput.RegressionInput(
                data_set, order_categorical_lookup, poi_district_lookup)
            if data_set == FileIo.TRAINING_DATA_SET:
                self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                    self.training_number_of_orders = regression_input.get_regression_inputs()
                # Save the data for next time
                numpy.savez(
                    file_name,
                    order_keys=self.training_order_start_end_districts_and_time,
                    order_value_price=self.training_order_median_price,
                    order_value_number=self.training_number_of_orders)
            else:
                self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                    self.testing_number_of_orders = regression_input.get_regression_inputs()
                # Save the data for next time
                numpy.savez(
                    file_name,
                    order_keys=self.testing_order_start_end_districts_and_time,
                    order_value_price=self.testing_order_median_price,
                    order_value_number=self.testing_number_of_orders)
        # If the saved data is found, load it (try/except/else: runs only
        # when numpy.load succeeded)
        else:
            logging.info("RunRegression: Loading " + data_set + " data")
            if data_set == FileIo.TRAINING_DATA_SET:
                self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                    self.training_number_of_orders = saved_data['order_keys'], \
                    saved_data['order_value_price'], \
                    saved_data['order_value_number']
                self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
                self.initial = self.training_order_start_end_districts_and_time
                logging.info("RunRegression: Loaded " +
                             str(len(self.training_number_of_orders)) +
                             " train data rows")
            else:
                self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                    self.testing_number_of_orders = saved_data['order_keys'], \
                    saved_data['order_value_price'], \
                    saved_data['order_value_number']
                self.initialTesting = self.testing_order_start_end_districts_and_time
                logging.info("RunRegression: Loaded " +
                             str(len(self.testing_number_of_orders)) +
                             " test data rows")
def buildModel(self):
    """Build the LSA model: TF-IDF, 200-component SVD, then L2 normalisation."""
    tfidf_matrix = TfidfVectorizer().fit_transform(self.corpus)
    lsa = TruncatedSVD(n_components=200)
    reduced = lsa.fit_transform(tfidf_matrix)
    self.Model = Normalizer(copy=False).fit_transform(reduced)
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)

# Fix: `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `model_selection` is the supported location of these helpers.
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv',
                 header=0,
                 sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])
reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

# Fix: import from the public package path, not the private module file
# (`sklearn.decomposition.truncated_svd` no longer exists).
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier
# Stalled with 1000000 estimators; try more parameter settings.
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
class RunRegression(object):
    """Train/evaluate several regressors on ride-order data and reduce
    feature groups (district, time, POI, weather, traffic) with truncated SVD.

    Inputs are loaded from cached ``.npz`` files when present, otherwise
    regenerated via RegressionInput and cached.
    """

    # Cache file names for the generated regression inputs.
    REGRESSION_TRAINING_INPUT_FILE_NAME = "RegressionTrainingInput.npz"
    REGRESSION_TESTING_INPUT_FILE_NAME = "RegressionTestingInput.npz"
    MAXIMUM_NUMBER_OF_JOBS = -1
    NUMBER_OF_CROSS_VALIDATION_FOLDS = 5
    ROWS_TO_USE_FOR_GAUSSIAN_KERNEL_REGRESSION = 15
    # Column widths of the one-hot / feature groups inside each input row
    # (presumably: district, time, POI, weather, traffic — TODO confirm
    # against RegressionInput).
    DISTRICT_SIZE = 132
    TIME_SIZE = 152
    POI_SIZE = 352
    WEATHER_SIZE = 9
    TRAFFIC_SIZE = 8

    def __init__(self):
        # Shared SVD reducer: every *Reduction method reuses this instance,
        # so each fit_transform refits it for that feature group.
        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:
            # Check and see if the data has already been saved
            try:
                logging.info("RunRegression: Trying to load " + data_set +
                             " data")
                saved_data = numpy.load(file_name, mmap_mode='r')
            # If the data is not found, generate and cache it
            except IOError:
                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")
                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)
                if data_set == FileIo.TRAINING_DATA_SET:
                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()
                    # Save the data for next time
                    # NOTE(review): on this generate path self.dimensions and
                    # self.initial are never set (they are only set on the
                    # load path below) — confirm whether later calls to
                    # reduction() assume a prior cached load.
                    numpy.savez(
                        file_name,
                        order_keys=self.training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)
                else:
                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = regression_input.get_regression_inputs()
                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)
            # If the saved data is found, load it
            else:
                logging.info("RunRegression: Loading " + data_set + " data")
                if data_set == FileIo.TRAINING_DATA_SET:
                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                        saved_data['order_value_price'], \
                        saved_data['order_value_number']
                    # Feature count and a pristine copy kept for reduction().
                    self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:
                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                        saved_data['order_value_price'], \
                        saved_data['order_value_number']
                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")

    """ Run sgd regression """

    def run_sgd_regression(self):
        """Grid-search SGDRegressor hyper-parameters with k-fold
        cross-validation (mini-batch partial_fit), then retrain with the
        best combination on the full training set and report test MSE."""
        losses = ["squared_loss"]
        penalties = ["none", "l2", "l1", "elasticnet"]
        initial_learning_rates = [0.1, 0.01, 0.001]
        learning_rates = ["constant", "optimal", "invscaling"]
        lowest_ride_prediction_error = float('inf')
        best_loss = ""
        best_penalty = ""
        best_initial_learning_rate = 0.0
        best_learning_rate = ""
        # Find the best hyper-parameters
        for loss in losses:
            for penalty in penalties:
                for initial_learning_rate in initial_learning_rates:
                    for learning_rate in learning_rates:
                        mean_ride_prediction_error = 0.0
                        # Do k-fold cross-validation using mini-batch training.
                        for testing_fold_number in range(
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS):
                            # Create the sgd regressor using the input parameters
                            sgd_regressor = linear_model.SGDRegressor(
                                loss=loss,
                                penalty=penalty,
                                eta0=initial_learning_rate,
                                learning_rate=learning_rate)
                            # Run mini batch training for the fold if its not the training fold
                            for fold_number in range(
                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS):
                                if fold_number == testing_fold_number:
                                    continue
                                training_start_row = fold_number * \
                                    len(self.training_order_start_end_districts_and_time) // \
                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                                training_end_row = (fold_number + 1) * \
                                    len(self.training_order_start_end_districts_and_time) // \
                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                                logging.info(
                                    "RunRegression: " +
                                    str(RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS) +
                                    " fold cross validation training SGD Regressor for fold "
                                    + str(fold_number) + ", starting row " +
                                    str(training_start_row) + ", ending row " +
                                    str(training_end_row) + ", loss " + loss +
                                    ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)
                                # Train regression model
                                sgd_regressor\
                                    .partial_fit(X=self.training_order_start_end_districts_and_time[training_start_row:training_end_row],
                                                 y=self.training_number_of_orders[training_start_row:training_end_row])
                            # NOTE(review): the held-out fold is sliced from the
                            # *testing* arrays, not the training arrays — confirm
                            # this is intentional.
                            testing_start_row = testing_fold_number * \
                                len(self.testing_order_start_end_districts_and_time) // \
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                            testing_end_row = (testing_fold_number + 1) * \
                                len(self.testing_order_start_end_districts_and_time) // \
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                            predicted_number_of_orders = sgd_regressor\
                                .predict(self.testing_order_start_end_districts_and_time[testing_start_row:testing_end_row])
                            current_ride_prediction_error = numpy.mean(
                                (predicted_number_of_orders -
                                 self.testing_number_of_orders[testing_start_row:testing_end_row])**2)
                            logging.info(
                                "RunRegression: Prediction error for fold " +
                                str(testing_fold_number) + " is " +
                                str(current_ride_prediction_error))
                            mean_ride_prediction_error += current_ride_prediction_error
                            # Early-abandon this combination when its running
                            # mean already exceeds the best complete run.
                            if RunRegression.__is_mean_prediction_error_too_high(
                                    mean_ride_prediction_error,
                                    lowest_ride_prediction_error):
                                logging.info(
                                    "RunRegression: Mean prediction error of " +
                                    str(mean_ride_prediction_error) +
                                    "is too high compared to best so far " +
                                    str(lowest_ride_prediction_error) +
                                    ". Ending current cross validation.")
                                break
                        # for-else: runs only when no early break happened.
                        else:
                            mean_ride_prediction_error /= RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                            logging.info(
                                "RunRegression: Mean prediction error is " +
                                str(mean_ride_prediction_error))
                            # Save values if better than previous best
                            if mean_ride_prediction_error < lowest_ride_prediction_error:
                                logging.info(
                                    "RunRegression: mean error of " +
                                    str(mean_ride_prediction_error) +
                                    " is the best so far. Saving loss " + loss +
                                    ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)
                                lowest_ride_prediction_error = mean_ride_prediction_error
                                best_loss = loss
                                best_penalty = penalty
                                best_initial_learning_rate = initial_learning_rate
                                best_learning_rate = learning_rate
        # Retrain on the full training set with the winning combination.
        logging.info(
            "RunRegression: Running regression with best values so far: loss " +
            best_loss + ", penalty " + best_penalty +
            ", initial learning rate " + str(best_initial_learning_rate) +
            " and learning rate " + best_learning_rate)
        sgd_regressor = linear_model.SGDRegressor(
            loss=best_loss,
            penalty=best_penalty,
            eta0=best_initial_learning_rate,
            learning_rate=best_learning_rate)
        sgd_regressor.fit(X=self.training_order_start_end_districts_and_time,
                          y=self.training_number_of_orders)
        best_predicted_number_of_orders = sgd_regressor.predict(
            self.testing_order_start_end_districts_and_time)
        coef = sgd_regressor.coef_
        print(coef)
        logging.info(
            "RunRegression: Mean squared prediction error after cross validation is "
            + str(
                numpy.mean((best_predicted_number_of_orders -
                            self.testing_number_of_orders)**2)))

    """ Check if mean prediction error is to high to qualify as the best so far """

    @staticmethod
    def __is_mean_prediction_error_too_high(cumulative_mean_prediction_error,
                                            best_prediction_error_so_far):
        # True when the running mean (cumulative / k) already exceeds the
        # best full-run error seen so far.
        return cumulative_mean_prediction_error / RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS > \
            best_prediction_error_so_far

    """ Run regression based on multidimensional scaling """

    def run_mds_regression(self):
        """Eigendecompose X^T.X of the training data and (in DEBUG mode)
        plot the sorted eigenvalue spectrum."""
        # Create a square matrix with number of test data rows preserved
        training_data_square_matrix = numpy.dot(
            self.training_order_start_end_districts_and_time.T,
            self.training_order_start_end_districts_and_time)
        logging.info("RunRegression: Square matrix shape " +
                     str(training_data_square_matrix.shape))
        # Get Eigen values and eigen vectors
        training_data_eigen_values, training_data_eigen_vectors = linalg.eig(
            training_data_square_matrix)
        #print(training_data_eigen_values)
        #print(training_data_eigen_vectors)
        print(self.training_order_start_end_districts_and_time)
        # Sort eigenpairs by descending eigenvalue.
        sorted_index = training_data_eigen_values.argsort()[::-1]
        sorted_training_data_eigen_values = training_data_eigen_values[
            sorted_index]
        sorted_training_data_eigen_vectors = training_data_eigen_vectors[:,
                                                                         sorted_index]
        logging.info("RunRegression: Found " +
                     str(len(sorted_training_data_eigen_values)) +
                     " eigen values.")
        logging.info("RunRegression: Eigen vectors have length " +
                     str(len(sorted_training_data_eigen_vectors[0])))
        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            # NOTE(review): __show_eigen_values_trend is a @staticmethod that
            # also declares a `self` parameter; this keyword-only call omits
            # it and would raise TypeError — confirm and fix the signature.
            RunRegression.__show_eigen_values_trend(
                eigen_values=sorted_training_data_eigen_values)

    """ Show Eigen values trend """

    @staticmethod
    def __show_eigen_values_trend(self, eigen_values):
        # NOTE(review): `self` on a @staticmethod is almost certainly a
        # mistake (see caller above).
        # Plot eigen values
        plt.plot(eigen_values)
        plt.ylabel('Eigen Values')
        plt.title('Sorted Eigen Values')
        plt.show()

    def leastAngleRegression(self):
        """Fit Least Angle Regression and print test MSE and coefficients."""
        lar = linear_model.Lars()
        lar.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = lar.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(lar.coef_)

    def orthogonalMatchingPursuit(self):
        """Fit Orthogonal Matching Pursuit (10 nonzero coefs) and print
        test MSE and coefficients."""
        omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=10)
        omp.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = omp.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(omp.coef_)

    def theilSenRegressor(self):
        """Fit a Theil-Sen robust regressor and print test MSE and
        coefficients."""
        tsr = linear_model.TheilSenRegressor()
        tsr.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = tsr.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(tsr.coef_)

    def polynomial(self):
        """Expand features to degree-3 polynomials, fit linear regression,
        and print test MSE and coefficients.

        NOTE(review): overwrites
        self.training_order_start_end_districts_and_time with the expanded
        matrix — later calls see the transformed data.
        """
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)
        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)

    def svm(self):
        """Fit a one-class SVM and print test MSE and coefficients.

        NOTE(review): OneClassSVM is an outlier detector, and coef_ only
        exists for a linear kernel — confirm this method is intended.
        """
        oneClass = svm.OneClassSVM()
        logging.info("svm fit")
        oneClass.fit(self.training_order_start_end_districts_and_time,
                     self.training_number_of_orders)
        logging.info("svm predict")
        predicted_number_of_orders = oneClass.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(oneClass.coef_)

    def districtReduction(self, keyType, key):
        """Replace the leading DISTRICT_SIZE columns of `key` with
        self.components SVD components; keyType "training" refits the
        shared SVD, anything else reuses the last fit.
        (Assumes sliceTransform slices columns [start, end) — TODO confirm.)"""
        y = key
        districts = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                           self.DISTRICT_SIZE)
        if keyType == "training":
            districtRed = self.svd.fit_transform(
                districts, self.training_number_of_orders)
        else:
            districtRed = self.svd.transform(districts)
        nonDistrict = numpy.apply_along_axis(sliceTransform, 1, y,
                                             self.DISTRICT_SIZE,
                                             self.dimensions)
        keyWithDist = numpy.append(districtRed, nonDistrict, axis=1)
        return keyWithDist

    def timeReduction(self, keyType, key):
        """Reduce the TIME_SIZE columns that follow the already-reduced
        district block (offset self.components) with the shared SVD."""
        y = key
        time = numpy.apply_along_axis(sliceTransform, 1, y, self.components,
                                      self.TIME_SIZE + self.components)
        if keyType == "training":
            timeRed = self.svd.fit_transform(time,
                                             self.training_number_of_orders)
        else:
            timeRed = self.svd.transform(time)
        befTime = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                         self.components)
        aftTime = numpy.apply_along_axis(sliceTransform, 1, y,
                                         self.TIME_SIZE + self.components,
                                         self.dimensions)
        keyWithTime = numpy.append(befTime, timeRed, axis=1)
        keyWithTime = numpy.append(keyWithTime, aftTime, axis=1)
        return keyWithTime

    def POIReduction(self, keyType, key):
        """Reduce the POI_SIZE columns at offset 2*self.components (after
        the reduced district and time blocks) with the shared SVD."""
        y = key
        poi = numpy.apply_along_axis(sliceTransform, 1, y,
                                     self.components * 2,
                                     self.POI_SIZE + self.components * 2)
        if keyType == "training":
            poiRed = self.svd.fit_transform(poi,
                                            self.training_number_of_orders)
        else:
            poiRed = self.svd.transform(poi)
        befPoi = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                        self.components * 2)
        aftPoi = numpy.apply_along_axis(sliceTransform, 1, y,
                                        self.POI_SIZE + self.components * 2,
                                        self.dimensions)
        keyWithPoi = numpy.append(befPoi, poiRed, axis=1)
        keyWithPoi = numpy.append(keyWithPoi, aftPoi, axis=1)
        return keyWithPoi

    def WeatherReduction(self, keyType, key):
        """Reduce the WEATHER_SIZE columns at offset 3*self.components with
        the shared SVD."""
        y = key
        weather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 3,
            self.WEATHER_SIZE + self.components * 3)
        if keyType == "training":
            weatherRed = self.svd.fit_transform(
                weather, self.training_number_of_orders)
        else:
            weatherRed = self.svd.transform(weather)
        befWeather = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 3)
        aftWeather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.WEATHER_SIZE + self.components * 3,
            self.dimensions)
        keyWithWeather = numpy.append(befWeather, weatherRed, axis=1)
        keyWithWeather = numpy.append(keyWithWeather, aftWeather, axis=1)
        return keyWithWeather

    def TrafficReduction(self, keyType, key):
        """Reduce the trailing TRAFFIC_SIZE columns at offset
        4*self.components; on the first training pass also box-plot the
        reduced components (which prunes outlier rows via boxPlot)."""
        y = key
        traffic = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 4,
            self.TRAFFIC_SIZE + self.components * 4)
        if keyType == "training":
            trafficRed = self.svd.fit_transform(
                traffic, self.training_number_of_orders)
            # Only plot/prune once per run.
            if self.reductCount == 0:
                self.boxPlot(trafficRed)
                self.reductCount = 1
        else:
            trafficRed = self.svd.transform(traffic)
        befTraffic = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 4)
        keyWithTraffic = numpy.append(befTraffic, trafficRed, axis=1)
        return keyWithTraffic

    def wholeReductionTraining(self):
        """SVD-reduce the entire training matrix at once (and box-plot /
        prune outliers for the first two calls)."""
        y = self.training_order_start_end_districts_and_time
        b = self.svd.fit_transform(y, self.training_number_of_orders)
        if self.reductCount < 2:
            self.boxPlot(b)
            self.reductCount += 1
        self.training_order_start_end_districts_and_time = b

    def wholeReductionTesting(self):
        """Project the testing matrix through the SVD fitted by
        wholeReductionTraining."""
        y = self.testing_order_start_end_districts_and_time
        b = self.svd.transform(y)
        self.testing_order_start_end_districts_and_time = b

    def reduction(self):
        """Run the per-group reductions in sequence (districts, time, POI,
        weather, traffic), resetting to the pristine copies first.

        NOTE(review): this calls the module-level global `run_regression`
        instead of `self`; it only works on that singleton instance —
        confirm whether these should be self.* calls.
        """
        self.training_order_start_end_districts_and_time = self.initial
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        self.testing_order_start_end_districts_and_time = self.initialTesting
        logging.info("RunRegression: Reducing Districts")
        self.training_order_start_end_districts_and_time = run_regression.districtReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.districtReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        # x/y pick out the two reduced components (used by the commented-out
        # scatter plots below).
        x = self.testing_order_start_end_districts_and_time[:, 0:1]
        y = self.testing_order_start_end_districts_and_time[:, 1:2]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.dimensions)
        logging.info("RunRegression: Reducing Time")
        self.training_order_start_end_districts_and_time = run_regression.timeReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.timeReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 2:3]
        y = self.training_order_start_end_districts_and_time[:, 3:4]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing POI")
        self.training_order_start_end_districts_and_time = run_regression.POIReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.POIReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 4:5]
        y = self.training_order_start_end_districts_and_time[:, 5:6]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Weather")
        self.training_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 6:7]
        y = self.training_order_start_end_districts_and_time[:, 7:8]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Traffic")
        self.training_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 8:9]
        y = self.training_order_start_end_districts_and_time[:, 9:10]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.initial.shape)

    def boxPlot(self, arrayBox):
        """Box-plot `arrayBox`, find rows contributing outlier ("flier")
        points, and drop those rows from self.initial and
        self.training_number_of_orders."""
        a = plt.boxplot(arrayBox)
        plt.show()
        idx = set()
        idxSet = set(
            numpy.arange(len(
                self.training_order_start_end_districts_and_time)))
        # Collect row indices of every flier point.
        for d in a['fliers']:
            print(len(d.get_ydata()))
            for point in d.get_ydata():
                pIdx = numpy.where(arrayBox == point)
                for rIdx in pIdx[0]:
                    idx.add(rIdx)
        logging.info("done with loop")
        idxKeep = list(idxSet.difference(idx))
        self.initial = self.initial[[idxKeep], :]
        self.training_number_of_orders = self.training_number_of_orders[[
            idxKeep
        ]]
        # Fancy-indexing with [[idxKeep]] adds a leading axis; strip it.
        self.initial = self.initial.reshape(self.initial.shape[1:])
("ravel", Ravel()), ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\ strip_accents='ascii', analyzer= "word", stop_words='english', norm = "l1", use_idf = True)) ]) # des_rescu_pipe = Pipeline([ # ('sel_num', DataFrameSelector(["Description", "RescuerID"], ravel = True)), # add rescuer to description # ('rm_nan', FnanToStr()), # ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\ # strip_accents='ascii', analyzer= "word", stop_words='english', use_idf = True, norm = None)) # ]) des_pipe_svd = Pipeline([ ('des_pipe', des_pipe), ('SVD', TruncatedSVD(n_components=20) ) #ValueError: n_components must be < n_features; got 140 >= 124 ]) des_pipe_for_svd = replace_step( des_pipe, "tfid_vect", ("tfid_vect", TfidfVectorizer(max_df= 0.95, min_df=0.005, ngram_range=(1,4),\ strip_accents='ascii', analyzer= "word", stop_words='english', norm = "l1", use_idf = True)) ) des_pipe_svd_v2 = Pipeline([('des_pipe_for_svd', des_pipe_for_svd), ('SVD', TruncatedSVD(n_components=20))]) des_pipe_svd_v3 = replace_step(des_pipe_svd_v2, "SVD", ('SVD', TruncatedSVD(n_components=100)))
# NOTE(review): fragment of a larger text-classification script;
# `vectorizer`, `t0`, `X_train`, `X_test`, `opts` are defined earlier
# in the file (outside this span).
X_train = vectorizer.fit_transform(X_train)
# Wall-clock fitting time. NOTE(review): `duration` is assigned twice in
# this span but never printed here.
duration = time() - t0
print("n_samples: %d, n_features: %d" % X_train.shape)
print()
print("Extracting features from the test data using the same vectorizer")
t0 = time()
#X_test,y_test = get_test_data()
X_test = vectorizer.transform(X_test)
duration = time() - t0
print(X_train.shape)
#x,z, X_train = fastica(X_train.toarray())
# NOTE(review): this SVD is constructed but never fitted/applied within
# this span — confirm it is used later or dead code.
svd = TruncatedSVD(n_components=1000)
print(X_train)
#print("n_samples: %d, n_features: %d" % X_test.shape)
print()
# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()
if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
def train(self, model_name, corpus, log, opts, chain_features=None):
    """Train a distributional vector model of narrative-chain events.

    Counts events and co-occurring event pairs over all chains in `corpus`,
    builds a symmetric co-occurrence matrix, optionally applies (P)PMI
    weighting and truncated SVD, then saves and returns the model.

    Args:
        model_name: name/path under which the trained model is saved.
        corpus: iterable of documents exposing get_chains(); also provides
            len() and a `directory` attribute (recorded in metadata).
        log: logger with an info() method.
        opts: options object; fields used: pmi, ppmi, only_verb, adj,
            event_threshold, pair_threshold, svd.
        chain_features: unused here (kept for interface compatibility).

    Returns:
        The saved DistributionalVectorsNarrativeChainModel.
    """
    from whim.entity_narrative import DistributionalVectorsNarrativeChainModel
    log.info("Training context vectors model")
    training_metadata = {
        "data": corpus.directory,
        "pmi": opts.pmi or opts.ppmi,
        "ppmi": opts.ppmi,
    }

    log.info("Extracting event counts")
    pbar = get_progress_bar(len(corpus), title="Event feature extraction")
    # Loop over all the chains to collect event frequencies
    event_counts = Counter()
    for doc_num, document in enumerate(corpus):
        chains = document.get_chains()
        if len(chains):
            event_chains = list(
                DistributionalVectorsNarrativeChainModel.
                extract_chain_feature_lists(chains,
                                            only_verb=opts.only_verb,
                                            adjectives=opts.adj))
            # Count all the events
            for chain in event_chains:
                event_counts.update(chain)
        pbar.update(doc_num)
    pbar.finish()

    if opts.event_threshold is not None and opts.event_threshold > 0:
        log.info("Applying event threshold")
        # Drop events rarer than the threshold
        to_remove = [
            event for (event, count) in event_counts.items()
            if count < opts.event_threshold
        ]
        pbar = get_progress_bar(len(to_remove), title="Filtering counts")
        for i, event in enumerate(to_remove):
            del event_counts[event]
            pbar.update(i)
        pbar.finish()

    log.info("Extracting pair counts")
    pbar = get_progress_bar(len(corpus), title="Pair feature extraction")
    # Second pass: count unordered pairs of surviving events within a chain
    pair_counts = Counter()
    for doc_num, document in enumerate(corpus):
        chains = document.get_chains()
        if len(chains):
            event_chains = list(
                DistributionalVectorsNarrativeChainModel.
                extract_chain_feature_lists(chains,
                                            only_verb=opts.only_verb,
                                            adjectives=opts.adj))
            for chain in event_chains:
                # Count all pairs (unordered, stored once via sorted tuple)
                pairs = []
                for i in range(len(chain) - 1):
                    for j in range(i + 1, len(chain)):
                        if chain[i] in event_counts and chain[j] in event_counts:
                            pairs.append(tuple(sorted([chain[i], chain[j]])))
                pair_counts.update(pairs)
        pbar.update(doc_num)
    pbar.finish()

    if opts.pair_threshold is not None and opts.pair_threshold > 0:
        log.info("Applying pair threshold")
        # Drop pairs rarer than the threshold
        to_remove = [
            pair for (pair, count) in pair_counts.items()
            if count < opts.pair_threshold
        ]
        if to_remove:
            pbar = get_progress_bar(len(to_remove),
                                    title="Filtering pair counts")
            for i, pair in enumerate(to_remove):
                del pair_counts[pair]
                pbar.update(i)
            pbar.finish()
        else:
            log.info("No counts removed")

    # Create a dictionary of the remaining vocabulary
    log.info("Building dictionary")
    dictionary = Dictionary([[event] for event in event_counts.keys()])

    # Put all the co-occurrence counts into a big dense matrix
    log.info("Building counts matrix: vocab size %d" % len(dictionary))
    vectors = numpy.zeros((len(dictionary), len(dictionary)),
                          dtype=numpy.float64)
    for (event0, event1), count in pair_counts.items():
        if event0 in dictionary.token2id and event1 in dictionary.token2id:
            e0, e1 = dictionary.token2id[event0], dictionary.token2id[event1]
            vectors[e0, e1] = count
            # Add the count both ways (it's only stored once above)
            vectors[e1, e0] = count

    if opts.pmi or opts.ppmi:
        # FIX: the original expression was
        #   "Applying %sPMI" % "P" if opts.ppmi else ""
        # which parses as ('Applying %sPMI' % 'P') if opts.ppmi else ''
        # (the conditional binds looser than %), so plain-PMI runs logged
        # an empty message. Parenthesize the conditional.
        log.info("Applying %sPMI" % ("P" if opts.ppmi else ""))
        # Apply PMI to the matrix: log(c_ij * N) - log(c_i) - log(c_j),
        # with masked logs so zero counts become 0 after filling.
        log_totals = numpy.ma.log(vectors.sum(axis=0))
        vectors = numpy.ma.log(vectors * vectors.sum()) - log_totals
        vectors = (vectors.T - log_totals).T
        vectors = vectors.filled(0.)
        if opts.ppmi:
            # Threshold the PMIs at zero
            vectors[vectors < 0.] = 0.

    # Convert to sparse for SVD and storage
    vectors = csr_matrix(vectors)

    if opts.svd:
        log.info("Fitting SVD with %d dimensions" % opts.svd)
        training_metadata["svd from"] = vectors.shape[1]
        training_metadata["svd"] = opts.svd
        vector_svd = TruncatedSVD(opts.svd)
        vectors = vector_svd.fit_transform(vectors)

    log.info("Saving model: %s" % model_name)
    model = DistributionalVectorsNarrativeChainModel(
        dictionary,
        vectors,
        only_verb=opts.only_verb,
        training_metadata=training_metadata,
        adjectives=opts.adj)
    model.save(model_name)
    return model
# Bag-of-words counts for the training documents.
X_train_counts = count_vect.fit_transform(train_data.data)
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))

# Term-frequency only (no idf) transform, then full tf-idf.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

df = pd.DataFrame({'text': test_doc, 'class': test_data.target})
X = tfidf_vect.fit_transform(df['text'].values)
y = df['class'].values

# FIX: import TruncatedSVD from the public sklearn.decomposition namespace;
# the private sklearn.decomposition.truncated_svd module path is
# deprecated/removed in recent scikit-learn releases.
from sklearn.decomposition import TruncatedSVD

pca = TruncatedSVD(n_components=2)
X_reduced_train = pca.fit_transform(X)
a_train, a_test, b_train, b_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(a_train.toarray(), b_train)
clf = svm.SVC(kernel=my_kernel)  # Support Vector Machine model
#text_clf = Pipeline([('vect', CountVectorizer()),
#(#'tfidf', TfidfTransformer()),
#('clf', clf),])
words.append(word) vecs.append(model[word]) print len(vecs) CUTAWAY = 6000 vecs = np.matrix(np.random.permutation(vecs))[:CUTAWAY, :] words = words[:CUTAWAY] print vecs.shape print 'Dimensionality reductio' # dr = TruncatedSVD(n_iter=15) # X = dr.fit_transform(vecs) print 'Dimensionality reduction done, manifold learning' tsne = RandomTreesEmbedding(n_estimators=15, random_state=0, max_depth=5, verbose=2, n_jobs=3) X = tsne.fit_transform(vecs) print X.shape print 'Dim reduction' dr = TruncatedSVD(n_components=2) X = dr.fit_transform(X) print 'Manifold learning done' # X = vecs PLOT_CUTAWAY = 250 plt.figure() plt.scatter(X[:PLOT_CUTAWAY, 0], X[:PLOT_CUTAWAY, 1], c='green') for i in xrange(min(X.shape[0], PLOT_CUTAWAY)): plt.annotate(words[i], xy=(X[i, 0], X[i, 1])) plt.show()
# TF-IDF with idf smoothing enabled and sublinear tf scaling disabled.
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True,
                             sublinear_tf=False)

# FIX: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split / cross_val_score live in sklearn.model_selection.
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv',
                 header=0,
                 sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])
reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

# FIX: import from the public namespace; the private module path
# sklearn.decomposition.truncated_svd is deprecated/removed.
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)
X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier
# it froze with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)