def __init__(self, hash_len, train_img, train_txt, query_image, query_text,
             retrieval_image, retrieval_text):
    if not os.path.exists('temp_data'):
        os.mkdir('temp_data')
    # Normalize each sample to unit L2 norm.
    norm2 = Normalizer(norm='l2')
    train_img = norm2.fit_transform(train_img)
    train_txt = norm2.fit_transform(train_txt)
    query_image = norm2.fit_transform(query_image)
    query_text = norm2.fit_transform(query_text)
    retrieval_image = norm2.fit_transform(retrieval_image)
    retrieval_text = norm2.fit_transform(retrieval_text)
    # Note: only the training matrices are transposed before saving.
    sio.savemat(
        'temp_data/flickr_data.mat', {
            'train_image': np.transpose(train_img),
            'train_text': np.transpose(train_txt),
            'query_image': query_image,
            'query_text': query_text,
            'retrieval_image': retrieval_image,
            'retrieval_text': retrieval_text
        })
    self.flickr_data = sio.loadmat('temp_data/flickr_data.mat')
    self.hash_len = hash_len
def load_dat(filepath, minmax=None, normalize=False, bias_term=True):
    """Load a .dat file.

    Args:
        minmax: tuple (min, max), desired range of the transformed data
        normalize: boolean, normalize samples individually to unit norm if True
        bias_term: boolean, add a dummy column of 1s
    """
    lines = np.loadtxt(filepath)
    labels = lines[:, -1]
    features = lines[:, :-1]
    N, dim = features.shape
    if minmax is not None:
        minmax = MinMaxScaler(feature_range=minmax, copy=False)
        minmax.fit_transform(features)
    if normalize:
        # Make sure each row's L2 norm is 1.
        normalizer = Normalizer(copy=False)
        normalizer.fit_transform(features)
    if bias_term:
        X = np.hstack([np.ones(shape=(N, 1)), features])
    else:
        X = features
    return X, labels
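# A minimal usage sketch for load_dat above; 'digits.dat' is a hypothetical
# whitespace-separated file with the label in the last column, matching the
# format np.loadtxt expects here.
if __name__ == '__main__':
    X, labels = load_dat('digits.dat', minmax=(0, 1), normalize=False, bias_term=True)
    print(X.shape, labels.shape)  # X carries one extra leading bias column of 1s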
def salary_provider(preprocessing="None"):
    X_train, X_test, Y_train, Y_test = provider(is_regression=True)
    # Normalizing the label makes no sense here, so it stays disabled:
    # salary_max, salary_min = np.max(Y_train), np.min(Y_train)
    # Y_train = (Y_train - salary_min) / float(salary_max - salary_min)
    # Y_test = (Y_test - salary_min) / float(salary_max - salary_min)
    if preprocessing == "normalize":
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        X_test = normalizer.transform(X_test)
    elif preprocessing == "minmax":
        # Fit only on the training split to avoid leaking test statistics.
        minmaxscaler = MinMaxScaler()
        X_train = minmaxscaler.fit_transform(X_train)
        X_test = minmaxscaler.transform(X_test)
    elif preprocessing == "standard":
        standardscale = StandardScaler()
        X_train = standardscale.fit_transform(X_train)
        X_test = standardscale.transform(X_test)
    else:
        pass
    print(Y_test)
    print(Y_train)
    return X_train, X_test, Y_train, Y_test
def set_tfidf_process(self):
    tfidf = TfidfVectorizer(token_pattern=r'\S+')  # tokens are split on whitespace
    tfidf.fit(self.X_train.astype('U'))
    train_vector = tfidf.transform(self.X_train.astype('U'))
    validation_vector = tfidf.transform(self.X_validation.astype('U'))
    test_vector = tfidf.transform(self.X_test.astype('U'))

    nmf = NMF(n_components=50)  # dimensionality reduction
    nmf.fit(train_vector.toarray())
    train_features = nmf.transform(train_vector.toarray())
    validation_features = nmf.transform(validation_vector.toarray())
    test_features = nmf.transform(test_vector.toarray())

    # Normalizer scales each sample to unit norm; the original comment said
    # "rescale to 0-1", which is what MinMaxScaler would do instead.
    norm = Normalizer()
    self.train_nf = norm.fit_transform(train_features)
    self.validation_nf = norm.fit_transform(validation_features)
    self.test_nf = norm.fit_transform(test_features)
def visualize_attention(x_test, y_true, sent_model, doc_model, date, word2idx, label, rand):
    print('Label:', str(label))
    # Needed when rand=True: keep only the samples whose true label matches.
    x_samples = np.array([x_test[k] for k, v in enumerate(y_true) if v == label])
    if rand:
        random_index = nprnd.randint(x_samples.shape[0], size=SHOW_SAMPLES_CNT)
        select_samples = x_samples[random_index]
    else:
        # select_samples = x_samples[0:SHOW_SAMPLES_CNT]
        select_samples = x_test
    sent_all_att, doc_all_att = get_attention(sent_model, doc_model, select_samples, MODEL_NAME)
    text_sent = [[word2idx[idx] for sub in select_samples[i] for idx in sub]
                 for i in range(5)]
    normalizer_sent = Normalizer()
    normalizer_doc = Normalizer()
    att_sent = normalizer_sent.fit_transform(sent_all_att)
    att_doc = normalizer_doc.fit_transform(doc_all_att)
    customed_heatmap(att_sent, text_sent, N_LIMIT, date, label, 'sent')
    customed_heatmap(att_doc[:, ::-1].T, text_sent, N_LIMIT, date, label, 'doc')
    # important_words = [[word2idx[idx] for idx in word_idx[w_idx]]
    #                    for w_idx in range(SHOW_SAMPLES_CNT)]
    # print('some important keywords:')
    # pprint(important_words)
    return sent_all_att, doc_all_att
class GetXYData:
    def __init__(
        self,
        normalize=True,
        subsample=None,
        variables=["gross_primary_productivity", "soil_moisture"],
        random_state=123,
    ):
        self.normalize = normalize
        self.subsample = subsample
        self.variables = variables
        self.random_state = random_state

    def set_XY(self, xr_data, xr_data2=None):
        """Expects a dataframe with the time components. Converts it into an array."""
        # Convert xarray into dataframe for variables
        if xr_data2 is None:
            xr_data2 = xr_data
        X = xarray2df(xr_data[self.variables[0]])
        Y = xarray2df(xr_data2[self.variables[1]])
        # Merge the two DataFrames
        var_df = X.merge(Y)
        # Drop the NA values
        var_df = var_df.dropna()
        # Extract variables
        X = var_df[self.variables[0]].values
        Y = var_df[self.variables[1]].values
        lat = var_df["lat"]
        lon = var_df["lon"]
        # ===============
        # Normalize
        # ===============
        if self.normalize:
            # Normalizer requires 2-D input, so the 1-D columns are reshaped.
            # Note that L2-normalizing a single-feature column maps every
            # non-zero value to +/-1, so a StandardScaler may be what is
            # actually intended here.
            self.x_normalizer = Normalizer()
            X = self.x_normalizer.fit_transform(X.reshape(-1, 1))
            self.y_normalizer = Normalizer()
            Y = self.y_normalizer.fit_transform(Y.reshape(-1, 1))
        # Subsample if necessary
        if self.subsample:
            X, _, Y, _, lat, _, lon, _ = train_test_split(
                X, Y, lat, lon,
                train_size=self.subsample,
                random_state=self.random_state,
            )
        return X, Y, lat, lon
def normalization_data(norm_type, data_set):
    if norm_type == "l1":
        normlizer = Normalizer(norm='l1')
        norm_data = normlizer.fit_transform(data_set)
    elif norm_type == "l2":
        normlizer = Normalizer(norm='l2')
        norm_data = normlizer.fit_transform(data_set)
    elif norm_type == "min_max":
        normlizer = MinMaxScaler(feature_range=(0, 1))
        norm_data = normlizer.fit_transform(data_set)
    else:
        # Without this guard, norm_data is unbound for unknown norm types.
        raise ValueError("unknown norm_type: %s" % norm_type)
    return norm_data
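# A small demonstration of normalization_data above on a made-up matrix,
# assuming numpy is imported as np: 'l1' and 'l2' rescale each row, while
# 'min_max' rescales each column into [0, 1].
demo = np.array([[1.0, 3.0], [2.0, 2.0]])
print(normalization_data("l1", demo))       # rows sum to 1
print(normalization_data("l2", demo))       # rows have unit Euclidean length
print(normalization_data("min_max", demo))  # columns span [0, 1]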
def getNormalized(self, state, size):
    ft_train, ft_test, tg_train, tg_test = train_test_split(
        self.features, self.target, train_size=size,
        stratify=self.target, random_state=state)
    # Normalizer is stateless (per-row scaling), so fit_transform on the
    # test split does not leak training information.
    norm = Normalizer()
    ft_train_n = norm.fit_transform(ft_train)
    ft_test_n = norm.fit_transform(ft_test)
    return ft_train_n, ft_test_n
def generate_latent_variables(centered_co_occurence, num_components):
    normalizer = Normalizer()
    # fit_transform returns the normalized copy; without the assignment the
    # call has no effect on the input matrix.
    centered_co_occurence = normalizer.fit_transform(centered_co_occurence)
    pca = decomposition.PCA(svd_solver='randomized', random_state=17)
    pca.fit(centered_co_occurence)
    components = pca.components_
    k_components = components[:num_components]
    # Assumes np.matrix semantics (or a square co-occurrence matrix) so that
    # '*' acts as a matrix product here.
    latent_vars = k_components * centered_co_occurence
    latent_vars_matrix = latent_vars.T
    return k_components, latent_vars_matrix, normalizer
def stds_norms_mms(df, scaler):
    # fit_transform does not modify df in place, so the result must be
    # assigned; the original discarded it and returned df unchanged.
    if scaler == 'mms':
        mms = MinMaxScaler()
        df = pd.DataFrame(mms.fit_transform(df), columns=df.columns)
    elif scaler == 'stds':
        stds = StandardScaler()
        df = pd.DataFrame(stds.fit_transform(df), columns=df.columns)
    elif scaler == 'norms':
        norms = Normalizer()
        df = pd.DataFrame(norms.fit_transform(df), columns=df.columns)
    return df
def getPcaFeatures(self, images, components, image_size):
    imageDataset = self.getImagesAsDataset(images, image_size)
    norm = Normalizer()
    imageDataset = norm.fit_transform(imageDataset)
    pca = PCA(n_components=components)
    imageDataset = pca.fit_transform(imageDataset)
    return pca, norm, imageDataset
def topics(tweets, n_topics):
    """Generate a word2vec model from the tweets, build a matrix where each
    column is the word2vec vector of a word in the tweet vocabulary, then use
    PCA to identify topics in the tweets and print the top words associated
    with each topic.

    Parameters
    ----------
    tweets: list
        a list of unicode strings representing tweets
    n_topics: int
        an integer greater than 0 representing the number of topics
    """
    print("transforming tweets into vectors...")
    stop = frozenset(stopwords.words('english'))
    vectorizer = TweetVectorizer(stop_words=stop).fit(tweets)
    tweet_vectors = vectorizer.words_matrix()
    word2vec = vectorizer.get_model()
    print("Fitting the PCA model..")
    normalizer = Normalizer()
    pca = PCA(n_components=n_topics)
    pca.fit_transform(normalizer.fit_transform(tweet_vectors))
    for topic_idx, topic in enumerate(pca.components_):
        print("*" * 200)
        print("Topic #%d:" % topic_idx)
        print(word2vec.wv.similar_by_vector(topic))
        print(" ")
def preprocess(data, n_components, use_tf_idf=True):
    """Preprocess the data for clustering by running SVD and normalizing the
    results. This process is also known as LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True, the object must contain a tf_idf
        table alongside a raw-frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
        a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
        preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.values  # .as_matrix() was removed from pandas
    else:
        d = data.df.values
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()
    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100
    # Normalize the reduced matrix X, not the raw input d, so the function
    # actually returns data with n_components dimensions as documented.
    return e, norm.fit_transform(X)
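# The SVD-then-normalize steps above can also be expressed as one sklearn
# pipeline; a sketch assuming the same TruncatedSVD/Normalizer imports are in
# scope. This is one possible packaging, not the snippet's original API.
from sklearn.pipeline import make_pipeline

lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
# X_reduced = lsa.fit_transform(d)  # d: document-term matrix as above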
def kfold(agetext, k, model, nfeatures, check=False, k2=None, max_df=0.9, min_df=3):
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        X = agetext["text"].tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer=tokenize,
                              token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',
                              lowercase=False, max_features=nfeatures,
                              max_df=max_df, min_df=min_df, use_idf=True,
                              ngram_range=(1, 2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t", "").replace("\n", "") for doc in docs]
        # [:8000] keeps documents 0-7999; the original [:7999] silently
        # dropped document 7999 from both splits.
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]
        if check:
            lsa = TruncatedSVD(k2, algorithm='arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X, tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred), 2))
    print(str(out))
    print(np.mean(out))
def RNN():
    order = cl.get_train_test_set_06()
    sc = Normalizer()  # scaling using normalisation
    order = sc.fit_transform(order)
    Z = order[:, 1]
    Y = []
    for i, z in enumerate(Z):
        if i % 5 == 0:
            Y.append(z)
    X = np.delete(order, 1, 1)
    X = np.reshape(X, (73, 5, 21))
    model = Sequential()
    # returns a single output per sequence (return_sequences=False)
    model.add(LSTM(1, return_sequences=False, input_shape=(5, 21)))
    # model.add(LSTM(7, return_sequences=True))  # returns a sequence of vectors
    # model.add(Dropout(0.5))
    # model.add(LSTM(1))  # return a single vector
    model.add(Dense(1, activation='linear'))
    # Note: 'accuracy' is not meaningful for a regression loss; the MSE loss
    # curves below are the informative signal.
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X, Y, batch_size=7, epochs=50, validation_split=0.3, shuffle=False)
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()
def df_normalize(df):
    from sklearn.preprocessing import Normalizer
    normalizer = Normalizer(norm='l2')
    df = pd.DataFrame(normalizer.fit_transform(df), columns=df.columns)
    print("DataSet Normalized...")
    print(df.head())  # head() returns a preview; print it so it is visible
    return df
def transformer(self, data, name_to_save='yahoo_scaler'):
    scaler = Normalizer()
    scaled_out = scaler.fit_transform(data)
    # print(scaler.data_min_, scaler.data_max_)  # these attributes belong to
    # MinMaxScaler; Normalizer keeps no fitted statistics
    pickle.dump(scaler, open(f'{name_to_save}.pkl', 'wb'))
    return scaled_out
def normalize(train_inputs, non_train_inputs):
    normalizer = Normalizer()
    train_inputs[train_inputs.columns] = normalizer.fit_transform(
        train_inputs.values)
    non_train_inputs[train_inputs.columns] = normalizer.transform(
        non_train_inputs.values)
    return train_inputs, non_train_inputs
def quantify(self, exclude, encoder_path, normalizer_path, columns_to_normalize=list()):
    for column in list(set(self.data.columns) - set(exclude)):
        # np.float / np.int were removed from numpy; test against the
        # builtin types instead.
        if (self.data[column].dtype not in [float, int]) and ("Embedding" not in column):
            encoder = LabelEncoder()
            self.data[column] = encoder.fit_transform(self.data[column].astype(str))
            dump(encoder,
                 open(os.path.join(encoder_path,
                                   'LabelEncoder_{}.pkl'.format(column)), 'wb'))
        if column in columns_to_normalize:
            # Note: L2-normalizing a single column reshaped to (-1, 1) maps
            # every non-zero value to +/-1; a MinMaxScaler may be intended.
            normalizer = Normalizer()
            self.data[column] = normalizer.fit_transform(
                self.data[column].values.reshape(-1, 1))
            dump(normalizer,
                 open(os.path.join(normalizer_path,
                                   'LabelNormalizer_{}.pkl'.format(column)), 'wb'))
def predict(self, layer=None):
    """Performs sentiment classification prediction on preprocessed audio files.

    @param layer: If None, performs normal sentiment classification. If not
        None, returns the values from the intermediate layers.

    return:
        - The model prediction result
        - The video file names for each of the rows returned in model.predict
          (without the .mp4 suffix)
    """
    folder = unzip_folder(self.audio_folder, "audio_tmp")
    X = np.load(os.path.join(folder, 'audio-pickle-all-X-openl3.pkl'),
                allow_pickle=True)
    if layer is not None:
        print(f"Customizing model by returning layer {layer}")
        model = tf.keras.models.Model(self.model.input,
                                      self.model.get_layer(layer).output)
    else:
        model = self.model
    normalizer = Normalizer()
    for i in range(0, X.shape[0]):
        X[i] = normalizer.fit_transform(X[i])
    # The original pre-processing created the X array using the sorted order
    # of the video files.
    audio_pickles = sorted(
        next(os.walk(os.path.join(self.audio_folder, "audio-pickle")))[2])
    samples = map(lambda x: x.split(".mp4")[0], audio_pickles)
    return model.predict(X, batch_size=self.batch_size), list(samples)
def explore_k(svd_trans, k_range):
    """Explores various values of k in KMeans.

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore

    Returns:
        scores: list of inertia scores for each k value
    """
    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    # Iterate over the supplied k_range; the original looped over the bare
    # np.arange function object, which raises a TypeError.
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100,
                    n_init=1, verbose=2)
        km.fit(norm_data)
        scores.append(-1 * km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        # sklearn.cross_validation was removed; train_test_split now lives in
        # sklearn.model_selection.
        X_train, X_test, y_train, y_test = train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred), 5))
    print(str(out))
    print(np.mean(out))
def main():
    """Main function for data preprocessing, normalization and upsampling."""
    data = pd.read_csv('RawData/Full_Information_Cleaned.csv', index_col=0)
    data = data_pre_processing(data)
    X = data[[
        'Accept_Credit_Card', 'Outdoor_Seating', 'Take_out', 'Takes_Reservations',
        'WIFI', 'Noise_Level', 'atm', 'bank', 'bar', 'beauty_salon', 'bus_station',
        'cafe', 'gym', 'school', 'White population', 'Black population',
        'American Indian population', 'Asian population',
        'Hispanic or Latino population', 'High school or higher',
        'Graduate or professional degree', 'Unemployed', 'average_price'
    ]]
    Y = pd.factorize(data['class'])[0]
    # Normalize the feature data into the same scale
    norm = Normalizer()
    X = norm.fit_transform(X)
    # Standardize the data to mean 0 and std 1
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # Upsampling to deal with imbalanced classes
    sm = SMOTE(random_state=42)
    X, Y = sm.fit_resample(X, Y)
    print('Resampled dataset shape %s' % Counter(Y))
    # Binarize the output
    y_bin = label_binarize(Y, classes=[0, 1, 2])
    n_classes = y_bin.shape[1]
    model = addModels()
    classifier_run(X, Y, model)
def get_test(self):
    print("Getting physics...")
    data = pd.read_csv(self.data_path + "/test.csv")
    data = data.content.values.tolist()
    data = self.clean_html(data)
    temp = []
    data = [re.sub(r'\n', ' ', x) for x in data]
    for d in data:
        if self.get_token(d):
            temp += [self.get_token(d)]
    mul = 10
    x_test = [x for sublist in temp for x in sublist]
    c = Counter(x_test)
    x_test = list(set(x_test))
    x_test = [x for x in x_test if c[x] > 25]
    ll = lambda x: float(len(x))
    lt = list(map(ll, temp))  # list() so lt[i] works on Python 3
    x_test = [[doc.count(w) * mul / lt[i] for i, doc in enumerate(temp)]
              for w in x_test]
    x_test = np.array(x_test)
    # x_train = np.concatenate((x_train, np.zeros((x_train.shape[0], self.doclen - x_train.shape[1]))), axis=1)
    gc.collect()
    print("Doing LSA")
    print("SVD....")
    u, s, v = sparse.linalg.svds(x_test, embed_SIZE)
    n = Normalizer(copy=False)
    # s is a 1-D vector of singular values, so u * s scales each column of u.
    x_test = n.fit_transform(u * s.transpose())
    return x_test
def draw_svc(dataset):
    normalizer = Normalizer()
    data_x, data_y = dataset.data, dataset.target
    data_n = normalizer.fit_transform(data_x)
    info = list()
    for i in range(100):
        info.append((i,
                     *pipeline(mySVC(kernel='linear', epsilon=0,
                                     decision_function_shape='ovo'),
                               data_n, data_y, label='my'),
                     *pipeline(SVC(kernel='linear', gamma='auto',
                                   decision_function_shape='ovo'),
                               data_x, data_y, label='sk')))
    info = np.array(info)
    plt.figure()
    plt.plot(info[:, 0], info[:, 1], label='my')
    plt.plot(info[:, 0], info[:, 3], label='sklearn')
    plt.xlabel('times'), plt.ylabel('accuracy')
    plt.legend(loc='best')
    plt.show()
    plt.figure()
    plt.plot(info[:, 0], info[:, 2], label='my')
    plt.plot(info[:, 0], info[:, 4], label='sklearn')
    plt.xlabel('times'), plt.ylabel('time (sec)')
    plt.legend(loc='best')
    plt.show()
    mean = info.mean(axis=0)
    print(f'avg acc my: {mean[1]}, sk: {mean[3]}')
    print(f'avg time my: {mean[2]}, sk: {mean[4]}')
    return
def lr_eval(train_embs, eval_embs, train_labels, eval_labels):
    normalizer = Normalizer()
    train_embs = normalizer.fit_transform(train_embs)
    eval_embs = normalizer.transform(eval_embs)
    lr_model = LogisticRegression(random_state=0, penalty='l2', solver='liblinear')
    # drop all negative labels
    non_neg = [i for i in range(len(train_labels)) if train_labels[i] >= 0]
    if len(non_neg) == 0:
        return 0, 0
    train_embs = [train_embs[i] for i in non_neg]
    train_labels = [train_labels[i] for i in non_neg]
    num_classes = len(set(train_labels))
    if num_classes == 1:
        return 0, 0
    elif num_classes > 2:
        logger.warning('3 classes, something is wrong')
    lr_model.fit(X=train_embs, y=train_labels)
    y_pred = lr_model.predict(eval_embs)
    acc = sum(y_pred == eval_labels) / len(y_pred)
    weights = lr_model.coef_[0]
    dim = int(len(weights) / 2)
    weight_ratio = np.linalg.norm(weights[:dim]) / np.linalg.norm(weights[dim:])
    return acc, weight_ratio
def save_cluster_of_sentence_embedding(sentence_embedding_file_path,
                                       cluster_sentence_output_path,
                                       cluster_num,
                                       cluster_centroids_output_path,
                                       cluster_labels_output_path):
    sentence_embedding_list = np.load(sentence_embedding_file_path)
    # Normalize each embedding to unit L2 norm.
    normalizer = Normalizer(copy=False)
    sentence_embedding_list_norm = normalizer.fit_transform(sentence_embedding_list)
    end_time1 = datetime.datetime.now()
    # print('TIME: np.load sentence_embedding_list ', end_time1 - start_time)
    # print('shape of sentence_embedding_list', np.shape(sentence_embedding_list))
    cluster_number = int(cluster_num)
    # n_jobs was removed from KMeans in recent scikit-learn releases.
    kmeans = KMeans(n_clusters=cluster_number, n_init=5, max_iter=100)
    # A single fit_predict both fits the model and returns the labels; the
    # original fit_predict followed by a second fit() re-ran the clustering,
    # so the saved centroids could disagree with cluster_sentence.
    cluster_sentence = kmeans.fit_predict(sentence_embedding_list_norm)
    end_time2 = datetime.datetime.now()
    print('TIME: Kmeans cluster ', end_time2 - end_time1)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    # cluster_distance = kmeans.transform(sentence_embedding_list)
    np.save(cluster_sentence_output_path, cluster_sentence)
    np.save(cluster_centroids_output_path, centroids)
    np.save(cluster_labels_output_path, labels)
def normalize_test():
    from sklearn.preprocessing import Normalizer
    # Normalizer expects a 2-D array (n_samples, n_features); a flat list
    # raises a ValueError, so wrap the values as a single row.
    X = [[1, 2, 3, 4, 5, 2, 6, 8]]
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)
    print(X2)
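# A quick check of what the default Normalizer computes: each row is divided
# by its L2 norm, so the result equals x / ||x||_2. A sketch assuming numpy
# is imported as np.
x = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 6.0, 8.0]])
manual = x / np.linalg.norm(x)
assert np.allclose(Normalizer(norm='l2').fit_transform(x), manual)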
def data_transformation(X, final_columns, norm=False, z_score=True):
    """Data transformation techniques:
    1. Range transformation (Normalization)
    2. Z-Score transformation (Standardization) - Default
    """
    X_transformed = X
    # necessary transformations
    if norm:
        norm = Normalizer()
        X_transformed = norm.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Normalized')
    if z_score:
        # Note: the scaler is fit on the raw X, so when both flags are set
        # the z-score result overwrites the normalized one.
        scaler = StandardScaler()
        X_transformed = scaler.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Z-Score Applied')
        print(X_transformed)
        X_transformed_inversed = pd.DataFrame(
            scaler.inverse_transform(X_transformed), columns=final_columns)
        print(X_transformed_inversed)
        # The pickle dump is kept inside this branch because `scaler` only
        # exists when z_score is requested.
        fi = 'data_transformation.pkl'
        with open(fi, 'wb') as mod:
            pickle.dump(scaler, mod)
    return X_transformed
def get_tf_idf_M(M, tf=["bin", "raw", "log", "dnorm"], idf=["c", "smooth", "max", "prob"], norm_samps=False): N = len(M) if tf == "raw": tf_M = np.copy(M) #just the frequency of the word in a text # #TODO: check if dnorm is implemented OK # elif tf == "dnorm": # tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1)))) if idf == "c": idf_v = [] for i in range( M.shape[1] ): #get the number of texts that contain a word words[i] idf_v.append(np.count_nonzero( M[:, i])) #count the non zero values in columns of matrix M idf_v = np.array(idf_v) idf_v = np.log(N / idf_v) tf_idf_M = tf_M * idf_v if norm_samps: normalizer = Normalizer() tf_idf_M = normalizer.fit_transform(tf_idf_M) # np.save("tf_idf_M", tf_idf_M) return tf_idf_M
def Normalized(self, df):
    meta = []
    nparray = df.to_numpy()
    normalizer = Normalizer()
    meta.append(f'Normalized with scikitlearn {normalizer}')
    nparray = normalizer.fit_transform(nparray)
    return pd.DataFrame(nparray, columns=df.columns), meta
def normalise(data, method='robust'):
    """Normalise `data` with `method`.

    Parameters
    ----------
    data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    method: str
        Rescale (and center) data by (per feature for the scalers; the
        l1/l2/max options rescale per sample):
        * l2: unit L2 norm
        * l1: unit L1 norm
        * max: unit L{inf} norm
        * standard: standardise N(0, 1) each feature
        * maxabs: maximum absolute value
        * minmax: minimum and maximum values
        * robust: robust to outliers (IQR and median)
        * none: identity block

    Returns
    -------
    rescaled_data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    """
    if method == 'none':
        return data
    X_train, y_train = data['train']
    X_test, y_test = data['test']
    if method == 'l2':
        trans = Normalizer('l2')
    elif method == 'l1':
        trans = Normalizer('l1')
    elif method == 'max':
        trans = Normalizer('max')
    elif method == 'standard':
        trans = StandardScaler()
    elif method == 'maxabs':
        trans = MaxAbsScaler()
    elif method == 'minmax':
        trans = MinMaxScaler()
    elif method == 'robust':
        trans = RobustScaler()
    else:
        raise ValueError('Unrecognised method=%s' % method)
    X_train = trans.fit_transform(X_train)
    X_test = trans.transform(X_test)
    return {'train': (X_train, y_train), 'test': (X_test, y_test)}
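# A usage sketch for normalise above on a toy train/test dict; the arrays are
# made up for illustration and numpy is assumed to be imported as np.
toy = {
    'train': (np.array([[1.0, 10.0], [2.0, 20.0]]), np.array([0, 1])),
    'test': (np.array([[3.0, 30.0]]), np.array([1])),
}
rescaled = normalise(toy, method='l2')  # or 'standard', 'minmax', 'robust', ...
print(rescaled['train'][0])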
def outlier_dbscan(data):
    columns = [
        'wet_mean', 'green_mean', 'bright_mean', 'ARVI_mean', 'SAVI_mean',
        'NDBI_mean', 'mNDWI_mean', 'NDWI_mean', 'mNDVI_mean', 'NDVI_mean',
        'wet_p50', 'green_p50', 'bright_p50', 'ARVI_p50', 'SAVI_p50',
        'NDBI_p50', 'mNDWI_p50', 'NDWI_p50', 'mNDVI_p50', 'NDVI_p50',
        'S2_B12mean', 'S2_B11mean', 'S2_B8mean', 'S2_B4mean', 'S2_B3mean',
        'S2_B2mean', 'S2_B12med', 'S2_B11med', 'S2_B8med', 'S2_B4med',
        'S2_B3med', 'S2_B2med'
    ]
    t_c = data.TRAIN_CLASS.unique()
    for i in tqdm_notebook(range(len(t_c)), desc='Processing Clustering Outlier data'):
        cl_data = data.loc[data.TRAIN_CLASS == t_c[i], columns].dropna()
        st_sc = Normalizer()
        model_ = DBSCAN(eps=.05, min_samples=10).fit(st_sc.fit_transform(cl_data))
        cl_data['label'] = model_.labels_
        data.loc[cl_data.index, 'OUTLIER'] = cl_data.label
    data['OUTLIER'] = data.OUTLIER.apply(lambda y: 0 if y >= 0 else -1)
    data_outlier = data.loc[data.OUTLIER < 0, ['x', 'TRAIN_CLASS']].groupby(
        'TRAIN_CLASS').agg('count').rename(columns={'x': 'COUNT_OUTLIER'}).reset_index()
    fig = px.bar(data_outlier, x="TRAIN_CLASS", y="COUNT_OUTLIER", title="OUTLIER")
    fig.show()
    return data
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    # Note: .vocab was removed in gensim 4; .key_to_index is its replacement.
    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))
    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()
    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {}'.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))
    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)
    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
def preprocess(df, service_list, flag_list, labeled=False):
    print(df.shape)
    df_data, label = to_numeric(df, service_list, flag_list, labeled=labeled)
    print(len(service_list))
    print(len(flag_list))
    print('Selected', df_data.shape)
    scaler = Normalizer()
    # Continuous data
    data_cont = df_data[CONT_FEATURES].values
    data_cont = scaler.fit_transform(data_cont)
    # Binary category data
    data_bin = df_data[CAT_FEATURES[3:]].values
    # Categorical data
    enc = OneHotEncoder(categories=[range(3), range(len(service_list)),
                                    range(len(flag_list))])
    enc.fit(df_data[["protocol_type", "service", "flag"]].values)
    oneHotEncoding = enc.transform(
        df_data[["protocol_type", "service", "flag"]].values).toarray()
    print(oneHotEncoding.shape)
    df_final = np.concatenate((data_cont, oneHotEncoding, data_bin), axis=1)
    df_final = pd.DataFrame(df_final)
    print(df_final.shape)
    return df_final, label
def runPCA(input_data, test, d):
    input_data = removeCorrelation(input_data)
    test = removeCorrelation(test)
    normZ = Normalizer()
    scaledX = normZ.fit_transform(input_data.iloc[:, :-1])
    scaledTestX = normZ.transform(test)
    pca = PCA()
    pcaX = pca.fit_transform(scaledX)
    pcaX = pd.DataFrame(pcaX)
    print('Approx 98% variance explained by ' + str(d) + ' features: '
          + str(pca.explained_variance_ratio_[:d].sum()))
    trainY = input_data.iloc[:, -1]
    # Series.reshape was removed from pandas; reshape the underlying array.
    trainY = trainY.values.reshape(len(trainY), 1)
    trainY = pd.DataFrame(trainY)
    trainY.columns = ['Class']
    trainDataAfterPCA = pd.concat([pcaX.iloc[:, :d], trainY], axis=1)
    testDataAfterPCA = pca.transform(scaledTestX)
    testDataAfterPCA = pd.DataFrame(testDataAfterPCA)
    testDataAfterPCA = testDataAfterPCA.iloc[:, :d]
    return trainDataAfterPCA, testDataAfterPCA
def _normalize(self, X, y, X_t):
    from sklearn.preprocessing import Normalizer
    NORM = Normalizer()
    X = NORM.fit_transform(X, y)
    X_t = NORM.transform(X_t)
    return X, X_t
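# Unlike StandardScaler or MinMaxScaler, Normalizer keeps no fitted state, so
# fit_transform and fit-then-transform agree row by row; the fit-on-train /
# transform-on-test pattern above still matters once the scaler is swapped for
# a stateful one. A small sketch with made-up data, assuming numpy as np:
nrm = Normalizer()
A = np.array([[3.0, 4.0], [6.0, 8.0]])
assert np.allclose(nrm.fit_transform(A), nrm.fit(A).transform(A))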
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    """Performs k-means clustering on SVD-transformed data and plots it.

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means

    Returns:
        km: the fitted KMeans object
    """
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=2)
    km.fit(norm_data)
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    terms = prettify(terms)
    terms = np.array(terms)
    fig = plt.figure(figsize=(10, 8))
    for i in range(10):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print(' {:s}'.format(terms[ind]))
        print('\n')
        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i + 1)
        ax.set_title('Cluster {:d}'.format(i + 1))
        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)
        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # sorted_component = np.sort(component)
        colors = sns.color_palette('Purples', 9).as_hex()
        colors = np.repeat(colors[-1], n_words)
        cb.set_ticks(np.linspace(mn, mx, n_words + 2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])
    plt.tight_layout()
    return km
def reduce_dimension(self, n_components=2):
    """Return the PCA transform of self.data, with n_components."""
    reducer = PCA(n_components=n_components)
    X = self.data.values
    norm = Normalizer()
    Xnorm = norm.fit_transform(X)
    return reducer.fit_transform(Xnorm)
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):
    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)
    if dense:
        X = X.toarray()

    if test_size > 0:
        # The old ShuffleSplit(len(y), n_iter=1, ...) constructor is gone; the
        # modern class takes n_splits and yields indices from .split().
        cv = ShuffleSplit(n_splits=1, random_state=random_state,
                          test_size=test_size, train_size=1 - test_size)
        train, test = next(cv.split(X))
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
def get_tf_idf_M(M, tf=["bin", "raw", "log", "dnorm"],
                 idf=["c", "smooth", "max", "prob"], norm_samps=False):
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M)  # just the frequency of the word in a text
    # # TODO: check if dnorm is implemented OK
    # elif tf == "dnorm":
    #     tf_M = 0.5 + 0.5 * (M / (np.amax(M, axis=1).reshape((N, 1))))
    if idf == "c":
        idf_v = []
        for i in range(M.shape[1]):
            # Count the texts containing words[i]: the non-zero values in
            # column i of matrix M.
            idf_v.append(np.count_nonzero(M[:, i]))
        idf_v = np.array(idf_v)
        idf_v = np.log(N / idf_v)
        tf_idf_M = tf_M * idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
    # np.savetxt("tf_idf_M_" + str(N) + ".txt", tf_idf_M, fmt="%s")
    return tf_idf_M
def lstm_validate(lstm_model, evaluation_dataset, create_confusion_matrix=False,
                  number_of_subframes=0, sample_strategy="random", batch_size=32):
    print("evaluate neural network...")
    validation_data = []
    validation_labels = []
    for _obj in evaluation_dataset:
        if number_of_subframes > 0:
            validation_data.append(get_buckets(_obj.get_hoj_set(),
                                               number_of_subframes,
                                               sample_strategy))
        else:
            validation_data.append(_obj.get_hoj_set())
        validation_labels.append(_obj.get_hoj_label()[0])

    # evaluate neural network
    score, acc = lstm_model.evaluate(np.array(validation_data),
                                     np.array(validation_labels),
                                     batch_size=batch_size, verbose=0)
    print("Accuracy:", acc)

    if create_confusion_matrix is True:
        predictions = lstm_model.predict(np.array(validation_data),
                                         batch_size=batch_size)
        predicted_labels = []
        real_labels = []
        for k in range(len(predictions)):
            predicted_idx = np.argmax(predictions[k])
            label_idx = np.argmax(validation_labels[k])
            real_labels.append(label_idx)
            predicted_labels.append(predicted_idx)
        cnf_matrix = confusion_matrix(real_labels, predicted_labels)
        # Scales each row of the confusion matrix to unit L2 norm.
        norm = Normalizer()
        cnf_matrix = norm.fit_transform(cnf_matrix)
        return score, acc, cnf_matrix
    return score, acc, None
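# For confusion matrices, row-wise 'l1' normalization is often more readable
# than the default 'l2' used above, since each row then holds per-class
# fractions that sum to 1. A sketch of that alternative on a made-up 2x2
# matrix, assuming numpy and Normalizer as above:
cm = np.array([[8, 2], [3, 7]])
cm_frac = Normalizer(norm='l1').fit_transform(cm)  # each row sums to 1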
class FFTTransformer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = LinearRegression()
        self.y_mean = None
        self.normalize = Normalizer()

    def fit(self, X, y=None):
        X_train = self.make_waves(X)
        y_train = numpy.array(y)
        self.y_mean = y_train.mean()
        self.model.fit(X_train, y_train - self.y_mean)
        return self

    def make_waves(self, X):
        X = X['times']
        time_scale = numpy.array([(time - X[0]).total_seconds()
                                  for time in X]).reshape(-1, 1)
        # Angular frequencies for daily, half-daily, quarter-daily, weekly,
        # half-weekly, third-weekly and three further fixed periods.
        X_train = [numpy.concatenate((
            numpy.pi * 2.0 / (24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (12 * 60 * 60) * delta,
            numpy.pi * 2.0 / (6 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7.0 / 2 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7.0 / 3 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (1380500.0) * delta,
            numpy.pi * 2.0 / (1380500.0 / 2) * delta,
            numpy.pi * 2.0 / (1380500.0 / 3) * delta), axis=0)
            for delta in time_scale]
        X_train = numpy.concatenate((numpy.sin(X_train), numpy.cos(X_train)), axis=1)
        return X_train

    def predict(self, X):
        X_test = self.make_waves(X)
        X_test = self.model.predict(X_test) + self.y_mean
        return X_test.reshape(-1, 1)

    def transform(self, X, y=None):
        X_test = self.predict(X)
        # Note: each row of the (n, 1) prediction column is normalized on its
        # own, which maps every non-zero value to +/-1; normalizing the whole
        # column at once may be what is intended.
        X_test = self.normalize.fit_transform(X_test)
        return X_test.reshape(-1, 1)
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    """Perform classification.

    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec
        format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    """
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))
    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()
    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {}'.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))
    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)
    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode', ngram_range=(1, 2),
                          stop_words='english', sublinear_tf=True,
                          use_idf=True, smooth_idf=True)
    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)
    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)
    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
def createGraph(similarity, sim_keep_above, graph_rule):
    G = nx.Graph()
    # Default (old version): may cause problems
    if graph_rule == "default":
        similarity = 1.0 - similarity
    elif graph_rule == "minmax":
        mm = MinMaxScaler()
        similarity = mm.fit_transform(1.0 - similarity)
    elif graph_rule == "normalization":
        norm = Normalizer()
        similarity = norm.fit_transform(1.0 - similarity)
    elif graph_rule == "inversed":
        similarity = similarity  # keep the matrix as-is

    # Remove similarity smaller than the threshold
    similarity[similarity < sim_keep_above] = 0.0
    similarity = (similarity * 10000).astype(int)

    if similarity.shape[0] == 1:
        G.add_node(0)
        return G

    max_size = similarity.shape[0]
    vertice = 0
    # xrange is Python 2 only; range covers both.
    for i in range(0, max_size):
        for j in range(i, max_size):
            if not ALLOW_SELF_LOOP:
                if i == j:
                    continue
            v = similarity[i][j]
            # print(i, j, v)
            if v > 1:
                vertice += 1
                G.add_edge(i, j, weight=v)
    return G
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import SGDRegressor
# sklearn.cross_validation was removed; KFold now lives in model_selection
# and takes n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

df = pd.read_csv("forestfires.txt", index_col=False, sep=" ")
X = df.iloc[:, 0:-1].values
Y = df.iloc[:, -1].values

normalizer = Normalizer()
X = normalizer.fit_transform(X)

k_fold_cv = KFold(n_splits=10, shuffle=True)
sgdr = SGDRegressor()
for train_index, test_index in k_fold_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    sgdr.fit(X_train, Y_train)
    pred = sgdr.predict(X_test)
    error = mean_squared_error(Y_test, pred)
    print(error)
print(len(newdata))
print(len(newdata[0]))
print(np.shape(newdata))
print("data done")
print("logistic initialized")
# clf.fit(data[:, :-1], data[:, -1])
print("fitted data")
# StratifiedKFold's old constructor took the labels directly; the modern API
# takes n_splits and yields indices from .split(X, y).
skf = StratifiedKFold(n_splits=10, shuffle=True)
output = []
finalscore = 0
counter = 0
for train, test in skf.split(newdata, data[:, -1]):
    counter = counter + 1
    n = Normalizer()
    # Normalizer ignores labels, so only the feature rows are passed.
    netdata = n.fit_transform([newdata[i][:-1] for i in train])
    print(np.shape(netdata))
    clf = GradientBoostingClassifier(warm_start=True, n_estimators=1500)
    clf = clf.fit(netdata, [data[i][-1] for i in train])
    n = Normalizer()
    nowdata = n.fit_transform([newdata[i][:-1] for i in test])
    print(np.shape(nowdata))
    prediction = clf.predict(nowdata)
    # pred = []
    # for i in prediction:
    #     if i > 1.5:
    #         pred.append(2)
    #     else:
    #         pred.append(1)
    xscore = score.get_score(prediction, [data[i][-1] for i in test])
    finalscore = finalscore + xscore
def run(self, working_directory, clustering_algorithm, word2Vec_conf, vector_type):
    vertices_path = working_directory + "vertex.txt"
    seedsMap_path = working_directory + "seedsMap.txt"
    groundTruth = working_directory + "groundTruth.csv"
    random_walks_path = working_directory + "sequenceIDs.txt"
    urlsmap = self.get_urlmap(seedsMap_path)
    documents = self.get_content_map(vertices_path)
    groundTruthMap = self.get_content_map(groundTruth)
    random_walks1, random_walks2 = tee(self.get_sequences(random_walks_path))
    # true_labels = np.array([int(groundTruthMap[v]) for v in urlsmap.values()])
    true_labels = [int(groundTruthMap[v]) for v in urlsmap.values()]
    dim_link, dim_content = self.get_dimension_vectors(vector_type)
    embedding_matrix = []
    document_matrix = []
    codes = list(urlsmap.keys())

    if dim_link > 0:
        word2vec = self.runWord2Vec(word2Vec_conf, dim_link)
        word2vec.build_vocab(random_walks1)
        word2vec.train(random_walks2)
        for url in codes:
            embedding = word2vec[url]
            embedding_matrix.append(embedding)
        # Normalize embedding_matrix using L2
        normalizer_embedding = Normalizer(copy=False)
        embedding_matrix = normalizer_embedding.fit_transform(embedding_matrix)
        print("Normalize embedding_matrix, shape: ", embedding_matrix.shape)

    if dim_content > 0:
        for url in codes:
            document_matrix.append(documents[url])
        content_matrix = self.get_content_matrix(document_matrix, dim_content)

    combined_matrix = []
    if dim_link > 0 and dim_content > 0:
        combined_matrix = np.array([
            np.concatenate((content_matrix[i], embedding_matrix[i]))
            for i in range(0, len(content_matrix))])
        print("Combined link and content matrices, shape: ", combined_matrix.shape)
    elif dim_link > 0:
        combined_matrix = embedding_matrix
    else:
        combined_matrix = content_matrix

    # clustering
    if clustering_algorithm == "KMEANS":
        num_clusters = len(set(true_labels))
        print("Clustering using KMEANS with num_clusters = ", num_clusters)
        algorithm = KMeans(n_clusters=num_clusters)
    elif clustering_algorithm == "HDBSCAN":
        print("Clustering using HDBSCAN with min 5 elements per cluster")
        algorithm = HDBSCAN(min_cluster_size=5)
    else:
        print("ERROR clustering, wrong parameter ", clustering_algorithm)
        sys.exit(2)

    # learned_labels = np.array(map(lambda x: int(x), algorithm.fit_predict(combined_matrix)))
    # np.float was removed from numpy; the builtin float is equivalent here.
    learned_labels = np.array([int(x) for x in
                               algorithm.fit_predict(combined_matrix.astype(float))])

    # metrics analysis
    filtered_true_labels = []
    filtered_learned_labels = []
    filtered_combined_matrix = []
    for i in range(0, len(true_labels)):
        if true_labels[i] != -1:
            filtered_true_labels.append(true_labels[i])
            filtered_learned_labels.append(learned_labels[i])
            filtered_combined_matrix.append(combined_matrix[i])
    filtered_true_labels = np.array(filtered_true_labels)
    filtered_learned_labels = np.array(filtered_learned_labels)
    filtered_combined_matrix = np.array(filtered_combined_matrix)
    print("Web pages to analyze: ", len(filtered_learned_labels))

    self.homogeneity = metrics.homogeneity_score(filtered_true_labels, filtered_learned_labels)
    self.completeness = metrics.completeness_score(filtered_true_labels, filtered_learned_labels)
    self.v_measure = metrics.v_measure_score(filtered_true_labels, filtered_learned_labels)
    self.ari = metrics.adjusted_rand_score(filtered_true_labels, filtered_learned_labels)
    self.ami = metrics.adjusted_mutual_info_score(filtered_true_labels, filtered_learned_labels)
    self.silhouette = metrics.silhouette_score(filtered_combined_matrix,
                                               filtered_learned_labels, metric='cosine')
    print('\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(self.homogeneity, self.completeness,
          self.v_measure, self.ari, self.ami, self.silhouette)
    return (filtered_true_labels, filtered_learned_labels)
def load_blood_data(train=True, SEED=97, scale=False,
                    minmax=False, norm=False, nointercept=False, engineering=False):
    """
    Load training and test datasets for DrivenData's Predict Blood Donations
    warmup contest.

    The training data is shuffled before it's returned; test data is not.

    Note: patsy returns float64 data; Theano requires float32, so conversion
          will be required; the y values are converted to int32, so they're OK.

    Arguments
    ---------
        train       (bool)    if True,  y_train, X_train = load_blood_data(train=True, ...
                              if False, X_test, IDs      = load_blood_data(train=False, ...

        SEED        (int)     random seed

        scale       (bool)    if True, scale the data to mean zero, var 1; standard normal

        minmax      (2-tuple) to scale the data to a specified range, provide a 2-tuple (min, max)

        norm        (bool)    if True, L2 normalize for distance and similarity measures

        nointercept (bool)    if True, patsy will not create an intercept

    Usage
    -----
        from load_blood_data import load_blood_data
    """
    from sklearn.utils import shuffle
    from patsy import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy as np
    import pandas as pd
    import re

    global scaler
    global minmaxer
    global normalizer

    if (scale and minmax):
        raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):
        raise ValueError("cannot specify both scale and norm")
    if (norm and minmax):
        raise ValueError("cannot specify both norm and minmax")

    if type(train) is not bool:
        raise ValueError("train must be boolean")
    if type(SEED) is not int:
        raise ValueError("SEED must be int")
    if type(scale) is not bool:
        raise ValueError("scale must be boolean")
    if type(norm) is not bool:
        raise ValueError("norm must be boolean")
    if type(nointercept) is not bool:
        raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool:
        raise ValueError("engineering must be boolean")

    # ------------- read the file -------------
    file_name = '../data/train.csv' if train else '../data/test.csv'
    data = pd.read_csv(file_name)

    # ------------- shorten the column names -------------
    column_names = ['ID', 'moSinceLast', 'numDonations', 'volume', 'moSinceFirst', 'donated']
    data.columns = column_names if train else column_names[:-1]

    # ------------- create new variables -------------
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst,
                                    index=data.index)
        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume / data.numDonations) / data.moSinceFirst,
                                        index=data.index)
        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations,
                                    index=data.index)

    # ------------- scale the data -------------
    # .ix was removed from pandas; .loc gives the same label-based indexing.
    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            data.loc[:, cols] = scaler.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = scaler.transform(
                data.loc[:, cols].values.astype(np.float32))

    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2:
            raise ValueError("minmax must be a 2-tuple")
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            minmaxer = MinMaxScaler(feature_range=minmax)
            data.loc[:, cols] = minmaxer.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = minmaxer.transform(
                data.loc[:, cols].values.astype(np.float32))

    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            data.loc[:, cols] = normalizer.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = normalizer.transform(
                data.loc[:, cols].values.astype(np.float32))

    # ------------- create the design matrix -------------
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst + numDonations + volume'
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
    if nointercept:
        formula = formula + ' -1'
    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))

    # ------------- return the values -------------
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs = data.ID.values
        return X_test, IDs
# descriptors, if they exist
vectors = None
norm = None
if desc:
    def str_column_to_array(df_column):
        lst = []
        df_column.apply(lambda row: lst.append(
            np.array([float(elem) for elem in row.strip('[').strip(']').split(",")])))
        return lst

    vectors = str_column_to_array(train_df["desc"])
    norm = "l1"

if norm is not None:
    normalizer = Normalizer(norm)
    vectors = normalizer.fit_transform(vectors)

logger.debug("Training...")
if desc:
    # training an ml algo on the descriptor vectors
    model = ml().fit(vectors, labels)
else:
    if args.model != "dnn":
        raise NotImplementedError("non dnn model is not proposed for direct images")
    # training raw data only with dnn
    if args.stats:
        model = ml(args.model_dir).fitdata(train_df["path"].tolist(), labels)
    else:
        model = ml().fitdata(train_df["path"].tolist(), labels)
def normalizer(X):
    s = Normalizer(norm='l1')
    return s.fit_transform(X)
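# A sanity check for the 'l1' normalizer above: each output row sums to 1 in
# absolute value. Assumes numpy is imported as np.
rows = np.array([[2.0, 2.0], [1.0, 3.0]])
out = normalizer(rows)
assert np.allclose(np.abs(out).sum(axis=1), 1.0)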
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
    """Plot data transformed into two dimensions by PCA.

    PCA transforms into a new embedding dimension such that the first
    dimension contains the maximal variance and following dimensions the
    maximal remaining variance. This should spread the observed n-dimensional
    data maximally. This is unsupervised and will not consider target values.
    """
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    if normalize:
        normalizer = Normalizer(norm='l2')
        X = normalizer.fit_transform(X)
    # Compare strings with ==, not 'is': identity comparison of strings is
    # implementation-dependent and flagged on recent Pythons.
    if embedding == 'pca':
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
    elif embedding == 'isomap':
        isomap = Isomap(n_components=2, n_neighbors=20)
        X_transformed = isomap.fit_transform(X)
    elif embedding == 'lle':
        lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
        X_transformed = lle.fit_transform(X)
    elif embedding == 'tsne':
        t_sne = TSNE(n_components=2)
        X_transformed = t_sne.fit_transform(X)
    elif embedding == 'spectral':
        se = SpectralEmbedding(n_components=2)
        X_transformed = se.fit_transform(X)
    elif embedding == 'mds':
        mds = MDS(n_components=2)
        X_transformed = mds.fit_transform(X)
    elif embedding == 'gallery':
        plt.figure(1)
        plt.subplot(231)
        plt.title('pca')
        X_t = PCA(n_components=2).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(232)
        plt.title('isomap')
        X_t = Isomap(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(233)
        plt.title('lle')
        X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(234)
        plt.title('tsne')
        X_t = TSNE().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(235)
        plt.title('spectral')
        X_t = SpectralEmbedding().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(236)
        plt.title('mds')
        X_t = MDS().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.suptitle('Gallery transforms ' + title)
        return plt
    else:
        raise ValueError("Choose between pca, isomap, lle, tsne, spectral, mds and gallery")
    plt.title(title + ' ' + embedding + ' plot')
    sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.colorbar(sc)
    return plt
mnb = MultinomialNB()
bnb = BernoulliNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=51)
ada = AdaBoostClassifier()

classifiers = [lda, qda, svm, perceptron, gnb, mnb, bnb, knn, rf, ada]
classifier_names = ["LDA", "QDA", "SVM (RBF)", "Perceptron", "Gaussian NB",
                    "Multinomial NB", "Bernoulli NB", "KNN (K=5)",
                    "Random Forests", "Ada Boost"]
index = np.arange(len(classifier_names))

# Extracting the data values in a numpy array and preprocessing it
data = dataFrame.values
data_normalized = normalizer.fit_transform(data)
data_standard = standardscale.fit_transform(data)
data_minmax = minmaxscaler.fit_transform(data)

preprocess_names = ["Unscaled", "Normalized", "Standardized", "MinMax"]
preprocessors = [data, data_normalized, data_standard, data_minmax]

train_labels = class_labels[:128]
test_labels = class_labels[128:]
performance_all_preprocess = list([])
count = 0

# Defines the Recursive Feature Selector for best feature selection
def recursiveFeatureSelector(classifier_model, train_data, train_labels,
                             test_data, number_of_features):
    rfe = RFE(classifier_model, number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
def main(path):
    # tweetdata = loadfiles(path)
    tweetdata = pd.read_csv(path, header=0, dtype=str,
                            names=['text', 'lat', 'lng', 'class'])[:50000]
    # traindata, testdata = train_test_split(tweetdata, test_size=0.3, random_state=50)
    size = len(tweetdata)
    start = 7 * size // 10  # integer index for the 70/30 split
    trainclass = tweetdata['class'][:start]
    testclass = tweetdata['class'][start:]
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
                                 use_idf=True, encoding='utf-8',
                                 decode_error='ignore', lowercase=True)
    norm = Normalizer(copy=False)
    tfids = vectorizer.fit_transform(tweetdata['text'])
    normalized_tfids = norm.fit_transform(tfids)
    ch2 = SelectKBest(chi2, k=1000)
    # normalized_tfids = ch2.fit_transform(normalized_tfids, tweetdata['class'])
    data = pd.DataFrame(normalized_tfids.toarray())
    traindata = data[:start]
    testdata = data[start:]
    # Fit the feature selector on the training split only; re-fitting it on
    # the test split would select a different (leaked) feature set.
    traindata = ch2.fit_transform(traindata, trainclass)
    testdata = ch2.transform(testdata)
    # traindata = pd.DataFrame(traindata, columns=['text', 'lat', 'lng', 'class'])
    # testdata = pd.DataFrame(testdata, columns=['text', 'lat', 'lng', 'class'])
    # tweetdata['location'] = map(reverseGeocode, tweetdata['lat'], tweetdata['lng'])
    # map(wordsForChiFeatures, tweetdata['text'], tweetdata['location'])
    # totalCount = sum(j for j in wordLocDict.values() if j > 1)
    # for i, j in wordLocDict.items():
    #     # change 1 to any value as per requirement
    #     if j > 5:
    #         tweetdata[str(i)] = map(lambda x, y: assignFeature(x, y, i[0], totalCount),
    #                                 tweetdata['text'], tweetdata['location'])
    # tweetdata.to_csv('liw.csv', header=True, index=False, encoding='utf-8')
    # testdata = loadfiles('C:\Users\AravindKumarReddy\Downloads\SMMTest')
    # traindata['location'] = map(mapLocation, traindata['lat'], traindata['lng'])
    # testdata['location'] = map(mapLocation, testdata['lat'], testdata['lng'])
    # train_tfids = vectorizer.fit_transform(traindata['text'])
    # test_tfids = vectorizer.fit_transform(testdata['text'])
    # train_tfids = norm.fit_transform(train_tfids)
    # test_tfids = norm.fit_transform(test_tfids)
    # km = KMeans(n_clusters=2000, init='k-means++', max_iter=100, n_init=1)
    # km.fit(traindata[[1, 2]])
    # y = traindata['class']
    nb = MultinomialNB(alpha=.1)
    nb.fit(traindata, trainclass)
    predictions = nb.predict(testdata)
    print(predictions)
    print('=================================')
    print(testclass)
    print(accuracy_score(testclass, predictions))
def normalize(features):
    nm = Normalizer()
    min_max_scaler = MinMaxScaler(feature_range=(0, 10))
    features = nm.fit_transform(features)
    return min_max_scaler.fit_transform(features)
def normalizer_scale(self, X):
    scaler = Normalizer()
    return scaler.fit_transform(X)
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import Normalizer

clf = GradientBoostingClassifier(warm_start=True, n_estimators=1000)
print('clf created')

trainData = np.array([[float(x.strip()) for x in line.split(',')]
                      for line in open('completedData10NN.csv')])
n = Normalizer(norm='l1')
# Normalizer ignores labels, so only the feature columns are passed.
train = n.fit_transform(trainData[:, :-1])
# train = normalize(trainData[:, :-1])
print('train data read')

clf.fit(train, trainData[:, -1])
print('clf trained')

testData = [[float(x.strip()) for x in line.split(',')]
            for line in open('completedTestData10NN.csv')]
n = Normalizer(norm='l1')
newdata = n.fit_transform(testData)
# newdata = normalize(testData)
print(np.shape(newdata))
print("test data read")

prediction = clf.predict(newdata)
print("predicted")

f = open('team04_l1_n1000_10NN.txt', 'w')

def num(x):
    if x == 1:
        return 'A'
    elif x == 2:
trainingAccuracy = numpy.zeros(folds)
trainingBaseline = numpy.zeros(folds)
testingAccuracy = numpy.zeros(folds)
testingBaseline = numpy.zeros(folds)
testingDensity = []
testingF1 = numpy.zeros(folds)
# sys.stdout.write("Query\tFold\tGround-Truth\tPredicted\n")
for i, (train, test) in enumerate(skf):
    vectorizer = CountVectorizer(min_df=1, dtype='double')
    normalizer = Normalizer()
    # LinearSVC's old loss='l1' alias is now spelled loss='hinge'.
    classifier = LinearSVC(loss='hinge')
    strawMan = DummyClassifier(strategy='most_frequent')
    X = normalizer.fit_transform(vectorizer.fit_transform(examples[train]))
    y = labels[train]
    classifier.fit(X, y)
    strawMan.fit(X, y)
    testingDensity.extend(computeDensity(vectorizer, examples[test]))
    trainingAccuracy[i] = predict(classifier, normalizer, vectorizer,
                                  examples[train], labels[train])
    trainingBaseline[i] = predict(strawMan, normalizer, vectorizer,
                                  examples[train], labels[train])
    testingAccuracy[i] = predict(classifier, normalizer, vectorizer,
                                 examples[test], labels[test])
    testingBaseline[i] = predict(strawMan, normalizer, vectorizer,
                                 examples[test], labels[test])
    testingF1[i] = predictF1(classifier, normalizer, vectorizer,
                             examples[test], labels[test])

print("Training Accuracy:" + prettyPrint(trainingAccuracy))
print("Test Accuracy:" + prettyPrint(testingAccuracy))
print("Training Baseline:" + prettyPrint(trainingBaseline))
print("Test Baseline:" + prettyPrint(testingBaseline))
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression(C=0.1, max_iter=100, n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1, n_iter=1, n_jobs=8, class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1, n_jobs=8, class_weight='balanced')
    # Each assignment below overrides the previous one; only the GridSearchCV
    # classifier is actually used.
    Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    Clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=8, class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=8, class_weight='balanced'),
                       cv=5, param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C=0.1, class_weight='balanced', max_iter=1000), cv=3,
    #                    param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2
    print('Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram))

    # xreadlines() is Python 2 only; iterating the file object is equivalent.
    PosSamples = [l.split('\t')[0].strip() for l in open(File)
                  if l.strip().endswith('1')]  # [:100]
    NegSamples = [l.split('\t')[0].strip() for l in open(File)
                  if l.strip().endswith('0')]  # [:100]
    print('loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples)))

    X = PosSamples + NegSamples
    y = [1 for _ in range(len(PosSamples))] + [-1 for _ in range(len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random.randint(0, 100))
    print('# TrainLabels', len(y_train))
    print('# TestLabels', len(y_test))

    print('performing CVectorizer')
    CVectorizer = CountVectorizer(lowercase=True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer=SGTokenizer,
                                  tokenizer=Tokenizer,
                                  ngram_range=(1, 2),
                                  dtype=np.float64,
                                  decode_error='ignore',
                                  max_df=0.8)
    print('performing TfidfTransformer and Normalizer')
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()

    print('creating Train and Test FVs')
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print('feat ext time', time() - T0)

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print('Train/test split')
    print(TrainFVs.shape)
    print(TestFVs.shape)
    # input('hit any key...')

    print('training classifier with train samples shape:', TrainFVs.shape)
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit(TrainFVs, y_train)  # re-train on current training set (daily)
    print('batch fitted')
    print('training time', time() - T0)
    # memory_dump('after_train_mem.txt')

    print('testing classifier with test samples shape:', TestFVs.shape)
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print('testing time', time() - T0)
    # memory_dump('after_test_mem.txt')

    print('*' * 100)
    print('classification report')
    print('-' * 20)
    Accuracy = np.mean(PredictedLabels == y_test)
    print("Test Set Accuracy = ", Accuracy)
    print(metrics.classification_report(y_test, PredictedLabels,
                                        target_names=['Neg', 'Pos']))
    print("Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels))
    print("Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels))
    print("Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels))
    print("Log loss:", metrics.log_loss(y_test, PredictedLabels))
    print("F1 Score:", metrics.f1_score(y_test, PredictedLabels))
    print("Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels))
    print('*' * 100)

    Vocab = CVectorizer.get_feature_names_out()  # get_feature_names() was removed
    # print(Vocab[:100])
    # input()

    try:
        FeatureImportances = Clf.coef_[0]
    except AttributeError:
        # GridSearchCV exposes the coefficients on its best estimator.
        FeatureImportances = Clf.best_estimator_.coef_[0]
    print(FeatureImportances.shape)
    input()

    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
        print(Vocab[PosFIndex], '+-', Vocab[NegFIndex])

    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1], TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs * FeatureImportancesSparseArray
    print(AllFVsTimesW.shape)

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print('Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind]))
        # print(TestFV)
        # print(TestFV.shape)
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print(CurTestFV.shape)
        # input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps = CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print(PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp)
        # print(NegTopFeatureIndices, NegFeatImps)
        if y_test[Ind] == 1:
            print('top positive feats:',
                  colored(', '.join(['[' + Vocab[PosFIndex] + ']'
                                     for PosFIndex in PosTopFeatureIndices]), 'green'))
        else:
            print('top negative feats: ',
                  colored(', '.join(['[' + Vocab[NegFIndex] + ']'
                                     for NegFIndex in NegTopFeatureIndices]), 'red'))
        Ind += 1
        input()