def __init__(self, hash_len, train_img, train_txt, query_image, query_text,
             retrieval_image, retrieval_text):
    if not os.path.exists('temp_data'):
        os.mkdir('temp_data')
    # Normalize each sample to unit L2 norm.
    norm2 = Normalizer(norm='l2')
    train_img = norm2.fit_transform(train_img)
    train_txt = norm2.fit_transform(train_txt)
    query_image = norm2.fit_transform(query_image)
    query_text = norm2.fit_transform(query_text)
    retrieval_image = norm2.fit_transform(retrieval_image)
    retrieval_text = norm2.fit_transform(retrieval_text)
    # Note: only the training matrices are transposed before saving.
    sio.savemat(
        'temp_data/flickr_data.mat', {
            'train_image': np.transpose(train_img),
            'train_text': np.transpose(train_txt),
            'query_image': query_image,
            'query_text': query_text,
            'retrieval_image': retrieval_image,
            'retrieval_text': retrieval_text
        })
    self.flickr_data = sio.loadmat('temp_data/flickr_data.mat')
    self.hash_len = hash_len
def load_dat(filepath, minmax=None, normalize=False, bias_term=True):
    """Load a .dat file.

    Args:
        minmax: tuple (min, max), desired range of the transformed data
        normalize: boolean, normalize samples individually to unit norm if True
        bias_term: boolean, add a dummy column of 1s
    """
    lines = np.loadtxt(filepath)
    labels = lines[:, -1]
    features = lines[:, :-1]
    N, dim = features.shape
    if minmax is not None:
        minmax = MinMaxScaler(feature_range=minmax, copy=False)
        minmax.fit_transform(features)
    if normalize:
        # Make sure each row's L2 norm is 1.
        normalizer = Normalizer(copy=False)
        normalizer.fit_transform(features)
    if bias_term:
        X = np.hstack([np.ones(shape=(N, 1)), features])
    else:
        X = features
    return X, labels
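# A minimal usage sketch for load_dat above; 'digits.dat' is a hypothetical
# whitespace-separated file with the label in the last column, matching the
# format np.loadtxt expects here.
if __name__ == '__main__':
    X, labels = load_dat('digits.dat', minmax=(0, 1), normalize=False, bias_term=True)
    print(X.shape, labels.shape)  # X carries one extra leading bias column of 1s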
def salary_provider(preprocessing="None"):
    X_train, X_test, Y_train, Y_test = provider(is_regression=True)
    # Normalizing the label makes no sense here, so it stays disabled:
    # salary_max, salary_min = np.max(Y_train), np.min(Y_train)
    # Y_train = (Y_train - salary_min) / float(salary_max - salary_min)
    # Y_test = (Y_test - salary_min) / float(salary_max - salary_min)
    if preprocessing == "normalize":
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        X_test = normalizer.transform(X_test)
    elif preprocessing == "minmax":
        # Fit only on the training split to avoid leaking test statistics.
        minmaxscaler = MinMaxScaler()
        X_train = minmaxscaler.fit_transform(X_train)
        X_test = minmaxscaler.transform(X_test)
    elif preprocessing == "standard":
        standardscale = StandardScaler()
        X_train = standardscale.fit_transform(X_train)
        X_test = standardscale.transform(X_test)
    else:
        pass
    print(Y_test)
    print(Y_train)
    return X_train, X_test, Y_train, Y_test
def set_tfidf_process(self):
    tfidf = TfidfVectorizer(token_pattern=r'\S+')  # tokens are split on whitespace
    tfidf.fit(self.X_train.astype('U'))
    train_vector = tfidf.transform(self.X_train.astype('U'))
    validation_vector = tfidf.transform(self.X_validation.astype('U'))
    test_vector = tfidf.transform(self.X_test.astype('U'))

    nmf = NMF(n_components=50)  # dimensionality reduction
    nmf.fit(train_vector.toarray())
    train_features = nmf.transform(train_vector.toarray())
    validation_features = nmf.transform(validation_vector.toarray())
    test_features = nmf.transform(test_vector.toarray())

    # Normalizer scales each sample to unit norm; the original comment said
    # "rescale to 0-1", which is what MinMaxScaler would do instead.
    norm = Normalizer()
    self.train_nf = norm.fit_transform(train_features)
    self.validation_nf = norm.fit_transform(validation_features)
    self.test_nf = norm.fit_transform(test_features)
def visualize_attention(x_test, y_true, sent_model, doc_model, date, word2idx, label, rand):
    print('Label:', str(label))
    # Needed when rand=True: keep only the samples whose true label matches.
    x_samples = np.array([x_test[k] for k, v in enumerate(y_true) if v == label])
    if rand:
        random_index = nprnd.randint(x_samples.shape[0], size=SHOW_SAMPLES_CNT)
        select_samples = x_samples[random_index]
    else:
        # select_samples = x_samples[0:SHOW_SAMPLES_CNT]
        select_samples = x_test
    sent_all_att, doc_all_att = get_attention(sent_model, doc_model, select_samples, MODEL_NAME)
    text_sent = [[word2idx[idx] for sub in select_samples[i] for idx in sub]
                 for i in range(5)]
    normalizer_sent = Normalizer()
    normalizer_doc = Normalizer()
    att_sent = normalizer_sent.fit_transform(sent_all_att)
    att_doc = normalizer_doc.fit_transform(doc_all_att)
    customed_heatmap(att_sent, text_sent, N_LIMIT, date, label, 'sent')
    customed_heatmap(att_doc[:, ::-1].T, text_sent, N_LIMIT, date, label, 'doc')
    # important_words = [[word2idx[idx] for idx in word_idx[w_idx]]
    #                    for w_idx in range(SHOW_SAMPLES_CNT)]
    # print('some important keywords:')
    # pprint(important_words)
    return sent_all_att, doc_all_att
class GetXYData:
    def __init__(
        self,
        normalize=True,
        subsample=None,
        variables=["gross_primary_productivity", "soil_moisture"],
        random_state=123,
    ):
        self.normalize = normalize
        self.subsample = subsample
        self.variables = variables
        self.random_state = random_state

    def set_XY(self, xr_data, xr_data2=None):
        """Expects a dataframe with the time components. Converts it into an array."""
        # Convert xarray into dataframe for variables
        if xr_data2 is None:
            xr_data2 = xr_data
        X = xarray2df(xr_data[self.variables[0]])
        Y = xarray2df(xr_data2[self.variables[1]])
        # Merge the two DataFrames
        var_df = X.merge(Y)
        # Drop the NA values
        var_df = var_df.dropna()
        # Extract variables
        X = var_df[self.variables[0]].values
        Y = var_df[self.variables[1]].values
        lat = var_df["lat"]
        lon = var_df["lon"]
        # ===============
        # Normalize
        # ===============
        if self.normalize:
            # Normalizer requires 2-D input, so the 1-D columns are reshaped.
            # Note that L2-normalizing a single-feature column maps every
            # non-zero value to +/-1, so a StandardScaler may be what is
            # actually intended here.
            self.x_normalizer = Normalizer()
            X = self.x_normalizer.fit_transform(X.reshape(-1, 1))
            self.y_normalizer = Normalizer()
            Y = self.y_normalizer.fit_transform(Y.reshape(-1, 1))
        # Subsample if necessary
        if self.subsample:
            X, _, Y, _, lat, _, lon, _ = train_test_split(
                X, Y, lat, lon,
                train_size=self.subsample,
                random_state=self.random_state,
            )
        return X, Y, lat, lon
def normalization_data(norm_type, data_set):
    if norm_type == "l1":
        normlizer = Normalizer(norm='l1')
        norm_data = normlizer.fit_transform(data_set)
    elif norm_type == "l2":
        normlizer = Normalizer(norm='l2')
        norm_data = normlizer.fit_transform(data_set)
    elif norm_type == "min_max":
        normlizer = MinMaxScaler(feature_range=(0, 1))
        norm_data = normlizer.fit_transform(data_set)
    else:
        # Without this guard, norm_data is unbound for unknown norm types.
        raise ValueError("unknown norm_type: %s" % norm_type)
    return norm_data
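# A small demonstration of normalization_data above on a made-up matrix,
# assuming numpy is imported as np: 'l1' and 'l2' rescale each row, while
# 'min_max' rescales each column into [0, 1].
demo = np.array([[1.0, 3.0], [2.0, 2.0]])
print(normalization_data("l1", demo))       # rows sum to 1
print(normalization_data("l2", demo))       # rows have unit Euclidean length
print(normalization_data("min_max", demo))  # columns span [0, 1]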
def getNormalized(self, state, size):
    ft_train, ft_test, tg_train, tg_test = train_test_split(
        self.features, self.target, train_size=size,
        stratify=self.target, random_state=state)
    # Normalizer is stateless (per-row scaling), so fit_transform on the
    # test split does not leak training information.
    norm = Normalizer()
    ft_train_n = norm.fit_transform(ft_train)
    ft_test_n = norm.fit_transform(ft_test)
    return ft_train_n, ft_test_n
def generate_latent_variables(centered_co_occurence, num_components):
    normalizer = Normalizer()
    # fit_transform returns the normalized copy; without the assignment the
    # call has no effect on the input matrix.
    centered_co_occurence = normalizer.fit_transform(centered_co_occurence)
    pca = decomposition.PCA(svd_solver='randomized', random_state=17)
    pca.fit(centered_co_occurence)
    components = pca.components_
    k_components = components[:num_components]
    # Assumes np.matrix semantics (or a square co-occurrence matrix) so that
    # '*' acts as a matrix product here.
    latent_vars = k_components * centered_co_occurence
    latent_vars_matrix = latent_vars.T
    return k_components, latent_vars_matrix, normalizer
def stds_norms_mms(df, scaler):
    # fit_transform does not modify df in place, so the result must be
    # assigned; the original discarded it and returned df unchanged.
    if scaler == 'mms':
        mms = MinMaxScaler()
        df = pd.DataFrame(mms.fit_transform(df), columns=df.columns)
    elif scaler == 'stds':
        stds = StandardScaler()
        df = pd.DataFrame(stds.fit_transform(df), columns=df.columns)
    elif scaler == 'norms':
        norms = Normalizer()
        df = pd.DataFrame(norms.fit_transform(df), columns=df.columns)
    return df
def getPcaFeatures(self, images, components, image_size):
    imageDataset = self.getImagesAsDataset(images, image_size)
    norm = Normalizer()
    imageDataset = norm.fit_transform(imageDataset)
    pca = PCA(n_components=components)
    imageDataset = pca.fit_transform(imageDataset)
    return pca, norm, imageDataset
def topics(tweets, n_topics):
    """Generate a word2vec model from the tweets, build a matrix where each
    column is the word2vec vector of a word in the tweet vocabulary, then use
    PCA to identify topics in the tweets and print the top words associated
    with each topic.

    Parameters
    ----------
    tweets: list
        a list of unicode strings representing tweets
    n_topics: int
        an integer greater than 0 representing the number of topics
    """
    print("transforming tweets into vectors...")
    stop = frozenset(stopwords.words('english'))
    vectorizer = TweetVectorizer(stop_words=stop).fit(tweets)
    tweet_vectors = vectorizer.words_matrix()
    word2vec = vectorizer.get_model()
    print("Fitting the PCA model..")
    normalizer = Normalizer()
    pca = PCA(n_components=n_topics)
    pca.fit_transform(normalizer.fit_transform(tweet_vectors))
    for topic_idx, topic in enumerate(pca.components_):
        print("*" * 200)
        print("Topic #%d:" % topic_idx)
        print(word2vec.wv.similar_by_vector(topic))
        print(" ")
def preprocess(data, n_components, use_tf_idf=True):
    """Preprocess the data for clustering by running SVD and normalizing the
    results. This process is also known as LSA.

    arguments:
    data -- Dataset; if use_tf_idf is True, the object must contain a tf_idf
        table alongside a raw-frequencies dataframe.
    n_components -- int, the number of components to use for the SVD;
        a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
        preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.values  # .as_matrix() was removed from pandas
    else:
        d = data.df.values
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()
    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum() * 100
    # Normalize the reduced matrix X, not the raw input d, so the function
    # actually returns data with n_components dimensions as documented.
    return e, norm.fit_transform(X)
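# The SVD-then-normalize steps above can also be expressed as one sklearn
# pipeline; a sketch assuming the same TruncatedSVD/Normalizer imports are in
# scope. This is one possible packaging, not the snippet's original API.
from sklearn.pipeline import make_pipeline

lsa = make_pipeline(TruncatedSVD(n_components=100), Normalizer(copy=False))
# X_reduced = lsa.fit_transform(d)  # d: document-term matrix as above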
def kfold(agetext, k, model, nfeatures, check=False, k2=None, max_df=0.9, min_df=3):
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        X = agetext["text"].tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer=tokenize,
                              token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',
                              lowercase=False, max_features=nfeatures,
                              max_df=max_df, min_df=min_df, use_idf=True,
                              ngram_range=(1, 2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t", "").replace("\n", "") for doc in docs]
        # [:8000] keeps documents 0-7999; the original [:7999] silently
        # dropped document 7999 from both splits.
        traindocs = docs2[:8000]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:8000]
        testl = label[8000:9500]
        if check:
            lsa = TruncatedSVD(k2, algorithm='arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X, tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred), 2))
    print(str(out))
    print(np.mean(out))
def RNN():
    order = cl.get_train_test_set_06()
    sc = Normalizer()  # scaling using normalisation
    order = sc.fit_transform(order)
    Z = order[:, 1]
    Y = []
    for i, z in enumerate(Z):
        if i % 5 == 0:
            Y.append(z)
    X = np.delete(order, 1, 1)
    X = np.reshape(X, (73, 5, 21))
    model = Sequential()
    # returns a single output per sequence (return_sequences=False)
    model.add(LSTM(1, return_sequences=False, input_shape=(5, 21)))
    # model.add(LSTM(7, return_sequences=True))  # returns a sequence of vectors
    # model.add(Dropout(0.5))
    # model.add(LSTM(1))  # return a single vector
    model.add(Dense(1, activation='linear'))
    # Note: 'accuracy' is not meaningful for a regression loss; the MSE loss
    # curves below are the informative signal.
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    history = model.fit(X, Y, batch_size=7, epochs=50, validation_split=0.3, shuffle=False)
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()
def df_normalize(df):
    from sklearn.preprocessing import Normalizer
    normalizer = Normalizer(norm='l2')
    df = pd.DataFrame(normalizer.fit_transform(df), columns=df.columns)
    print("DataSet Normalized...")
    print(df.head())  # head() returns a preview; print it so it is visible
    return df
def transformer(self, data, name_to_save='yahoo_scaler'):
    scaler = Normalizer()
    scaled_out = scaler.fit_transform(data)
    # print(scaler.data_min_, scaler.data_max_)  # these attributes belong to
    # MinMaxScaler; Normalizer keeps no fitted statistics
    pickle.dump(scaler, open(f'{name_to_save}.pkl', 'wb'))
    return scaled_out
def normalize(train_inputs, non_train_inputs):
    normalizer = Normalizer()
    train_inputs[train_inputs.columns] = normalizer.fit_transform(
        train_inputs.values)
    non_train_inputs[train_inputs.columns] = normalizer.transform(
        non_train_inputs.values)
    return train_inputs, non_train_inputs
def quantify(self, exclude, encoder_path, normalizer_path, columns_to_normalize=list()):
    for column in list(set(self.data.columns) - set(exclude)):
        # np.float / np.int were removed from numpy; test against the
        # builtin types instead.
        if (self.data[column].dtype not in [float, int]) and ("Embedding" not in column):
            encoder = LabelEncoder()
            self.data[column] = encoder.fit_transform(self.data[column].astype(str))
            dump(encoder,
                 open(os.path.join(encoder_path,
                                   'LabelEncoder_{}.pkl'.format(column)), 'wb'))
        if column in columns_to_normalize:
            # Note: L2-normalizing a single column reshaped to (-1, 1) maps
            # every non-zero value to +/-1; a MinMaxScaler may be intended.
            normalizer = Normalizer()
            self.data[column] = normalizer.fit_transform(
                self.data[column].values.reshape(-1, 1))
            dump(normalizer,
                 open(os.path.join(normalizer_path,
                                   'LabelNormalizer_{}.pkl'.format(column)), 'wb'))
def predict(self, layer=None):
    """Performs sentiment classification prediction on preprocessed audio files.

    @param layer: If None, performs normal sentiment classification. If not
        None, returns the values from the intermediate layers.

    return:
        - The model prediction result
        - The video file names for each of the rows returned in model.predict
          (without the .mp4 suffix)
    """
    folder = unzip_folder(self.audio_folder, "audio_tmp")
    X = np.load(os.path.join(folder, 'audio-pickle-all-X-openl3.pkl'),
                allow_pickle=True)
    if layer is not None:
        print(f"Customizing model by returning layer {layer}")
        model = tf.keras.models.Model(self.model.input,
                                      self.model.get_layer(layer).output)
    else:
        model = self.model
    normalizer = Normalizer()
    for i in range(0, X.shape[0]):
        X[i] = normalizer.fit_transform(X[i])
    # The original pre-processing created the X array using the sorted order
    # of the video files.
    audio_pickles = sorted(
        next(os.walk(os.path.join(self.audio_folder, "audio-pickle")))[2])
    samples = map(lambda x: x.split(".mp4")[0], audio_pickles)
    return model.predict(X, batch_size=self.batch_size), list(samples)
def explore_k(svd_trans, k_range):
    """Explores various values of k in KMeans.

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore

    Returns:
        scores: list of inertia scores for each k value
    """
    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    # Iterate over the supplied k_range; the original looped over the bare
    # np.arange function object, which raises a TypeError.
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100,
                    n_init=1, verbose=2)
        km.fit(norm_data)
        scores.append(-1 * km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
def kfold(agetext, k, model, k2):
    import collections
    out = []
    for i in range(k):
        print("iteration: " + str(i))
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:, 1:]
        label = agetext["agegroup"].tolist()
        # sklearn.cross_validation was removed; train_test_split now lives in
        # sklearn.model_selection.
        X_train, X_test, y_train, y_test = train_test_split(
            datatb, label, test_size=0.15, random_state=i * 6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm='arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X, y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred), 5))
    print(str(out))
    print(np.mean(out))
def main():
    """Main function for data preprocessing, normalization and upsampling."""
    data = pd.read_csv('RawData/Full_Information_Cleaned.csv', index_col=0)
    data = data_pre_processing(data)
    X = data[[
        'Accept_Credit_Card', 'Outdoor_Seating', 'Take_out', 'Takes_Reservations',
        'WIFI', 'Noise_Level', 'atm', 'bank', 'bar', 'beauty_salon', 'bus_station',
        'cafe', 'gym', 'school', 'White population', 'Black population',
        'American Indian population', 'Asian population',
        'Hispanic or Latino population', 'High school or higher',
        'Graduate or professional degree', 'Unemployed', 'average_price'
    ]]
    Y = pd.factorize(data['class'])[0]
    # Normalize the feature data into the same scale
    norm = Normalizer()
    X = norm.fit_transform(X)
    # Standardize the data to mean 0 and std 1
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # Upsampling to deal with imbalanced classes
    sm = SMOTE(random_state=42)
    X, Y = sm.fit_resample(X, Y)
    print('Resampled dataset shape %s' % Counter(Y))
    # Binarize the output
    y_bin = label_binarize(Y, classes=[0, 1, 2])
    n_classes = y_bin.shape[1]
    model = addModels()
    classifier_run(X, Y, model)
def get_test(self):
    print("Getting physics...")
    data = pd.read_csv(self.data_path + "/test.csv")
    data = data.content.values.tolist()
    data = self.clean_html(data)
    temp = []
    data = [re.sub(r'\n', ' ', x) for x in data]
    for d in data:
        if self.get_token(d):
            temp += [self.get_token(d)]
    mul = 10
    x_test = [x for sublist in temp for x in sublist]
    c = Counter(x_test)
    x_test = list(set(x_test))
    x_test = [x for x in x_test if c[x] > 25]
    ll = lambda x: float(len(x))
    lt = list(map(ll, temp))  # list() so lt[i] works on Python 3
    x_test = [[doc.count(w) * mul / lt[i] for i, doc in enumerate(temp)]
              for w in x_test]
    x_test = np.array(x_test)
    # x_train = np.concatenate((x_train, np.zeros((x_train.shape[0], self.doclen - x_train.shape[1]))), axis=1)
    gc.collect()
    print("Doing LSA")
    print("SVD....")
    u, s, v = sparse.linalg.svds(x_test, embed_SIZE)
    n = Normalizer(copy=False)
    # s is a 1-D vector of singular values, so u * s scales each column of u.
    x_test = n.fit_transform(u * s.transpose())
    return x_test
def draw_svc(dataset):
    normalizer = Normalizer()
    data_x, data_y = dataset.data, dataset.target
    data_n = normalizer.fit_transform(data_x)
    info = list()
    for i in range(100):
        info.append((i,
                     *pipeline(mySVC(kernel='linear', epsilon=0,
                                     decision_function_shape='ovo'),
                               data_n, data_y, label='my'),
                     *pipeline(SVC(kernel='linear', gamma='auto',
                                   decision_function_shape='ovo'),
                               data_x, data_y, label='sk')))
    info = np.array(info)
    plt.figure()
    plt.plot(info[:, 0], info[:, 1], label='my')
    plt.plot(info[:, 0], info[:, 3], label='sklearn')
    plt.xlabel('times'), plt.ylabel('accuracy')
    plt.legend(loc='best')
    plt.show()
    plt.figure()
    plt.plot(info[:, 0], info[:, 2], label='my')
    plt.plot(info[:, 0], info[:, 4], label='sklearn')
    plt.xlabel('times'), plt.ylabel('time (sec)')
    plt.legend(loc='best')
    plt.show()
    mean = info.mean(axis=0)
    print(f'avg acc my: {mean[1]}, sk: {mean[3]}')
    print(f'avg time my: {mean[2]}, sk: {mean[4]}')
    return
def lr_eval(train_embs, eval_embs, train_labels, eval_labels):
    normalizer = Normalizer()
    train_embs = normalizer.fit_transform(train_embs)
    eval_embs = normalizer.transform(eval_embs)
    lr_model = LogisticRegression(random_state=0, penalty='l2', solver='liblinear')
    # drop all negative labels
    non_neg = [i for i in range(len(train_labels)) if train_labels[i] >= 0]
    if len(non_neg) == 0:
        return 0, 0
    train_embs = [train_embs[i] for i in non_neg]
    train_labels = [train_labels[i] for i in non_neg]
    num_classes = len(set(train_labels))
    if num_classes == 1:
        return 0, 0
    elif num_classes > 2:
        logger.warning('3 classes, something is wrong')
    lr_model.fit(X=train_embs, y=train_labels)
    y_pred = lr_model.predict(eval_embs)
    acc = sum(y_pred == eval_labels) / len(y_pred)
    weights = lr_model.coef_[0]
    dim = int(len(weights) / 2)
    weight_ratio = np.linalg.norm(weights[:dim]) / np.linalg.norm(weights[dim:])
    return acc, weight_ratio
def save_cluster_of_sentence_embedding(sentence_embedding_file_path,
                                       cluster_sentence_output_path,
                                       cluster_num,
                                       cluster_centroids_output_path,
                                       cluster_labels_output_path):
    sentence_embedding_list = np.load(sentence_embedding_file_path)
    # Normalize each embedding to unit L2 norm.
    normalizer = Normalizer(copy=False)
    sentence_embedding_list_norm = normalizer.fit_transform(sentence_embedding_list)
    end_time1 = datetime.datetime.now()
    # print('TIME: np.load sentence_embedding_list ', end_time1 - start_time)
    # print('shape of sentence_embedding_list', np.shape(sentence_embedding_list))
    cluster_number = int(cluster_num)
    # n_jobs was removed from KMeans in recent scikit-learn releases.
    kmeans = KMeans(n_clusters=cluster_number, n_init=5, max_iter=100)
    # A single fit_predict both fits the model and returns the labels; the
    # original fit_predict followed by a second fit() re-ran the clustering,
    # so the saved centroids could disagree with cluster_sentence.
    cluster_sentence = kmeans.fit_predict(sentence_embedding_list_norm)
    end_time2 = datetime.datetime.now()
    print('TIME: Kmeans cluster ', end_time2 - end_time1)
    centroids = kmeans.cluster_centers_
    labels = kmeans.labels_
    # cluster_distance = kmeans.transform(sentence_embedding_list)
    np.save(cluster_sentence_output_path, cluster_sentence)
    np.save(cluster_centroids_output_path, centroids)
    np.save(cluster_labels_output_path, labels)
def normalize_test():
    from sklearn.preprocessing import Normalizer
    # Normalizer expects a 2-D array (n_samples, n_features); a flat list
    # raises a ValueError, so wrap the values as a single row.
    X = [[1, 2, 3, 4, 5, 2, 6, 8]]
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)
    print(X2)
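# A quick check of what the default Normalizer computes: each row is divided
# by its L2 norm, so the result equals x / ||x||_2. A sketch assuming numpy
# is imported as np.
x = np.array([[1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 6.0, 8.0]])
manual = x / np.linalg.norm(x)
assert np.allclose(Normalizer(norm='l2').fit_transform(x), manual)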
def data_transformation(X, final_columns, norm=False, z_score=True):
    """Data transformation techniques:
    1. Range transformation (Normalization)
    2. Z-Score transformation (Standardization) - Default
    """
    X_transformed = X
    # necessary transformations
    if norm:
        norm = Normalizer()
        X_transformed = norm.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Normalized')
    if z_score:
        # Note: the scaler is fit on the raw X, so when both flags are set
        # the z-score result overwrites the normalized one.
        scaler = StandardScaler()
        X_transformed = scaler.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Z-Score Applied')
        print(X_transformed)
        X_transformed_inversed = pd.DataFrame(
            scaler.inverse_transform(X_transformed), columns=final_columns)
        print(X_transformed_inversed)
        # The pickle dump is kept inside this branch because `scaler` only
        # exists when z_score is requested.
        fi = 'data_transformation.pkl'
        with open(fi, 'wb') as mod:
            pickle.dump(scaler, mod)
    return X_transformed
def get_tf_idf_M(M, tf=["bin", "raw", "log", "dnorm"], idf=["c", "smooth", "max", "prob"], norm_samps=False): N = len(M) if tf == "raw": tf_M = np.copy(M) #just the frequency of the word in a text # #TODO: check if dnorm is implemented OK # elif tf == "dnorm": # tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1)))) if idf == "c": idf_v = [] for i in range( M.shape[1] ): #get the number of texts that contain a word words[i] idf_v.append(np.count_nonzero( M[:, i])) #count the non zero values in columns of matrix M idf_v = np.array(idf_v) idf_v = np.log(N / idf_v) tf_idf_M = tf_M * idf_v if norm_samps: normalizer = Normalizer() tf_idf_M = normalizer.fit_transform(tf_idf_M) # np.save("tf_idf_M", tf_idf_M) return tf_idf_M
def Normalized(self, df):
    meta = []
    nparray = df.to_numpy()
    normalizer = Normalizer()
    meta.append(f'Normalized with scikitlearn {normalizer}')
    nparray = normalizer.fit_transform(nparray)
    return pd.DataFrame(nparray, columns=df.columns), meta
def normalise(data, method='robust'):
    """Normalise `data` with `method`.

    Parameters
    ----------
    data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    method: str
        Rescale (and center) data by (per feature for the scalers; the
        l1/l2/max options rescale per sample):
        * l2: unit L2 norm
        * l1: unit L1 norm
        * max: unit L{inf} norm
        * standard: standardise N(0, 1) each feature
        * maxabs: maximum absolute value
        * minmax: minimum and maximum values
        * robust: robust to outliers (IQR and median)
        * none: identity block

    Returns
    -------
    rescaled_data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    """
    if method == 'none':
        return data
    X_train, y_train = data['train']
    X_test, y_test = data['test']
    if method == 'l2':
        trans = Normalizer('l2')
    elif method == 'l1':
        trans = Normalizer('l1')
    elif method == 'max':
        trans = Normalizer('max')
    elif method == 'standard':
        trans = StandardScaler()
    elif method == 'maxabs':
        trans = MaxAbsScaler()
    elif method == 'minmax':
        trans = MinMaxScaler()
    elif method == 'robust':
        trans = RobustScaler()
    else:
        raise ValueError('Unrecognised method=%s' % method)
    X_train = trans.fit_transform(X_train)
    X_test = trans.transform(X_test)
    return {'train': (X_train, y_train), 'test': (X_test, y_test)}
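# A usage sketch for normalise above on a toy train/test dict; the arrays are
# made up for illustration and numpy is assumed to be imported as np.
toy = {
    'train': (np.array([[1.0, 10.0], [2.0, 20.0]]), np.array([0, 1])),
    'test': (np.array([[3.0, 30.0]]), np.array([1])),
}
rescaled = normalise(toy, method='l2')  # or 'standard', 'minmax', 'robust', ...
print(rescaled['train'][0])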
def outlier_dbscan(data):
    columns = [
        'wet_mean', 'green_mean', 'bright_mean', 'ARVI_mean', 'SAVI_mean',
        'NDBI_mean', 'mNDWI_mean', 'NDWI_mean', 'mNDVI_mean', 'NDVI_mean',
        'wet_p50', 'green_p50', 'bright_p50', 'ARVI_p50', 'SAVI_p50',
        'NDBI_p50', 'mNDWI_p50', 'NDWI_p50', 'mNDVI_p50', 'NDVI_p50',
        'S2_B12mean', 'S2_B11mean', 'S2_B8mean', 'S2_B4mean', 'S2_B3mean',
        'S2_B2mean', 'S2_B12med', 'S2_B11med', 'S2_B8med', 'S2_B4med',
        'S2_B3med', 'S2_B2med'
    ]
    t_c = data.TRAIN_CLASS.unique()
    for i in tqdm_notebook(range(len(t_c)), desc='Processing Clustering Outlier data'):
        cl_data = data.loc[data.TRAIN_CLASS == t_c[i], columns].dropna()
        st_sc = Normalizer()
        model_ = DBSCAN(eps=.05, min_samples=10).fit(st_sc.fit_transform(cl_data))
        cl_data['label'] = model_.labels_
        data.loc[cl_data.index, 'OUTLIER'] = cl_data.label
    data['OUTLIER'] = data.OUTLIER.apply(lambda y: 0 if y >= 0 else -1)
    data_outlier = data.loc[data.OUTLIER < 0, ['x', 'TRAIN_CLASS']].groupby(
        'TRAIN_CLASS').agg('count').rename(columns={'x': 'COUNT_OUTLIER'}).reset_index()
    fig = px.bar(data_outlier, x="TRAIN_CLASS", y="COUNT_OUTLIER", title="OUTLIER")
    fig.show()
    return data
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    # Note: .vocab was removed in gensim 4; .key_to_index is its replacement.
    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))
    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()
    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {}'.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))
    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)
    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
def preprocess(df, service_list, flag_list, labeled=False):
    print(df.shape)
    df_data, label = to_numeric(df, service_list, flag_list, labeled=labeled)
    print(len(service_list))
    print(len(flag_list))
    print('Selected', df_data.shape)
    scaler = Normalizer()
    # Continuous data
    data_cont = df_data[CONT_FEATURES].values
    data_cont = scaler.fit_transform(data_cont)
    # Binary category data
    data_bin = df_data[CAT_FEATURES[3:]].values
    # Categorical data
    enc = OneHotEncoder(categories=[range(3), range(len(service_list)),
                                    range(len(flag_list))])
    enc.fit(df_data[["protocol_type", "service", "flag"]].values)
    oneHotEncoding = enc.transform(
        df_data[["protocol_type", "service", "flag"]].values).toarray()
    print(oneHotEncoding.shape)
    df_final = np.concatenate((data_cont, oneHotEncoding, data_bin), axis=1)
    df_final = pd.DataFrame(df_final)
    print(df_final.shape)
    return df_final, label
def runPCA(input_data, test, d):
    input_data = removeCorrelation(input_data)
    test = removeCorrelation(test)
    normZ = Normalizer()
    scaledX = normZ.fit_transform(input_data.iloc[:, :-1])
    scaledTestX = normZ.transform(test)
    pca = PCA()
    pcaX = pca.fit_transform(scaledX)
    pcaX = pd.DataFrame(pcaX)
    print('Approx 98% variance explained by ' + str(d) + ' features: '
          + str(pca.explained_variance_ratio_[:d].sum()))
    trainY = input_data.iloc[:, -1]
    # Series.reshape was removed from pandas; reshape the underlying array.
    trainY = trainY.values.reshape(len(trainY), 1)
    trainY = pd.DataFrame(trainY)
    trainY.columns = ['Class']
    trainDataAfterPCA = pd.concat([pcaX.iloc[:, :d], trainY], axis=1)
    testDataAfterPCA = pca.transform(scaledTestX)
    testDataAfterPCA = pd.DataFrame(testDataAfterPCA)
    testDataAfterPCA = testDataAfterPCA.iloc[:, :d]
    return trainDataAfterPCA, testDataAfterPCA
def _normalize(self, X, y, X_t):
    from sklearn.preprocessing import Normalizer
    NORM = Normalizer()
    X = NORM.fit_transform(X, y)
    X_t = NORM.transform(X_t)
    return X, X_t
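# Unlike StandardScaler or MinMaxScaler, Normalizer keeps no fitted state, so
# fit_transform and fit-then-transform agree row by row; the fit-on-train /
# transform-on-test pattern above still matters once the scaler is swapped for
# a stateful one. A small sketch with made-up data, assuming numpy as np:
nrm = Normalizer()
A = np.array([[3.0, 4.0], [6.0, 8.0]])
assert np.allclose(nrm.fit_transform(A), nrm.fit(A).transform(A))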
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    """Performs k-means clustering on SVD-transformed data and plots it.

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means

    Returns:
        km: the fitted KMeans object
    """
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5, verbose=2)
    km.fit(norm_data)
    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]
    terms = tfidf.get_feature_names()  # get_feature_names_out() in scikit-learn >= 1.0
    terms = prettify(terms)
    terms = np.array(terms)
    fig = plt.figure(figsize=(10, 8))
    for i in range(10):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print(' {:s}'.format(terms[ind]))
        print('\n')
        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i + 1)
        ax.set_title('Cluster {:d}'.format(i + 1))
        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)
        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # sorted_component = np.sort(component)
        colors = sns.color_palette('Purples', 9).as_hex()
        colors = np.repeat(colors[-1], n_words)
        cb.set_ticks(np.linspace(mn, mx, n_words + 2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])
    plt.tight_layout()
    return km
def reduce_dimension(self, n_components=2):
    """Return the PCA transform of self.data, with n_components."""
    reducer = PCA(n_components=n_components)
    X = self.data.values
    norm = Normalizer()
    Xnorm = norm.fit_transform(X)
    return reducer.fit_transform(Xnorm)
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):
    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)
    if dense:
        X = X.toarray()

    if test_size > 0:
        # The old ShuffleSplit(len(y), n_iter=1, ...) constructor is gone; the
        # modern class takes n_splits and yields indices from .split().
        cv = ShuffleSplit(n_splits=1, random_state=random_state,
                          test_size=test_size, train_size=1 - test_size)
        train, test = next(cv.split(X))
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
def get_tf_idf_M(M, tf=["bin", "raw", "log", "dnorm"],
                 idf=["c", "smooth", "max", "prob"], norm_samps=False):
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M)  # just the frequency of the word in a text
    # # TODO: check if dnorm is implemented OK
    # elif tf == "dnorm":
    #     tf_M = 0.5 + 0.5 * (M / (np.amax(M, axis=1).reshape((N, 1))))
    if idf == "c":
        idf_v = []
        for i in range(M.shape[1]):
            # Count the texts containing words[i]: the non-zero values in
            # column i of matrix M.
            idf_v.append(np.count_nonzero(M[:, i]))
        idf_v = np.array(idf_v)
        idf_v = np.log(N / idf_v)
        tf_idf_M = tf_M * idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
    # np.savetxt("tf_idf_M_" + str(N) + ".txt", tf_idf_M, fmt="%s")
    return tf_idf_M
def lstm_validate(lstm_model, evaluation_dataset, create_confusion_matrix=False,
                  number_of_subframes=0, sample_strategy="random", batch_size=32):
    print("evaluate neural network...")
    validation_data = []
    validation_labels = []
    for _obj in evaluation_dataset:
        if number_of_subframes > 0:
            validation_data.append(get_buckets(_obj.get_hoj_set(),
                                               number_of_subframes,
                                               sample_strategy))
        else:
            validation_data.append(_obj.get_hoj_set())
        validation_labels.append(_obj.get_hoj_label()[0])

    # evaluate neural network
    score, acc = lstm_model.evaluate(np.array(validation_data),
                                     np.array(validation_labels),
                                     batch_size=batch_size, verbose=0)
    print("Accuracy:", acc)

    if create_confusion_matrix is True:
        predictions = lstm_model.predict(np.array(validation_data),
                                         batch_size=batch_size)
        predicted_labels = []
        real_labels = []
        for k in range(len(predictions)):
            predicted_idx = np.argmax(predictions[k])
            label_idx = np.argmax(validation_labels[k])
            real_labels.append(label_idx)
            predicted_labels.append(predicted_idx)
        cnf_matrix = confusion_matrix(real_labels, predicted_labels)
        # Scales each row of the confusion matrix to unit L2 norm.
        norm = Normalizer()
        cnf_matrix = norm.fit_transform(cnf_matrix)
        return score, acc, cnf_matrix
    return score, acc, None
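# For confusion matrices, row-wise 'l1' normalization is often more readable
# than the default 'l2' used above, since each row then holds per-class
# fractions that sum to 1. A sketch of that alternative on a made-up 2x2
# matrix, assuming numpy and Normalizer as above:
cm = np.array([[8, 2], [3, 7]])
cm_frac = Normalizer(norm='l1').fit_transform(cm)  # each row sums to 1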
class FFTTransformer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = LinearRegression()
        self.y_mean = None
        self.normalize = Normalizer()

    def fit(self, X, y=None):
        X_train = self.make_waves(X)
        y_train = numpy.array(y)
        self.y_mean = y_train.mean()
        self.model.fit(X_train, y_train - self.y_mean)
        return self

    def make_waves(self, X):
        X = X['times']
        time_scale = numpy.array([(time - X[0]).total_seconds()
                                  for time in X]).reshape(-1, 1)
        # Angular frequencies for daily, half-daily, quarter-daily, weekly,
        # half-weekly, third-weekly and three further fixed periods.
        X_train = [numpy.concatenate((
            numpy.pi * 2.0 / (24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (12 * 60 * 60) * delta,
            numpy.pi * 2.0 / (6 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7.0 / 2 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (7.0 / 3 * 24 * 60 * 60) * delta,
            numpy.pi * 2.0 / (1380500.0) * delta,
            numpy.pi * 2.0 / (1380500.0 / 2) * delta,
            numpy.pi * 2.0 / (1380500.0 / 3) * delta), axis=0)
            for delta in time_scale]
        X_train = numpy.concatenate((numpy.sin(X_train), numpy.cos(X_train)), axis=1)
        return X_train

    def predict(self, X):
        X_test = self.make_waves(X)
        X_test = self.model.predict(X_test) + self.y_mean
        return X_test.reshape(-1, 1)

    def transform(self, X, y=None):
        X_test = self.predict(X)
        # Note: each row of the (n, 1) prediction column is normalized on its
        # own, which maps every non-zero value to +/-1; normalizing the whole
        # column at once may be what is intended.
        X_test = self.normalize.fit_transform(X_test)
        return X_test.reshape(-1, 1)
def perform_classification(corpus_dir, extn, embedding_fname, class_labels_fname):
    """Perform classification.

    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec
        format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    """
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')
    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))
    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()
    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))
    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))
    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {}'.format(
        X_train.shape, X_test.shape, Y_train.shape, Y_test.shape))
    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)
    subgraph_kernel = get_subgraph_kernel(gensim_model, subgraph_vocab)
    deep_kernel_svm_classify(X_train, X_test, Y_train, Y_test, subgraph_kernel)
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode', ngram_range=(1, 2),
                          stop_words='english', sublinear_tf=True,
                          use_idf=True, smooth_idf=True)
    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)
    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)
    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
def createGraph(similarity, sim_keep_above, graph_rule):
    G = nx.Graph()
    # Default (old version): may cause problems
    if graph_rule == "default":
        similarity = 1.0 - similarity
    elif graph_rule == "minmax":
        mm = MinMaxScaler()
        similarity = mm.fit_transform(1.0 - similarity)
    elif graph_rule == "normalization":
        norm = Normalizer()
        similarity = norm.fit_transform(1.0 - similarity)
    elif graph_rule == "inversed":
        similarity = similarity  # keep the matrix as-is

    # Remove similarity smaller than the threshold
    similarity[similarity < sim_keep_above] = 0.0
    similarity = (similarity * 10000).astype(int)

    if similarity.shape[0] == 1:
        G.add_node(0)
        return G

    max_size = similarity.shape[0]
    vertice = 0
    # xrange is Python 2 only; range covers both.
    for i in range(0, max_size):
        for j in range(i, max_size):
            if not ALLOW_SELF_LOOP:
                if i == j:
                    continue
            v = similarity[i][j]
            # print(i, j, v)
            if v > 1:
                vertice += 1
                G.add_edge(i, j, weight=v)
    return G
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import SGDRegressor
# sklearn.cross_validation was removed; KFold now lives in model_selection
# and takes n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

df = pd.read_csv("forestfires.txt", index_col=False, sep=" ")
X = df.iloc[:, 0:-1].values
Y = df.iloc[:, -1].values

normalizer = Normalizer()
X = normalizer.fit_transform(X)

k_fold_cv = KFold(n_splits=10, shuffle=True)
sgdr = SGDRegressor()
for train_index, test_index in k_fold_cv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]
    sgdr.fit(X_train, Y_train)
    pred = sgdr.predict(X_test)
    error = mean_squared_error(Y_test, pred)
    print(error)
print(len(newdata))
print(len(newdata[0]))
print(np.shape(newdata))
print("data done")
print("logistic initialized")
# clf.fit(data[:, :-1], data[:, -1])
print("fitted data")
# StratifiedKFold's old constructor took the labels directly; the modern API
# takes n_splits and yields indices from .split(X, y).
skf = StratifiedKFold(n_splits=10, shuffle=True)
output = []
finalscore = 0
counter = 0
for train, test in skf.split(newdata, data[:, -1]):
    counter = counter + 1
    n = Normalizer()
    # Normalizer ignores labels, so only the feature rows are passed.
    netdata = n.fit_transform([newdata[i][:-1] for i in train])
    print(np.shape(netdata))
    clf = GradientBoostingClassifier(warm_start=True, n_estimators=1500)
    clf = clf.fit(netdata, [data[i][-1] for i in train])
    n = Normalizer()
    nowdata = n.fit_transform([newdata[i][:-1] for i in test])
    print(np.shape(nowdata))
    prediction = clf.predict(nowdata)
    # pred = []
    # for i in prediction:
    #     if i > 1.5:
    #         pred.append(2)
    #     else:
    #         pred.append(1)
    xscore = score.get_score(prediction, [data[i][-1] for i in test])
    finalscore = finalscore + xscore
def run(self, working_directory, clustering_algorithm, word2Vec_conf, vector_type):
    vertices_path = working_directory + "vertex.txt"
    seedsMap_path = working_directory + "seedsMap.txt"
    groundTruth = working_directory + "groundTruth.csv"
    random_walks_path = working_directory + "sequenceIDs.txt"
    urlsmap = self.get_urlmap(seedsMap_path)
    documents = self.get_content_map(vertices_path)
    groundTruthMap = self.get_content_map(groundTruth)
    random_walks1, random_walks2 = tee(self.get_sequences(random_walks_path))
    # true_labels = np.array([int(groundTruthMap[v]) for v in urlsmap.values()])
    true_labels = [int(groundTruthMap[v]) for v in urlsmap.values()]
    dim_link, dim_content = self.get_dimension_vectors(vector_type)
    embedding_matrix = []
    document_matrix = []
    codes = list(urlsmap.keys())

    if dim_link > 0:
        word2vec = self.runWord2Vec(word2Vec_conf, dim_link)
        word2vec.build_vocab(random_walks1)
        word2vec.train(random_walks2)
        for url in codes:
            embedding = word2vec[url]
            embedding_matrix.append(embedding)
        # Normalize embedding_matrix using L2
        normalizer_embedding = Normalizer(copy=False)
        embedding_matrix = normalizer_embedding.fit_transform(embedding_matrix)
        print("Normalize embedding_matrix, shape: ", embedding_matrix.shape)

    if dim_content > 0:
        for url in codes:
            document_matrix.append(documents[url])
        content_matrix = self.get_content_matrix(document_matrix, dim_content)

    combined_matrix = []
    if dim_link > 0 and dim_content > 0:
        combined_matrix = np.array([
            np.concatenate((content_matrix[i], embedding_matrix[i]))
            for i in range(0, len(content_matrix))])
        print("Combined link and content matrices, shape: ", combined_matrix.shape)
    elif dim_link > 0:
        combined_matrix = embedding_matrix
    else:
        combined_matrix = content_matrix

    # clustering
    if clustering_algorithm == "KMEANS":
        num_clusters = len(set(true_labels))
        print("Clustering using KMEANS with num_clusters = ", num_clusters)
        algorithm = KMeans(n_clusters=num_clusters)
    elif clustering_algorithm == "HDBSCAN":
        print("Clustering using HDBSCAN with min 5 elements per cluster")
        algorithm = HDBSCAN(min_cluster_size=5)
    else:
        print("ERROR clustering, wrong parameter ", clustering_algorithm)
        sys.exit(2)

    # learned_labels = np.array(map(lambda x: int(x), algorithm.fit_predict(combined_matrix)))
    # np.float was removed from numpy; the builtin float is equivalent here.
    learned_labels = np.array([int(x) for x in
                               algorithm.fit_predict(combined_matrix.astype(float))])

    # metrics analysis
    filtered_true_labels = []
    filtered_learned_labels = []
    filtered_combined_matrix = []
    for i in range(0, len(true_labels)):
        if true_labels[i] != -1:
            filtered_true_labels.append(true_labels[i])
            filtered_learned_labels.append(learned_labels[i])
            filtered_combined_matrix.append(combined_matrix[i])
    filtered_true_labels = np.array(filtered_true_labels)
    filtered_learned_labels = np.array(filtered_learned_labels)
    filtered_combined_matrix = np.array(filtered_combined_matrix)
    print("Web pages to analyze: ", len(filtered_learned_labels))

    self.homogeneity = metrics.homogeneity_score(filtered_true_labels, filtered_learned_labels)
    self.completeness = metrics.completeness_score(filtered_true_labels, filtered_learned_labels)
    self.v_measure = metrics.v_measure_score(filtered_true_labels, filtered_learned_labels)
    self.ari = metrics.adjusted_rand_score(filtered_true_labels, filtered_learned_labels)
    self.ami = metrics.adjusted_mutual_info_score(filtered_true_labels, filtered_learned_labels)
    self.silhouette = metrics.silhouette_score(filtered_combined_matrix,
                                               filtered_learned_labels, metric='cosine')
    print('\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
    print(self.homogeneity, self.completeness,
          self.v_measure, self.ari, self.ami, self.silhouette)
    return (filtered_true_labels, filtered_learned_labels)
def load_blood_data(train=True, SEED=97, scale=False,
                    minmax=False, norm=False, nointercept=False, engineering=False):
    """
    Load training and test datasets for DrivenData's Predict Blood Donations
    warmup contest.

    The training data is shuffled before it's returned; test data is not.

    Note: patsy returns float64 data; Theano requires float32, so conversion
          will be required; the y values are converted to int32, so they're OK.

    Arguments
    ---------
        train       (bool)    if True,  y_train, X_train = load_blood_data(train=True, ...
                              if False, X_test, IDs      = load_blood_data(train=False, ...

        SEED        (int)     random seed

        scale       (bool)    if True, scale the data to mean zero, var 1; standard normal

        minmax      (2-tuple) to scale the data to a specified range, provide a 2-tuple (min, max)

        norm        (bool)    if True, L2 normalize for distance and similarity measures

        nointercept (bool)    if True, patsy will not create an intercept

    Usage
    -----
        from load_blood_data import load_blood_data
    """
    from sklearn.utils import shuffle
    from patsy import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy as np
    import pandas as pd
    import re

    global scaler
    global minmaxer
    global normalizer

    if (scale and minmax):
        raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):
        raise ValueError("cannot specify both scale and norm")
    if (norm and minmax):
        raise ValueError("cannot specify both norm and minmax")

    if type(train) is not bool:
        raise ValueError("train must be boolean")
    if type(SEED) is not int:
        raise ValueError("SEED must be int")
    if type(scale) is not bool:
        raise ValueError("scale must be boolean")
    if type(norm) is not bool:
        raise ValueError("norm must be boolean")
    if type(nointercept) is not bool:
        raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool:
        raise ValueError("engineering must be boolean")

    # ------------- read the file -------------
    file_name = '../data/train.csv' if train else '../data/test.csv'
    data = pd.read_csv(file_name)

    # ------------- shorten the column names -------------
    column_names = ['ID', 'moSinceLast', 'numDonations', 'volume', 'moSinceFirst', 'donated']
    data.columns = column_names if train else column_names[:-1]

    # ------------- create new variables -------------
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst,
                                    index=data.index)
        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume / data.numDonations) / data.moSinceFirst,
                                        index=data.index)
        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations,
                                    index=data.index)

    # ------------- scale the data -------------
    # .ix was removed from pandas; .loc gives the same label-based indexing.
    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            data.loc[:, cols] = scaler.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = scaler.transform(
                data.loc[:, cols].values.astype(np.float32))

    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2:
            raise ValueError("minmax must be a 2-tuple")
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            minmaxer = MinMaxScaler(feature_range=minmax)
            data.loc[:, cols] = minmaxer.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = minmaxer.transform(
                data.loc[:, cols].values.astype(np.float32))

    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        exclude = ['ID', 'donated']
        cols = data.columns.difference(exclude)
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            data.loc[:, cols] = normalizer.fit_transform(
                data.loc[:, cols].values.astype(np.float32))
        else:
            data.loc[:, cols] = normalizer.transform(
                data.loc[:, cols].values.astype(np.float32))

    # ------------- create the design matrix -------------
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst + numDonations + volume'
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
    if nointercept:
        formula = formula + ' -1'
    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))

    # ------------- return the values -------------
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs = data.ID.values
        return X_test, IDs
# descriptors, if they exist
vectors = None
norm = None
if desc:
    def str_column_to_array(df_column):
        lst = []
        df_column.apply(lambda row: lst.append(
            np.array([float(elem) for elem in row.strip('[').strip(']').split(",")])))
        return lst

    vectors = str_column_to_array(train_df["desc"])
    norm = "l1"

if norm is not None:
    normalizer = Normalizer(norm)
    vectors = normalizer.fit_transform(vectors)

logger.debug("Training...")
if desc:
    # training an ml algo on the descriptor vectors
    model = ml().fit(vectors, labels)
else:
    if args.model != "dnn":
        raise NotImplementedError("non dnn model is not proposed for direct images")
    # training raw data only with dnn
    if args.stats:
        model = ml(args.model_dir).fitdata(train_df["path"].tolist(), labels)
    else:
        model = ml().fitdata(train_df["path"].tolist(), labels)
def normalizer(X):
    s = Normalizer(norm='l1')
    return s.fit_transform(X)
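# A sanity check for the 'l1' normalizer above: each output row sums to 1 in
# absolute value. Assumes numpy is imported as np.
rows = np.array([[2.0, 2.0], [1.0, 3.0]])
out = normalizer(rows)
assert np.allclose(np.abs(out).sum(axis=1), 1.0)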
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
    """Plot data transformed into two dimensions by PCA.

    PCA transforms into a new embedding dimension such that the first
    dimension contains the maximal variance and following dimensions the
    maximal remaining variance. This should spread the observed n-dimensional
    data maximally. This is unsupervised and will not consider target values.
    """
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    if normalize:
        normalizer = Normalizer(norm='l2')
        X = normalizer.fit_transform(X)
    # Compare strings with ==, not 'is': identity comparison of strings is
    # implementation-dependent and flagged on recent Pythons.
    if embedding == 'pca':
        pca = PCA(n_components=2)
        X_transformed = pca.fit_transform(X)
    elif embedding == 'isomap':
        isomap = Isomap(n_components=2, n_neighbors=20)
        X_transformed = isomap.fit_transform(X)
    elif embedding == 'lle':
        lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
        X_transformed = lle.fit_transform(X)
    elif embedding == 'tsne':
        t_sne = TSNE(n_components=2)
        X_transformed = t_sne.fit_transform(X)
    elif embedding == 'spectral':
        se = SpectralEmbedding(n_components=2)
        X_transformed = se.fit_transform(X)
    elif embedding == 'mds':
        mds = MDS(n_components=2)
        X_transformed = mds.fit_transform(X)
    elif embedding == 'gallery':
        plt.figure(1)
        plt.subplot(231)
        plt.title('pca')
        X_t = PCA(n_components=2).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(232)
        plt.title('isomap')
        X_t = Isomap(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(233)
        plt.title('lle')
        X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(234)
        plt.title('tsne')
        X_t = TSNE().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(235)
        plt.title('spectral')
        X_t = SpectralEmbedding().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.subplot(236)
        plt.title('mds')
        X_t = MDS().fit_transform(X)
        plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.suptitle('Gallery transforms ' + title)
        return plt
    else:
        raise ValueError("Choose between pca, isomap, lle, tsne, spectral, mds and gallery")
    plt.title(title + ' ' + embedding + ' plot')
    sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.colorbar(sc)
    return plt
mnb = MultinomialNB()
bnb = BernoulliNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=51)
ada = AdaBoostClassifier()

classifiers = [lda, qda, svm, perceptron, gnb, mnb, bnb, knn, rf, ada]
classifier_names = ["LDA", "QDA", "SVM (RBF)", "Perceptron", "Gaussian NB",
                    "Multinomial NB", "Bernoulli NB", "KNN (K=5)",
                    "Random Forests", "Ada Boost"]
index = np.arange(len(classifier_names))

# Extracting the data values in a numpy array and preprocessing it
data = dataFrame.values
data_normalized = normalizer.fit_transform(data)
data_standard = standardscale.fit_transform(data)
data_minmax = minmaxscaler.fit_transform(data)

preprocess_names = ["Unscaled", "Normalized", "Standardized", "MinMax"]
preprocessors = [data, data_normalized, data_standard, data_minmax]

train_labels = class_labels[:128]
test_labels = class_labels[128:]
performance_all_preprocess = list([])
count = 0

# Defines the Recursive Feature Selector for best feature selection
def recursiveFeatureSelector(classifier_model, train_data, train_labels,
                             test_data, number_of_features):
    rfe = RFE(classifier_model, number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
def main(path):
    # tweetdata = loadfiles(path)
    tweetdata = pd.read_csv(path, header=0, dtype=str,
                            names=['text', 'lat', 'lng', 'class'])[:50000]
    # traindata, testdata = train_test_split(tweetdata, test_size=0.3, random_state=50)
    size = len(tweetdata)
    start = 7 * size // 10  # integer index for the 70/30 split
    trainclass = tweetdata['class'][:start]
    testclass = tweetdata['class'][start:]
    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
                                 use_idf=True, encoding='utf-8',
                                 decode_error='ignore', lowercase=True)
    norm = Normalizer(copy=False)
    tfids = vectorizer.fit_transform(tweetdata['text'])
    normalized_tfids = norm.fit_transform(tfids)
    ch2 = SelectKBest(chi2, k=1000)
    # normalized_tfids = ch2.fit_transform(normalized_tfids, tweetdata['class'])
    data = pd.DataFrame(normalized_tfids.toarray())
    traindata = data[:start]
    testdata = data[start:]
    # Fit the feature selector on the training split only; re-fitting it on
    # the test split would select a different (leaked) feature set.
    traindata = ch2.fit_transform(traindata, trainclass)
    testdata = ch2.transform(testdata)
    # traindata = pd.DataFrame(traindata, columns=['text', 'lat', 'lng', 'class'])
    # testdata = pd.DataFrame(testdata, columns=['text', 'lat', 'lng', 'class'])
    # tweetdata['location'] = map(reverseGeocode, tweetdata['lat'], tweetdata['lng'])
    # map(wordsForChiFeatures, tweetdata['text'], tweetdata['location'])
    # totalCount = sum(j for j in wordLocDict.values() if j > 1)
    # for i, j in wordLocDict.items():
    #     # change 1 to any value as per requirement
    #     if j > 5:
    #         tweetdata[str(i)] = map(lambda x, y: assignFeature(x, y, i[0], totalCount),
    #                                 tweetdata['text'], tweetdata['location'])
    # tweetdata.to_csv('liw.csv', header=True, index=False, encoding='utf-8')
    # testdata = loadfiles('C:\Users\AravindKumarReddy\Downloads\SMMTest')
    # traindata['location'] = map(mapLocation, traindata['lat'], traindata['lng'])
    # testdata['location'] = map(mapLocation, testdata['lat'], testdata['lng'])
    # train_tfids = vectorizer.fit_transform(traindata['text'])
    # test_tfids = vectorizer.fit_transform(testdata['text'])
    # train_tfids = norm.fit_transform(train_tfids)
    # test_tfids = norm.fit_transform(test_tfids)
    # km = KMeans(n_clusters=2000, init='k-means++', max_iter=100, n_init=1)
    # km.fit(traindata[[1, 2]])
    # y = traindata['class']
    nb = MultinomialNB(alpha=.1)
    nb.fit(traindata, trainclass)
    predictions = nb.predict(testdata)
    print(predictions)
    print('=================================')
    print(testclass)
    print(accuracy_score(testclass, predictions))
def normalize(features):
    nm = Normalizer()
    min_max_scaler = MinMaxScaler(feature_range=(0, 10))
    features = nm.fit_transform(features)
    return min_max_scaler.fit_transform(features)
def normalizer_scale(self, X):
    scaler = Normalizer()
    return scaler.fit_transform(X)
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import Normalizer

clf = GradientBoostingClassifier(warm_start=True, n_estimators=1000)
print('clf created')

trainData = np.array([[float(x.strip()) for x in line.split(',')]
                      for line in open('completedData10NN.csv')])
n = Normalizer(norm='l1')
# Normalizer ignores labels, so only the feature columns are passed.
train = n.fit_transform(trainData[:, :-1])
# train = normalize(trainData[:, :-1])
print('train data read')

clf.fit(train, trainData[:, -1])
print('clf trained')

testData = [[float(x.strip()) for x in line.split(',')]
            for line in open('completedTestData10NN.csv')]
n = Normalizer(norm='l1')
newdata = n.fit_transform(testData)
# newdata = normalize(testData)
print(np.shape(newdata))
print("test data read")

prediction = clf.predict(newdata)
print("predicted")

f = open('team04_l1_n1000_10NN.txt', 'w')

def num(x):
    if x == 1:
        return 'A'
    elif x == 2:
trainingAccuracy = numpy.zeros(folds)
trainingBaseline = numpy.zeros(folds)
testingAccuracy = numpy.zeros(folds)
testingBaseline = numpy.zeros(folds)
testingDensity = []
testingF1 = numpy.zeros(folds)
# sys.stdout.write("Query\tFold\tGround-Truth\tPredicted\n")
for i, (train, test) in enumerate(skf):
    vectorizer = CountVectorizer(min_df=1, dtype='double')
    normalizer = Normalizer()
    # LinearSVC's old loss='l1' alias is now spelled loss='hinge'.
    classifier = LinearSVC(loss='hinge')
    strawMan = DummyClassifier(strategy='most_frequent')
    X = normalizer.fit_transform(vectorizer.fit_transform(examples[train]))
    y = labels[train]
    classifier.fit(X, y)
    strawMan.fit(X, y)
    testingDensity.extend(computeDensity(vectorizer, examples[test]))
    trainingAccuracy[i] = predict(classifier, normalizer, vectorizer,
                                  examples[train], labels[train])
    trainingBaseline[i] = predict(strawMan, normalizer, vectorizer,
                                  examples[train], labels[train])
    testingAccuracy[i] = predict(classifier, normalizer, vectorizer,
                                 examples[test], labels[test])
    testingBaseline[i] = predict(strawMan, normalizer, vectorizer,
                                 examples[test], labels[test])
    testingF1[i] = predictF1(classifier, normalizer, vectorizer,
                             examples[test], labels[test])

print("Training Accuracy:" + prettyPrint(trainingAccuracy))
print("Test Accuracy:" + prettyPrint(testingAccuracy))
print("Training Baseline:" + prettyPrint(trainingBaseline))
print("Test Baseline:" + prettyPrint(testingBaseline))
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression(C=0.1, max_iter=100, n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1, n_iter=1, n_jobs=8, class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1, n_jobs=8, class_weight='balanced')
    # Each assignment below overrides the previous one; only the GridSearchCV
    # classifier is actually used.
    Clf = LinearSVC(C=0.1, class_weight='balanced', max_iter=100)
    Clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=8, class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000, n_jobs=8, class_weight='balanced'),
                       cv=5, param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C=0.1, class_weight='balanced', max_iter=1000), cv=3,
    #                    param_grid={"C": [0.001, 0.01, 0.1, 1, 10, 100]}, n_jobs=8)
    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2
    print('Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram))

    # xreadlines() is Python 2 only; iterating the file object is equivalent.
    PosSamples = [l.split('\t')[0].strip() for l in open(File)
                  if l.strip().endswith('1')]  # [:100]
    NegSamples = [l.split('\t')[0].strip() for l in open(File)
                  if l.strip().endswith('0')]  # [:100]
    print('loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples)))

    X = PosSamples + NegSamples
    y = [1 for _ in range(len(PosSamples))] + [-1 for _ in range(len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random.randint(0, 100))
    print('# TrainLabels', len(y_train))
    print('# TestLabels', len(y_test))

    print('performing CVectorizer')
    CVectorizer = CountVectorizer(lowercase=True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer=SGTokenizer,
                                  tokenizer=Tokenizer,
                                  ngram_range=(1, 2),
                                  dtype=np.float64,
                                  decode_error='ignore',
                                  max_df=0.8)
    print('performing TfidfTransformer and Normalizer')
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()

    print('creating Train and Test FVs')
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print('feat ext time', time() - T0)

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print('Train/test split')
    print(TrainFVs.shape)
    print(TestFVs.shape)
    # input('hit any key...')

    print('training classifier with train samples shape:', TrainFVs.shape)
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit(TrainFVs, y_train)  # re-train on current training set (daily)
    print('batch fitted')
    print('training time', time() - T0)
    # memory_dump('after_train_mem.txt')

    print('testing classifier with test samples shape:', TestFVs.shape)
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print('testing time', time() - T0)
    # memory_dump('after_test_mem.txt')

    print('*' * 100)
    print('classification report')
    print('-' * 20)
    Accuracy = np.mean(PredictedLabels == y_test)
    print("Test Set Accuracy = ", Accuracy)
    print(metrics.classification_report(y_test, PredictedLabels,
                                        target_names=['Neg', 'Pos']))
    print("Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels))
    print("Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels))
    print("Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels))
    print("Log loss:", metrics.log_loss(y_test, PredictedLabels))
    print("F1 Score:", metrics.f1_score(y_test, PredictedLabels))
    print("Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels))
    print('*' * 100)

    Vocab = CVectorizer.get_feature_names_out()  # get_feature_names() was removed
    # print(Vocab[:100])
    # input()

    try:
        FeatureImportances = Clf.coef_[0]
    except AttributeError:
        # GridSearchCV exposes the coefficients on its best estimator.
        FeatureImportances = Clf.best_estimator_.coef_[0]
    print(FeatureImportances.shape)
    input()

    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
        print(Vocab[PosFIndex], '+-', Vocab[NegFIndex])

    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1], TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs * FeatureImportancesSparseArray
    print(AllFVsTimesW.shape)

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print('Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind]))
        # print(TestFV)
        # print(TestFV.shape)
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print(CurTestFV.shape)
        # input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps = CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print(PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp)
        # print(NegTopFeatureIndices, NegFeatImps)
        if y_test[Ind] == 1:
            print('top positive feats:',
                  colored(', '.join(['[' + Vocab[PosFIndex] + ']'
                                     for PosFIndex in PosTopFeatureIndices]), 'green'))
        else:
            print('top negative feats: ',
                  colored(', '.join(['[' + Vocab[NegFIndex] + ']'
                                     for NegFIndex in NegTopFeatureIndices]), 'red'))
        Ind += 1
        input()