Example #1
    def __init__(self, hash_len, train_img, train_txt, query_image, query_text,
                 retrieval_image, retrieval_text):
        if not os.path.exists('temp_data'):
            os.mkdir('temp_data')

        # normalize data
        norm2 = Normalizer(norm='l2')
        train_img = norm2.fit_transform(train_img)
        train_txt = norm2.fit_transform(train_txt)
        query_image = norm2.fit_transform(query_image)
        query_text = norm2.fit_transform(query_text)
        retrieval_image = norm2.fit_transform(retrieval_image)
        retrieval_text = norm2.fit_transform(retrieval_text)

        sio.savemat(
            'temp_data/flickr_data.mat', {
                'train_image': np.transpose(train_img),
                'train_text': np.transpose(train_txt),
                'query_image': query_image,
                'query_text': query_text,
                'retrieval_image': retrieval_image,
                'retrieval_text': retrieval_text
            })

        self.flickr_data = sio.loadmat('temp_data/flickr_data.mat')
        self.hash_len = hash_len
Example #2
def load_dat(filepath, minmax=None, normalize=False, bias_term=True):
    """ load a dat file

    args:
    minmax: tuple(min, max), desired range of transformed data
    normalize: boolean, normalize samples individually to unit norm if True
    bias_term: boolean, add a dummy column of 1s
    """
    lines = np.loadtxt(filepath)
    labels = lines[:, -1]
    features = lines[:, :-1]

    N, dim = features.shape

    if minmax is not None:
        minmax = MinMaxScaler(feature_range=minmax, copy=False)
        minmax.fit_transform(features)

    if normalize:
        # make sure each entry's L2 norm is 1
        normalizer = Normalizer(copy=False)
        normalizer.fit_transform(features)

    if bias_term:
        X = np.hstack([np.ones(shape=(N, 1)), features])
    else:
        X = features

    return X, labels
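A minimal usage sketch for load_dat; the demo.dat file is generated on the fly, and the call assumes numpy (np), MinMaxScaler and Normalizer are already imported as in the snippet above.

import numpy as np

# build a tiny whitespace-delimited file: three feature columns plus a label column
rng = np.random.RandomState(0)
demo = np.hstack([rng.rand(5, 3), rng.randint(0, 2, size=(5, 1))])
np.savetxt('demo.dat', demo)

X, labels = load_dat('demo.dat', minmax=(0, 1), normalize=True, bias_term=True)
print(X.shape)   # (5, 4): three features plus the bias column of 1s
print(labels)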
Example #3
def salary_provider(preprocessing="None"):
    X_train, X_test, Y_train, Y_test = provider(is_regression=True)

    # Normalize the label [No sense!!]
    # salary_max, salary_min = np.max(Y_train), np.min(Y_train)
    # Y_train = (Y_train - salary_min) / float(salary_max - salary_min)
    # Y_test = (Y_test - salary_min) / float(salary_max - salary_min)
    if preprocessing == "normalize":
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        X_test = normalizer.fit_transform(X_test)
    elif preprocessing == "minmax":
        minmaxscaler = MinMaxScaler()
        X_train = minmaxscaler.fit_transform(X_train)
        X_test = minmaxscaler.fit_transform(X_test)
    elif preprocessing == "standard":
        standardscale = StandardScaler()
        X_train = standardscale.fit_transform(X_train)
        X_test = standardscale.fit_transform(X_test)
    else:
        pass

    print(Y_test)
    print(Y_train)
    return X_train, X_test, Y_train, Y_test
Example #4
    def set_tfidf_process(self):
        print('1')
        tfidf = TfidfVectorizer(token_pattern=r'\S+')  # tokenize on whitespace
        print('11')
        print(self.X_train)
        tfidf.fit(self.X_train.astype('U'))
        print('2')
        train_vector = tfidf.transform(self.X_train.astype('U'))
        print('22')
        validation_vector = tfidf.transform(self.X_validation.astype('U'))
        print('33')
        test_vector = tfidf.transform(self.X_test.astype('U'))
        # print('3')
        print('44')

        nmf = NMF(n_components=50)  # dimensionality reduction
        nmf.fit(train_vector.toarray())
        train_features = nmf.transform(train_vector.toarray())
        validation_features = nmf.transform(validation_vector.toarray())
        test_features = nmf.transform(test_vector.toarray())
        print('4')
        norm = Normalizer()  # scale each sample to unit norm
        self.train_nf = norm.fit_transform(train_features)
        self.validation_nf = norm.fit_transform(validation_features)
        self.test_nf = norm.fit_transform(test_features)
        print('5')
def visualize_attention(x_test, y_true, sent_model, doc_model, date, word2idx, label, rand):
    print('Label:', str(label))
    x_samples = np.array([x_test[k] for k, v in enumerate(y_true) if v == label])  # needed below when rand=True
    if rand:
        random_index = nprnd.randint(x_samples.shape[0], size = SHOW_SAMPLES_CNT)
        select_samples = x_samples[random_index]
    else:
        # select_samples = x_samples[0:SHOW_SAMPLES_CNT]
        select_samples = x_test
    sent_all_att, doc_all_att = get_attention(sent_model, 
                                              doc_model, 
                                              select_samples,
                                              MODEL_NAME)
    text_sent = [[word2idx[idx] for sub in select_samples[i] for idx in sub] for i in range(5)]
    normalizer_sent = Normalizer()
    normalizer_doc = Normalizer()
    att_sent = normalizer_sent.fit_transform(sent_all_att)
    att_doc = normalizer_doc.fit_transform(doc_all_att)

    customed_heatmap(att_sent, text_sent, N_LIMIT, date, label, 'sent')
    customed_heatmap(att_doc[:,::-1].T, text_sent, N_LIMIT, date, label, 'doc')
    #important_words = [[word2idx[idx] for idx in word_idx[w_idx]] 
    #                    for w_idx in range(SHOW_SAMPLES_CNT)]
    #print('some important keywords:')
    #pprint(important_words)
    return sent_all_att, doc_all_att
Example #6
class GetXYData:
    def __init__(
        self,
        normalize=True,
        subsample=None,
        variables=["gross_primary_productivity", "soil_moisture"],
        random_state=123,
    ):
        self.normalize = normalize
        self.subsample = subsample
        self.variables = variables
        self.random_state = random_state

    def set_XY(self, xr_data, xr_data2=None):
        """Excepts a dataframe with the time components.
    Converts it into an array."""
        # Convert xarray into dataframe for variables

        if xr_data2 is None:
            xr_data2 = xr_data

        X = xarray2df(xr_data[self.variables[0]])
        Y = xarray2df(xr_data2[self.variables[1]])

        # Merge the Two DataFrames
        var_df = X.merge(Y)

        # Drop the NA Values
        var_df = var_df.dropna()

        # Extract variables
        X = var_df[self.variables[0]].values
        Y = var_df[self.variables[1]].values
        lat = var_df["lat"]
        lon = var_df["lon"]

        # ===============
        # Normalize
        # ===============
        if self.normalize:
            self.x_normalizer = Normalizer()
            X = self.x_normalizer.fit_transform(X)

            self.y_normalizer = Normalizer()
            Y = self.y_normalizer.fit_transform(Y)

        # Subsample if necessary
        if self.subsample:
            X, _, Y, _, lat, _, lon, _ = train_test_split(
                X,
                Y,
                lat,
                lon,
                train_size=self.subsample,
                random_state=self.random_state,
            )

        return X, Y, lat, lon
Example #7
def normalization_data(norm_type, data_set):
    if norm_type == "l1":
        normalizer = Normalizer(norm='l1')
        norm_data = normalizer.fit_transform(data_set)
    elif norm_type == "l2":
        normalizer = Normalizer(norm="l2")
        norm_data = normalizer.fit_transform(data_set)
    elif norm_type == "min_max":
        normalizer = MinMaxScaler(feature_range=(0, 1))
        norm_data = normalizer.fit_transform(data_set)
    else:
        raise ValueError("unknown norm_type: %s" % norm_type)
    return norm_data
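A minimal usage sketch, assuming numpy and the sklearn imports used above are in scope.

import numpy as np

data = np.array([[3.0, 4.0], [1.0, 1.0]])
print(normalization_data("l2", data))       # rows scaled to unit L2 norm
print(normalization_data("l1", data))       # rows scaled to unit L1 norm
print(normalization_data("min_max", data))  # columns rescaled to [0, 1]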
Example #8
 def getNormalized(self, state, size):
     ft_train, ft_test, tg_train, tg_test = train_test_split(
         self.features,
         self.target,
         train_size=size,
         stratify=self.target,
         random_state=state)
     norm = Normalizer()
     ft_train_n = norm.fit_transform(ft_train)
     ft_test_n = norm.fit_transform(ft_test)
     return ft_train_n, ft_test_n
def generate_latent_variables(centered_co_occurence, num_components):

    normalizer = Normalizer()
    centered_co_occurence = normalizer.fit_transform(centered_co_occurence)
    pca = decomposition.PCA(svd_solver='randomized', random_state=17)
    pca.fit(centered_co_occurence)
    components = pca.components_
    k_components = components[:num_components]
    latent_vars = k_components * centered_co_occurence
    latent_vars_matrix = latent_vars.T
    return k_components, latent_vars_matrix, normalizer
Example #10
def stds_norms_mms(df, scaler):
    # fit_transform returns a new array; write it back so the caller gets scaled values
    if scaler == 'mms':
        mms = MinMaxScaler()
        df[df.columns] = mms.fit_transform(df)
    elif scaler == 'stds':
        stds = StandardScaler()
        df[df.columns] = stds.fit_transform(df)
    elif scaler == 'norms':
        norms = Normalizer()
        df[df.columns] = norms.fit_transform(df)
    return df
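A quick check on a small made-up DataFrame (pandas and the sklearn scalers used above are assumed to be importable).

import pandas as pd

df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 30.0]})
print(stds_norms_mms(df.copy(), 'mms'))    # columns rescaled to [0, 1]
print(stds_norms_mms(df.copy(), 'norms'))  # rows scaled to unit L2 norm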
Example #11
 def getPcaFeatures(self, images, components, image_size):
     imageDataset = self.getImagesAsDataset(images, image_size)
     norm = Normalizer()
     imageDataset = norm.fit_transform(imageDataset)
     pca = PCA(n_components=components)
     imageDataset = pca.fit_transform(imageDataset)
     return pca, norm, imageDataset
def topics(tweets, n_topics):
    """
    generate word2vec model from the tweets and then generate
    a matrix where each column is a word2vec vector of a word
    in the tweet vocabulary. Then use PCA to identify topics in
    the tweets and print the top words that are associated with that topic

    Parameters
    ----------
    tweets: list
        a list of unicode strings representing tweets
    n_topics: Integer
        an integer greater than 0 representing the number of topics
    """
    print("transforming tweets into vectors...")
    stop = frozenset(stopwords.words('english'))
    vectorizer = TweetVectorizer(stop_words=stop).fit(tweets)
    tweet_vectors = vectorizer.words_matrix()
    word2vec = vectorizer.get_model()

    print("Fitting the PCA model..")
    normalizer = Normalizer()
    pca = PCA(n_components=n_topics)
    pca.fit_transform(normalizer.fit_transform(tweet_vectors))
    for topic_idx, topic in enumerate(pca.components_):
        print("*" * 200)
        print("Topic #%d:" % topic_idx)
        print(word2vec.wv.similar_by_vector(topic))
        print(" ")
def preprocess(data, n_components, use_tf_idf=True):
    """
    Preprocess the data for clustering by running SVD and
    normalizing the results. This process is also known as
    LSA.

    arguments:
    data -- Dataset, if use_tf_idf is True the object must contain a
            tf_idf table alongside a raw frequencies dataframe.
    n_components -- int, the number of components to use for the SVD
                    a minimum of 100 is recommended.
    use_tf_idf -- bool, whether to use the tf-idf frequencies for the
                  preprocessing.

    returns:
    e -- float, a measure of variance explained by the SVD.
    X -- np.array, an array with the data reduced to n_components.
    """
    if use_tf_idf:
        d = data.tf_idf.values
    else:
        d = data.df.values
    svd = TruncatedSVD(n_components=n_components)
    X = svd.fit_transform(d)
    norm = Normalizer()

    # Record a measure of explained variance
    e = svd.explained_variance_ratio_.sum()*100
    return e, norm.fit_transform(X)
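For reference, a self-contained sketch of the same LSA recipe (TruncatedSVD followed by Normalizer) on a tiny illustrative corpus; the corpus and component count are made up.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs"]
tfidf = TfidfVectorizer().fit_transform(docs)
svd = TruncatedSVD(n_components=2)
X = Normalizer().fit_transform(svd.fit_transform(tfidf))
print(svd.explained_variance_ratio_.sum() * 100, X.shape)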
Example #14
def kfold(agetext,k,model,nfeatures,check=False,k2 = None,max_df=0.9,min_df=3):
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        X = agetext["text"]
        X = X.tolist()
        label = agetext["agegroup"].tolist()
        vec = TfidfVectorizer(tokenizer = tokenize,token_pattern=r'(?u)\b\w\w+\b|^[_\W]+$',lowercase=False,max_features=nfeatures,max_df = max_df,min_df = min_df,use_idf=True,ngram_range=(1,2))
        docs = []
        for doc in X:
            docs.append(" ".join(doc))
        docs2 = [doc.replace("\t","").replace("\n","") for doc in docs]
        traindocs = docs2[:7999]
        X = vec.fit_transform(traindocs)
        testdocs = docs2[8000:9500]
        X_test = vec.transform(testdocs)
        tlabel = label[:7999]
        testl = label[8000:9500]
        if(check):
            lsa = TruncatedSVD(k2, algorithm = 'arpack')
            normalizer = Normalizer(copy=False)
            X = lsa.fit_transform(X)
            X = normalizer.fit_transform(X)
            X_test = lsa.transform(X_test)
            X_test = normalizer.transform(X_test)
        model.fit(X,tlabel)
        pred = model.predict(X_test)
        out.append(round(accuracy_score(testl, pred),2))
    print(str(out))
    print(np.mean(out))
Example #15
def RNN():
    order = cl.get_train_test_set_06()
    sc = Normalizer()  # scaling using normalisation
    order = sc.fit_transform(order)
    Z = order[:, 1]
    Y = []
    for i, z in enumerate(Z):
        if i % 5 == 0:
            Y.append(z)
    X = np.delete(order, 1, 1)
    X = np.reshape(X, (73, 5, 21))

    model = Sequential()
    model.add(LSTM(
        1, return_sequences=False,
        input_shape=(5, 21)))  # a single LSTM unit that returns only its final output
    # model.add(LSTM(7, return_sequences=True))  # returns a sequence of vectors of dimension 32
    # model.add(Dropout(0.5))
    # model.add(LSTM(1))  # return a single vector of dimension 32
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(X,
                        Y,
                        batch_size=7,
                        epochs=50,
                        validation_split=0.3,
                        shuffle=False)

    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()
def df_normalize(df):
    from sklearn.preprocessing import Normalizer
    normalizer = Normalizer(norm='l2')
    df = pd.DataFrame(normalizer.fit_transform(df), columns=df.columns)
    print("DataSet Normalized...")
    df.head()
    return df
    def transformer(self,data,name_to_save='yahoo_scaler'):
        scaler = Normalizer()
        scaled_out=scaler.fit_transform(data)
#        print(scaler.data_min_,scaler.data_max_)
        pickle.dump(scaler,open(f'{name_to_save}.pkl','wb'))
    
        return scaled_out
def normalize(train_inputs, non_train_inputs):
    normalizer = Normalizer()
    train_inputs[train_inputs.columns] = normalizer.fit_transform(
        train_inputs.values)
    non_train_inputs[train_inputs.columns] = normalizer.transform(
        non_train_inputs.values)
    return train_inputs, non_train_inputs
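A minimal usage sketch with made-up column names (pandas and the Normalizer import used above are assumed). Normalizer is stateless per row, so fit-on-train / transform-on-test gives the same result as fitting twice; the pattern matters once a stateful scaler is swapped in.

import pandas as pd

train = pd.DataFrame({'f1': [3.0, 1.0], 'f2': [4.0, 1.0]})
test = pd.DataFrame({'f1': [0.0, 6.0], 'f2': [5.0, 8.0]})
train_n, test_n = normalize(train, test)
print(train_n)  # each row now has unit L2 norm
print(test_n)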
Example #19
 def quantify(self,
              exclude,
              encoder_path,
              normalizer_path,
              columns_to_normalize=list()):
     for column in list(set(self.data.columns) - set(exclude)):
         if (not self.data[column].dtype
                 in [np.float, np.int]) and (not "Embedding" in column):
             encoder = LabelEncoder()
             self.data[column] = encoder.fit_transform(
                 self.data[column].astype(str))
             dump(
                 encoder,
                 open(
                     os.path.join(encoder_path,
                                  'LabelEncoder_{}.pkl'.format(column)),
                     'wb'))
         if column in columns_to_normalize:
             normalizer = Normalizer()
             self.data[column] = normalizer.fit_transform(
                 self.data[column].values.reshape(-1, 1))
             dump(
                 normalizer,
                 open(
                     os.path.join(normalizer_path,
                                  'LabelNormalizer_{}.pkl'.format(column)),
                     'wb'))
Example #20
    def predict(self, layer=None):
        """
        Performs sentiment classification prediction on preprocessed audio files
        @param layer: If None, performs normal sentiment classification.
                      If not None, returns the values from the intermediate layers.
        return:
            - The model prediction result
            - The video file names for each of the rows returned in model.predict
              (without the .mp4 suffix)
        """
        folder = unzip_folder(self.audio_folder, "audio_tmp")
        X = np.load(os.path.join(folder, 'audio-pickle-all-X-openl3.pkl'),
                    allow_pickle=True)

        if layer is not None:
            print(f"Customizing model by returning layer {layer}")
            model = tf.keras.models.Model(self.model.input,
                                          self.model.get_layer(layer).output)
        else:
            model = self.model

        normalizer = Normalizer()
        for i in range(0, X.shape[0]):
            X[i] = normalizer.fit_transform(X[i])

        # The original pre-processing created the X array using the sorted order of the video files
        audio_pickles = sorted(
            next(os.walk(os.path.join(self.audio_folder, "audio-pickle")))[2])
        samples = map(lambda x: x.split(".mp4")[0], audio_pickles)

        return model.predict(X, batch_size=self.batch_size), list(samples)
Example #21
def explore_k(svd_trans, k_range):
    '''
    Explores various values of k in KMeans

    Args:
        svd_trans: dense array with lsi transformed data
        k_range: the range of k-values to explore
    Returns:
        scores: list of intertia scores for each k value
    '''

    scores = []
    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    for k in k_range:
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
                    verbose=2)
        km.fit(norm_data)
        scores.append(-1*km.score(norm_data))
    plt.plot(k_range, scores)
    plt.xlabel('# of clusters')
    plt.ylabel('Inertia')
    sns.despine(offset=5, trim=True)
    return scores
Example #22
def kfold(agetext,k,model,k2):
    import collections
    out = []
    for i in range(k):
        print "iteration: "+str(i)
        agetext = shuffle(agetext)
        datatb = agetext.iloc[:,1:]
        label = agetext["agegroup"].tolist()
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            datatb, label, test_size=0.15, random_state=i*6)
        data = X_train.values
        counter = collections.Counter(y_train)
        print(counter)
        testdata = X_test.values
        lsa = TruncatedSVD(k2, algorithm = 'arpack')
        normalizer = Normalizer(copy=False)
        X = lsa.fit_transform(data)
        X = normalizer.fit_transform(X)
        X_test = lsa.transform(testdata)
        X_test = normalizer.transform(X_test)
        model.fit(X,y_train)
        pred = model.predict(X_test)
        counter = collections.Counter(y_test)
        print(counter)
        counter = collections.Counter(pred)
        print(counter)
        out.append(round(accuracy_score(y_test, pred),5))
    print(str(out))
    print(np.mean(out))
def main():
    """
    Main function for data preprocessing, normalization and upsampling
    """
    data = pd.read_csv('RawData/Full_Information_Cleaned.csv', index_col=0)
    data = data_pre_processing(data)

    X = data[[
        'Accept_Credit_Card', 'Outdoor_Seating', 'Take_out',
        'Takes_Reservations', 'WIFI', 'Noise_Level', 'atm', 'bank', 'bar',
        'beauty_salon', 'bus_station', 'cafe', 'gym', 'school',
        'White population', 'Black population', 'American Indian population',
        'Asian population', 'Hispanic or Latino population',
        'High school or higher', 'Graduate or professional degree',
        'Unemployed', 'average_price'
    ]]
    Y = pd.factorize(data['class'])[0]
    # Normalize the feature data onto the same scale
    norm = Normalizer()
    X = norm.fit_transform(X)
    # Standardize the data to mean 0 and std 1
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Upsampling to deal with imbalanced classes
    sm = SMOTE(random_state=42)
    X, Y = sm.fit_resample(X, Y)
    print('Resampled dataset shape %s' % Counter(Y))
    # Binarize the output
    y_bin = label_binarize(Y, classes=[0, 1, 2])
    n_classes = y_bin.shape[1]
    model = addModels()
    classifier_run(X, Y, model)
Example #24
 def get_test(self):
     print "Getting physics..."
     data = pd.read_csv(self.data_path + "/test.csv")
     data = data.content.values.tolist()
     data = self.clean_html(data)
     temp = []
     data = [re.sub(r'\n', ' ', x) for x in data]
     for d in data:
         if self.get_token(d):
             temp += [self.get_token(d)]
     mul = 10
     x_test = [x for sublist in temp for x in sublist]
     c = Counter(x_test)
     x_test = list(set(x_test))
     x_test = [x for x in x_test if c[x] > 25]
     ll = lambda x: float(len(x))
     lt = list(map(ll, temp))
     x_test = [[doc.count(w) * mul / lt[i] for i, doc in enumerate(temp)]
               for w in x_test]
     x_test = np.array(x_test)
     #x_train = np.concatenate((x_train, np.zeros((x_train.shape[0], self.doclen - x_train.shape[1]))), axis=1)
     gc.collect()
     print "Doing LSA"
     print "SVD...."
     u, s, v = sparse.linalg.svds(x_test, embed_SIZE)
     n = Normalizer(copy=False)
     x_test = n.fit_transform(u * s.transpose())
     return x_test
Example #25
def draw_svc(dataset):
    normalizer = Normalizer()
    data_x, data_y = dataset.data, dataset.target
    data_n = normalizer.fit_transform(data_x)
    info = list()
    for i in range(100):
        info.append((i, *pipeline(
            mySVC(kernel='linear', epsilon=0, decision_function_shape='ovo'),
            data_n,
            data_y,
            label='my'), *pipeline(SVC(
                kernel='linear', gamma='auto', decision_function_shape='ovo'),
                                   data_x,
                                   data_y,
                                   label='sk')))
    info = np.array(info)

    plt.figure()
    plt.plot(info[:, 0], info[:, 1], label='my')
    plt.plot(info[:, 0], info[:, 3], label='sklearn')
    plt.xlabel('times'), plt.ylabel('accuracy')
    plt.legend(loc='best')
    plt.show()

    plt.figure()
    plt.plot(info[:, 0], info[:, 2], label='my')
    plt.plot(info[:, 0], info[:, 4], label='sklearn')
    plt.xlabel('times'), plt.ylabel('time (sec)')
    plt.legend(loc='best')
    plt.show()

    mean = info.mean(axis=0)
    print(f'avg acc  my: {mean[1]}, sk: {mean[3]}')
    print(f'avg time my: {mean[2]}, sk: {mean[4]}')
    return
Example #26
def lr_eval(train_embs, eval_embs, train_labels, eval_labels):

    normalizer = Normalizer()
    train_embs = normalizer.fit_transform(train_embs)
    eval_embs = normalizer.transform(eval_embs)
    lr_model = LogisticRegression(random_state=0,
                                  penalty='l2',
                                  solver='liblinear')

    ##drop all negative labels
    non_neg = [i for i in range(len(train_labels)) if train_labels[i] >= 0]
    if len(non_neg) == 0:
        return 0, 0
    else:
        train_embs = [train_embs[i] for i in non_neg]
        train_labels = [train_labels[i] for i in non_neg]
    num_classes = len(list(set(train_labels)))
    if num_classes == 1:
        return 0, 0
    elif num_classes > 2:
        logger.warning('3 classes, something is wrong')
    lr_model.fit(X=train_embs, y=train_labels)
    y_pred = lr_model.predict(eval_embs)
    acc = sum(y_pred == eval_labels) / len(y_pred)
    weights = lr_model.coef_[0]
    dim = int(len(weights) / 2)
    weight_ratio = np.linalg.norm(weights[:dim]) / np.linalg.norm(weights[dim:])

    return acc, weight_ratio
Example #27
def save_cluster_of_sentence_embedding(sentence_embedding_file_path,
                                       cluster_sentence_output_path,
                                       cluster_num,
                                       cluster_centroids_output_path,
                                       cluster_labels_output_path):
    sentence_embedding_list = np.load(sentence_embedding_file_path)
    # normalize the sentence embeddings to unit L2 norm
    normalizer = Normalizer(copy=False)
    sentence_embedding_list_norm = normalizer.fit_transform(
        sentence_embedding_list)
    end_time1 = datetime.datetime.now()
    #print('TIME: np.load sentence_embedding_list ', end_time1-start_time)

    #print('shape of sentence_embedding_list', np.shape(sentence_embedding_list))
    cluster_number = int(cluster_num)
    Kmeans = KMeans(n_clusters=cluster_number,
                    n_init=5,
                    max_iter=100,
                    n_jobs=-1)
    # a single fit is enough: fit_predict also stores cluster_centers_ and labels_ on the estimator
    cluster_sentence = Kmeans.fit_predict(sentence_embedding_list_norm)
    end_time2 = datetime.datetime.now()
    print('TIME: Kmeans cluster ', end_time2 - end_time1)

    centroids = Kmeans.cluster_centers_
    labels = Kmeans.labels_
    #cluster_distance = Kmeans.transform(sentence_embedding_list)
    np.save(cluster_sentence_output_path, cluster_sentence)
    np.save(cluster_centroids_output_path, centroids)
    np.save(cluster_labels_output_path, labels)
Example #28
def normalize_test():
    from sklearn.preprocessing import Normalizer
    X = [[1, 2, 3, 4, 5, 2, 6, 8]]  # Normalizer expects a 2-D array: one row per sample
    normalizer = Normalizer()
    X2 = normalizer.fit_transform(X)

    print(X2)
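Because Normalizer works row-wise, the two common reshape idioms give very different results; a small self-contained illustration:

import numpy as np
from sklearn.preprocessing import Normalizer

v = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 2.0, 6.0, 8.0])
print(Normalizer().fit_transform(v.reshape(1, -1)))  # one 8-feature sample with unit L2 norm
print(Normalizer().fit_transform(v.reshape(-1, 1)))  # eight 1-feature samples; every nonzero entry becomes 1.0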
Example #29
def data_transformation(X, final_columns, norm=False, z_score=True):
    '''
        Data transformation techniques
            1. Range transformation (Normalization)
            2. Z-Score transformation (Standardization) - Default
        
    '''
    X_transformed = X
    #   necessary transformations
    if norm:
        norm = Normalizer()
        X_transformed = norm.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Normalized')
    if z_score:
        scaler = StandardScaler()
        X_transformed = scaler.fit_transform(X)
        X_transformed = pd.DataFrame(X_transformed, columns=final_columns)
        print('Z-Score Applied')

        print(X_transformed)
        X_transformed_inversed = pd.DataFrame(
            scaler.inverse_transform(X_transformed), columns=final_columns)
        print(X_transformed_inversed)

        fi = 'data_transformation.pkl'
        with open(fi, 'wb') as mod:
            pickle.dump(scaler, mod)

    return X_transformed
Example #30
def get_tf_idf_M(M,
                 tf=["bin", "raw", "log", "dnorm"],
                 idf=["c", "smooth", "max", "prob"],
                 norm_samps=False):
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M)  #just the frequency of the word in a text
#    #TODO: check if dnorm is implemented OK
#    elif tf == "dnorm":
#        tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1))))
    if idf == "c":
        idf_v = []
        for i in range(
                M.shape[1]
        ):  #get the number of texts that contain a word words[i]
            idf_v.append(np.count_nonzero(
                M[:, i]))  #count the non zero values in columns of matrix M
        idf_v = np.array(idf_v)
        idf_v = np.log(N / idf_v)
    tf_idf_M = tf_M * idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
#    np.save("tf_idf_M", tf_idf_M)
    return tf_idf_M
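A tiny worked example for get_tf_idf_M with tf="raw" and idf="c" (numpy and Normalizer are assumed to be imported as above): a term appearing in both documents gets idf = log(2/2) = 0 and drops out, the rarer terms are scaled by log(2/1), and norm_samps=True then L2-normalizes each row.

import numpy as np

M = np.array([[2.0, 1.0, 0.0],
              [1.0, 0.0, 3.0]])
print(get_tf_idf_M(M, tf="raw", idf="c", norm_samps=True))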
 def Normalized(self, df):
     meta = []
     nparray = df.to_numpy()
     normalizer = Normalizer()
     meta.append(f'Normalized with scikitlearn {normalizer}')
     nparray = normalizer.fit_transform(nparray)
     return pd.DataFrame(nparray, columns=df.columns), meta
def normalise(data, method='robust'):
    """Normalise `data` with `method`.

    Parameters
    ----------
    data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    method: str
        Rescale (and center) data (per feature) by:
        * l2: unit L2 norm
        * l1: unit L1 norm
        * max: unit L{inf} norm
        * standard: standardise N(0, 1) each feature
        * maxabs: maximum absolute value
        * minmax: minimum and maximum values
        * robust: robust to outliers (IQR and median)
        * none: identity block

    Returns
    -------
    rescaled_data: dict
        * train: tuple
            - X: features
            - y: labels
        * test: tuple
            - X: features
            - y: labels
    """
    if method == 'none':
        return data

    X_train, y_train = data['train']
    X_test, y_test = data['test']

    if method == 'l2':
        trans = Normalizer('l2')
    elif method == 'l1':
        trans = Normalizer('l1')
    elif method == 'max':
        trans = Normalizer('max')
    elif method == 'standard':
        trans = StandardScaler()
    elif method == 'maxabs':
        trans = MaxAbsScaler()
    elif method == 'minmax':
        trans = MinMaxScaler()
    elif method == 'robust':
        trans = RobustScaler()
    else:
        raise ValueError('Unrecognised method=%s' % method)

    X_train = trans.fit_transform(X_train)
    X_test = trans.transform(X_test)

    return {'train': (X_train, y_train), 'test': (X_test, y_test)}
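A minimal usage sketch of normalise; the data dict follows the docstring layout, with random arrays standing in for real features and labels, and the sklearn scaler imports used above are assumed.

import numpy as np

rng = np.random.RandomState(0)
data = {
    'train': (rng.rand(8, 3), rng.randint(0, 2, 8)),
    'test': (rng.rand(4, 3), rng.randint(0, 2, 4)),
}
rescaled = normalise(data, method='l2')
print(np.linalg.norm(rescaled['train'][0], axis=1))  # all ones for the l2 method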
def outlier_dbscan(data):
    columns = [
        'wet_mean', 'green_mean', 'bright_mean', 'ARVI_mean', 'SAVI_mean',
        'NDBI_mean', 'mNDWI_mean', 'NDWI_mean', 'mNDVI_mean', 'NDVI_mean',
        'wet_p50', 'green_p50', 'bright_p50', 'ARVI_p50', 'SAVI_p50',
        'NDBI_p50', 'mNDWI_p50', 'NDWI_p50', 'mNDVI_p50', 'NDVI_p50',
        'S2_B12mean', 'S2_B11mean', 'S2_B8mean', 'S2_B4mean', 'S2_B3mean',
        'S2_B2mean', 'S2_B12med', 'S2_B11med', 'S2_B8med', 'S2_B4med',
        'S2_B3med', 'S2_B2med'
    ]
    t_c = data.TRAIN_CLASS.unique()
    for i in tqdm_notebook(range(len(t_c)),
                           desc='Processing Clustering Outlier data'):
        cl_data = data.loc[data.TRAIN_CLASS == t_c[i], columns].dropna()
        st_sc = Normalizer()
        model_ = DBSCAN(eps=.05,
                        min_samples=10).fit(st_sc.fit_transform(cl_data))
        cl_data['label'] = model_.labels_
        data.loc[cl_data.index, 'OUTLIER'] = cl_data.label
    data['OUTLIER'] = data.OUTLIER.apply(lambda y: 0 if y >= 0 else -1)
    data_outlier = data.loc[data.OUTLIER < 0, ['x', 'TRAIN_CLASS']].groupby(
        'TRAIN_CLASS').agg('count').rename(columns={
            'x': 'COUNT_OUTLIER'
        }).reset_index()
    fig = px.bar(data_outlier,
                 x="TRAIN_CLASS",
                 y="COUNT_OUTLIER",
                 title="OUTLIER")
    fig.show()
    return data
Example #34
def perform_classification (corpus_dir, extn, embedding_fname, class_labels_fname):
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))

    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))


    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel (gensim_model, subgraph_vocab)
    deep_kernel_svm_classify (X_train, X_test, Y_train, Y_test, subgraph_kernel)
Example #35
def preprocess(df,service_list,flag_list,labeled=False):
    print(df.shape)
    df_data,label=to_numeric(df,service_list,flag_list,labeled=labeled)
    print(len(service_list))
    print(len(flag_list))
    print('Selected',df_data.shape)
    scaler=Normalizer()

    #Continuous Data
    data_cont=df_data[CONT_FEATURES].values
    data_cont=scaler.fit_transform(data_cont)

    #Binary Category Data
    data_bin=df_data[CAT_FEATURES[3:]].values

    #Categorical Data
    enc = OneHotEncoder(categories=[range(3), range(len(service_list)), range(len(flag_list))])
    enc.fit(df_data[["protocol_type","service","flag"]].values)
    oneHotEncoding = enc.transform(df_data[["protocol_type","service","flag"]].values).toarray()

    print(oneHotEncoding.shape)
    df_final = np.concatenate((data_cont, oneHotEncoding,data_bin), axis=1)
    df_final = pd.DataFrame(df_final)
    print(df_final.shape)
    return df_final,label
def runPCA(input_data, test, d):
	input_data = removeCorrelation(input_data)
	test = removeCorrelation(test)

	normZ = Normalizer()
	scaledX = normZ.fit_transform(input_data.iloc[:,:-1])
	scaledTestX = normZ.transform(test)

	pca = PCA()
	pcaX = pca.fit_transform(scaledX)
	pcaX = pd.DataFrame(pcaX)
	print('Approx 98% variance explained by ' + str(d) + ' features: ' + str(pca.explained_variance_ratio_[:d].sum()))

	trainY = input_data.iloc[:,-1]
	trainY = trainY.values.reshape(len(trainY), 1)
	trainY = pd.DataFrame(trainY)
	trainY.columns = ['Class']

	trainDataAfterPCA = pd.concat([pcaX.iloc[:,:d], trainY], axis=1)
	
	testDataAfterPCA = pca.transform(scaledTestX)
	testDataAfterPCA = pd.DataFrame(testDataAfterPCA)
	testDataAfterPCA = testDataAfterPCA.iloc[:,:d]
	
	return trainDataAfterPCA, testDataAfterPCA
Example #37
    def _normalize(self, X, y, X_t):
        from sklearn.preprocessing import Normalizer
        NORM = Normalizer()

        X = NORM.fit_transform(X, y)
        X_t = NORM.transform(X_t)

        return X, X_t
Example #38
def kmeans(tfidf, svd, svd_trans, k=200, n_words=10):
    '''
    Performs k-means clustering on svd transformed data and plots it

    Args:
        tfidf: sklearn fitted TfidfVectorizer
        svd: sklearn fitted TruncatedSVD
        svd_trans: dense array with lsi transformed data
        k: the k in k-means
    Returns:
        km: the fitted KMean object
    '''

    # spherical kmeans, so normalize
    normalizer = Normalizer()
    norm_data = normalizer.fit_transform(svd_trans)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=5,
                verbose=2)
    km.fit(norm_data)

    original_space_centroids = svd.inverse_transform(km.cluster_centers_)
    order_centroids = original_space_centroids.argsort()[:, ::-1]

    terms = tfidf.get_feature_names()
    terms = prettify(terms)
    terms = np.array(terms)
    fig = plt.figure(figsize=(10, 8))
    for i in range(10):
        print("Cluster {:d}:".format(i))
        for ind in order_centroids[i, :n_words]:
            print(' {:s}'.format(terms[ind]))
        print('\n')

        # Make a figure and axes with dimensions as desired.
        ax = fig.add_subplot(2, 5, i+1)
        ax.set_title('Cluster {:d}'.format(i+1))

        component = order_centroids[i]
        cmap = plt.cm.Purples
        mn = np.min(component[:n_words])
        mx = np.max(component[:n_words])
        norm = mpl.colors.Normalize(mn, mx)

        cb = mpl.colorbar.ColorbarBase(ax, cmap=cmap, norm=norm,
                                       orientation='vertical')
        # sorted_component = np.sort(component)
        colors = sns.color_palette('Purples', 9).as_hex()
        colors = np.repeat(colors[-1], n_words)

        cb.set_ticks(np.linspace(mn, mx, n_words+2)[1:-1])
        cb.ax.yaxis.set_tick_params(size=0)
        cb.ax.tick_params(labelsize=10)
        for color, tick in zip(colors, cb.ax.get_yticklabels()):
            tick.set_color(color)
            tick.set_fontsize(14)
        cb.set_ticklabels(np.array(terms)[order_centroids[i, :n_words][::-1]])
    plt.tight_layout()
    return km
Example #39
    def reduce_dimension(self, n_components=2):
        """ Return PCA transform of self.data, with n_components. """

        reducer = PCA(n_components=n_components)

        X = self.data.values

        norm = Normalizer()
        Xnorm = norm.fit_transform(X)

        return reducer.fit_transform(Xnorm)
Example #40
def make_nn_regression(n_samples=100, n_features=100, n_informative=10,
                       dense=False, noise=0.0, test_size=0,
                       normalize_x=True, normalize_y=True,
                       shuffle=True, random_state=None):

    X, y, w = _make_nn_regression(n_samples=n_samples,
                                  n_features=n_features,
                                  n_informative=n_informative,
                                  shuffle=shuffle,
                                  random_state=random_state)

    if dense:
        X = X.toarray()

    if test_size > 0:
        cv = ShuffleSplit(len(y), n_iter=1, random_state=random_state,
                          test_size=test_size, train_size=1-test_size)

        train, test = list(cv)[0]
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        if not dense:
            X_train.sort_indices()
            X_test.sort_indices()
    else:
        X_train, y_train = X, y
        if not dense:
            X_train.sort_indices()
        X_test, y_test = None, None

    # Add noise
    if noise > 0.0:
        generator = check_random_state(random_state)
        y_train += generator.normal(scale=noise * np.std(y_train),
                                    size=y_train.shape)
        y_train = np.maximum(y_train, 0)

    if normalize_x:
        normalizer = Normalizer()
        X_train = normalizer.fit_transform(X_train)
        if X_test is not None:
            X_test = normalizer.transform(X_test)

    if normalize_y:
        scaler = MinMaxScaler()
        y_train = scaler.fit_transform(y_train.reshape(-1, 1)).ravel()
        if y_test is not None:
            y_test = scaler.transform(y_test.reshape(-1, 1)).ravel()

    if X_test is not None:
        return X_train, y_train, X_test, y_test, w
    else:
        return X_train, y_train, w
def get_tf_idf_M(M, tf = ["bin", "raw", "log", "dnorm"], idf = ["c", "smooth", "max", "prob"], norm_samps=False):
    N = len(M)
    if tf == "raw":
        tf_M = np.copy(M) #just the frequency of the word in a text
#    #TODO: check if dnorm is implemented OK
#    elif tf == "dnorm":
#        tf_M = 0.5 + 0.5*(M/(np.amax(M, axis=1).reshape((N,1))))
    if idf == "c":
        idf_v = []
        for i in range(M.shape[1]): #get the number of texts that contain a word words[i]
            idf_v.append(np.count_nonzero(M[:,i])) #count the non zero values in columns of matrix M
        idf_v = np.array(idf_v)
        idf_v = np.log(N/idf_v)
    tf_idf_M = tf_M*idf_v
    if norm_samps:
        normalizer = Normalizer()
        tf_idf_M = normalizer.fit_transform(tf_idf_M)
#    np.savetxt("tf_idf_M_" + str(N) + ".txt", tf_idf_M , fmt="%s")
    return tf_idf_M
Example #42
def lstm_validate(lstm_model, evaluation_dataset, create_confusion_matrix=False, number_of_subframes=0, sample_strategy="random", batch_size=32):
	
	print("evaluate neural network...")
	validation_data = []
	validation_labels = []
	
	accuracy = 0
	n = 0
	idx = 0

	
	for _obj in evaluation_dataset:
		if number_of_subframes > 0:
			validation_data.append(get_buckets(_obj.get_hoj_set(), number_of_subframes, sample_strategy))
		else:
			validation_data.append(_obj.get_hoj_set())
		validation_labels.append(_obj.get_hoj_label()[0])


	# evaluate neural network
	score, acc = lstm_model.evaluate(np.array(validation_data), np.array(validation_labels), batch_size=batch_size, verbose=0)
			
	print("Accuracy:",acc)

	if create_confusion_matrix is True:
		predictions = lstm_model.predict(np.array(validation_data),batch_size = batch_size)
		
		predicted_labels = []
		real_labels = []

		for k in range(len(predictions)):
			predicted_idx = np.argmax(predictions[k])

			label_idx = np.argmax(validation_labels[k])
			
			real_labels.append(label_idx)
			predicted_labels.append(predicted_idx)


		cnf_matrix = confusion_matrix(real_labels, predicted_labels)

		norm = Normalizer()
		cnf_matrix = norm.fit_transform(cnf_matrix)

		return score, acc, cnf_matrix


	return score, acc, None
Example #43
class FFTTransformer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = LinearRegression()
        self.y_mean = None
        self.normalize = Normalizer()

    def fit(self, X, y=None):
        X_train = self.make_waves(X)
        y_train = numpy.array(y)
        self.y_mean = y_train.mean()
        self.model.fit(X_train, y_train - self.y_mean)
        return self

    def make_waves(self, X):
        X = X['times']
        time_scale = numpy.array([(time - X[0]).total_seconds() for time in X]).reshape(-1, 1)
        X_train = [
            numpy.concatenate((
                numpy.pi * 2.0 / (24 * 60 * 60) * delta,
                numpy.pi * 2.0 / (12 * 60 * 60) * delta,
                numpy.pi * 2.0 / (6 * 60 * 60) * delta,

                numpy.pi * 2.0 / (7 * 24 * 60 * 60) * delta,
                numpy.pi * 2.0 / (7.0 / 2 * 24 * 60 * 60) * delta,
                numpy.pi * 2.0 / (7.0 / 3 * 24 * 60 * 60) * delta,

                numpy.pi * 2.0 / (1380500.0) * delta,
                numpy.pi * 2.0 / (1380500.0 / 2) * delta,
                numpy.pi * 2.0 / (1380500.0 / 3) * delta), axis=0)
            for delta in time_scale]

        X_train = numpy.concatenate((numpy.sin(X_train), numpy.cos(X_train)), axis=1)
        return X_train

    def predict(self, X):
        X_test = self.make_waves(X)
        X_test = self.model.predict(X_test) + self.y_mean
        return X_test.reshape(-1, 1)

    def transform(self, X, y=None):
        X_test = self.predict(X)
        X_test = self.normalize.fit_transform(X_test)
        return X_test.reshape(-1, 1)
Example #44
def perform_classification (corpus_dir, extn, embedding_fname, class_labels_fname):
    '''
    Perform classification from subgraph2vec sentence files and subgraph vectors
    :param corpus_dir: folder containing subgraph2vec sentence files
    :param extn: extension of subgraph2vec sentence files
    :param embedding_fname: file containing subgraph vectors in word2vec format (refer Mikolov et al (2013) code)
    :param class_labels_fname: files containing labels of each graph
    :return: None
    '''
    gensim_model = gensim.models.KeyedVectors.load_word2vec_format(fname=embedding_fname)
    logging.info('Loaded gensim model of subgraph vectors')

    subgraph_vocab = sorted(gensim_model.vocab.keys())
    logging.info('Vocab consists of {} subgraph features'.format(len(subgraph_vocab)))

    wlk_files = get_files(corpus_dir, extn)
    logging.info('Loaded {} graph WL kernel files for performing classification'.format(len(wlk_files)))
    c_vectorizer = CountVectorizer(input='filename',
                                   tokenizer=subgraph2vec_tokenizer,
                                   lowercase=False,
                                   vocabulary=subgraph_vocab)
    normalizer = Normalizer()

    X = c_vectorizer.fit_transform(wlk_files)
    X = normalizer.fit_transform(X)
    logging.info('X (sample) matrix shape: {}'.format(X.shape))


    Y = np.array(get_class_labels(wlk_files, class_labels_fname))
    logging.info('Y (label) matrix shape: {}'.format(Y.shape))

    seed = randint(0, 1000)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
    logging.info('Train and Test matrix shapes: {}, {}, {}, {} '.format(X_train.shape, X_test.shape,
                                                                        Y_train.shape, Y_test.shape))

    linear_kernel_svm_classify(X_train, X_test, Y_train, Y_test)

    subgraph_kernel = get_subgraph_kernel (gensim_model, subgraph_vocab)
    deep_kernel_svm_classify (X_train, X_test, Y_train, Y_test, subgraph_kernel)
Example #45
def vectorize(n, comp=0):
    tfv = TfidfVectorizer(min_df=1, strip_accents='unicode', ngram_range=(1,2), stop_words='english',
        sublinear_tf=True, use_idf=True, smooth_idf=True)

    # Fit and transform
    X = tfv.fit_transform(boiler_stream(trainfnm, n))
    lsa = None
    scaler = None
    if comp > 0:
        lsa = TruncatedSVD(comp)
        scaler = Normalizer(copy=False)
        X = lsa.fit_transform(X)
        X = scaler.fit_transform(X)

    # Transform only
    Z = tfv.transform(boiler_stream(testfnm, n))
    if lsa:
        Z = lsa.transform(Z)
        Z = scaler.transform(Z)
    
    np.save(trainvecfnm, X)
    np.save(testvecfnm, Z)
Example #46
def createGraph(similarity, sim_keep_above, graph_rule):
    G = nx.Graph()

    # Default (old version): may cause problems
    if graph_rule == "default":
        similarity = 1.0 - similarity
    elif graph_rule == "minmax":
        mm = MinMaxScaler()
        similarity = mm.fit_transform(1.0 - similarity)
    elif graph_rule == "normalization":
        norm = Normalizer()
        similarity = norm.fit_transform(1.0 - similarity)
    elif graph_rule == "inversed":
        similarity = similarity

    # Remove similarity small than S
    similarity[similarity < (sim_keep_above)] = 0.0
    similarity = (similarity * 10000).astype(int)

    if similarity.shape[0] == 1:
        G.add_node(0)
        return G

    max_size = similarity.shape[0]
    vertice = 0
    for i in range(0, max_size):
        for j in range(i, max_size):
            if not ALLOW_SELF_LOOP:
                if i == j:
                    continue

            v = similarity[i][j]
            # print i,j,v
            if v > 1:
                vertice += 1
                G.add_edge(i, j, weight=v)
    return G
Example #47
import pandas as pd
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import SGDRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error


df = pd.read_csv("forestfires.txt", index_col=False, sep=" ")

X = df.iloc[:,0:-1].values
Y = df.iloc[:,-1].values
normalizer = Normalizer()

X = normalizer.fit_transform(X)
k_fold_cv = KFold(n=Y.shape[0], n_folds=10, shuffle=True)


sgdr = SGDRegressor()

for train_index, test_index in k_fold_cv:
	X_train, X_test = X[train_index], X[test_index]
	Y_train, Y_test = Y[train_index], Y[test_index]
	sgdr.fit(X_train, Y_train)
	pred = sgdr.predict(X_test)
	error = mean_squared_error(Y_test, pred)
	print(error)
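The sklearn.cross_validation module was removed in later scikit-learn releases; a roughly equivalent loop with the current sklearn.model_selection.KFold API (reusing X, Y, sgdr and mean_squared_error from above) would look like this.

from sklearn.model_selection import KFold

k_fold_cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in k_fold_cv.split(X):
	X_train, X_test = X[train_index], X[test_index]
	Y_train, Y_test = Y[train_index], Y[test_index]
	sgdr.fit(X_train, Y_train)
	print(mean_squared_error(Y_test, sgdr.predict(X_test)))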
print(len(newdata))
print(len(newdata[0]))
print(np.shape(newdata))
print("data done")
print("logistic initialized")
# clf.fit(data[:,:-1], data[:,-1])
print("fitted data")
skf = StratifiedKFold(data[:,-1], n_folds=10, shuffle=True)
output =[]
finalscore = 0
counter = 0

for train, test in skf:
	counter = counter + 1
	n = Normalizer()	
	netdata = n.fit_transform([ newdata[i][:-1] for i in train ], [ data[i][-1] for i in train ])
	print(np.shape(netdata))
	clf = GradientBoostingClassifier(warm_start = True, n_estimators = 1500)
	clf = clf.fit( netdata, [ data[i][-1] for i in train ])
	n = Normalizer()
	nowdata = n.fit_transform([ newdata[i][:-1] for i in test ])
	print(np.shape(nowdata))
	prediction = clf.predict(nowdata)
	# pred = []
	# for i in prediction:
	# 	if(i > 1.5):
	# 		pred.append(2)
	# 	else:
	# 		pred.append(1)
	xscore = score.get_score( prediction , [ data[i][-1] for i in test ])
	finalscore = finalscore + xscore
Example #49
    def run(self, working_directory, clustering_algorithm, word2Vec_conf, vector_type):
        vertices_path = working_directory + "vertex.txt"
        seedsMap_path = working_directory + "seedsMap.txt"
        groundTruth =  working_directory + "groundTruth.csv"
        random_walks_path  = working_directory + "sequenceIDs.txt"

        urlsmap = self.get_urlmap(seedsMap_path)
        documents = self.get_content_map(vertices_path)
        groundTruthMap = self.get_content_map(groundTruth)
        random_walks1, random_walks2 = tee(self.get_sequences(random_walks_path))
        #true_labels = np.array([int(groundTruthMap[v]) for v in urlsmap.values()])
        true_labels = [int(groundTruthMap[v]) for v in urlsmap.values()]

        dim_link, dim_content = self.get_dimension_vectors(vector_type)

        embedding_matrix = []
        document_matrix = []
        codes = list(urlsmap.keys())

        if(dim_link>0):
            word2vec = self.runWord2Vec(word2Vec_conf, dim_link)
            word2vec.build_vocab(random_walks1)
            word2vec.train(random_walks2)
            for url in codes:
                embedding = word2vec[url]
                embedding_matrix.append(embedding)

            #Normalize embedding_matrix using L2
            normalizer_embedding = Normalizer(copy=False)
            embedding_matrix = normalizer_embedding.fit_transform(embedding_matrix)
            print("Normalize embedding_matrix, shape: ",embedding_matrix.shape)

        if(dim_content>0):
             for url in codes:
                 document_matrix.append(documents[url])
             content_matrix = self.get_content_matrix(document_matrix, dim_content)

        combined_matrix = []

        if(dim_link>0 and dim_content>0):
            combined_matrix = np.array ([np.concatenate((content_matrix[i], embedding_matrix[i])) for i in range(0, len(content_matrix))])
            print("Combined link and content matrices, shape: ", combined_matrix.shape)
        elif (dim_link>0):
            combined_matrix = embedding_matrix
        else:
            combined_matrix = content_matrix

        #clustering
        if(clustering_algorithm == "KMEANS"):
            num_clusters = len(set(true_labels))
            print("Clustering using KMEANS with num_clusters = ", num_clusters)
            algorithm = KMeans(n_clusters=num_clusters)
        elif (clustering_algorithm == "HDBSCAN"):
            print("Clustering using HDBSCAN with min 5 elements per cluster")
            algorithm = HDBSCAN(min_cluster_size=5)
        else:
            print("ERROR clustering, wrong parameter ", clustering_algorithm)
            sys.exit(2)

        #learned_labels = np.array(map(lambda x: int(x), algorithm.fit_predict(combined_matrix)))
        learned_labels = np.array([int(x) for x in algorithm.fit_predict(combined_matrix.astype(float))])

        #metrics analysis
        filtered_true_labels = []
        filtered_learned_labels = []
        filtered_combined_matrix = []
        for i in range(0, len(true_labels)):
            if (true_labels[i] != -1):
                filtered_true_labels.append(true_labels[i])
                filtered_learned_labels.append(learned_labels[i])
                filtered_combined_matrix.append(combined_matrix[i])
        filtered_true_labels = np.array(filtered_true_labels)
        filtered_learned_labels = np.array(filtered_learned_labels)
        filtered_combined_matrix = np.array(filtered_combined_matrix)

        print("Web pages to analyze: ", len(filtered_learned_labels))
        self.homogeneity = metrics.homogeneity_score(filtered_true_labels, filtered_learned_labels)
        self.completeness = metrics.completeness_score(filtered_true_labels,filtered_learned_labels)
        self.v_measure = metrics.v_measure_score(filtered_true_labels, filtered_learned_labels)
        self.ari = metrics.adjusted_rand_score(filtered_true_labels, filtered_learned_labels)
        self.ami = metrics.adjusted_mutual_info_score(filtered_true_labels, filtered_learned_labels)
        self.silhouette = metrics.silhouette_score(filtered_combined_matrix, filtered_learned_labels, metric='cosine')
        print('\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
        print(self.homogeneity, self.completeness, self.v_measure, self.ari, self.ami, self.silhouette)
        return(filtered_true_labels, filtered_learned_labels)
def load_blood_data(train=True, SEED=97, scale  = False, 
                                         minmax = False,
                                         norm   = False,
                                         nointercept = False,
                                         engineering = False):
    """
    Load training and test datasets
    for DrivenData's Predict Blood Donations warmup contest
    
    The training data is shuffled before it's returned; test data is not
    
    Note: patsy returns float64 data; Theano requires float32 so conversion
          will be required; the y values are converted to int32, so they're OK
    
    Arguments
    ---------
        train (bool) if True
                         y_train, X_train = load_blood_data(train=True, ...
                     if False
                         X_test, IDs = load_blood_data(train=False, ...
                         
        SEED (int)   random seed
        
        scale (bool) if True, scale the data to mean zero, var 1; standard normal
        
        minmax (2-tuple) to scale the data to a specified range, provide a
                         2-tuple (min, max)
                         
        norm (bool)  if True, L2 normalize for distance and similarity measures
        
        nointercept (bool) if True, patsy will not create an intercept
                         
                         
    Usage
    -----
    from load_blood_data import load_blood_data
    """
    from sklearn.utils         import shuffle
    from patsy                 import dmatrices, dmatrix
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import Normalizer
    import numpy  as np
    import pandas as pd
    import re
    
    global scaler
    global minmaxer
    global normalizer
    
    if (scale and minmax): raise ValueError("cannot specify both scale and minmax")
    if (scale and norm):   raise ValueError("cannot specify both scale and norm")
    if (norm  and minmax): raise ValueError("cannot specify both norm and minmax")
        
    if type(train) is not bool: raise ValueError("train must be boolean")
    if type(SEED)  is not int:  raise ValueError("SEED must be int")
    if type(scale) is not bool: raise ValueError("scale must be boolean")
    if type(norm)  is not bool: raise ValueError("norm must be boolean")
    if type(nointercept) is not bool: raise ValueError("nointercept must be boolean")
    if type(engineering) is not bool: raise ValueError("engineering must be boolean")
    
    # ------------- read the file -------------
    
    file_name = '../data/train.csv' if train else '../data/test.csv'
    data = pd.read_csv(file_name)
    
    
    # ------------- shorten the column names -------------
    
    column_names = ['ID','moSinceLast','numDonations','volume','moSinceFirst','donated']
    data.columns = column_names if train else column_names[:-1]
    
    
    # ------------- create new variables -------------
    
    if engineering:
        # Ratio of moSinceLast / moSinceFirst = moRatio
        data['moRatio'] = pd.Series(data.moSinceLast / data.moSinceFirst, index=data.index)
    
        # Ratio of (volume/numDonations) / moSinceFirst = avgDonation
        data['avgDonation'] = pd.Series((data.volume/data.numDonations) / data.moSinceFirst, index=data.index)
    
        # Ratio of moSinceFirst / numDonations = avgWait
        data['avgWait'] = pd.Series(data.moSinceFirst / data.numDonations, index=data.index)

        
    # ------------- scale the data -------------

    # transform data to mean zero, unit variance
    # ==========================================
    if scale:
        if train:
            scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = scaler.fit_transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.ix[:, data.columns.difference(exclude)] = scaler.transform(
                data.ix[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to fit in a range
    # ================================
    if minmax:
        if len(minmax) != 2: raise ValueError("minmax must be a 2-tuple")
        if train:
            minmaxer = MinMaxScaler(feature_range = minmax)
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = minmaxer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = minmaxer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
            
    # transform data to unit vector (L2 norm for distance and similarity)
    # ===================================================================
    if norm:
        if train:
            normalizer = Normalizer(norm='l2', copy=True)
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = normalizer.fit_transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        else:
            exclude = ['ID','donated']
            data.loc[:, data.columns.difference(exclude)] = normalizer.transform(
                data.loc[:, data.columns.difference(exclude)].values.astype(np.float32))
        
        
    # ------------- create the design matrix -------------
        
    # create the datasets with a patsy formula
    formula = 'donated ~ moSinceLast * moSinceFirst +  numDonations + volume'
    
    if engineering:
        formula = formula + ' + moRatio + avgDonation + avgWait'
        
    if nointercept: 
        formula = formula + ' -1'
        
    if not train:
        match = re.search(r"~\s??(.*)", formula)
        if match:
            formula = match.group(1)
        else:
            raise ValueError("Patsy formula {} does not match the expected format".format(formula))
            
            
    # ------------- return the values -------------
            
    if train:
        y_train, X_train = dmatrices(formula, data=data, return_type="dataframe")
        y_train = np.ravel(y_train).astype(np.int32)
        
        X_train, y_train = shuffle(X_train, y_train, random_state=SEED)
        return y_train, X_train
    else:
        X_test = dmatrix(formula, data=data, return_type="dataframe")
        IDs    = data.ID.values
        return X_test, IDs
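A minimal usage sketch for the loader above, assuming it is saved as load_blood_data.py next to a ../data directory holding train.csv and test.csv with the columns named in the code; the SEED and flag values below are illustrative, not the author's settings.

from load_blood_data import load_blood_data

# the training call fits the module-level scaler ...
y_train, X_train = load_blood_data(train=True, SEED=97, scale=True, minmax=None,
                                   norm=False, nointercept=True, engineering=True)

# ... which the test call then reuses through the global
X_test, IDs = load_blood_data(train=False, SEED=97, scale=True, minmax=None,
                              norm=False, nointercept=True, engineering=True)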
Example #51
0
    # descriptors, if they exist
    vectors=None
    norm=None
    if desc:
        def str_column_to_array(df_column):
            lst=[]
            df_column.apply(lambda row: lst.append(np.array([float(elem) for elem in row.strip('[').strip(']').split(",")])))
            return lst

        vectors = str_column_to_array(train_df["desc"])

        norm="l1"
        if norm is not None:
            normalizer = Normalizer(norm)
            vectors = normalizer.fit_transform(vectors)

    logger.debug("Training...")
    if desc:
        # train an ML algorithm on the descriptor vectors
        model = ml().fit(vectors, labels)
    else:
        if args.model != "dnn":
            raise NotImplementedError("non-DNN models are not supported for raw images")
        # training on raw image data only with a DNN
        if args.stats:
            model = ml(args.model_dir).fitdata(train_df["path"].tolist(), labels)
        else:
            model = ml().fitdata(train_df["path"].tolist(), labels)
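A self-contained sketch of the descriptor-parsing and normalization step above, using a made-up two-row DataFrame in place of train_df; the column name and values are illustrative only.

import numpy as np
import pandas as pd
from sklearn.preprocessing import Normalizer

train_df = pd.DataFrame({"desc": ["[1.0, 2.0, 3.0]", "[4.0, 0.5, 0.5]"]})

def str_column_to_array(df_column):
    # parse each "[a, b, c]" string into a float array
    return [np.array([float(elem) for elem in row.strip('[').strip(']').split(",")])
            for row in df_column]

vectors = np.vstack(str_column_to_array(train_df["desc"]))
vectors = Normalizer(norm="l1").fit_transform(vectors)
print(vectors.sum(axis=1))   # each row now sums to 1.0 in absolute value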

Example #52
0
def normalizer(X):
    s = Normalizer(norm='l1')
    return s.fit_transform(X)
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
	"""
	Plot data transformed into two dimensions by the chosen embedding.
	PCA (the default) projects onto a new basis in which the first
	dimension carries the maximal variance and each following dimension
	the maximal remaining variance, which should spread the observed
	n-dimensional data as much as possible. This is unsupervised and
	does not consider the target values.
	"""
	if (scale): 
		scaler = StandardScaler()
		X = scaler.fit_transform(X)

	if (normalize): 
		normalizer = Normalizer(norm='l2')
		X = normalizer.fit_transform(X)
		
	if (embedding == 'pca'):
		pca = PCA(n_components=2)
		X_transformed = pca.fit_transform(X)
	elif (embedding == 'isomap'):
		isomap = Isomap(n_components=2, n_neighbors=20)
		X_transformed = isomap.fit_transform(X)
	elif (embedding == 'lle'):
		lle = LocallyLinearEmbedding(n_components=2, n_neighbors=5)
		X_transformed = lle.fit_transform(X)
	elif (embedding == 'tsne'):
		t_sne = TSNE(n_components=2)
		X_transformed = t_sne.fit_transform(X)
	elif (embedding == 'spectral'):
		se = SpectralEmbedding(n_components=2)
		X_transformed = se.fit_transform(X)
	elif (embedding == 'mds'):
		mds = MDS(n_components=2)
		X_transformed = mds.fit_transform(X)
	elif (embedding == 'gallery'):
		plt.figure(1)
		
		plt.subplot(231)
		plt.title('pca')
		X_t = PCA(n_components=2).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(232)
		plt.title('isomap')
		X_t = Isomap(n_neighbors=20).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(233)
		plt.title('lle')
		X_t = LocallyLinearEmbedding(n_neighbors=20).fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(234)
		plt.title('tsne')
		X_t = TSNE().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(235)
		plt.title('spectral')
		X_t = SpectralEmbedding().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.subplot(236)
		plt.title('mds')
		X_t = MDS().fit_transform(X)
		plt.scatter(X_t[:,0 ], X_t[:, 1], c=y)

		plt.suptitle('Gallery transforms ' + title)

		return plt
	else:
		raise ValueError("Choose between pca, isomap, lle, tsne, spectral, mds and gallery")

	plt.title(title + ' ' + embedding + ' plot')
	sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
	plt.colorbar(sc)
	return plt
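A hedged usage sketch for plot2d, assuming matplotlib.pyplot and the scikit-learn transformers it references (StandardScaler, Normalizer, PCA, Isomap, ...) are already imported as in the snippet above; the iris data is only an illustration.

from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
plt = plot2d(X, y, scale=True, normalize=False, embedding='pca', title='iris')
plt.show()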
Example #54
0
mnb = MultinomialNB()
bnb = BernoulliNB()
knn = KNeighborsClassifier()
rf = RandomForestClassifier(n_estimators=51)
ada = AdaBoostClassifier()

classifiers = [lda,qda,svm,perceptron,gnb,mnb,bnb,knn,rf,ada]
classifier_names = ["LDA","QDA","SVM (RBF)","Perceptron","Gaussian NB","Multinomial NB",\
"Bernoulli NB","KNN (K=5)","Random Forests","Ada Boost"]

index = np.arange(len(classifier_names))  

#Extract the data values into a numpy array and preprocess it

data = dataFrame.values
data_normalized = normalizer.fit_transform(data) 
data_standard = standardscale.fit_transform(data)
data_minmax = minmaxscaler.fit_transform(data) 
preprocess_names = ["Unscaled","Normalized","Standardized","MinMax"] 
preprocessors = [data,data_normalized,data_standard,data_minmax] 
train_labels = class_labels[:128]
test_labels = class_labels[128:]
performance_all_preprocess = list([]) 
count = 0

#Define the recursive feature eliminator (RFE) used to select the best features

def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    
    rfe = RFE(classifier_model,number_of_features)
    transformed_train_data = rfe.fit_transform(train_data,train_labels)
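recursiveFeatureSelector is cut off above; separately, a minimal self-contained RFE sketch (illustrative data and estimator, not the original continuation) shows the usual pattern of fitting the selector on the training split and reusing it on the test split.

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)
X_tr_sel = rfe.fit_transform(X_tr, y_tr)
X_te_sel = rfe.transform(X_te)           # reuse the fitted selector on the test data
print(X_tr_sel.shape, X_te_sel.shape)    # (150, 5) (50, 5)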
Example #55
0
def main(path):
    #tweetdata = loadfiles(path)
    tweetdata = pd.read_csv(path,header=0,dtype=str, names = ['text','lat','lng','class'])[:50000]
    
    #traindata, testdata = train_test_split(tweetdata,test_size=0.3, random_state=50)

    size = len(tweetdata)
    start = 7 * size // 10   # integer index for a 70/30 split

    trainclass = tweetdata['class'][:start]
    testclass = tweetdata['class'][start:]

    vectorizer = TfidfVectorizer(max_df=0.5,min_df=2,
            stop_words='english',use_idf=True,encoding='utf-8',
            decode_error='ignore',lowercase=True)

    norm = Normalizer(copy=False)
    tfids = vectorizer.fit_transform(tweetdata['text'])
    normalized_tfids = norm.fit_transform(tfids)

    ch2 = SelectKBest(chi2, k=1000)
    #normalized_tfids = ch2.fit_transform(normalized_tfids,tweetdata['class'])

    data = pd.DataFrame(normalized_tfids.toarray())
    traindata = data[:start]
    testdata = data[start:]

    traindata = ch2.fit_transform(traindata,trainclass)
    testdata = ch2.transform(testdata)   # reuse the selector fitted on the training data

    #traindata= pd.DataFrame(traindata,columns=['text','lat','lng','class'])
    #testdata = pd.DataFrame(testdata,columns=['text','lat','lng','class'])
        
    
    #tweetdata['location'] = map(reverseGeocode, tweetdata['lat'],tweetdata['lng'])  
#    map(wordsForChiFeatures,tweetdata['text'], tweetdata['location'])    
#    totalCount = sum(j for j in wordLocDict.values() if j>1)
#    for i,j in wordLocDict.items():
#        # change 1 to any value as per requirement
#        if j>5 :
#            tweetdata[str(i)] = map(lambda x,y:assignFeature(x,y,i[0],totalCount),tweetdata['text'],tweetdata['location'])
#    tweetdata.to_csv('liw.csv',header=True, index=False,encoding='utf-8')
        
    #testdata= loadfiles('C:\Users\AravindKumarReddy\Downloads\SMMTest')
    #traindata['location'] = map(mapLocation, traindata['lat'],traindata['lng'])
    #testdata['location'] = map(mapLocation, testdata['lat'],testdata['lng'])
    #train_tfids =  vectorizer.fit_transform(traindata['text'])
    #test_tfids =  vectorizer.fit_transform(testdata['text'])

    #train_tfids = norm.fit_transform(train_tfids)
    #test_tfids = norm.fit_transform(test_tfids)
    #km = KMeans(n_clusters=2000, init='k-means++', max_iter=100, n_init=1)
    #km.fit(traindata[[1,2]])
    #y = traindata['class']

    nb = MultinomialNB(alpha=.1)
    nb.fit(traindata,trainclass)

    predictions = nb.predict(testdata)
    print predictions
    print '================================='
    print testclass
    print accuracy_score(testclass,predictions)
def normalize(features):
    nm = Normalizer()
    min_max_scaler = MinMaxScaler(feature_range=(0, 10))
    features = nm.fit_transform(features)
    return min_max_scaler.fit_transform(features)
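A small check of the normalize() helper above on toy data, assuming Normalizer and MinMaxScaler are imported as elsewhere in the example: each row is first scaled to unit L2 norm, then every column is stretched into [0, 10]. The toy array is illustrative only.

import numpy as np

toy = np.array([[1.0, 2.0],
                [3.0, 4.0],
                [0.5, 0.5]])
scaled = normalize(toy)
print(scaled.min(axis=0), scaled.max(axis=0))   # each column spans 0 .. 10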
Example #57
0
 def normalizer_scale(self, X):
     scaler = Normalizer()
     return scaler.fit_transform(X)
Example #58
0
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import Normalizer

clf = GradientBoostingClassifier(warm_start = True, n_estimators=1000)
print 'clf created'

trainData = np.array([ [ float(x.strip()) for x in line.split(',') ] for line in open('completedData10NN.csv') ])
n = Normalizer(norm = 'l1')
train = n.fit_transform(trainData[:,:-1], trainData[:,-1])
# train = normalize(trainData[:,:-1])
print 'train data read'

clf.fit(train, trainData[:,-1])
print 'clf trained'

testData = [ [ float(x.strip()) for x in line.split(',') ] for line in open('completedTestData10NN.csv') ]
n = Normalizer(norm = 'l1')
newdata = n.fit_transform(testData)
# newdata = normalize(testData)
print np.shape(newdata)
print "test data read"

prediction = clf.predict(newdata)
print "predicted"

f = open('team04_l1_n1000_10NN.txt', 'w')
def num(x):
	if x == 1:
		return 'A'
	elif x == 2:
Example #59
0
 trainingAccuracy = numpy.zeros(folds)
 trainingBaseline = numpy.zeros(folds)
 testingAccuracy = numpy.zeros(folds)
 testingBaseline = numpy.zeros(folds)
 testingDensity = []
 testingF1 = numpy.zeros(folds)
 
 # sys.stdout.write("Query\tFold\tGround-Truth\tPredicted\n")
 
 for i, (train, test) in enumerate(skf):
     vectorizer = CountVectorizer(min_df=1,dtype='double')
     normalizer = Normalizer()
     classifier = LinearSVC(loss='hinge')   # 'l1' was the old alias for the hinge loss
     strawMan = DummyClassifier(strategy='most_frequent')
 
     X = normalizer.fit_transform(vectorizer.fit_transform(examples[train]))
     y = labels[train]
     classifier.fit(X, y)
     strawMan.fit(X, y)
 
     testingDensity.extend(computeDensity(vectorizer, examples[test]))
     trainingAccuracy[i] = predict(classifier,normalizer,vectorizer,examples[train], labels[train])
     trainingBaseline[i] = predict(strawMan,normalizer,vectorizer,examples[train], labels[train])
     testingAccuracy[i] = predict(classifier,normalizer,vectorizer,examples[test], labels[test])
     testingBaseline[i] = predict(strawMan,normalizer,vectorizer,examples[test], labels[test])
     testingF1[i] = predictF1(classifier,normalizer,vectorizer,examples[test], labels[test])
 
 print "Training Accuracy:" + prettyPrint(trainingAccuracy)
 print "Test Accuracy:" + prettyPrint(testingAccuracy)
 print "Training Baseline:" + prettyPrint(trainingBaseline)
 print "Test Baseline:" + prettyPrint(testingBaseline)
Example #60
0
def main():
    # if sys.argv[2] == 'svm':
    #     Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    # elif sys.argv[2] == 'lr':
    #     Clf = LogisticRegression (C=0.1,max_iter=100,n_jobs=8)
    # elif sys.argv[2] == 'pa':
    #     Clf = PassiveAggressiveClassifier(C=0.1,n_iter=1,n_jobs=8,class_weight='balanced')
    # else:
    #     Clf = SGDClassifier(n_iter=1,n_jobs=8,class_weight='balanced')

    # note: each assignment below overrides the previous one, so the
    # grid-searched LogisticRegression is the classifier actually used
    Clf = LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=100)
    Clf = LogisticRegression (C=0.1,max_iter=1000,n_jobs=8,class_weight='balanced')
    Clf = GridSearchCV(LogisticRegression(max_iter=1000,n_jobs=8,class_weight='balanced'), cv=5,
                   param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)
    # Clf = GridSearchCV(LinearSVC(C = 0.1, class_weight = 'balanced',max_iter=1000), cv=3,
    #                param_grid={"C": [0.001,0.01,0.1,1,10,100]},n_jobs=8)

    File = '/home/annamalai/Senti/UCI/amazon_cells_labelled.txt'
    Ngram = 2

    print 'Clf: {}, File: {}, ngram: {}'.format(Clf, File, Ngram)


    PosSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('1')]#[:100]
    NegSamples = [l.split('\t')[0].strip() for l in open (File).xreadlines() if l.strip().endswith('0')]#[:100]
    print 'loaded {} pos and {} neg samples'.format(len(PosSamples), len(NegSamples))
    X = PosSamples + NegSamples
    y = [1 for _ in xrange(len(PosSamples))] + [-1 for _ in xrange (len(NegSamples))]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=random.randint(0,100))
    print '# TrainLabels', len(y_train)
    print '# TestLabels', len(y_test)

    print 'performing CVectorizer'
    CVectorizer = CountVectorizer(lowercase = True,
                                  stop_words='english',
                                  # token_pattern='(?u)\b\w\w+\b',
                                  # tokenizer = SGTokenizer,
                                  tokenizer = Tokenizer,
                                  ngram_range=(1,2),
                                  dtype=np.float64,
                                  decode_error = 'ignore',
                                  max_df=0.8)
    print 'performing TfidfTransformer and Normalizer'
    # TFIDFTransformer = TfidfTransformer()
    normalizer = Normalizer()
    print 'creating Train and Test FVs'
    T0 = time()
    TrainFVs = CVectorizer.fit_transform(X_train)
    TestFVs = CVectorizer.transform(X_test)
    print 'feat ext time', time() - T0

    # TrainFVs = TFIDFTransformer.fit_transform(TrainFVs)
    # TestFVs = TFIDFTransformer.transform(TestFVs)

    TrainFVs = normalizer.fit_transform(TrainFVs)
    TestFVs = normalizer.transform(TestFVs)

    print 'Train/test split'
    print TrainFVs.shape
    print TestFVs.shape
    # raw_input('hit any key...')

    print 'training classifier with train samples shape:', TrainFVs.shape
    T0 = time()
    # memory_dump('before_train_mem.txt')
    Model = Clf.fit (TrainFVs, y_train) # re-train on current training set (daily)
    print 'batch fitted'
    print 'training time', time() - T0
    # memory_dump('after_train_mem.txt')

    print 'testing classifier with test samples shape:', TestFVs.shape
    T0 = time()
    # memory_dump('before_test_mem.txt')
    PredictedLabels = Clf.predict(TestFVs)
    print 'testing time', time() - T0
    # memory_dump('after_test_mem.txt')

    print '*'*100
    print 'classification report'
    print '-'*20
    Accuracy = np.mean(PredictedLabels == y_test)
    print "Test Set Accuracy = ", Accuracy

    print(metrics.classification_report(y_test,
                PredictedLabels, target_names=['Neg', 'Pos']))

    print "Accuracy classification score:", metrics.accuracy_score(y_test, PredictedLabels)
    print "Hamming loss:", metrics.hamming_loss(y_test, PredictedLabels)
    print "Average hinge loss:", metrics.hinge_loss(y_test, PredictedLabels)
    print "Log loss:", metrics.log_loss(y_test, PredictedLabels)
    print "F1 Score:", metrics.f1_score(y_test, PredictedLabels)
    print "Zero-one classification loss:", metrics.zero_one_loss(y_test, PredictedLabels)
    print '*'*100

    Vocab = CVectorizer.get_feature_names()
    # print Vocab[:100]
    # raw_input()
    try:
        FeatureImportances = Clf.coef_[0]
    except:
        FeatureImportances = Clf.best_estimator_.coef_[0]

    print FeatureImportances.shape
    raw_input()
    PosTopFeatureIndices = FeatureImportances.argsort()[-100:][::-1]
    NegTopFeatureIndices = FeatureImportances.argsort()[:100][::-1]
    for PosFIndex, NegFIndex in zip(PosTopFeatureIndices, NegTopFeatureIndices):
                print Vocab[PosFIndex], '+-', Vocab[NegFIndex]


    FeatureImportancesSparseArray = ssp.lil_matrix((TestFVs.shape[1],TestFVs.shape[1]))
    FeatureImportancesSparseArray.setdiag(FeatureImportances)

    AllFVsTimesW = TestFVs*FeatureImportancesSparseArray
    print AllFVsTimesW.shape

    Ind = 0
    for TestFV in TestFVs:
        if PredictedLabels[Ind] != y_test[Ind]:
            Ind += 1
            continue
        if len(X_test[Ind].split()) < 5:
            Ind += 1
            continue
        print 'Sample: {}, actual label: {}'.format(X_test[Ind], y_test[Ind])
        # print TestFV
        # print TestFV.shape
        CurTestFV = np.array(AllFVsTimesW[Ind].toarray())
        CurTestFV = CurTestFV.transpose()
        CurTestFV = CurTestFV.reshape(CurTestFV.shape[0],)
        # print CurTestFV.shape
        # raw_input()
        PosTopFeatureIndices = CurTestFV.argsort()[-2:][::-1]
        NegTopFeatureIndices = CurTestFV.argsort()[:2][::-1]
        PosFeatImps= CurTestFV.argsort()[-2:]
        NegFeatImps = CurTestFV.argsort()[:2]
        Tmp = AllFVsTimesW[Ind].todense()
        Tmp = np.sort(Tmp)
        # print PosTopFeatureIndices, AllFVsTimesW[Ind].todense().argsort(), Tmp
        # print NegTopFeatureIndices, NegFeatImps
        if y_test[Ind] == 1:
            print 'top positive feats:', colored(', '.join(['['+Vocab[PosFIndex]+']' for PosFIndex in PosTopFeatureIndices]), 'green')

        else:
            print 'top negative feats: ', colored(', '.join (['['+Vocab[NegFIndex]+']' for NegFIndex in NegTopFeatureIndices]), 'red')
        Ind += 1
        raw_input()
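The CountVectorizer -> Normalizer -> classifier flow in this example can also be expressed as a scikit-learn Pipeline, which keeps the fit-on-train / transform-on-test discipline in a single object; this is an alternative sketch, not the author's code.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.svm import LinearSVC

pipeline = Pipeline([
    ('vect', CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2))),
    ('norm', Normalizer()),
    ('clf',  LinearSVC(C=0.1, class_weight='balanced')),
])
# pipeline.fit(X_train, y_train)
# print(pipeline.score(X_test, y_test))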