Example #1
def scale(df, scaling=None):
    """Scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to scale
    scaling : 'maxabs', 'minmax', 'std', or None, optional (default None)
        type of scaling to apply
    """

    if scaling is None or scaling.lower() == 'none':
        return df

    df = df.dropna(axis=1, how='any')

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = df.values
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
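A minimal usage sketch for the helper above (not part of the original snippet); it assumes the surrounding module provides pandas and the three scalers via imports like these:

import pandas as pd
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler, StandardScaler

df = pd.DataFrame({'a': [1.0, 2.0, 4.0], 'b': [-10.0, 0.0, 5.0]})
print(scale(df, scaling='maxabs'))  # each column divided by its max absolute value
print(scale(df, scaling='std'))     # zero mean, unit variance per column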
Example #2
def calculate_district_dis(dis_style="euclidean"):
    print(dis_style)
    cal_what_dis = calculate_function[dis_style]

    poi_df = pd.read_csv(os.path.join(DATA_DIR, CONCRETE_DIR, POI_SHEET_DIR, "poi_data.csv"))
    # get all the poi data in dataframe
    districts_poi = poi_df.values[:, 1:]


    scaler = MaxAbsScaler()
    scalered_districts_poi = scaler.fit_transform(districts_poi)

    if dis_style == "canberra":
        scalered_districts_poi = districts_poi

    result = OrderedDict()
    for based_d in range(districts_poi.shape[0]):
        result[based_d + 1] = OrderedDict()
        based_district_poi = scalered_districts_poi[based_d]
        for c_d in range(districts_poi.shape[0]):
            compare_district_poi = scalered_districts_poi[c_d]

            result[based_d + 1][c_d + 1] = cal_what_dis(based_district_poi, compare_district_poi)
        result[based_d + 1] = sorted(result[based_d + 1].items(), key=lambda d:d[1])

    return result
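The lookup `calculate_function[dis_style]` above refers to a mapping defined elsewhere in the original module. A plausible sketch of that dispatch table, built on scipy.spatial.distance (an assumption, not the original code):

from scipy.spatial.distance import euclidean, canberra

# hypothetical dispatch table used by calculate_district_dis
calculate_function = {
    "euclidean": euclidean,
    "canberra": canberra,
}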
Example #3
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df
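Note that sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; on current versions the same column-wise mean imputation can be written with SimpleImputer. A sketch of the drop-in replacement for the two imputer lines above:

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='mean')  # imputes column-wise, like Imputer(axis=0)
mat = imputer.fit_transform(df)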
Example #4
def load_data(shuffle=True, n_cols=None):
    train_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.train.csv')
    test_path = get_p1_file('http://ftp.mcs.anl.gov/pub/candle/public/benchmarks/P1B1/P1B1.test.csv')

    usecols = list(range(n_cols)) if n_cols else None

    df_train = pd.read_csv(train_path, engine='c', usecols=usecols)
    df_test = pd.read_csv(test_path, engine='c', usecols=usecols)

    df_train = df_train.drop('case_id', axis=1).astype(np.float32)
    df_test = df_test.drop('case_id', axis=1).astype(np.float32)

    if shuffle:
        df_train = df_train.sample(frac=1, random_state=seed)
        df_test = df_test.sample(frac=1, random_state=seed)

    X_train = df_train.values
    X_test = df_test.values

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    return X_train, X_test
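The snippet above fits MaxAbsScaler on the training and test matrices jointly. A common alternative (a sketch, not the original code) is to fit on the training split only and reuse the fitted scaler for the test split:

scaler = MaxAbsScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)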
Example #5
def games_price_feature(directory: str,
                        app_index_dict: Dict[str, Any]) -> csr_matrix:
    """
    Get games price feature
    Params:
        directory: raw data path
        app_index_dict: key->appid, value->app's index
    return: 
        price: matrix of price
    """
    games = []  # games list
    for key, value in app_index_dict.items():
        games.insert(value, key)
    games_price = {}  #all games and price
    infp = open(directory, 'r')  #game_price
    in_reader = csv.reader(infp)
    valid_item = 0
    price_array = []
    for line in in_reader:
        if line[1] == 'undefine':
            games_price[line[0]] = 'undefine'
        else:
            price_filter = filter(lambda ch: ch in '0123456789.', line[1])
            price_str = ''.join(list(price_filter))
            games_price[line[0]] = float(price_str)
    for game in games:
        vector = []
        if game in games_price and games_price[game] != 'undefine':
            vector.append(games_price[game])
            valid_item += 1
            price_array.append(vector)
    price_array = np.array(price_array)
    X_scaled = scale(price_array)
    price_array = X_scaled.tolist()
    #if the game does not have price, then add 0
    for i in range(len(games)):
        if games[i] not in games_price or games_price[games[i]] == 'undefine':
            valid_item += 1
            price_array.insert(i, [0.])
    price_array = np.array(price_array)
    price_matrix = csr_matrix(price_array)
    logging.getLogger(__name__).debug('Item with price feature: ' +
                                      str(valid_item))
    logging.getLogger(__name__).debug('price_matrix.shape: ' +
                                      str(price_matrix.shape))
    infp.close()
    scaler = MaxAbsScaler()
    price_matrix = scaler.fit_transform(price_matrix)
    return price_matrix
Example #6
def scaler_dummy(dataset,dataset_test):

    scaler_mm = MinMaxScaler() 
    scaler_ma = MaxAbsScaler()
    scaler_sd = StandardScaler()
    scaler_rb = RobustScaler()

    numerical = list(dataset.columns)
    data_transform_mm = pd.DataFrame(data = dataset)
    data_transform_ma = pd.DataFrame(data = dataset)
    data_transform_sd = pd.DataFrame(data = dataset)
    data_transform_rb = pd.DataFrame(data = dataset)

    data_transform_mm[numerical] = scaler_mm.fit_transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.fit_transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.fit_transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.fit_transform(dataset[numerical])
  #     scaler_mm.fit(dataset[numerical])
  #     scaler_ma.fit(dataset[numerical])
  #     scaler_sd.fit(dataset[numerical])
  #     scaler_rb.fit(dataset[numerical])

    data_transform_mm[numerical] = scaler_mm.transform(dataset[numerical])
    data_transform_ma[numerical] = scaler_ma.transform(dataset[numerical])
    data_transform_sd[numerical] = scaler_sd.transform(dataset[numerical])
    data_transform_rb[numerical] = scaler_rb.transform(dataset[numerical])

    ## get dummies
    features_final_mm = pd.get_dummies(data_transform_mm)
    features_final_ma = pd.get_dummies(data_transform_ma)
    features_final_sd = pd.get_dummies(data_transform_sd)
    features_final_rb = pd.get_dummies(data_transform_rb)

    numerical = list(dataset_test.columns)
    scaler_mm_fitted_test = scaler_mm.transform(dataset_test[numerical])
    scaler_ma_fitted_test = scaler_ma.transform(dataset_test[numerical])
    scaler_sd_fitted_test = scaler_sd.transform(dataset_test[numerical])
    scaler_rb_fitted_test = scaler_rb.transform(dataset_test[numerical])

    scaler_mm_fitted_test = pd.DataFrame(data = scaler_mm_fitted_test,columns=numerical)
    scaler_ma_fitted_test = pd.DataFrame(data = scaler_ma_fitted_test,columns=numerical)
    scaler_sd_fitted_test = pd.DataFrame(data = scaler_sd_fitted_test,columns=numerical)
    scaler_rb_fitted_test = pd.DataFrame(data = scaler_rb_fitted_test,columns=numerical)
    
    features_final_mmt = pd.get_dummies(scaler_mm_fitted_test)
    features_final_mat = pd.get_dummies(scaler_ma_fitted_test)
    features_final_sdt = pd.get_dummies(scaler_sd_fitted_test)
    features_final_rbt = pd.get_dummies(scaler_rb_fitted_test)        
    return features_final_mm, features_final_ma, features_final_sd, features_final_rb, features_final_mmt, features_final_mat, features_final_sdt, features_final_rbt
Example #7
def load_data(train_path, test_path, gParameters):

    print('Loading data...')
    df_train = (pd.read_csv(train_path, header=None).values).astype('float32')
    df_test = (pd.read_csv(test_path, header=None).values).astype('float32')
    print('done')

    print('df_train shape:', df_train.shape)
    print('df_test shape:', df_test.shape)

    seqlen = df_train.shape[1]

    df_y_train = df_train[:, 0].astype('int')
    df_y_test = df_test[:, 0].astype('int')

    # only training set has noise
    Y_train = np_utils.to_categorical(df_y_train, gParameters['classes'])
    Y_test = np_utils.to_categorical(df_y_test, gParameters['classes'])

    df_x_train = df_train[:, 1:seqlen].astype(np.float32)
    df_x_test = df_test[:, 1:seqlen].astype(np.float32)

    X_train = df_x_train
    X_test = df_x_test

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    # TODO: Add better names for the noise booleans, make a feature for both RNA-seq and label noise together
    # check if noise is on (this is for label)
    if gParameters['add_noise']:
        # check if we want noise correlated with a feature
        if gParameters['noise_correlated']:
            Y_train, y_train_noise_gen = candle.label_flip_correlated(
                Y_train, gParameters['label_noise'], X_train,
                gParameters['feature_col'], gParameters['feature_threshold'])
        # else add uncorrelated noise
        else:
            Y_train, y_train_noise_gen = candle.label_flip(
                Y_train, gParameters['label_noise'])
    # check if noise is on for RNA-seq data
    elif gParameters['noise_gaussian']:
        X_train = candle.add_gaussian_noise(X_train, 0, gParameters['std_dev'])

    return X_train, Y_train, X_test, Y_test
Example #8
def features(anime_db):

    #The features we will be using for the system are the genre, type and the ratings
    anime_features = pd.concat([
        anime_db.genre.str.get_dummies(sep=","),
        pd.get_dummies(anime_db['type']), anime_db.rating
    ],
                               axis=1)

    # use MaxAbsScaler to scale the features to [0, 1] while preserving sparsity
    from sklearn.preprocessing import MaxAbsScaler
    max_abs_scaler = MaxAbsScaler()
    anime_features = max_abs_scaler.fit_transform(anime_features)

    return anime_features
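MaxAbsScaler is chosen here because it only divides each column by its maximum absolute value, so sparse inputs stay sparse. A small illustration (not from the original code):

from scipy.sparse import csr_matrix
from sklearn.preprocessing import MaxAbsScaler

X = csr_matrix([[0.0, 2.0], [0.0, -4.0], [1.0, 0.0]])
X_scaled = MaxAbsScaler().fit_transform(X)
print(type(X_scaled))      # still a scipy sparse matrix
print(X_scaled.toarray())  # columns divided by 1.0 and 4.0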
Example #9
def getviz_cosinus(X_train, y_train):
    preprocessing = MaxAbsScaler()
    X_train = preprocessing.fit_transform(X_train)
    reds = y_train == 0
    blues = y_train == 1
    plt.figure()
    kpca = KernelPCA(kernel='cosine', n_components=2, n_jobs=-1)
    X_kpca = kpca.fit_transform(X_train)
    plt.plot(X_kpca[reds, 0], X_kpca[reds, 1], "ro", label='csp-')
    plt.plot(X_kpca[blues, 0], X_kpca[blues, 1], "bo", label='csp+')
    plt.title("Projection by cosine PCA")
    plt.xlabel("1st principal component")
    plt.ylabel("2nd component")
    plt.legend(loc="lower right", prop={'size': 6})
    plt.show()
Example #10
def MaxAbsScaledData(df, colClass):
    # preparing for standardising
    colNames = df.columns.tolist()
    lstClass = df[colClass]
    # normalizing the data
    from sklearn.preprocessing import MaxAbsScaler
    scaler = MaxAbsScaler()
    # fit and transform
    ar = scaler.fit_transform(df)
    # back to a dataframe
    df = pd.DataFrame(data=ar)
    # # change as required
    df.columns = colNames
    df[colClass] = lstClass
    return (df)
Example #11
def preprocess_testing():
    df = corpus_test.copy()
    #print(df)
    #df = df['comments']
    df['comments'] = df['comments'].map(lambda x: preprocess_text(x))
    #df = df.map(lambda x: preprocess_text(x))
    #y_train = df["subreddits"].to_numpy()
    global vectorizer
    x_train = vectorizer.transform(df["comments"])
    all_cols = np.arange(x_train.shape[1])
    global to_delete
    cols_to_keep = np.where(np.logical_not(np.in1d(all_cols, to_delete)))[0]
    x_train = x_train[:, cols_to_keep]
    scalar = MaxAbsScaler()
    x_train = scalar.fit_transform(x_train)
    return x_train
Example #12
def preprocess(df, features):
    df['qt_coligados'].fillna(0, inplace=True)
    df['qt_socios'].fillna(1, inplace=True)
    df['qt_socios_pf'].fillna(1, inplace=True)
    df['qt_socios_pj'].fillna(0, inplace=True)
    df['qt_funcionarios'].fillna(0, inplace=True)
    df['tx_crescimento_12meses'].fillna(0, inplace=True)
    df['tx_crescimento_24meses'].fillna(0, inplace=True)
    df['fl_optante_simei'].fillna('False', inplace=True)
    df['fl_optante_simples'].fillna('False', inplace=True)
    df['nm_meso_regiao'].fillna('OUTROS', inplace=True)
    df['nu_meses_rescencia'].fillna(df['nu_meses_rescencia'].median(),
                                    inplace=True)
    df['vl_faturamento_estimado_aux'].fillna(
        df['vl_faturamento_estimado_aux'].median(), inplace=True)
    df['vl_faturamento_estimado_grupo_aux'].fillna(
        df['vl_faturamento_estimado_aux'].median(), inplace=True)
    df.loc[df['sg_uf_matriz'].isna(),
           'sg_uf_matriz'] = df.loc[df['sg_uf_matriz'].isna(), 'sg_uf']
    df['de_nivel_atividade'].fillna('MUITO BAIXA', inplace=True)
    df['de_saude_tributaria'].fillna('VERMELHO', inplace=True)
    df['idade_media_socios'].fillna(df['idade_media_socios'].median(),
                                    inplace=True)
    df['empsetorcensitariofaixarendapopulacao'].fillna(
        df['empsetorcensitariofaixarendapopulacao'].median(), inplace=True)
    df['porc_st_regular'] = df['qt_socios_st_regular'] / df['qt_socios']
    df['socio_pep'] = 0
    df.loc[df['qt_socios_pep'] > 0, 'socio_pep'] = 1
    df['coligada_exterior'] = 0
    df.loc[df['qt_coligados_exterior'] > 0, 'coligada_exterior'] = 1
    df['porc_socios_pf'] = df['qt_socios_pf'] / df['qt_socios']
    df['porc_socios_pj'] = df['qt_socios_pj'] / df['qt_socios']
    df = df[df['idade_media_socios'] > 0]
    df = df[features]
    df.loc[df['fl_rm'] == 'NAO', 'fl_rm'] = 0
    df.loc[df['fl_rm'] == 'SIM', 'fl_rm'] = 1
    df.loc[:, 'fl_rm'] = pd.to_numeric(df['fl_rm'])
    col_bool = df.dtypes[df.dtypes == 'bool'].index
    for col in col_bool:
        df[col] = df[col].astype(int)
    cat_cols = df.select_dtypes('object').columns
    cat_cols = cat_cols[1:]
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    scaler = MaxAbsScaler()
    df = pd.DataFrame(scaler.fit_transform(df.iloc[:, 1:]))
    df.fillna(0, inplace=True)
    return df
Example #13
class SvrLrR(PredictModel):

    svr = None
    mas = None

    def create_predict_model(self):
        self.svr = SVR(kernel='linear')
        self.mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.mas.fit_transform(X_train)
        self.svr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.mas.transform(X_test)
        return self.svr.predict(X_test)
Example #14
class ScalerOperator(Operator):
    def __init__(self, params=0):
        '''
        :param params: 0 for StandardScaler, 1 for MinMaxScaler, 2 for MaxAbsScaler
        '''
        if params == 0:
            super().__init__(DATA_PERPROCESSING, 'dp_standardscaler', params)
            self.scaler = StandardScaler()
        elif params == 1:
            super().__init__(DATA_PERPROCESSING, 'dp_minmaxscaler', params)
            self.scaler = MinMaxScaler()
        elif params == 2:
            super().__init__(DATA_PERPROCESSING, 'dp_maxabsscaler', params)
            self.scaler = MaxAbsScaler()
        else:
            raise ValueError(
                "Invalid params for ScalerOperator. Expected {0,1,2}")

    def operate(self, dm_list: typing.List, phase='train'):
        # The input of a ScalerOperator is a DataManager
        assert len(dm_list) == 1 and isinstance(dm_list[0], DataManager)
        self.check_phase(phase)

        dm = dm_list[0]
        feature_types = dm.feature_types
        numercial_index = [
            i for i in range(len(feature_types))
            if feature_types[i] == "Float" or feature_types[i] == "Discrete"
        ]

        # Check if there are no numerical features in train_x
        if len(numercial_index) == 0:
            return dm

        if phase == 'train':
            x = dm.train_X
            x[:, numercial_index] = self.scaler.fit_transform(x[:, numercial_index])
            dm.train_X = x
        else:
            x = dm.test_X
            x[:, numercial_index] = self.scaler.transform(x[:, numercial_index])
            dm.test_X = x
        return dm
Example #15
class SgdR(PredictModel):

    sgdr = None
    mas = None

    def create_predict_model(self):
        self.sgdr = SGDRegressor()
        self.mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.mas.fit_transform(X_train)
        self.sgdr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.mas.transform(X_test)
        return self.sgdr.predict(X_test)
Example #16
def scaleData(df):
    numericvars = ['AvgRating', 'TotalReviews', 'DegreeCentrality']
    mms = MinMaxScaler()
    dfnumss = pd.DataFrame(mms.fit_transform(df[numericvars]),
                           columns=['mms_' + x for x in numericvars],
                           index=df.index)
    dfnumss = pd.concat([df, dfnumss], axis=1)
    dfnumss = dfnumss.drop(numericvars, axis=1)

    numericabsvars = ['SalesRank']
    mas = MaxAbsScaler()
    dfnummas = pd.DataFrame(mas.fit_transform(dfnumss[numericabsvars]),
                            columns=['mas_' + x for x in numericabsvars],
                            index=df.index)
    dfnummas = pd.concat([dfnumss, dfnummas], axis=1)
    dfnummas = dfnummas.drop(numericabsvars, axis=1)
    return dfnummas
Example #17
class LinearR(PredictModel):

    lr = None
    x_mas = None

    def create_predict_model(self):
        self.lr = LinearRegression()
        self.x_mas = MaxAbsScaler()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()
        X_train = self.x_mas.fit_transform(X_train)
        self.lr.fit(X_train, y_train)

    def predict(self, X_test):
        X_test = self.x_mas.transform(X_test)
        return self.lr.predict(X_test)
Example #18
def Dados_Balanceados_Separa_Teste_Onehot_Sem_Municipio_Orgao():
    feature_names = Load_Obj('feature_names_onehot_sem_municipio_orgao')
    X_data, y_data = load_svmlight_file(
        'desbalanceado_onehot_sem_municipio_orgao.svm',
        n_features=len(feature_names))  # pylint: disable=unbalanced-tuple-unpacking

    scaler = MaxAbsScaler()
    X_data_fit = scaler.fit_transform(X_data)
    Save_Obj(scaler, 'scaler_onehot_sem_municipio_orgao')

    X_train_cv, X_test, y_train_cv, y_test = train_test_split(
        X_data_fit, y_data, test_size=0.1, random_state=6439, stratify=y_data)

    dump_svmlight_file(X_train_cv, y_train_cv,
                       'treino_desbalanceado_onehot_sem_municipio_orgao.svm')
    dump_svmlight_file(X_test, y_test,
                       'test_desbalanceado_onehot_sem_municipio_orgao.svm')
Example #19
def test_scaled_labeled_method_distances():
    initial_date = create_test_dates()[0]['initial_date']
    final_date = create_test_dates()[0]['final_date']
    method = create_test_methods()[0]
    admissible_test_date = initial_date + ' to ' + final_date
    expected = create_test_data()
    expected = expected[[admissible_test_date]]
    expected.rename(columns={admissible_test_date: method}, inplace=True)
    scaler = MaxAbsScaler()
    expected[method] = scaler.fit_transform(expected[[method]])
    print(
        distance_matrix.scaled_labeled_method_distances(
            create_test_data(), initial_date, final_date, method))
    print(expected)
    assert_frame_equal(
        distance_matrix.scaled_labeled_method_distances(
            create_test_data(), initial_date, final_date, method), expected)
Example #20
def mainModel():

    # Optimize to get ideal parameters for LGBM model
    variables.mlData['paramsOptimizedLGBM'], variables.mlData['apsOptimizedLGBM'] = optimization.optimizeLGBM(settings.LGBMSpace)

    # Build the main LGBM model from the optimized parameters and the TF-IDF settings
    variables.mlData['modelLGBM'], variables.mlData['probLGBM'], variables.mlData['apsLGBM'], variables.mlData['roc_aucLGBM'] = models.lgbmWMetrics(
        variables.mlData['xTrain'], variables.mlData['yTrain'], variables.mlData['xTest'], variables.mlData['yTest'],
        2 ** variables.mlData['paramsOptimizedLGBM'][1],
        variables.mlData['paramsOptimizedLGBM'][0],
        variables.mlData['paramsOptimizedLGBM'][1],
        variables.mlData['paramsOptimizedLGBM'][2],
        variables.mlData['paramsOptimizedLGBM'][3],
        variables.mlData['paramsOptimizedLGBM'][4],
        variables.mlData['paramsOptimizedLGBM'][5])

    # Random Forest
    variables.mlData['modelRF'], variables.mlData['probRF'], variables.mlData['apsRF'], variables.mlData['roc_aucRF'] = models.randomForestWMetrics(variables.mlData['xTrain'], variables.mlData['yTrain'], variables.mlData['xTest'], variables.mlData['yTest'])

    # Scaling
    variables.mlData['scaledXTrain'] = csr_matrix(variables.mlData['xTrain'].copy())
    variables.mlData['scaledXTest'] = csr_matrix(variables.mlData['xTest'].copy())

    scaler = MaxAbsScaler()
    variables.mlData['scaledXTrain'] = scaler.fit_transform(variables.mlData['scaledXTrain'])
    variables.mlData['scaledXTest'] = scaler.transform(variables.mlData['scaledXTest'])

    # Logistic Regression
    variables.mlData['modelLR'], variables.mlData['probLR'], variables.mlData['apsLR'], variables.mlData['roc_aucLR'] = models.logisticRegressionWMetrics(variables.mlData['scaledXTrain'], variables.mlData['yTrain'], variables.mlData['scaledXTest'], variables.mlData['yTest'])

    # Testing the correlation between models
    pd.DataFrame({'RF': variables.mlData['probRF'], 'LBGM': variables.mlData['probLGBM'], 'LR': variables.mlData['probLR']}).corr()

    # Final step: ensembling everything
    p = (variables.mlData['probRF'] + variables.mlData['probLGBM'] + variables.mlData['probLR']) / 3

    # Metrics for testing the ensemble
    aps = average_precision_score(variables.mlData['yTest'], p)
    roc_auc = roc_auc_score(variables.mlData['yTest'], p)

    # Save the models on disk
    jb.dump(variables.mlData['modelRF'], settings.RandomForestPath)
    jb.dump(variables.mlData['modelLGBM'], settings.lightGBMPath)
    jb.dump(variables.mlData['modelLR'], settings.logisticRegressionPath)
    jb.dump(variables.mlData['tFidVec'], settings.VectorizerPath)
Example #21
    def get_gene_count_with_drugs(self, cut_off=None, normalized=True):
        columns = [col for col in self.country_drug_use_df if 'ldu' in col]
        drugs_per_country = dict(
            (self.country_drug_use_df['country_2letter'].ix[i],
             self.country_drug_use_df[columns].ix[i]) for i in range(9))
        sample_country = dict((self.metadata.ix[i]['sample_code'],
                               self.metadata.ix[i]['country'])
                              for i in range(self.metadata.shape[0]))

        X = np.array([
            drugs_per_country[sample_country[int(code)]].values
            for code in self.gene_counts_df.columns[1:]
        ])
        Y = self.gene_counts_df.ix[:, self.gene_counts_df.columns[1:]].T

        if normalized:
            meta = self.metadata.ix[:, ['sample_code', 'norm_Bacteria_pairs']]
            meta = meta.set_index('sample_code')

            # Y = np.array([Y.ix[str(code)].values / meta.ix[int(code)].ix['norm_Bacteria_pairs'] for code in Y.index])
            for code in Y.index:
                Y.ix[code] = Y.ix[code].apply(lambda x: np.divide(
                    float(x), meta.ix[int(code), 'norm_Bacteria_pairs']))

            # Y = Y * 1000000
            scaler = MaxAbsScaler()
            Y = scaler.fit_transform(Y)
        else:
            Y = Y.values

        normalizer = None

        # Y = Y.values
        # if normalized:
        #     normalizer = MaxAbsScaler()
        #     Y = normalizer.fit_transform(Y)
        #     Y = Y * 100
        # print Y.shape

        if cut_off:
            indices = np.where(Y.sum(axis=0) < cut_off)[0]
            X = np.delete(X, indices, axis=0)
            Y = np.delete(Y, indices, axis=0)

        return X, Y, normalizer
Example #22
def init():
    data = pd.read_csv("test_data.csv")
    data['subcategory'] = data["subcategory"].apply(lambda x: x.strip())

    data_features = pd.concat([
        pd.get_dummies(data[["subcategory"]]),
        pd.get_dummies(data[["skill_set"]]),
        pd.get_dummies(data[["liked"]])
    ],
                              axis=1)

    from sklearn.preprocessing import MaxAbsScaler
    max_scaler = MaxAbsScaler()
    data_features = max_scaler.fit_transform(data_features)

    pickle.dump(data, open("list_data.sav", "wb"))

    return data_features
Example #23
 def data_Standardization(self, data, length):
     print('[data processing]start data standardization !')
     # data = np.array(data)
     if self.Standardization:
         # scaler = StandardScaler()
         # trans_data = scaler.fit_transform(data)
         scaler = MaxAbsScaler()
         data = scaler.fit_transform(data)
     else:
         # data = data
         pass
     _data = []
     start = 0
     for l in length:
         _data.append(data[start:(start + l[0])])
         start += l[0]
     print('[data processing]data standardization end !')
     return _data
Example #24
    def transform(self, X):
        """Scale the data.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_timestamps)
            Data to scale.

        Returns
        -------
        X_new : array-like, shape = (n_samples, n_timestamps)
            Scaled data.

        """
        X = check_array(X, dtype='float64')
        scaler = SklearnMaxAbsScaler()
        X_new = scaler.fit_transform(X.T).T
        return X_new
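Because the input is transposed before and after scikit-learn's scaler, this transform rescales each sample (row) by its own maximum absolute value rather than each column. A small sketch of the effect, assuming SklearnMaxAbsScaler is sklearn.preprocessing.MaxAbsScaler:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler as SklearnMaxAbsScaler

X = np.array([[1.0, -2.0, 4.0],
              [3.0, 0.5, -6.0]])
X_new = SklearnMaxAbsScaler().fit_transform(X.T).T
print(X_new)  # row 0 divided by 4.0, row 1 divided by 6.0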
Example #25
def maxabs_scale(dm):
    feature_types = dm.feature_types
    numercial_index = [i for i in range(len(feature_types))
                       if feature_types[i] == "Float" or feature_types[i] == "Discrete"]

    (train_x, _), (valid_x, _), (test_x, _) = dm.get_train(), dm.get_val(), dm.get_test()
    scaler = MaxAbsScaler()

    train_x[:, numercial_index] = scaler.fit_transform(train_x[:, numercial_index])
    dm.train_X = train_x
    if valid_x is not None:
        valid_x[:, numercial_index] = scaler.transform(valid_x[:, numercial_index])
        dm.val_X = valid_x
    if test_x is not None:
        test_x[:, numercial_index] = scaler.transform(test_x[:, numercial_index])
        dm.test_X = test_x

    return dm
Example #26
def f_features(dataframe):
    # Data used to get features
    #gyr = dataframe.iloc[:,9:12].values.transpose()
    #lin = dataframe.iloc[:,12:15].values.transpose()
    eul = dataframe.iloc[:, 15:18].values.transpose()

    # calculate features
    #xy = get_xy_canvas(eul)
    #eul = get_angles_offset(eul)
    #xy_euc = euc_dist(xy[0],xy[1])

    # Features
    # feat_1 : gyr_x
    # feat_2 : gyr_y
    # feat_3 : gyr_z
    # feat_4 : lin_x
    # feat_5 : lin_y
    # feat_6 : lin_z
    # feat_7 : eul_roll
    # feat_8 : eul_yaw
    # feat_9 : eul_pitch
    # feat_10 : x_pos
    # feat_11 : y_pos
    # feat_12 : euc_dist_xy

    # Add features to feature list
    #features = [gyr[0], gyr[1], gyr[2], lin[0], lin[1], lin[2], eul[0], eul[1],
    #            eul[2], xy[0], xy[1], xy_euc]
    #features = [lin[0], lin[1], lin[2]]
    features = [eul[0], eul[1], eul[2]]
    features = np.array(features).transpose()

    # Feature scaling
    #sc = StandardScaler()
    #features = sc.fit_transform(features)

    # Feature scaling maximum absolute value
    ma = MaxAbsScaler()
    features = ma.fit_transform(features)

    # Resampling to 60 (median length in the database)
    features = resample(features, RESAMPLE_VAL)

    return features
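The resample() call above is assumed to come from scipy.signal, which FFT-resamples the feature matrix along axis 0 to a fixed number of rows; RESAMPLE_VAL is presumably 60 per the comment. A sketch of that assumption:

import numpy as np
from scipy.signal import resample  # assumed source of resample()

RESAMPLE_VAL = 60                            # "median length in the database"
features = np.random.rand(97, 3)             # e.g. one recording of roll/yaw/pitch
features = resample(features, RESAMPLE_VAL)  # -> shape (60, 3)
print(features.shape)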
Example #27
def pipeline(coin_complete, sequence_length, model):

    coin_complete = coin_complete.replace([np.inf, -np.inf], np.nan)
    coin_complete = coin_complete.fillna(0)

    # scale data
    max_abs_scaler = MaxAbsScaler()
    temp = max_abs_scaler.fit_transform(coin_complete)
    temp = pd.DataFrame(temp, columns=coin_complete.columns)
    coin_complete = temp.set_index(coin_complete.index)

    x = time_series_to_supervised(coin_complete, sequence_length)

    y_predict = model.predict(x)

    # inverse scaler
    y_predict_inverse = y_predict * max_abs_scaler.scale_[1]

    return y_predict, y_predict_inverse
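The inverse step above relies on MaxAbsScaler exposing the per-column maxima it learned in its scale_ attribute; column index 1 is assumed to be the predicted target. A minimal sketch of that attribute (not part of the original pipeline):

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

s = MaxAbsScaler().fit(np.array([[2.0, 10.0], [-4.0, 5.0]]))
print(s.scale_)           # [ 4. 10.] -- per-column maximum absolute values
print(0.5 * s.scale_[1])  # 5.0 -- undoes the scaling for column 1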
Example #28
def load_data(train_path, test_path, num_classes):
    df_train = (pd.read_csv(train_path, header=None).values).astype('float32')
    df_test = (pd.read_csv(test_path, header=None).values).astype('float32')

    print('df_train shape:', df_train.shape)
    print('df_test shape:', df_test.shape)

    df_y_train = df_train[:, 0].astype('int')
    df_y_test = df_test[:, 0].astype('int')

    Y_train = np_utils.to_categorical(df_y_train, num_classes)
    Y_test = np_utils.to_categorical(df_y_test, num_classes)

    df_x_train = df_train[:, 1:PL].astype(np.float32)
    df_x_test = df_test[:, 1:PL].astype(np.float32)

    X_train = df_x_train
    X_test = df_x_test

    scaler = MaxAbsScaler()
    mat = np.concatenate((X_train, X_test), axis=0)
    mat = scaler.fit_transform(mat)

    X_train = mat[:X_train.shape[0], :]
    X_test = mat[X_train.shape[0]:, :]

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    print('Y_train shape:', Y_train.shape)
    print('Y_test shape:', Y_test.shape)

    x_train_len = X_train.shape[1]

    # this reshaping is critical for the Conv1D to work

    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)

    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)

    return X_train, Y_train, X_test, Y_test
Example #29
def scaled_labeled_method_distances(distance_matrix_df, initial_date, final_date, method):
    """
    Retrieves and scales a column of the input dataframe
    :param distance_matrix_df: A pandas dataframe
    :param final_date: 'YYYY-MM-DD'
    :param initial_date: 'YYYY-MM-DD'
    :param method: method used to compute the distance from the list
    ['cumulative' + GroItem, 'euclidean' + GroItem, 'dtw' + GroItem, 'tsfresh' + GroItem]
    :return: A pandas dataframe
    """
    list_of_methods = ['euclidean', 'cumulative', 'dtw', 'ts-features']
    if method.split('_')[0] not in list_of_methods:
        raise ValueError('Method of calculation unavailable')
    column_name = initial_date + ' to ' + final_date
    ranked_periods_df = pd.DataFrame(distance_matrix_df[column_name])
    scaler = MaxAbsScaler()
    ranked_periods_df.loc[:, column_name] = scaler.fit_transform(ranked_periods_df[[column_name]])
    ranked_periods_df.rename(columns={column_name: method}, inplace=True)
    return ranked_periods_df
Example #30
def f_features(dataframe):
    # Data used to get features
    eul = dataframe.iloc[:, 15:18].values.transpose()

    # calculate features
    xy = get_xy_canvas(eul)

    # Add features to feature list
    features = [xy[0], xy[1]]
    features = np.array(features).transpose()

    # Feature scaling maximum absolute value
    ma = MaxAbsScaler()
    features = ma.fit_transform(features)

    # Resampling to 60 (median length in the database)
    features = resample(features, RESAMPLE_VAL)

    return features
Example #31
def test_maxabs_scaler():
    x = np.array([-2.65395789, -7.97116295, -4.76729177, -0.76885033, -6.45609635])
    y = np.array([-8.9480332, -4.81582449, -3.73537263, -3.46051912,  1.35137275])
    z = np.array([-0.47827432, -2.26208059, -3.75151683, -1.90862151, -1.87541903])
    w = np.zeros_like(x)

    ds = vaex.from_arrays(x=x, y=y, z=z, w=w)
    df = ds.to_pandas_df()

    features = ['x', 'y', 'w']

    scaler_skl = MaxAbsScaler()
    result_skl = scaler_skl.fit_transform(df[features])
    scaler_vaex = vaex.ml.MaxAbsScaler(features=features)
    result_vaex = scaler_vaex.fit_transform(ds)

    assert result_vaex.absmax_scaled_x.values.tolist() == result_skl[:, 0].tolist(), "scikit-learn and vaex results do not match"
    assert result_vaex.absmax_scaled_y.values.tolist() == result_skl[:, 1].tolist(), "scikit-learn and vaex results do not match"
    assert result_vaex.absmax_scaled_w.values.tolist() == result_skl[:, 2].tolist(), "scikit-learn and vaex results do not match"
Example #32
def main():
    datasets = gen_datasets()
    print("origin data:")
    print(datasets)

    # zero mean, unit variance
    standard_scaler = StandardScaler()
    scaler_datasets = standard_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    min_max_scaler = MinMaxScaler()
    scaler_datasets = min_max_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    max_abs_scaler = MaxAbsScaler()
    scaler_datasets = max_abs_scaler.fit_transform(datasets)
    print(scaler_datasets)
    print("-" * 80)

    normalize = Normalizer(norm="l1")
    normalize_datasets = normalize.fit_transform(datasets)
    print(normalize_datasets)
    print("-" * 80)

    binarizer = Binarizer(threshold=1.1)
    binarizer_datasets = binarizer.fit_transform(datasets)
    print(binarizer_datasets)
    print("-" * 80)

    one_hot_encoder = OneHotEncoder()
    one_hot_encoder_datasets = one_hot_encoder.fit_transform([[0, 1, 4],
                                                              [1, 2, 0],
                                                              [2, 3, 5]])
    print(one_hot_encoder_datasets.toarray())
    print("-" * 80)

    imputer = Imputer(missing_values=0, strategy="median")
    imputer_datasets = imputer.fit_transform(datasets)
    print(imputer_datasets)
    print(imputer.statistics_)
Example #33
def second_question(train_data, train_labels, test_data, test_labels):
    """
    Second question:
    :param train_data: the train data
    :param train_labels: the train labels
    :param test_data: the test data
    :param test_labels: the test labels
    :return:
    """
    # prevent wrong flags values
    if (b_2 and (c_2 or d_2)) or (not b_2 and c_2 and d_2):
        raise ValueError(
            'Question 2: you can\'t set more than one value as True for question 2 flags'
        )

    # Load word2vec embeddings file
    words_embeddings = KeyedVectors.load_word2vec_format("wiki.en.100k.vec",
                                                         binary=False)

    # Get train and test data features by word2vec
    X_full_data = get_features(train_data + test_data,
                               train_labels + test_labels, words_embeddings)

    # Normalize full data features
    scaler = MaxAbsScaler()
    X_full_data_maxabs = scaler.fit_transform(X_full_data)

    # Run logistic regression on normalized train data
    model = LogisticRegression()
    model.fit(X_full_data_maxabs[0:len(train_data)], train_labels)

    # Predict using logistic regression on normalized test data
    y_predict = model.predict(
        X_full_data_maxabs[len(train_data):len(X_full_data_maxabs)])

    # Flatten full_test_labels; these are the y (true) labels
    y = np.ravel(test_labels)

    # Get f1_score and accuracy
    f_score = f1_score(y, y_predict, average='macro')
    accuracy = accuracy_score(y, y_predict)
    return accuracy, f_score
Example #34
class SkflowLrR(PredictModel):

    ss = None
    dnn = None
    feature_columns = None

    def input_fn(self, X_train, y_train):
        feature_cols = {
            k: tf.constant(X_train[k].values)
            for k in self.feature_columns
        }
        labels = tf.constant(y_train.values)
        return feature_cols, labels

    def create_predict_model(self):
        self.ss = MaxAbsScaler()
        print()

    def fit(self, X_train, X_valid, y_train, y_valid):
        self.create_predict_model()

        self.feature_columns = X_train.columns
        tf_feature_cols = [
            tf.contrib.layers.real_valued_column(k)
            for k in self.feature_columns
        ]

        ss_X_train = self.ss.fit_transform(X_train)
        ss_X_train = pd.DataFrame(ss_X_train, columns=self.feature_columns)

        self.dnn = LinearRegressor(feature_columns=tf_feature_cols)
        self.dnn.fit(input_fn=lambda: self.input_fn(ss_X_train, y_train),
                     steps=1600)

    def predict(self, X_test):
        X_test = self.ss.transform(X_test)
        X_test_df = pd.DataFrame(X_test, columns=self.feature_columns)
        predict = self.dnn.predict(input_fn=lambda: self.input_fn(
            X_test_df, pd.DataFrame(np.zeros(len(X_test)))),
                                   as_iterable=False)
        return predict
Example #35
def impute_and_scale(df, scaling=None):
    """Impute missing values with mean and scale data included in pandas dataframe.
        
    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default None)
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)
    # print(mat.shape)
    
    if scaling is None:
        return pd.DataFrame(mat, columns=df.columns)

    # Scaling data
    if scaling == 'maxabs':
        # Normalizing -1 to 1
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        # Scaling to [0,1]
        scaler = MinMaxScaler()
    else:
        # Standard normalization
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    # print(mat.shape)
    df = pd.DataFrame(mat, columns=df.columns)
    
    return df
Example #36
from estimators import LSHNearestNeighbors
from preprocessors import text_preprocess


if __name__ == "__main__":
    df = pandas.read_csv("/media/alexander/b32bf4b4-8724-4107-9d19-abf6615c2f60/alexander/HELP_FILE/query.yaHotelId.showInTop.sure.final.tsv", sep="\t")
    print("Изначальная размерность данных:", df.shape,";", "Количество отелей:", len(df["yaHotelId"].unique()))
    sure_df = df[df["sure"]]
    print(sure_df.shape)
    filtered_values = [value[0] for value in sure_df["yaHotelId"].value_counts().items() if value[1] >= 5]
    filtered_df = sure_df[sure_df["yaHotelId"].isin(filtered_values)]
    print("Получившаяся размерность данных:", filtered_df.shape, ";", "Количество отелей:", len(filtered_df["yaHotelId"].unique()))

    vectorizer = TfidfVectorizer(preprocessor=text_preprocess)
    y = np.array(filtered_df["yaHotelId"])
    X = vectorizer.fit_transform(filtered_df["query"])
    print("X shape:", X.shape)

    scaler = MaxAbsScaler()
    X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    clf = LSHNearestNeighbors(n_estimators=10, n_candidates=100, n_neighbors=9, mode="parzen window")
    clf.fit(X_train, y_train)
    t1 = time.time()
    y_pred = clf.predict(X_test)
    t2 = time.time() - t1
    print("delta time:", t2)
    print("mean time for one query:", t2/X_test.shape[0])
    print("accuracy:", accuracy_score(y_test, y_pred))
Example #37
	1066, 1053, 1339, 1040, 497, 253, 1485, 337, 1347, 1343, 122, 980, 87, 126, 528,
	694, 1444, 655, 161, 626, 545, 906, 1235, 684, 263, 69, 882, 1209, 180, 1386,
	1074, 631, 908, 1176, 947, 401, 1085, 1029, 797, 1107, 386, 559, 588, 522, 644,
	614, 1440, 1140, 1267, 1475, 217, 1201, 456, 231, 1079, 1224, 1036, 156, 852, 1384,
	1288, 243, 760, 1071]

# 6. Convert to numpy.array
train_index = np.asarray(A)
test_index = np.asarray(B)

# 7. Split the data
X_train, X_test = raw_X[train_index], raw_X[test_index]
y_train, y_test = raw_y[train_index], raw_y[test_index]

# 8. Normalization
X_train_norm = normalizer.fit_transform(X_train)
X_test_norm = normalizer.transform(X_test)

# 9. Train the algorithm
clf = SVC(kernel='rbf', C=1, gamma=0.5, coef0=0.0)
clf.fit(X_train_norm, y_train)
pred = clf.predict(X_test_norm)
acc = accuracy_score(pred, y_test)

# 10. Result
print("Accuracy:", acc)

# 11. Save the model
with open("model.pickle", "wb") as f:
  pickle.dump((clf, normalizer), f, 2)
Example #38
    model.add(MaxoutDense(100, input_dim=42))
    model.add(Activation('relu'))
    model.add(GaussianNoise(0.00001))
    model.add(Dropout(0.3))

    model.add(MaxoutDense(1, input_dim=100))
    model.add(Activation('sigmoid'))

    #ada = Adagrad(lr=0.001)
    ada = SGD(lr=0.0003, momentum=0.9, decay=0.0001, nesterov=True)
    model.compile(optimizer=ada,
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    scaler = MaxAbsScaler()
    train_train_scaled = scaler.fit_transform(train_train[features])
    train_test_scaled = scaler.transform(train_test[features])

    model.fit(train_train_scaled, train_train.target.values, nb_epoch=150, batch_size=100)

    train_train_pred = model.predict(train_train_scaled, batch_size=100)
    train_test_pred = model.predict(train_test_scaled, batch_size=100)

    train_score = log_loss(train_train.target.values, train_train_pred)
    test_score = log_loss(train_test.target.values, train_test_pred)

    #test_poly = poly.transform(test[features])
    test_scaled = scaler.transform(test[features])
    test_pred = model.predict(test_scaled, batch_size=100)

    ensemble_train.loc[train_test.index, 'nn'] = train_test_pred
Example #39
def main():
    X, y = get_data('../../data/train.csv')
    sclr = MaxAbsScaler()
    X = sclr.fit_transform(X)

    # pickle.dump(sclr, open('./dumps/scaler_pickle', 'wb+'))
    X_test, y_test = get_data('../../data/val.csv')
    X_test = sclr.transform(X_test)
    X_fin, y_fin = get_data('../../data/test.csv')
    X_fin = sclr.transform(X_fin)
    other, yo = get_data('../../data/other.csv')
    other = sclr.transform(other)

    lin = linear_model.LogisticRegression(
        C=10000,
    )
    # selector = RFE(lin, 21, step=1)
    # selector.fit(X, y)
    # X = selector.transform(X)
    # X_test = selector.transform(X_test)
    # X_fin = selector.transform(X_fin)
    # for i in range(len(selector.support_)):
    #     print i+1, selector.support_[i]

    lin.fit(X, y)
    # pickle.dump(lin, open('./dumps/lin_reg_pickle', 'wb+'))
    x1 = lin.predict_proba(X)
    x1_test = lin.predict_proba(X_test)
    # x1_fin = lin.predict_proba(X_fin)
    # o1 = lin.predict_proba(other)
    print('lin')
    print(metrics.classification_report(y, lin.predict(X)))
    print(metrics.classification_report(y_test, lin.predict(X_test)))
    print(metrics.classification_report(y_fin, lin.predict(X_fin)))
    roc = lin.predict_proba(X_fin)
    # r = lin.predict(X_test)
    # l1 = []
    # l2 = []
    # for i in range(len(roc)):
    #     if max(roc[i]) > 0.5:
    #         l1.append(y_fin[i])
    #         l2.append(r[i])
    # print 'dsfasdfasd'
    # print metrics.classification_report(l1, l2)
    # return

    fpr_grd0, tpr_grd0, _ = metrics.roc_curve(y_fin, roc[:, 0], pos_label=0)
    fpr_grd1, tpr_grd1, _ = metrics.roc_curve(y_fin, roc[:, 1], pos_label=1)
    fpr_grd2, tpr_grd2, _ = metrics.roc_curve(y_fin, roc[:, 2], pos_label=2)
    plt.plot(fpr_grd0, tpr_grd0, label='NRP')
    plt.plot(fpr_grd1, tpr_grd1, label='RiPP')
    plt.plot(fpr_grd2, tpr_grd2, label='Polyketide')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
    # print lin.coef_

    # print sum(lin.predict_proba(X_test)[0])
    svm_model = SVC(
        C=5000,
        # kernel='linear',
        # degree=2,
        coef0=100,
        # probability=True,
        # shrinking=True,
        # class_weight='balanced',
        probability=True,
        # decision_function_shape='ovr'
    )
    svm_model.fit(X, y)
    x2 = svm_model.predict_proba(X)
    x2_test = svm_model.predict_proba(X_test)
    x2_fin = svm_model.predict_proba(X_fin)
    o2 = svm_model.predict_proba(other)
    print('svm')
    print(metrics.classification_report(y, svm_model.predict(X)))
    print(metrics.classification_report(y_test, svm_model.predict(X_test)))