Example #1
def norm_data(norm_num, X_train_or, X_test_or):
    if norm_num == 0:
        X_tr_norm = X_train_or
        X_ts_norm = X_test_or
    elif norm_num == 1:
        scaler = preprocessing.Normalizer().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 2:
        scaler = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 3:
        scaler = preprocessing.MinMaxScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 4:
        scaler = preprocessing.MaxAbsScaler().fit(X_train_or)
        X_tr_norm = scaler.transform(X_train_or)
        X_ts_norm = scaler.transform(X_test_or)
    elif norm_num == 5:
        scaler1 = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm1 = scaler1.transform(X_train_or)
        X_ts_norm1 = scaler1.transform(X_test_or)
        scaler2 = preprocessing.MinMaxScaler().fit(X_tr_norm1)
        X_tr_norm = scaler2.transform(X_tr_norm1)
        X_ts_norm = scaler2.transform(X_ts_norm1)
    elif norm_num == 6:
        scaler1 = preprocessing.StandardScaler().fit(X_train_or)
        X_tr_norm1 = scaler1.transform(X_train_or)
        X_ts_norm1 = scaler1.transform(X_test_or)
        scaler2 = preprocessing.MaxAbsScaler().fit(X_tr_norm1)
        X_tr_norm = scaler2.transform(X_tr_norm1)
        X_ts_norm = scaler2.transform(X_ts_norm1)
    else:
        raise ValueError('norm_num must be an integer in the range 0-6')
    return X_tr_norm, X_ts_norm
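A minimal usage sketch for norm_data (assuming numpy and `from sklearn import preprocessing` are in scope; the arrays are made up for illustration):

import numpy as np
from sklearn import preprocessing

X_train = np.array([[1.0, -2.0], [3.0, 4.0], [5.0, -6.0]])
X_test = np.array([[2.0, 2.0]])

# norm_num=4 selects MaxAbsScaler: fitted on the training set, applied to both sets
X_tr_norm, X_ts_norm = norm_data(4, X_train, X_test)
print(np.abs(X_tr_norm).max(axis=0))  # [1. 1.] -- each column's max absolute value is now 1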
Example #2
def init_pp(ppi, raw_data):
    # Initialize list of scaler objects
    if ppi['name'] == 'MinMax':
        pp = [preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0)),  # temp
              preprocessing.MinMaxScaler(feature_range=(-1.0, 1.0))]  # humid.
    elif ppi['name'] == 'MaxAbs':
        pp = [preprocessing.MaxAbsScaler(),  # for temperature
              preprocessing.MaxAbsScaler()]  # and humidity
    elif ppi['name'] == 'StandardScaler':
        pp = [preprocessing.StandardScaler(),  # for temperature
              preprocessing.StandardScaler()]  # and humidity
    elif ppi['name'] == 'RobustScaler':
        pp = [preprocessing.RobustScaler(),  # for temperature
              preprocessing.RobustScaler()]  # and humidity
    elif ppi['name'] == 'SimpleY':
        pp = [10./1., 10./2.5]  # for temperature and humidity
    else:
        raise ValueError('Incorrect scaler name')
    # Initialize scalers with data
    if ppi['method'] == 'individually':
        pp[0].fit(unpack(raw_data, 'T'))
        pp[1].fit(unpack(raw_data, 'q'))
    elif ppi['method'] == 'alltogether':
        pp[0].fit(np.reshape(unpack(raw_data, 'T'), (-1, 1)))
        pp[1].fit(np.reshape(unpack(raw_data, 'q'), (-1, 1)))
    elif ppi['method'] == 'qTindividually':
        if ppi['name'] != 'SimpleY':
            pp = pp[0]
            pp.fit(raw_data)
    else:
        raise ValueError('Incorrect scaler method')
    return pp
Example #3
    def loading_matrices(self):
        # user feature
        df_rf = pd.read_csv(self.data_dir + 'rf_all_more.csv')
        # movie feature
        df_cf = pd.read_csv(self.data_dir + 'cf_all_more.csv')

        print("Completed loading the data")
        print("df_rf.shape:  ", df_rf.shape)
        print("df_cf.shape: ", df_cf.shape)
        print("#")

        org_u = df_rf.to_numpy()
        org_v = df_cf.to_numpy()
        print("orgU.shape: ", org_u.shape)
        print("orgV.shape: ", org_v.shape)
        print("#")

        u_scaler = preprocessing.MaxAbsScaler()
        self.U = u_scaler.fit_transform(org_u)
        v_scaler = preprocessing.MaxAbsScaler()
        self.V = v_scaler.fit_transform(org_v)

        print("U.shape: ", self.U.shape)
        print("V.shape: ", self.V.shape)
        print("#")
        print("U: ", np.min(self.U), ", ", np.max(self.U), ", ",
              np.median(self.U))
        print("V: ", np.min(self.V), ", ", np.max(self.V), ", ",
              np.median(self.V))
        print("#")
Example #4
def convert_tfidf(conf):
    """
    提取特征:
    :param filename:
    :return:
    """
    import jieba
    labels = ['人类作者', '自动摘要', '机器作者', '机器翻译']
    label_dict = dict([(label, i) for i, label in enumerate(labels)])
    print('start extract tfidf')
    texts = []
    texts_cut = []
    with open(conf.filename, encoding='utf-8') as fin:
        for line in fin:
            sample = json.loads(line.strip())
            if conf.has_label:
                label = sample['标签']
            else:
                label = labels[0]
            text = sample['内容']
            if label in label_dict:
                texts.append(text)
                texts_cut.append(' '.join(jieba.cut(text)))
    gram_low = 1
    gram_high = 6
    max_feature = 10000
    if conf.new_tfidf:
        vectorizer_char = TfidfVectorizer(encoding="utf8",
                                          analyzer='char',
                                          ngram_range=(gram_low, gram_high),
                                          max_features=max_feature)
        scaler = preprocessing.MaxAbsScaler()
        vectorizer_char.fit(texts)
        x = vectorizer_char.transform(texts)
        scaler.fit(x)
        x = scaler.transform(x)

        vectorizer_term = TfidfVectorizer(encoding="utf8",
                                          analyzer='word',
                                          ngram_range=(gram_low, gram_high),
                                          max_features=max_feature)
        scaler_term = preprocessing.MaxAbsScaler()
        vectorizer_term.fit(texts_cut)
        x_term = vectorizer_term.transform(texts_cut)
        scaler_term.fit(x_term)
        x_term = scaler_term.transform(x_term)

        pickle_dump((vectorizer_char, scaler, vectorizer_term, scaler_term),
                    conf.tfidf_helper)
    else:
        vectorizer_char, scaler, vectorizer_term, scaler_term = pickle_load(
            conf.tfidf_helper)
        x = vectorizer_char.transform(texts)
        x = scaler.transform(x)
        x_term = vectorizer_term.transform(texts_cut)
        x_term = scaler_term.transform(x_term)
    pickle_dump((x, x_term), conf.tfidf_file)
Example #5
def normaldata(xtrold, xteold):
    Xtrain = xtrold.reshape(60000, 784).astype('float32')
    Xtest = xteold.reshape(10000, 784).astype('float32')
    # Fit the scaler on the training data only, then apply it to both sets
    scaler = preprocessing.MaxAbsScaler().fit(Xtrain)
    xtrnew = scaler.transform(Xtrain)
    xtenew = scaler.transform(Xtest)  # standardize
#    xtrnew = Xtrain/255
#    Xtenew = Xtest/255  # plain rescaling; the effect is the same
    return xtrnew, xtenew
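As the commented-out lines suggest, MaxAbsScaler and a plain division by 255 coincide for non-negative pixel data whenever every column actually reaches 255; a small illustrative check (the toy array is made up):

import numpy as np
from sklearn import preprocessing

pixels = np.array([[0., 128., 255.],
                   [255., 255., 0.]], dtype='float32')
scaled = preprocessing.MaxAbsScaler().fit_transform(pixels)
np.testing.assert_allclose(scaled, pixels / 255.0)  # identical: each column's max abs value is 255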
Example #6
    def preprocessing_DX_DSIFF(self, validation_percentage, name_to_save):
        # scaling and division between validation and training sets
        for i in range(len(self.DX)):
            if i == 0:
                DXp = self.DX[i]
                Ft = self.ftot_stru[i]
            else:
                DXp = np.concatenate((DXp, self.DX[i]), axis=0)
                Ft = np.concatenate((Ft, self.ftot_stru[i]), axis=0)
        DXx = DXp[:, :, 0]
        DXy = DXp[:, :, 1]
        DXz = DXp[:, :, 2]
        Fx = Ft[:, 0]
        Fy = Ft[:, 1]
        Fz = Ft[:, 2]
        scaler = preprocessing.MaxAbsScaler()
        DXx_scaled = scaler.fit_transform(DXx)
        filename = '%s/scaler_Fx.sav' % name_to_save
        joblib.dump(scaler, filename)

        scaler = preprocessing.MaxAbsScaler()
        DXy_scaled = scaler.fit_transform(DXy)
        filename = '%s/scaler_Fy.sav' % name_to_save
        joblib.dump(scaler, filename)

        scaler = preprocessing.MaxAbsScaler()
        DXz_scaled = scaler.fit_transform(DXz)
        filename = '%s/scaler_Fz.sav' % name_to_save
        joblib.dump(scaler, filename)

        mixer = np.array(range(DXx_scaled.shape[0]))
        for _ in range(1000):
            np.random.shuffle(mixer)
        n = int(len(mixer) *
                (1.0 - validation_percentage / 100.0))  # marking the 90%
        self.DXx_trai = DXx_scaled[mixer[:n]]
        self.DXx_vali = DXx_scaled[mixer[n:]]
        #DXx x component of nabla(X)
        #DXx x(numb_of_atoms = numb_of_struc*numb_atoms_in_struc, numb_of_features)
        self.Fx_trai = Fx[mixer[:n]]
        self.Fx_vali = Fx[mixer[n:]]
        #Fx x component of force 1-d array
        #Fx (numb_of_atoms = numb_of_struc*numb_atoms_in_struc)
        self.DXy_trai = DXy_scaled[mixer[:n]]
        self.DXy_vali = DXy_scaled[mixer[n:]]
        self.Fy_trai = Fy[mixer[:n]]
        self.Fy_vali = Fy[mixer[n:]]
        self.DXz_trai = DXz_scaled[mixer[:n]]
        self.DXz_vali = DXz_scaled[mixer[n:]]
        self.Fz_trai = Fz[mixer[:n]]
        self.Fz_vali = Fz[mixer[n:]]
        return None
Example #7
    def train(self, verbose=0, sigma=0, seed=23, transform=False):
        """
        Compiles the model, prints a summary, fits to data
        The boolean transform rescales the data if True,
        and uses raw data otherwise (the default).

        The input sigma controls the noise for the train/val inputs
        """
        # load data and targets
        Phi_train, theta_Phi_train = deepcopy(self.train_data)
        Phi_val, theta_Phi_val = deepcopy(self.val_data)

        # add noise
        Phi_train, train_noise = tools.add_noise(Phi_train, sigma, seed=2)
        Phi_val, val_noise = tools.add_noise(Phi_val, sigma, seed=3)

        self.transformed = transform
        if transform:
            # transform train and val inputs
            Phi_train_tformer = preprocessing.MaxAbsScaler()
            Phi_val_tformer = preprocessing.MaxAbsScaler()
            Phi_train = Phi_train_tformer.fit_transform(Phi_train)
            Phi_val = Phi_val_tformer.fit_transform(Phi_val)

            # transform train and val targets
            theta_Phi_train_tformer = preprocessing.MaxAbsScaler()
            theta_Phi_val_tformer = preprocessing.MaxAbsScaler()
            theta_Phi_train = theta_Phi_train_tformer.fit_transform(
                theta_Phi_train)
            theta_Phi_val = theta_Phi_val_tformer.fit_transform(theta_Phi_val)

        # compile and print summary
        set_seed(seed)
        self.build_model()
        self.model.summary()

        # make callbacks and fit model
        callbacks = self.get_callbacks()
        self.model.fit(x=Phi_train,
                       y=theta_Phi_train,
                       validation_data=(Phi_val, theta_Phi_val),
                       batch_size=self.batch_size,
                       epochs=self.epochs,
                       callbacks=callbacks,
                       verbose=verbose)
        print('test mse:',
              self.model.evaluate(self.test_data[0], self.test_data[1]))
        print('test thetas:', self.model.predict(self.test_data[0]))
Example #8
 def PreprocessData(self, Preprocess='AbsMax'):  # default scaler is MaxAbs
     assert Preprocess in [
         'AbsMax', 'MinMax'
     ], "%r is not a registered preprocess method" % Preprocess
     print('The preprocess method is', Preprocess)
     if Preprocess == 'AbsMax':
         scaler = preprocessing.MaxAbsScaler()
     elif Preprocess == 'MinMax':
         scaler = preprocessing.MinMaxScaler()
     if self.IfSplitData:
         # Fit the scaler on the training split only, then reuse it for validation
         self.TRAIN_DATA_all = scaler.fit_transform(self.TRAIN_DATA_all)
         self.VAL_DATA_all = scaler.transform(self.VAL_DATA_all)
     else:
         self.DATA_all = scaler.fit_transform(self.DATA_all)
         self.LABEL_all = scaler.fit_transform(self.LABEL_all)
     print('.\n..\n...\nPreprocess is done')
Example #9
def preprocess(preprocesstype, var):
    # preprocesstype: selects the preprocessing type for the model: "MMS" for MinMaxScaler,
    # "RS" for RobustScaler, "SS" for StandardScaler, "MAS" for MaxAbsScaler
    # var: the np.array the scaling is applied to

    from sklearn import preprocessing

    if preprocesstype == "MMS":
        print("preprocessing is done with MinMaxScaler")
        X = preprocessing.MinMaxScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "RS":
        print("preprocessing is done with RobustScaler")
        X = preprocessing.RobustScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "SS":
        print("preprocessing is done with StandardScaler")
        X = preprocessing.StandardScaler()
        var = X.fit_transform(var)
        return var
    elif preprocesstype == "MAS":
        print("preprocessing is done with MaxAbsScaler")
        X = preprocessing.MaxAbsScaler()
        var = X.fit_transform(var)
        return var
    else:
        print("Preprocessing type not recognized")
Example #10
  def __init__(self):
      
      super().__init__()
 
      self.name          = 'Max-abs Scaler'
      self.model         = preprocessing.MaxAbsScaler()
      self.takes_label   = False
Example #11
    def test_maxAbsScaler(self):
        data = np.random.normal(10, 3, size=100)
        data = np.array([data]).T

        maxabs_scaler = preprocessing.MaxAbsScaler()

        self.scaler2dict2scaler_test(maxabs_scaler, data)
Example #12
def data_precess(train_data, yinzi=[], pre_style='max_min'):
    import sklearn.preprocessing as spp

    train_data.fillna(0, inplace=True)
    train_data = train_data[train_data['平均月收益'] != 0].copy()
    train_data0 = pd.DataFrame()

    # Data preprocessing
    if pre_style == 'max_min':
        train_data0 = spp.MinMaxScaler().fit_transform(train_data[yinzi])

    elif pre_style == 'max_abs':
        train_data0 = spp.MaxAbsScaler().fit_transform(train_data[yinzi])

    elif pre_style == 'standar':
        train_data0 = spp.StandardScaler().fit_transform(train_data[yinzi])

    elif pre_style == 'normal':
        train_data0 = spp.Normalizer().fit_transform(train_data[yinzi])


    train_data0 = pd.DataFrame(train_data0, columns=yinzi, index=train_data.index)
    train_data0.loc[:, '预测周期真实收益'] = pd.Series(train_data['预测周期真实收益'])

    return train_data0
Example #13
def news20(data_path=home+'/datasets/news20/news20.binary'):
  try:
    open(data_path, 'r')
  except FileNotFoundError as e:
    print(str(e))
    print("Download news20.binary from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html")
    return None, None, None, None

  X, Y = datasets.load_svmlight_file(data_path)


  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,
      random_state=42)

  # normalization makes learning very slow (due to lack of sparsity)
  s = preprocessing.MaxAbsScaler()
  X_train = s.fit_transform(X_train)
  X_test  = s.transform(X_test)

  X_train = preprocessing.normalize(X_train)
  X_test = preprocessing.normalize(X_test)

  X_train = X_train.toarray()
  X_test = X_test.toarray()

  return X_train, Y_train, X_test, Y_test
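MaxAbsScaler is a common choice for sparse libsvm data like this because it neither shifts nor centers the features, so the input stays sparse; a small illustrative sketch of that property (toy matrix made up):

import numpy as np
from scipy import sparse
from sklearn import preprocessing

X = sparse.csr_matrix(np.array([[0., 2., 0.],
                                [4., 0., -1.]]))
X_scaled = preprocessing.MaxAbsScaler().fit_transform(X)
print(sparse.issparse(X_scaled), X_scaled.nnz)  # True 3 -- still sparse, same non-zeros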
Example #14
def normalX(trainFile):
    from sklearn.datasets import load_svmlight_file
    x_train, y_train = load_svmlight_file(trainFile)

    print(x_train.shape, y_train.shape)

    from collections import Counter
    Y_statis = sorted(Counter(y_train).items())
    print(Y_statis, len(Y_statis))
    json.dump(Y_statis,
              open(DATAPATH + '/Y_statis.txt', 'w+', encoding='utf-8'),
              ensure_ascii=False)

    from sklearn import preprocessing
    max_abs_scaler = preprocessing.MaxAbsScaler()
    x_train = max_abs_scaler.fit_transform(x_train)
    pickle.dump(max_abs_scaler, open(MODELPATH + '/MaxAbsScaler.pickle', 'wb'),
                -1)

    from sklearn.model_selection import train_test_split
    X_train, x_test, Y_train, y_test = train_test_split(x_train,
                                                        y_train,
                                                        test_size=0.2)
    from sklearn.datasets import dump_svmlight_file
    dump_svmlight_file(x_test, y_test, DATAPATH + '/Normal_valiation.libsvm')
    dump_svmlight_file(x_train, y_train, DATAPATH + '/Normal_total.libsvm')
    dump_svmlight_file(X_train, Y_train, DATAPATH + '/Normal_train.libsvm')
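Since normalX pickles the fitted scaler, a later script could reuse it on new data. A hedged sketch, assuming MODELPATH is the same path used above and x_new is a feature matrix with the same columns as the training data (x_new is hypothetical):

import pickle

with open(MODELPATH + '/MaxAbsScaler.pickle', 'rb') as fin:
    max_abs_scaler = pickle.load(fin)
x_new_scaled = max_abs_scaler.transform(x_new)  # reuse the scaler fitted in normalX()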
Example #15
def get_matrix_of_concatenated_document_embeddings(embeddings, n_dim, texts, token_limit=20, stop_words=[''], scale=False):
    """

    :param embeddings:
    :param n_dim:
    :param texts:
    :param n_tokens:
    :param stop_words:
    :param scale:
    :return:
    """

    scaler = preprocessing.MaxAbsScaler()
    # scaler = preprocessing.MinMaxScaler()
    tokenizer = WordPunctTokenizer()

    matrix = np.zeros((len(texts), token_limit*n_dim))
    for i_texts in range(0, len(texts)):
        tokens = tokenizer.tokenize(texts[i_texts])
        tmp = []
        for i_token in range(0, token_limit):
            cur_embedding = [0] * n_dim
            # if text still has tokens left, the current token is in the embeddings, and it is not on the stop word list
            if i_token < len(tokens) and tokens[i_token] in embeddings.keys() and not tokens[i_token] in stop_words:
                tmp_embedding = scaler.fit_transform(embeddings[tokens[i_token]]) if scale else embeddings[tokens[i_token]]
                cur_embedding = tmp_embedding.tolist()
            tmp += cur_embedding

        matrix[i_texts] = np.array(tmp)

    return matrix
Example #16
def cadata(data_path=home+'/datasets/cadata/cadata'):
  """Reported performance:
    http://www.jmlr.org/papers/volume18/15-025/15-025.pdf
    http://www.stat.cmu.edu/~cshalizi/350/hw/solutions/solutions-06.pdf
  """

  try:
    open(data_path, 'r')
  except FileNotFoundError as e:
    print(str(e))
    print("Download cadata from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html")
    return None, None, None, None

  X, Y = datasets.load_svmlight_file(data_path)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,
      random_state=42)

  X_train = X_train.toarray()
  X_test = X_test.toarray()

  y_m = np.mean(Y_train)
  y_s = np.std(Y_train)

  Y_train = (Y_train-y_m)
  Y_test = (Y_test-y_m)

  s = preprocessing.MaxAbsScaler()
  X_train = s.fit_transform(X_train)
  X_test  = s.transform(X_test)

  X_train = preprocessing.normalize(X_train)
  X_test = preprocessing.normalize(X_test)

  return X_train, Y_train, X_test, Y_test
Example #17
def abalone(data_path=home+'/datasets/abalone/abalone'):

  try:
    open(data_path, 'r')
  except FileNotFoundError as e:
    print(str(e))
    print("Download abalone from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html")
    return None, None, None, None

  X, Y = datasets.load_svmlight_file(data_path)
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,
      random_state=42)

  X_train = X_train.toarray()
  X_test = X_test.toarray()

  y_m = np.mean(Y_train)
  y_s = np.std(Y_train)

  Y_train = (Y_train-y_m)
  Y_test = (Y_test-y_m)

  s = preprocessing.MaxAbsScaler()
  X_train = s.fit_transform(X_train)
  X_test  = s.transform(X_test)

  X_train = preprocessing.normalize(X_train)
  X_test = preprocessing.normalize(X_test)

  return X_train, Y_train, X_test, Y_test
Example #18
def compute_dimentionality_reduction_embedding(x, y):

    print('\nComputing dimensionality reduction embedding.\n')

    perplexity = 20
    n_components = 3

    embedding = manifold.TSNE(n_components=n_components,
                              perplexity=perplexity,
                              init='pca',
                              random_state=0)
    x_tSNE = embedding.fit_transform(x)

    # Normalize data.

    max_abs_scaler = preprocessing.MaxAbsScaler()
    x_tSNE_normalized = max_abs_scaler.fit_transform(x_tSNE)

    if n_components == 2:
        plot_data_2d(x_tSNE_normalized,
                     y,
                     markersize=2,
                     alpha=1.0,
                     auto_limit_enabled=False)

    elif n_components == 3:
        plot_data_3d(x_tSNE_normalized, y)

    return x_tSNE_normalized
Example #19
    def classifyTest(self):
        #y, x = svm_read_problem('Default+Up+Down_30')
        #means, stdevs = self.calcMeansStdevs(x)
        #m = svm_train(y[:90], x[:90], '-s 0 -t 1')
        x_train, y_train = load_svmlight_file('Default+Up+Down+Left+Right')
        scaler = preprocessing.MaxAbsScaler()
        x_scaled = scaler.fit_transform(x_train)

        clf = svm.SVC(kernel='poly')
        clf.fit(x_scaled, y_train)

        for i in xrange(200):
            x = [0] * np.shape(x_train)[1]  #dict()
            count = 0
            for j in xrange(8 - 1, -1, -1):
                for k in xrange(j - 1, -1, -1):
                    self.connManager.connectElectrodes(j, k)
                    impedance = self.doFreqSweep()
                    x[count] = impedance
                    count += 1

            numBasicFeats = 28
            for j in xrange(numBasicFeats):
                for k in xrange(j - 1):
                    diff = abs(x[j] - x[k])
                    x[count] = diff
                    count += 1

            x_s = scaler.transform([x])  # the scaler expects a 2-D array, so wrap the sample
            #print x_s
            print clf.predict(x_s)
            #p_labs, p_acc, p_vals = svm_predict([0],[x],m)
            #print p_labs
Example #20
def mushrooms(data_path=home+'/datasets/mushrooms/mushrooms'):
  try:
    open(data_path, 'r')
  except FileNotFoundError as e:
    print(str(e))
    print("Download phishing from https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/mushrooms")
    return None, None, None, None

  X, Y = datasets.load_svmlight_file(data_path)

  X = X.toarray()

  Y[Y == 2] = -1

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,
      random_state=42)

  s = preprocessing.MaxAbsScaler()
  X_train = s.fit_transform(X_train)
  X_test  = s.transform(X_test)

  X_train = preprocessing.normalize(X_train)
  X_test = preprocessing.normalize(X_test)

  return X_train, Y_train, X_test, Y_test
Example #21
def get_fitness(genes, texts, labels):
    # calc_text_score is expected to populate features[idx] for each text
    features = np.zeros((len(labels), 1))
    acc = 0
    pool = mp.Pool(mp.cpu_count() - 2)
    for idx, text in enumerate(texts):
        pool.apply_async(calc_text_score, (genes, text, features, idx))
    pool.close()
    pool.join()
    # calculate accuracy with 5-fold cross-validation
    max_abs_scaler = preprocessing.MaxAbsScaler()
    scaled_train_data = max_abs_scaler.fit_transform(features)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    missfits = []
    for train, test in skf.split(features, labels):
        clf = CalibratedClassifierCV(OneVsRestClassifier(SVC(C=1)))
        train_labels = [l for idx, l in enumerate(labels) if idx in train]
        test_labels = [l for idx, l in enumerate(labels) if idx in test]
        clf.fit(scaled_train_data[train], train_labels)
        predictions = clf.predict(scaled_train_data[test])
        proba = clf.predict_proba(scaled_train_data[test])
        # Reject option (used in open-set cases)
        for i, p in enumerate(predictions):
            sproba = sorted(proba[i], reverse=True)
            if sproba[0] - sproba[1] < 0.1:
                predictions[i] = u'<UNK>'
        acc += np.mean(predictions == np.asarray(test_labels)) / skf.n_splits
        missfits.extend([test[i] for i, p in enumerate(predictions) if p != test_labels[i]])
    return acc, missfits
Example #22
def scale_x(feature_files):
    max_abs_scaler = preprocessing.MaxAbsScaler()
    for f in feature_files:
        x, y = datasets.load_svmlight_file(f)
        x_scale = np.round(max_abs_scaler.fit_transform(x), 4)
        datasets.dump_svmlight_file(x_scale, y, f)
        print str(f) + " finished."
Example #23
 def __init__(self):
     super().__init__('./feature_save/user_features_{}_{}.pkl')
     self.shop_to_index = None
     self.user_counter = None
     self.total_counter = None
     self.norm = 1
     self._scaler = preprocessing.MaxAbsScaler()
Example #24
def normalize(matrix):
    """Matrix normalization
    
    Parameters
    ----------
    matrix: numpy matrix
        
    Returns
    -------
    X_scale_maxabs: numpy matrix
        rescaled matrix
    
    """
    # For details in this normalization, see: 
    # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html
    # Scale each feature by its maximum absolute value.
    # This estimator scales and translates each feature individually such
    # that the maximal absolute value of each feature in the training set will be 1.0. 
    # It does not shift/center the data, and thus does not destroy any sparsity.


    # This is the data I want to scale
    X_scale = np.copy(matrix)
    # This is the one I can use for the HCP
    
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_scale_maxabs = max_abs_scaler.fit_transform(X_scale)
   
    return X_scale_maxabs
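A tiny illustrative check of the behaviour described in the comments above: each column is divided by its maximum absolute value, and the data is not shifted or centered (toy values):

import numpy as np
from sklearn import preprocessing

M = np.array([[ 1., -10.],
              [-2.,   5.]])
print(preprocessing.MaxAbsScaler().fit_transform(M))
# [[ 0.5 -1. ]
#  [-1.   0.5]]  -- columns rescaled, column means still non-zero (no centering)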
Example #25
def preprocessed_data(train_path='data/d_train_20180102.csv', test_path='data/d_test_B_20180128.csv'):
    """
    :return: the preprocessed train and test data
    """
    train, testB = raw_data(train_path,test_path)
    train_id = train.id.values.copy()
    feature_columns = [f for f in train.columns if f not in ['id', '血糖']]

    test_idB = testB.id.values.copy()
    data = pd.concat([train,   testB])

    # Category mapping
    data['性别'] = data['性别'].map({'男': 1, '女': 0})
    # Date mapping
    data['体检日期'] = pd.to_datetime(data['体检日期']).apply(lambda a: a.dayofyear)

    # Handle missing values
    data.fillna(data.median(axis=0), inplace=True)

    # Normalization
    scaler = preprocessing.MaxAbsScaler()
    data[feature_columns] = scaler.fit_transform(data[feature_columns])

    train_feat = pd.DataFrame(data[data.id.isin(train_id)])

    test_featB = pd.DataFrame(data[data.id.isin(test_idB)])
    test_featB.drop(labels=['血糖'], axis=1, inplace=True)

    return train_feat,  test_featB
Example #26
    def bin_benchmark(train,
                      test,
                      test_labels,
                      lambdav=3,
                      normalize=True,
                      lap_vec=None,
                      bins_size=range(2, 10),
                      k=20):
        if normalize:
            max_abs_scaler = preprocessing.MaxAbsScaler()
            standard_scaler = preprocessing.StandardScaler(with_std=False)
            train = max_abs_scaler.fit_transform(
                standard_scaler.fit_transform(train))
            test = max_abs_scaler.transform(standard_scaler.transform(test))

        aurocs = np.zeros(len(bins_size))
        data_dim = train.shape[1]
        for i, b in enumerate(bins_size):
            bins = [b] * data_dim
            loop_wdbc = Loop(train,
                             lambdav=lambdav,
                             k=k,
                             lap_vec=lap_vec,
                             bins=bins)
            aurocs[i] = sklearn.metrics.roc_auc_score(
                test_labels, loop_wdbc.query_loop(test))
        return aurocs
Example #27
def get_scaler(scale_method='StandardScaler'):
    """
  Get different kinds of scalers from scikit-learn

  :param scale_method: scale method
  :returns: scaler instance
  :raises: none
  """
    scaler = None

    if scale_method == 'StandardScaler':
        scaler = preprocessing.StandardScaler()

    elif scale_method == 'MinMaxScaler':
        scaler = preprocessing.MinMaxScaler()

    elif scale_method == 'MaxAbsScaler':
        scaler = preprocessing.MaxAbsScaler()

    elif scale_method == 'RobustScaler':
        scaler = preprocessing.RobustScaler()

    elif scale_method == 'QuantileTransformer':
        scaler = preprocessing.QuantileTransformer()

    elif scale_method == 'Normalizer':
        scaler = preprocessing.Normalizer()

    elif scale_method == 'PowerTransformer':
        scaler = preprocessing.PowerTransformer()

    else:
        print(scale_method, ' not found')

    return scaler
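A minimal usage sketch for get_scaler (the array is made up; preprocessing is assumed to be imported from sklearn as in the snippet above):

import numpy as np

X = np.array([[1.0, -4.0], [2.0, 8.0]])
scaler = get_scaler('MaxAbsScaler')
print(scaler.fit_transform(X))  # each feature divided by its maximum absolute value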
Example #28
    def benchmark(train,
                  test,
                  test_labels,
                  ks,
                  lambdav=3,
                  normalize=True,
                  lap_vec=None,
                  bins=None):
        if normalize:
            max_abs_scaler = preprocessing.MaxAbsScaler()
            standard_scaler = preprocessing.StandardScaler(with_std=False)
            train = max_abs_scaler.fit_transform(
                standard_scaler.fit_transform(train))
            test = max_abs_scaler.transform(standard_scaler.transform(test))

        aurocs = np.zeros(len(ks))
        for i, k in enumerate(ks):
            loop_wdbc = Loop(train,
                             lambdav=lambdav,
                             k=k,
                             lap_vec=lap_vec,
                             bins=bins)
            aurocs[i] = sklearn.metrics.roc_auc_score(
                test_labels, loop_wdbc.query_loop(test))
        return aurocs
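The normalization used in benchmark (mean-centering with StandardScaler(with_std=False) followed by MaxAbsScaler) could also be expressed as a pipeline, so that both steps are fit on the training data only and reused on the test data; a sketch under that assumption with made-up arrays:

import numpy as np
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(preprocessing.StandardScaler(with_std=False),
                     preprocessing.MaxAbsScaler())
train = np.array([[1., 10.], [3., 30.], [5., 20.]])
test = np.array([[2., 25.]])
train_scaled = pipe.fit_transform(train)  # fit both scalers on the training data only
test_scaled = pipe.transform(test)        # reuse the fitted pipeline on the test data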
Example #29
def demo():
    print("Loading model...")
    model = torch.load(MODEL_PATH, map_location="cpu")['model']
    scaler = preprocessing.MaxAbsScaler()
    mix = np.zeros(64000, dtype=np.float32)
    mix_r = np.zeros(64000, dtype=np.float32)

    names = []
    for i in range(4):
        data, data_r, name = collect_data(i)
        mix += data
        mix_r += data_r
        names.append(name[:-1])
    mix = scaler.fit_transform(mix.reshape(-1, 1)).T
    mix_r = scaler.fit_transform(mix_r.reshape(-1, 1)).T
    print("The mixture is stored as mix.wav")
    sf.write('./mix.wav', mix[0, :], samplerate=16000)
    mix = torch.tensor(mix).unsqueeze(0)
    mix_r = torch.tensor(mix_r).unsqueeze(0)

    print("Processing...")
    features, features_ = model(mix_r, mix)
    distances = []
    for j in range(4):
        d = (features_[j] - features[j]).pow(2).sum(1)
        distances.append(d.item())
    # features_r, features = model([mix, mix_r])
    print("----------------------")
    print("The anomalys score for pump, slider, fan and valve are:")
    print(distances)
    print("The sources are:...")
    print(names)
Example #30
def svmTestBySample(dataset,
                    descriptor,
                    space,
                    channel,
                    illuminant="IIC",
                    testFolds=[5]):
    nameSpace, nameChannel = sc.getSpaceChannelName(space, channel)
    tt = descriptor.upper()
    # Loading Test Data
    fd = ""
    for i in testFolds:
        fd = fd + str(i) + "-"
    fd = fd[:-1]
    outfile = "../training-test-files/" + tt + "-" + illuminant + "-" + nameSpace + "-" + nameChannel + "/" + dataset + "-SVM-test-folds-" + fd
    ft, lb = readTrainingTestFiles(outfile)
    testMatrixF = np.array(ft)
    testMatrixL = np.array(lb)

    # Scale test features
    #testMatrixFScaled = preprocessing.scale(testMatrixF)

    # Scale features to [-1, 1]
    max_abs_scaler = preprocessing.MaxAbsScaler()
    testMatrixFScaled = max_abs_scaler.fit_transform(testMatrixF)

    npath = "../models/" + tt + "-" + illuminant + "-" + nameSpace + "-" + nameChannel + "/"
    modelName = npath + "model-" + dataset + "-" + tt + "-" + illuminant + "-" + nameSpace + "-" + nameChannel + ".pkl"
    clf = joblib.load(modelName)
    outLabels = clf.predict(testMatrixFScaled)
    scores = clf.score(testMatrixFScaled, testMatrixL)
    return (outLabels, scores)