def fit(self, dframe):
        """
        Fit label encoder to pandas columns.

        Access individual column classes via indexing `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`
        """
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
            for idx, column in enumerate(self.columns):
                # fit LabelEncoder to get `classes_` for the column
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
                # append this column's encoder
                self.all_encoders_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape, dtype=object)
            # also allocate the encoder container so the assignment below does not fail
            self.all_encoders_ = np.ndarray(shape=self.columns.shape, dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                self.all_classes_[idx] = (column, np.array(le.classes_.tolist(), dtype=object))
                self.all_encoders_[idx] = le
        return self
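A minimal usage sketch for the `fit` method above, assuming it belongs to a hypothetical `MultiColumnLabelEncoder` class whose constructor stores `columns` as a NumPy array (only `fit` is shown in the snippet):

import numpy as np
import pandas as pd

df = pd.DataFrame({"city": ["ams", "ber", "ams"], "color": ["red", "blue", "red"]})
mcle = MultiColumnLabelEncoder(columns=np.array(["city", "color"]))  # assumed constructor
mcle.fit(df)

print(mcle.all_classes_[0])                       # ('city', array(['ams', 'ber'], dtype=object))
print(mcle.all_encoders_[1].transform(["red"]))   # array([1])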
Example #2
def encode(df, dump=fromPickle):
    """
    Takes in: dataframe from clean_col
    
    Returns: a dataframe that LabelEncodes the categorical variables
    """
    encoders=dict()
    for col in lblColumns:
        if col not in final_cols:
            continue
        le = LabelEncoder()
        if dump:
            fName="%s/%s.npy"%(modelPath,col)
            if os.path.isfile(fName):
                le.classes_=np.load(fName)
            else:
                le.fit(df[col])
                np.save(fName, le.classes_)
        else:
            le.fit(df[col])
        encoders[col]=le
        df[col] = le.transform(df[col])
    # Order columns with logprice as the last column
    df = df[final_cols]
    df = df.reset_index().drop('index', axis = 1)
    return df
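The `dump` branch above persists only the encoder's `classes_` to a `.npy` file instead of pickling the whole object. A self-contained sketch of that pattern, with a made-up path and toy values:

import os
import numpy as np
from sklearn.preprocessing import LabelEncoder

classes_path = "models/color.npy"   # hypothetical location
values = ["red", "blue", "red", "green"]

le = LabelEncoder()
if os.path.isfile(classes_path):
    # restore a previously fitted encoder by reloading its classes_
    le.classes_ = np.load(classes_path, allow_pickle=True)
else:
    le.fit(values)
    os.makedirs(os.path.dirname(classes_path), exist_ok=True)
    np.save(classes_path, le.classes_)

print(le.transform(values))   # array([2, 0, 2, 1])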
def test_vote_soft():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = list(zip(*[item for p in [bow_probs,da_probs] for item in p]))  # list() so it can be indexed on Python 3
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    #train_attr = reduce(lambda a,b:a+b,train_probs)
    from functools import reduce  # reduce is no longer a builtin on Python 3
    test_attr = reduce(lambda a,b:a+b,test_probs)

    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
Example #4
def customEncode(df):
    global labelencoder
    le = LabelEncoder()
    le.fit(df['OutcomeType'])
    df['OutcomeType'] = le.transform(df['OutcomeType'])
    labelencoder = le
    return df
def test_hard_vote():
    X,y,test_X,test_Y =get_test_data()

    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X,y,test_X,prefix="t")

    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X,y,test_X,prefix="t")

    probs = list(zip(*[item for p in [bow_probs,da_probs] for item in p]))  # list() so it can be indexed on Python 3
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]),dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i]= max(set(votes),key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)

    print(metrics.accuracy_score(test_Y,pred))
Example #6
def clean_device(df:pd.DataFrame):
    df['device_type'] = df['device_type'].map(lambda x: str(x).lower())

    unique = np.unique(df.device_type.values)
    max_num_devices = len(unique)
    print("Unique Devices : ", unique)
    print("Num Unique Devices : ", max_num_devices)
    print()

    devices = df['device_type'].values

    if os.path.exists('data/devices.pkl'):
        with open('data/devices.pkl', 'rb') as f:
            encoder = pickle.load(f)

    else:
        encoder = LabelEncoder()
        encoder.fit(devices)

        with open('data/devices.pkl', 'wb') as f:
            pickle.dump(encoder, f)

    # encode the values
    devices = encoder.transform(devices)

    return df, devices, max_num_devices
Example #7
	def to_numeric(self, columns=[]):
		le = LabelEncoder()
		for i, c in enumerate(columns):
			le.fit(self.M[:, c])
			self.M[:, c] = le.transform(self.M[:, c])
		self.M = self.M.astype(float)  # np.float was removed in NumPy 1.24
		return self
Example #8
    def convert_columns_to_binary(self):
        """Converts all columns with two elements into a binary column.
        """

        # creating panel
        panel = pd.concat([self.train, self.test], ignore_index=True)

        change = False

        # converting two-element columns to binary column
        for colname in self.train.columns:
            if len(np.unique(self.train[colname].values.astype("str"))) == 2:
                if not all(np.unique(self.train[colname].values.astype("str")) == ["0","1"]):
                    label = LabelEncoder()
                    label.fit(list(panel[colname].values.astype("str")))
                    panel[colname] = label.transform(list(panel[colname].values.astype("str")))

                    change = True
                    print("Column %s converted to binary" % (colname))

        if not change:
            print("\nNo binary columns in data")

        self.train, self.test = panel.loc[0:len(self.train)-1,], panel.loc[len(self.train):len(panel)-1,]

        print("")
Example #9
def clean_country(df:pd.DataFrame):
    df['country'] = df['country'].map(lambda x: str(x).upper())

    unique = np.unique(df.country.values)
    max_num_countries = len(unique)
    print("Unique Countries : ", unique)
    print("Num Unique Countries : ", max_num_countries)
    print()

    countries = df['country'].values

    if os.path.exists('data/countries.pkl'):
        with open('data/countries.pkl', 'rb') as f:
            encoder = pickle.load(f)

    else:
        encoder = LabelEncoder()
        encoder.fit(countries)

        with open('data/countries.pkl', 'wb') as f:
            pickle.dump(encoder, f)

    # encode the values
    countries = encoder.transform(countries)

    return df, countries, max_num_countries
Example #10
def select_with_forest(X, y, n_trees=10, treshold=0.01):
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import ExtraTreesClassifier
    import pandas as pd
    import numpy as np
    # encode labels (str -> int):
    le = LabelEncoder()
    X = X.copy()
    for col in X.columns:
        le.fit(X[col].unique())
        X[col] = le.transform(X[col])
    # train the classifier:
    forest = ExtraTreesClassifier(criterion="entropy", n_estimators=n_trees)
    forest.fit(X, y)
    print('number of selected features: ', np.sum(forest.feature_importances_ >= treshold))
    # select important features:
    importances = pd.DataFrame()
    importances['predictor name'] = X.columns.tolist()
    importances['importance'] = forest.feature_importances_
    importances = importances.sort_values(by='importance', ascending=False)
    #X2 = forest.transform(X, treshold)
    #labels2 = X.columns[list(forest.feature_importances_>=treshold)]
    #X2 = pd.DataFrame(X2)
    #X2.columns = labels2
    return importances #X2
Example #11
class OutputLabelColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it to integer labels.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label and encoding
        '''
        handle_none = list(map(str, X))
        self._labeler.fit(handle_none)
        return self

    def transform(self, X):
        '''
        Transform a column of data into one hot encodings.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        return self._labeler.transform(handle_none).astype(np.int32)
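A brief usage sketch for `OutputLabelColumn` on a single pandas Series; the `str` mapping in `fit` and `transform` is what lets it tolerate `None` values:

import pandas as pd

col = pd.Series(["cat", None, "dog", "cat"])
enc = OutputLabelColumn()
enc.fit(col)
print(enc.transform(col))   # array([1, 0, 2, 1], dtype=int32) -- 'None' becomes its own label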
Example #12
def make_encoder(test, train):
	'''
	creates a general label encoder for every
	unique value of all categorical variables
	'''
	# load in the whole dataset
	# df1 = pd.read_csv('ssvout/0.ssv', na_values='None')
	# df1 = df1.append(pd.read_csv('ssvout/5000.ssv', na_values='None'), ignore_index = True)
	# df1 = df1.append(pd.read_csv('ssvout/10000.ssv', na_values='None'), ignore_index = True)
	# df1 = df1.append(pd.read_csv('ssvout/15000.ssv', na_values='None'), ignore_index = True)
	# df1 = df1.append(pd.read_csv('ssvout/20000.ssv', na_values='None'), ignore_index = True)
	# scrub the data
	# df1 = scrub(df1)
	# initialize the encoder and a list to store all categorical values
	df1 = pd.concat([train, test], ignore_index=True)  # combine train and test so the encoder sees every category
	encoder = LabelEncoder()
	values = []
	cols = []
	for col in df1:
		cols.append(col)
		for value in df1[col]:
			values.append(value)
	encoder.fit(values)
	with open('./encoder.pkl', 'wb') as f:  # pickle needs a binary-mode file handle
		pickle.dump(encoder, f)
Example #13
    def fit(self, X, y=None):
        if self.categorical: # Need to one hot encode labels
            label_encoder = LabelEncoder()
            one_hot = OneHotEncoder()
            label_encoder.fit(y)
            one_hot.fit(list(map(lambda x:[x],label_encoder.transform(y))))
            self.stack_encoder = lambda x: one_hot.transform(list(map(lambda x:[x],label_encoder.transform(x)))).toarray()

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-self.hold_out_percent)

        predictions = []
        for (name, clf) in self.base_classifiers:
            print("Ensemble currently fitting:",name)
            clf.fit(X_train,y_train)
            if self.categorical:
                predictions.append(self.stack_encoder(clf.predict(X_test)))
            else:
                predictions.append(list(map(lambda x:[x], clf.predict(X_test))))

        predictions = np.hstack(predictions)

        print("Fitting stack classifier")
        self.stack_classifier[1].fit(predictions,y_test)

        if self.refit_base:
            for (name, clf) in self.base_classifiers:
                print("Ensemble currently refitting:",name)
                clf.fit(X,y)

        return self
def encode_categorical(data_dir,traindata,testdata, parameters):
    '''
    Function to encode the categorical features as numerical features using a LabelEncoder.
    It is important to encode the test and train files in the same manner.
    '''
    #Unless pre-picked parameters to work on are given we work on these parameters.
    if parameters == None:
        names_parameters = ['funder', 'installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward',
                            'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit',
                            'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management',
                            'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group',
                            'quantity', 'quantity_group', 'source', 'source_type', 'source_class',
                            'waterpoint_type', 'waterpoint_type_group']
    else:
        names_parameters=parameters
    #Get the shape of the data
    (testrows, testcolumns)=testdata.shape
    (trainrows, traincolumns)=traindata.shape

    #For all of the specified features:
    for feature in names_parameters:
        #We initialize an encoder.
        le = LabelEncoder()
        #We add the test and train data together
        fitdata=np.append(traindata[feature].values,testdata[feature].values)
        #Because the mapping to numbers has to be the same in both, thus we fit them together.
        le.fit(fitdata)
        #We however want to work with the train and test data separate so we transform them separately
        train_cat=le.transform(traindata[feature])
        test_cat = le.transform(testdata[feature])
        #We initialize empty arrays for the train and test data and store these to the relevant files.
        newtestdata=np.zeros((testrows,1), dtype=int)
        newtestdata[:, 0]=np.array(test_cat)
        store_data(newtestdata, train=False,labels=(feature +'_num'), one=True)
        newtraindata=np.zeros((trainrows,1), dtype=int)
        newtraindata[:, 0]=np.array(train_cat)
        store_data(newtraindata, train=True,labels=(feature +'_num'), one=True)
Example #15
def preprocess():
    train = pd.read_csv("../data/train.csv")
    test = pd.read_csv("../data/test.csv")

    train['Date'] = pd.to_datetime(pd.Series(train['Original_Quote_Date']))
    train = train.drop('Original_Quote_Date', axis=1)

    test['Date'] = pd.to_datetime(pd.Series(test['Original_Quote_Date']))
    test = test.drop('Original_Quote_Date', axis=1)

    train['Year'] = train['Date'].apply(lambda x: x.year)
    train['Month'] = train['Date'].apply(lambda x: x.month)
    train['weekday'] = train['Date'].apply(lambda x: x.dayofweek)

    test['Year'] = test['Date'].apply(lambda x: x.year)
    test['Month'] = test['Date'].apply(lambda x: x.month)
    test['weekday'] = test['Date'].apply(lambda x: x.dayofweek)

    train = train.drop('Date', axis=1)
    test = test.drop('Date', axis=1)

    for f in train.columns:
        if train[f].dtype == 'object':
            lbl = LabelEncoder()
            # watch how to handle missing value labeling
            lbl.fit(list(train[f].values) + list(test[f].values))
            train[f] = lbl.transform(list(train[f].values))
            test[f] = lbl.transform(list(test[f].values))

    train = train.fillna(-1)
    test = test.fillna(-1)

    return train, test
Example #16
def main():
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    enc = LabelEncoder()
    joined = pd.concat((train['Product_Info_2'],
                        test['Product_Info_2']), axis=0)
    enc.fit(joined)
    train['Product_Info_2'] = enc.transform(train['Product_Info_2'])
    test['Product_Info_2'] = enc.transform(test['Product_Info_2'])


    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05,
                           n_estimators=200,
                           subsample=0.5,
                           max_depth=6,
                           silent=False)
    mdl.fit(X_train, y_train)

    preds = mdl.predict(X_test)
    preds = [min(max(1, int(round(pred))), 8) for pred in preds]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
Example #17
    def __init__(self,filename='train.json'):
        self.filename_tr=filename

        # Read JSON data using pandas
        # columns are: id, cuisine, ingredients
        data  = pd.read_json(filename)
        
        # Label Encoders
        labels = LabelEncoder()
        labels.fit(data.cuisine)
        self.classes = labels.classes_
        self.class_encode = labels.transform
        self.class_decode = labels.inverse_transform        

        # Get numerical labels for ytrain 
        y_train = labels.transform(data.cuisine)

        # Vectorization of ingredients Using WordNet lemmatization & Tfid
        data['ingredients_clean_string'] = [' , '.join(z).strip() for z in data['ingredients']]  
        data['ingredients_string'] = [' '.join([WordNetLemmatizer().lemmatize(re.sub('[^A-Za-z]', ' ', line)) for line in lists]).strip() for lists in data['ingredients']]

        vectorizer  = TfidfVectorizer(stop_words='english', ngram_range=(1,1), max_df=0.57, analyzer='word', token_pattern=r'\w+')
        x_train     = vectorizer.fit_transform(data.ingredients_string).todense()
        ingred_dict = vectorizer.vocabulary_
        self.vectorizer = vectorizer

        self.y_train = y_train
        self.x_train = x_train

        self.tsdata  = pd.DataFrame()
Example #18
def create_partitions(work_dir_path):

    import cPickle
    import numpy as np

    from sklearn.preprocessing import LabelEncoder
    from sklearn.cross_validation import StratifiedKFold 

    # load data from one feature file
    npz    = np.load("%s/features/features.ssd.npz" % (work_dir_path))
    labels = npz["labels"]
    npz.close()
    
    # create labelencoder
    if not (os.path.exists("%s/labelencoder.pkl" % (work_dir_path)) and SKIP_EXISTING_FILES):

        le = LabelEncoder()
        le.fit(labels)

        with open("%s/labelencoder.pkl" % (work_dir_path), 'wb') as fid:
            cPickle.dump(le, fid)
    else:
        # reload the saved encoder so `le` is defined for the partitioning step below
        with open("%s/labelencoder.pkl" % (work_dir_path), 'rb') as fid:
            le = cPickle.load(fid)
            
    # create partitions
    if not (os.path.exists("%s/stratified_%dfold.pkl" % (work_dir_path, NUM_FOLDS)) and SKIP_EXISTING_FILES):
        
        cv = StratifiedKFold(le.transform(labels),
                             n_folds=NUM_FOLDS, 
                             shuffle=False)
        
        with open("%s/stratified_%dfold.pkl" % (work_dir_path, NUM_FOLDS), 'wb') as fid:
            cPickle.dump(cv, fid)
Example #19
    def __init__(self, filename):
        with open(filename, "r") as f:
            lines = re.split(r'\n', f.read())[0:-1]

        data = []
        target = []

        for line in lines:
            fields = re.split(r',', line)
            data.append(fields[:-1])
            target.append(fields[-1])

        npdata = np.array(data)
        nptarget = np.array(target)

        # Scikit-learn requires numeric values.

        le = LabelEncoder()
        le.fit(nptarget)
        self.__target = le.transform(nptarget)

        nrows, ncols = npdata.shape
        self.__data = np.zeros((nrows, ncols), dtype = np.int64)
        for ix in xrange(ncols):
            col = npdata[:, ix]
            le.fit(col)
            self.__data[:, ix] = le.transform(col)

        # Most classifier results are skewed if categorical features are mapped
        # to integer-valued features. Use a one-hot encoding.

        oe = OneHotEncoder()
        oe.fit(self.__data)
        self.__data = oe.transform(self.__data).toarray()
Example #20
def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)
    
    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print len(set(train['major']))
    train['major'] = train['major'].apply(lambda x :  " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major']  = test['major'].apply(lambda x :  " ".join(jieba.cut(x,  cut_all = False)).split()[0] if x is not None  and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')

    print len(set(train['major']))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])
 
    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names =  train.columns
    
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])
    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x : int(x.replace(u'岁','').encode('ascii')))
    return train, test
Example #21
	def _fit(self, df, logger):
		col_le_dict = {}
		for colname, col in df.iteritems():
			le = LabelEncoder()
			le.fit(col)
			col_le_dict[colname] = le
		return col_le_dict
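The `_fit` helper above returns a plain dict of column name to fitted encoder; a matching transform step might look like the sketch below (the `_transform` name is invented here, not part of the original snippet):

	def _transform(self, df, col_le_dict):
		# apply each column's fitted LabelEncoder; values unseen at fit time raise ValueError
		out = df.copy()
		for colname, le in col_le_dict.items():
			out[colname] = le.transform(df[colname])
		return out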
Example #22
def data_processing(train,test,features):
    # train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    # train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    # train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and int(x[11:13]) >= 18 and int(x[11:13]) < 6) else 0)
    # features += ['hour','dark','StreetNo']

    print("Filling NAs")
    # print(train.mode())
    train = train.fillna(train.median().iloc[0])
    test = test.fillna(test.median().iloc[0])
    print("Label Encoder")
    le=LabelEncoder()
    for col in features:
        le.fit(list(train[col])+list(test[col]))
        train[col]=le.transform(train[col])
        test[col]=le.transform(test[col])

    le.fit(list(train[target]))
    train[target]=le.transform(train[target])

    print("Standard Scaler")
    scaler=StandardScaler()
    for col in features:
        scaler.fit(list(train[col]))
        train[col]=scaler.transform(train[col])
        test[col]=scaler.transform(test[col])

    return train,test,features
Example #23
def label_encoder(data):
    for f in data.columns:
        if data[f].dtypes == 'object':
            lbl = LabelEncoder()
            lbl.fit(list(data[f].values))
            data[f] = lbl.transform(list(data[f].values))
    return data
Example #24
class EncodingText():
    """
    LabelEncoder.transform converts each word of a corpus into the number assigned to it in the vocabulary.
    LabelEncoder.fit deduplicates the vocabulary and maps its words to numbers.
    """
    def __init__(self,vocabulary):
        from sklearn.preprocessing import LabelEncoder
        self.le=LabelEncoder()
        self.vocabulary=vocabulary

    def fit(self,X,y=None):
        self.le.fit(self.vocabulary)
        return self

    def transform(self,X):
        # encode each document as the vocabulary indices of its words
        return [self.le.transform(x.split()) for x in X]
        #return [self.getSparseM(x) for x in X]

    def getSparseM(self,x):
        from scipy.sparse import coo_matrix
        import numpy as np
        sent=x.split()
        ind=self.le.transform(sent)
        a=coo_matrix((np.ones([len(sent)]),(ind,range(len(sent)))),shape=(len(self.vocabulary),len(sent)))
        return a
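A tiny usage sketch for `EncodingText` with a made-up three-word vocabulary; note that `transform` raises on any word missing from the vocabulary:

vocab = ["hello", "world", "python"]          # hypothetical vocabulary
enc = EncodingText(vocab).fit(None)
print(enc.transform(["hello world", "python hello"]))
# [array([0, 2]), array([1, 0])] -- indices follow the sorted vocabulary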
def process_data(train,test,features,features_non_numeric):
    train['StreetNo'] = train['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    test['StreetNo'] = test['Address'].apply(lambda x: x.split(' ', 1)[0] if x.split(' ', 1)[0].isdigit() else 0)
    train['Address'] = train['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    test['Address'] = test['Address'].apply(lambda x: x.split(' ', 1)[1] if x.split(' ', 1)[0].isdigit() else x)
    train['hour'] = train['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    test['hour'] = test['Dates'].apply(lambda x: x[11:13] if len(x) > 4 else 12)
    # dark = 1 when the hour is 18:00 or later, or before 06:00
    train['dark'] = train['Dates'].apply(lambda x: 1 if (len(x) > 4 and (int(x[11:13]) >= 18 or int(x[11:13]) < 6)) else 0)
    test['dark'] = test['Dates'].apply(lambda x: 1 if (len(x) > 4 and (int(x[11:13]) >= 18 or int(x[11:13]) < 6)) else 0)
    features += ['hour','dark','StreetNo']

    print "Filling N/As: " + str(datetime.datetime.now())
    train = train.fillna(train.mode().iloc[0])
    test = test.fillna(test.mode().iloc[0])
    # Pre-processing non-numeric values
    print "Label Encoder: " + str(datetime.datetime.now())
    le = LabelEncoder()
    for col in features:
        # print col
        le.fit(list(train[col])+list(test[col]))
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])
    # Xgb requires goal to be numeric...
    le.fit(list(train[goal]))
    train[goal] = le.transform(train[goal])

    # Neural Network, Stochastic Gradient Descent is sensitive to feature scaling, so it is highly recommended to scale your data.
    print "Standard Scaler: " + str(datetime.datetime.now())
    scaler = StandardScaler()
    for col in set(features): # - set(features_non_numeric):
        # print col
        scaler.fit(list(train[col])+list(test[col]))
        train[col] = scaler.transform(train[col])
        test[col] = scaler.transform(test[col])
    return (train,test,features)
Example #26
    def main(self, args=None):
        self.data = self.parser.parse_args(args=args)
        assert not self.data.update_klass
        best = load_json(self.data.params_fname)
        if isinstance(best, list):
            best = best[0]
        best = clean_params(best)
        print(self.data.params_fname, self.data.training_set)
        corpus, labels = read_data_labels(self.data.training_set)
        le = LabelEncoder()
        le.fit(labels)
        y = le.transform(labels)
        t = TextModel(corpus, **best)
        X = [t[x] for x in corpus]
        hy = [None for x in y]
        for tr, ts in KFold(n_splits=self.data.kratio,
                            shuffle=True, random_state=self.data.seed).split(X):
            c = SVC(model=t)
            c.fit([X[x] for x in tr], [y[x] for x in tr])
            _ = c.decision_function([X[x] for x in ts])
            [hy.__setitem__(k, v) for k, v in zip(ts, _)]

        i = 0
        with open(self.get_output(), 'w') as fpt:
            for tweet in tweet_iterator(self.data.training_set):
                tweet['decision_function'] = hy[i].tolist()
                i += 1
                fpt.write(json.dumps(tweet)+"\n")
        return hy
Example #27
def main():
    org_type = pd.read_csv('../input/application_train.csv', usecols=['ORGANIZATION_TYPE'], nrows=None)
    print(org_type.shape)
    print(org_type.nunique())

    print(org_type.head())

    lbl = LabelEncoder()
    lbl.fit(org_type['ORGANIZATION_TYPE'])  # fit on the Series; LabelEncoder expects 1-D input
    org_type_label = lbl.transform(org_type['ORGANIZATION_TYPE'])

    print(type(org_type_label))
    print(org_type_label.shape)
    print(org_type_label[:5])

    model = Sequential([
        Dense(32, input_dim=784),  # first hidden layer: 32 units on 784-dimensional input
        Activation('relu'),
        Dense(10),
        Activation('softmax'),
    ])

    

    model.compile(loss='mean_absolute_error', optimizer='adam')
Example #28
def encode_categorical_data(train, test, fill_missing = False):
    '''
    Encoding is an extremely slow process.
    The encoder is fitted on train and test together so both get a consistent mapping.
    '''
    le = LabelEncoder()

    ## this step creates separate train and test dataFrame
    if fill_missing:
        train = train.fillna(value='missing')
        test = test.fillna(value='missing')

    counter = 0
    start_time = time.time()
    for col, dtype in zip(train.columns, train.dtypes):
        if dtype == 'object':
            le.fit(pd.concat([train[col], test[col]], axis=0))
            train[col] = le.transform(train[col])
            test[col] = le.transform(test[col])

        counter += 1
        if counter % 20 == 0:
            print '{} out of {} is processed using {} seconds...'.format(str(counter), str(train.shape[1]), round((time.time() - start_time), 0))

    end_time = time.time()
    print 'encoding process takes ', round((end_time - start_time)), 'seconds'

    ## train and test are newly created
    return train, test
Example #29
def encode_dataset(train,test,meta,target_model='xgb'):
    y_train = train[meta['target']]
    train = train.drop([meta['target']],axis=1)
    assert train.shape[1] == test.shape[1]
    for i in range(train.shape[1]):
        assert train.columns[i] == test.columns[i]
    train_obs = len(train)
    #
    all_data = pd.concat([train,test],axis=0)
    for i,f in enumerate(meta['cols'].keys()):
        print(i,f,meta['cols'][f])
        if meta['cols'][f] == 'CAT':
            all_data[f] = all_data[f].astype('str')
            encoder = LabelEncoder()
            encoder.fit(np.unique(all_data[f].unique().tolist()))
            if target_model == 'xgb':
                all_data[f] = encoder.transform(all_data[f])
            else:
                all_data[f] = encoder.transform(all_data[f]).astype(int)
        elif meta['cols'][f] == 'NUM':
            all_data[f] = all_data[f].fillna(-1)
        elif meta['cols'][f] == 'REM':
            all_data = all_data.drop(f,axis=1)
        else:
            raise Exception(str(meta['cols'][f])+":unknown mapping")
    assert train_obs == len(y_train)
    return all_data , y_train
Example #30
def label_encode_train_test_sets (train, test) :
	" Label encode 'supplier' and 'bracket_pricing' features for both train and test set "
	test_suppliers = np.sort(pd.unique(test.supplier.ravel()))
	print ("Test suppliers shape & elements: ", test_suppliers.shape, test_suppliers)
	train_suppliers = np.sort(pd.unique(train.supplier.ravel()))
	print ("Train suppliers shape & elements: ", train_suppliers.shape, train_suppliers)
	
	## Merge 'supplier' for both datasets first because we want encoding to be consistent across both
	# http://docs.scipy.org/doc/numpy/reference/generated/numpy.sort.html
	supplier_ids = []
	supplier_ids.extend(train_suppliers)
	supplier_ids.extend(test_suppliers)
	supplier_ids = np.sort(np.unique(supplier_ids))
	print ("Merged supplier_ids.shape: ", supplier_ids.shape)
	# print ("supplier_ids.elements: ", supplier_ids)

	## Perform label encoding fit on the merged array and then individually transform for train and test sets
	print ("Performing label encoding on supplier column...")
	label_e = LabelEncoder()
	label_e.fit(supplier_ids)
	train['supplier'] = label_e.transform(train['supplier'])
	test['supplier'] = label_e.transform(test['supplier'])

	## Perform label encoding on 'bracket_pricing'
	print ("Performing label encoding on bracket_pricing column...")
	train['bracket_pricing'] = label_e.fit_transform(train['bracket_pricing'])
	test['bracket_pricing'] = label_e.fit_transform(test['bracket_pricing'])

	return train, test
Example #31
def t4_prepare_columns(train, test, good_columns_extra=None):
    good_columns = [
        # 'bond_lengths_mean_y',
        # 'bond_lengths_median_y',
        # 'bond_lengths_std_y',
        # 'bond_lengths_mean_x',
        'molecule_atom_index_0_dist_min',
        'molecule_atom_index_0_dist_max',
        'molecule_atom_index_1_dist_min',
        'molecule_atom_index_0_dist_mean',
        'molecule_atom_index_0_dist_std',
        'dist',
        'dist_lin',
        'subtype',
        'molecule_atom_index_1_dist_std',
        'molecule_atom_index_1_dist_max',
        'molecule_atom_index_1_dist_mean',
        'molecule_atom_index_0_dist_max_diff',
        'molecule_atom_index_0_dist_max_div',
        'molecule_atom_index_0_dist_std_diff',
        'molecule_atom_index_0_dist_std_div',
        'atom_0_couples_count',
        'molecule_atom_index_0_dist_min_div',
        'molecule_atom_index_1_dist_std_diff',
        'molecule_atom_index_0_dist_mean_div',
        'atom_1_couples_count',
        'molecule_atom_index_0_dist_mean_diff',
        'molecule_couples',
        'atom_index_1',
        'molecule_dist_mean',
        'molecule_atom_index_1_dist_max_diff',
        'molecule_atom_index_0_y_1_std',
        'molecule_atom_index_1_dist_mean_diff',
        'molecule_atom_index_1_dist_std_div',
        'molecule_atom_index_1_dist_mean_div',
        'molecule_atom_index_1_dist_min_diff',
        'molecule_atom_index_1_dist_min_div',
        'molecule_atom_index_1_dist_max_div',
        'molecule_atom_index_0_z_1_std',
        'y_0',
        'molecule_type_dist_std_diff',
        'molecule_atom_1_dist_min_diff',
        'molecule_atom_index_0_x_1_std',
        'molecule_dist_min',
        'molecule_atom_index_0_dist_min_diff',
        'molecule_atom_index_0_y_1_mean_diff',
        'molecule_type_dist_min',
        'molecule_atom_1_dist_min_div',
        'atom_index_0',
        'molecule_dist_max',
        'molecule_atom_1_dist_std_diff',
        'molecule_type_dist_max',
        'molecule_atom_index_0_y_1_max_diff',
        'molecule_type_0_dist_std_diff',
        'molecule_type_dist_mean_diff',
        'molecule_atom_1_dist_mean',
        'molecule_atom_index_0_y_1_mean_div',
        'molecule_type_dist_mean_div',
        'type',

        # Crane
        'dist_C_0_a0',
        'dist_C_1_a0',
        'dist_C_2_a0',
        'dist_C_3_a0',
        'dist_C_4_a0',
        'dist_F_0_a0',
        'dist_F_1_a0',
        'dist_F_2_a0',
        'dist_F_3_a0',
        'dist_F_4_a0',
        'dist_H_0_a0',
        'dist_H_1_a0',
        'dist_H_2_a0',
        'dist_H_3_a0',
        'dist_H_4_a0',
        'dist_N_0_a0',
        'dist_N_1_a0',
        'dist_N_2_a0',
        'dist_N_3_a0',
        'dist_N_4_a0',
        'dist_O_0_a0',
        'dist_O_1_a0',
        'dist_O_2_a0',
        'dist_O_3_a0',
        'dist_O_4_a0',
        'EN_a0',
        'rad_a0',
        'n_bonds_a0',
        'bond_lengths_mean_a0',
        'bond_lengths_std_a0',
        'bond_lengths_median_a0',
        'dist_C_0_a1',
        'dist_C_1_a1',
        'dist_C_2_a1',
        'dist_C_3_a1',
        'dist_C_4_a1',
        'dist_F_0_a1',
        'dist_F_1_a1',
        'dist_F_2_a1',
        'dist_F_3_a1',
        'dist_F_4_a1',
        'dist_H_0_a1',
        'dist_H_1_a1',
        'dist_H_2_a1',
        'dist_H_3_a1',
        'dist_H_4_a1',
        'dist_N_0_a1',
        'dist_N_1_a1',
        'dist_N_2_a1',
        'dist_N_3_a1',
        'dist_N_4_a1',
        'dist_O_0_a1',
        'dist_O_1_a1',
        'dist_O_2_a1',
        'dist_O_3_a1',
        'dist_O_4_a1',
        'EN_a1',
        'rad_a1',
        'n_bonds_a1',
        'bond_lengths_mean_a1',
        'bond_lengths_std_a1',
        'bond_lengths_median_a1',

        # Criskiev
        'atom_2',
        'atom_3',
        'atom_4',
        'atom_5',
        'atom_6',
        'atom_7',
        'atom_8',
        'atom_9',
        'd_1_0',
        'd_2_0',
        'd_2_1',
        'd_3_0',
        'd_3_1',
        'd_3_2',
        'd_4_0',
        'd_4_1',
        'd_4_2',
        'd_4_3',
        'd_5_0',
        'd_5_1',
        'd_5_2',
        'd_5_3',
        'd_6_0',
        'd_6_1',
        'd_6_2',
        'd_6_3',
        'd_7_0',
        'd_7_1',
        'd_7_2',
        'd_7_3',
        'd_8_0',
        'd_8_1',
        'd_8_2',
        'd_8_3',
        'd_9_0',
        'd_9_1',
        'd_9_2',
        'd_9_3',

        # Criskiev extra
        # 'd_1_0_log', 'd_2_0_log', 'd_2_1_log', 'd_3_0_log', 'd_3_1_log', 'd_3_2_log', 'd_4_0_log', 'd_4_1_log',
        # 'd_4_2_log', 'd_4_3_log', 'd_5_0_log', 'd_5_1_log', 'd_5_2_log', 'd_5_3_log', 'd_6_0_log', 'd_6_1_log',
        # 'd_6_2_log', 'd_6_3_log', 'd_7_0_log', 'd_7_1_log', 'd_7_2_log', 'd_7_3_log', 'd_8_0_log', 'd_8_1_log',
        # 'd_8_2_log', 'd_8_3_log', 'd_9_0_log', 'd_9_1_log', 'd_9_2_log', 'd_9_3',
        #
        # 'd_1_0_recp', 'd_2_0_recp', 'd_2_1_recp', 'd_3_0_recp', 'd_3_1_recp', 'd_3_2_recp', 'd_4_0_recp', 'd_4_1_recp',
        # 'd_4_2_recp', 'd_4_3_recp', 'd_5_0_recp', 'd_5_1_recp', 'd_5_2_recp', 'd_5_3_recp', 'd_6_0_recp', 'd_6_1_recp',
        # 'd_6_2_recp', 'd_6_3_recp', 'd_7_0_recp', 'd_7_1_recp', 'd_7_2_recp', 'd_7_3_recp', 'd_8_0_recp', 'd_8_1_recp',
        # 'd_8_2_recp', 'd_8_3_recp', 'd_9_0_recp', 'd_9_1_recp', 'd_9_2_recp', 'd_9_3'
    ]

    good_columns += (good_columns_extra
                     if good_columns_extra is not None else [])

    labels = {}
    for f in ['atom_1', 'type_0', 'type']:
        if f in good_columns:
            lbl = LabelEncoder()
            lbl.fit(list(train[f].values) + list(test[f].values))
            train[f] = lbl.transform(list(train[f].values))
            test[f] = lbl.transform(list(test[f].values))

            labels[f] = lbl

    X = train[good_columns].copy()
    X_test = test[good_columns].copy()

    return X, X_test, labels
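The `labels` dict returned above keeps the fitted encoders, so encoded columns can later be mapped back to their original categories; a small sketch assuming `X`, `X_test` and `labels` came from `t4_prepare_columns`:

# decode the integer-coded 'type' column back to its original string labels
type_strings = labels['type'].inverse_transform(X_test['type'])
print(type_strings[:5])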
Example #32
def get_tag_dict(raw):
    label_encoder = LabelEncoder()
    label_encoder.fit(raw['tag'])
    return label_encoder
def myfunc(myDataset, shape, lang):
    myDataset = Remove3Row(myDataset)

    print(myDataset.head())
    X = myDataset.Text
    Y = myDataset.Label

    encoder = LabelEncoder()
    encoder.fit(Y)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=1,
                                                    stratify=Y)

    Xtrain, Xval, Ytrain, Yval = train_test_split(Xtrain,
                                                  Ytrain,
                                                  test_size=0.25,
                                                  random_state=1,
                                                  stratify=Ytrain)

    Ytrain = encoder.transform(Ytrain)
    Ytest = encoder.transform(Ytest)
    Yval = encoder.transform(Yval)

    Ytrain = np_utils.to_categorical(Ytrain)
    Ytest = np_utils.to_categorical(Ytest)
    Yval = np_utils.to_categorical(Yval)

    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(Xtrain)

    Xtrain = tokenizer.texts_to_sequences(Xtrain)
    Xtest = tokenizer.texts_to_sequences(Xtest)
    Xval = tokenizer.texts_to_sequences(Xval)
    vocab_size = len(
        tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

    maxlen = 100

    Xtrain = pad_sequences(Xtrain, padding='post', maxlen=maxlen)
    Xtest = pad_sequences(Xtest, padding='post', maxlen=maxlen)
    Xval = pad_sequences(Xval, padding='post', maxlen=maxlen)

    from keras.models import Sequential
    from keras import layers

    embedding_dim = 50

    model = Sequential()
    model.add(
        layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         input_length=maxlen))

    model.add(GRU(256))
    model.add(Dropout(0.1))
    model.add(layers.Dense(shape, activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()

    history = model.fit(Xtrain,
                        Ytrain,
                        epochs=10,
                        verbose=True,
                        validation_data=(Xval, Yval))
    f = open("report.csv", "a")
    loss, accuracy = model.evaluate(Xtrain, Ytrain, verbose=True)
    print("Training Accuracy: {:.4f}".format(accuracy))
    f.write(lang + ",{:.4f}".format(accuracy))

    loss, accuracy = model.evaluate(Xval, Yval, verbose=True)

    print("Validation Accuracy:  {:.4f}".format(accuracy))
    f.write(",{:.4f} \n".format(accuracy))
    f.close()
    plot_history(history)
    Ypred = model.predict(Xtest)
    from sklearn.metrics import confusion_matrix, classification_report

    matrix = confusion_matrix(Ytest.argmax(axis=1), Ypred.argmax(axis=1))
    classification_Report = classification_report(Ytest.argmax(axis=1),
                                                  Ypred.argmax(axis=1),
                                                  output_dict=True)
    df = pd.DataFrame(classification_Report).transpose()
    df.to_csv(lang + "ClassificationReport.csv")
Example #34

#choose which size dataset and number of classes to use
X_train = pickle.load( open("../generate_train_test/final_dataset/X_train.p",'rb'))
X_test = pickle.load( open("../generate_train_test/final_dataset/X_test.p",'rb'))
y_train = pickle.load( open("../generate_train_test/final_dataset/y_train.p",'rb'))
y_test = pickle.load( open("../generate_train_test/final_dataset/y_test.p",'rb'))

#split into train and validation
X_train, X_validate, y_train, y_validate = train_test_split(X_train, y_train, test_size=0.25, random_state=0)

start_time = time.time()

#numerically encode the classes
le = LabelEncoder()
le.fit(y_train)
le_y_train = le.transform(y_train)
le_y_validate = le.transform(y_validate)
le_y_test = le.transform(y_test)

#create TFIDF features from text
tf_transformer = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_tfidf = tf_transformer.transform(X_train)
X_validate_tfidf = tf_transformer.transform(X_validate)
#X_test_tfidf = tf_transformer.transform(X_test)

time_elapsed = time.time() - start_time
print "TFIDF: " + str(time_elapsed/60) + " minutes"
start_time = time.time()

Example #35
def preprocessing_features(df_train, df_test, process_continuous):

    to_delete_features = ['default', 'pdays']
    continuous_features = [
        'age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
        'cons.conf.idx', 'euribor3m', 'nr.employed'
    ]
    categorical_ordered_features = [
        'education', 'housing', 'loan', 'contact', 'month', 'day_of_week',
        'poutcome'
    ]
    categorical_unordered_features = ['job', 'marital']

    unknown_present_features = [
        'job', 'marital', 'education', 'housing', 'loan'
    ]
    all_present_features = [
        'age', 'campaign', 'previous', 'emp.var.rate', 'cons.price.idx',
        'cons.conf.idx', 'euribor3m', 'nr.employed', 'contact', 'month',
        'day_of_week', 'poutcome'
    ]

    ### Delete Features
    for feat in to_delete_features:
        print "\n--------- deleting feature --------- ", feat
        del df_train[feat]
        del df_test[feat]

    ### Normalization or Standardization of Continuous Features
    if process_continuous == "Standardize":
        print "\n--------- Standardizing Continuous Features (Mean=0, Standard Deviation=1) --------- "
        standardization = StandardScaler()
        standardization.fit(df_train[continuous_features])
        print "Mean: ", standardization.mean_
        print "Variance: ", standardization.var_
        df_train[continuous_features] = standardization.transform(
            df_train[continuous_features])
        df_test[continuous_features] = standardization.transform(
            df_test[continuous_features])

    elif process_continuous == "Normalize":
        print "\n--------- Normalizing Continuous Features (Min=0, Max=1) --------- "
        min_max_scaling = MinMaxScaler()
        min_max_scaling.fit(df_train[continuous_features])
        print min_max_scaling.data_min_
        print min_max_scaling.data_max_
        df_train[continuous_features] = min_max_scaling.transform(
            df_train[continuous_features])
        df_test[continuous_features] = min_max_scaling.transform(
            df_test[continuous_features])

    ### Label Categorical Ordered Features -- Features used for Imputation (All Present)
    label_dict = {
        'education': {
            'illiterate': 0,
            'basic.4y': 4,
            'basic.6y': 6,
            'basic.9y': 9,
            'high.school': 11,
            'professional.course': 13,
            'university.degree': 14
        },
        'housing': {
            'no': 0,
            'yes': 1
        },
        'loan': {
            'no': 0,
            'yes': 1
        },
        'contact': {
            'telephone': 0,
            'cellular': 1
        },
        'month': {
            'jan': 1,
            'feb': 2,
            'mar': 3,
            'apr': 4,
            'may': 5,
            'jun': 6,
            'jul': 7,
            'aug': 8,
            'sep': 9,
            'oct': 10,
            'nov': 11,
            'dec': 12
        },
        'day_of_week': {
            'mon': 1,
            'tue': 2,
            'wed': 3,
            'thu': 4,
            'fri': 5,
            'sat': 6,
            'sun': 7
        },
        'poutcome': {
            'nonexistent': 0,
            'failure': 1,
            'success': 2
        }
    }

    for feat in categorical_ordered_features:
        if feat not in unknown_present_features:
            print "\n--------- Labelling feature Before Imputation --------- ", feat
            df_train = df_train.replace({feat: label_dict[feat]})
            df_test = df_test.replace({feat: label_dict[feat]})
            print "Labelled as: ", label_dict[feat]

    ### Imputation using SVM
    df_train_impute = df_train.loc[:,
                                   df_train.columns.isin(all_present_features)]
    df_test_impute = df_test.loc[:, df_test.columns.isin(all_present_features)]

    for feat in unknown_present_features:
        print "\nFilling Unknowns for Feature: ", feat
        train_impute = df_train[feat]
        test_impute = df_test[feat]

        train_impute_no_unknowns = train_impute[train_impute != 'unknown']
        train_impute_unknowns = train_impute[train_impute == 'unknown']
        test_impute_unknowns = test_impute[test_impute == 'unknown']

        df_train_impute_train_features = df_train_impute.loc[
            train_impute_no_unknowns.index]
        df_train_impute_test_features = df_train_impute.loc[
            train_impute_unknowns.index]
        df_test_impute_test_features = df_test_impute.loc[
            test_impute_unknowns.index]

        svm_model = SVC()
        svm_model.fit(df_train_impute_train_features, train_impute_no_unknowns)
        df_train.loc[df_train_impute_test_features.index,
                     feat] = svm_model.predict(df_train_impute_test_features)
        print "Train Filled with: ", df_train.loc[
            df_train_impute_test_features.index, feat].value_counts()
        df_test.loc[df_test_impute_test_features.index,
                    feat] = svm_model.predict(df_test_impute_test_features)
        print "Test Filled with: ", df_test.loc[
            df_test_impute_test_features.index, feat].value_counts()

    ### Label Categorical Ordered Features -- Features Imputed (Unknowns were Present)
    for feat in categorical_ordered_features:
        if feat in unknown_present_features:
            print "\n--------- Labelling feature After Imputation --------- ", feat
            df_train = df_train.replace({feat: label_dict[feat]})
            df_test = df_test.replace({feat: label_dict[feat]})
            print "Labelled as: ", label_dict[feat]

    ### One hot encoding Categorical Un-ordered Features
    for feat in categorical_unordered_features:
        print "\n--------- One Hot Encoding feature --------- ", feat
        label_encoder = LabelEncoder()
        label_encoder.fit(df_train[feat])
        df_train[feat] = label_encoder.transform(df_train[feat])
        df_test[feat] = label_encoder.transform(df_test[feat])

    one_hot_encoder = OneHotEncoder(sparse=False)
    one_hot_encoder.fit(df_train[categorical_unordered_features])
    one_hot_encoded_array_train = one_hot_encoder.transform(
        df_train[categorical_unordered_features])
    one_hot_encoded_df_train = pd.DataFrame(one_hot_encoded_array_train,
                                            index=df_train.index)
    one_hot_encoded_array_test = one_hot_encoder.transform(
        df_test[categorical_unordered_features])
    one_hot_encoded_df_test = pd.DataFrame(one_hot_encoded_array_test,
                                           index=df_test.index)

    df_train = pd.concat(
        [df_train, one_hot_encoded_df_train],
        axis=1)  #concatenate old columns with new one hot encoded columns
    df_test = pd.concat(
        [df_test, one_hot_encoded_df_test],
        axis=1)  #concatenate old columns with new one hot encoded columns

    df_train = df_train.drop(
        categorical_unordered_features,
        axis=1)  #Delete columns which were one hot encoded
    df_test = df_test.drop(categorical_unordered_features,
                           axis=1)  #Delete columns which were one hot encoded

    ### Return pre-processed df
    return df_train, df_test
Example #36
del train1
gc.collect()

print('click',data['click'].unique())

data=data.fillna(-1)

encoder=['adid', 'advert_id', 'advert_industry_inner', 'advert_name',
        'app_cate_id', 'app_id', 'campaign_id', 'carrier', 'city',]



col_encoder = LabelEncoder()
for feat in encoder:
    col_encoder.fit(data[feat])
    data[feat] = col_encoder.transform(data[feat])





minv = np.int32(0)
maxv = np.int32(5)
totalLength = 1  # two classes: depending on the loss function, decide whether to convert to one-hot form




batch_size = 10000
graph = tf.Graph()
for i, t in enumerate(train['type'].unique()):
    plt.subplot(2, 4, i + 1)
    plt.scatter(train.loc[train['type'] == t, good_columns[0]],
                train.loc[train['type'] == t, 'scalar_coupling_constant'],
                label=t)
    plt.title(f'{good_columns[0]} vs target \n for {t} type')
fig, ax = plt.subplots(figsize=(20, 10))
for i, t in enumerate(train['type'].unique()):
    plt.subplot(2, 4, i + 1)
    plt.hist(train.loc[train['type'] == t, good_columns[0]], label='train')
    plt.hist(test.loc[test['type'] == t, good_columns[0]], label='test')
    plt.title(f'{good_columns[0]} distribution \n for {t} type')
for f in ['atom_1', 'type_0', 'type']:
    if f in good_columns:
        lbl = LabelEncoder()
        lbl.fit(list(train[f].values) + list(test[f].values))
        train[f] = lbl.transform(list(train[f].values))
        test[f] = lbl.transform(list(test[f].values))
X = train[good_columns].copy()
y = train['scalar_coupling_constant']
X_test = test[good_columns].copy()
del train, test
gc.collect()
n_fold = 3
folds = KFold(n_splits=n_fold, shuffle=True, random_state=11)
params = {
    'num_leaves': 128,
    'min_child_samples': 79,
    'objective': 'regression',
    'max_depth': 9,
    'learning_rate': 0.2,
Example #38
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

dataset = load_iris()
X = dataset.data
Y = dataset.target

LE = LabelEncoder()
LE.fit(Y)
e_Y = LE.transform(Y)
Y = np_utils.to_categorical(e_Y)

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,train_size=0.8)

model = Sequential()
model.add(Dense(7,input_dim=4,activation='sigmoid'))
model.add(Dense(3,activation='softmax'))

model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

import matplotlib.pyplot as plt
history = model.fit(X_train,Y_train,epochs=20,batch_size=5,validation_split=0.2)
def sensitivity_specificity_support(y_true,
                                    y_pred,
                                    labels=None,
                                    pos_label=1,
                                    average=None,
                                    warn_for=('sensitivity', 'specificity'),
                                    sample_weight=None):
    """Compute sensitivity, specificity, and support for each class

    The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
    of true positives and ``fn`` the number of false negatives. The sensitivity
    quantifies the ability to avoid false negatives [1]_.

    The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number
    of true negatives and ``fp`` the number of false positives. The specificity
    quantifies the ability to avoid false positives [1]_.

    The support is the number of occurrences of each class in ``y_true``.

    If ``pos_label is None`` and in binary classification, this function
    returns the average sensitivity and specificity if ``average``
    is ``'weighted'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, optional (default=None)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).
    warn_for : tuple or set, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    Returns
    -------
    sensitivity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    specificity : float (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )

    support : int (if ``average`` = None) or ndarray, \
        shape (n_unique_labels, )
        The number of occurrences of each label in ``y_true``.

    References
    ----------
    .. [1] `Wikipedia entry for the Sensitivity and specificity
           <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.metrics import sensitivity_specificity_support
    >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    >>> sensitivity_specificity_support(y_true, y_pred, average='macro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='micro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='weighted')
    (0.33333333333333331, 0.66666666666666663, None)

    """
    average_options = (None, 'micro', 'macro', 'weighted', 'samples')
    if average not in average_options and average != 'binary':
        raise ValueError('average has to be one of ' + str(average_options))

    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    present_labels = unique_labels(y_true, y_pred)

    if average == 'binary':
        if y_type == 'binary':
            if pos_label not in present_labels:
                if len(present_labels) < 2:
                    # Only negative labels
                    return (0., 0., 0)
                else:
                    raise ValueError("pos_label=%r is not a valid label: %r" %
                                     (pos_label, present_labels))
            labels = [pos_label]
        else:
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn("Note that pos_label (set to %r) is ignored when "
                      "average != 'binary' (got %r). You may use "
                      "labels=[pos_label] to specify a single positive class."
                      % (pos_label, average), UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels, np.setdiff1d(
                present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum ###

    if y_type.startswith('multilabel'):
        raise ValueError('imblearn does not support multilabel')
    elif average == 'samples':
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(
                tp_bins, weights=tp_bins_weights, minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = np.bincount(
                y_pred, weights=sample_weight, minlength=len(labels))
        if len(y_true):
            true_sum = np.bincount(
                y_true, weights=sample_weight, minlength=len(labels))

        # Compute the true negative
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #

    with np.errstate(divide='ignore', invalid='ignore'):
        # Divide, and on zero-division, set scores to 0 and warn:

        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum,
                                  'specificity', 'predicted', average,
                                  warn_for)
        sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true',
                                  average, warn_for)

    # Average the results

    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
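

# A minimal usage sketch of the per-class mode, reusing the toy data from the docstring
# above (the expected values are worked out by hand and should be read as approximate;
# the helper name below is purely illustrative):
def _example_per_class_scores():
    y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    sen, spe, sup = sensitivity_specificity_support(y_true, y_pred, average=None)
    # For labels ['cat', 'dog', 'pig'] this yields roughly:
    # sen ~ [1.0, 0.0, 0.0], spe ~ [0.75, 0.5, 0.75], sup = [2, 2, 2]
    return sen, spe, sup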
Ejemplo n.º 40
0
y, docs = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split("\t")
    docs.append(content[0])
    y.append(content[1])




#%%
#==============================================================================
# Encode class values as integers 
#==============================================================================
encoder = LabelEncoder()

encoder.fit(y)

encoded_y = encoder.transform(y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)
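
# A tiny self-contained sketch of the same two-step target encoding (hypothetical
# 'ham'/'spam' labels; LabelEncoder orders classes alphabetically, so 'ham' -> 0):
demo_le = LabelEncoder()
demo_int = demo_le.fit_transform(['ham', 'spam', 'spam'])  # -> array([0, 1, 1])
demo_onehot = np_utils.to_categorical(demo_int)            # -> [[1, 0], [0, 1], [0, 1]]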



#%%
#==============================================================================
# Define plot_history function
#==============================================================================
def plot_history(history):
    loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' not in s]
    val_loss_list = [s for s in history.history.keys() if 'loss' in s and 'val' in s]
Ejemplo n.º 41
0
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

dataset = pd.read_csv('NBA_2014_games.csv', parse_dates=["Date"])
encoding = LabelEncoder()
encoding.fit(dataset["Home/Neutral"].values)
home_teams = encoding.transform(
    dataset["Home/Neutral"].values)  # converts the string team names to integers

encoding.fit(dataset["Visitor/Neutral"].values)
visitor_teams = encoding.transform(dataset["Visitor/Neutral"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T  # stack the two columns side by side

onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).todense()
print(X_teams_expanded)
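
# Sketch of an alternative with one shared mapping: fitting the encoder separately on
# the two columns (as above) gives each column its own label -> integer mapping, so the
# same team can map to different integers. Fitting once on the union of both columns
# avoids that:
all_teams = np.concatenate([dataset["Home/Neutral"].values,
                            dataset["Visitor/Neutral"].values])
shared_encoding = LabelEncoder().fit(all_teams)
home_teams = shared_encoding.transform(dataset["Home/Neutral"].values)
visitor_teams = shared_encoding.transform(dataset["Visitor/Neutral"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
X_teams_expanded = OneHotEncoder().fit_transform(X_teams).todense()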
def geometric_mean_score(y_true,
                         y_pred,
                         labels=None,
                         pos_label=1,
                         average='multiclass',
                         sample_weight=None,
                         correction=0.0):
    """Compute the geometric mean

    The geometric mean (G-mean) is the root of the product of class-wise
    sensitivity. This measure tries to maximize the accuracy on each of the
    classes while keeping these accuracies balanced. For binary classification
    G-mean is the square root of the product of the sensitivity
    and specificity. For multi-class problems it is a higher root of the
    product of the sensitivity for each class.

    For compatibility with other imbalance performance measures, G-mean can be
    calculated for each class separately on a one-vs-rest basis when
    ``average != 'multiclass'``.

    The best value is 1 and the worst value is 0. Traditionally, if at least one
    class is unrecognized by the classifier, G-mean resolves to zero. To
    alleviate this property, for highly multiclass problems the sensitivity of
    unrecognized classes can be "corrected" to a user-specified value
    (instead of zero). This option works only if ``average == 'multiclass'``.

    Parameters
    ----------
    y_true : ndarray, shape (n_samples, )
        Ground truth (correct) target values.

    y_pred : ndarray, shape (n_samples, )
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average.

    pos_label : str or int, optional (default=1)
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, optional (default=``'multiclass'``)
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : ndarray, shape (n_samples, )
        Sample weights.

    correction : float, optional (default=0.0)
        Replaces a sensitivity of zero for unrecognized classes with the given
        value.

    Returns
    -------
    geometric_mean : float

    Notes
    -----
    See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`.

    References
    ----------
    .. [1] Kubat, M. and Matwin, S. "Addressing the curse of
       imbalanced training sets: one-sided selection" ICML (1997)

    .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies
       for learning in class imbalance problems", Pattern Recognition,
       36(3), (2003), pp 849-851.

    Examples
    --------
    >>> from imblearn.metrics import geometric_mean_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> geometric_mean_score(y_true, y_pred)
    0.0
    >>> geometric_mean_score(y_true, y_pred, correction=0.001)
    0.010000000000000004
    >>> geometric_mean_score(y_true, y_pred, average='macro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='micro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='weighted')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average=None)
    array([ 0.8660254,  0.       ,  0.       ])

    """
    if average is None or average != 'multiclass':
        sen, spe, _ = sensitivity_specificity_support(
            y_true,
            y_pred,
            labels=labels,
            pos_label=pos_label,
            average=average,
            warn_for=('specificity', 'specificity'),
            sample_weight=sample_weight)

        LOGGER.debug('The sensitivity and specificity are : %s - %s' %
                     (sen, spe))
        return np.sqrt(sen * spe)
    else:
        present_labels = unique_labels(y_true, y_pred)

        if labels is None:
            labels = present_labels
            n_labels = None
        else:
            n_labels = len(labels)
            labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
                                                     assume_unique=True)])

        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]

        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(tp_bins, weights=tp_bins_weights,
                                 minlength=len(labels))
        else:
            # Pathological case
            true_sum = tp_sum = np.zeros(len(labels))
        if len(y_true):
            true_sum = np.bincount(y_true, weights=sample_weight,
                                   minlength=len(labels))

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]

        recall = _prf_divide(tp_sum, true_sum, "recall", "true", None,
                             "recall")
        recall[recall == 0] = correction

        gmean = sp.stats.gmean(recall)
        # old version of scipy return MaskedConstant instead of 0.0
        if isinstance(gmean, np.ma.core.MaskedConstant):
            return 0.0
        return gmean
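

# A hand-checked sketch of the macro-averaged case from the docstring example above:
# the per-class sensitivities are [1, 0, 0] and the per-class specificities are
# [0.75, 0.5, 0.75], so macro sensitivity ~ 1/3, macro specificity ~ 2/3, and the macro
# G-mean is sqrt(1/3 * 2/3) ~ 0.4714, matching
# geometric_mean_score(y_true, y_pred, average='macro'). The helper name is illustrative.
def _example_macro_gmean():
    macro_sen, macro_spe = 1.0 / 3.0, 2.0 / 3.0
    return np.sqrt(macro_sen * macro_spe)  # ~0.4714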
Ejemplo n.º 43
0
def data_preprocess(df_train_, df_test_, delete_cols = [], drop_mask = []):
    print('data_preprocess', '-' * 100)
    train = df_train_.copy()
    test = df_test_.copy()

    train['TransactionAmt_to_mean_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('mean')
    train['TransactionAmt_to_mean_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('mean')
    train['TransactionAmt_to_std_card1'] = train['TransactionAmt'] / train.groupby(['card1'])['TransactionAmt'].transform('std')
    train['TransactionAmt_to_std_card4'] = train['TransactionAmt'] / train.groupby(['card4'])['TransactionAmt'].transform('std')

    test['TransactionAmt_to_mean_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('mean')
    test['TransactionAmt_to_mean_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('mean')
    test['TransactionAmt_to_std_card1'] = test['TransactionAmt'] / test.groupby(['card1'])['TransactionAmt'].transform('std')
    test['TransactionAmt_to_std_card4'] = test['TransactionAmt'] / test.groupby(['card4'])['TransactionAmt'].transform('std')

    train['id_02_to_mean_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('mean')
    train['id_02_to_mean_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('mean')
    train['id_02_to_std_card1'] = train['id_02'] / train.groupby(['card1'])['id_02'].transform('std')
    train['id_02_to_std_card4'] = train['id_02'] / train.groupby(['card4'])['id_02'].transform('std')

    test['id_02_to_mean_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('mean')
    test['id_02_to_mean_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('mean')
    test['id_02_to_std_card1'] = test['id_02'] / test.groupby(['card1'])['id_02'].transform('std')
    test['id_02_to_std_card4'] = test['id_02'] / test.groupby(['card4'])['id_02'].transform('std')

    train['D15_to_mean_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('mean')
    train['D15_to_mean_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('mean')
    train['D15_to_std_card1'] = train['D15'] / train.groupby(['card1'])['D15'].transform('std')
    train['D15_to_std_card4'] = train['D15'] / train.groupby(['card4'])['D15'].transform('std')

    test['D15_to_mean_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('mean')
    test['D15_to_mean_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('mean')
    test['D15_to_std_card1'] = test['D15'] / test.groupby(['card1'])['D15'].transform('std')
    test['D15_to_std_card4'] = test['D15'] / test.groupby(['card4'])['D15'].transform('std')

    train['D15_to_mean_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('mean')
    train['D15_to_mean_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('mean')
    train['D15_to_std_addr1'] = train['D15'] / train.groupby(['addr1'])['D15'].transform('std')
    train['D15_to_std_addr2'] = train['D15'] / train.groupby(['addr2'])['D15'].transform('std')

    test['D15_to_mean_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('mean')
    test['D15_to_mean_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('mean')
    test['D15_to_std_addr1'] = test['D15'] / test.groupby(['addr1'])['D15'].transform('std')
    test['D15_to_std_addr2'] = test['D15'] / test.groupby(['addr2'])['D15'].transform('std')

    train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
    train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
    test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
    test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)
    many_null_cols = [col for col in train.columns if train[col].isna().sum() * 1.0 / train.shape[0] > 0.9]
    many_null_cols_test = [col for col in test.columns if test[col].isna().sum() * 1.0 / test.shape[0] > 0.9]
    big_top_value_cols = [col for col in train.columns if train[col].value_counts(dropna = False, normalize = True).values[0] > 0.9]
    big_top_value_cols_test = [col for col in test.columns if test[col].value_counts(dropna = False, normalize = True).values[0] > 0.9]
    one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
    one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]
    cols_to_drop = list(set(many_null_cols + many_null_cols_test + big_top_value_cols + big_top_value_cols_test +
                            one_value_cols + one_value_cols_test))
    cols_to_drop.remove('isFraud')
    print(len(cols_to_drop))
    train = train.drop(cols_to_drop, axis = 1)
    test = test.drop(cols_to_drop, axis = 1)
    cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
    for col in cat_cols:
        if col in train.columns:
            le = LabelEncoder()
            le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
            train[col] = le.transform(list(train[col].astype(str).values))
            test[col] = le.transform(list(test[col].astype(str).values))
    X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT', 'TransactionID'], axis=1)
    y = train.sort_values('TransactionDT')['isFraud']
    X_test = test.drop(['TransactionDT', 'TransactionID'], axis=1)
    del train
    test = test[["TransactionDT", 'TransactionID']]
    # by https://www.kaggle.com/dimartinot
    def clean_inf_nan(df):
        return df.replace([np.inf, -np.inf], np.nan)

    # Cleaning infinite values to NaN
    X = clean_inf_nan(X)
    X_test = clean_inf_nan(X_test)
    return X, y, X_test
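
# Toy illustration (self-contained sketch) of the per-group normalisation pattern used
# repeatedly inside data_preprocess above:
import pandas as pd
_demo = pd.DataFrame({'card1': [1, 1, 2], 'TransactionAmt': [10.0, 30.0, 5.0]})
_demo['TransactionAmt_to_mean_card1'] = (
    _demo['TransactionAmt'] / _demo.groupby('card1')['TransactionAmt'].transform('mean'))
# -> [0.5, 1.5, 1.0]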
Ejemplo n.º 44
0
# create 5 models
# 1 for each action
# feed data data to all models and take action with highest score

training_data = pd.read_csv(
    r'/home/tmo/Projects/dat19_footballrobots/06_Data_Science/labeled_data.csv'
)

# cast the categorical columns to the pandas 'category' dtype before label encoding
training_data['decision'] = training_data['decision'].astype('category')
training_data['player_id'] = training_data['player_id'].astype('category')
training_data['mate_id'] = training_data['mate_id'].astype('category')
training_data['team_id'] = training_data['team_id'].astype('category')

le = LabelEncoder()
le.fit(training_data['decision'])
training_data['decision'] = le.transform(training_data['decision'])

le.fit(training_data['player_id'])
training_data['player_id'] = le.transform(training_data['player_id'])

le.fit(training_data['mate_id'])
training_data['mate_id'] = le.transform(training_data['mate_id'])

le.fit(training_data['team_id'])
training_data['team_id'] = le.transform(training_data['team_id'])
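
# Alternative sketch that keeps one fitted encoder per column (the single `le` above is
# re-fit for each column, so only the last column could be inverse-transformed later);
# re-running this after the block above leaves the already-encoded values unchanged:
encoders = {}
for cat_col in ['decision', 'player_id', 'mate_id', 'team_id']:
    col_le = LabelEncoder()
    training_data[cat_col] = col_le.fit_transform(training_data[cat_col])
    encoders[cat_col] = col_le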

#split up data & label
features_training = training_data.iloc[:, :-1]
labels_training = training_data.iloc[:, -1:]
Ejemplo n.º 45
0
        #If training image contains exactly one face
        if len(face_bounding_boxes) == 1:
            face_enc = face_recognition.face_encodings(face)[0]
            # Add face encoding for current image with corresponding label (name) to the training data
            encodings.append(face_enc)
            names.append(person)
        else:
            print(person + "/" + person_img +
                  " was skipped and can't be used for training")

# normalize input vectors
in_encoder = Normalizer(norm='l2')
encodings = in_encoder.transform(encodings)
# label encode targets
out_encoder = LabelEncoder()
out_encoder.fit(names)
names = out_encoder.transform(names)

# Create and train the SVC classifier
clf = svm.SVC(gamma='scale', probability=True)
#clf = svm.SVC(kernel='linear', probability=True)
clf.fit(encodings, names)

# Load the test image with unknown faces into a numpy array
test_image = face_recognition.load_image_file('test/test.jpg')

# Find all the faces in the test image using the default HOG-based model
face_locations = face_recognition.face_locations(test_image)
no = len(face_locations)
print("Number of faces detected: ", no)
from sklearn.preprocessing import LabelEncoder

from Decaf_feature import DecafFeature
import pickle as pkl

dataset = {}
if len(sys.argv) == 1:
    print "Usage: classifier.py dataset_path"
else:
    print "Loading", sys.argv[1]
    dataset = pkl.load(file(sys.argv[1], 'rb'))

# Encode the class names as integers
le = LabelEncoder()
all_labels = dataset.keys()
le.fit(all_labels)

# Ratio of samples used for training
_percent = 0.5
x_train=[]
x_test=[]
y_train=[]
y_test=[]

for k,v in dataset.items():
    print "Processing", k
    X = []
    Y = []
    for item in v:
        Y.append(k)
        X.append(item)
#train_set=train_set[colname]

#reduce output to 'normal' and 'attack'
test_labels=test_set['label'].copy()
test_labels[test_labels!='normal']='attack'

# First, transform all categorical attributes into numeric attributes

# preprocessing nomial features
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
le = LabelEncoder()

# preprocess the 'service' attribute (which has a large number of unique values)
# Methodology: compute the ratio of 'attack' records for each service, then replace the
# service name with that ratio
value_list = le.fit(test_set['service'])
for element in value_list.classes_:
    # number of instances whose 'service' equals element
    N = test_set[test_set['service'] == element].shape[0]
    # number of those instances labelled as 'attack'
    Np = test_set[(test_set['service'] == element) & (test_set['label'] != 'normal')].shape[0]
    A = Np / N
    test_set.loc[test_set['service'] == element, 'service'] = A
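
# Equivalent sketch of the same ratio encoding with a single groupby (shown for
# reference only; the loop above has already replaced the 'service' values):
# attack_ratio = test_set.groupby('service')['label'].apply(lambda s: (s != 'normal').mean())
# test_set['service'] = test_set['service'].map(attack_ratio)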
    
# preprocess the 'flag' and 'protocol_type' attributes (few unique values)
a = pd.get_dummies(test_set[['flag','protocol_type']])
test_set = pd.concat([test_set,a],axis=1)
test_set=test_set.drop(['Unnamed: 0','flag','protocol_type'],axis=1)
#train_set['flag']=le.fit_transform(train_set['flag'])

#~/~

#exclude the 'label' attribute from set
Ejemplo n.º 48
0
# s = pd.Series(np.random.randn(100).cumsum())
# s.plot(linestyle='--', marker='.', color="r", grid=True)
# plt.show()

iris_tf = pd.read_csv("./iris.csv")
X = iris_tf[['Sepal.Length', 'Sepal.Width', 'Petal.Length',
             'Petal.Width']].values
y = iris_tf['Species'].values

# print(iris_tf.info())
# print(iris_tf.describe())
# print(iris_tf['Species'].value_counts())
# print(iris_tf.head())

enc = LabelEncoder()
label_encoder = enc.fit(y)
y = label_encoder.transform(y)

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(211)

label_dict = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}

colors = ['blue', 'red', 'green']
markers = ['s', 'o', '^']

for lab, c, m in zip(range(3), colors, markers):
    ax.scatter(
        X[y == lab, 2],
        X[y == lab, 3],
        c=c,  # color
Ejemplo n.º 49
0
def add_cat(data, col):
    enc = LabelEncoder()
    enc.fit(data[col])
    data[col + "Cat"] = enc.transform(data[col])
    set(df.columns) -
    set([dep_var]))  # List of all the column names (not the target class)

# Copy the dataframe for a hacky solution to provide the TabularList.from_df() method with a full dataset
data = df.copy()

# Rip out the target classification column and save it
df_y = df[dep_var]
del df[dep_var]

# Encode and transform the data
# Since the target classification is categorical, we need to have a way to interpret the neural network's output.
#  In this case, we are assigning each possible output with an integer 0..n-1, another option would be to use one-hot
#  encoding here. This may be explored further.
encoder = LabelEncoder()
encoder.fit(df_y)
data_y = encoder.transform(df_y)
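
# Sketch of the one-hot alternative mentioned in the comment above (kept separate from
# data_y, which remains the integer encoding used below; .toarray() densifies the
# sparse matrix that OneHotEncoder returns by default):
from sklearn.preprocessing import OneHotEncoder
data_y_onehot = OneHotEncoder().fit_transform(data_y.reshape(-1, 1)).toarray()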

# Normalize the x data and rename for consistency
data_x = (df - df.mean()) / (df.max() - df.min())
data_x = data_x.values

# Set up the metrics we want to collect. I wanted TP,TN,FP,FN but that wasn't available. Recall and precision are still
#  extremely helpful for evaluating the model
metrics = [accuracy, Recall(), Precision()]

# Keep track of which fold we are on
fold_num = 1
total_folds = 10

# Get the indices for the fold and train on that fold
from deap import creator, base, tools, algorithms
from scoop import futures
import random
import numpy
from scipy import interpolate
import matplotlib.pyplot as plt

# Read in data from CSV
# Data set from https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
dfData = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.xls', sep=',')
#dfData=pd.read_csv('bank-additional-full.csv', sep=';')

# Encode the classification labels to numbers
# Get classes and one hot encoded feature vectors
le = LabelEncoder()
le.fit(dfData['Churn'])
allClasses = le.transform(dfData['Churn'])
allFeatures = dfData.drop(['Churn'], axis=1)
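
# The comment above mentions one-hot encoded feature vectors; a minimal sketch of that
# step (assumption: the remaining object-typed columns should be dummy-encoded before
# the split below):
allFeatures = pd.get_dummies(allFeatures)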

# Form training, test, and validation sets
X_trainAndTest, X_validation, y_trainAndTest, y_validation = train_test_split(
    allFeatures, allClasses, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_trainAndTest,
                                                    y_trainAndTest,
                                                    test_size=0.20,
                                                    random_state=42)


# Feature subset fitness function
def getFitness(individual, X_train, X_test, y_train, y_test):
def main():
    print('Using Keras version: ', keras.__version__)

    usage = 'usage: %prog [options]'
    parser = argparse.ArgumentParser(usage)
    parser.add_argument(
        '-t',
        '--train_model',
        dest='train_model',
        help=
        'Option to train model or simply make diagnostic plots (0=False, 1=True)',
        default=1,
        type=int)
    parser.add_argument('-s',
                        '--suff',
                        dest='suffix',
                        help='Option to choose suffix for training',
                        default='',
                        type=str)
    parser.add_argument('-p',
                        '--para',
                        dest='hyp_param_scan',
                        help='Option to run hyper-parameter scan',
                        default=0,
                        type=int)
    parser.add_argument(
        '-i',
        '--inputs_file_path',
        dest='inputs_file_path',
        help=
        'Path to directory containing directories \'Bkgs\' and \'Signal\' which contain background and signal ntuples respectively.',
        default='',
        type=str)
    args = parser.parse_args()
    do_model_fit = args.train_model
    suffix = args.suffix

    # Create instance of the input files directory
    #inputs_file_path = 'HHWWgg_DataSignalMCnTuples/2017/'
    inputs_file_path = ''

    hyp_param_scan = args.hyp_param_scan
    # Set model hyper-parameters
    weights = 'BalanceYields'  # 'BalanceYields' or 'BalanceNonWeighted'
    optimizer = 'Nadam'
    validation_split = 0.25
    # hyper-parameter scan results
    if weights == 'BalanceNonWeighted':
        learn_rate = 0.0005
        epochs = 200
        batch_size = 200
    if weights == 'BalanceYields':
        learn_rate = 0.0001
        epochs = 600
        batch_size = 500
        #epochs = 10
        #batch_size=200

    # Create instance of output directory where all results are saved.
    output_directory = 'HHWWyyDNN_binary_%s_%s/' % (suffix, weights)
    check_dir(output_directory)
    hyperparam_file = os.path.join(output_directory,
                                   'additional_model_hyper_params.txt')
    additional_hyperparams = open(hyperparam_file, 'w')
    additional_hyperparams.write("optimizer: " + optimizer + "\n")
    additional_hyperparams.write("learn_rate: " + str(learn_rate) + "\n")
    additional_hyperparams.write("epochs: " + str(epochs) + "\n")
    additional_hyperparams.write("validation_split: " + str(validation_split) +
                                 "\n")
    additional_hyperparams.write("weights: " + weights + "\n")
    # Create plots subdirectory
    plots_dir = os.path.join(output_directory, 'plots/')
    input_var_jsonFile = open('input_variables.json', 'r')
    #selection_criteria = '( 1.>0. )'
    selection_criteria = '( ( fabs(weight) < 10 ) )'

    # Load Variables from .json
    variable_list = json.load(input_var_jsonFile, encoding="utf-8").items()

    # Create list of headers for dataset .csv
    column_headers = []
    for key, var in variable_list:
        column_headers.append(key)
    column_headers.append('weight')
    #column_headers.append('kinWeight')
    #column_headers.append('weight_NLO_node')
    column_headers.append('unweighted')
    column_headers.append('target')
    column_headers.append('key')
    column_headers.append('classweight')
    column_headers.append('process_ID')

    # Load ttree into .csv including all variables listed in column_headers
    print('<train-DNN> Input file path: ', inputs_file_path)
    outputdataframe_name = '%s/output_dataframe.csv' % (output_directory)
    if os.path.isfile(outputdataframe_name):
        data = pandas.read_csv(outputdataframe_name)
        print('<train-DNN> Loading data .csv from: %s . . . . ' %
              (outputdataframe_name))
    else:
        print('<train-DNN> Creating new data .csv @: %s . . . . ' %
              (inputs_file_path))
        data = load_data(inputs_file_path, column_headers, selection_criteria)
        # Change sentinel values to speed up training.
        data = data.mask(data < -25., -9.)
        data = data.mask(data == np.inf, -9.)
        data = data.mask(data == -np.inf, -9.)
        # (data == np.nan) is always False, so use isna() to actually catch NaNs
        data = data.mask(data.isna(), -9.)
        data_inf = data.isin([np.inf, -np.inf])
        data_nan = data.isin([np.nan])
        count_inf = np.isinf(data_inf).values.sum()
        count_nan = np.isinf(data_nan).values.sum()
        if count_inf > 0:
            print("WARNING ---> It contained " + str(count_inf) + " infinite values")
        if count_nan > 0:
            print("WARNING ---> It contained " + str(count_nan) + " NaN values")
        data.to_csv(outputdataframe_name, index=False)
        data = pandas.read_csv(outputdataframe_name)

    print('<main> data columns: ', (data.columns.values.tolist()))
    n = len(data)
    nHH = len(data.iloc[data.target.values == 1])
    nbckg = len(data.iloc[data.target.values == 0])
    print("Total (train+validation) length of HH = %i, bckg = %i" %
          (nHH, nbckg))

    # Make instance of plotter tool
    Plotter = plotter()
    # Create statistically independant training/testing data
    traindataset, valdataset = train_test_split(data, test_size=0.25)
    valdataset.to_csv((output_directory + 'valid_dataset.csv'), index=False)

    print('<train-DNN> Training dataset shape: ', traindataset.shape)
    print('<train-DNN> Validation dataset shape: ', valdataset.shape)

    # Event weights
    weights_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'weight']
    #weights_for_HH_NLO = traindataset.loc[traindataset['process_ID']=='HH', 'weight_NLO_node']
    weights_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'weight']
    weights_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'weight']
    weights_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'weight']
    weights_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'weight']
    weights_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'weight']
    weights_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'weight']
    weights_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'weight']
    weights_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'weight']

    HHsum_weighted = sum(weights_for_HH)
    Hggsum_weighted = sum(weights_for_Hgg)
    DiPhotonsum_weighted = sum(weights_for_DiPhoton)
    GJetsum_weighted = sum(weights_for_GJet)
    QCDsum_weighted = sum(weights_for_QCD)
    DYsum_weighted = sum(weights_for_DY)
    TTGsJetssum_weighted = sum(weights_for_TTGsJets)
    WGsJetssum_weighted = sum(weights_for_WGsJets)
    WWsum_weighted = sum(weights_for_WW)
    bckgsum_weighted = Hggsum_weighted + DiPhotonsum_weighted + GJetsum_weighted + QCDsum_weighted + DYsum_weighted + TTGsJetssum_weighted + WGsJetssum_weighted + WWsum_weighted

    nevents_for_HH = traindataset.loc[traindataset['process_ID'] == 'HH',
                                      'unweighted']
    nevents_for_Hgg = traindataset.loc[traindataset['process_ID'] == 'Hgg',
                                       'unweighted']
    nevents_for_DiPhoton = traindataset.loc[traindataset['process_ID'] ==
                                            'DiPhoton', 'unweighted']
    nevents_for_GJet = traindataset.loc[traindataset['process_ID'] == 'GJet',
                                        'unweighted']
    nevents_for_QCD = traindataset.loc[traindataset['process_ID'] == 'QCD',
                                       'unweighted']
    nevents_for_DY = traindataset.loc[traindataset['process_ID'] == 'DY',
                                      'unweighted']
    nevents_for_TTGsJets = traindataset.loc[traindataset['process_ID'] ==
                                            'TTGsJets', 'unweighted']
    nevents_for_WGsJets = traindataset.loc[traindataset['process_ID'] ==
                                           'WGsJets', 'unweighted']
    nevents_for_WW = traindataset.loc[traindataset['process_ID'] == 'WW',
                                      'unweighted']

    HHsum_unweighted = sum(nevents_for_HH)
    Hggsum_unweighted = sum(nevents_for_Hgg)
    DiPhotonsum_unweighted = sum(nevents_for_DiPhoton)
    GJetsum_unweighted = sum(nevents_for_GJet)
    QCDsum_unweighted = sum(nevents_for_QCD)
    DYsum_unweighted = sum(nevents_for_DY)
    TTGsJetssum_unweighted = sum(nevents_for_TTGsJets)
    WGsJetssum_unweighted = sum(nevents_for_WGsJets)
    WWsum_unweighted = sum(nevents_for_WW)
    bckgsum_unweighted = Hggsum_unweighted + DiPhotonsum_unweighted + GJetsum_unweighted + QCDsum_unweighted + DYsum_unweighted + TTGsJetssum_unweighted + WGsJetssum_unweighted + WWsum_unweighted

    #HHsum_weighted= HHsum_weighted*2.
    HHsum_unweighted = HHsum_unweighted * 2.

    if weights == 'BalanceYields':
        print('HHsum_weighted= ', HHsum_weighted)
        print('Hggsum_weighted= ', Hggsum_weighted)
        print('DiPhotonsum_weighted= ', DiPhotonsum_weighted)
        print('GJetsum_weighted= ', GJetsum_weighted)
        print('QCDsum_weighted= ', QCDsum_weighted)
        print('DYsum_weighted= ', DYsum_weighted)
        print('TTGsJetssum_weighted= ', TTGsJetssum_weighted)
        print('WGsJetssum_weighted= ', WGsJetssum_weighted)
        print('WWsum_weighted= ', WWsum_weighted)
        print('bckgsum_weighted= ', bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = HHsum_unweighted / HHsum_weighted
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_weighted)

    if weights == 'BalanceNonWeighted':
        print('HHsum_unweighted= ', HHsum_unweighted)
        print('Hggsum_unweighted= ', Hggsum_unweighted)
        print('DiPhotonsum_unweighted= ', DiPhotonsum_unweighted)
        print('GJetsum_unweighted= ', GJetsum_unweighted)
        print('QCDsum_unweighted= ', QCDsum_unweighted)
        print('DYsum_unweighted= ', DYsum_unweighted)
        print('TTGsJetssum_unweighted= ', TTGsJetssum_unweighted)
        print('WGsJetssum_unweighted= ', WGsJetssum_unweighted)
        print('WWsum_unweighted= ', WWsum_unweighted)
        print('bckgsum_unweighted= ', bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'HH',
                         ['classweight']] = 1.
        traindataset.loc[traindataset['process_ID'] == 'Hgg',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DiPhoton',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'GJet',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'QCD',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'DY',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'TTGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WGsJets',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)
        traindataset.loc[traindataset['process_ID'] == 'WW',
                         ['classweight']] = (HHsum_unweighted /
                                             bckgsum_unweighted)

    # Remove column headers that aren't input variables
    training_columns = column_headers[:-6]
    print('<train-DNN> Training features: ', training_columns)

    column_order_txt = '%s/column_order.txt' % (output_directory)
    column_order_file = open(column_order_txt, "wb")
    for tc_i in training_columns:
        line = tc_i + "\n"
        pickle.dump(str(line), column_order_file)

    num_variables = len(training_columns)

    # Extract training and testing data
    X_train = traindataset[training_columns].values
    X_test = valdataset[training_columns].values

    # Extract labels data
    Y_train = traindataset['target'].values
    Y_test = valdataset['target'].values

    # Create dataframe containing input features only (for correlation matrix)
    train_df = data.iloc[:traindataset.shape[0]]

    # Event weights if wanted
    #train_weights = traindataset['weight'].values*traindataset['weight_NLO_node'].values
    #test_weights = valdataset['weight'].values*valdataset['weight_NLO_node'].values
    #train_weights = abs(traindataset['weight'].values)*abs(traindataset['weight_NLO_node'].values)
    #test_weights = abs(valdataset['weight'].values)*abs(valdataset['weight_NLO_node'].values)
    #train_weights = abs(traindataset['weight'].values)*abs(traindataset['kinWeight'].values)*abs(traindataset['weight_NLO_node'].values)
    #test_weights = abs(valdataset['weight'].values)*abs(valdataset['kinWeight'].values)*abs(valdataset['weight_NLO_node'].values)
    train_weights = abs(traindataset['weight'].values)
    test_weights = abs(valdataset['weight'].values)

    # Weights applied during training.
    if weights == 'BalanceYields':
        #trainingweights = traindataset.loc[:,'classweight']*traindataset.loc[:,'weight']*traindataset.loc[:,'weight_NLO_node']
        #trainingweights = traindataset.loc[:,'classweight'].abs()*traindataset.loc[:,'weight'].abs()*traindataset.loc[:,'weight_NLO_node'].abs()
        #trainingweights = traindataset.loc[:,'classweight'].abs() * traindataset.loc[:,'weight'].abs() * traindataset.loc[:,'kinWeight'].abs() * traindataset.loc[:,'weight_NLO_node'].abs()
        trainingweights = traindataset.loc[:, 'classweight'].abs(
        ) * traindataset.loc[:, 'weight'].abs()
    if weights == 'BalanceNonWeighted':
        trainingweights = traindataset.loc[:, 'classweight']
    trainingweights = np.array(trainingweights)

    ## Input Variable Correlation plot
    correlation_plot_file_name = 'correlation_plot'
    Plotter.correlation_matrix(train_df)
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.png')
    Plotter.save_plots(dir=plots_dir,
                       filename=correlation_plot_file_name + '.pdf')

    # Fit label encoder to Y_train
    newencoder = LabelEncoder()
    newencoder.fit(Y_train)
    # Transform to encoded array
    encoded_Y = newencoder.transform(Y_train)
    encoded_Y_test = newencoder.transform(Y_test)

    if do_model_fit == 1:
        print('<train-BinaryDNN> Training new model . . . . ')
        histories = []
        labels = []

        if hyp_param_scan == 1:
            print('Begin at local time: ', time.localtime())
            hyp_param_scan_name = 'hyp_param_scan_results.txt'
            hyp_param_scan_results = open(hyp_param_scan_name, 'a')
            time_str = str(time.localtime()) + '\n'
            hyp_param_scan_results.write(time_str)
            hyp_param_scan_results.write(weights)
            learn_rates = [0.00001, 0.0001]
            epochs = [150, 200]
            batch_size = [400, 500]
            param_grid = dict(learn_rate=learn_rates,
                              epochs=epochs,
                              batch_size=batch_size)
            model = KerasClassifier(build_fn=gscv_model, verbose=0)
            grid = GridSearchCV(estimator=model,
                                param_grid=param_grid,
                                n_jobs=-1)
            grid_result = grid.fit(X_train,
                                   Y_train,
                                   shuffle=True,
                                   sample_weight=trainingweights)
            print("Best score: %f , best params: %s" %
                  (grid_result.best_score_, grid_result.best_params_))
            hyp_param_scan_results.write(
                "Best score: %f , best params: %s\n" %
                (grid_result.best_score_, grid_result.best_params_))
            means = grid_result.cv_results_['mean_test_score']
            stds = grid_result.cv_results_['std_test_score']
            params = grid_result.cv_results_['params']
            for mean, stdev, param in zip(means, stds, params):
                print("Mean (stdev) test score: %f (%f) with parameters: %r" %
                      (mean, stdev, param))
                hyp_param_scan_results.write(
                    "Mean (stdev) test score: %f (%f) with parameters: %r\n" %
                    (mean, stdev, param))
            exit()
        else:
            # Define model for analysis
            early_stopping_monitor = EarlyStopping(patience=100,
                                                   monitor='val_loss',
                                                   min_delta=0.0001,
                                                   verbose=1)
            #model = baseline_model(num_variables, learn_rate=learn_rate)
            model = new_model(num_variables, learn_rate=learn_rate)

            # Fit the model
            # Batch size = examples before updating weights (larger = faster training)
            # Epoch = One pass over data (useful for periodic logging and evaluation)
            #class_weights = np.array(class_weight.compute_class_weight('balanced',np.unique(Y_train),Y_train))
            history = model.fit(X_train,
                                Y_train,
                                validation_split=validation_split,
                                epochs=epochs,
                                batch_size=batch_size,
                                verbose=1,
                                shuffle=True,
                                sample_weight=trainingweights,
                                callbacks=[early_stopping_monitor])
            histories.append(history)
            labels.append(optimizer)
            # Make plot of loss function evolution
            #Plotter.plot_training_progress_acc(histories, labels)
            #acc_progress_filename = 'DNN_acc_wrt_epoch'
            #Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename+'.png')
            #Plotter.save_plots(dir=plots_dir, filename=acc_progress_filename+'.pdf')

            Plotter.history_plot(history, label='loss')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.png')
            Plotter.save_plots(dir=plots_dir, filename='history_loss.pdf')

            Plotter.history_plot(history, label='acc')
            Plotter.save_plots(dir=plots_dir, filename='history_accuracy.png')
            Plotter.save_plots(dir=plots_dir, filename='history_accuracy.pdf')
    else:
        model_name = os.path.join(output_directory, 'model.h5')
        model = load_trained_model(model_name)

    # Node probabilities for training sample events
    result_probs = model.predict(np.array(X_train))
    result_classes = model.predict_classes(np.array(X_train))

    # Node probabilities for testing sample events
    result_probs_test = model.predict(np.array(X_test))
    result_classes_test = model.predict_classes(np.array(X_test))

    # Store model in file
    model_output_name = os.path.join(output_directory, 'model.h5')
    model.save(model_output_name)
    weights_output_name = os.path.join(output_directory, 'model_weights.h5')
    model.save_weights(weights_output_name)
    model_json = model.to_json()
    model_json_name = os.path.join(output_directory, 'model_serialised.json')
    with open(model_json_name, 'w') as json_file:
        json_file.write(model_json)
    model.summary()
    model_schematic_name = os.path.join(output_directory,
                                        'model_schematic.png')
    #plot_model(model, to_file=model_schematic_name, show_shapes=True, show_layer_names=True)

    print('================')
    print('Training event labels: ', len(Y_train))
    print('Training event probs', len(result_probs))
    print('Training event weights: ', len(train_weights))
    print('Testing events: ', len(Y_test))
    print('Testing event probs', len(result_probs_test))
    print('Testing event weights: ', len(test_weights))
    print('================')

    # Initialise output directory.
    Plotter.plots_directory = plots_dir
    Plotter.output_directory = output_directory

    Plotter.ROC(model, X_test, Y_test, X_train, Y_train)
    Plotter.save_plots(dir=plots_dir, filename='ROC.png')
    Plotter.save_plots(dir=plots_dir, filename='ROC.pdf')
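
    # Hedged sketch (assuming scikit-learn is available and the network has a single
    # sigmoid output node): the ROC AUC can also be computed directly from the node
    # probabilities stored above.
    from sklearn.metrics import roc_auc_score
    auc_train = roc_auc_score(Y_train, result_probs.ravel(), sample_weight=train_weights)
    auc_test = roc_auc_score(Y_test, result_probs_test.ravel(), sample_weight=test_weights)
    print('Train AUC: %.4f , Test AUC: %.4f' % (auc_train, auc_test))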
Ejemplo n.º 53
0
le = None

# grab all image paths in the current config.TRAIN
print("[INFO] processing '{} '...".format(config.TRAIN))
p = os.path.sep.join([config.BASE_PATH, config.TRAIN])
imagePaths = list(paths.list_images(p))

# randomly shuffle the image paths and then extract the class
# labels from the file paths
random.shuffle(imagePaths)
labels = [p.split(os.path.sep)[-2] for p in imagePaths]

# if the label encoder is None, create it
if le is None:
    le = LabelEncoder()
    le.fit(labels)
# open the output CSV file for writing
csvPath = os.path.sep.join(
    [config.BASE_CSV_PATH, "{}.csv".format(config.TRAIN)])

# write the field names of the CSV; note that the length of the header depends on the CNN output
cols = ["feat_{}".format(i) for i in range(0, 7 * 7 * 1024)]
fieldnames = ["class"] + cols

with open(csvPath, 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    writer.writerow(fieldnames)

    # loop over the images in batches
    for (b, i) in enumerate(range(0, len(imagePaths), config.BATCH_SIZE)):
        # extract the batch of images and labels, then initialize the
opt_tau = thresholds[opt_idx]
# Accuracy at maximal F1 score
opt_acc = accuracy_score(identical, distances < opt_tau)

# Plot F1 score and accuracy as function of distance threshold
plt.plot(thresholds, f1_scores, label='F1 score')
plt.plot(thresholds, acc_scores, label='Accuracy')
plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
plt.title("Accuracy at threshold " + str(opt_tau) + " = " + str(opt_acc))
plt.xlabel('Distance threshold')
plt.legend()

targets = np.array([m.name for m in metadata])

encoder = LabelEncoder()
encoder.fit(targets)

# Numerical encoding of identities
y = encoder.transform(targets)

train_idx = np.arange(metadata.shape[0]) % 2 != 0
test_idx = np.arange(metadata.shape[0]) % 2 == 0

# 50 train examples of 10 identities (5 examples each)
X_train = embedded[train_idx]
# 50 test examples of 10 identities (5 examples each)
X_test = embedded[test_idx]

y_train = y[train_idx]
y_test = y[test_idx]
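
# Minimal classification sketch on the embeddings (assuming scikit-learn; a linear SVM
# is a common choice at this point, but any classifier would do):
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc.fit(X_train, y_train)
print('Embedding classifier accuracy: %.3f' % svc.score(X_test, y_test))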
Ejemplo n.º 55
0
                   prefix="combine_model")


if __name__ == '__main__':
    # load dataset management file and gene count matrix
    df = pd.read_csv(os.path.join(DATASET_PATH, "dataset.tsv"),
                     header=0,
                     sep='\t',
                     index_col=0)
    sorted_cm = pd.read_csv(os.path.join(DATASET_PATH, "cm_final.tsv"),
                            header=0,
                            sep='\t',
                            index_col=0)
    # convert categorical string label to numerical label
    label_encoder = LabelEncoder()
    label_encoder.fit(list(set(sorted_cm.loc[:, LABEL_COLUMN])))
    class_list = label_encoder.classes_
    # split training data and test data by individual samples
    df["sample"] = df.index.map(find_sample_name)
    df_sample_label = df.groupby("sample").first()
    train_index, test_index, _, _ = train_test_split(
        df_sample_label.index,
        df_sample_label.label,
        train_size=train_ratio,
        stratify=df_sample_label.label)
    # subset training and test gene count matrix
    train_cm = sorted_cm[df["sample"].isin(train_index)]
    test_cm = sorted_cm[df["sample"].isin(test_index)]
    # calculate number of genes
    cm_shape = train_cm.shape[1] - ADDITIONAL_COLUMN
    if cross_validation:
Ejemplo n.º 56
0
ages = titanic_reduced.iloc[:, 1]  # get age column
ages = np.array(ages)

mean_age = np.mean(ages[~np.isnan(ages)])  # compute the mean age

ages[np.isnan(ages)] = mean_age  # replace nan with mean age

titanic_reduced["age"] = ages  # repalace data in dataframe

print("\nPassanger data")
print(titanic_reduced.loc[12])

# convert sex male/female to 0/1
enc = LabelEncoder()
label_encoder = enc.fit(titanic_reduced["sex"])

#print "Categorical classes:", label_encoder.classes_
integer_classes = label_encoder.transform(label_encoder.classes_)

#print "Integer classes:", integer_classes
titanic_reduced["sex"] = label_encoder.transform(titanic_reduced["sex"])
#titanic_X[:, 2] = t

print('\nEncoding of categorical values\n')
print("Passenger data")
print(titanic_reduced.loc[12])

# ONE HOT ENCODING

# First, convert classes to 0-(N-1) integers using label_encoder
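
# Hedged sketch of that one-hot step (older scikit-learn versions accept the
# integer-encoded column directly; .toarray() densifies the sparse result):
from sklearn.preprocessing import OneHotEncoder
sex_onehot = OneHotEncoder().fit_transform(titanic_reduced[["sex"]]).toarray()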
Ejemplo n.º 57
0
def create_dataset(train_path, test_path=None, valid_size=0.1, batch_size=32):
    X_train, X_val, y_train, y_val = utils.read_data(
        data_path=train_path,
        valid_size=valid_size)

    enc = LabelEncoder()
    enc.fit(y_train)
    print(enc.classes_)
    
    with open("models/label_enc.pkl", "wb") as f:
        pickle.dump(enc, f)

    y_train = enc.transform(y_train)
    y_train = to_categorical(y_train)

    train_gen = image.ImageDataGenerator(
        preprocessing_function=preprocess_input,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)

    train_gen.fit(X_train)

    y_val = enc.transform(y_val)
    y_val = to_categorical(y_val)

    valid_gen = image.ImageDataGenerator(
        preprocessing_function=preprocess_input)

    valid_gen.fit(X_val)

    print("train: {}".format(len(y_train)))
    print("valid: {}".format(len(y_val)))

    if test_path is not None:
        X_test, y_test = utils.read_data(
            data_path=test_path,
            valid_size=0)

        y_test = enc.transform(y_test)
        y_test = to_categorical(y_test)

        test_gen = image.ImageDataGenerator(
            preprocessing_function=preprocess_input)
        
        test_gen.fit(X_test)

        train_gen = train_gen.flow(X_train, y_train, batch_size)
        valid_gen = valid_gen.flow(X_val, y_val, batch_size)
        test_gen = test_gen.flow(X_test, y_test, batch_size)
        
        print("test: {}".format(len(y_test)))

        return train_gen, valid_gen, test_gen

    # No test set requested: only build the train/validation iterators
    train_gen = train_gen.flow(X_train, y_train, batch_size)
    valid_gen = valid_gen.flow(X_val, y_val, batch_size)

    return train_gen, valid_gen
class AudioDataGenerator:
    def __init__(self, audio_representation: AudioRepresentation,
                 kept_labels: List[str]):
        self._converter = AudioRepresentationConverterFactory.create_converter(
            audio_representation)
        self._encoder = LabelEncoder()
        self._num_classes = len(kept_labels)
        self._encoder.fit(kept_labels)

    def _read_wavfile(self, sample_filepath):
        file_data = wavfile.read(sample_filepath)
        samples = file_data[1]
        sr = file_data[0]
        # left-pad short clips with zeros so every clip spans at least sr samples
        if len(samples) < sr:
            samples = np.pad(
                samples,
                pad_width=(sr - len(samples), 0),
                mode="constant",
                constant_values=(0, 0),
            )

        return sr, samples

    def get_data_shape(self, sample_filepath: Path):

        converted_sample = self._converter.convert_audio_signal(
            [self._read_wavfile(sample_filepath)])[0]
        return converted_sample.shape

    def flow(self, samples: List[Tuple[Path, str]], batch_size: int):
        random.shuffle(samples)
        while True:
            for chunk in chunks(samples, batch_size):
                files = [self._read_wavfile(path) for path, _ in chunk]

                converted = self._converter.convert_audio_signal(files)
                labels = [label for _, label in chunk]
                X = np.concatenate([converted])
                y = to_categorical(self._encoder.transform(labels),
                                   self._num_classes)

                yield X, y

    def flow_in_memory(self, samples: List[Tuple[Path, str]], batch_size: int):
        random.shuffle(samples)
        data = []
        for chunk in chunks(samples, batch_size):
            files = [self._read_wavfile(path) for path, _ in chunk]

            converted = self._converter.convert_audio_signal(files)
            labels = [label for _, label in chunk]
            data.append((
                np.concatenate([converted]),
                to_categorical(self._encoder.transform(labels),
                               num_classes=self._num_classes),
            ))

        while True:
            for chunk in data:
                yield chunk
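# A hedged usage sketch (the representation name, label set, file paths and the
# tiny Keras model below are assumptions, not taken from this example):
#
#   labels = ["yes", "no", "stop"]
#   samples = [(Path("data/yes/0a.wav"), "yes"), (Path("data/no/0b.wav"), "no")]
#   gen = AudioDataGenerator(AudioRepresentation.SPECTROGRAM, kept_labels=labels)
#   input_shape = gen.get_data_shape(samples[0][0])
#   model = Sequential([Flatten(input_shape=input_shape),
#                       Dense(len(labels), activation="softmax")])
#   model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
#   model.fit(gen.flow(samples, batch_size=2),
#             steps_per_epoch=len(samples) // 2, epochs=5)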
Ejemplo n.º 59
0
numpy.random.seed(0)

dataframe = pandas.read_csv("iris.csv", header=None)
dataset = dataframe.values

X = dataset[:, 0:4].astype(float)
Y = dataset[:, 4]

# Preprocess the labels

# LabelEncoder from scikit-learn turns each text label
# (e.g. "Iris-setosa", "Iris-versicolor") into an integer.
# In this case, each of the three labels is simply assigned
# a number from 0-2.
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# to_categorical converts the numbered labels into a one-hot vector
dummy_y = np_utils.to_categorical(encoded_Y)
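# For reference (assuming the standard three-species iris file): encoder.classes_ is
# ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], encoded_Y holds values in
# {0, 1, 2}, and each row of dummy_y is a one-hot vector such as [1., 0., 0.].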


def baseline_model():
    model = Sequential()
    model.add(Dense(4, input_dim=4, init='normal', activation='relu'))
    model.add(Dense(3, init='normal', activation='sigmoid'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
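# One common way to evaluate baseline_model is k-fold cross-validation through the
# scikit-learn wrapper; a sketch assuming the keras.wrappers.scikit_learn API of
# this Keras generation (in older releases the epoch argument is spelled nb_epoch):
#
#   from keras.wrappers.scikit_learn import KerasClassifier
#   from sklearn.model_selection import KFold, cross_val_score
#
#   estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
#   kfold = KFold(n_splits=10, shuffle=True, random_state=0)
#   results = cross_val_score(estimator, X, dummy_y, cv=kfold)
#   print("Accuracy: %.2f%% (+/- %.2f%%)" % (results.mean() * 100, results.std() * 100))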
Ejemplo n.º 60
0
def main():

    epochs = 200
    sequence_len = 6
    glimse_size= [40 , 40 , 40]

    std = 0.2 
    h_g = h_l = 512
    hidden_size = 512

    num_classes = 2
    batch_size = 10

    learning_rate = 1e-4

    data_path = "/home/faltay/Dataset_Binary_Mirrored_Rot"    

    dementia_labels_path = "/home/faltay/3DCNN/Labels_Binary.pkl"  # load preprocessed dementia types
    save_model_path = "/home/faltay/Glimpse/Saved_Models_200"
    plot_path = "/home/faltay/Glimpse/Saved_Models_200/Plots/"


   # image transformation
    img_x = 224
    img_y = 160 
    
    begin_frame, end_frame, skip_frame = 70, 190, 1


    print("[INFO] Starting ...")
    
    # Detect devices
    use_cuda = T.cuda.is_available()                   # check if GPU exists
    T.cuda.set_device(1)
    device = T.device("cuda" if use_cuda else "cpu")   # use CPU or GPU
   

    # init tensorboard, a plugin used to visualize the loss and accuracy
    logs_dir = './logs/'
    if not os.path.exists(logs_dir):
        os.makedirs(logs_dir)

    tensorboard_dir = logs_dir
    configure(tensorboard_dir)

    #model = neuro_dram_net.NeuroDram_network()
    model = original_model.RecurrentAttention(glimse_size, h_g, h_l, std, hidden_size, num_classes) # Whole line added by FATIH

    # Total number of parameters
    pytorch_total_params = sum(p.numel() for p in model.parameters())

    # Total number of trainable parameters

    pytorch_total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)



    model.to(device) # Added by FATIH

    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, amsgrad=False)
    #optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0.1, amsgrad=False)

    print("[INFO] Data is now loading ...")
    # Load data
        
    # load dementia type names
    with open(dementia_labels_path, 'rb') as f:
        action_names = pickle.load(f)

    # convert labels -> category
    le = LabelEncoder()
    le.fit(action_names)

    # show which classes there are
    print(list(le.classes_))

    # convert category -> 1-hot
    action_category = le.transform(action_names).reshape(-1, 1)
    enc = OneHotEncoder()
    enc.fit(action_category)


    dementia_type = []
    fnames = os.listdir(data_path)

    all_names = []
    for f in fnames:
        if f != ".ipynb_checkpoints": 
        
            loc1 = f.find('_')
            test = (f[0: loc1])
            # Temporarily we do not classify the unknown patients
            if test != "4" and test != "3":
                dementia_type.append(f[0: loc1])
                all_names.append(f)
                
    # list all data files
    all_X_list = all_names              # all video file names
    all_y_list = labels2cat(le, dementia_type)    # all video labels
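    # labels2cat comes from a project helper module; a minimal equivalent
    # (assumption, for illustration only) would simply wrap the fitted LabelEncoder:
    #
    #   def labels2cat(label_encoder, label_list):
    #       return label_encoder.transform(label_list)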

    # train, test, val split so no patients are repeated 


    patients = []
    labels = []
    prev = []
    index = 0

    for pat in all_X_list: 
        prev.append(pat[2:10])
    for pat in all_X_list: 
        if pat[2:10] not in patients: 
            index = prev.index(pat[2:10])        

            patients.append(pat[2:10])
            labels.append(dementia_type[index])
            
    ## Undersampling: class "0" dominates, so randomly drop a fraction of its
    ## patients in two passes (drop probability 0.9, then 0.4). Building new
    ## lists avoids the index shift caused by deleting while iterating.

    for drop_prob in (0.9, 0.4):
        kept_patients, kept_labels = [], []
        for pat, lab in zip(patients, labels):
            if lab == "0" and np.random.choice([2, 5], p=[drop_prob, 1 - drop_prob]) == 2:
                continue
            kept_patients.append(pat)
            kept_labels.append(lab)
        patients, labels = kept_patients, kept_labels


        
    train_list_prev, val_list, train_label_prev, val_label = train_test_split(patients, labels, test_size=0.20, random_state=42)
    train_list,test_list , train_label, test_label = train_test_split(train_list_prev, train_label_prev, test_size=0.15, random_state=42) 

    # Training set (To match patients with scanners)

    train_list_def = []
    train_label_def = []

    for pat in train_list:
        for scan in all_X_list: 
            if pat == scan[2:10]:
                train_list_def.append(scan)
                
    for scan in train_list_def: 
        index = all_X_list.index(scan)        
        train_label_def.append(dementia_type[index])

    train_list = train_list_def
    train_label = np.array(train_label_def).astype(int)

    # Validation set 

    val_list_def = []
    val_label_def = []

    for pat in val_list:
        for scan in all_X_list: 
            if pat == scan[2:10]:
                val_list_def.append(scan)
                
    for scan in val_list_def: 
        index = all_X_list.index(scan)        
        val_label_def.append(dementia_type[index])

    val_list = val_list_def
    val_label = np.array(val_label_def).astype(int)

    # Test set 

    test_list_def = []
    test_label_def = []

    for pat in test_list:
        for scan in all_X_list: 
            if pat == scan[2:10]:
                test_list_def.append(scan)
                
    for scan in test_list_def: 
        index = all_X_list.index(scan)        
        test_label_def.append(dementia_type[index])

    test_list = test_list_def
    test_label = np.array(test_label_def).astype(int)


    # print("Size Dataset: ",len(all_X_list))
    # print("Train Dataset: ",len(train_list))
    # print("Test Dataset: ",len(test_list))
    # print("Val Dataset: ",len(val_list))


    save(save_model_path+'/train_list.npy', train_list)
    save(save_model_path+'/test_list.npy', test_list)
    save(save_model_path+'/train_label.npy', train_label)
    save(save_model_path+'/test_label.npy', test_label)
    save(save_model_path+'/val_list.npy', val_list)
    save(save_model_path+'/val_label.npy', val_label)

    # image transformation
    transform = transforms.Compose([transforms.Resize([img_x, img_y]),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.5], std=[0.5])])

    selected_frames = np.arange(begin_frame, end_frame, skip_frame).tolist()


    dataset_train = brain_data_loader.BrainDataset(data_path, train_list, train_label, selected_frames, transform = transform)
    dataset_validation = brain_data_loader.BrainDataset(data_path, val_list, val_label, selected_frames, transform = transform)
    dataset_test = brain_data_loader.BrainDataset(data_path, test_list, test_label, selected_frames, transform = transform)

    # Dealing with class imbalances: weight every sample by the inverse frequency
    # of its class so that WeightedRandomSampler draws roughly balanced batches.

    def make_weights_for_balanced_classes(images, nclasses):
        # count how many samples belong to each class
        count = [0] * nclasses
        for item in images:
            count[item[1]] += 1
            print(count, end="\r")
        print("")

        # per-class weight = N / count[class]; rarer classes get larger weights
        N = float(sum(count))
        weight_per_class = [0.] * nclasses
        for i in range(nclasses):
            weight_per_class[i] = N / float(count[i])
            print(weight_per_class[i], end="\r")
        print("")

        # per-sample weight = the weight of that sample's class
        weight = [0] * len(images)
        for idx, val in enumerate(images):
            weight[idx] = weight_per_class[val[1]]
            print(weight[idx], end="\r")

        return weight

    weights = make_weights_for_balanced_classes(dataset_train, num_classes)     
    weights = torch.DoubleTensor(weights)  
    samp = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights),replacement=True)    


    # DataLoader parameters; the weighted sampler replaces shuffling for the training set
    params_train = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True, 'sampler': samp} if use_cuda else {}

    
    #params_train = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}
    params_val =  {'batch_size': batch_size, 'shuffle': True, 'num_workers': 4, 'pin_memory': True} if use_cuda else {}


    train_loader = data.DataLoader(dataset_train, **params_train)
    valid_loader = data.DataLoader(dataset_validation, **params_val)
    test_loader = data.DataLoader(dataset_test, **params_val)

    print("[INFO] Data is loaded. ")
    # start training
    for epoch in range(epochs):
        # train, test model
        train_loss, train_acc, imgs, locs = train(model, train_loader, optimizer, sequence_len, epoch, device)
        
        with torch.no_grad():
            valid_acc = validation(model, valid_loader, optimizer, sequence_len, epoch, device,save_model_path, test=False)

        # print messages
        msg1 = "[Training] train loss: {:.3f} - train acc: {:.3f} "
        msg2 = " [Validation] val acc: {:.3f}"
        msg = msg1 + msg2
        print(msg.format(train_loss, train_acc, valid_acc))
        if train_acc > 98:
            # save glimpse images and locations once training accuracy is high enough
            print("Entered")
            pickle.dump(imgs, open(plot_path + str(train_acc) + "train_g_{}.p".format(epoch), "wb"))
            pickle.dump(locs, open(plot_path + str(train_acc) + "train_l_{}.p".format(epoch), "wb"))

    
    # Test
    test_acc = validation(model, test_loader, optimizer , sequence_len, epoch, device, save_model_path,test=True)