# Silence warnings for cleaner console output.
warnings.simplefilter("ignore")

# Load the Boston housing data; the file has no header row and its first
# column is just a row index, so drop it before naming the columns.
df = pd.read_csv('data/housing/Boston.csv', header=None)
df.drop(0, axis=1, inplace=True)

column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
    'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
df.columns = column_names

# Peek at the first rows.
print(df.head())

# CHAS and RAD are treated as categorical; everything except those and the
# MEDV target is numeric.
categorical_cols = ['CHAS', 'RAD']
numeric_cols = [c for c in column_names
                if c not in categorical_cols and c != 'MEDV']
summarize_data(df, categorical_cols, numeric_cols, 'MEDV')

# Hold out 25% of the rows; fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=7)
feature_cols = df.columns[:-1]
train_X = train[feature_cols]
train_y = train['MEDV']
test_X = test[feature_cols]
test_y = test['MEDV']

# Fit and score the regression models, writing metrics under this path.
fit_regression_models(train_X, train_y, test_X, test_y,
                      'scikit_learn_pkg/metrics/housing')
#Disabling Warnings if not sys.warnoptions: warnings.simplefilter("ignore") df = pd.read_csv('data/telco/WA_Fn-UseC_-Telco-Customer-Churn.csv') #Conerting text column to float df.loc[df['TotalCharges'] == ' ', 'TotalCharges'] = np.nan df['TotalCharges'] = pd.to_numeric(df['TotalCharges']) print(df.head()) classes = summarize_data(df, [ 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod' ], ['tenure', 'MonthlyCharges', 'TotalCharges'], 'Churn', 'classification') #Converting text to integers in columns df['Partner'] = df['Partner'].map({'Yes': 1, 'No': 0}) df['PhoneService'] = df['PhoneService'].map({'Yes': 1, 'No': 0}) df['Dependents'] = df['Dependents'].map({'Yes': 1, 'No': 0}) df['MultipleLines'] = df['MultipleLines'].map({ 'Yes': 1, 'No': 0, 'No phone service': -1 }) df['OnlineSecurity'] = df['OnlineSecurity'].map({ 'Yes': 1, 'No': 0,
# Red and white wines ship as two separate ';'-delimited files.
red_df = pd.read_csv('data/wine/winequality-red.csv', sep=';')
white_df = pd.read_csv('data/wine/winequality-white.csv', sep=';')

# Tag each frame with its colour before merging so the origin survives.
red_df['color'] = 'red'
white_df['color'] = 'white'

# Stack both frames into a single dataset with a fresh index.
df = pd.concat([red_df, white_df], ignore_index=True)
print(df.head())

feature_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol'
]
summarize_data(df, ['color'], feature_columns, 'quality')

# Replace the text colour column with one-hot indicator columns.
color_dummies = pd.get_dummies(df['color'])
df = pd.concat([df, color_dummies], axis=1)
df.drop('color', axis=1, inplace=True)

# Everything except the 'quality' target is a predictor.
independent_variables = [col for col in df.columns if col != 'quality']

# Hold out 25% of the rows; fixed seed keeps the split reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=7)
def process_data(data, opt):
    """Build the label ordering, optional label-adjacency mask, and DataLoaders.

    Args:
        data: dict with 'train'/'valid'/'test' splits (each holding 'src',
            'tgt', 'adj' instance lists), a 'dict' of src/tgt vocabularies,
            and a 'settings' object. Label ids 0-3 appear to be special
            tokens (they are pinned in the ranking and excluded from the
            adjacency matrix) — TODO confirm against vocab construction.
        opt: option namespace; mutated in place (vocab sizes, sequence
            lengths) in addition to being returned.

    Returns:
        (train_data, valid_data, test_data, label_adj_matrix, opt), where
        label_adj_matrix is None unless opt.label_mask == 'prior'.
    """
    label_adj_matrix = None
    if opt.label_mask == 'prior':
        # Symmetric co-occurrence mask over the non-special labels: two
        # labels are connected iff they ever appear together in a training
        # target sequence. Diagonal starts at 1 (torch.eye).
        print('using prior mask')
        adj_matrix = torch.eye(len(data['dict']['tgt']) - 4)
        for sample in data['train']['tgt']:
            # sample[0] / sample[-1] look like start/end markers — skipped.
            for i, idx1 in enumerate(sample[1:-1]):
                for idx2 in sample[i + 1:-1]:
                    if idx1 != idx2:
                        adj_matrix[idx1 - 4, idx2 - 4] = 1
                        adj_matrix[idx2 - 4, idx1 - 4] = 1
        label_adj_matrix = adj_matrix

    # One-hot label matrix over the training set, used only to rank labels
    # by training-set frequency.
    label_vals = torch.zeros(len(data['train']['tgt']), len(data['dict']['tgt']))
    for i in range(len(data['train']['tgt'])):
        indices = torch.from_numpy(np.array(data['train']['tgt'][i]))
        x = torch.zeros(len(data['dict']['tgt']))
        x.index_fill_(0, indices, 1)
        label_vals[i] = x

    # Rank label ids by frequency (most frequent first), then pin the special
    # tokens: id 2 leads, ids 0, 1, 3 trail.
    _, ranking = torch.sort(label_vals.sum(0), dim=0, descending=True)
    ranking = ranking.numpy().tolist()
    ranking = ranking[2:-2]
    ranking.insert(0, 2)
    ranking += [0, 1, 3]

    # BUG FIX: the original looped `for sample in ...: sample = sorted(...)`,
    # which rebinds the loop variable and discards the sorted result — the
    # target sequences were never actually reordered. Write them back.
    for split in ('train', 'valid', 'test'):
        data[split]['tgt'] = [
            sorted(sample, key=ranking.index) for sample in data[split]['tgt']
        ]

    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = opt.max_ar_length

    if opt.summarize_data:
        utils.summarize_data(data)

    # Only the sider dataset carries per-instance adjacency; drop it elsewhere.
    if 'sider' not in opt.dataset:
        data['train']['adj'], data['valid']['adj'], data['test']['adj'] = None, None, None

    # ========= Preparing DataLoader =========
    train_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        adj_insts=data['train']['adj'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        binary_relevance=opt.binary_relevance,
        cuda=opt.cuda,
        shuffle=True,
        drop_last=True)
    valid_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['valid']['src'],
        adj_insts=data['valid']['adj'],
        tgt_insts=data['valid']['tgt'],
        batch_size=opt.test_batch_size,
        binary_relevance=opt.binary_relevance,
        shuffle=False,
        cuda=opt.cuda)
    test_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['test']['src'],
        adj_insts=data['test']['adj'],
        tgt_insts=data['test']['tgt'],
        batch_size=opt.test_batch_size,
        binary_relevance=opt.binary_relevance,
        shuffle=False,
        cuda=opt.cuda)

    opt.src_vocab_size = train_data.src_vocab_size
    opt.tgt_vocab_size = train_data.tgt_vocab_size
    if opt.binary_relevance:
        # Special tokens are excluded from the binary-relevance label space.
        opt.tgt_vocab_size = opt.tgt_vocab_size - 4
        opt.max_ar_length = opt.tgt_vocab_size
    return train_data, valid_data, test_data, label_adj_matrix, opt
# Quick per-column profile: dtype, missing-value count, and either numeric
# summary stats or category value counts.
# FIX: this loop used Python-2 print statements (`print 'col: %s' % col`),
# which are a SyntaxError under Python 3 — the rest of the file uses print().
for col in df.columns:
    print('col: %s' % col)
    col_dtype = df[col].dtype
    print('dtype: %s' % col_dtype)
    print('na values: %d' % df[col].isnull().sum())
    if col_dtype == int or col_dtype == float:
        print(df[col].describe())
    else:
        print(df[col].value_counts())
    print('\n')

print(df.head())
classes = summarize_data(
    df, ['Dead_Alive', 'Gear', 'Entangled'],
    ['SCL_notch', 'SCL_tip', 'SCW', 'CCL_notch', 'TestLevel_Before'],
    'Species', 'classification')

#Converting text to integers in columns
df['Dead_Alive'] = df['Dead_Alive'].map({'alive': 1, 'dead': 0})
df['Entangled'] = df['Entangled'].map({'free': 1, 'entangled': 0})

#one hot encoding
df = pd.concat([df, pd.get_dummies(df['Gear'])], axis=1)
df.drop('Gear', axis=1, inplace=True)

#splitting data into test and train
train, test = train_test_split(df, test_size=0.25, random_state=7)
# assumes 'Species' is the first column so columns[1:] are the features —
# TODO confirm column order after the one-hot concat
train_X = train[train.columns[1:]]
train_y = train['Species']
from utils import fit_classification_models

# Disable warnings unless the user explicitly enabled them with -W.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# The iris CSV has no header row; name the four measurements plus the label.
feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_df = pd.read_csv('data/iris/iris.csv', header=None)
iris_df.columns = feature_names + ['species']
print(iris_df.head())

# No categorical predictors here — all four features are numeric.
classes = summarize_data(iris_df, [], feature_names, 'species',
                         'classification')

# Hold out a quarter of the rows; fixed seed keeps the split reproducible.
iris_train, iris_test = train_test_split(iris_df,
                                         test_size=0.25,
                                         random_state=7)
iris_train_X = iris_train[feature_names]
iris_train_y = iris_train['species']
iris_test_X = iris_test[feature_names]
iris_test_y = iris_test['species']
# Disable warnings unless the user explicitly enabled them with -W.
if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Auto-MPG dataset: the CSV has no header row.
df = pd.read_csv('data/auto/auto-mpg.csv', header=None)
df.columns = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]

#filling missing values with mean
# FIX: the original called df['horsepower'].fillna(..., inplace=True) —
# inplace on a column selection is chained assignment, which is unreliable
# and deprecated in modern pandas. Assign the result back instead.
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].mean())

print(df.head())
summarize_data(df, ['cylinders', 'model_year', 'origin'],
               ['displacement', 'horsepower', 'weight', 'acceleration'],
               'mpg')

#dropping car name column (free text, not usable as a feature)
df.drop('car_name', axis=1, inplace=True)

#one hot encoding
# NOTE(review): get_dummies without prefix= can yield duplicate column names
# if cylinders/origin/model_year share raw values — consider prefix='cyl'
# etc.; left unchanged here to preserve the downstream column layout.
df = pd.concat([
    df,
    pd.get_dummies(df['cylinders']),
    pd.get_dummies(df['origin']),
    pd.get_dummies(df['model_year'])
], axis=1)
df.drop(['cylinders', 'origin', 'model_year'], axis=1, inplace=True)

#splitting data into test and train