def main():
    """Train a credit-default classifier and write test-set predictions.

    Downloads the train/test CSVs, imputes missing values, standardises
    features with a scaler fitted on the training data only, undersamples
    the majority class, trains a RandomForest, persists it with joblib,
    then predicts on the test set and writes the result back to CSV.
    """
    dataset_train = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/treino.csv"
    )
    dataset_test = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/teste.csv"
    )

    # Normalise missing markers so the imputer sees NaN everywhere.
    dataset_train.replace(to_replace=[None], value=np.nan, inplace=True)
    dataset_test.replace(to_replace=[None], value=np.nan, inplace=True)

    raw_dataset_values_train = dataset_train.drop(columns=['inadimplente'])
    transformed_values_train = input_data(raw_dataset_values_train)
    transformed_values_test = input_data(dataset_test)

    # The same scale must be used for training and test data:
    # https://datascience.stackexchange.com/questions/27615/should-we-apply-normalization-to-test-data-as-well
    scaler = StandardScaler()
    standardized_values_train = scaler.fit_transform(transformed_values_train)
    standardized_values_test = scaler.transform(transformed_values_test)
    standardized_values_train = pd.DataFrame(
        standardized_values_train, columns=raw_dataset_values_train.keys())
    standardized_values_test = pd.DataFrame(
        standardized_values_test, columns=dataset_test.keys())

    train_x = standardized_values_train
    train_y = dataset_train.inadimplente
    test_x = standardized_values_test

    # Undersample the majority class to counter the class imbalance.
    undersample = RandomUnderSampler(sampling_strategy='majority')
    model = RandomForestClassifier()
    X_under, y_under = undersample.fit_resample(train_x, train_y)
    model.fit(X_under, y_under)

    filename = 'test_data_scientist_dataminer/modelo-adaboost.joblib'
    dump(model, filename)

    # BUGFIX: predictions previously used the in-memory `model`, leaving
    # `loaded_model` unused; predict with the persisted copy so the
    # dump/load round trip is actually exercised.
    loaded_model = load(filename)
    predictions = loaded_model.predict(test_x)

    # Re-read the raw (untouched) test CSV so the output keeps the
    # original values rather than the NaN-normalised ones.
    dataset_test_raw_df = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/teste.csv"
    )
    dataset_test_raw_df['inadimplente'] = predictions
    dataset_test_raw_df.to_csv(
        "test_data_scientist_dataminer/teste.csv", index=False)
def note():
    """Record a love note, appending to the notes file or creating it."""
    file_exists = os.path.isfile(LOVE_NOTES_FILE_PATH)
    if file_exists:
        # Notes file already present: append a single note entry.
        entry = dict(note=raw_input())
        append_data_into_file(entry, LOVE_NOTES_FILE_PATH)
    else:
        # First note: create the file with a top-level list of notes.
        initial = dict(notes=[dict(note=raw_input())])
        util.input_data(initial, LOVE_NOTES_FILE_PATH)
def complete_task():
    """Show today's agenda and mark a user-chosen task as completed.

    Lists all tasks (striking through finished ones), then prompts until a
    valid task number is given and flips that task's status to 1 (done).
    """
    not_valid_task_number = 1
    if os.path.isfile(TODAYS_TASKS_ENTRY_FILE_PATH):
        with open(TODAYS_TASKS_ENTRY_FILE_PATH, 'r') as todays_tasks_entry:
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input; this file is app-generated, but prefer
            # yaml.safe_load if the installed PyYAML supports it.
            contents = yaml.load(todays_tasks_entry)
        # First pass: is there any task still open?
        no_task_left = True
        for entry in contents['entries']:
            if entry['status'] == 0:
                no_task_left = False
        if no_task_left:
            chalk.green(
                'All tasks have been completed! Add a new task by entering "yoda diary nt"')
        else:
            click.echo('Today\'s agenda:')
            click.echo('----------------')
            click.echo("Number | Time | Task")
            click.echo("-------|---------|-----")
            i = 0
            for entry in contents['entries']:
                i += 1
                time = entry['time']
                # Strike through tasks that are already done.
                text = entry['text'] if entry['status'] == 0 else strike(
                    entry['text'])
                click.echo(" " + str(i) + " | " + time + ": " + text)
            while not_valid_task_number:
                chalk.blue(
                    'Enter the task number that you would like to set as completed')
                # BUGFIX: reject non-numeric input (previously crashed int())
                # and numbers < 1 (previously accepted, silently completing
                # the wrong task via negative indexing).
                try:
                    task_to_be_completed = int(raw_input())
                except ValueError:
                    chalk.red('Please Enter a valid task number!')
                    continue
                if (task_to_be_completed < 1
                        or task_to_be_completed > len(contents['entries'])):
                    chalk.red('Please Enter a valid task number!')
                else:
                    contents['entries'][task_to_be_completed - 1]['status'] = 1
                    util.input_data(contents, TODAYS_TASKS_ENTRY_FILE_PATH)
                    not_valid_task_number = 0
    else:
        chalk.red(
            'There are no tasks for today. Add a new task by entering "yoda diary nt"')
def setup():
    """Configure the money module: default currency and initial balance."""
    util.create_folder(MONEY_CONFIG_FOLDER_PATH)
    # Bail out if config exists and the user declines to overwrite it.
    if util.ask_overwrite(MONEY_CONFIG_FILE_PATH):
        return
    chalk.blue('Enter default currency code:')
    currency_code = raw_input().strip()
    # Echo rates/symbol/name back so the user can confirm the code is valid.
    click.echo(currency_rates.get_rates(currency_code))
    click.echo(currency_codes.get_symbol(currency_code))
    click.echo(currency_codes.get_currency_name(currency_code))
    # BUGFIX: prompt previously misspelled "initial" as "inital".
    chalk.blue('Enter initial amount:')
    initial_money = int(raw_input().strip())
    setup_data = dict(currency_code=currency_code, initial_money=initial_money)
    util.input_data(setup_data, MONEY_CONFIG_FILE_PATH)
def setup():
    """Collect partner details (name, sex, location) and persist them."""
    util.create_folder(LOVE_CONFIG_FOLDER_PATH)
    # Abort when a config exists and the user declines to overwrite it.
    if util.ask_overwrite(LOVE_CONFIG_FILE_PATH):
        return
    answers = {}
    # Prompt in the original order: name, sex, then place.
    for prompt, key in (('Enter their name:', 'name'),
                        ('Enter sex(M/F):', 'sex'),
                        ('Where do they live?', 'place')):
        chalk.blue(prompt)
        answers[key] = raw_input().strip()
    util.input_data(
        dict(name=answers['name'], place=answers['place'], sex=answers['sex']),
        LOVE_CONFIG_FILE_PATH)
def new_note():
    """Prompt for a note and store it in today's notes file."""
    today_entry_check()
    chalk.blue('Input your entry for note:')
    note = raw_input().strip()
    if os.path.isfile(TODAYS_NOTES_ENTRY_FILE_PATH):
        # BUGFIX: dropped a pointless `with open(path, "r"):` that opened the
        # file for reading and never used the handle (sibling new_task()
        # appends without opening, too).
        setup_data = dict(
            time=now_time(),
            text=note
        )
        append_data_into_file(setup_data, TODAYS_NOTES_ENTRY_FILE_PATH)
    else:
        # First note of the day: create the file with an entries list.
        setup_data = dict(
            entries=[
                dict(
                    time=now_time(),
                    text=note
                )
            ]
        )
        util.input_data(setup_data, TODAYS_NOTES_ENTRY_FILE_PATH)
def new_task():
    """Prompt for a task and store it in today's tasks file (status 0 = open)."""
    today_entry_check()
    chalk.blue('Input your entry for task:')
    task_text = raw_input().strip()
    # A task entry records its creation time, text, and open/done status.
    entry = dict(time=now_time(), text=task_text, status=0)
    if os.path.isfile(TODAYS_TASKS_ENTRY_FILE_PATH):
        append_data_into_file(entry, TODAYS_TASKS_ENTRY_FILE_PATH)
    else:
        # First task of the day: create the file with an entries list.
        util.input_data(dict(entries=[entry]), TODAYS_TASKS_ENTRY_FILE_PATH)
def main():
    """Compare candidate classifiers for the credit-default problem.

    Loads the training data, imputes and rescales it, drops highly
    correlated features, undersamples the majority class, then validates
    several models via group k-fold cross-validation and a holdout split.
    """
    dataset = read_dataset(
        "https://raw.githubusercontent.com/dataminerdbm/test_data_scientist/main/treino.csv"
    )
    dataset.replace(to_replace=[None], value=np.nan, inplace=True)
    raw_dataset_values = dataset.drop(columns=['inadimplente'])
    transformed_values = input_data(raw_dataset_values)
    standardized_values = rescale_data(transformed_values, raw_dataset_values)
    # calc_corr_fig(standardized_values)
    x = standardized_values
    # Keep only one feature from each correlated group.
    x_without_corr_feat = standardized_values.drop(columns=[
        'vezes_passou_de_30_59_dias',
        'numero_de_vezes_que_passou_60_89_dias'
    ])
    y = dataset.inadimplente

    SEED = 7707
    np.random.seed(SEED)

    # Stratified split because the classes are heavily imbalanced.
    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=0.3, stratify=y)
    (train_x_without_corr_feat, test_x_without_corr_feat,
     train_y_without_corr_feat, test_y_without_corr_feat) = train_test_split(
        x_without_corr_feat, y, test_size=0.3, stratify=y)

    undersample = RandomUnderSampler(sampling_strategy='majority')
    X_without_corr_feat_under, y_without_corr_feat_under = undersample.fit_resample(
        x_without_corr_feat, y)
    x_under, y_under = undersample.fit_resample(x, y)
    train_x_under, train_y_under = undersample.fit_resample(train_x, train_y)
    train_x_without_corr_feat_under, train_y_without_corr_feat_under = undersample.fit_resample(
        train_x_without_corr_feat, train_y_without_corr_feat)
    # tsne_scatterplot(x_without_corr_feat, y)

    # Classifiers chosen for the character of the dataset: numeric features,
    # many instances, imbalanced, and not linearly separable.
    models = [
        DummyClassifier(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        GaussianNB(),
        AdaBoostClassifier(n_estimators=100),
        RandomForestClassifier(),
        BaggingClassifier(base_estimator=GaussianNB(), n_estimators=100)
    ]
    k_size = 5

    # Randomised, strictly positive group ids for GroupKFold (avoids
    # repeated groups; a better fit for imbalanced data):
    # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GroupKFold.html#sklearn.model_selection.GroupKFold
    # BUGFIX: the jitter size was hard-coded to 14662; use the actual
    # resampled length so the code survives a different undersample.
    x_under['idade_r'] = x_under.idade + np.random.randint(
        -2, 3, size=len(x_under))
    # BUGFIX: the positivity shift previously read `idade` instead of
    # `idade_r`, discarding the random jitter added on the line above.
    x_under['idade_r'] = x_under.idade_r + abs(x_under.idade_r.min()) + 1

    print("Validando modelos com todas as características")
    validate_models_cv(x_under, y_under, x_under.idade_r, models, k_size)
    validate_models_holdout(train_x_under, train_y_under, test_x, test_y,
                            models, k_size)

    print("Validando modelos sem as características correlacionadas")
    validate_models_cv(X_without_corr_feat_under, y_without_corr_feat_under,
                       x_under.idade_r, models, k_size)
    validate_models_holdout(train_x_without_corr_feat_under,
                            train_y_without_corr_feat_under,
                            test_x_without_corr_feat,
                            test_y_without_corr_feat, models, k_size)