### Imports ###
import re
from argparse import ArgumentParser
from os import makedirs, path
from pathlib import Path
from time import sleep

from numpy import log, sqrt, square
from pandas import cut, get_dummies, read_csv
from selenium import webdriver
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier

# Logger, Pickler, persist_image and scrollToEnd are project helpers defined elsewhere in this repository.


def cancer_survival():
    ### Initiate Logger Instance ###
    logger = Logger.get_instance('cancer_survival')
    ### Initiate Pickler Instance ###
    pickler = Pickler.get_instance()
    data = pickler.load('radiomics_v2')
    if data is None:
        data = read_csv('data/radiomics_v2.csv')
        pickler.save(data, 'radiomics_v2')
    ### Rename Columns with Blank Space to use Underscores instead ###
    renamed_cols = {
        col: '_'.join(col.split(' '))
        for col in filter(lambda x: x[0] != 'v', data.columns)
    }
    logger.info(renamed_cols)
    data.rename(columns=renamed_cols, inplace=True)
    logger.info(f'Test Data: {data.shape}')
    ### Check for null values and find the percentage of null values ###
    logger.info(f'Columns with Null Data:\n{data.isnull().any()}')
    logger.info(f'Percentage of Null Data:\n{data.isnull().sum() / data.shape[0]}')
    ### Column Descriptions ###
    # Time_to_Event is the amount of time from the date of data collection until the patient's day of death or last check-up
    # Patient_Status is the patient's latest status (0 = alive, 1 = dead)
    # Patient_Status_at_3_Year is the patient's status at the 3-year mark (0 = alive, 1 = dead, -1 = unknown)
    # v_n's are radiomics features extracted from ROIs drawn by radiologists
    ### Drop Duplicates ###
    data.drop_duplicates(inplace=True)
    ### Clinical_C looks skewed, must be corrected using Squared Values ###
    data['Clinical_C_Squared'] = square(data.Clinical_C)
    data.drop(['Clinical_C'], axis=1, inplace=True)
    ### Clinical_D seems to have an outlying tail in the positive direction ###
    ### Remove Outliers ###
    data = data[(data.Clinical_D < data.Clinical_D.quantile(.95)) & (data.Clinical_D > 0)]
    ### Still skewed, try a Logarithmic Transform ###
    data['Clinical_D_Log'] = log(data.Clinical_D)
    ### Looks good, now drop the original Clinical_D column ###
    data.drop(['Clinical_D'], axis=1, inplace=True)
    ### Remove invalid rows with 0 Age ###
    data = data[data['Age'] > 0]
    age_range = 3
    num_bins = int(data.Age.max() / age_range)
    data['Age_Range'] = cut(data['Age'], num_bins, labels=False)
    ### When Patient_Status_at_3_Year is unknown (-1), impute with the last known Patient_Status ###
    data['Patient_Status_at_3_Year'] = data[['Patient_Status_at_3_Year', 'Patient_Status']].max(axis=1)
    ### Train Model ###
    fold = 5
    forest_params = dict(
        criterion=['gini'],
        max_features=['auto', 'sqrt', 'log2'],
        min_samples_split=range(2, 11),
        n_estimators=range(10, 21))
    gsv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=forest_params,
                       scoring='neg_mean_absolute_error', verbose=10, n_jobs=15, cv=fold)
    trainable_data = data[data.Patient_Status_at_3_Year.notna()]
    x_train = trainable_data.drop(
        ['Patient_ID', 'Time_to_Event', 'Patient_Status', 'Patient_Status_at_3_Year'], axis=1)
    y_train = trainable_data['Patient_Status_at_3_Year']
    y_stratify = trainable_data['Gender'] * y_train
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                        train_size=.8, stratify=y_stratify)
    gsv.fit(x_train, y_train)
    model = gsv.best_estimator_
    logger.info(model)
    y_predict = model.predict(x_test)
    num_wrong_predictions = (y_predict != y_test).sum()
    r2 = r2_score(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    rmse = sqrt(mse)
    logger.info(f'Number of Wrong Predictions: {num_wrong_predictions} / {len(y_predict)}')
    logger.info(f'R2: {r2:.4f}')
    logger.info(f'Mean Squared Error: {mse:.4f}')
    logger.info(f'Root Mean Squared Error: {rmse:.4f}')
    ### Save Model ###
    pickler.save(model, 'cancer_survival_estimator')
    Logger.release_instance()
    Pickler.release_instance()
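
### Usage Sketch ###
# A minimal sketch of how the estimator persisted above could be reloaded for inference.
# The function name `predict_cancer_survival` and the `new_cases` argument are illustrative
# assumptions; `new_cases` is expected to be a DataFrame that has already received the same
# preprocessing as in cancer_survival() (renamed columns, Clinical_C_Squared, Clinical_D_Log,
# Age_Range) with the Patient_ID, Time_to_Event and status columns dropped.
def predict_cancer_survival(new_cases):
    pickler = Pickler.get_instance()
    # Reload the RandomForestClassifier saved by cancer_survival()
    model = pickler.load('cancer_survival_estimator')
    predictions = model.predict(new_cases)
    Pickler.release_instance()
    return predictions
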
def scrape():
    ### Parse Arguments ###
    parser = ArgumentParser(description='Kaggle Competition Image Scraper by @aekasitt')
    parser.add_argument('search_term', type=str,
                        help='The search term to query using Google Images')
    parser.add_argument('target_folder', type=str,
                        help='The target folder to save the queried images to')
    parser.add_argument('--amount', '-n', type=int, default=200,
                        help='The number of images to fetch')
    args = parser.parse_args()
    ### Initiate Logger Instance ###
    logger = Logger.get_instance('scraper')
    logger.info(args)
    target_folder = path.join(args.target_folder)
    name_idx = 0
    if not path.exists(target_folder):
        makedirs(target_folder)
    else:
        images_saved = Path(target_folder).glob('*.png')
        name_idx = len(list(images_saved)) + 1
        del images_saved
    wd = webdriver.Firefox(executable_path='./geckodriver')
    search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'.format(
        q=args.search_term)
    wd.get(search_url)
    image_count = 0
    img_urls = set()
    results_start = 0
    interval_between_interactions = 1  # second
    while image_count < args.amount:
        scrollToEnd(wd)
        thumbnail_results = wd.find_elements_by_css_selector('img.rg_i')
        number_results = len(thumbnail_results)
        logger.info(
            f'Found {number_results} search results. Extracting links from {results_start} to {number_results}')
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                sleep(interval_between_interactions)
            except Exception:
                continue
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    img_urls.add(actual_image.get_attribute('src'))
            image_count = len(img_urls)
            if len(img_urls) >= args.amount:
                logger.info(f'Found {image_count} image links, done!')
                break
            else:
                pass  # TODO
        results_start = len(thumbnail_results)
    wd.quit()
    for img_url in img_urls:
        img_name = f'{args.search_term.split(" ")[-1]}{name_idx}'
        print(f'Search-Term: {args.search_term}')
        print(f'Image-Name: {img_name}')
        persist_image(args.target_folder, img_url, img_name, logger)
        name_idx += 1
    Logger.release_instance()
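
### Helper Sketch ###
# scrollToEnd() and persist_image() are defined elsewhere in this repository. The sketch below
# shows one plausible shape for the scrolling helper, assuming it simply scrolls the results page
# to the bottom a few times so Google Images lazily loads more thumbnails. The function name and
# the `scrolls` / `pause` parameters are illustrative assumptions, not the project's actual helper.
def scroll_to_end_sketch(wd, scrolls=5, pause=1):
    for _ in range(scrolls):
        # Jump to the bottom of the page, then wait for new thumbnails to load
        wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        sleep(pause)
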
def drug_allergy():
    ### Initiate Logger Instance ###
    logger = Logger.get_instance('drug_allergy')
    ### Initiate Pickler Instance ###
    pickler = Pickler.get_instance()
    data = pickler.load('drugs')
    if data is None:
        data = read_csv('data/drugs.csv')
        pickler.save(data, 'drugs')
    logger.info(f'Test Data: {data.shape}')
    logger.info(f'Test Data Columns: {data.columns}')
    ### Column Descriptions ###
    # ELISpot_Control is the ELISpot test result for the POSITIVE CONTROL (i.e., we expect to see a strong response)
    # ELISpot_Result is the ELISpot test result for the SUSPECTED DRUG (i.e., the result that indicates whether the patient would be allergic to that drug)
    # Naranjo_Category is ORDINAL.
    # Exposure_Time is the amount of time between when the patient took the drug and the ELISpot test date
    # Suspicion_Score is the suspicion level of the drug (1 = suspected drug, 2 = similar to suspected drug, 3 = negative control). This is ORDINAL.
    # Allergic_Reaction_Group is the severity of the patient's allergic reaction. This is ORDINAL.
    # Drug_Group is CATEGORICAL.
    # Drug_Rechallenge_Result is the ground truth of this dataset that we want to predict.
    ### Check for null values and find the percentage of null values ###
    logger.info(f'Columns with Null Data:\n{data.isnull().any()}')
    logger.info(f'Percentage of Null Data:\n{data.isnull().sum() / data.shape[0]}')
    ### Drop Duplicates ###
    data.drop_duplicates(inplace=True)
    ### Impute Underlying Conditions with Population Mode ###
    data.Underlying_Condition_A.fillna(data.Underlying_Condition_A.mode().iloc[0], inplace=True)
    data.Underlying_Condition_D.fillna(data.Underlying_Condition_D.mode().iloc[0], inplace=True)
    data.Underlying_Condition_E.fillna(data.Underlying_Condition_E.mode().iloc[0], inplace=True)
    ### Create Dummies for Categorical Independent Variables ###
    for column in data.columns:
        matched = re.match(r'\w+_Group', column)
        if matched is not None:
            data = data.join(get_dummies(data[column], prefix=column, dummy_na=True))
            data.drop([column], axis=1, inplace=True)
    ### Naranjo Category and Naranjo Score ###
    dummy_naranjo = get_dummies(data['Naranjo_Category'], prefix='Naranjo_Category')
    naranjo = dummy_naranjo.mul(data.Naranjo_Score.fillna(0), axis=0)
    data = data.join(naranjo)
    data.drop(['Naranjo_Category', 'Naranjo_Score'], axis=1, inplace=True)
    ### Fill Exposure_Time null rows with 0 ###
    data.Exposure_Time.fillna(0, inplace=True)
    ### ELISpot_Control is the ELISpot test result for the POSITIVE CONTROL (i.e., we expect to see a strong response) ###
    data['ELISpot_Control_Log'] = log(data['ELISpot_Control'])
    data.drop(['ELISpot_Control'], axis=1, inplace=True)
    # Suspicion_Score is the suspicion level of the drug
    # 1 = suspected drug
    # 2 = similar to suspected drug
    # 3 = negative control
    # This is ORDINAL.
    suspicion = get_dummies(data['Suspicion_Score'], prefix='Suspicion_Score')
    suspicion.rename(columns={
        'Suspicion_Score_1': 'Suspicion_Level_Suspected',
        'Suspicion_Score_2': 'Suspicion_Level_Near_Suspected',
        'Suspicion_Score_3': 'Suspicion_Level_Negative_Control'
    }, inplace=True)
    ### To merge or not to merge Suspected and Near_Suspected ###
    suspicion['Suspicion_Level_Suspected'] = (suspicion['Suspicion_Level_Suspected']
                                              + suspicion['Suspicion_Level_Near_Suspected'])
    suspicion.drop(['Suspicion_Level_Near_Suspected'], axis=1, inplace=True)
    data = data.join(suspicion)
    data.drop(['Suspicion_Score'], axis=1, inplace=True)
    logger.info(f'Test Data: {data.shape}')
    logger.info(f'Test Data Columns: {data.columns}')
    ### Train Model ###
    fold = 5
    xgb_params = dict(
        booster=['gbtree'],
        colsample_bytree=[0.9],
        learning_rate=[0.01, 0.05],
        n_estimators=[100, 300],
        max_delta_step=[0],           # range: (0, infinity), default: 0
        max_depth=[6],                # range: (0, infinity), default: 6
        min_child_weight=[1, 5, 10],
        silent=[True],
        subsample=[0.7],              # range: (0, 1)
        reg_alpha=[0],                # L1 regularization term on weights
        reg_lambda=[1])               # L2 regularization term on weights
    gsv = GridSearchCV(estimator=XGBClassifier(), param_grid=xgb_params,
                       scoring='neg_mean_absolute_error', verbose=10, n_jobs=15, cv=fold)
    trainable_data = data[data.Drug_Rechallenge_Result.notna()]
    x_train = trainable_data.drop(['Patient_ID', 'Drug_Rechallenge_Result'], axis=1)
    y_train = trainable_data['Drug_Rechallenge_Result']
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train,
                                                        train_size=.8, stratify=y_train)
    gsv.fit(x_train, y_train)
    model = gsv.best_estimator_
    logger.info(model)
    y_predict = model.predict(x_test)
    num_wrong_predictions = (y_predict != y_test).sum()
    r2 = r2_score(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    rmse = sqrt(mse)
    logger.info(f'Number of Wrong Predictions: {num_wrong_predictions} / {len(y_predict)}')
    logger.info(f'R2: {r2:.4f}')
    logger.info(f'Mean Squared Error: {mse:.4f}')
    logger.info(f'Root Mean Squared Error: {rmse:.4f}')
    ### Predict for the full dataset and save the results ###
    y_predict = model.predict(data.drop(['Patient_ID', 'Drug_Rechallenge_Result'], axis=1))
    results = data[['Patient_ID', 'Drug_Rechallenge_Result']].copy()  # copy to avoid SettingWithCopyWarning
    results.loc[:, 'Predicted_Drug_Rechallenge_Result'] = y_predict
    results.to_csv('results.csv', index=False)
    Logger.release_instance()
    Pickler.release_instance()
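
### Encoding Sketch ###
# A minimal illustration of the Naranjo encoding used in drug_allergy() above: one-hot dummies of
# Naranjo_Category are multiplied row-wise by Naranjo_Score, so each row carries its score in the
# column of its category and 0 everywhere else. The function name and the toy category / score
# values below are illustrative assumptions, not values from the real dataset.
def naranjo_encoding_sketch():
    from pandas import DataFrame
    toy = DataFrame({
        'Naranjo_Category': ['Possible', 'Probable', 'Possible'],
        'Naranjo_Score': [2.0, 6.0, None]
    })
    dummies = get_dummies(toy['Naranjo_Category'], prefix='Naranjo_Category')
    # Row-wise multiply: the second row gets 6.0 in Naranjo_Category_Probable and 0 elsewhere;
    # a missing score is treated as 0, mirroring the fillna(0) used above
    weighted = dummies.mul(toy['Naranjo_Score'].fillna(0), axis=0)
    return toy.join(weighted)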