Code Example #1
from numpy import log, sqrt, square
from pandas import cut, read_csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
# Logger and Pickler are project-local singleton helpers; a sketch of both follows this example


def cancer_survival():
  ### Initiate Logger Instance ###
  logger = Logger.get_instance('cancer_survival')
  ### Initiate Pickler Instance ###
  pickler = Pickler.get_instance()
  data = pickler.load('radiomics_v2')
  if data is None:
    data = read_csv('data/radiomics_v2.csv')
    pickler.save(data, 'radiomics_v2')
  ### Rename columns containing blank spaces to use underscores instead ###
  renamed_cols = {
      col: col.replace(' ', '_') for col in data.columns if not col.startswith('v')
  }
  logger.info(renamed_cols)
  data.rename(columns=renamed_cols, inplace=True)
  logger.info(f'Test Data: {data.shape}')
  ### Checks for null values and find the percentage of null values that we have ###
  logger.info(f'Columns with Null Data:\n{data.isnull().any()}')
  logger.info(f'Percentage of Null Data:\n{data.isnull().sum() / data.shape[0]}')

  ### Column Descriptions ###
  # Time to Event is the amount of time from the date of data collection until the patient's date of death or his/her last check-up
  # Patient Status is the patient's latest status (0 = alive, 1 = dead)
  # Patient Status at 3-Year is the patient's status at 3-year mark (0 = alive, 1 = dead, -1 = unknown)
  # v_n's are radiomics features extracted from ROI drawn by radiologists

  ### Drop Duplicates ###
  data.drop_duplicates(inplace=True)

  ### Clinical_C looks skewed; correct it by squaring the values ###
  data['Clinical_C_Squared'] = square(data.Clinical_C)
  data.drop(['Clinical_C'], axis=1, inplace=True)

  ### Clinical_D seems to have an outlying tail on the positive side ###
  ### Remove Outliers ###
  data = data[(data.Clinical_D < data.Clinical_D.quantile(.95)) & (data.Clinical_D > 0)]
  ### Still skewed, try a Logarithmic transform ###
  data['Clinical_D_Log'] = log(data.Clinical_D)
  ### Looks good, now drop the original Clinical_D column ###
  data.drop(['Clinical_D'], axis=1, inplace=True)
  
  ### Remove invalid rows with an Age of 0 ###
  data = data[(data['Age'] > 0)]
  ### Bin Age into equal-width ranges ###
  age_range = 3
  num_bins = int(data.Age.max() / age_range)
  data['Age_Range'] = cut(data['Age'], num_bins, labels=False)

  ### When Patient_Status_at_3_Year is unknown, impute with last known data ###
  data['Patient_Status_at_3_Year'] = data[['Patient_Status_at_3_Year', 'Patient_Status']].max(1)

  ### Train Model ###
  fold = 5
  forest_params = dict(
    criterion=['gini'],
    max_features=['auto', 'sqrt', 'log2'],  # note: 'auto' equals 'sqrt' here and was removed in scikit-learn 1.3
    min_samples_split=range(2, 11),
    n_estimators=range(10, 21)
  )
  gsv = GridSearchCV(estimator=RandomForestClassifier(), param_grid=forest_params,
    scoring='neg_mean_absolute_error', verbose=10, n_jobs=15, cv=fold)

  trainable_data = data[data.Patient_Status_at_3_Year.notna()]
  x_train = trainable_data.drop(['Patient_ID', 'Time_to_Event', 'Patient_Status', 'Patient_Status_at_3_Year'], axis=1)
  y_train = trainable_data['Patient_Status_at_3_Year']
  y_stratify = trainable_data['Gender'] * y_train
  x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, train_size=.8, stratify=y_stratify)
  gsv.fit(x_train, y_train)
  model = gsv.best_estimator_
  logger.info(model)

  y_predict = model.predict(x_test)
  num_wrong_predictions = (y_predict != y_test).sum()
  r2 = r2_score(y_test, y_predict)
  mse = mean_squared_error(y_test, y_predict)
  rmse = sqrt(mse)
  logger.info(f'Number of Wrong Predictions: {num_wrong_predictions} / {len(y_predict)}')
  logger.info(f'R2: {r2:.4f}')
  logger.info(f'Mean Squared Error: {mse:.4f}')
  logger.info(f'Root Mean Squared Error: {rmse:.4f}')

  ### Save Model ###
  pickler.save(model, 'cancer_survival_estimator')

  Logger.release_instance()
  Pickler.release_instance()
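
Every example in this section relies on two project-local singleton helpers, Logger and Pickler, whose definitions are not shown. The sketch below is only an assumption reconstructed from how they are called above (get_instance / release_instance, info, load / save); the real implementations, including the 'pickles/' directory, may differ.

import logging
import pickle
from os import makedirs, path

class Logger:
  ### Minimal singleton wrapper around the standard logging module (assumed implementation) ###
  _instance = None

  @classmethod
  def get_instance(cls, name='app'):
    if cls._instance is None:
      logging.basicConfig(level=logging.INFO)
      cls._instance = logging.getLogger(name)
    return cls._instance

  @classmethod
  def release_instance(cls):
    cls._instance = None

class Pickler:
  ### Minimal singleton cache that saves/loads objects under an assumed 'pickles/' directory ###
  _instance = None

  @classmethod
  def get_instance(cls):
    if cls._instance is None:
      cls._instance = cls()
    return cls._instance

  @classmethod
  def release_instance(cls):
    cls._instance = None

  def load(self, name):
    file_path = path.join('pickles', f'{name}.pkl')
    if not path.exists(file_path):
      return None
    with open(file_path, 'rb') as handle:
      return pickle.load(handle)

  def save(self, obj, name):
    makedirs('pickles', exist_ok=True)
    with open(path.join('pickles', f'{name}.pkl'), 'wb') as handle:
      pickle.dump(obj, handle)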
Code Example #2
from argparse import ArgumentParser
from os import makedirs, path
from pathlib import Path
from time import sleep

from selenium import webdriver
# Logger is the project-local helper sketched after Code Example #1;
# scrollToEnd and persist_image are project-local helpers sketched after this example


def scrape():
    ### Parse Arguments ###
    parser = ArgumentParser(
        description='Kaggle Competition Image Scraper by @aekasitt')
    parser.add_argument('search_term', type=str,
        help='The search term to query using Google Images')
    parser.add_argument('target_folder', type=str,
        help='The target folder where the queried images will be saved')
    parser.add_argument('--amount', '-n', type=int,
        help='The number of images to fetch', default=200)
    args = parser.parse_args()
    ### Initiate Logger Instance ###
    logger = Logger.get_instance('scraper')
    logger.info(args)
    target_folder = path.join(args.target_folder)
    name_idx = 0
    if not path.exists(target_folder):
        makedirs(target_folder)
    else:
        images_saved = Path(target_folder).glob('*.png')
        name_idx = len(list(images_saved)) + 1
        del images_saved
    ### Start Firefox; this uses the Selenium 3.x API (executable_path and the find_elements_by_* helpers were removed in Selenium 4) ###
    wd = webdriver.Firefox(executable_path='./geckodriver')
    search_url = 'https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img'.format(
        q=args.search_term)
    wd.get(search_url)
    image_count = 0
    img_urls = set()
    results_start = 0
    interval_between_interactions = 1  # second
    while image_count < args.amount:
        scrollToEnd(wd)
        thumbnail_results = wd.find_elements_by_css_selector('img.rg_i')
        number_results = len(thumbnail_results)
        logger.info(
            f'Found: {number_results} search results. Extracting links from {results_start} to {number_results}'
        )
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                sleep(interval_between_interactions)
            except Exception:
                continue
            actual_images = wd.find_elements_by_css_selector('img.n3VNCb')
            for actual_image in actual_images:
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    img_urls.add(src)
            image_count = len(img_urls)

            if len(img_urls) >= args.amount:
                logger.info(f'Found {image_count} image links, done!')
                break
            else:
                pass  # TODO
            results_start = len(thumbnail_results)
    wd.quit()
    for img_url in img_urls:
        img_name = f'{args.search_term.split(" ")[-1]}{name_idx}'
        logger.info(f'Search-Term: {args.search_term}')
        logger.info(f'Image-Name: {img_name}')
        persist_image(args.target_folder, img_url, img_name, logger)
        name_idx += 1
    Logger.release_instance()
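
Code Example #2 calls two helpers, scrollToEnd and persist_image, that are not defined in the listing. The following sketch is an assumption based only on how they are invoked above (scroll the results page, then download each collected URL as a PNG); the requests/PIL approach and the function bodies are illustrative, not the author's actual code.

from io import BytesIO
from os import path
from time import sleep

import requests
from PIL import Image

def scrollToEnd(wd, pause=2):
    ### Scroll to the bottom of the results page so more thumbnails get loaded ###
    wd.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    sleep(pause)

def persist_image(target_folder, img_url, img_name, logger):
    ### Download a single image URL and store it as '<img_name>.png' in the target folder ###
    try:
        response = requests.get(img_url, timeout=30)
        image = Image.open(BytesIO(response.content)).convert('RGB')
        file_path = path.join(target_folder, f'{img_name}.png')
        image.save(file_path, 'PNG')
        logger.info(f'Saved {img_url} to {file_path}')
    except Exception as error:
        logger.info(f'Could not save {img_url}: {error}')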
Code Example #3
import re

from numpy import log, sqrt
from pandas import get_dummies, read_csv
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from xgboost import XGBClassifier
# Logger and Pickler are the project-local singleton helpers sketched after Code Example #1


def drug_allergy():
    ### Initiate Logger Instance ###
    logger = Logger.get_instance('drug_allergy')
    ### Initiate Pickler Instance ###
    pickler = Pickler.get_instance()
    data = pickler.load('drugs')
    if data is None:
        data = read_csv('data/drugs.csv')
        pickler.save(data, 'drugs')
    logger.info(f'Test Data: {data.shape}')
    logger.info(f'Test Data Columns: {data.columns}')

    ### Column Descriptions ###
    # ELISpot_Control is the ELISpot test result for the POSITIVE CONTROL (i.e., we expect to see strong response)
    # ELISpot_Result is the ELISpot test result for the SUSPECTED DRUG (i.e., this is the result that indicates whether the patient would be allergic to that drug)
    # Naranjo_Category is ORDINAL.
    # Exposure_Time is the amount of time from when the patient took the drug until the ELISpot test date
    # Suspicion_Score is the suspicion level of the drug (1 = suspected drug, 2 = similar to suspected drug, 3 = negative control). This is ORDINAL.
    # Allergic_Reaction_Group is the severity of the patient's allergic reaction. This is ORDINAL.
    # Drug_Group is CATEGORICAL.
    # Drug_Rechallenge_Result is the ground truth of this dataset that we want to predict.

    ### Checks for null values and find the percentage of null values that we have ###
    logger.info(f'Columns with Null Data:\n{data.isnull().any()}')
    logger.info(
        f'Percentage of Null Data:\n{data.isnull().sum() / data.shape[0]}')

    ### Drop Duplicates ###
    data.drop_duplicates(inplace=True)

    ### Impute Underlying Conditions with Population Mode ###
    data.Underlying_Condition_A.fillna(
        data.Underlying_Condition_A.mode().iloc[0], inplace=True)
    data.Underlying_Condition_D.fillna(
        data.Underlying_Condition_D.mode().iloc[0], inplace=True)
    data.Underlying_Condition_E.fillna(
        data.Underlying_Condition_E.mode().iloc[0], inplace=True)

    ### Create Dummies for Categorical Independent Variables ###
    for column in data.columns:
        matched = re.match(r'\w+_Group', column)
        if matched is not None:
            data = data.join(
                get_dummies(data[column], prefix=column, dummy_na=True))
            data.drop([column], axis=1, inplace=True)

    ### Naranjo Category and Naranjo Score: expand the ordinal category into dummies weighted by the score ###
    dummy_naranjo = get_dummies(data['Naranjo_Category'],
                                prefix='Naranjo_Category')
    naranjo = dummy_naranjo.mul(data.Naranjo_Score.fillna(0), axis=0)
    data = data.join(naranjo)
    data.drop(['Naranjo_Category', 'Naranjo_Score'], axis=1, inplace=True)

    ### Fills Exposure Time null rows with 0 ###
    data.Exposure_Time.fillna(0, inplace=True)

    ### Log-transform ELISpot_Control (the POSITIVE CONTROL, where a strong response is expected) ###
    data['ELISpot_Control_Log'] = log(data['ELISpot_Control'])
    data.drop(['ELISpot_Control'], axis=1, inplace=True)

    # Suspicion_Score is the suspicion level of the drug
    # 1 = suspected drug
    # 2 = similar to suspected drug
    # 3 = negative control).
    # This is ORDINAL.
    suspicion = get_dummies(data['Suspicion_Score'], prefix='Suspicion_Score')
    suspicion.rename(columns={
        'Suspicion_Score_1': 'Suspicion_Level_Suspected',
        'Suspicion_Score_2': 'Suspicion_Level_Near_Suspected',
        'Suspicion_Score_3': 'Suspicion_Level_Negative_Control'
    }, inplace=True)

    ### Merge Suspected and Near_Suspected into a single Suspected indicator ###
    suspicion['Suspicion_Level_Suspected'] = (
        suspicion['Suspicion_Level_Suspected'] +
        suspicion['Suspicion_Level_Near_Suspected'])
    suspicion.drop(['Suspicion_Level_Near_Suspected'], axis=1, inplace=True)
    data = data.join(suspicion)
    data.drop(['Suspicion_Score'], axis=1, inplace=True)

    logger.info(f'Test Data: {data.shape}')
    logger.info(f'Test Data Columns: {data.columns}')
    fold = 5
    xgb_params = dict(
        booster=['gbtree'],
        colsample_bytree=[0.9],
        learning_rate=[0.01, 0.05],
        n_estimators=[100, 300],
        max_delta_step=[0],           # range: (0, infinity), default: 0
        max_depth=[6],                # range: (0, infinity), default: 6
        min_child_weight=[1, 5, 10],
        silent=[True],                # note: 'silent' was replaced by 'verbosity' in newer XGBoost releases
        subsample=[0.7],              # range: (0, 1)
        reg_alpha=[0],                # L1 regularization term on weights
        reg_lambda=[1]                # L2 regularization term on weights
    )
    gsv = GridSearchCV(estimator=XGBClassifier(), param_grid=xgb_params, \
      scoring='neg_mean_absolute_error', verbose=10, n_jobs=15, cv=fold)
    trainable_data = data[data.Drug_Rechallenge_Result.notna()]
    x_train = trainable_data.drop(['Patient_ID', 'Drug_Rechallenge_Result'],
                                  axis=1)
    y_train = trainable_data['Drug_Rechallenge_Result']
    x_train, x_test, y_train, y_test = train_test_split(x_train,
                                                        y_train,
                                                        train_size=.8,
                                                        stratify=y_train)
    gsv.fit(x_train, y_train)
    model = gsv.best_estimator_
    logger.info(model)

    y_predict = model.predict(x_test)
    num_wrong_predictions = (y_predict != y_test).sum()
    r2 = r2_score(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    rmse = sqrt(mse)
    logger.info(
        f'Number of Wrong Predictions: {num_wrong_predictions} / {len(y_predict)}'
    )
    logger.info(f'R2: {r2:.4f}')
    logger.info(f'Mean Squared Error: {mse:.4f}')
    logger.info(f'Root Mean Squared Error: {rmse:.4f}')

    y_predict = model.predict(
        data.drop(['Patient_ID', 'Drug_Rechallenge_Result'], axis=1))
    results = data[['Patient_ID', 'Drug_Rechallenge_Result']].copy()
    results['Predicted_Drug_Rechallenge_Result'] = y_predict
    results.to_csv('results.csv', index=False)

    Logger.release_instance()
    Pickler.release_instance()
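
Both modelling examples evaluate a binary classifier with regression metrics (R2, MSE, RMSE) on the class labels. If classification-oriented metrics are wanted as well, a small helper along these lines could be called with the y_test / y_predict variables already computed above; this is a suggested addition using standard scikit-learn metrics, not part of the original pipelines.

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

def report_classification_metrics(y_test, y_predict, logger):
    ### Complementary classification metrics for the binary targets used above ###
    logger.info(f'Accuracy: {accuracy_score(y_test, y_predict):.4f}')
    logger.info(f'F1 Score: {f1_score(y_test, y_predict):.4f}')
    logger.info(f'Confusion Matrix:\n{confusion_matrix(y_test, y_predict)}')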