コード例 #1
0
def get_ptsd_mh_model(valid_users=None, forced_retrain=False):
    """
    Return the PTSD / mental-health classifier pipeline, training it if needed.

    A pipeline already pickled at ``config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH``
    is loaded and returned unless ``forced_retrain`` is true or no file exists
    at that path. Otherwise a new pipeline from ``get_classifier_pipeline()``
    is fitted on the output of ``get_labelled_data_for_ptsd(valid_users)``,
    pickled to the same path and returned.

    :param valid_users: forwarded unchanged to ``get_labelled_data_for_ptsd``;
        only consulted when the model is (re)trained
    :param forced_retrain: when true, skip the cached file and retrain
    :return: the loaded or freshly fitted pipeline
    """
    model_path = config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH

    # Train when explicitly asked to, or when there is no cached model yet.
    if forced_retrain or not os.path.isfile(model_path):
        tweets, labels = get_labelled_data_for_ptsd(valid_users)
        classifier = get_classifier_pipeline()
        classifier.fit(tweets, labels)
        # Persist the fitted pipeline so later calls can take the cached path.
        util.dump_picke_file(classifier, model_path)
        return classifier

    return util.load_picke_file(model_path)
コード例 #2
0
def create_data_sets():
    """
    Build, split and pickle the control-vs-depression and control-vs-PTSD
    data frames (described by the original notes as loaded from the CLPsych
    data directory, with valid tweets plus nltk and cmu pos tags -- this is
    determined by ``load_tweets_to_df``, which is not shown here).

    For each condition the users come from ``get_user_pairs_for``, the frame
    from ``load_tweets_to_df``, and the frame is split with
    ``train_test_split(test_size=0.3, random_state=1)``. The four resulting
    frames are pickled under ``paper_computed/``:
        * ctrl_depr
        * ctrl_depr_held_out
        * ctrl_ptsd
        * ctrl_ptsd_held_out
    PTSD data is loaded with labels 0 and 2; the label column is halved and
    cast to int before the split so it becomes 0/1. For the depression frame
    the total row count and the count of label-1 rows left after ``dropna()``
    are printed.
    :return: ctrl_depr, ctrl_depr_held_out, ctrl_ptsd, ctrl_ptsd_held_out
    """
    # --- control vs. depression -------------------------------------------
    depr_users = get_user_pairs_for('depression')
    depr_frame = load_tweets_to_df(valid_labels=[0, 1], valid_users=depr_users)
    print(len(depr_frame))
    is_positive = depr_frame['labels'] == 1
    print(len(depr_frame.where(is_positive).dropna()))
    ctrl_depr, ctrl_depr_held_out = train_test_split(
        depr_frame, test_size=0.3, random_state=1)
    util.dump_picke_file(ctrl_depr, 'paper_computed/ctrl_depr.p')
    util.dump_picke_file(ctrl_depr_held_out,
                         'paper_computed/ctrl_depr_held_out.p')
    # Drop the reference to the full frame before loading the next one.
    del depr_frame

    # --- control vs. PTSD -------------------------------------------------
    ptsd_users = get_user_pairs_for('ptsd')
    ptsd_frame = load_tweets_to_df(valid_labels=[0, 2], valid_users=ptsd_users)
    # Labels are requested as 0/2 here; halving turns the 2 into a 1.
    halved_labels = ptsd_frame['labels'] / 2
    ptsd_frame['labels'] = halved_labels.astype(int)
    ctrl_ptsd, ctrl_ptsd_held_out = train_test_split(
        ptsd_frame, test_size=0.3, random_state=1)
    util.dump_picke_file(ctrl_ptsd, 'paper_computed/ctrl_ptsd.p')
    util.dump_picke_file(ctrl_ptsd_held_out,
                         'paper_computed/ctrl_ptsd_held_out.p')

    return ctrl_depr, ctrl_depr_held_out, ctrl_ptsd, ctrl_ptsd_held_out
コード例 #3
0
        print(str(round(s.mean(), 3)) + '(' + str(round(s.std(), 3)) + '), ',
              end=' ')
    print()
    return scores


if __name__ == "__main__":
    # Data under evaluation: the filtered control-vs-depression frame and its
    # held-out counterpart, stacked into a single frame.
    frames = [
        util.load_picke_file(frame_path)
        for frame_path in (config.CTRL_DEPR_FILTERED_DF,
                           config.CTRL_DEPR_HELD_OUT_FILTERED_DF)
    ]
    df = pd.concat(frames)
    # Alternative data sets (swap in for the assignment above):
    # df = pd.concat([util.load_picke_file(config.CTRL_DEPR_DF), util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_FILTERED_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_FILTERED_DF)])
    # df = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    labels = df['labels'].astype(int).values

    base_classifier = CalibratedClassifierCV(svm.LinearSVC(), cv=3)
    pipelines = models.get_pipelines(mlutils.selector_fn_noop,
                                     'split_filtered_tweets',
                                     base_classifier,
                                     slda_priors_model,
                                     vocab_path)

    # Scores from earlier runs are reloaded so finished pipelines are skipped.
    # To start from an empty result set instead, use: cross_val_results = {}
    results_path = 'crossval_dvc_filtered_results.p'
    cross_val_results = util.load_picke_file(results_path)
    for p in pipelines_to_eval:
        if p not in cross_val_results:
            print(p + '\t', end=' ', flush=True)
            cross_val_results[p] = cross_val(pipelines[p], df, labels)
            # Save after every pipeline so an interrupted run keeps its
            # completed results.
            util.dump_picke_file(cross_val_results, results_path)
コード例 #4
0
        print(index)
        probs.append(model.predict_proba(row['tweets'])[:, 1])
    df['mental_health_probs'] = probs

    return df


if __name__ == '__main__':
    # Depression variant (currently disabled), kept for reference:
    # ctrl_depr = util.load_picke_file(config.CTRL_DEPR_DF)
    # valid_users = ctrl_depr.index.values

    # ctrl_depr = add_depr_and_mental_health_tweet_probs(ctrl_depr, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr, config.CTRL_DEPR_DF)
    # ctrl_depr = None

    # ctrl_depr_held_out = util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out = add_depr_and_mental_health_tweet_probs(ctrl_depr_held_out, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr_held_out, config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out=None

    # None is passed for both frames; the commented-out alternative was the
    # index of the PTSD training frame (ctrl_ptsd.index.values).
    valid_users = None

    # Annotate the PTSD training frame and then the held-out frame, writing
    # each result back to the path it was loaded from.
    for frame_path in (config.CTRL_PTSD_DF, config.CTRL_PTSD_HELD_OUT_DF):
        frame = util.load_picke_file(frame_path)
        frame = add_ptsd_and_mental_health_tweet_probs(
            frame, valid_users=valid_users)
        util.dump_picke_file(frame, frame_path)