def get_ptsd_mh_model(valid_users=None, forced_retrain=False):
    """Return the PTSD mental-health classifier pipeline, using a pickle cache.

    If a previously trained model exists on disk and retraining was not
    requested, it is loaded and returned as-is. Otherwise a fresh pipeline is
    fitted on the labelled PTSD tweets, persisted to the cache path, and
    returned.

    :param valid_users: optional user filter forwarded to
        ``get_labelled_data_for_ptsd`` (``None`` means no filtering — TODO
        confirm against that helper).
    :param forced_retrain: when ``True``, ignore any cached model and retrain.
    :return: a fitted classifier pipeline.
    """
    model_path = config.ACTIVE_MENTIONS_PTSD_MH_MODEL_PATH
    # Fast path: reuse the cached model unless the caller forces a retrain.
    if os.path.isfile(model_path) and not forced_retrain:
        return util.load_picke_file(model_path)
    # Cache miss (or retrain forced): fit from scratch and persist the result.
    tweets, labels = get_labelled_data_for_ptsd(valid_users)
    model = get_classifier_pipeline()
    model.fit(tweets, labels)
    util.dump_picke_file(model, model_path)
    return model
def create_data_sets():
    """Load the CLPsych tweet files and build the four working data frames.

    Produces and pickles:
      * ctrl_depr / ctrl_depr_held_out  — control vs. depression (labels 0/1)
      * ctrl_ptsd / ctrl_ptsd_held_out  — control vs. PTSD (label 2 remapped to 1)

    Each frame holds the valid tweets plus nltk and cmu POS tags (as produced
    by ``load_tweets_to_df``). The 70/30 split uses a fixed ``random_state``
    so it is reproducible.

    :return: ctrl_depr, ctrl_depr_held_out, ctrl_ptsd, ctrl_ptsd_held_out
    """
    # --- depression vs. control ---
    depr_users = get_user_pairs_for('depression')
    ctrl_depr_full = load_tweets_to_df(valid_labels=[0, 1], valid_users=depr_users)
    # Sanity output: total rows, then rows labelled as depression.
    print(len(ctrl_depr_full))
    print(len(ctrl_depr_full.where(ctrl_depr_full['labels'] == 1).dropna()))
    ctrl_depr, ctrl_depr_held_out = train_test_split(
        ctrl_depr_full, test_size=0.3, random_state=1)
    util.dump_picke_file(ctrl_depr, 'paper_computed/ctrl_depr.p')
    util.dump_picke_file(ctrl_depr_held_out, 'paper_computed/ctrl_depr_held_out.p')
    ctrl_depr_full = None  # drop the full frame to free memory before loading PTSD data

    # --- PTSD vs. control ---
    ptsd_users = get_user_pairs_for('ptsd')
    ctrl_ptsd_full = load_tweets_to_df(valid_labels=[0, 2], valid_users=ptsd_users)
    # PTSD rows carry label 2; divide by 2 so both tasks use a 0/1 label scheme.
    ctrl_ptsd_full['labels'] = (ctrl_ptsd_full['labels'] / 2).astype(int)
    ctrl_ptsd, ctrl_ptsd_held_out = train_test_split(
        ctrl_ptsd_full, test_size=0.3, random_state=1)
    util.dump_picke_file(ctrl_ptsd, 'paper_computed/ctrl_ptsd.p')
    util.dump_picke_file(ctrl_ptsd_held_out, 'paper_computed/ctrl_ptsd_held_out.p')

    return ctrl_depr, ctrl_depr_held_out, ctrl_ptsd, ctrl_ptsd_held_out
        # Tail of a scoring function whose start is outside this view:
        # print each score series as "mean(std), " on one line, then finish
        # the line and return the accumulated scores.
        # NOTE(review): loop nesting reconstructed from context — confirm
        # against the full function definition.
        print(str(round(s.mean(), 3)) + '(' + str(round(s.std(), 3)) + '), ', end=' ')
    print()
    return scores


if __name__ == "__main__":
    # Evaluate cross-validation over the filtered control-vs-depression data
    # (train + held-out concatenated). The commented alternatives switch the
    # experiment to the unfiltered and/or PTSD data sets.
    df = pd.concat([
        util.load_picke_file(config.CTRL_DEPR_FILTERED_DF),
        util.load_picke_file(config.CTRL_DEPR_HELD_OUT_FILTERED_DF)
    ])
    # df = pd.concat([util.load_picke_file(config.CTRL_DEPR_DF), util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)])
    # df = pd.concat([util.load_picke_file(config.CTRL_PTSD_FILTERED_DF), util.load_picke_file(config.CTRL_PTSD_HELD_OUT_FILTERED_DF)])
    # df = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    labels = df['labels'].astype(int).values
    # Build every candidate pipeline around a calibrated linear SVM; the
    # selector/feature-column/prior arguments are project-specific — see
    # models.get_pipelines for their semantics.
    pipelines = models.get_pipelines(
        mlutils.selector_fn_noop,
        'split_filtered_tweets',
        CalibratedClassifierCV(svm.LinearSVC(), cv=3),
        slda_priors_model,
        vocab_path)
    # Resume from previously saved results so already-evaluated pipelines
    # are skipped; start with `cross_val_results = {}` for a clean run.
    # cross_val_results = {}
    cross_val_results = util.load_picke_file('crossval_dvc_filtered_results.p')
    for p in pipelines_to_eval:
        if p in cross_val_results:
            continue  # already evaluated in a previous run
        print(p + '\t', end=' ', flush=True)
        cross_val_results[p] = cross_val(pipelines[p], df, labels)
        # Persist after every pipeline so an interrupted run loses nothing.
        util.dump_picke_file(cross_val_results, 'crossval_dvc_filtered_results.p')
        # Tail of a per-user probability function whose start is outside this
        # view: for each row, collect the positive-class probability of every
        # tweet, then attach the collected arrays as a new column.
        # NOTE(review): loop nesting reconstructed from context — confirm
        # against the full function definition.
        print(index)
        probs.append(model.predict_proba(row['tweets'])[:, 1])
    df['mental_health_probs'] = probs
    return df


if __name__ == '__main__':
    # Depression variant of the same enrichment, kept for reference:
    # ctrl_depr = util.load_picke_file(config.CTRL_DEPR_DF)
    # valid_users = ctrl_depr.index.values
    # ctrl_depr = add_depr_and_mental_health_tweet_probs(ctrl_depr, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr, config.CTRL_DEPR_DF)
    # ctrl_depr = None
    # ctrl_depr_held_out = util.load_picke_file(config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out = add_depr_and_mental_health_tweet_probs(ctrl_depr_held_out, valid_users=valid_users)
    # util.dump_picke_file(ctrl_depr_held_out, config.CTRL_DEPR_HELD_OUT_DF)
    # ctrl_depr_held_out=None

    # Enrich both PTSD frames with tweet-level probabilities and overwrite
    # the pickles in place.
    ctrl_ptsd = util.load_picke_file(config.CTRL_PTSD_DF)
    valid_users = None  # ctrl_ptsd.index.values — user filtering disabled here
    ctrl_ptsd = add_ptsd_and_mental_health_tweet_probs(ctrl_ptsd, valid_users=valid_users)
    util.dump_picke_file(ctrl_ptsd, config.CTRL_PTSD_DF)
    ctrl_ptsd_held_out = util.load_picke_file(config.CTRL_PTSD_HELD_OUT_DF)
    ctrl_ptsd_held_out = add_ptsd_and_mental_health_tweet_probs(
        ctrl_ptsd_held_out, valid_users=valid_users)
    util.dump_picke_file(ctrl_ptsd_held_out, config.CTRL_PTSD_HELD_OUT_DF)