def analyze_maj(fname, mode):
    """Generate the majority report for ``fname`` and compare it to the labels.

    The report is produced by ``gen_majority_report``, read back with pandas,
    turned into per-query integer labels, and passed to
    ``gen_metrics_comparison`` together with the report itself.

    Args:
        fname: Base name (without ``.csv``) of the input file located in the
            hard-coded majority input directory.
        mode: Object whose ``name`` attribute is embedded in the report file
            names; it is also forwarded unchanged to the report and metrics
            helpers.
    """
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\majority\\'
    reports_dir = input_dir + 'reports\\'
    reports_filename = fname + '_' + mode.name + '_report'

    # Build the report path exactly once so that the writer and the reader
    # address the same file through the same string.  The previous code
    # re-joined the pieces with an extra '\\' when reading, which produced a
    # doubled separator because reports_dir already ends with one.
    report_path = reports_dir + reports_filename + '.csv'

    gen_majority_report(input_dir + fname + '.csv', report_path, mode)

    df = pd.read_csv(report_path)
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    gen_metrics_comparison(
        folder=reports_dir,
        query_filenames=[reports_filename],
        actual_values=labels,
        cmp_filename=reports_filename + '_stats_report',
        mode=mode,
    )
def group_all():
    """Run the ECAI by-group experiment and write its query/metrics reports.

    Loads the group feature CSV, evaluates a decision-tree learner through
    ``test_models`` (split by query, ``GROUP_ALL`` method), writes the
    per-query report and then calls ``gen_all_metrics_comparison`` over the
    report files.

    NOTE(review): another ``group_all`` is defined further down in this file
    and rebinds the name, so this version looks unreachable by name once the
    module has been fully loaded -- confirm which one is intended.
    """
    # Earlier / alternative configurations that were left disabled:
    #   input_dir = 'C:\\research\\falseMedicalClaims\\ECAI\\model input\\Yael_sigal_Irit\\by_group'
    #   feature_file = "rel_only_group_features_by_stance_citation_range_1"
    #   feature_file = "group_features_by_stance_citation_range_1"
    #   df = pd.read_csv(input_dir + '\\group_features_by_stance.csv')
    #   df = pd.read_csv(input_dir + '\\group_features_by_stance_no_enum.csv')
    #   feature_file = "group_features_by_stance_citation_range_only_clinical1"
    #   feature_file = "group_features_by_stance_citation_range_only_rev1"
    #   feature_file = "group_features_by_stance_citation_range_1_no_stance"
    #   feature_file = "group_features_by_stance_citation_range_1_no_stance_no_rel"
    input_folder = 'C:\\research\\falseMedicalClaims\\ECAI\\model input\\'
    weighting = 'all_equal_weights'
    input_dir = input_folder + weighting + '\\by_group'
    feature_file = "group_features_by_stance"

    # --- load the data -----------------------------------------------------
    feature_path = input_dir + '\\' + feature_file + '.csv'
    df = pd.read_csv(feature_path)
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}
    # Disabled: mc = MajorityClassifier(input_dir + '\\majority.csv')

    # --- build the learners ------------------------------------------------
    # Only the decision tree is evaluated below; the SVC learner and the
    # network are still constructed, as in the original code.
    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0))
    svcLearner = SKLearner(svm.SVC(gamma='scale'))
    # Disabled layer layouts:
    #   layers = [Layer(input=5, output=10), Layer(input=10, output=10)]
    #   layers = [Layer(input=31, output=20), Layer(input=20, output=10)]
    layers = [Layer(input=39, output=20), Layer(input=20, output=10)]
    net = TwoLayersNet(layers)
    # Disabled neural-network learner:
    #   params = get_parms(net)
    #   nnlearner = NNLearner(dataHelper.Method.GROUP_ALL, net=net, params=params)
    #   learners = [nnlearner]
    learners = [tree_learner]

    # --- evaluate ----------------------------------------------------------
    predictions = test_models(
        learners,
        queries,
        dataHelper.Split.BY_QUERY,
        dataHelper.Method.GROUP_ALL,
    )

    # --- reports -----------------------------------------------------------
    reports_dir = input_dir + '\\reports\\'
    query_report_file_name = feature_file + '_query_report.csv'
    # Disabled: metrics_report_file_name = input_dir + '\\reports\\' + feature_file + '_metrics_report.csv'
    create_query_report_file(
        reports_dir + query_report_file_name,
        queries,
        learners,
        predictions,
        labels,
    )

    gen_all_metrics_comparison(
        folder=reports_dir,
        files=['google labels', 'majority', query_report_file_name],
        label_file=input_folder + weighting + '\\labels.csv',
    )
def w_h_report():
    """Produce metrics comparisons for an existing White & Hassan query report.

    Reads the feature CSV to obtain the per-query labels, then calls
    ``gen_metrics_comparison`` twice over the same report files: first with
    ``THREE_CLASSES_PESSIMISTIC`` and then with ``THREE_CLASSES_OPTIMISTIC``.

    NOTE(review): both calls receive the same ``cmp_filename``; if the helper
    writes a file under that name the second run would replace the first --
    confirm against ``gen_metrics_comparison``.
    """
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    feature_file = "h_index_stance_label_shrink_neg"
    # Disabled alternative: feature_file = "group_features_by_stance_shrink"

    reports_dir = input_dir + '\\reports\\'
    query_report_file_name = feature_file + '_query_report.csv'

    df = pd.read_csv(input_dir + '\\' + feature_file + '.csv')
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    files = ['majority_nol', query_report_file_name]
    comparison_name = feature_file + 'stats_report'

    # Same comparison under each three-class mapping, pessimistic first.
    for class_mode in (ValToClassMode.THREE_CLASSES_PESSIMISTIC,
                       ValToClassMode.THREE_CLASSES_OPTIMISTIC):
        gen_metrics_comparison(
            folder=reports_dir,
            query_filenames=files,
            actual_values=labels,
            cmp_filename=comparison_name,
            mode=class_mode,
        )
def w_h():
    """Run the White & Hassan multi-feature experiment and write its reports.

    Loads the feature CSV, builds several candidate learners (only the
    ``MultipleBinaryCls`` one is evaluated), runs ``test_models`` split by
    query with the ``THREE_CLASSES_PESSIMISTIC`` mode, then writes the
    per-query report and the metrics comparison using the ``W_H`` mode.
    """
    # Disabled alternative inputs:
    #   input_folder = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\'
    #   input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\no outlayers\\'
    #   feature_file = "group_features_by_stance_shrink_nol"
    #   input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\posterior\\'
    #   feature_file = "weighted_posterior_normed_ratio"
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    feature_file = "group_features_by_stance_nol"

    df = pd.read_csv(input_dir + '\\' + feature_file + '.csv')
    # Column names from the third column onwards are used as the feature list.
    features = list(df.head())[2:]
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    # Candidate learners.  All of them are instantiated, in this order, even
    # though only `binary_ensemble` is selected for the run below.
    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0), features)
    naive_bayes = SKLearner(GaussianNB())
    expected_val = ExpectedValLearner()
    binary_ensemble = MultipleBinaryCls(ValToClassMode.THREE_CLASSES_PESSIMISTIC)
    knn_learner = SKLearner(KNeighborsClassifier(n_neighbors=5))
    logistic = SKLearner(LogisticRegression(C=1e5))
    kmeans = KMeansClassifier(input_dir + '\\dist.csv')
    forest_learner = SKLearner(RandomForestClassifier(random_state=0))

    # Disabled alternative: learners = [kmeans, knn_learner]
    learners = [binary_ensemble]

    predictions = test_models(
        learners,
        queries,
        dataHelper.Split.BY_QUERY,
        dataHelper.Method.GROUP_ALL,
        ValToClassMode.THREE_CLASSES_PESSIMISTIC,
    )

    reports_dir = input_dir + '\\reports\\'
    query_report_file_name = feature_file + '_query_report.csv'
    create_query_report_file(
        reports_dir + query_report_file_name,
        queries,
        learners,
        predictions,
        labels,
        ValToClassMode.W_H,
    )

    gen_metrics_comparison(
        folder=reports_dir,
        query_filenames=['majority', query_report_file_name],
        actual_values=labels,
        cmp_filename=feature_file + 'stats_report',
        mode=ValToClassMode.W_H,
    )
def group_all():
    """Evaluate a decision tree on the by-stance group features and report it.

    Reads ``group_features_by_stance.csv`` from the hard-coded input
    directory, runs ``test_models`` (split by query, ``GROUP_ALL`` method) on
    a single ``DecisionTreeClassifier`` and writes a report that also
    receives a ``MajorityClassifier`` built from ``majority.csv``.

    NOTE(review): an earlier function in this file carries the same name;
    this definition is the one that remains bound after the module loads.
    """
    # Disabled alternative:
    #   input_dir = 'C:\\research\\falseMedicalClaims\\examples\\model input\\pubmed\\normed\\group7'
    input_dir = 'C:\\research\\falseMedicalClaims\\ECAI\\model input\\Yael\\by_group'

    # Disabled model candidates: DecisionTreeRegressor(random_state=0),
    # LinearRegression()
    models = [DecisionTreeClassifier(random_state=0)]

    # Disabled alternative inputs:
    #   df = pd.read_csv(input_dir + '\\group_features.csv')
    #   df = pd.read_csv(input_dir + '\\group_features_by_paper_type.csv')
    df = pd.read_csv(input_dir + '\\group_features_by_stance.csv')
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    majority = MajorityClassifier(input_dir + '\\majority.csv')
    predictions = test_models(
        models,
        queries,
        dataHelper.Split.BY_QUERY,
        dataHelper.Method.GROUP_ALL,
    )

    create_report_file(
        input_dir + '\\group_features_by_stance_report.csv',
        queries=queries,
        models=models,
        predictions=predictions,
        majority_classifier=majority,
        labels=labels,
    )
def ijcai():
    """Run the IJCAI by-group experiment on the ``dist`` feature file.

    Loads the feature CSV, builds several candidate learners (only the
    ``MultipleBinaryCls`` one, configured with the ``W_H`` mode, is
    evaluated), runs ``test_models`` split by query, then writes the
    per-query report and calls ``gen_all_metrics_comparison``.
    """
    # Disabled alternative input folders:
    #   input_folder = 'C:\\research\\falseMedicalClaims\\IJCAI\\model input\\ecai_new'
    #   input_folder = 'C:\\research\\falseMedicalClaims\\IJCAI\\model input\\GTIC'
    input_folder = 'C:\\research\\falseMedicalClaims\\IJCAI\\model input\\non'
    input_dir = input_folder + '\\by_group'

    # Other feature files that were tried (the first one used to be assigned
    # and then immediately overwritten by "dist"):
    #   feature_file = "dummy_added_group_features_by_stance3"
    #   feature_file = "dummy_added_group_features_by_stance_exp"
    #   feature_file = "dist_exp"
    #   feature_file = "dist_group_features_by_stance_paste_ecai2"
    feature_file = "dist"

    df = pd.read_csv(input_dir + '\\' + feature_file + '.csv')
    # Column names from the third column onwards are used as the feature list.
    features = list(df.head())[2:]
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    # Candidate learners.  All are instantiated, in this order, although only
    # `binary_ensemble` is selected below.
    tree_learner = SKLearner(DecisionTreeClassifier(random_state=0), features)
    naive_bayes = SKLearner(GaussianNB())
    expected_val = ExpectedValLearner()
    binary_ensemble = MultipleBinaryCls(ValToClassMode.W_H)
    knn_learner = SKLearner(KNeighborsClassifier(n_neighbors=5))
    kmeans = KMeansClassifier(input_dir + '\\dist.csv')
    # Disabled:
    #   decisionforestLearner = SKLearner(RandomForestClassifier(random_state=0))
    #   learners = [decisionforestLearner]
    learners = [binary_ensemble]

    predictions = test_models(
        learners,
        queries,
        dataHelper.Split.BY_QUERY,
        dataHelper.Method.GROUP_ALL,
    )

    reports_dir = input_dir + '\\reports\\'
    query_report_file_name = feature_file + '_query_report.csv'
    create_query_report_file(
        reports_dir + query_report_file_name,
        queries,
        learners,
        predictions,
        labels,
    )

    gen_all_metrics_comparison(
        folder=reports_dir,
        files=['majority', query_report_file_name],
        actual_values=labels,
        cmp_filename=feature_file + 'stats_report',
    )
def learn_by_doctors_annotations(val2class, feature_file, directory, resample, majority_filename, quick=False, filter_queries=None):
    """Train/evaluate learners on the doctors' annotations and write reports.

    Args:
        val2class: Value-to-class mode forwarded to the majority and
            multi-class learners, ``test_models`` and the report helpers.
        feature_file: Base name (without ``.csv``) of the feature file read
            from the hard-coded ``mult_features`` input directory; also the
            prefix of every generated report name.
        directory: Name of the sub-folder under the model-input folder that
            receives the reports.
        resample: Forwarded as ``resample=`` to the learners; when truthy the
            comparison file name gets a ``_resample`` suffix.
        majority_filename: Not used by the active code (only referenced in
            disabled lines).
        quick: When truthy, ``quick_learn(queries)`` is called on every
            learner before the evaluation.
        filter_queries: When truthy, the metrics comparison is run a second
            time with ``filter_queries=`` set.

    NOTE(review): the filtered comparison reuses the same ``cmp_filename`` as
    the unfiltered one -- confirm whether the helper distinguishes the two.
    """
    input_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\mult_features\\'
    output_dir = 'C:\\research\\falseMedicalClaims\\White and Hassan\\model input\\' + directory + '\\'
    # Disabled:
    #   feature_file = "group_features_by_stance_nol"
    #   majority_file = input_dir + majority_filename

    reports_dir = output_dir + '\\reports\\'
    query_report_file_name = feature_file + '_query_report.csv'
    query_report_path = reports_dir + query_report_file_name
    # Disabled majority-report generation and feature-file overrides:
    #   majority_report_file_name = 'majority_' + val2class.name + '.csv'
    #   majority_report_full_file_name = reports_dir + majority_report_file_name
    #   gen_majority_report(majority_file, majority_report_full_file_name, val2class)
    #   feature_file = "group_features_by_label_shrink_nol"
    #   feature_file = "group_features_by_stance_label_shrink_nol"

    df = pd.read_csv(input_dir + '\\' + feature_file + '.csv')
    queries = get_queries_from_df(df)
    labels = {q: int(queries[q].label) for q in queries}

    # Learner construction.  The SVC and multiple-binary learners are built
    # but not part of the evaluated list, as in the original code.
    knn_learner = SKLearner(
        KNeighborsClassifier(n_neighbors=5), features=None, resample=resample
    )
    # Disabled alternative weights: {1: 4.0, 3: 1.5, 5: 1.0}
    class_weights = {1: 100, 3: 25, 5: 1}
    svc_learner = SKLearner(svm.SVC(class_weight=class_weights))
    forest_learner = SKLearner(
        RandomForestClassifier(random_state=0), features=None, resample=resample
    )
    # Disabled: lr = SKLearner(LinearRegression(C=1e5))
    binary_ensemble = MultipleBinaryCls(ValToClassMode.FOUR_CLASSES, resample=resample)
    majority = MajorityClassifier(val2class)

    # Disabled alternatives:
    #   learners = [decisionforestLearner, neigh]
    #   learners = [mult, decisionforestLearner, MultipleCls(val2class, resample=resample)]
    learners = [
        majority,
        MultipleCls(val2class, resample=resample),
        forest_learner,
        knn_learner,
    ]

    if quick:
        for candidate in learners:
            candidate.quick_learn(queries)

    predictions = test_models(
        learners,
        queries,
        dataHelper.Split.BY_QUERY,
        dataHelper.Method.GROUP_ALL,
        val2class,
    )
    create_query_report_file(
        query_report_path,
        input_dir,
        feature_file,
        queries,
        learners,
        predictions,
        labels,
        val2class,
    )

    # Disabled: files = [majority_report_file_name, query_report_file_name]
    files = [query_report_file_name]
    cmp_filename = feature_file + ('_stats_report_resample' if resample else '_stats_report')

    gen_all_metrics_comparison(
        folder=reports_dir,
        files=files,
        actual_values=labels,
        cmp_filename=cmp_filename,
        val2class=val2class,
    )
    if filter_queries:
        gen_all_metrics_comparison(
            folder=reports_dir,
            files=files,
            actual_values=labels,
            cmp_filename=cmp_filename,
            val2class=val2class,
            filter_queries=filter_queries,
        )