def baseline(setting='color'):
    """Train and evaluate one linear SVM per label column on a baseline feature set.

    Parameters
    ----------
    setting : str
        Feature extractor to use: ``'color'`` (``color_histogram``),
        ``'pixel'`` (``pixel_extractor``) or ``'bb'`` (``bb_extractor``).

    Raises
    ------
    ValueError
        If ``setting`` is not one of the three supported names.
    """
    # Fail fast. Previously an unknown setting fell through ``else: pass`` and
    # only crashed later, at the train/test split, because ``X`` was unbound.
    if setting not in ('color', 'pixel', 'bb'):
        raise ValueError(
            "unknown setting %r; expected 'color', 'pixel' or 'bb'" % (setting,))

    # get features and labels
    img_names = get_filename_list('../data/groupdataset_release/file_names.txt')
    print("Extracting features...")
    if setting == 'color':
        X = color_histogram('../data/groupdataset_release/images', img_names)
    elif setting == 'pixel':
        X = pixel_extractor('../data/groupdataset_release/resize_images', img_names)
    else:  # 'bb' -- guaranteed by the validation above
        X = bb_extractor('../data/groupdataset_release/annotations/all', img_names)
    Y = get_label_matrix('../data/groupdataset_release/image_annotations.csv')

    # split into train and test (20% of the rows are held out)
    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # newer installs need sklearn.model_selection -- confirm the pinned version.
    print("Splitting into train and test set...")
    X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
        X, Y, test_size=0.2)

    # initialize svm
    class_names = {'none': 1, 'low': 2, 'moderate': 3, 'high': 4}
    # NOTE(review): only three names are listed; sentiments[i] raises
    # IndexError if the label matrix has more than three columns -- confirm.
    sentiments = ['interaction', 'focus', 'happiness']

    # One independent classifier per label column.
    for i in range(Y_train.shape[1]):
        print("Fitting svm....")
        svm_model = svm.SVC(kernel="linear", decision_function_shape='ovr', max_iter=10000)
        svm_model.fit(X_train, Y_train[:, i])

        print("Predicting...")
        y_predict_train = svm_model.predict(X_train)
        y_predict = svm_model.predict(X_test)
        analysis.run_analyses(y_predict_train, Y_train[:, i], y_predict, Y_test[:, i],
                              class_names, sentiments[i])
from load_dfs import DfLoader
from analysis import run_analyses

# load the dataframes
DfLoad = DfLoader(snakemake.input.data_dir)
both_df = DfLoad.eng_both()

# (analysis name, value of the 'columns' key) for every spec built below.
ARGUMENT_FLAGS = (
    ('has_objc', 'has_objc'),
    ('has_loca', ['has_loca']),
    ('has_time', ['has_time']),
)

# Every spec uses both_df, is indexed on eng_TAMsimp and requests no examples.
analysis_specs = [
    {
        'name': spec_name,
        'df': both_df,
        'index': 'eng_TAMsimp',
        'columns': spec_columns,
        'examples': [],
    }
    for spec_name, spec_columns in ARGUMENT_FLAGS
]

run_analyses(analysis_specs, snakemake.output.dir)
# NB snakemake runs script from /workflow directory
sys.path.append('scripts/analysis')
from load_dfs import DfLoader
from analysis import run_analyses

# load the dataframes
DfLoad = DfLoader(snakemake.input.data_dir)
both_df = DfLoad.eng_both()


def _tam_spec(name, columns, **extra):
    """Return an analysis spec on both_df indexed by eng_TAMsimp.

    Optional keys (e.g. ``examples``) are appended after the four fixed keys.
    """
    spec = {'name': name, 'df': both_df, 'index': 'eng_TAMsimp', 'columns': columns}
    spec.update(extra)
    return spec


run_analyses(
    [
        _tam_spec('verb_stem', 'stem'),
        _tam_spec('verb_person', 'person', examples=[]),
        _tam_spec('is_stative', ['esv_is', 'niv_is']),
    ],
    snakemake.output.dir,
)
def main():
    """Fit, save and evaluate one linear SVM per sentiment label column.

    Builds the feature matrix with
    ``construct_full_feature_matrix(only_poselet=True)``, reads the labels from
    ``image_annotations.csv``, holds out 20% of the rows, and for every label
    column fits an ``svm.SVC``, dumps it with ``joblib`` under
    ``./final_svm_models`` and passes the train/test predictions to
    ``analysis.run_analyses``.
    """
    # Local import keeps the block self-contained; it is only needed for the
    # output-directory check below.
    import os

    # The disabled (commented-out) experiments that used to precede this code --
    # torso/face extraction, GENKI face cropping, EmotionSVM training and the
    # caching of face/poselet features to .npy files -- were dead code and have
    # been dropped from the function body.

    print("Extracting features...")
    X = construct_full_feature_matrix(only_poselet=True)
    Y = get_label_matrix('../data/groupdataset_release/image_annotations.csv')

    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # newer installs need sklearn.model_selection -- confirm the pinned version.
    print("Splitting into train and test set...")
    X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
        X, Y, test_size=0.2)

    # NOTE(review): two classes are declared here, while a disabled variant
    # used four levels (none/low/moderate/high) and a disabled binarisation
    # step mapped labels to 0/1 -- confirm these values match the CSV labels.
    class_names = {'no': 1, 'yes': 2}
    sentiments = ['interaction', 'focus', 'happiness', 'activity']

    # joblib.dump does not create missing directories; make sure the target
    # exists before spending time on fitting.
    model_dir = './final_svm_models'
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    # One independent classifier per label column.
    for i in range(Y_train.shape[1]):
        print("Fitting svm....")
        svm_model = svm.SVC(C=0.1, kernel="linear", decision_function_shape='ovr',
                            verbose=True)
        svm_model.fit(X_train, Y_train[:, i])
        joblib.dump(svm_model, './final_svm_models/svm_%s_model.pkl' % sentiments[i])

        print("Predicting...")
        y_predict_train = svm_model.predict(X_train)
        y_predict = svm_model.predict(X_test)
        analysis.run_analyses(y_predict_train, Y_train[:, i], y_predict, Y_test[:, i],
                              class_names, sentiments[i])
def _tam_eq(tam):
    """Return the query fragment ``eng_TAMsimp == "<tam>"``."""
    return 'eng_TAMsimp == "{}"'.format(tam)


def _mother_example(tam_condition, verbtype):
    """Return an example spec: ``tam_condition`` AND a mother_verbtype test.

    The pieces are joined with an explicit " and ". The original adjacent
    string literals omitted that space for most entries, e.g.
    ``"PAST"and mother_verbtype``.
    """
    return {
        'query': '{} and mother_verbtype == "{}"'.format(tam_condition, verbtype),
        'bhs_text': ['mother_clause_atom', 'clause_atom'],
    }


def _lex_example(tam, lexeme):
    """Return an example spec selecting one tense/lexeme combination."""
    return {'query': '{} and lex == "{}"'.format(_tam_eq(tam), lexeme)}


# Condition shared by the two PAST PERF examples.
PAST_PERF_EITHER = 'eng_TAMsimp.isin(["PAST PERF", "PAST ~ PAST PERF"])'

mother_examples = [
    _mother_example(_tam_eq('PAST'), 'wayq'),
    _mother_example(_tam_eq('PAST'), 'qtl'),
    _mother_example(_tam_eq('PRES PERF'), 'qtl'),
    _mother_example(PAST_PERF_EITHER, 'qtl'),
    _mother_example(PAST_PERF_EITHER, 'wayq'),
    _mother_example(_tam_eq('PRES'), 'yqtl'),
    _mother_example(_tam_eq('PRES'), 'qtl'),
    _mother_example(_tam_eq('PRES'), 'ptcp'),
    _mother_example(_tam_eq('PRES'), 'wayq'),
    _mother_example(_tam_eq('PAST ~ PAST PROG'), 'qtl'),
    _mother_example(_tam_eq('PAST ~ PAST PROG'), 'ptcp'),
    _mother_example(_tam_eq('PAST ~ PAST PROG'), 'wqtl'),
    _mother_example(_tam_eq('FUT ~ PRES'), 'qtl'),
    _mother_example(_tam_eq('FUT ~ PAST'), 'wayq'),
    _mother_example(_tam_eq('PAST ~ PRES PART'), 'infa'),
    _mother_example(_tam_eq('PRES PART'), 'qtl'),
]

lex_examples = [
    _lex_example('PAST', 'אמר'),
    _lex_example('PAST', 'בוא'),
    _lex_example('PAST', 'עשׂה'),
    _lex_example('PAST', 'עלה'),
    _lex_example('PRES PERF', 'מלט'),
    _lex_example('PRES PERF', 'מאס'),
    _lex_example('PRES PERF', 'היה'),
    _lex_example('PRES PERF', 'נתן'),
    _lex_example('PAST ~ PAST PERF', 'לקח'),
    _lex_example('PAST ~ PAST PERF', 'נתן'),
    {
        'query': 'eng_TAMsimp == "PRES" and lex.isin(["ישׁב", "ירא", "ידע", "מלא"])',
        'spread': 35,
    },
    _lex_example('PRES', 'אמר'),
    _lex_example('PAST ~ PRES PART', 'חוה'),
    _lex_example('PAST ~ PRES PART', 'זעק'),
    _lex_example('PAST ~ PRES PART', 'שׁבר'),
    _lex_example('PAST ~ PRES PART', 'יעץ'),
    _lex_example('PAST ~ PAST PROG', 'הלך'),
    _lex_example('PAST ~ PAST PROG', 'ישׁב'),
    _lex_example('PAST ~ PAST PROG', 'אכל'),
    _lex_example('PAST ~ PAST PROG', 'היה'),
    _lex_example('PAST ~ TO INF', 'חוה'),
    _lex_example('PAST ~ TO INF', 'ישׁב'),
    _lex_example('PAST ~ TO INF', 'שׁתה'),
    _lex_example('PAST ~ TO INF', 'חטא'),
    _lex_example('PAST ~ TO INF', 'כלה'),
]

# Three analysis specs, all on both_df and indexed by eng_TAMsimp.
run_analyses(
    [
        {
            'name': 'args',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'cl_args',
            'examples': [],
        },
        {
            'name': 'mo_verbtype',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'mother_verbtype',
            'examples': mother_examples,
        },
        {
            'name': 'verb_lex',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'lex',
            'examples': lex_examples,
        },
    ],
    snakemake.output.dir,
)
# Tense labels for which one "either translation has this label" example is
# requested in the disagreement analysis.
DISAGREEMENT_TAMS = (
    "FUT",
    "MOD let",
    "MOD may",
    "PRES",
    "MOD would",
    "MOD shall",
    "PAST",
    "TO INF",
    "MOD must",
    "PRES PART",
    "MOD could",
    "MOD might",
    "MOD can",
    "IMPV do not",
    "IMPV",
)

disagreement_examples = [
    {'query': 'esv_TAM == "{0}" or niv_TAM == "{0}"'.format(tam)}
    for tam in DISAGREEMENT_TAMS
]

eng_tenses = {
    'name': 'eng_tenses',
    'df': eng_df,
    'index': 'eng_TAMsimp',
    'examples': [],
}
esv_tenses = {
    'name': 'esv_tenses',
    'df': esv_df,
    'index': 'esv_TAM',
    'examples': [{'query': 'esv_TAM == "MOD"'}],
}
# NOTE(review): this spec is indexed on niv_TAM but its example query filters
# on niv_TAMsimp -- confirm that the mix is intentional.
niv_tenses = {
    'name': 'niv_tenses',
    'df': niv_df,
    'index': 'niv_TAM',
    'examples': [{'query': 'niv_TAMsimp == "IMPV"'}],
}
eng_simp_disagree = {
    'name': 'eng_simp_disagree',
    'df': disag_df_simp,
    'index': 'eng_TAMsimp',
    'examples': disagreement_examples,
}

run_analyses(
    [eng_tenses, esv_tenses, niv_tenses, eng_simp_disagree],
    snakemake.output.dir,
)
def _person_spec(name, df, columns, examples):
    """Return an analysis spec indexed by the (eng_TAMsimp, person) pair."""
    return {
        'name': name,
        'df': df,
        'index': ['eng_TAMsimp', 'person'],
        'columns': columns,
        'examples': examples,
    }


# Shared prefix of both example queries (the trailing space is significant).
FUT_P3 = 'eng_TAMsimp.isin(["FUT", "FUT ~ MOD shall"]) and person == "p3" '

# Disabled specs that used to be listed (commented out) after these four:
# main_clause_type, clause_rela, cltype_simp, rela_cltypesimp, prec_part.
run_analyses(
    [
        _person_spec(
            'clause_type', both_df, 'clause_type',
            [{'query': FUT_P3 + 'and clause_type.isin(["xYqX"])'}],
        ),
        _person_spec(
            'cltype_maincl', both_df[both_df.clause_rela == 'Main'], 'clause_type', [],
        ),
        _person_spec(
            'args', both_df, 'cl_args',
            [{'query': FUT_P3 + 'and cl_args.isin(["_W_SV", "SV"]) '}],
        ),
        _person_spec(
            'args_maincl', both_df[both_df.clause_rela == 'Main'], 'cl_args', [],
        ),
    ],
    snakemake.output.dir,
)
def _flag_spec(flag, columns, tams):
    """Return a spec for one has_* flag with a ``flag == 1`` example per tense.

    ``flag`` is both the spec name and the column tested in each example query.
    """
    return {
        'name': flag,
        'df': both_df,
        'index': 'eng_TAMsimp',
        'columns': columns,
        'examples': [
            {'query': 'eng_TAMsimp == "{}" and {} == 1'.format(tam, flag)}
            for tam in tams
        ],
    }


run_analyses(
    [
        _flag_spec('has_objc', 'has_objc', ('PRES', 'PAST', 'FUT')),
        _flag_spec('has_loca', ['has_loca'], ('PRES', 'PAST PROG')),
        _flag_spec('has_time', ['has_time'], ('FUT',)),
    ],
    snakemake.output.dir,
)
def main():
    """Fit, save and evaluate one linear SVM per sentiment label column.

    Builds the feature matrix with
    ``construct_full_feature_matrix(only_poselet=True)``, reads the labels from
    ``image_annotations.csv``, holds out 20% of the rows, and for every label
    column fits an ``svm.SVC``, dumps it with ``joblib`` under
    ``./final_svm_models`` and passes the train/test predictions to
    ``analysis.run_analyses``.
    """
    # Local import keeps the block self-contained; it is only needed for the
    # output-directory check below.
    import os

    # The disabled (commented-out) experiments that used to precede this code --
    # torso/face extraction, GENKI face cropping, EmotionSVM training and the
    # caching of face/poselet features to .npy files -- were dead code and have
    # been dropped from the function body.

    print("Extracting features...")
    X = construct_full_feature_matrix(only_poselet=True)
    Y = get_label_matrix('../data/groupdataset_release/image_annotations.csv')

    # NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
    # newer installs need sklearn.model_selection -- confirm the pinned version.
    print("Splitting into train and test set...")
    X_train, X_test, Y_train, Y_test = sklearn.cross_validation.train_test_split(
        X, Y, test_size=0.2)

    # NOTE(review): two classes are declared here, while a disabled variant
    # used four levels (none/low/moderate/high) and a disabled binarisation
    # step mapped labels to 0/1 -- confirm these values match the CSV labels.
    class_names = {'no': 1, 'yes': 2}
    sentiments = ['interaction', 'focus', 'happiness', 'activity']

    # joblib.dump does not create missing directories; make sure the target
    # exists before spending time on fitting.
    model_dir = './final_svm_models'
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)

    # One independent classifier per label column.
    for i in range(Y_train.shape[1]):
        print("Fitting svm....")
        svm_model = svm.SVC(C=0.1, kernel="linear", decision_function_shape='ovr',
                            verbose=True)
        svm_model.fit(X_train, Y_train[:, i])
        joblib.dump(svm_model, './final_svm_models/svm_%s_model.pkl' % sentiments[i])

        print("Predicting...")
        y_predict_train = svm_model.predict(X_train)
        y_predict = svm_model.predict(X_test)
        analysis.run_analyses(y_predict_train, Y_train[:, i], y_predict, Y_test[:, i],
                              class_names, sentiments[i])
def _esv_spec(name, columns, **extra):
    """Return an analysis spec on esv_df indexed by esv_TAM.

    Optional keys (e.g. ``examples``) are appended after the four fixed keys.
    """
    spec = {'name': name, 'df': esv_df, 'index': 'esv_TAM', 'columns': columns}
    spec.update(extra)
    return spec


# Disabled specs that used to be listed (commented out) after these seven:
# prec_part_gendom, args_mother, has_objc, has_loca, has_time.
run_analyses(
    [
        _esv_spec('clause_type', 'clause_type'),
        _esv_spec('clause_rela', 'clause_rela'),
        _esv_spec('cltype_simp', 'cltype_simp'),
        _esv_spec('rela_cltypesimp', ['clause_rela', 'cltype_simp']),
        _esv_spec('args', 'cl_args', examples=[]),
        _esv_spec('rela_particle', ['clause_rela', 'prec_part']),
        _esv_spec('prec_part', 'prec_part', examples=[]),
    ],
    snakemake.output.dir,
)
# Example queries per flag; the trailing space inside each query string is
# part of the original text and is preserved.
objc_examples = [
    {'query': 'eng_TAMsimp == "FUT" and has_objc == 1 '},
]
loca_examples = [
    {'query': 'eng_TAMsimp == "FUT ~ MOD shall" and has_loca == 1 ', 'spread': 10},
]
time_examples = [
    {'query': 'eng_TAMsimp.isin(["FUT", "FUT ~ MOD shall"]) and has_time == 1 '},
    {'query': 'eng_TAMsimp == "MOD is to ~ MOD shall" and has_time == 1 '},
    {'query': 'eng_TAMsimp == "IMPV ~ MOD shall" and has_time == 1 '},
]

# (name, 'columns' value, examples) for each spec, in submission order.
flag_rows = (
    ('has_objc', 'has_objc', objc_examples),
    ('has_loca', ['has_loca'], loca_examples),
    ('has_time', ['has_time'], time_examples),
)

run_analyses(
    [
        {
            'name': row_name,
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': row_columns,
            'examples': row_examples,
        }
        for row_name, row_columns, row_examples in flag_rows
    ],
    snakemake.output.dir,
)
def _tam_spec(name, df, columns, **extra):
    """Return an analysis spec indexed by eng_TAMsimp; extras follow the fixed keys."""
    spec = {'name': name, 'df': df, 'index': 'eng_TAMsimp', 'columns': columns}
    spec.update(extra)
    return spec


def _mother_example(tam_condition, verbtype, mother_text, spread=None):
    """Return an example spec: ``tam_condition`` AND a mother_verbtype test.

    ``mother_text`` is the first entry of ``bhs_text``; ``spread`` is only
    included when given.
    """
    example = {
        'query': tam_condition + ' and mother_verbtype == "' + verbtype + '"',
        'bhs_text': [mother_text, 'clause_atom'],
    }
    if spread is not None:
        example['spread'] = spread
    return example


clause_type_examples = [
    {'query': 'eng_TAMsimp == "FUT ~ MOD shall" and clause_type == "WQtX"'},
    {'query': 'eng_TAMsimp == "MOD shall" and clause_type == "WQtX"'},
]

mother_examples = [
    _mother_example('eng_TAMsimp.isin(["FUT", "FUT ~ MOD shall"])', 'ptcp',
                    'mother_clause_atom'),
    _mother_example('eng_TAMsimp == "PRES"', 'yqtl', 'mother_clause_atom'),
    _mother_example('eng_TAMsimp == "IMPV"', 'impv', 'mother_clause_atom'),
    _mother_example('eng_TAMsimp == "MOD must ~ MOD shall"', 'Ø', 'mother_intertext'),
    _mother_example('eng_TAMsimp == "PAST"', 'wayq', 'mother_intertext', spread=35),
    _mother_example('eng_TAMsimp == "PAST"', 'qtl', 'mother_intertext', spread=35),
]

run_analyses(
    [
        _tam_spec('clause_type', both_df, 'clause_type', examples=clause_type_examples),
        _tam_spec('main_clause_type', both_df[both_df.clause_rela == 'Main'], 'clause_type'),
        _tam_spec('clause_rela', both_df, 'clause_rela'),
        _tam_spec('cltype_simp', both_df, 'cltype_simp'),
        _tam_spec('rela_cltypesimp', both_df, ['clause_rela', 'cltype_simp']),
        _tam_spec('args', both_df, 'cl_args', examples=[]),
        _tam_spec('mo_verbtype', both_df, 'mother_verbtype', examples=mother_examples),
    ],
    snakemake.output.dir,
)
def _tam_queries(tams):
    """Return one ``eng_TAMsimp == "<tam>"`` example dict per tense label."""
    return [{'query': 'eng_TAMsimp == "{}"'.format(tam)} for tam in tams]


eng_examples = _tam_queries(
    ("PRES PART", "PAST PERF", "PRES PERF", "PRES PERF PROG", "TO INF")
)
# The last example matches several modal labels with one regular expression.
eng_examples.append(
    {'query': 'eng_TAMsimp.str.match("MOD may|MOD was to|MOD is to|MOD let|MOD must")'}
)

disagree_examples = _tam_queries(
    (
        "PRES ~ PRES PART",
        "PRES ~ PRES PROG",
        "PAST ~ PAST PROG",
        "PAST ~ PRES PART",
        "PAST ~ PRES",
        "FUT ~ PRES",
    )
)

tense_specs = [
    {'name': 'eng_tenses', 'df': eng_df, 'index': 'eng_TAMsimp', 'examples': eng_examples},
    {'name': 'esv_tenses', 'df': esv_df, 'index': 'esv_TAM', 'examples': []},
    {'name': 'niv_tenses', 'df': niv_df, 'index': 'niv_TAMsimp'},
    {
        'name': 'eng_simp_disagree',
        'df': disag_df_simp,
        'index': 'eng_TAMsimp',
        'examples': disagree_examples,
    },
]

run_analyses(tense_specs, snakemake.output.dir)
def _tam_spec(name, columns, **extra):
    """Return an analysis spec on both_df indexed by eng_TAMsimp.

    Optional keys (e.g. ``examples``) are appended after the four fixed keys.
    """
    spec = {'name': name, 'df': both_df, 'index': 'eng_TAMsimp', 'columns': columns}
    spec.update(extra)
    return spec


# The first three specs carry an (empty) examples list; the period specs do not.
run_analyses(
    [
        _tam_spec('genre', 'genre', examples=[]),
        _tam_spec('domain', 'domain2', examples=[]),
        _tam_spec('gendom', ['genre', 'domain2'], examples=[]),
        _tam_spec('period', 'period'),
        _tam_spec('period_gendom', ['period', 'genre', 'domain2']),
    ],
    snakemake.output.dir,
)
def _eng_spec(name, columns, **extra):
    """Return an analysis spec on eng_df indexed by eng_TAM; extras follow the fixed keys."""
    spec = {'name': name, 'df': eng_df, 'index': 'eng_TAM', 'columns': columns}
    spec.update(extra)
    return spec


# Shared prefix of every example query (the trailing space is significant).
PRES_IND = 'eng_TAM == "PRES..IND" '

stem_examples = [
    {'query': PRES_IND + 'and stem == "nif" '},
    {'query': PRES_IND + 'and stem == "qal" '},
]
person_examples = [
    {'query': PRES_IND + 'and person == "p1" ', 'spread': 10},
]

run_analyses(
    [
        _eng_spec('verb_stem', 'stem', examples=stem_examples),
        _eng_spec('verb_lexst_ps', ['lex', 'stem', 'person'], fishers=False),
        _eng_spec('verb_person', 'person', examples=person_examples),
        _eng_spec('is_stative', ['esv_is', 'niv_is']),
    ],
    snakemake.output.dir,
)
def _args_example(tam, cl_args):
    """Return an example spec selecting one tense / clause-argument pattern."""
    return {'query': 'eng_TAMsimp == "{}" and cl_args == "{}"'.format(tam, cl_args)}


def _mother_example(tam, verbtype):
    """Return an example spec selecting one tense / mother_verbtype pair.

    The query is assembled with an explicit " and ". The original first entry
    (``"PRES"and mother_verbtype``) was missing that space, unlike its siblings.
    """
    return {
        'query': 'eng_TAMsimp == "{}" and mother_verbtype == "{}"'.format(tam, verbtype),
        'bhs_text': ['mother_clause_atom', 'clause_atom'],
    }


args_examples = [
    _args_example('PRES', 'V'),
    _args_example('PRES', 'RV'),
    _args_example('PRES', 'QSV'),
    _args_example('PAST', 'RV'),
    _args_example('PAST', '_W_SV'),
    _args_example('PAST', 'V'),
    _args_example('PRES PART', 'V'),
    _args_example('PRES PROG', 'RSV'),
    _args_example('PRES PROG', 'ISV'),
    _args_example('PAST PROG', '_W_SV'),
    _args_example('PAST PROG', 'SV'),
    _args_example('PAST PROG', 'RV'),
    _args_example('FUT', 'ISV'),
    _args_example('FUT', 'CV'),
    _args_example('FUT', 'ASV'),
]

mother_examples = [
    _mother_example('PRES', 'yqtl'),
    _mother_example('PAST', 'wayq'),
    _mother_example('PRES PART', 'wayq'),
    _mother_example('PRES PROG', 'infc'),
    _mother_example('PRES PROG', 'yqtl'),
    _mother_example('PRES PROG', 'wqtl'),
    _mother_example('PRES PROG', 'impv'),
    _mother_example('PAST PROG', 'wayq'),
    _mother_example('FUT', 'qtl'),
]

# Three analysis specs, all on both_df and indexed by eng_TAMsimp.
run_analyses(
    [
        {
            'name': 'args',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'cl_args',
            'examples': args_examples,
        },
        {
            'name': 'mo_verbtype',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'mother_verbtype',
            'examples': mother_examples,
        },
        {
            'name': 'verb_lex',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'lex',
            'examples': [],
        },
    ],
    snakemake.output.dir,
)
# Disagreement labels for which an example with spread 10 is requested.
SPREAD_10_TAMS = (
    "PAST ~ PRES PART",
    "PAST ~ PAST PERF",
    "PAST ~ PRES PERF",
    "PAST ~ PRES",
    "PAST ~ TO INF",
    "PRES ~ PRES PERF",
    "PAST ~ PAST PROG",
    "FUT ~ PRES",
    "MOD could ~ PAST",
    "FUT ~ PAST",
)

disagree_examples = [
    {'query': 'eng_TAMsimp == "{}"'.format(tam), 'spread': 10}
    for tam in SPREAD_10_TAMS
]
# Final example: regex match on any label containing FUT, with spread -1.
disagree_examples.append({'query': 'eng_TAMsimp.str.match(".*FUT")', 'spread': -1})

tense_specs = [
    {'name': 'eng_tenses', 'df': eng_df, 'index': 'eng_TAMsimp', 'examples': []},
    {'name': 'esv_tenses', 'df': esv_df, 'index': 'esv_TAM', 'examples': []},
    {
        'name': 'niv_tenses',
        'df': niv_df,
        'index': 'niv_TAMsimp',
        'examples': [{'query': 'niv_TAMsimp == "IMPV"'}],
    },
    {
        'name': 'eng_simp_disagree',
        'df': disag_df_simp,
        'index': 'eng_TAMsimp',
        'examples': disagree_examples,
    },
]

run_analyses(tense_specs, snakemake.output.dir)
def _etcbc_example(tam, lexeme, spread=None):
    """Return an example spec selecting one tense / lex_etcbc combination.

    The query keeps the trailing space of the original text; ``spread`` is only
    included when given.
    """
    example = {
        'query': 'eng_TAMsimp == "{}" and lex_etcbc == "{}" '.format(tam, lexeme),
    }
    if spread is not None:
        example['spread'] = spread
    return example


lex_examples = [
    _etcbc_example('FUT ~ MOD shall', 'HJH['),
    _etcbc_example('FUT', 'NTN[', spread=10),
    _etcbc_example('FUT ~ MOD shall', 'MWT[', spread=10),
    _etcbc_example('FUT ~ MOD shall', 'NPL[', spread=10),
    _etcbc_example('IMPV ~ MOD shall', '<FH[', spread=10),
    _etcbc_example('MOD may', '>KL[', spread=10),
    _etcbc_example('IMPV', 'JR>[', spread=10),
]

verb_lex_spec = {
    'name': 'verb_lex',
    'df': both_df,
    'index': 'eng_TAMsimp',
    'columns': ['lex', 'stem'],
    'examples': lex_examples,
    'special': [],
}

run_analyses([verb_lex_spec], snakemake.output.dir)
def _spec(name, df, index, **extra):
    """Return an analysis spec; optional keys follow name/df/index in call order."""
    spec = {'name': name, 'df': df, 'index': index}
    spec.update(extra)
    return spec


run_analyses(
    [
        _spec('eng_tenses', eng_df, 'eng_TAMsimp', examples=[]),
        _spec('esv_tenses', esv_df, 'esv_TAMsimp'),
        _spec('niv_tenses', niv_df, 'niv_TAMsimp'),
        _spec('trans_tam', eng_df, 'esv_TAMsimp',
              columns='niv_TAMsimp', fishers=False, examples=[]),
        _spec('disag_genre', disag_df_simp, 'eng_TAMsimp', columns='genre'),
        _spec('disag_domain', disag_df_simp, 'eng_TAMsimp', columns='domain2'),
        # NOTE(review): unlike the two specs above, this one is indexed on
        # eng_simp_agree rather than eng_TAMsimp -- confirm that is intended
        # for the disagreement-only dataframe.
        _spec('disag_gendom', disag_df_simp, 'eng_simp_agree',
              columns=['genre', 'domain2']),
    ],
    snakemake.output.dir,
)
def _tam_spec(name, columns, **extra):
    """Return an analysis spec on both_df indexed by eng_TAMsimp; extras follow the fixed keys."""
    spec = {'name': name, 'df': both_df, 'index': 'eng_TAMsimp', 'columns': columns}
    spec.update(extra)
    return spec


# Query texts are spelled out in full; trailing spaces are part of the
# original strings and are preserved.
genre_examples = [
    {'query': 'eng_TAMsimp == "PRES" and genre == "poetry" '},
]

FUT_EITHER = 'eng_TAMsimp.isin(["FUT ~ MOD shall", "FUT"]) '

gendom_examples = [
    {'query': FUT_EITHER + 'and genre == "prophetic" and domain2 == "Q"'},
    {'query': FUT_EITHER + 'and genre == "prose" and domain2 == "Q"'},
    {'query': 'eng_TAMsimp == "IMPV" and genre == "prose" and domain2 == "Q"',
     'spread': 10},
    {'query': 'eng_TAMsimp == "MOD let" and genre == "prose" and domain2 == "Q"',
     'spread': 10},
    {'query': 'eng_TAMsimp == "MOD may" and genre == "prose" and domain2 == "Q"',
     'spread': 10},
    {'query': 'eng_TAMsimp == "IMPV" and genre == "poetry" and domain2 == "Q"',
     'spread': 10},
    {'query': 'eng_TAMsimp == "MOD can" and genre == "poetry" and domain2 == "Q"',
     'spread': 10},
    {'query': 'eng_TAMsimp == "IMPV ~ MOD shall" and genre == "instruction" ',
     'spread': 10},
    {'query': 'eng_TAMsimp == "MOD must ~ MOD shall" and genre == "instruction" ',
     'spread': 10},
    {'query': 'eng_TAMsimp == "PAST" and domain2.isin(["D", "N"]) and genre == "prose" ',
     'spread': 35},
    {'query': 'eng_TAMsimp == "PAST" and genre == "poetry" ',
     'spread': 35},
]

run_analyses(
    [
        _tam_spec('genre', 'genre', examples=genre_examples),
        _tam_spec('domain', 'domain2'),
        _tam_spec('gendom', ['genre', 'domain2'], examples=gendom_examples),
        _tam_spec('ps_gendom', ['genre', 'domain2', 'person']),
        _tam_spec('period', 'period'),
        _tam_spec('period_gendom', ['period', 'genre', 'domain2']),
    ],
    snakemake.output.dir,
)
import sys

# NB snakemake runs script from /workflow directory
sys.path.append('scripts/analysis')
from load_dfs import DfLoader
from analysis import run_analyses

# load the dataframes
DfLoad = DfLoader(snakemake.input.data_dir)
both_df = DfLoad.eng_both()

# features needed for selections
# NOTE(review): neither list is referenced by the analysis below -- confirm
# whether they are still needed.
main_genre = ['prose', 'poetry', 'prophetic']
main_dom = ['Q', 'N']

# Single analysis spec: cl_args against eng_TAMsimp, no examples requested.
args_spec = {
    'name': 'args',
    'df': both_df,
    'index': 'eng_TAMsimp',
    'columns': 'cl_args',
    'examples': [],
}

run_analyses([args_spec], snakemake.output.dir)
import sys
import pandas as pd

# NB snakemake runs script from /workflow directory
sys.path.append('scripts/analysis')
from load_dfs import DfLoader
from analysis import run_analyses

# load the dataframes
DfLoad = DfLoader(snakemake.input.data_dir)
both_df = DfLoad.eng_both()

# Single spec: eng_TAMsimp against the (lex, stem) column pair, with an
# empty example list.
run_analyses(
    [
        {
            'name': 'verb_lex',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': ['lex', 'stem'],
            'examples': [],
        },
    ],
    snakemake.output.dir,
)
# Verb-level specs: eng_TAMsimp against stem, person, and the two
# stative flags (esv_is / niv_is) of both_df.
run_analyses(
    [
        {
            'name': 'verb_stem',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'stem',
        },
        # Disabled spec, kept for reference:
        # {
        #     'name': 'verb_lexst_ps',
        #     'df': both_df,
        #     'index': 'eng_TAMsimp',
        #     'columns': ['lex', 'stem', 'person'],
        #     'fishers': False,
        # },
        {
            'name': 'verb_person',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'person',
            'examples': [
                # first-person futures
                {
                    'query': ('eng_TAMsimp == "FUT" '
                              'and person == "p1" '),
                },
                {
                    'query': ('eng_TAMsimp == "FUT ~ MOD shall" '
                              'and person == "p1" '),
                },
            ],
        },
        {
            'name': 'is_stative',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': ['esv_is', 'niv_is'],
        },
    ],
    snakemake.output.dir,
)
# Specs over the simplified TAM tags of the two translations.  The
# dataframes (eng_df, esv_df, niv_df, agg_df, disag_df_simp) and the
# sum_top_values callable are defined earlier in the script.
run_analyses(
    [
        # --- plain tag inventories ---
        {
            'name': 'eng_tenses',
            'df': eng_df,
            'index': 'eng_TAMsimp',
            'examples': [
                {
                    'query': 'esv_TAMsimp == "MOD would"',
                    'spread': 35,
                },
                {
                    'query': 'esv_TAMsimp == "HAB used to"',
                    'spread': 35,
                },
            ],
        },
        {
            'name': 'esv_tenses',
            'df': esv_df,
            'index': 'esv_TAMsimp',
        },
        {
            'name': 'niv_tenses',
            'df': niv_df,
            'index': 'niv_TAMsimp',
        },
        {
            'name': 'both_tenses',
            'df': eng_df,
            'index': 'eng_TAMsimp',
            # post-processing hook: sum_top_values is applied to the
            # 'count' table (handled inside run_analyses)
            'special': [
                {
                    'df': 'count',
                    'do': sum_top_values,
                },
            ],
        },
        # --- agreement / disagreement between translations ---
        {
            'name': 'eng_simp_agree',
            'df': agg_df,
            'index': 'eng_simp_agree',
        },
        {
            'name': 'eng_simp_disagree',
            'df': disag_df_simp,
            'index': 'eng_TAMsimp',
        },
        {
            'name': 'trans_tam',
            'df': eng_df,
            'index': 'esv_TAMsimp',
            'columns': 'niv_TAMsimp',
            'fishers': False,
            'examples': [],
        },
        # --- genre / domain breakdowns ---
        {
            'name': 'both_genre',
            'df': eng_df,
            'index': 'eng_TAMsimp',
            'columns': 'genre',
        },
        {
            'name': 'both_domain',
            'df': eng_df,
            'index': 'eng_TAMsimp',
            'columns': 'domain2',
        },
        {
            'name': 'both_gendom',
            'df': eng_df,
            'index': 'eng_TAMsimp',
            'columns': ['genre', 'domain2'],
        },
        {
            'name': 'disag_genre',
            'df': disag_df_simp,
            'index': 'eng_TAMsimp',
            'columns': 'genre',
        },
        {
            'name': 'disag_domain',
            # only rows whose domain2 is N or Q
            'df': disag_df_simp[disag_df_simp.domain2.isin(['N', 'Q'])],
            'index': 'eng_TAMsimp',
            'columns': 'domain2',
        },
        {
            'name': 'disag_gendom',
            # only rows whose domain2 is N or Q
            'df': eng_df[eng_df.domain2.isin(['N', 'Q'])],
            'index': 'eng_simp_agree',
            'columns': ['genre', 'domain2'],
        },
        {
            'name': 'inter_gendom',
            'df': disag_df_simp,
            'index': 'eng_TAMsimp',
            'columns': ['genre', 'domain2'],
            'examples': [],
        },
    ],
    snakemake.output.dir,
)
# Clause-level specs: eng_TAMsimp against clause type, clause relation,
# simplified clause type, clause arguments and mother verb type.
run_analyses(
    [
        {
            'name': 'clause_type',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'clause_type',
            'examples': [],
        },
        {
            'name': 'clause_rela',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'clause_rela',
        },
        {
            'name': 'cltype_simp',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'cltype_simp',
        },
        {
            'name': 'args',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'cl_args',
            'examples': [],
        },
        {
            'name': 'mo_verbtype',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'mother_verbtype',
            'examples': [],
        },
    ],
    snakemake.output.dir,
)
# Single spec: eng_TAMsimp against french_tense.  Every example uses
# spread -1 and passes the same 'extra_text' mapping of display labels to
# the 'french' / 'french_verse' names.
run_analyses(
    [
        {
            'name': 'french_tense',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'french_tense',
            'examples': [
                {
                    'query': ('eng_TAMsimp == "PAST" '
                              'and french_tense == "imparfait"'),
                    'spread': -1,
                    'extra_text': {
                        'NBS': 'french',
                        'NBS (verse)': 'french_verse',
                    },
                },
                {
                    'query': ('eng_TAMsimp == "PAST" '
                              'and french_tense == "passé_comp"'),
                    'spread': -1,
                    'extra_text': {
                        'NBS': 'french',
                        'NBS (verse)': 'french_verse',
                    },
                },
                {
                    'query': ('eng_TAMsimp == "PAST" '
                              'and french_tense == "passé_simp"'),
                    'spread': -1,
                    'extra_text': {
                        'NBS': 'french',
                        'NBS (verse)': 'french_verse',
                    },
                },
                {
                    'query': ('eng_TAMsimp == "PAST ~ PRES" '
                              'and french_tense == "imparfait"'),
                    'spread': -1,
                    'extra_text': {
                        'NBS': 'french',
                        'NBS (verse)': 'french_verse',
                    },
                },
            ],
        },
    ],
    snakemake.output.dir,
)
# Specs over the full (non-simplified) TAM tags of the two translations.
# The dataframes (eng_df, esv_df, niv_df, both_df, disag_df) are defined
# earlier in the script; results go to the snakemake output directory.
#
# The regex patterns passed to Series.str.match are raw strings: "\." in a
# normal string literal is an invalid escape sequence (a SyntaxWarning on
# Python 3.12+), while the raw form yields the same pattern text.
run_analyses(
    [
        # --- plain tag inventories ---
        {
            'name': 'eng_tenses',
            'df': eng_df,
            'index': 'eng_TAM',
        },
        {
            'name': 'esv_tenses',
            'df': esv_df,
            'index': 'esv_TAM',
        },
        {
            'name': 'niv_tenses',
            'df': niv_df,
            'index': 'niv_TAM',
        },
        # --- agreement / disagreement between translations ---
        {
            'name': 'eng_agree',
            'df': both_df,
            'index': 'eng_agree',
        },
        {
            'name': 'eng_disagree',
            'df': disag_df,
            'index': 'eng_TAM',
        },
        {
            'name': 'trans_tam',
            'df': both_df,
            'index': 'esv_TAM',
            'columns': 'niv_TAM',
            'fishers': False,
        },
        # --- disagreements restricted by a tag pattern ---
        {
            'name': 'disag_past',
            'df': disag_df[disag_df.eng_TAM.str.match(r'.*PAST\.\.IND')],
            'index': 'eng_TAM',
        },
        {
            'name': 'disag_pres_perf',
            'df': disag_df[disag_df.eng_TAM.str.match(r'.*PRES\.PERF\.IND')],
            'index': 'eng_TAM',
            'examples': [
                {
                    'query': ('eng_TAM == "FUT..IND ~ PRES.PERF.IND"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND ~ PRES.PERF.IND"'),
                },
            ],
        },
        {
            'name': 'disag_pres',
            'df': disag_df[disag_df.eng_TAM.str.match(r'.*PRES\.\.IND')],
            'index': 'eng_TAM',
            'examples': [
                {
                    'query': ('eng_TAM == "FUT..IND ~ PRES..IND"'),
                },
            ],
        },
        # --- genre / domain breakdowns ---
        {
            'name': 'disag_domain',
            # only rows whose domain2 is N or Q
            'df': disag_df[disag_df.domain2.isin(['N', 'Q'])],
            'index': 'eng_TAM',
            'columns': 'domain2',
        },
        {
            'name': 'disag_gendom',
            # only rows whose domain2 is N or Q
            'df': both_df[both_df.domain2.isin(['N', 'Q'])],
            'index': 'eng_agree',
            'columns': ['genre', 'domain2'],
        },
        {
            'name': 'inter_gendom',
            'df': disag_df,
            'index': 'eng_TAM',
            'columns': ['genre', 'domain2'],
            'examples': [
                {
                    'query': ('eng_TAM == "PAST..IND ~ PRES.PERF.IND" '
                              'and genre == "prose" '
                              'and domain2 == "Q"'),
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PAST..IND ~ PAST.PERF.IND" '
                              'and genre == "prose" '
                              'and domain2 == "N"'),
                    'spread': 10,
                },
                # NOTE(review): identical to the first example of this
                # spec; kept as-is because the intended query (if it was
                # meant to differ) cannot be determined from here.
                {
                    'query': ('eng_TAM == "PAST..IND ~ PRES.PERF.IND" '
                              'and genre == "prose" '
                              'and domain2 == "Q"'),
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PAST..IND ~ PRES..IND" '
                              'and genre.isin(["poetry", "prophetic"]) '
                              'and domain2 == "Q"'),
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PAST..IND ~ PRES..IND" '
                              'and genre == "instruction" '
                              'and domain2 == "Q"'),
                    'spread': 2,
                },
            ],
        },
    ],
    snakemake.output.dir,
)
# Specs for the binary clause-constituent flags of both_df: has_objc
# (alone and combined with person), has_loca and has_time.
run_analyses(
    [
        {
            'name': 'has_objc',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'has_objc',
            'examples': [
                {
                    'query': ('eng_TAMsimp == "FUT ~ MOD shall" '
                              'and has_objc == 0 '),
                },
                {
                    'query': ('eng_TAMsimp == "PRES" '
                              'and has_objc == 0 '),
                },
            ],
        },
        {
            'name': 'has_objc_person',
            'df': both_df,
            # two-level index: tag plus grammatical person
            'index': ['eng_TAMsimp', 'person'],
            'columns': 'has_objc',
            'examples': [
                {
                    'query': ('eng_TAMsimp == "FUT" '
                              'and person == "p1" '
                              'and has_objc == 1'),
                },
            ],
        },
        {
            'name': 'has_loca',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': ['has_loca'],
            'examples': [],
        },
        {
            'name': 'has_time',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': ['has_time'],
            'examples': [
                {
                    'query': ('eng_TAMsimp == "PRES" '
                              'and has_time == 0 '),
                },
                {
                    'query': ('eng_TAMsimp == "MOD is to ~ MOD shall" '
                              'and has_time == 1'),
                    'spread': 10,
                },
            ],
        },
    ],
    snakemake.output.dir,
)
# Genre, domain and period specs for eng_TAMsimp.  The 'gendom' spec
# carries one example query per genre/domain/tag combination, all with a
# spread of 15.
run_analyses(
    [
        {
            'name': 'genre',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'genre',
            'examples': [],
        },
        {
            'name': 'domain',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'domain2',
        },
        {
            'name': 'gendom',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': ['genre', 'domain2'],
            'examples': [
                # --- PRES ---
                {
                    'query': ('genre == "poetry" '
                              'and eng_TAMsimp == "PRES"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prophetic" '
                              'and eng_TAMsimp == "PRES"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES"'),
                    'spread': 15,
                },
                # --- PAST ---
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "N" '
                              'and eng_TAMsimp == "PAST"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PAST"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prophetic" '
                              'and eng_TAMsimp == "PAST"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "poetry" '
                              'and eng_TAMsimp == "PAST"'),
                    'spread': 15,
                },
                # --- PRES PART ---
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "N" '
                              'and eng_TAMsimp == "PRES PART"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES PART"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "poetry" '
                              'and eng_TAMsimp == "PRES PART"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prophetic" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES PART"'),
                    'spread': 15,
                },
                # --- PRES PROG ---
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES PROG"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "instruction" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES PROG"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prophetic" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "PRES PROG"'),
                    'spread': 15,
                },
                # --- PAST PROG ---
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "N" '
                              'and eng_TAMsimp == "PAST PROG"'),
                    'spread': 15,
                },
                # --- FUT ---
                {
                    'query': ('genre == "prophetic" '
                              'and eng_TAMsimp == "FUT"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "prose" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "FUT"'),
                    'spread': 15,
                },
                {
                    'query': ('genre == "poetry" '
                              'and domain2 == "Q" '
                              'and eng_TAMsimp == "FUT"'),
                    'spread': 15,
                },
            ],
        },
        {
            'name': 'period',
            'df': both_df,
            'index': 'eng_TAMsimp',
            'columns': 'period',
        },
    ],
    snakemake.output.dir,
)
# Clause- and argument-level specs over the full English TAM tag
# (eng_TAM) of eng_df; results go to the snakemake output directory.
#
# Changes relative to the previous revision:
#   * the 'clause_rela' spec was listed twice verbatim; it now appears once
#   * four 'args_mother' queries glued '"PAST..IND"' directly onto 'and'
#     (no separating space between the concatenated fragments); the space
#     is now present, matching every other query in this list
run_analyses(
    [
        # --- clause type / relation ---
        {
            'name': 'clause_type',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'clause_type',
        },
        {
            'name': 'clause_rela',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'clause_rela',
        },
        {
            'name': 'cltype_simp',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'cltype_simp',
        },
        {
            'name': 'rela_cltypesimp',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': ['clause_rela', 'cltype_simp'],
        },
        # --- preceding particles ---
        {
            'name': 'rela_particle',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': ['clause_rela', 'prec_part'],
        },
        {
            'name': 'prec_part',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'prec_part',
            'examples': [
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and prec_part == "_KJ_" '),
                },
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and prec_part.str.match("^_>M_.*")'),
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and prec_part.str.match(".*_KH_")'),
                    'spread': 15,
                },
                {
                    'query': ('eng_TAM == "PAST.PERF.IND" '
                              'and prec_part == "_>CR_"'),
                    'spread': 25,
                },
                {
                    'query': ('eng_TAM == "PAST.PERF.IND" '
                              'and clause_rela == "SubArg" '
                              'and prec_part == "_KJ_" '),
                    'bhs_text': ['mother_clause_atom', 'clause_atom'],
                },
            ],
        },
        {
            'name': 'prec_part_gendom',
            'df': eng_df,
            'index': 'prec_part',
            'columns': ['genre', 'domain2', 'eng_TAM'],
        },
        # --- clause argument patterns ---
        {
            'name': 'args',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'cl_args',
            'examples': [
                {
                    'query': 'eng_TAM == "PAST..IND" and cl_args == "_W_SV"',
                    'spread': 20,
                },
                {
                    'query': 'eng_TAM == "PAST..IND" and cl_args == "_W_OV"',
                    'spread': 20,
                },
                {
                    'query': 'eng_TAM == "PAST..IND" and cl_args == "_W_AV"',
                    'spread': 20,
                },
                {
                    'query': 'eng_TAM == "PAST..IND" and cl_args == "SV"',
                },
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and cl_args.str.match("QV") '),
                },
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and cl_args.str.match("IV") '),
                },
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and cl_args.str.match("C[OS]?V") '
                              'and (~prec_part.str.match(".*_KJ_|.*_>M_")) '),
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PRES.PERF.IND" '
                              'and cl_args == "V"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and lex_etcbc != ">MR[" '
                              'and cl_args == "V"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and lex_etcbc != ">MR[" '
                              'and cl_args == "AV"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and cl_args == "CAV"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and cl_args == "RSV"'),
                },
                {
                    'query': ('eng_TAM == "PRES..IND" '
                              'and cl_args == "_W_SAV"'),
                },
            ],
        },
        {
            'name': 'args_mother',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': ['cl_args', 'mother_verbtype'],
            'examples': [
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args.str.match("_W_[OS]V") '
                              'and mother_verbtype == "wayq" '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                },
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args.str.match("_W_[OS]V") '
                              'and mother_verbtype == "wayq" '
                              'and mother_verb_lex == lex_etcbc '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                    'spread': 10,
                },
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args.str.match("_W_[OS]V") '
                              'and mother_verbtype == "wayq" '
                              'and mother_verbplain == "יהי" '
                              'and mother_verb_lex == "HJH[" '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                    'spread': 20,
                },
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args == "CV" '
                              'and mother_verbtype == "yqtl" '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                },
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args == "RV" '
                              'and mother_verbtype == "yqtl" '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                },
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and cl_args == "CV" '
                              'and mother_verbtype == "impv" '),
                    'bhs_text': ['mother_clause', 'clause_atom'],
                },
            ],
        },
        # --- binary constituent flags ---
        {
            'name': 'has_objc',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': 'has_objc',
        },
        {
            'name': 'has_loca',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': ['clause_rela', 'has_loca'],
            'examples': [
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and has_loca == 1 '
                              'and clause_rela == "Main"'),
                },
            ],
        },
        {
            'name': 'has_time',
            'df': eng_df,
            'index': 'eng_TAM',
            'columns': ['clause_rela', 'has_time'],
            'examples': [
                {
                    'query': ('eng_TAM == "PAST..IND" '
                              'and has_time == 1 '
                              'and clause_rela == "Main" '),
                },
            ],
        },
    ],
    snakemake.output.dir,
)
from load_dfs import DfLoader
from analysis import run_analyses

# load the dataframes
DfLoad = DfLoader(snakemake.input.data_dir)
eng_df = DfLoad.eng_agree()
# Other loaders, currently not needed by this script:
#esv_df = DfLoad.esv()
#niv_df = DfLoad.niv()
#both_df = DfLoad.eng_both()
#disag_df = DfLoad.eng_disagree()

# Single spec without 'index'/'columns': it only requests examples where
# either translation text matches ".*became".
# NOTE(review): the query compares eng_TAM with "PAST"; confirm that this
# column holds that tag form in the eng_agree dataframe.
run_analyses(
    [
        {
            'name': 'inchoatives',
            'df': eng_df,
            'examples': [
                {
                    'query': (
                        'eng_TAM == "PAST" '
                        'and (niv.str.match(".*became") | esv.str.match(".*became"))'
                    ),
                    'spread': -1,  # i.e. all
                },
            ],
        },
    ],
    snakemake.output.dir,
)