Example #1
def scoring_only(directory,
                 iaa_dir,
                 schema_dir,
                 scoring_dir,
                 viz_dir,
                 tua_dir,
                 threshold_func,
                 reporting=False):
    # Run the post-IAA stages only: dependency evaluation, weighting,
    # point sorting, triage scoring, and the final split for visualization.
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir,
                                       input_dir=directory,
                                       weights=weights,
                                       tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw,
                                 weights,
                                 scoring_dir,
                                 threshold_func,
                                 reporting=reporting)
    print("SPLITTING")
    if viz_dir is None:
        # derive the default viz_dir from the final path component of directory
        x = directory.rfind("/") + 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir,
             pointsFile=points,
             viz_dir=viz_dir,
             reporting=reporting)
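
A hedged usage sketch of scoring_only follows; every path is an illustrative placeholder, and the string threshold name mirrors the 'raw_30' default used in the later examples.

# Hypothetical invocation of scoring_only; all paths are placeholders.
scoring_only('./datahunt_export',
             iaa_dir='s_iaa_datahunt_export',
             schema_dir='./schemas',
             scoring_dir='scoring_datahunt_export',
             viz_dir=None,             # None triggers the ../../visualization_<dirname> default
             tua_dir='./tuas',
             threshold_func='raw_30',  # string name, as in Examples #2 and #3
             reporting=True)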
Example #2
def score_post_iaa(scoring_dir,
                   input_dir,
                   metadata_dir,
                   push_aws=True,
                   s3_bucket=None,
                   s3_prefix='',
                   tua_dir=None,
                   threshold_func='raw_30',
                   reporting=False):
    """
    :param input_dir: the directory that holds all files from the tagworks datahunt export; used to match
    :param push_aws: True if outputs should be sent to the s3 AWS folder, False to just store locally
    :param s3_prefix: add something to the prefix of output files to keep everything tidy
    :param threshold_func: the threshold function used to determine inter-annotator agreement; for a
        comprehensive test of all the threshold functions, set this to 'all'; this will not work if an iaa_directory
        is specified
    :return: No explicit return.  Writes many csvs to the scoring directory created by scoreOnly.  Also pushes many
        files to AWS so that they can be visualized
    """
    weights = launch_Weighting(scoring_dir)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir,
                                       input_dir=input_dir,
                                       weights=weights,
                                       tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw,
                                 weights,
                                 scoring_dir,
                                 threshold_func,
                                 reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)
    print("----------------SPLITTING-----------------------------------")
    splitcsv(scoring_dir, pointsFile=points, reporting=reporting)
    #print("DONE, time elapsed", time()-start)
    ids = []
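
For orientation, a hedged sketch of calling score_post_iaa locally; the test-data paths are assumptions modeled on Example #4, and push_aws=False keeps the run offline.

# Hypothetical call; the paths are placeholders modeled on Example #4's test data.
score_post_iaa('../test_data/pa_dep_input/',
               input_dir='../test_data/datahunt_export/',
               metadata_dir='../test_data/metadata/',
               push_aws=False,           # skip the s3 upload for a local dry run
               tua_dir='../test_data/pa_tua_input/',
               threshold_func='raw_30',
               reporting=True)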
Example #3
def calculate_scores_master(directory,
                            texts_path,
                            config_path,
                            schema_dir=None,
                            iaa_dir=None,
                            scoring_dir=None,
                            repCSV=None,
                            just_s_iaa=False,
                            just_dep_iaa=False,
                            use_rep=False,
                            reporting=False,
                            single_task=False,
                            highlights_file=None,
                            schema_file=None,
                            answers_file=None,
                            push_aws=True,
                            tua_dir=None,
                            s3_bucket=None,
                            s3_prefix='',
                            viz_dir=None,
                            threshold_func='raw_30'):
    """
    :param directory: the directory that holds all files from the tagworks datahunt export
    :param schema_dir: directory to the file holding all the schemas that created the datahunt tasks
    :param iaa_dir: the directory to output the raw IAA data to; if no input default is s_iaa_<directory>
    :param scoring_dir: directory to output data from every other stage of the scoring algorithm to; if no
        input default is scoring_<directory>
    :param repCSV: the csv that holds the rep score data
    :param just_s_iaa: True if the calculations should stop after the initial specialist IAA computation, false otherwise
    :param just_dep_iaa: True if the calculations should stop after the initial specialist IAA computation and the
        dependency computation, false otherwise
    :param use_rep: True if the scores should be computed using user rep scores; false otherwise
    :param reporting: True if user would like extra csv outputs.  These csvs aren't necessary to score but may be useful
        to humans trying to understand and analyze the algorithms
    :param single_task: True if there's only one task to be analyzed, false otherwise
    :param: highlights_file: only used if single_task is true; necessary if single_task is true; the path to the
        highlights file that is output from tagworks
    :param: schema_file: only used if single_task is true; necessary if single_task is true; the path to the schema file
        that is output from tagworks
    :param anwers_file: only used if single_task is true; necessary if single_task is true; the path to the answers file
        that is output from tagworks
    **if in the future the data import is adjusted to depend on other file outputs from tagworks, new parameters would
        have to be added to accomodate the change in importing procedures
    :param push_aws: True if we want outputs sent to the s3 AWS folder, false to just store locally
    :param s3_prefix: add something to the prefix of output files to keep everything tidy
    :param: threshold_func: the threshold function being used to determine inter-annotator agreement; for a
        comprehensive test of all the threshold functions set this to 'all'; this will not work if an iaa_directory is
        specified
    :return: No explicit return.  Running will create two directories named by the inputs. the iaa_dir will house
        a csv output from the IAA algorithm.  The scoring_dir will house the csvs output from the dependency evaluation
        algorithm; the weighting algorithm; the point sorting algorithm; and the final cleaning algorithm that prepares
        data to be visualized
    """
    print("Running scoring algorithm with:", threshold_func)
    # all_funcs lists every available scoring function; target_funcs is the
    # subset actually exercised when threshold_func is 'all'
    all_funcs = [
        'raw_70', 'raw_50', 'raw_30', 'logis_0', 'logis+20', 'logis+40'
    ]
    target_funcs = ['raw_70', 'raw_50', 'raw_30']
    if threshold_func == 'all':
        for func in target_funcs:
            if iaa_dir is None:
                if directory.startswith('./'):
                    iaa_direc = 's_iaa_' + func + '_' + directory[2:]
                else:
                    iaa_direc = 's_iaa_' + func + '_' + directory
            if scoring_dir is None:
                if directory.startswith('./'):
                    scoring_direc = 'scoring_' + func + '_' + directory[2:]
                else:
                    scoring_direc = 'scoring_' + func + '_' + directory
            calculate_scores_master(directory,
                                    texts_path=texts_path,
                                    config_path=config_path,
                                    schema_dir=schema_dir,
                                    iaa_dir=iaa_direc,
                                    scoring_dir=scoring_direc,
                                    repCSV=repCSV,
                                    just_s_iaa=just_s_iaa,
                                    just_dep_iaa=just_dep_iaa,
                                    use_rep=use_rep,
                                    reporting=reporting,
                                    single_task=single_task,
                                    highlights_file=highlights_file,
                                    schema_file=schema_file,
                                    answers_file=answers_file,
                                    push_aws=push_aws,
                                    tua_dir=tua_dir,
                                    s3_bucket=s3_bucket,
                                    s3_prefix=s3_prefix,
                                    viz_dir=viz_dir,
                                    threshold_func=func)
        return

    print("IAA PROPER")
    #iaa_dir is now handled inside IAA.py
    #if iaa_dir is None:
    #    iaa_dir = 's_iaa_'+directory
    if reporting:
        rep_direc = directory + "_report"
        make_directory(rep_direc)
    start = time()
    if not single_task:
        iaa_dir = calc_agreement_directory(directory,
                                           schema_dir,
                                           config_path,
                                           texts_path=texts_path,
                                           repCSV=repCSV,
                                           outDirectory=iaa_dir,
                                           useRep=use_rep,
                                           threshold_func=threshold_func)
    else:
        iaa_dir = calc_scores(highlights_file,
                              repCSV=repCSV,
                              schemaFile=schema_file,
                              outDirectory=iaa_dir,
                              useRep=use_rep,
                              threshold_func=threshold_func)

    if reporting:
        make_iaa_human_readable(iaa_dir, rep_direc)
    if just_s_iaa:
        return
    end = time()
    print("IAA TIME ELAPSED", end - start)
    print('IAA directory:', iaa_dir)
    print("DEPENDENCY")
    eval_dependency(directory, iaa_dir, schema_dir, out_dir=scoring_dir)
    if just_dep_iaa:
        return

    print("WEIGHTING")
    weights = launch_Weighting(scoring_dir, reporting=reporting)
    print("SORTING POINTS")
    tuas, weights, tua_raw = pointSort(scoring_dir,
                                       input_dir=directory,
                                       weights=weights,
                                       tua_dir=tua_dir,
                                       reporting=reporting)
    points = eval_triage_scoring(tua_raw,
                                 weights,
                                 scoring_dir,
                                 threshold_func,
                                 reporting=reporting)
    if reporting:
        make_key(tuas, scoring_dir, prefix=threshold_func)
    print("----------------SPLITTING-----------------------------------")
    if viz_dir is None:
        # derive the default viz_dir from the final path component of directory
        x = directory.rfind("/") + 1
        viz_dir = '../../visualization_' + directory[x:]
    splitcsv(scoring_dir,
             pointsFile=points,
             viz_dir=viz_dir,
             reporting=reporting)
    #print("DONE, time elapsed", time()-start)
    ids = []
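
Because threshold_func='all' makes calculate_scores_master recurse once per entry in target_funcs, a single call fans out into one s_iaa_<func>_<directory> / scoring_<func>_<directory> pair per function. A hedged sketch of that top-level call, with placeholder paths:

# Hypothetical top-level run; every path below is a placeholder.
# With threshold_func='all', leave iaa_dir and scoring_dir as None so the
# per-function directory names (s_iaa_raw_70_..., scoring_raw_70_..., etc.)
# can be derived for each recursive call.
calculate_scores_master('./datahunt_export',
                        texts_path='./texts',
                        config_path='./config.json',
                        schema_dir='./schemas',
                        threshold_func='all',
                        push_aws=False)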
Example #4
from Weighting import launch_Weighting
from pointAssignment import pointSort
from Separator import indicesToStartEnd
import os
import pandas as pd
scoring_dir = '../test_data/pa_dep_input/'
tua_dir = '../test_data/pa_tua_input/'
reporting = True
input_dir = None

weights = launch_Weighting(scoring_dir)
print("SORTING POINTS")
print(scoring_dir, input_dir, '\n', weights.columns, '\n', tua_dir, reporting)
tuas, weights, tua_raw = pointSort(scoring_dir, input_dir=None, weights=weights, tua_dir=tua_dir, reporting=reporting)

arr = list(range(1023, 1106))  # the contiguous index run 1023..1105
o = indicesToStartEnd(arr)
print(o)

arr = []
o = indicesToStartEnd(arr)
print(o)
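
The two calls above suggest indicesToStartEnd collapses a sorted index list into inclusive start/end spans. A plausible pure-Python sketch of that contract, written as a hypothetical re-implementation (the real Separator.indicesToStartEnd may use a different output format):

def indices_to_start_end_sketch(indices):
    # Hypothetical re-implementation for illustration only: collapse each run
    # of consecutive integers into an inclusive (start, end) pair.
    spans = []
    for i in sorted(indices):
        if spans and i == spans[-1][1] + 1:
            spans[-1][1] = i      # extend the current run
        else:
            spans.append([i, i])  # start a new run
    return [tuple(s) for s in spans]

print(indices_to_start_end_sketch(list(range(1023, 1106))))  # [(1023, 1105)]
print(indices_to_start_end_sketch([]))                       # []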

def join_csvs_in_directory(in_directory, out_directory=None):
    # Collect every file under in_directory; os.path.join(root, file) also
    # handles nested subdirectories, which in_directory + '/' + file did not.
    in_files = []
    for root, dirs, files in os.walk(in_directory):
        for file in files:
            in_files.append(os.path.join(root, file))
    temp_dfs = [pd.read_csv(f) for f in in_files]
    # Completion of the truncated original: concatenate, optionally write out
    # (the filename 'joined.csv' is an assumption for illustration).
    joined = pd.concat(temp_dfs, ignore_index=True)
    if out_directory is not None:
        os.makedirs(out_directory, exist_ok=True)
        joined.to_csv(os.path.join(out_directory, 'joined.csv'), index=False)
    return joined
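
With the completion above, a minimal usage example; the input path reuses this example's test data, while the output directory is a placeholder.

# Join the per-task csvs into one frame; writing out is optional.
combined = join_csvs_in_directory('../test_data/pa_dep_input',
                                  out_directory='../test_data/joined_output')
print(combined.shape)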