Ejemplo n.º 1
0
def applyPreprocessing(sources,
                       num_samples,
                       out_dir,
                       num_processes,
                       clean_pandas=False,
                       clean_archive=False,
                       size=250,
                       single_list=True,
                       sort_columns=["MaxLepDeltaR"],
                       sort_ascending=False,
                       photon_max=DEFAULT_PHOTON_MAX,
                       neutral_max=DEFAULT_NEUTRAL_MAX,
                       charged_max=DEFAULT_CHARGED_MAX,
                       ):
    """Run the final preprocessing pass and archive the result under ``out_dir``.

    Builds one ``ObjectProfile`` per object type, derives a stride from the
    requested archive size, creates the processing objects for every
    (label, directory) pair and finally calls ``batchAssertArchived`` on them.

    Args:
        sources: Iterable of "/"-separated directory paths.  The last path
            component of each entry is used as its label.
        num_samples: Forwarded to ``procsFrom_label_dir_pairs`` together with
            a start value of 0 (presumably the number of samples to process;
            confirm against that helper).
        out_dir: Directory in which the ``temp_archive`` sub-directory is
            created.
        num_processes: Forwarded to ``batchAssertArchived``.
        clean_pandas: Accepted for interface compatibility; not used here.
        clean_archive: Accepted for interface compatibility; not used here.
        size: Forwarded to ``strideFromTargetSize`` as ``megabytes``.
        single_list: Forwarded to ``procsFrom_label_dir_pairs``.
        sort_columns: Sort columns forwarded to every sorted ``ObjectProfile``
            and to ``procsFrom_label_dir_pairs``.
        sort_ascending: Sort direction forwarded alongside ``sort_columns``.
        photon_max: Second positional argument of the "EFlowPhoton" profile.
        neutral_max: Second positional argument of the "EFlowNeutralHadron"
            profile.
        charged_max: Second positional argument of the "EFlowTrack" profile.

    Returns:
        None.

    Raises:
        OSError: If the ``temp_archive`` directory cannot be created.
    """
    # The default for sort_columns is a list shared between calls; work on a
    # private copy so an in-place change downstream cannot leak into later
    # invocations (or into the caller's own list).
    if isinstance(sort_columns, list):
        sort_columns = list(sort_columns)

    # Run Final processing
    # Strip trailing slashes first, otherwise "a/b/" would get an empty label.
    label_dir_pairs = [(s.rstrip("/").split("/")[-1], s) for s in sources]
    print(label_dir_pairs)

    def _sorted_profile(name, limit, features):
        # Every sorted profile is pre-sorted by descending PT_ET and then
        # ordered by the caller-supplied sort columns.
        return ObjectProfile(name, limit,
                             pre_sort_columns=["PT_ET"], pre_sort_ascending=False,
                             sort_columns=sort_columns, sort_ascending=sort_ascending,
                             addColumns=features)

    object_profiles = [
        _sorted_profile("EFlowPhoton", photon_max,
                        {"ObjFt1": -1, "ObjFt2": -1, "ObjFt3": -1}),
        _sorted_profile("EFlowNeutralHadron", neutral_max,
                        {"ObjFt1": -1, "ObjFt2": -1, "ObjFt3": 1}),
        _sorted_profile("EFlowTrack", charged_max,
                        {"ObjFt1": -1, "ObjFt2": 1, "ObjFt3": -1}),
        _sorted_profile("Electron", 8,
                        {"ObjFt1": -1, "ObjFt2": 1, "ObjFt3": 1}),
        _sorted_profile("MuonTight", 8,
                        {"ObjFt1": 1, "ObjFt2": -1, "ObjFt3": -1}),
        ObjectProfile("MissingET", 1, addColumns={"ObjFt1": 1, "ObjFt2": -1, "ObjFt3": 1}),
    ]

    temp_archive = "/".join([out_dir, 'temp_archive'])
    # Create the directory (and any missing parents) without the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    try:
        os.makedirs(temp_archive)
    except OSError:
        if not os.path.isdir(temp_archive):
            raise

    stride = strideFromTargetSize(object_profiles, label_dir_pairs, DEFAULT_OBSERV_TYPES, megabytes=size)
    print(stride)

    # Create the processing objects covering the range 0..num_samples in
    # steps of `stride`, then make sure each of them has been archived.
    dps = procsFrom_label_dir_pairs(0,
                                    num_samples,
                                    stride,
                                    temp_archive,
                                    label_dir_pairs,
                                    object_profiles,
                                    DEFAULT_OBSERV_TYPES,
                                    single_list=single_list,
                                    sort_columns=sort_columns,
                                    sort_ascending=sort_ascending,
                                    verbose=0)
    batchAssertArchived(dps, num_processes=num_processes)
        for max_EFlow_size in [100]:  #[100, 200]:

            object_profiles = [  #ObjectProfile("Electron",-1),
                #  ObjectProfile("MuonTight", -1),
                # ObjectProfile("Photon", -1),
                ObjectProfile("MissingET", 1)  #,
                #  ObjectProfile("EFlowPhoton",max_EFlow_size, sort_columns=[sort_on], sort_ascending=False),
                # ObjectProfile("EFlowNeutralHadron",max_EFlow_size, sort_columns=[sort_on], sort_ascending=False),
                # ObjectProfile("EFlowTrack",max_EFlow_size, sort_columns=[sort_on], sort_ascending=False)]
            ]
            resolveProfileMaxes(object_profiles, ldp)

            dps, l = getGensDefaultFormat(archive_dir, (100000,20000), 120000, \
                                 object_profiles,ldp,observ_types,megabytes=100, verbose=0)

            dependencies = batchAssertArchived(dps)
            train, num_train = l[0]
            val, num_val = l[1]
            #            test,  num_test  = l[2]
            max_q_size = l[2]
            print("MAXQ: ", max_q_size)

            for name in ['lorentz', 'not_lorentz', 'control_dense']:
                for sphereCoords in [False]:
                    for weight_output in [False, True]:
                        for depth in [2, 3, 4, 5]:
                            for width in [10, 25]:
                                for activation in ['relu']:
                                    for dropout in [0.0]:
                                        #Weight output is really only for lorentz
                                        if (weight_output == True