    def __fetch_intent_actions(self):
        """
        :return: 0 success 1 failure
        """
        self.intent_actions = []
        output_actions_txt_path = os.path.join(self.dst_output_path,
                                               'actions.txt')
        if not os.path.exists(output_actions_txt_path):
            if extract_spec_list_from_file(
                    self.intent_actions, self.am_processed_path,
                    EXTRACT_SPECS['ACTION']) != STATUS_OK:
                return STATUS_ERR
            write_list_to_file(self.intent_actions, output_actions_txt_path)
        if (not self.intent_actions) and read_file_to_list(
                self.intent_actions, output_actions_txt_path) != STATUS_OK:
            return STATUS_ERR
        else:
            if self.include_intent_actions_126:
                get_filtered_vector(
                    self.feature_list, self.intent_actions,
                    CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
                return STATUS_OK
            elif self.include_intent_actions_110:
                get_filtered_vector(
                    self.feature_list, self.intent_actions,
                    CONSTANTS['INTENT_ACTIONS_110']['REFERENCE_LIST'])
                return STATUS_OK
    def __fetch_sensitive_apis(self):
        """
        :return: 0 success 1 failure
        """
        self.sensitive_apis = []
        output_apis_txt_path = os.path.join(self.dst_output_path, 'apis.txt')
        if not os.path.exists(output_apis_txt_path):
            # use separate path components so the recursive glob also
            # matches on non-Windows systems
            smali_search_result = glob.glob(os.path.join(
                self.smali_dir_path, "**", "*.smali"),
                                            recursive=True)
            for smali_file in smali_search_result:
                if extract_sensitive_apis_list_from_smali(
                        self.sensitive_apis, smali_file) != STATUS_OK:
                    print('extract apis failed')
                    return STATUS_ERR
            write_list_to_file(self.sensitive_apis, output_apis_txt_path)

        if (not self.sensitive_apis) and read_file_to_list(
                self.sensitive_apis, output_apis_txt_path) != STATUS_OK:
            return STATUS_ERR
        else:
            get_filtered_vector(
                self.feature_list, self.sensitive_apis,
                CONSTANTS['SENSITIVE_APIS_106']['REFERENCE_LIST'])
            return STATUS_OK
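The write_list_to_file and read_file_to_list helpers these methods rely on are not shown here. A minimal sketch of the assumed contract (list argument first, 0/1 status codes matching STATUS_OK/STATUS_ERR) could look like this; the real project's helpers may differ:

import os

STATUS_OK, STATUS_ERR = 0, 1

def write_list_to_file(items, file_path):
    # Persist one item per line.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(str(item) for item in items))
    return STATUS_OK

def read_file_to_list(items, file_path):
    # Append the non-empty lines of file_path to the caller-supplied list.
    if not os.path.exists(file_path):
        return STATUS_ERR
    with open(file_path, 'r', encoding='utf-8') as f:
        items.extend(line.rstrip('\n') for line in f if line.strip())
    return STATUS_OK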
Example #3
def normalize_coordinates(tile_fnames_or_dir, output_dir, jar_file):

    all_files = []

    for file_or_dir in tile_fnames_or_dir:
        if not os.path.exists(file_or_dir):
            print("{0} does not exist (file/directory), skipping".format(file_or_dir))
            continue

        if os.path.isdir(file_or_dir):
            actual_dir_files = glob.glob(os.path.join(file_or_dir, '*.json'))
            all_files.extend(actual_dir_files)
        else:
            all_files.append(file_or_dir)

    if len(all_files) == 0:
        print("No files for normalization found. Exiting.")
        return

    print("Normalizing coordinates of {0} files".format(len(all_files)))

    files_urls = []
    for file_name in all_files:
        tiles_url = utils.path2url(file_name)
        files_urls.append(tiles_url)

    list_file = os.path.join(output_dir, "all_files.txt")
    print "list_file", list_file
    utils.write_list_to_file(list_file, files_urls)

    list_file_url = utils.path2url(list_file)

    java_cmd = 'java -Xmx3g -XX:ParallelGCThreads=1 -Djava.awt.headless=true -cp "{0}" org.janelia.alignment.NormalizeCoordinates --targetDir {1} {2}'.format(
        jar_file, output_dir, list_file_url)
    utils.execute_shell_command(java_cmd)
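utils.path2url is assumed to convert a local path into a file:// URL that the Java tool can consume; a standard-library sketch of that assumption:

from pathlib import Path

def path2url(file_name):
    # Resolve to an absolute path and format it as a file:// URL.
    return Path(file_name).resolve().as_uri()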
Example #4
def prepare_data_ids(vid_caps_path, ids_save_path):
    vid_caps_dict = utils.read_from_json(vid_caps_path)
    data_ids = []
    for vid_caps in vid_caps_dict.items():
        vid_id = vid_caps[0]
        if vid_id[-4:] == ".avi":
            vid_id = vid_id[:-4]
        for seq_id in range(len(vid_caps[1])):
            data_id = vid_id + "|" + str(seq_id)
            data_ids.append(data_id)
    utils.write_list_to_file(ids_save_path, data_ids)
def run(from_scratch, scroll_down_count):
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(utils.get_chromedriver_path(),
                              chrome_options=chrome_options)
    driver.maximize_window()

    cities_list = cities_extraction(driver, from_scratch, scroll_down_count)
    logger.info(config.MSG_DICT["CITIES_FOUND_COUNT"].format(len(cities_list)))
    utils.write_list_to_file(config.CITIES_FILENAME, cities_list)

    time.sleep(config.GENERAL_WAITER)
    driver.quit()
Example #6
def generate_dup2_commands(newfiles):
    renumberfilename = freerel_out["renumberfilename"]
    paths = ["tasks", "filter", "duplicates2", "dup2"]
    program_to_run = cadoprograms.Duplicates2
    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)
    progparams.pop("renumber", None)

    commands = []

    for i, (filename, rels) in enumerate(newfiles):
        if filename in generate_dup2_commands.slice_files[i]:
            continue  # nothing to do

        logger.info("Dup2: Processing slice %d", i)
        generate_dup2_commands.slice_rels[i] += rels
        rels = generate_dup2_commands.slice_rels[i] 
        generate_dup2_commands.slice_files[i].add(filename)
        files = list(generate_dup2_commands.slice_files[i])

        if len(files) <= 10:
            program = cadoprograms.Duplicates2(*files, rel_count=rels, renumber=renumberfilename, **progparams)
        else:
            filelist = utils.write_list_to_file(files, dup1dir)
            program = cadoprograms.Duplicates2(filelist=filelist, rel_count=rels, renumber=renumberfilename, **progparams)
        command = program.make_command_line()
        cmd_logger.debug(command)
        commands.append(command)
    return commands
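Note that the cado-style utils.write_list_to_file used here takes the list first and a directory second, and returns the path of the generated filelist (which is then passed via the filelist= parameter). A sketch under that assumption:

import os
import tempfile

def write_list_to_file(items, out_dir):
    # Write the names into a uniquely named list file inside out_dir and
    # return that file's path, so callers can pass filelist= instead of
    # thousands of positional arguments.
    fd, list_path = tempfile.mkstemp(suffix='.lst', dir=out_dir, text=True)
    with os.fdopen(fd, 'w') as f:
        f.write('\n'.join(items) + '\n')
    return list_path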
Example #8
def generate_dup1_command(newfiles):
    paths = ["tasks", "filter", "duplicates1", "dup1"]
    program_to_run = cadoprograms.Duplicates1

    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)
    progparams.pop("prefix", None)
    progparams.pop("out", None)

    prefix = "dup1.%d" % (time.time())

    if generate_dup1_command.nr_slices is None:  # first run
        generate_dup1_command.nr_slices = 2  # default to 2**1
        if "nslices_log" in progparams:
            generate_dup1_command.nr_slices = 2 ** progparams["nslices_log"]
        for i in range(0, generate_dup1_command.nr_slices):
            try:
                os.makedirs(os.path.join(dup1dir, str(i)))
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

    # might want to improve this and maintain status of all files 
    # (in case a dup1 run fails) so that a later run can try again
    if len(newfiles) <= 10:
        program = cadoprograms.Duplicates1(*newfiles, prefix=prefix, out=dup1dir, **progparams)
    else:
        filelist = utils.write_list_to_file(newfiles, dup1dir)
        program = cadoprograms.Duplicates1(filelist=filelist, prefix=prefix, out=dup1dir, **progparams)

    command = program.make_command_line()
    cmd_logger.debug(command)
    return command
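generate_dup1_command keeps its state in function attributes, so callers are expected to initialize those attributes before the first invocation; presumably something along these lines:

import collections

# One-time setup before the first call; function attributes act as cheap
# per-function state shared across invocations (names assumed from usage).
generate_dup1_command.nr_slices = None
generate_dup2_commands.slice_files = collections.defaultdict(set)
generate_dup2_commands.slice_rels = collections.defaultdict(int)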
Example #9
def clean_caps_df(csv_data, present_vid_ids, present_vid_ids_csv):
    vid_list = list(set([get_vid_ids(s) for s in present_vid_ids]))
    assert len(vid_list) == len(present_vid_ids_csv)
    df = csv_data.loc[((csv_data['VideoID'].isin(vid_list))
                       & (csv_data['Language'] == 'English'))
                      & csv_data['Description'].notnull()]
    df.to_csv(config.MSVD_FINAL_CORPUS_PATH, index=False, encoding='utf-8')
    df = utils.read_csv_data(config.MSVD_FINAL_CORPUS_PATH)
    omitted_caps = []
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")  # Python 2 API; use str.maketrans on Python 3
    df['Description'] = df.apply(lambda row: clean_caps(
        row['Description'], punct_dict, translator, omitted_caps),
                                 axis=1)
    df = df.loc[df['Description'].notnull()]
    df.to_csv(config.MSVD_FINAL_CORPUS_PATH, index=False, encoding='utf-8')
    print("Non-ASCII captions omitted :" + str(len(omitted_caps)))
    utils.write_list_to_file(config.MSVD_OMMITTED_CAPS_PATH, omitted_caps)
    return df
    def __fetch_permissions(self):
        """
        :return: 0 success 1 failure
        """
        self.permissions = []
        output_permissions_txt_path = os.path.join(self.dst_output_path,
                                                   'permissions.txt')
        if not os.path.exists(output_permissions_txt_path):
            if extract_spec_list_from_file(
                    self.permissions, self.am_processed_path,
                    EXTRACT_SPECS['PERMISSION']) != STATUS_OK:
                return STATUS_ERR
            write_list_to_file(self.permissions, output_permissions_txt_path)
        if (not self.permissions) and read_file_to_list(
                self.permissions, output_permissions_txt_path) != STATUS_OK:
            return STATUS_ERR
        else:
            get_filtered_vector(self.feature_list, self.permissions,
                                CONSTANTS['PERMISSIONS_147']['REFERENCE_LIST'])
            return STATUS_OK
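All three fetch methods funnel their extracted lists through get_filtered_vector, which is assumed to append one 0/1 entry per reference item marking whether it was observed; a minimal sketch of that assumption:

def get_filtered_vector(feature_list, extracted_items, reference_list):
    # Extend feature_list with a binary presence vector over reference_list.
    extracted = set(extracted_items)
    feature_list.extend(1 if ref in extracted else 0 for ref in reference_list)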
Example #11
def filter_clips(csv_data, vid_clips_list):
    pf = []  # clips referenced in the CSV and present on disk
    mf = []  # clips referenced in the CSV but missing on disk
    tl = 0  # total distinct clips referenced by the CSV
    pvids = []  # video ids with at least one present clip
    for index, row in csv_data.iterrows():
        fname = str(row["VideoID"]) + "_" + str(row["Start"]) + "_" + str(
            row["End"]) + ".avi"
        if fname in vid_clips_list:
            if fname not in pf:
                pf.append(fname)
                tl += 1
            if row["VideoID"] not in pvids:
                pvids.append(row["VideoID"])
        else:
            if fname not in mf:
                mf.append(fname)
                tl += 1
    utils.write_list_to_file(config.DATA_DIR + "present_vid_ids.txt", pf)
    utils.write_list_to_file(config.DATA_DIR + "missing_vid_ids.txt", mf)
    utils.write_list_to_file(config.DATA_DIR + "present_csv_vid_ids.txt",
                             pvids)
    print("Present : {}".format(len(pf)))
    print("Missing : {}".format(len(mf)))
    print("Total (from CSV): {}".format(tl))
    return pf, mf, pvids
Example #12
def generate_purge_command(dup2_out):
    paths = ["tasks", "filter", "purge", "purge"]
    program_to_run = cadoprograms.Purge

    progparams = parameters.myparams(program_to_run.get_accepted_keys(), paths)

    nfree = freerel_out["nfree"]
    nunique = sum(dup2_out)
    input_nrels = nfree + nunique
    nprimes = freerel_out["nprimes"]

    minindex = int(progparams.get("col_minindex", -1))
    if minindex == -1:
        minindex = int(nprimes / 20.0)
        # For small cases, we want to avoid degenerate cases, so let's
        # keep most of the ideals: memory is not an issue in that case.
        if minindex < 10000:
            minindex = 500
        progparams.setdefault("col_minindex", minindex)
    keep = progparams.pop("keep", None)

    relsdelfile = None  # not supporting dlp yet

    files = [freerel_out["freerelfilename"]]
    for i in range(generate_dup1_command.nr_slices):
        files += list(generate_dup2_commands.slice_files[i])

    if len(files) <= 10:
        program = cadoprograms.Purge(*files,
                                     nrels=input_nrels,
                                     out=purgedfile,
                                     outdel=relsdelfile,
                                     keep=keep,
                                     nprimes=nprimes,
                                     **progparams)
    else:
        filelist = utils.write_list_to_file(files, dup1dir)
        program = cadoprograms.Purge(nrels=input_nrels,
                                     out=purgedfile,
                                     outdel=relsdelfile,
                                     keep=keep,
                                     nprimes=nprimes,
                                     filelist=filelist,
                                     **progparams)

    command = program.make_command_line()
    cmd_logger.debug(command)
    return command, purgedfile
Example #14
def split_data(csv_data):
    vid_ids = utils.read_file_to_list(config.MSVD_VID_IDS_ALL_PATH)
    assert len(vid_ids) == config.TOTAL_VIDS
    utils.shuffle_array(vid_ids)
    train_ids = vid_ids[0:1200]
    val_ids = vid_ids[1200:1300]
    test_ids = vid_ids[1300:1970]
    assert len(train_ids) == config.TRAIN_VIDS
    assert len(val_ids) == config.VAL_VIDS
    assert len(test_ids) == config.TEST_VIDS
    utils.write_list_to_file(config.MSVD_VID_IDS_TRAIN_PATH, train_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_VAL_PATH, val_ids)
    utils.write_list_to_file(config.MSVD_VID_IDS_TEST_PATH, test_ids)
    train_df = filter_df(csv_data, train_ids,
                         config.MSVD_FINAL_CORPUS_TRAIN_PATH)
    val_df = filter_df(csv_data, val_ids, config.MSVD_FINAL_CORPUS_VAL_PATH)
    test_df = filter_df(csv_data, test_ids, config.MSVD_FINAL_CORPUS_TEST_PATH)
    return train_df, val_df, test_df
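filter_df is assumed to select the corpus rows belonging to a split and persist them before returning the frame; a sketch of that assumption:

def filter_df(csv_data, vid_ids, save_path):
    # Keep only rows whose VideoID belongs to the split, save, and return.
    df = csv_data.loc[csv_data['VideoID'].isin(vid_ids)]
    df.to_csv(save_path, index=False, encoding='utf-8')
    return df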
Example #16
            # Merge the multiple mfovs pmcc match files into one per direction
            pmcc_fname = os.path.join(matched_pmcc_dir, "{0}_{1}_match_pmcc.json".format(fname1_prefix, fname2_prefix))



            j += 1
            matched_after_layers += 1



    print "all_pmcc_files: {0}".format(all_pmcc_files)

    # Create a single file that lists all tilespecs and a single file that lists all pmcc matches (the OS doesn't support a very long argument list)
    ts_list_file = os.path.join(args.workspace_dir, "all_ts_files.txt")
    write_list_to_file(ts_list_file, all_ts_files)
    pmcc_list_file = os.path.join(args.workspace_dir, "all_pmcc_files.txt")
    write_list_to_file(pmcc_list_file, all_pmcc_files)


    # Optimize all layers to a single 3d image
    sections_opt_outputs = []
    for i in all_layers:
        out_section = os.path.join(post_optimization_dir, '{}_{}'.format(str(i).zfill(4), os.path.basename(layers_data[str(i)]['ts'])))
        sections_opt_outputs.append(out_section)

    dependencies = list(all_running_jobs)
    job_optimize = OptimizeLayersElastic(dependencies, sections_opt_outputs, [ ts_list_file ], [ pmcc_list_file ],
        post_optimization_dir, args.max_layer_distance, conf_fname=args.conf_file_name,
        skip_layers=args.skip_layers, threads_num=4)
    all_running_jobs.append(job_optimize)
    bbox1 = BoundingBox.fromList(ts1["bbox"])
    bbox2 = BoundingBox.fromList(ts2["bbox"])
    if bbox1.overlap(bbox2):
        imageUrl1 = ts1["mipmapLevels"]["0"]["imageUrl"]
        imageUrl2 = ts2["mipmapLevels"]["0"]["imageUrl"]
        tile_fname1 = os.path.basename(imageUrl1).split('.')[0]
        tile_fname2 = os.path.basename(imageUrl2).split('.')[0]
        print "Matching features of tiles: {0} and {1}".format(imageUrl1, imageUrl2)
        index_pair = [idx1, idx2]
        match_json = os.path.join(args.workspace_dir, "{0}_sift_matches_{1}_{2}.json".format(tiles_fname_prefix, tile_fname1, tile_fname2))
        # match the features of overlapping tiles
        if not os.path.exists(match_json):
            match_single_sift_features_and_filter(args.tiles_fname, all_features[imageUrl1], all_features[imageUrl2], match_json, index_pair, conf_fname=args.conf_file_name)
        all_matched_features.append(match_json)

print('features matching took {0:1.4f} seconds'.format(time.time() - start_time))

# Create a single file that lists all sift match files (the OS doesn't support a very long argument list)
matches_list_file = os.path.join(args.workspace_dir, "all_matched_sifts_files.txt")
write_list_to_file(matches_list_file, all_matched_features)

# optimize the 2d layer montage
if not os.path.exists(args.output_file_name):
    print "Optimizing section in tilespec: {}".format(args.tiles_fname)
    start_time = time.time()
    optimize_2d_mfovs(args.tiles_fname, matches_list_file, args.output_file_name, args.conf_file_name)
    print('2D Optimization took {0:1.4f} seconds'.format(time.time() - start_time))



Example #18
        raise NotImplementedError()
    encoded_video = np.loadtxt(encoded_feats_path, delimiter=',')
    print(encoded_video.shape)
    num, dim = encoded_video.shape
    assert num == dictsize
    for vid_id in range(num):
        vid_feats = encoded_video[vid_id].reshape(32, 1024)
        # print(vid_feats.shape)
        np.save(feat_save_path + whichdata + "_" + str(vid_id) + ".npy",
                vid_feats)


if __name__ == '__main__':
    print("generating vocab for train data...")
    vocab, _, omitted_caps_train = gen_vocab(config.MURALI_TRAIN_VIDS, "train")
    _, _, omitted_caps_test = gen_vocab(config.MURALI_TEST_VIDS, "test")
    omitted_caps = omitted_caps_train + omitted_caps_test
    utils.write_list_to_file(config.MURALI_MSVD_OMMITTED_CAPS_PATH,
                             omitted_caps)
    print("generating train data vid+seq ids...")
    prepare_data_ids(config.MURALI_MSVD_VID_CAPS_TRAIN_PATH,
                     config.MURALI_MSVD_DATA_IDS_TRAIN_PATH)
    # print("generating val data vid+seq ids...")
    # prepare_data_ids(config.MURALI_MSVD_VID_CAPS_VAL_PATH, config.MURALI_MSVD_DATA_IDS_VAL_PATH)
    print("generating test data vid+seq ids...")
    prepare_data_ids(config.MURALI_MSVD_VID_CAPS_TEST_PATH,
                     config.MURALI_MSVD_DATA_IDS_TEST_PATH)
    print("seperating train vids encoded features...")
    save_feats("train")
    print("seperating test vids encoded features...")
    save_feats("test")
Example #19
def main():
    # setting parameters
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--mode',
                        type=str,
                        default=None,
                        help='TRAIN or FINETUNE or INFER.')
    parser.add_argument('--epoches',
                        type=int,
                        default=1000,
                        help='number of epochs.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=50,
                        help='minibatch size.')
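    # NOTE: argparse's type=bool converts any non-empty string to True, so
    # the bool-typed flags below (e.g. --batch_increase) only behave as
    # expected when left at their defaults.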
    parser.add_argument('--batch_increase',
                        type=bool,
                        default=True,
                        help='whether to increase the batch_size')
    parser.add_argument('--num_layers_encoder',
                        type=int,
                        default=2,
                        help='number of encoder layers.')
    parser.add_argument('--num_layers_decoder',
                        type=int,
                        default=1,
                        help='number of decoder layers.')
    parser.add_argument(
        '--embedding_dim',
        type=int,
        default=100,
        help='dimension of the embedding vectors in the embedding matrix.')
    parser.add_argument('--num_heads',
                        type=int,
                        default=8,
                        help='number of heads in multi-head attention.')
    parser.add_argument('--rnn_size_encoder',
                        type=int,
                        default=256,
                        help='number of hidden units in encoder.')
    parser.add_argument('--rnn_size_decoder',
                        type=int,
                        default=256,
                        help='number of hidden units in decoder.')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.001,
                        help='learning rate in every training step.')
    parser.add_argument('--learning_rate_decay',
                        type=float,
                        default=1,
                        help='only if exponential learning rate is used.')
    parser.add_argument('--learning_rate_decay_steps',
                        type=int,
                        default=100,
                        help='learning rate decay period.')
    parser.add_argument('--max_lr',
                        type=float,
                        default=0.01,
                        help='only if cyclic learning rate is used.')
    parser.add_argument('--label_smoothing',
                        type=float,
                        default=0,
                        help='the label smoothing rate.')
    parser.add_argument(
        '--keep_probability_i',
        type=float,
        default=1,  #0.825
        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument(
        '--keep_probability_o',
        type=float,
        default=1,  #0.895
        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument(
        '--keep_probability_h',
        type=float,
        default=1,  #0.86
        help='values inspired by Jeremy Howard\'s fast.ai course.')
    parser.add_argument(
        '--keep_probability_e',
        type=float,
        default=1,  #0.986
        help='values inspired by Jeremy Howard\'s fast.ai course.')
    # A bug occurs when 0 is chosen. Please set beam_width greater than 0
    # at the infer stage until the problem is resolved.
    parser.add_argument('--beam_width',
                        type=int,
                        default=1,
                        help='only used in inference, for Beam Search.')
    parser.add_argument(
        '--clip',
        type=int,
        default=5,
        help='value to clip the gradients to in training process.')
    parser.add_argument('--inference_targets',
                        type=int,
                        default=False,
                        help='maximum iterations at decoding period')
    parser.add_argument('--use_cyclic_lr',
                        type=int,
                        default=False,
                        help='use cyclical learning rates.')
    parser.add_argument(
        '--key_words_biasing',
        type=bool,
        default=True,
        help='whether implement the CLAS for key words, default YES')
    parser.add_argument(
        '--attention_type',
        type=str,
        default='MultiHeadAttention',
        help='MultiHeadAttention or BahdanauAttention can be selected.')
    parser.add_argument(
        '--attention_type_bias',
        type=str,
        default='MultiHeadAttention',
        help='MultiHeadAttention or BahdanauAttention can be selected.')
    parser.add_argument('--crf_layer',
                        type=bool,
                        default=True,
                        help='if add a crf layer on the decoder outputs.')
    parser.add_argument(
        '--dev',
        type=str,
        default='cpu',
        help=
        'train on CPU or GPU; pass cpu, gpu:0, gpu:1, gpu:2, or gpu:3.'
    )
    args = parser.parse_args()

    ##################################################################################
    # initialize the data, model graph, parameters
    ##################################################################################
    print("creating data operator...")
    # param vocab_create_mode='BUILD' in the first training
    # the trn files and wav files saved in different folders
    if args.mode == 'INFER':
        args.batch_size = 1
        data = Corpus(trn_file=TEST_TRN_FILE, wav_file=TEST_WAV_FILE, \
                      mfcc_file=TEST_OUTPUT_MFCC_FILE, args=args, \
                      vocab_create_mode='LOAD', mfcc_create='N')
    else:
        data = Corpus(trn_file=TRN_FILE, wav_file=WAV_FILE, mfcc_file=OUTPUT_MFCC_FILE, \
                  args=args, vocab_create_mode='LOAD', mfcc_create='N')
    print("building model graph...")
    model = LAS(args, data.vocab)
    model.build_model()
    saver = tf.train.Saver()

    sess = tf.Session()
    print("initializing parameters...")
    sess.run(tf.global_variables_initializer())

    ##################################################################################
    # TRAIN or INFERENCE stage
    ##################################################################################
    if args.mode == 'TRAIN':
        ## train
        with tf.device("/" + str(args.dev)):
            best_loss = np.inf
            for epoch in range(args.epoches):
                ## """attempt to increase the batch_size, increase 10 when the epoches increase 50,
                ## but the max batch_size should be 100 because of the memory limit."""
                if epoch % 50 == 0 and args.batch_increase and (epoch != 0):
                    args.batch_size += 10
                if args.batch_size >= 100:
                    args.batch_increase = False

                avg_loss = iter_epoches(sess, epoch, data, model)
                # if current loss is smaller than the best
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    print("best_loss: %6f" % (best_loss))
                    # save model
                    save_path = saver.save(sess, "save/model.ckpt")

    elif args.mode == 'FINETUNE':
        ## train the model base on the parameters of the previous training
        with tf.device("/" + str(args.dev)):
            # read model from file
            saver.restore(sess, "save/model.ckpt")
            best_loss = np.inf
            for epoch in range(args.epoches):
                avg_loss = iter_epoches(sess, epoch, data, model)
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    # save model
                    print("best_loss: %6f" % (best_loss))
                    save_path = saver.save(sess, "save/model.ckpt")

    elif args.mode == 'INFER':
        with tf.device("/" + str(args.dev)):
            # read model parameters from file
            saver.restore(sess, "save/model.ckpt")
            batches = data.batch_generator()
            lines = []
            wers = []
            count = 0
            biases = INFERENCE_BIAS
            bias_seq_len = [len(bias) for bias in biases]
            biases = data.trans_label_to_index(biases)
            biases = data.padding(biases, bias_seq_len)
            while True:
                count += 1
                if count % 1 == 0:
                    print(str(count) + ' finished...')
                try:
                    mfcc_features, audio_seq_len, labels, label_seq_len, _, _ = \
                                                                get_feeds(batches)
                    bias_att_len = [len(biases) for _ in range(len(labels))]
                    feed = {
                        model.audios: mfcc_features,
                        model.audio_sequence_lengths: audio_seq_len,
                        model.bias_ids: biases,
                        model.char_sequence_lengths: label_seq_len,
                        model.bias_sequence_lengths: bias_seq_len,
                        model.bias_attention_lengths: bias_att_len
                    }
                    train_ops = model.sample_words
                    preds = run_train_op(sess, train_ops, feed)
                    for p, label in zip(preds, labels):
                        sen = np.transpose(np.array(p), [1, 0])
                        line = ' '.join(data.trans_index_to_label(list(
                            sen[0])))
                        lines.append(line)
                        # calculate the WER
                        wers.append(get_edit_distance(line, label))
                except StopIteration:
                    break
            wer = np.mean(np.array(wers))
            print(wer)
            utils.write_list_to_file('pred/predictions.txt', lines, 'a+')
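get_edit_distance above is assumed to return a per-utterance word error rate, i.e. word-level Levenshtein distance normalized by the reference length; a sketch of that assumption:

def get_edit_distance(hyp, ref):
    # Classic dynamic-programming edit distance over word lists.
    h, r = hyp.split(), ref.split()
    d = [[0] * (len(r) + 1) for _ in range(len(h) + 1)]
    for i in range(len(h) + 1):
        d[i][0] = i
    for j in range(len(r) + 1):
        d[0][j] = j
    for i in range(1, len(h) + 1):
        for j in range(1, len(r) + 1):
            cost = 0 if h[i - 1] == r[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(h)][len(r)] / max(len(r), 1)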
def extract_roi_from_matlab_annotations(movie_path: str,
                                        annotation_path: str,
                                        output_path: str,
                                        max_frame: int = 100000):

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    # Create video source instance
    print('Initializing video capture at {}'.format(movie_path))
    video_src = Video_Reader(movie_path)
    _, image = video_src.get_frame()
    video_src.reset()

    img_height, img_width, img_channel = image.shape

    print('Reading annotation at {}'.format(annotation_path))
    Annotation_list = bbt.Read_Annotation(annotation_path,
                                          (img_width, img_height))

    cooccurring_tracks = []
    bounding_boxes_list = []
    bbx_to_gt_list = []
    track_to_gt_list = []

    print('Extracting face patches.')

    frame_idx = 0
    bbx_idx = 0
    num_frame = min(len(Annotation_list), max_frame)
    tbar = tqdm.tqdm(range(num_frame))
    for j in tbar:

        ret, image = video_src.get_frame()
        if not ret:
            break

        bounding_boxes = Annotation_list[frame_idx]

        track_list = []
        for bbx in bounding_boxes:

            cropped_image = image[bbx[1]:bbx[3], bbx[0]:bbx[2], :]
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
            cropped_image = Image.fromarray(cropped_image)
            cropped_image = utils.make_square(cropped_image)
            cropped_image = cropped_image.resize((160, 160),
                                                 resample=Image.LANCZOS)

            track_id = bbx[6]
            gt_label = bbx[4]
            bounding_boxes_list.append(
                [frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]])
            bbx_to_gt_list.append([bbx_idx, gt_label])
            track_to_gt_list.append([track_id, gt_label])

            # Save image
            dir_name = '{:04d}'.format(track_id)
            image_name = '{:06d}.png'.format(bbx_idx)
            save_path = os.path.join(output_path, dir_name)
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            save_file_path = os.path.join(save_path, image_name)
            cropped_image.save(save_file_path)

            track_list.append(track_id)
            bbx_idx += 1

        # Note co-occurring tracks
        if len(track_list) > 1:
            track_list = sorted(track_list)
            if track_list not in cooccurring_tracks:
                cooccurring_tracks.append(track_list)

        frame_idx += 1

    # Save co-occurring tracks
    utils.write_list_to_file(
        os.path.join(output_path, "cooccurring_tracks.txt"),
        cooccurring_tracks)
    # Save bbx
    utils.write_list_to_file(os.path.join(output_path, "bbx.txt"),
                             bounding_boxes_list)

    # Save ground truth
    utils.write_list_to_file(os.path.join(output_path, "bbx_gt.txt"),
                             bbx_to_gt_list)
    utils.write_list_to_file(os.path.join(output_path, "track_gt.txt"),
                             track_to_gt_list)

    print('{} co-occurring tracks.'.format(len(cooccurring_tracks)))
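utils.make_square is assumed to pad the crop to a square so the subsequent resize does not distort the face; a PIL sketch of that assumption:

from PIL import Image

def make_square(img, fill=(0, 0, 0)):
    # Paste the crop centered on a black square canvas of its longer side.
    side = max(img.size)
    canvas = Image.new('RGB', (side, side), fill)
    canvas.paste(img, ((side - img.width) // 2, (side - img.height) // 2))
    return canvas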
Example #21
def run_submit(args):

    augment = ['null']
    out_dir = args.out_dir + f'/{args.model_name}'
    initial_checkpoint = args.initial_checkpoint
    batch_size = args.batch_size

    ## setup out_dir
    os.makedirs(out_dir +'/submit', exist_ok=True)

    log = Logger()
    log.open(out_dir+'/log.submit.txt',mode='a')
    log.write('\n--- [START %s] %s\n\n' % (IDENTIFIER, '-' * 64))
    log.write('\t%s\n' % COMMON_STRING)
    log.write('\n')
    log.write('\tSEED         = %u\n' % SEED)
    log.write('\t__file__     = %s\n' % __file__)
    log.write('\tout_dir      = %s\n' % out_dir)
    log.write('\n')

    log.write('submitting .... @ %s\n'%str(augment))
    log.write('initial_checkpoint  = %s\n'%initial_checkpoint)
    log.write('\n')

    if 1: #save
        log.write('** dataset setting **\n')
        files_train = [f'train_image_data_{fid}.feather' for fid in range(4)]
        data = read_data(args.data_dir, files_train)
        
        df = pd.read_csv(args.df_path)
        valid_split = np.load(args.data_dir + '/valid_b_fold1_15985.npy').tolist()
        valid_df = df[df['image_id'].isin(valid_split)]

        test_dataset = KaggleDataset(
            df       = df,
            data     = data,
            idx      = valid_df.index.values, 
            augment  = valid_augment,
        )

        log.write('\n')

        ## net
        log.write('** net setting **\n')
        if args.model_name == 'serex50':
            net = Serex50_Net().cuda()
        elif args.model_name == 'effnetb3':
            net = EfficientNet_3().cuda()
        else:
            raise NotImplementedError

        net.load_state_dict(torch.load(initial_checkpoint, map_location=lambda storage, loc: storage), strict=True)

        image_id, truth, probability = do_evaluate(net, test_dataset, batch_size,  augment)


        if 1: #save
            write_list_to_file(out_dir + '/submit/image_id.txt', image_id)
            write_pickle_to_file(out_dir + '/submit/probability.pickle', probability)
            write_pickle_to_file(out_dir + '/submit/truth.pickle', truth)

    if 1:
        image_id = read_list_from_file(out_dir + '/submit/image_id.txt')
        probability = read_pickle_from_file(out_dir + '/submit/probability.pickle')
        truth       = read_pickle_from_file(out_dir + '/submit/truth.pickle')
    num_test = len(image_id)

    if 1:
        recall, average_recall = compute_kaggle_metric(probability, truth)
        log.write('average_recall : %f\n' % (average_recall))

        for i, name in enumerate(TASK_NAME):
            log.write('%28s  %f\n' % (name, recall[i]))
        log.write('\n')
def extract_roi(movie_path: str,
                output_path: str,
                max_frame: int = 100000,
                tracker_max_age: int = 10):

    # Create video source instance
    print('Initializing video capture at {}'.format(movie_path))
    video_src = Video_Reader(movie_path)
    _, image = video_src.get_frame()
    video_src.reset()

    my_fastdt = FAST_DT("cpu", tracker_max_age=tracker_max_age)

    print('Extracting face patches.')

    image_dict = {}
    bbx_dict = {}
    cooccurring_tracks = []
    bbx_idx = 0
    tbar = tqdm.tqdm(range(max_frame))
    for frame_idx in tbar:

        ret, image = video_src.get_frame()
        if not ret:
            break

        bounding_boxes = my_fastdt.predict(image)

        for bbx in bounding_boxes:

            cropped_image = image[bbx[1]:bbx[3], bbx[0]:bbx[2], :]
            cropped_image = cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB)
            cropped_image = Image.fromarray(cropped_image)
            cropped_image = utils.make_square(cropped_image)
            cropped_image = F.resize(cropped_image, size=160, interpolation=1)

            track_id = bbx[4]
            # bounding_boxes_list.append([frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2], bbx[3]])

            if track_id not in image_dict.keys():
                image_dict[track_id] = [(cropped_image, bbx_idx, frame_idx)]
                bbx_dict[track_id] = [[
                    frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2],
                    bbx[3]
                ]]
            else:
                image_dict[track_id].append(
                    (cropped_image, bbx_idx, frame_idx))
                bbx_dict[track_id].append([
                    frame_idx, track_id, bbx_idx, bbx[0], bbx[1], bbx[2],
                    bbx[3]
                ])

            bbx_idx += 1

    # Remove the last samples of each track as they are residual samples from the tracker max age
    print('Removing residual samples.')
    track_id_list = list(image_dict.keys())
    for track_id in track_id_list:
        if len(image_dict[track_id]) + 1 < tracker_max_age:
            image_dict.pop(track_id)
            bbx_dict.pop(track_id)
        else:
            image_dict[track_id] = image_dict[track_id][1:-tracker_max_age]
            bbx_dict[track_id] = bbx_dict[track_id][1:-tracker_max_age]

    # Create the bounding_box_list
    bounding_boxes_list = []
    for track_id in bbx_dict.keys():
        for bbx in bbx_dict[track_id]:
            bounding_boxes_list.append(bbx)

    # Convert the per-track dictionary to a per-frame dictionary
    print('Creating dataset.')
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    frame_to_track_dict = {}
    tbar2 = tqdm.tqdm(image_dict.keys())
    for track_id in tbar2:
        for cropped_image, bbx_idx, frame_idx in image_dict[track_id]:
            if frame_idx not in frame_to_track_dict.keys():
                frame_to_track_dict[frame_idx] = [track_id]
            else:
                frame_to_track_dict[frame_idx].append(track_id)

            # Save image
            dir_name = '{:04d}'.format(track_id)
            image_name = '{:06d}.png'.format(bbx_idx)
            save_path = os.path.join(output_path, dir_name)
            if not os.path.exists(save_path):
                os.mkdir(save_path)
            save_file_path = os.path.join(save_path, image_name)
            cropped_image.save(save_file_path)

    # Find co-occurring tracks
    print('Forming co-occurring tracks file.')
    for frame_idx in frame_to_track_dict.keys():
        track_list = []
        for track_id in frame_to_track_dict[frame_idx]:
            track_list.append(track_id)
        # Note co-occurring tracks
        if len(track_list) > 1:
            track_list = sorted(track_list)
            if track_list not in cooccurring_tracks:
                cooccurring_tracks.append(track_list)

    # Save co-occurring tracks (once, after the loop)
    utils.write_list_to_file(
        os.path.join(output_path, "cooccurring_tracks.txt"),
        cooccurring_tracks)
    # Save bbx
    utils.write_list_to_file(os.path.join(output_path, "bbx.txt"),
                             bounding_boxes_list)

    print('{} co-occurring tracks.'.format(len(cooccurring_tracks)))
                matched_json_basename = "{0}_matches_{1}_{2}.json".format(
                    tiles_fname_prefix, tile_fname1, tile_fname2)
                matched_json = os.path.join(layer_matched_sifts_dir,
                                            matched_json_basename)
                # match the features of overlapping tiles
                if not os.path.exists(matched_json):
                    print("Matching sift of tiles: {0} and {1}".format(
                        imageUrl1, imageUrl2))
                    match_single_sift_features_and_filter(
                        f, sifts_1, sifts_2, matched_json, index_pair)
                    layers_data[slayer]['matched_sifts'].append(matched_json)

        matches_list_file = os.path.join(
            layer_matched_sifts_dir,
            "{}_matched_sifts_files.txt".format(tiles_fname_prefix))
        write_list_to_file(matches_list_file,
                           layers_data[slayer]['matched_sifts'])

        # optimize (affine) the 2d layer matches
        opt_montage_json = os.path.join(
            optimized_2d_dir, "{0}_montaged.json".format(tiles_fname_prefix))
        if not os.path.exists(opt_montage_json):
            print("Optimizing (affine) layer matches: {0}".format(slayer))
            optimize_2d_stitching(f, matches_list_file, opt_montage_json)
        if render_first:
            render_tile(opt_montage_json)
            render_first = False
            a = input("Rendered right?(Yes/No)")
            if a == 'Yes' or a == 'yes':
                continue
            else:
                sys.exit(1)
Example #24
def run():
    cities = utils.read_list_from_file(config.CITIES_FILENAME)
    users_list = get_all_users(cities)
    utils.write_list_to_file(config.USERS_FILENAME, users_list)
# Verify that all the layers are there and that there are no holes
all_layers.sort()
for i in range(len(all_layers) - 1):
    if all_layers[i + 1] - all_layers[i] != 1:
        for l in range(all_layers[i] + 1, all_layers[i + 1]):
            if l not in skipped_layers:
                print "Error missing layer {} between: {} and {}".format(l, all_layers[i], all_layers[i + 1])
                sys.exit(1)

# Normalize the sections
print([layer_to_bbox[l] for l in layer_to_bbox.keys()])
normalize_coordinates([layer_to_bbox[l] for l in layer_to_bbox.keys()], norm_dir, args.jar_file)

norm_list_file = os.path.join(args.workspace_dir, "all_norm_files.txt")
write_list_to_file(norm_list_file, all_norm_files)



# Render each layer individually
for tiles_fname in glob.glob(os.path.join(norm_dir, '*.json')):
    tiles_fname_prefix = os.path.splitext(os.path.basename(tiles_fname))[0]

    # read the layer from the file
    layer = read_layer_from_file(tiles_fname)


    # Check if it already rendered the files (don't know the output type)
    render_out_files = glob.glob(os.path.join(args.output_dir, '{0:0>4}_{1}.*'.format(layer, tiles_fname_prefix)))
    if len(render_out_files) > 0:
        print "Skipping rendering of layer {}, because found: {}".format(layer, render_out_files)
Example #26
                        fold_ft_sentences[fold] = {
                            "all": [],
                            "ob": [],
                            "eb": [],
                            "s2r": []
                        }
                    append_to_aggreg_dict(fold_ft_sentences[fold],
                                          ft_sentences)

                    if fold not in fold_sentences:
                        fold_sentences[fold] = []
                    fold_sentences[fold].extend(sentences)

                    for type, ft_sents in ft_sentences.items():
                        utils.write_list_to_file(
                            ft_sents,
                            os.path.join(output_path, sys_name,
                                         ".".join([fold, type, "prep.ft"])))
                    utils.write_json_line_by_line(
                        sentences,
                        os.path.join(output_path, sys_name, fold + ".prep"))

            for fold, sentences in fold_ft_sentences.items():
                for type, ft_sents in sentences.items():
                    utils.write_list_to_file(
                        ft_sents,
                        os.path.join(output_path,
                                     ".".join([fold, type, "prep.ft"])))
            for fold, sentences in fold_sentences.items():
                utils.write_json_line_by_line(
                    sentences, os.path.join(output_path, fold + ".prep"))
def create_post_filter_jobs(slayer, filtered_ts_fname, layers_data, jobs,
                            matched_sifts_dir, workspace_dir, output_dir,
                            conf_file_name):

    layer_matched_sifts_intra_dir = os.path.join(
        matched_sifts_dir, os.path.join(layers_data[slayer]['prefix'],
                                        'intra'))
    layer_matched_sifts_inter_dir = os.path.join(
        matched_sifts_dir, os.path.join(layers_data[slayer]['prefix'],
                                        'inter'))
    create_dir(layer_matched_sifts_intra_dir)
    create_dir(layer_matched_sifts_inter_dir)

    # Read the filtered tilespec
    tiles_fname_prefix = os.path.splitext(
        os.path.basename(filtered_ts_fname))[0]
    cur_tilespec = load_tilespecs(filtered_ts_fname)

    mfovs = set()

    for ts in cur_tilespec:
        mfovs.add(ts["mfov"])

    # create the intra matched sifts directories
    for mfov in mfovs:
        mfov_intra_dir = os.path.join(layer_matched_sifts_intra_dir, str(mfov))
        create_dir(mfov_intra_dir)

    # A map from layer to a list of multiple matches
    multiple_match_jobs = {}
    # read every pair of overlapping tiles, and match their sift features
    jobs_match_intra_mfovs = {}
    jobs_match_inter_mfovs = []
    indices = []
    # TODO - use some other method to detect overlapping tiles
    for pair in itertools.combinations(range(len(cur_tilespec)), 2):
        idx1 = pair[0]
        idx2 = pair[1]
        ts1 = cur_tilespec[idx1]
        ts2 = cur_tilespec[idx2]
        # if the two tiles intersect, match them
        bbox1 = BoundingBox.fromList(ts1["bbox"])
        bbox2 = BoundingBox.fromList(ts2["bbox"])
        if bbox1.overlap(bbox2):
            imageUrl1 = ts1["mipmapLevels"]["0"]["imageUrl"]
            imageUrl2 = ts2["mipmapLevels"]["0"]["imageUrl"]
            tile_fname1 = os.path.basename(imageUrl1).split('.')[0]
            tile_fname2 = os.path.basename(imageUrl2).split('.')[0]
            index_pair = [
                "{}_{}".format(ts1["mfov"], ts1["tile_index"]),
                "{}_{}".format(ts2["mfov"], ts2["tile_index"])
            ]
            if ts1["mfov"] == ts2["mfov"]:
                # Intra mfov job
                cur_match_dir = os.path.join(layer_matched_sifts_intra_dir,
                                             str(ts1["mfov"]))
            else:
                # Inter mfov job
                cur_match_dir = layer_matched_sifts_inter_dir
            match_json = os.path.join(
                cur_match_dir,
                "{0}_sift_matches_{1}_{2}.json".format(tiles_fname_prefix,
                                                       tile_fname1,
                                                       tile_fname2))
            # match the features of overlapping tiles
            if not os.path.exists(match_json):
                print "Matching sift of tiles: {0} and {1}".format(
                    imageUrl1, imageUrl2)
                # The filter is done, so assumes no dependencies
                dependencies = []

                # Check if the job already exists
                if ts1["mfov"] == ts2["mfov"]:
                    # Intra mfov job
                    if ts1["mfov"] in jobs[slayer]['matched_sifts'][
                            'intra'].keys():
                        job_match = jobs[slayer]['matched_sifts']['intra'][
                            ts1["mfov"]]
                    else:
                        job_match = MatchMultipleSiftFeaturesAndFilter(
                            cur_match_dir,
                            filtered_ts_fname,
                            "intra_l{}_{}".format(slayer, ts1["mfov"]),
                            threads_num=4,
                            wait_time=None,
                            conf_fname=conf_file_name)
                        jobs[slayer]['matched_sifts']['intra'][
                            ts1["mfov"]] = job_match
                else:
                    # Inter mfov job
                    if jobs[slayer]['matched_sifts']['inter'] is None:
                        job_match = MatchMultipleSiftFeaturesAndFilter(
                            cur_match_dir,
                            filtered_ts_fname,
                            "inter_{}".format(slayer),
                            threads_num=4,
                            wait_time=None,
                            conf_fname=conf_file_name)
                        jobs[slayer]['matched_sifts']['inter'] = job_match
                    else:
                        job_match = jobs[slayer]['matched_sifts']['inter']
                job_match.add_job(dependencies,
                                  layers_data[slayer]['sifts'][imageUrl1],
                                  layers_data[slayer]['sifts'][imageUrl2],
                                  match_json, index_pair)

                #jobs[slayer]['matched_sifts'].append(job_match)
            layers_data[slayer]['matched_sifts'].append(match_json)

    # Create a single file that lists all sift match files (the OS doesn't support a very long argument list)
    matches_list_file = os.path.join(
        workspace_dir, "{}_matched_sifts_files.txt".format(tiles_fname_prefix))
    write_list_to_file(matches_list_file, layers_data[slayer]['matched_sifts'])

    # optimize (affine) the 2d layer matches
    opt_montage_json = os.path.join(
        output_dir, "{0}_montaged.json".format(tiles_fname_prefix))
    if not os.path.exists(opt_montage_json):
        print "Optimizing (affine) layer matches: {0}".format(slayer)
        dependencies = []
        if jobs[slayer]['matched_sifts']['inter'] is not None:
            dependencies.append(jobs[slayer]['matched_sifts']['inter'])
        if jobs[slayer]['matched_sifts']['intra'] is not None and len(
                jobs[slayer]['matched_sifts']['intra']) > 0:
            dependencies.extend(
                jobs[slayer]['matched_sifts']['intra'].values())
        job_opt_montage = OptimizeMontageTransform(dependencies,
                                                   filtered_ts_fname,
                                                   matches_list_file,
                                                   opt_montage_json,
                                                   conf_fname=conf_file_name)
    layers_data[slayer]['optimized_montage'] = opt_montage_json
    def __fetch_function_call_graph(self):
        """
        :return: 0 success 1 failure
        """
        # Deal with the actions that correspond to each component.
        # Component names in the AndroidManifest file are either complete or
        # incomplete; a name consisting of only one word is considered
        # incomplete. In comp_dict, the key is the class name and the value is
        # the action feature vector; all methods in the class inherit this
        # action feature.
        comp_dict = {}
        for comp_match in COMPONENT_PATTERN.finditer(self.am_content):
            action_list = []
            comp_action_features = []
            comp_detail = comp_match.group(0)
            comp_name = comp_match.group('compname')
            if comp_name.startswith('.'):
                comp_name = self.package_name + comp_name
            elif len(comp_name.split('.')) == 1:
                comp_name = self.package_name + '.' + comp_name
            class_path = join_class_path(comp_name)
            for action_match in INTENT_ACTION_PATTERN.finditer(comp_detail):
                action_list.append(action_match.group('action').split('.')[-1])
            get_filtered_vector(
                comp_action_features, action_list,
                CONSTANTS['INTENT_ACTIONS_126']['REFERENCE_LIST'])
            comp_dict[class_path] = np.array(comp_action_features,
                                             dtype=np.uint8,
                                             ndmin=2)

        output_func_call_pairs_txt_path = os.path.join(self.dst_output_path,
                                                       'func_call_pairs.txt')
        if not os.path.exists(output_func_call_pairs_txt_path):
            temp_dict = {}
            # use separate path components so the recursive glob also
            # matches on non-Windows systems
            smali_search_result = glob.glob(os.path.join(
                self.smali_dir_path, "**", "*.smali"),
                                            recursive=True)
            for smali_file in smali_search_result:
                if extract_func_call_pairs_list_from_smali(
                        temp_dict, smali_file) != STATUS_OK:
                    print('extract func call pairs failed')
                    return STATUS_ERR
            self.func_call_pairs = list(temp_dict.keys())
            write_list_to_file(self.func_call_pairs,
                               output_func_call_pairs_txt_path)
            temp_dict.clear()
        if (not self.func_call_pairs) and read_file_to_list(
                self.func_call_pairs,
                output_func_call_pairs_txt_path) != STATUS_OK:
            return STATUS_ERR

        all_funcs_set = set()
        for call_pair in self.func_call_pairs:
            temp_list = call_pair.split(' ')
            if len(temp_list) == 3:
                all_funcs_set.add(temp_list[0])
                all_funcs_set.add(temp_list[2])
            elif len(temp_list) == 2:
                print('length 2 -> ' + ','.join(temp_list))
            elif len(temp_list) == 1:
                print('length 1 -> ' + ','.join(temp_list))
            elif len(temp_list) == 0:
                print('length 0')
            else:
                print('other length ' + str(len(temp_list)))

        # reserve one extra node for the MainNode
        self.nodes_num = len(list(all_funcs_set)) + 1
        if self.nodes_num > 30000:
            return STATUS_ERR
        all_funcs_set = None
        print('nodes num->', self.nodes_num)

        self.adj_matrix = np.zeros((self.nodes_num, self.nodes_num),
                                   dtype=np.uint8)
        self.node_features = np.zeros((self.nodes_num, 273), dtype=np.uint8)
        self.node_labels = []
        all_funcs = []
        api_lv_match = TARGET_SDK_VER_PATTERN.search(self.am_content)
        if not api_lv_match:
            api_lv_match = MIN_SDK_VER_PATTERN.search(self.am_content)
        if api_lv_match and int(api_lv_match.group('apilevel')) >= 16:
            self.api_level = api_lv_match.group('apilevel')

        # The constructed MainNode represents the entire app, and its label is
        # the app's label: malicious [1, 0], benign [0, 1]
        all_funcs.append('MainNode')
        self.node_labels.append(
            [1, 0]) if self.is_malicious else self.node_labels.append([0, 1])
        self.adj_matrix[0] = np.ones((1, self.nodes_num), dtype=np.uint8)
        self.node_features[0] = np.array(self.feature_list,
                                         dtype=np.uint8)[0:273]

        for call_pair in self.func_call_pairs:
            temp_list = call_pair.split(' ')
            if len(temp_list) == 3:
                caller = temp_list[0]
                called = temp_list[2]
                """
                Extract by API
                """

                # row :caller| column :called
                caller_idx = self.__process_func(caller, all_funcs, comp_dict)
                called_idx = self.__process_func(called, all_funcs, comp_dict)
                self.adj_matrix[caller_idx, called_idx] = 1
            elif len(temp_list) == 2:
                print('length 2 -> ' + ','.join(temp_list))
            elif len(temp_list) == 1:
                print('length 1 -> ' + ','.join(temp_list))
            elif len(temp_list) == 0:
                print('length 0')
            else:
                print('other length ' + str(len(temp_list)))

        write_list_to_file(all_funcs,
                           os.path.join(self.dst_output_path, 'all_funcs.txt'))
        return STATUS_OK
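join_class_path is assumed to turn the dotted component name into the smali class descriptor used as the comp_dict key; a sketch of that assumption:

def join_class_path(comp_name):
    # 'com.app.MainActivity' -> 'Lcom/app/MainActivity;' (smali descriptor)
    return 'L' + comp_name.replace('.', '/') + ';'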
            "{0}_{1}_filter_ransac.json".format(fname1_prefix, fname2_prefix))
        if not os.path.exists(ransac_fname):
            print "Filter-and-Ransac of layers: {0} and {1}".format(i, i + j)
            filter_ransac(match_json, path2url(layer_to_ts_json[i]),
                          ransac_fname, args.jar_file, conf)
        all_model_files.append(ransac_fname)

        j += 1
        matched_after_layers += 1

# Optimize all layers to a single 3d image
all_ts_files = list(layer_to_ts_json.values())
create_dir(args.output_dir)

ts_list_file = os.path.join(args.workspace_dir, "all_ts_files.txt")
write_list_to_file(ts_list_file, all_ts_files)
matched_sifts_list_file = os.path.join(args.workspace_dir,
                                       "all_matched_sifts_files.txt")
write_list_to_file(matched_sifts_list_file, all_matched_sifts_files)
model_list_file = os.path.join(args.workspace_dir, "all_model_files.txt")
write_list_to_file(model_list_file, all_model_files)

optimize_layers_affine([ts_list_file], [matched_sifts_list_file],
                       [model_list_file],
                       fixed_layers,
                       args.output_dir,
                       args.max_layer_distance,
                       args.jar_file,
                       conf,
                       args.skip_layers,
                       manual_matches=args.manual_match)
Example #30
            bbox_and_norm_jobs.append(bbox_job)

    # Normalize the coordinates of all files (in a single execution)
    normalized_all_files = True
    for f in norm_files:
        if not os.path.exists(f):
            normalized_all_files = False
            break

    if not normalized_all_files:
        norm_job = NormalizeCoordinates(jobs['bbox'], bbox_files, norm_dir,
                                        args.jar_file, norm_files)
        bbox_and_norm_jobs.append(norm_job)

    norm_list_file = os.path.join(args.workspace_dir, "all_norm_files.txt")
    write_list_to_file(norm_list_file, norm_files)

    # Perform the rendering
    for f in json_files.keys():
        # read the layer from the file
        layer = read_layer_from_file(f)

        # If the layer in the file is not in the required range, continue to the next file
        if args.from_layer != -1:
            if layer < args.from_layer:
                continue
        if args.to_layer != -1:
            if layer > args.to_layer:
                continue

        tiles_fname = os.path.basename(f)