Example #1
def pass2(files_list):
    """ 
    Read files_list and verify file contents
    @return: Queue of files with error    
    """
    files_failed_list = []
    files_queue = queue.Queue()

    # Start the reader workers
    worker.Reader.mFilesTotal = len(files_list)
    threads = []
    for i in range(args.jobs):
        print_verbose("Started reader job {}".format(i + 1))
        th = worker.Reader(files_queue, files_failed_list)
        th.start()
        threads.append(th)

    # Send the work to the queue
    if args.jobs > 1:  # randomize the file order across jobs
        reordered_files = files_list.copy()
        while len(reordered_files) > 0:
            index = random.randint(0, len(reordered_files) - 1)
            files_queue.put(reordered_files[index])
            del reordered_files[index]
    else:
        for f_name in files_list:
            files_queue.put(f_name)

    # Wait for the queue to drain, then stop the workers
    files_queue.join()
    for th in threads:
        files_queue.put(None)
    for th in threads:
        th.join()
    return files_failed_list
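The shutdown idiom above (join() the queue, then put one None sentinel per worker and join the threads) is the standard way to stop queue-fed workers. A minimal, self-contained sketch of the same pattern; all names here are illustrative, not taken from the example:

import queue
import threading

def worker_loop(q, results):
    # Consume items until a None sentinel arrives
    while True:
        item = q.get()
        if item is None:  # sentinel: stop this worker
            break
        results.append(item.upper())  # placeholder "work"
        q.task_done()

q = queue.Queue()
results = []
threads = [threading.Thread(target=worker_loop, args=(q, results)) for _ in range(4)]
for th in threads:
    th.start()
for name in ["a", "b", "c"]:
    q.put(name)
q.join()             # returns once every queued item has been task_done()'d
for th in threads:
    q.put(None)      # one sentinel per worker
for th in threads:
    th.join()
print(results)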
Example #2
def run(common_args, cmd_argv):
    args = docopt(__doc__, argv=cmd_argv)

    # -b option is not supported/needed
    if (args['-b'] is not None):
        sys.exit(
            "The '-b' option is not supported/needed.  Use a 'remote-ref' as the <id> argument"
        )

    # Default Package name
    pkg = args['<repo>']
    if (args['-p']):
        pkg = args['-p']

    # Set directory for the subtree directory
    dst = os.path.join(args['<dst>'], pkg)
    dst = utils.force_unix_dir_sep(dst)
    utils.print_verbose(f"Location of the copy being updated: {dst}")

    # Update the 'subtree'
    cmd = f'git subtree pull --prefix {dst} {args["<origin>"]}/_git/{args["<repo>"]} {args["<id>"]} --squash'
    t = utils.run_shell(cmd, common_args['-v'])
    utils.check_results(
        t,
        "ERROR: Failed the update a subtree for the specified package/repository."
    )
Example #3
def run( common_args, cmd_argv ):
    args = docopt(scm.umount.USAGE, argv=cmd_argv)

    # Success Msg
    if ( args['get-success-msg'] ):
        print( "Repo unmount.  You will need to perform a 'git add/rm' to remove the deleted files" )
        return

    # Error Msg
    if ( args['get-error-msg'] ):
        print( "" ) # No addition info
        return

    # -b option is not supported/needed
    if ( args['-b'] is not None ):
        sys.exit( "The '-b' option is not supported/needed.  Use a 'remote-ref' as the <id> argument" )

    # Default Package name
    pkg = args['<repo>']
    if ( args['-p'] ):
        pkg = args['-p']

    # Set the foreign package directory to be deleted
    dst = os.path.join( args['<dst>'] , pkg )
    if ( not os.path.isdir(dst) ):
        sys.exit( f"ERROR: The Package/Directory - {dst} - does not exist." )
    utils.print_verbose( f"Package/directory being removed: {dst}" )

    # There is no 'git subtree rm' command --> we simply delete the package directory
    utils.set_tree_readonly( dst, False )
    utils.remove_tree( dst )
Example #4
def calc_files_dirs(test_size, file_size=100 * 2**20, max_files=1024):
    # return: (list dirs_list, list files_list)
    files_required = int(test_size / file_size) + 1
    dirs_required = int(files_required / max_files) + 1
    # Making dirs
    res_dirs = []
    res_files = []
    print_verbose("Required {} files {} dirs  ({} files per dir)".format(
        files_required, dirs_required, max_files))
    while dirs_required > 0:
        res_dirs.append(args.work_dir + "/" + "{:09d}".format(dirs_required))
        dirs_required -= 1
    index = 0
    counter = max_files
    while files_required > 0:
        file_path = "{}/{:09d}".format(res_dirs[index], counter)
        res_files.append(file_path)
        counter -= 1
        if counter < 0:
            counter = max_files
            index += 1
        files_required -= 1

    return (res_dirs, res_files)
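A quick walk-through of the layout produced above, assuming a hypothetical args.work_dir of "/tmp/t"; note that the counters run downward and each directory receives up to max_files + 1 files:

# calc_files_dirs(250 * 2**20, 100 * 2**20, max_files=2)
#   files_required = int(2.5) + 1 = 3, dirs_required = int(3 / 2) + 1 = 2
#   -> dirs:  ["/tmp/t/000000002", "/tmp/t/000000001"]
#   -> files: ["/tmp/t/000000002/000000002",
#              "/tmp/t/000000002/000000001",
#              "/tmp/t/000000002/000000000"]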
Example #5
def run(common_args, cmd_argv):
    args = docopt(scm.mount.USAGE, argv=cmd_argv)

    # Success Msg
    if (args['get-success-msg']):
        print("Repo mounted and committed to your repo")
        return

    # Error Msg
    if (args['get-error-msg']):
        print("")  # No message
        return

    # Check if there are pending repo changes
    cmd = 'git diff-index HEAD --exit-code --quiet'
    t = utils.run_shell(cmd, False)
    cmd = 'git diff-index --cached HEAD --exit-code --quiet'
    t2 = utils.run_shell(cmd, False)
    utils.check_results(
        t,
        "ERROR: Your local repo has pending tree modification (i.e. need to do a commit/revert)."
    )
    utils.check_results(
        t2,
        "ERROR: Your local repo has pending index modification (i.e. need to do a commit/revert)."
    )

    # -b option is not supported/needed
    if (args['-b'] is not None):
        sys.exit(
            "The '-b' option is not supported/needed.  Use a 'remote-ref' as the <id> argument"
        )

    # Default Package name
    pkg = args['<repo>']
    if (args['-p']):
        pkg = args['-p']

    # Make sure the Parent destination directory exists
    dst = args['<dst>']
    utils.mkdirs(dst)

    # Set directory for the subtree directory
    dst = os.path.join(dst, pkg)
    dst = utils.force_unix_dir_sep(dst)
    utils.print_verbose(f"Destination for the copy: {dst}")

    # Create a 'subtree'
    cmd = f'git subtree add --prefix {dst} {args["<origin>"]}/{args["<repo>"]}.git {args["<id>"]} --squash'
    t = utils.run_shell(cmd, common_args['-v'])
    if (utils.is_error(t)):  # Clean-up dst dir if there was failure
        utils.remove_tree(dst)
    utils.check_results(
        t,
        "ERROR: Failed to create a subtree for the specified package/repository."
    )
Example #6
def read_matrix_from_file(filename, no_rows, no_cols):
    try:
        print_verbose("Trying to read matrix from ", filename, " with size ",
                      no_rows, "x", no_cols)
        with open(filename) as f:
            lines = f.readlines()
        return parse_matrix(lines, no_rows, no_cols)
    except Exception:
        print("Failed to read matrix from " + filename)
        return None
Example #7
    def createFile(self, file_name, binary_pattern, size=1024, seed=0):
        # NOTE: seed is accepted but currently unused
        blocks = int(size / len(binary_pattern))
        try:
            with open(file_name, "wb") as binary_file:
                for i in range(blocks):
                    binary_file.write(binary_pattern)
        except OSError as e:
            if e.errno == 28:  # 28 == errno.ENOSPC (disk full)
                print_verbose("Disk full")
Example #8
def pass3(dirs_list, files_list):
    # TODO: Remove dirs
    print_verbose("Removing files ")
    for file_name in files_list:
        os.remove(file_name)
    dirs_list.append(args.work_dir)
    for d in dirs_list:
        try:
            os.rmdir(d)
        except OSError as e:
            print_err("Cannot delete directory '{}': {}".format(d, e.strerror))
    print_verbose("Files removed. Test finished")
def write_matrix_to_file(filename, Mat):
    try:
        print_verbose("Trying to write matrix in ", filename)
        with open(filename, 'w+') as f:
            for line in Mat:
                three_decimal_row = ["%.3f" % i for i in line]
                f.write(" ".join(three_decimal_row) + "\n")
    except Exception:
        print("Couldn't write matrix into file " + filename)
        raise
Example #10
def run(common_args, cmd_argv):
    args = docopt(scm.copy.USAGE, argv=cmd_argv)

    # Use the mount command so as to have consistent pre/post GIT behavior with adopting non-integrated packages
    if (not args['--force']):
        cmd_argv[0] = 'mount'
        cmd_argv.insert(1, '--noro')
        scm.git.mount.run(common_args, cmd_argv)

    # Do a brute force copy
    else:
        # -b option is not supported/needed
        if (args['-b'] is not None):
            sys.exit(
                "The '-b' option is not supported/needed.  Use a 'remote-ref' as the <id> argument"
            )

        # Default Package name
        pkg = args['<repo>']
        if (args['-p']):
            pkg = args['-p']

        # Make sure the destination directory exists
        dst = os.path.join(os.getcwd(), args['<dst>'])
        utils.print_verbose(f"Destination for the copy: {dst}")
        utils.mkdirs(dst)

        # Create a clone of the repo
        # NOTE: I hate cloning the entire repo - but I have not found a way to get JUST a snapshot by a remote-ref
        cmd = f'git clone --branch {args["<id>"]} --depth=1 {args["<origin>"]}/_git/{args["<repo>"]} {pkg}'
        utils.push_dir(dst)
        t = utils.run_shell(cmd, common_args['-v'])
        utils.pop_dir()
        if (utils.is_error(t)):  # Clean-up dst dir if there was failure
            utils.remove_tree(dst)
        utils.check_results(
            t,
            f"ERROR: Failed the retreive/clone the specified package/repository. Note: the <id> ({args['<id>']}) MUST be a git TAG."
        )

        # Remove the .git directory since this is a non-tracked copy
        gitdir = os.path.join(dst, pkg, ".git")
        utils.remove_tree(
            gitdir,
            warn_msg="Not able to remove the .git directory for local copy")
Example #11
    def run(self):
        args = __main__.args  # FIXME:
        total_files = self.mInQueue.qsize()
        while True:
            file_name = self.mInQueue.get()
            if file_name is None:
                break
            start_time = time.time()
            self.createFile(file_name, BINARY_PATTERN, args.file_size)
            self.mOutList.append(file_name)
            elapsed_time = time.time() - start_time
            files_left = self.mInQueue.qsize()
            percent = int(50 * (total_files - files_left) / total_files)
            print_verbose(
                "[{:02d}%,{},{} free] Created file {}  [{}/s]".format(
                    percent, files_left,
                    format_human(get_free_space(args.work_dir)), file_name,
                    format_human(int(args.file_size / elapsed_time))))

            self.mInQueue.task_done()
Example #12
    def run(self):
        while True:
            file_name = self.mInQueue.get()
            if file_name is None:
                break
            offset = 0
            start_time = time.time()
            with open(file_name, "rb") as binary_file:
                error = False
                while not error:
                    res = b""  # guard: stays empty if the read below raises
                    try:
                        res = binary_file.read(256)
                    except OSError as e:
                        error = True
                        print_err(e)
                    if not res:
                        break
                    elif res != BINARY_PATTERN[0:len(res)]:
                        self.mOutList.append(file_name)
                        error = True
                    offset += len(res)
            elapsed_time = time.time() - start_time
            files_left = Reader.mFilesTotal - Reader.mFilesProcessed
            percent = int(50 + 50 * Reader.mFilesProcessed / Reader.mFilesTotal)
            str_out = "[{:02d}% {} left {} err] File '{}' ".format(
                percent, files_left, len(self.mOutList), file_name)
            if not error:
                str_out += "[{} {}/s] Ok".format(
                    format_human(offset),
                    format_human(int(offset / elapsed_time)))
            else:
                str_out += "Error at offset 0x{:X}".format(offset)
            print_verbose(str_out)
            with Reader.mLock:
                Reader.mFilesProcessed += 1
            self.mInQueue.task_done()
Example #13
def pass1(dirs_list, files_list):
    """ Create files_queue using a pattern
       @return List of files created
    """
    files_created = []

    print_verbose("Test size {}, file size {}.".format(
        format_human(args.test_size), format_human(args.file_size)))

    # Create the temporary dirs
    for dir_name in dirs_list:
        if not check_dir(dir_name, create=True):
            print_err("Cannot continue, {} is not a directory".format(new_dir))
        else:
            print_verbose("Created directory {}".format(dir_name))

    # Create worker queue, if jobs > 1 create a random list
    files_queue = queue.Queue()

    if args.jobs > 1:
        reordered_files = files_list.copy()
        while len(reordered_files) > 0:
            index = random.randint(0, len(reordered_files) - 1)
            files_queue.put(reordered_files[index])
            del reordered_files[index]
    else:
        for f_name in files_list:
            files_queue.put(f_name)

    # Start the writer workers
    threads = []
    for i in range(args.jobs):
        print_verbose("Started writer job {}".format(i + 1))
        th = worker.Writer(files_queue, files_created)
        th.start()
        threads.append(th)
    files_queue.join()
    for th in threads:
        files_queue.put(None)
    for th in threads:
        th.join()
    return files_created
Example #14

args = parse_commanline()

files_failed_list = []
files_list = []
dirs_list = []

if getattr(args, '2', False):
    # Only do pass 2
    existent_files_list = []
    print_verbose("Running only pass 2, files verification")
    for root, dirs, files in os.walk(args.work_dir, topdown=False):
        for name in files:
            existent_files_list.append(os.path.join(root, name))
    print_verbose("Found {} files".format(len(existent_files_list)))
    files_failed_list = pass2(existent_files_list)
else:
    # Do pass1 pass2 and pass3
    dirs_list, files_list = calc_files_dirs(args.test_size, args.file_size,
                                            args.files_dir)

    files_queue = pass1(dirs_list, files_list)
    files_failed_list = pass2(files_queue)
    if len(files_failed_list) > 0:
        print_err("Error: Test not passed, temporary files not removed")
    else:
        pass3(dirs_list, files_list)  # assumed: clean-up call, the original listing is truncated here
Example #15
def main(run_dir="rfe_chain",
         start=None,
         start_auc=None,
         verbose=None,
         logfile=None):
    """
    Main function to run the chain.
    """
    if logfile is not None:
        sys.stdout = open(logfile, "w")

    # load starting json
    with open(start) as f:
        start = json.load(f)
    if start_auc is None:
        startauc = 0.8
    else:
        startauc = start_auc

    start['AUC_SCORE_PATH'] = run_dir

    # have to load a list of possible features to replace with
    if all("10feat" in feature for feature in start['FEATURES']):
        with open("10featlist.json") as fh:
            featlist = json.load(fh)['FEATURES']
    else:
        featlist = get_featlist()

    # and possible preceding modifiers
    modlist = get_modlist()

    # create a list of combinations of these two lists
    comblist = []
    for mod in modlist:
        for feature in featlist:
            comblist.append('{0}_{1}_'.format(mod, feature))

    # define sampled json
    prevsample = copy.deepcopy(start)

    # initialise auc
    prevauc = startauc

    first = 1
    counter = 0
    converged = False
    # will decide what constitutes converged later
    while not converged:

        sample = copy.deepcopy(prevsample)
        # If this isn't the first one, sample new settings
        if not first:
            # Sample a new hdf5 and replace existing at random
            #   Or, just push it in, or just drop a hdf5 at random
            utils.print_verbose(
                "===== Sampling new proposal "
                "settings ======", flag=verbose)
            # sample new settings
            # shuffle combinations
            random.shuffle(comblist)

            # pop 3 features off this
            added = [comblist.pop() for i in range(3)]

            # add them to the settings
            sample['FEATURES'] = added

            utils.print_verbose(
                "============================"
                "===============", flag=verbose)

        # ensure that ordering of the features is the same between jsons
        sample['FEATURES'].sort()

        # Then save this new json with a descriptive name
        # unless it's already been generated
        if first:
            featurerecord = "".join(sample['FEATURES'])
        else:
            featurerecord = featurerecord + "".join(sample['FEATURES'])
        md5name = hashlib.md5(featurerecord.encode('UTF-8')).hexdigest()
        # get a list of the files in the run_dir
        existingjsons = glob.glob(run_dir + "/*.json")
        # check if the md5 exists
        if md5name + ".json" in existingjsons:
            # then load the results of that run
            with open(os.path.join(run_dir, "AUC_scores.csv"), "r") as fh:
                c = csv.reader(fh, delimiter="\t")
                utils.print_verbose("Already ran {0},"
                                    "reading from results.".format(md5name),
                                    flag=verbose)
                for line in c:
                    # look for that md5sum
                    if md5name in line[0]:
                        auc_score = line[-1]
        else:
            # save a json with this name and run train.py on it
            samplefname = os.path.join(run_dir, md5name + ".json")
            utils.print_verbose("Creating new settings"
                                " file for {0}".format(samplefname),
                                flag=verbose)
            with open(samplefname, "w") as fh:
                json.dump(sample, fh)
            # call train.py
            try:
                if first:
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False,
                                                store_features=True)
                else:
                    picklefname = prevsamplefname.split(".")[0] + \
                        "_feature_dump.pickle"
                    # load the features saved in the last run
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False,
                                                store_features=True,
                                                load_pickled=picklefname)
                prevsamplefname = samplefname
                auc_score = auc_score_dict['all']
            except IndexError:
                print("Warning: accidentally added invalid feature.")
                os.remove(samplefname)
                # set auc to zero so these settings are not accepted
                auc_score = 0

        prevsample = sample

        # can't be first anymore
        first = 0

        # as it may be bad manners to run infinite loops
        counter += 1
        if counter > 100:
            converged = True

    return None
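The md5-of-sorted-features naming used above gives each feature set a stable filename, which is what makes the "already ran" lookup possible. A standalone sketch with invented feature names:

import hashlib

features = ['raw_feat2_', 'mean_feat1_']  # invented names
features.sort()  # sorting first makes the hash order-independent
md5name = hashlib.md5("".join(features).encode('UTF-8')).hexdigest()
print(md5name + ".json")  # stable settings filename for this feature set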
Example #16
def main(settingsfname, verbose=False):

    settings = utils.get_settings(settingsfname)

    subjects = settings['SUBJECTS']

    data = utils.get_data(settings, verbose=verbose)

    metadata = utils.get_metadata()

    features_that_parsed = [
        feature for feature in settings['FEATURES']
        if feature in list(data.keys())
    ]

    settings['FEATURES'] = features_that_parsed

    utils.print_verbose("=====Feature HDF5s parsed=====", flag=verbose)

    # get model
    model_pipe = utils.build_model_pipe(settings)

    utils.print_verbose("=== Model Used ===\n"
                        "{0}\n==================".format(model_pipe),
                        flag=verbose)

    # dictionary to store results
    subject_predictions = {}

    accuracy_scores = {}

    for subject in subjects:
        utils.print_verbose("=====Training {0} Model=====".format(
            str(subject)),
                            flag=verbose)

        # initialise the data assembler
        assembler = utils.DataAssembler(settings, data, metadata)
        X, y = assembler.test_train_discrimination(subject)

        # get the CV iterator
        cv = utils.sklearn.cross_validation.StratifiedShuffleSplit(
            y, random_state=settings['R_SEED'], n_iter=settings['CVITERCOUNT'])

        # initialise lists for cross-val results
        predictions = []
        labels = []
        allweights = []

        # run cross validation and report results
        for train, test in cv:

            # calculate the weights
            weights = utils.get_weights(y[train])
            # fit the model to the training data
            model_pipe.fit(X[train], y[train], clf__sample_weight=weights)
            # append new predictions
            predictions.append(model_pipe.predict(X[test]))
            # append test weights to store (why?) (used to calculate auc below)
            weights = utils.get_weights(y[test])
            allweights.append(weights)
            # store true labels
            labels.append(y[test])

        # stack up the results
        predictions = utils.np.hstack(predictions)
        labels = utils.np.hstack(labels)
        weights = utils.np.hstack(allweights)

        # calculate the total accuracy
        accuracy = utils.sklearn.metrics.accuracy_score(labels,
                                                        predictions,
                                                        sample_weight=weights)

        print("Accuracy score for {1}: {0:.3f}".format(accuracy, subject))

        # add AUC scores to a subj dict
        accuracy_scores.update({subject: accuracy})

        # store results from each subject
        subject_predictions[subject] = (predictions, labels, weights)

    # stack subject results (don't worry about this line)
    predictions, labels, weights = map(
        utils.np.hstack, zip(*list(subject_predictions.values())))

    # calculate global accuracy
    accuracy = utils.sklearn.metrics.accuracy_score(labels,
                                                    predictions,
                                                    sample_weight=weights)

    print(
        "predicted accuracy score over all subjects: {0:.2f}".format(accuracy))

    # output AUC scores to file
    accuracy_scores.update({'all': accuracy})

    settings['DISCRIMINATE'] = 'accuracy_scores.csv'
    # settings['AUC_SCORE_PATH'] = 'discriminate_scores'
    utils.output_auc_scores(accuracy_scores, settings)

    return accuracy_scores
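The weighted accuracy computed above is plain sklearn accuracy_score with sample_weight; a toy check with invented numbers:

import numpy as np
from sklearn.metrics import accuracy_score

labels = np.array([0, 0, 1, 1])
predictions = np.array([0, 1, 1, 1])
weights = np.array([1.0, 1.0, 3.0, 3.0])
# unweighted accuracy is 3/4 = 0.75; weighted is (1 + 3 + 3) / 8 = 0.875
print(accuracy_score(labels, predictions, sample_weight=weights))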
Example #19
def main(mcmcdir="hdf5mcmc",
         start=None,
         start_auc=None,
         verbose=True,
         logfile=None,
         discr_flag=False):
    """
    Contains the main loop for this script.
    Pseudo-MHMCMC to find optimal AUC scoring
    combinations of HDF5s.
    start - location of json file settings to begin at
    """
    if logfile is not None:
        sys.stdout = open(logfile, "w")
    # pseudo-code for the MCMC iteration
    # want it to start with the probably good features
    with open(start) as f:
        start = json.load(f)
    if start_auc is None:
        startauc = 0.8
    else:
        startauc = start_auc

    # hardcode AUC results to the hdf5mcmc directory
    start['AUC_SCORE_PATH'] = mcmcdir

    # have to load a list of possible features to replace with
    if all("10feat" in feature for feature in start['FEATURES']):
        with open("10featlist.json") as fh:
            featlist = json.load(fh)['FEATURES']
    else:
        featlist = get_featlist()

    # and possible preceding modifiers
    modlist = get_modlist()

    # define sampled json
    prevsample = copy.deepcopy(start)

    # initialise auc
    prevauc = startauc

    counter = 0
    converged = False
    # will decide what constitutes converged later
    while not converged:

        sample = copy.deepcopy(prevsample)
        # Sample a new hdf5 and replace existing at random
        #   Or, just push it in, or just drop a hdf5 at random
        utils.print_verbose("===== Sampling new proposal "
                            "settings ======",
                            flag=verbose)
        u = np.random.rand()
        if u < 0.25:
            # drop an element at random
            features = sample['FEATURES'][:]
            random.shuffle(features)
            dropped = features.pop()
            sample['FEATURES'] = features
            utils.print_verbose("Dropped feature {0}".format(dropped),
                                flag=verbose)
        elif u < 0.5:
            # keep trying to sample a new feature until we
            # find one that's not in there already
            while True:
                # push a new feature, but don't remove an old one
                newfeature = random.sample(featlist, 1)[0]
                newmod = random.sample(modlist, 1)[0]
                added = '{0}_{1}_'.format(newmod, newfeature)
                if added not in sample['FEATURES']:
                    break
            sample['FEATURES'].append(added)
            utils.print_verbose("Added feature {0}".format(added),
                                flag=verbose)
        else:
            # push a new feature and remove an old one
            features = sample['FEATURES'][:]
            random.shuffle(features)
            dropped = features.pop()
            # keep trying to sample a new feature until we
            # find one that's not in there already
            while True:
                # push a new feature, but don't remove an old one
                newfeature = random.sample(featlist, 1)[0]
                newmod = random.sample(modlist, 1)[0]
                added = '{0}_{1}_'.format(newmod, newfeature)
                if added not in sample['FEATURES']:
                    break
            features.append(added)
            sample['FEATURES'] = features
            utils.print_verbose("Switched feature {0} for "
                                "{1}".format(dropped, added),
                                flag=verbose)
        utils.print_verbose("============================"
                            "===============",
                            flag=verbose)
        # ensure that ordering of the features is the same between jsons
        sample['FEATURES'].sort()

        # Then save this new json with a descriptive name
        # unless it's already been generated
        md5name = hashlib.md5("".join(
            sample['FEATURES']).encode('UTF-8')).hexdigest()
        # get a list of the files in the mcmcdir
        existingjsons = glob.glob(mcmcdir + "/*.json")
        # check if the md5 exists
        if md5name + ".json" in existingjsons:
            # then load the results of that run
            with open(os.path.join(mcmcdir, "AUC_scores.csv"), "r") as fh:
                c = csv.reader(fh, delimiter="\t")
                utils.print_verbose("Already ran {0},"
                                    "reading from results.".format(md5name),
                                    flag=verbose)
                for line in c:
                    # look for that md5sum
                    if md5name in line[0]:
                        auc_score = line[-1]
        else:
            # save a json with this name and run train.py on it
            samplefname = os.path.join(mcmcdir, md5name + ".json")
            utils.print_verbose("Creating new settings"
                                " file for {0}".format(samplefname),
                                flag=verbose)
            with open(samplefname, "w") as fh:
                json.dump(sample, fh)
            # call train.py or discriminate.py
            if discr_flag:
                try:
                    auc_score_dict = discriminate.main(samplefname,
                                                       verbose=verbose)
                    # don't want to rename this variable
                    # even though it is no longer an AUC score
                    # want a low accuracy score, strangely enough
                    auc_score = 1 - auc_score_dict['all']
                except IndexError:
                    print("Warning: accidentally added invalid feature.")
                    os.remove(samplefname)
                    # set auc to zero so these settings are not accepted
                    auc_score = 0
            else:
                try:
                    auc_score_dict = train.main(samplefname,
                                                verbose=verbose,
                                                store_models=False)
                    auc_score = auc_score_dict['all'] - 0.5
                except IndexError:
                    print("Warning: accidentally added invalid feature.")
                    os.remove(samplefname)
                    # set auc to zero so these settings are not accepted
                    auc_score = 0

        utils.print_verbose("==== Acceptance calculation ====", flag=verbose)
        # compute acceptance probability from AUC:
        #     r = min(1,AUC/(previous AUC))
        acceptance = np.max([np.min([1, auc_score / prevauc]), 0])

        u = np.random.rand()
        # accept new point with probability r
        if u < acceptance:
            prevsample = sample
            # save current auc
            prevauc = auc_score
            utils.print_verbose("accepting new settings with probability "
                                "{0}".format(acceptance),
                                flag=verbose)
        else:
            utils.print_verbose("rejecting new settings with probability "
                                "{0}".format(1.0 - acceptance),
                                flag=verbose)
        utils.print_verbose("================================", flag=verbose)
        # otherwise it will not overwrite prevsample, so continue from where it
        # was

        # as it may be bad manners to run infinite loops
        counter += 1
        if counter > 100:
            converged = True
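The accept/reject step above is a standard Metropolis rule with ratio r = min(1, score/prev_score), floored at 0. Isolated as a sketch (the function name is hypothetical):

import numpy as np

def metropolis_accept(new_score, prev_score):
    # r = min(1, new/prev), floored at 0 so non-positive scores are never accepted
    acceptance = max(min(1.0, new_score / prev_score), 0.0)
    return np.random.rand() < acceptance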
Example #20
                  "are ok and test is active."
            continue
        A = read_matrix_from_file(fnameA,
                                  tests[test]['M'],
                                  tests[test]['K'])


        B = read_matrix_from_file(fnameB,
                                  tests[test]['K'],
                                  tests[test]['N'])
        
        C = read_matrix_from_file(fnameC,
                                  tests[test]['M'],
                                  tests[test]['N'])
        params = tests[test]
        print_verbose("Runing test", test)
        ts = datetime.datetime.now()
        result = f_dgemm(params['TRANSA'], params['TRANSB'],
                         params['M'], params['N'], params['K'],
                         params['ALPHA'], A, params['LDA'],
                         B, params['LDB'],
                         params['BETA'], C, params['LDC'])
        te = datetime.datetime.now()
        print_verbose("Test ", test, "done in", te-ts)
        #print_matrix(A, "-----Matrix A-----", "----------")
        #print_matrix(B, "-----Matrix B-----", "----------")
        #print_matrix(C, "-----Matrix C-----", "----------")
        top = "----Test " + test + ": " + str(params['ALPHA']) +\
          "*A*B + " + str(params['BETA']) + "*C----"
        #print_matrix(result, top, "--------")
        write_matrix_to_file(fnameRes, result)
Example #21
def f_dgemm(TRANSA, TRANSB, M, N, K, ALPHA, A, LDA, B, LDB, BETA, C, LDC):
    """ Return Alpha * A * B + C.
    N N : A[M][K] * B[K][N] + C[M][N]
    N T : A[M][K] * B[N][K] + C[M][N] -> N = K
    T N : A[K][M] * B[K][N] + C[M][N] -> M = K
    T T : A[K][M] * B[N][K] + C[M][N] -> M = N = K  
    Keyword arguments:
    ========================================================
    M -- int M >= 0, number of rows of the matrix A and C
    N -- int N >= 0, number of columns of the matrix B and C
    K -- int K >= 0, number of columns of the matrix A and B
    ALPHA -- double precision float scalar aplha
    A -- matrix of double precision floats [LDA][ka]
        ka -- K for TRANSA // m otherwise
        LDA -- integer : first dimension of A. 
                When TRANSA = 'N', LDA = max(1, M),
                otherwise LDA = max(1, K)
    B -- matrix of double precision floats [LDB][kb]
        kb -- N for TRANSB // k otherwise
        LDB -- integer : first dimension of B.
                When TRANSB = 'N', LDB = max(1, K),
                otherwise LDB = max(1, N)
    BETA -- double precision float scalar beta
    C -- matrix of double precision floats [LDC][n].
        Matrix C will be overwritten with the result matrix
        LDC - first dimension of matrix C, equal to max(1, M).
    ==========================================================
    """
    nota = (TRANSA == 'N')
    notb = (TRANSB == 'N')
    #check if A is transposed
    nrowa = K
    ncola = M
    if nota:
        print_verbose("Matrix A is not transposed")
        nrowa = M
        ncola = K

    #check if B is transposed
    nrowb = N
    if notb:
        print_verbose("Matrix B is not transposed")
        nrowb = K
    # Test the input parameters
    if not nota and TRANSA != 'C' and TRANSA != 'T':
        perror("Wrong TRANSA parameter")
    elif not notb and TRANSB != 'C' and TRANSB != 'T':
        perror("Wrong TRANSB parameter")
    elif M < 0:
        perror("M < 0")
    elif N < 0:
        perror("N < 0")
    elif K < 0:
        perror("K < 0")
    elif LDA < max(1, nrowa):
        perror("LDA lower than max(1, A_#rows)")
    elif LDB < max(1, nrowb):
        perror("LDB lower than max(1, B_#rows)")
    elif LDC < max(1, M):
        perror("LDC lower than max(1, m)")
    print_verbose("Alpha:", ALPHA)
    print_verbose("Beta:", BETA)
    # Quick return
    if M == 0\
       or N == 0\
       or ((f_equal(ALPHA, 0.0) or (K == 0)) and f_equal(BETA, 1.0)):
        return C

    # If ALPHA is 0.0, only scale C by BETA and return
    if f_equal(ALPHA, 0.0):
        if f_equal(BETA, 0.0):
            for j in range(N):
                for i in range(M):
                    C[i][j] = 0.0
        else:
            for j in range(N):
                for i in range(M):
                    C[i][j] = BETA * C[i][j]
        return C

    # Start the operations
    if notb:
        if nota:
            # Form C := alpha*A*B + beta*C
            for j in range(N):
                if f_equal(BETA, 0.0):
                    for i in range(M):
                        C[i][j] = 0.0
                elif not f_equal(BETA, 1.0):
                    for i in range(M):
                        C[i][j] = BETA * C[i][j]
                for l in range(K):
                    temp = ALPHA * B[l][j]
                    for i in range(M):
                        C[i][j] = C[i][j] + temp * A[i][l]
        else:
            # Form C := alpha*A**T*B + beta*C
            for j in range(N):
                for i in range(M):
                    temp = 0.0
                    for l in range(K):
                        temp = temp + A[l][i] * B[l][j]
                    if f_equal(BETA, 0.0):
                        C[i][j] = ALPHA * temp
                    else:
                        C[i][j] = ALPHA * temp + BETA * C[i][j]
    else:
        if nota:
            # Form C := alpha*A*B**T + beta*C
            for j in range(N):
                if f_equal(BETA, 0.0):
                    for i in range(M):
                        C[i][j] = 0.0
                elif not f_equal(BETA, 1.0):
                    for i in range(M):
                        C[i][j] = BETA * C[i][j]
                for l in range(K):
                    temp = ALPHA * B[j][l]
                    for i in range(M):
                        C[i][j] = C[i][j] + temp * A[i][l]
        else:
            # Form C := alpha*A**T*B**T + beta*C
            for j in range(N):
                for i in range(M):
                    temp = 0.0
                    for l in range(K):
                        temp = temp + A[l][i] * B[j][l]
                    if f_equal(BETA, 0.0):
                        C[i][j] = ALPHA * temp
                    else:
                        C[i][j] = ALPHA * temp + BETA * C[i][j]
    # End of function
    return C
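A small hand-checked call for f_dgemm above, assuming the function and its helpers (f_equal, print_verbose, perror) are in scope:

# C := 2.0 * A * B + 1.0 * C for row-major 2x2 lists
A = [[1.0, 2.0],
     [3.0, 4.0]]
B = [[5.0, 6.0],
     [7.0, 8.0]]
C = [[1.0, 1.0],
     [1.0, 1.0]]
# TRANSA='N', TRANSB='N', M=N=K=2, all leading dimensions 2
result = f_dgemm('N', 'N', 2, 2, 2, 2.0, A, 2, B, 2, 1.0, C, 2)
# A*B = [[19, 22], [43, 50]], so result == [[39.0, 45.0], [87.0, 101.0]]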