Example 1
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    try:
        output_file = os.path.join(world.directory, "reify.py")
        with open(check_file, open_mode("r")) as check_file_handler:
            check_contents = check_file_handler.read().strip("\n")
        # remove unicode mark for strings if Python3
        if PYTHON3:
            check_contents = check_contents.replace( \
                " u'", " '").replace("{u'", "{'").replace(' u"', ' "')
        with open(output_file, open_mode("r")) as output_file:
            output_file_contents = output_file.read()
        # strip the docstring at the beginning of the file
        output_file_contents = re.sub(r'""".*"""', '', output_file_contents,
                                      flags=re.S).strip("\n")
        if check_contents == output_file_contents:
            assert True
        else:
            if PYTHON3:
                # look for an alternative in PYTHON3
                check_contents = python3_contents(check_file, check_contents)
            if check_contents == output_file_contents:
                assert True
            else:
                assert False, ("File contents:\n%s\nExpected contents:\n%s" %
                               (output_file_contents, check_contents))
    except Exception as exc:
        assert False, str(exc)
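These step functions rely on a few helpers the listing does not show: res_filename resolves a path inside the test resources, world.directory is the scenario's working directory, and open_mode adapts the mode string passed to open to the running interpreter. A minimal sketch of such a helper, assuming it only needs to choose text mode on Python 3 and binary mode on Python 2 (the real implementation may differ):

import sys

PYTHON3 = sys.version_info[0] == 3

def open_mode(mode):
    # text mode on Python 3, binary mode on Python 2
    return "%st" % mode if PYTHON3 else "%sb" % mode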
Example 2
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    try:
        output_file = os.path.join(world.directory, "reify.py")
        with open(check_file, open_mode("r")) as check_file_handler:
            check_contents = check_file_handler.read().strip("\n")
        # remove unicode mark for strings if Python3
        if PYTHON3:
            check_contents = check_contents.replace( \
                " u'", " '").replace("{u'", "{'").replace(' u"', ' "')
        with open(output_file, open_mode("r")) as output_file:
            output_file_contents = output_file.read()
        # strip the docstring at the beginning of the file
        output_file_contents = re.sub(r'""".*"""',
                                      '',
                                      output_file_contents,
                                      flags=re.S).strip("\n")

        # strip internally added project id information
        prefix = "" if PYTHON3 else "u"
        p_str = r',\s\\\n    \{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      ')',
                                      output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\\\n    \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      ')',
                                      output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\n    \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      '})',
                                      output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
        output_file_contents = re.sub(p_str,
                                      '',
                                      output_file_contents,
                                      flags=re.S).strip("\n")
        print(output_file_contents)
        if check_contents == output_file_contents:
            assert True
        else:
            if PYTHON3:
                # look for an alternative in PYTHON3
                check_contents = python3_contents(check_file, check_contents)
            if check_contents == output_file_contents:
                assert True
            else:
                assert False, ("File contents:\n%s\nExpected contents:\n%s" %
                               (output_file_contents, check_contents))
    except Exception as exc:
        assert False, str(exc)
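The four project-stripping substitutions in this example each remove one layout variant of the internally added project entry. A self-contained demonstration of the last and simplest pattern, applied to a made-up reified call:

import re

prefix = ""  # "" on Python 3, "u" on Python 2
p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
line = ("api.create_dataset(source, {'name': 'iris',"
        " 'project': 'project/53b1f71437203f5ac303d5c0'})")
print(re.sub(p_str, '', line))
# prints: api.create_dataset(source, {'name': 'iris'})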
Example 3
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, "reify.py")
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
        check_contents_lines = check_contents.split("\n")
        for index, line in enumerate(check_contents_lines):
            if line:
                check_contents_lines[index] = INDENT + line
        check_contents = "\n".join(check_contents_lines)
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace( \
            " u'", " '").replace("{u'", "{'").replace( \
            ' u"', ' "').replace('u\\\'', '\\\'')
    with open(output_file, open_mode("r")) as output_file:
        output_file_contents = output_file.read()
    # strip the file header, up to and including the "def main():" line
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace( \
        '\n\nif __name__ == "__main__":\n    main()', '')

    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r',\s\\\n%s\{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str,
                                  ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\\\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str,
                                  ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str,
                                  '})', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
    output_file_contents = re.sub(p_str,
                                  '', output_file_contents,
                                  flags=re.S).strip("\n")
    if check_contents == output_file_contents:
        assert True
    else:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents(check_file, check_contents)
            if check_contents != output_file_contents:
                check_contents = python3_contents(
                    check_file, check_contents, alternative="_1")
        eq_(check_contents, output_file_contents)
Example 4
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, "reify.py")
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
        check_contents_lines = check_contents.split("\n")
        for index, line in enumerate(check_contents_lines):
            if line:
                check_contents_lines[index] = INDENT + line
        check_contents = "\n".join(check_contents_lines)
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace( \
            " u'", " '").replace("{u'", "{'").replace( \
            ' u"', ' "').replace('u\\\'', '\\\'')
    with open(output_file, open_mode("r")) as output_file:
        output_file_contents = output_file.read()
    # strip the file header, up to and including the "def main():" line
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n',
                                  '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace( \
        '\n\nif __name__ == "__main__":\n    main()', '')

    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r',\s\\\n%s\{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str, ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\\\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str, ')', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\n%s\s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
        % (INDENT * 2, prefix)
    output_file_contents = re.sub(p_str,
                                  '})',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S).strip("\n")
    if check_contents == output_file_contents:
        assert True
    else:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents(check_file, check_contents)
            if check_contents != output_file_contents:
                check_contents = python3_contents(check_file,
                                                  check_contents,
                                                  alternative="_1")
        eq_(check_contents, output_file_contents)
Example 5
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    try:
        output_file = os.path.join(world.directory, "reify.py")
        with open(check_file, open_mode("r")) as check_file_handler:
            check_contents = check_file_handler.read().strip("\n")
        # remove unicode mark for strings if Python3
        if PYTHON3:
            check_contents = check_contents.replace( \
                " u'", " '").replace("{u'", "{'").replace(' u"', ' "')
        with open(output_file, open_mode("r")) as output_file:
            output_file_contents = output_file.read()
        # strip the docstring at the beginning of the file
        output_file_contents = re.sub(r'""".*"""', '', output_file_contents,
                                      flags=re.S).strip("\n")

        # strip internally added project id information
        prefix = "" if PYTHON3 else "u"
        p_str = r',\s\\\n    \{\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      ')', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\\\n    \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      ')', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\n    \s\'project\':\s%s\'project/[a-f0-9]{24}\'\}\)' \
            % prefix
        output_file_contents = re.sub(p_str,
                                      '})', output_file_contents,
                                      flags=re.S).strip("\n")
        p_str = r',\s\'project\':\s%s\'project/[a-f0-9]{24}\'' % prefix
        output_file_contents = re.sub(p_str,
                                      '', output_file_contents,
                                      flags=re.S).strip("\n")
        print(output_file_contents)
        if check_contents == output_file_contents:
            assert True
        else:
            if PYTHON3:
                # look for an alternative in PYTHON3
                check_contents = python3_contents(check_file, check_contents)
            if check_contents == output_file_contents:
                assert True
            else:
                assert False, ("File contents:\n%s\nExpected contents:\n%s" %
                               (output_file_contents, check_contents))
    except Exception as exc:
        assert False, str(exc)
Example 6
def best_first_search(datasets_file,
                      api,
                      args,
                      common_options,
                      staleness=None,
                      penalty=None,
                      objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow(
            ["step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(args.args_separator))
        try:
            excluded_ids = [
                fields.field_id(feature) for feature in excluded_features
            ]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
Example 7
def create_kfold_json(args,
                      kfold_field=DEFAULT_KFOLD_FIELD,
                      objective_field=None,
                      resume=False):
    """Create the files to generate a new field with a random integer from
       0 to k-1, and a filter file for each of these indexes.

    """
    output_dir = args.output_dir
    k = args.k_folds if args.k_folds else DEFAULT_KFOLDS
    try:
        selecting_file_list = []
        for index in range(0, k):
            new_field = NEW_FIELD % (index, k, kfold_field, index,
                                     objective_field)
            selecting_file = TEST_DATASET % index
            selecting_file = os.path.normpath(
                os.path.join(output_dir, selecting_file))
            selecting_file_list.append(selecting_file)
            # When resuming, check if the file already exists
            if not resume or not os.path.isfile(selecting_file):
                resume = False
                with open(selecting_file, u.open_mode("w")) as test_dataset:
                    test_dataset.write(new_field)
        return selecting_file_list, resume
    except IOError:
        sys.exit("Could not create the necessary files.")
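The NEW_FIELD template itself is not part of this listing, but the idea it implements is ordinary k-fold assignment: every row gets a random integer in [0, k-1], and fold i is then selected or excluded by filtering on that integer. A hypothetical plain-Python rendering of the same idea (all names here are made up for illustration):

import random

def assign_folds(rows, k=5, seed=None):
    # tag each row with a random fold index in [0, k-1]
    rng = random.Random(seed)
    return [(rng.randrange(k), row) for row in rows]

folds = assign_folds(["row_a", "row_b", "row_c"], k=5, seed=42)
test_fold = [row for fold, row in folds if fold == 0]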
Example 8
def best_first_search(datasets_file,
                      api,
                      args,
                      command_obj,
                      staleness=None,
                      penalty=None,
                      objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(
        os.path.join(args.output_dir, FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        sys.exit("Could not read the generated datasets file: %s" % str(exc))
Example 9
def create_kfold_json(args, kfold_field=DEFAULT_KFOLD_FIELD,
                      objective_field=None, resume=False):
    """Create the files to generate a new field with a random integer from
       0 to k-1, and a filter file for each of these indexes.

    """
    output_dir = args.output_dir
    k = args.k_folds if args.k_folds else DEFAULT_KFOLDS
    try:
        selecting_file_list = []
        for index in range(0, k):
            new_field = NEW_FIELD % (index, k, kfold_field,
                                     index, objective_field)
            selecting_file = TEST_DATASET % index
            selecting_file = os.path.normpath(os.path.join(output_dir,
                                                           selecting_file))
            selecting_file_list.append(selecting_file)
            # When resuming, check if the file already exists
            if not resume or not os.path.isfile(selecting_file):
                resume = False
                with open(selecting_file, u.open_mode("w")) as test_dataset:
                    test_dataset.write(new_field)
        return selecting_file_list, resume
    except IOError:
        sys.exit("Could not create the necessary files.")
Example 10
def i_check_sample_file(step, check_sample_file=None):
    if check_sample_file is None:
        assert False
    check_sample_file = res_filename(check_sample_file)
    try:
        sample_file = os.path.join(world.directory, "sample.csv")
        with open(check_sample_file, open_mode("r")) as check_sample_file:
            check_sample_contents = check_sample_file.read()
        with open(sample_file, open_mode("r")) as sample_file:
            sample_file_contents = sample_file.read()
        if check_sample_contents == sample_file_contents:
            assert True
        else:
            assert False, ("File contents:\n%s\nExpected contents:\n%s" %
                           (sample_file_contents, check_sample_contents))
    except Exception as exc:
        assert False, str(exc)
Example 11
def multi_label_expansion(training_set,
                          training_set_header,
                          args,
                          output_path,
                          labels=None,
                          session_file=None,
                          input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set,
                               training_set_header,
                               objective_field,
                               multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file, console=args.verbosity)
    with open(output_file, u.open_mode('w')) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.get_next()
        while True:
            try:
                row = input_reader.get_next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path, os.sep,
                                                   file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
        objective_field = input_reader.headers[input_reader.objective_column]

    input_reader.close()
    return (output_file, input_reader.get_multi_label_data())
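As the comment says, the zip step near the end only applies to training sources, trading a little CPU for upload time. The same pattern in isolation (file names are illustrative):

from zipfile import ZipFile, ZIP_DEFLATED

with ZipFile("extended_training_set.csv.zip", "w", ZIP_DEFLATED) as zipped:
    # store the csv under its bare file name inside the archive
    zipped.write("extended_training_set.csv", "training_set.csv")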
Example 12
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(os.path.join(args.output_dir,
                                                  FEATURES_LOG))
    with open(features_file, u.open_mode("w")) as features_handler:
        features_writer = csv.writer(features_handler, lineterminator="\n")
        features_writer.writerow([
            "step", "state", "score", "metric_value", "best_score"])
        features_handler.flush()
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_PENALTY
        # retrieving the first dataset in the file
        try:
            with open(datasets_file, u.open_mode("r")) as datasets_handler:
                dataset_id = datasets_handler.readline().strip()
        except IOError as exc:
            sys.exit("Could not read the generated datasets file: %s" %
                     str(exc))
        try:
            stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
            with open(stored_dataset, u.open_mode("r")) as dataset_handler:
                dataset = json.loads(dataset_handler.read())
        except IOError:
            dataset = api.check_resource(dataset_id,
                                         query_string=ALL_FIELDS_QS)
        # initial feature set
        fields = Fields(dataset)
        excluded_features = ([] if args.exclude_features is None else
                             args.exclude_features.split(
                                 args.args_separator))
        try:
            excluded_ids = [fields.field_id(feature) for
                            feature in excluded_features]
            objective_id = fields.field_id(objective_name)
        except ValueError as exc:
            sys.exit(exc)
Example 13
def i_check_sample_json(step, check_sample_file=None):
    if check_sample_file is None:
        assert False
    check_sample_file = res_filename(check_sample_file)
    try:
        sample_file = os.path.join(world.directory, "stat_info.json")
        with open(check_sample_file, open_mode("r")) as check_sample_file:
            contents = check_sample_file.read()
            check_sample_json = json.loads(contents)
        with open(sample_file, open_mode("r")) as sample_file:
            contents = sample_file.read()
            sample_file_json = json.loads(contents)
        if check_sample_json == sample_file_json:
            assert True
        else:
            assert False, ("File contents:\n%s\nExpected contents:\n%s" %
                           (sample_file_json, check_sample_json))
    except Exception as exc:
        assert False, str(exc)
Example 14
def retrieve_subcommands():
    """Retrieves the executed subcommands in inverse order

    """
    global subcommand_list
    subcommand_list = open(subcommand_file, u.open_mode("r")).readlines()
    if not u.PYTHON3:
        subcommand_list = [subcommand.decode(u.SYSTEM_ENCODING)
                           for subcommand in subcommand_list]
    subcommand_list.reverse()
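One nit worth flagging: the handle returned by open is never explicitly closed here. CPython's reference counting usually closes it right away, but that is an implementation detail rather than a guarantee. An equivalent sketch using a context manager:

def retrieve_subcommands():
    """Retrieves the executed subcommands in inverse order"""
    global subcommand_list
    with open(subcommand_file, u.open_mode("r")) as subcommands_handler:
        subcommand_list = subcommands_handler.readlines()
    if not u.PYTHON3:
        subcommand_list = [subcommand.decode(u.SYSTEM_ENCODING)
                           for subcommand in subcommand_list]
    subcommand_list.reverse()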
Example 15
def multi_label_expansion(training_set, training_set_header,
                          args, output_path,
                          labels=None, session_file=None, input_flag=False):
    """Splitting the labels in a multi-label objective field to create
       a source with column per label

    """
    objective_field = args.objective_field
    input_reader = TrainReader(training_set, training_set_header,
                               objective_field, multi_label=True,
                               labels=labels,
                               label_separator=args.label_separator,
                               training_separator=args.training_separator,
                               multi_label_fields=args.multi_label_fields_list,
                               label_aggregates=args.label_aggregates_list,
                               objective=not input_flag)
    # read file to get all the different labels if no --labels flag is given
    # or use labels given in --labels and generate the new field names
    new_headers = input_reader.get_label_headers()

    try:
        file_name = os.path.basename(training_set)
    except AttributeError:
        file_name = "test_set.csv" if input_flag else "training_set.csv"
    output_file = "%s%sextended_%s" % (output_path, os.sep, file_name)
    message = u.dated("Transforming to extended source.\n")
    u.log_message(message, log_file=session_file,
                  console=args.verbosity)
    with open(output_file, u.open_mode('w')) as output_handler:
        output = csv.writer(output_handler, lineterminator="\n")
        output.writerow(new_headers)
        # read to write new source file with column per label
        input_reader.reset()
        if training_set_header:
            input_reader.get_next()
        while True:
            try:
                row = input_reader.get_next(extended=True)
                output.writerow(row)
            except StopIteration:
                break

    # training sources are zipped to minimize upload time and resources
    if not input_flag:
        output_file_zip = "%s%sextended_%s.zip" % (output_path,
                                                   os.sep, file_name)
        with ZipFile(output_file_zip, 'w', ZIP_DEFLATED) as output_zipped_file:
            output_zipped_file.write(output_file, file_name)
        output_file = output_file_zip
        objective_field = input_reader.headers[input_reader.objective_column]

    input_reader.close()
    return (output_file, input_reader.get_multi_label_data())
Example 16
def python3_contents(filename, prior_contents, alternative=""):
    """Check for a file that has alternative contents for Python3 and return
       its contents

    """
    directory = os.path.dirname(filename)
    basename = os.path.basename(filename)
    basename_name, basename_ext = basename.split(".")
    filename = os.path.join(directory, "%s_py3%s.%s" % ( \
        basename_name, alternative, basename_ext))
    try:
        with open(filename, open_mode("r")) as file_handler:
            return file_handler.read().strip("\n")
    except IOError:
        return prior_contents
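In other words, the Python 3 alternative is found purely by naming convention: a "_py3" suffix, plus an optional variant marker, is inserted before the extension, and the prior contents are returned untouched when no such file exists. For example (paths are illustrative):

contents = "original check contents"
python3_contents("checks/reify.py", contents)  # reads checks/reify_py3.py if present
python3_contents("checks/reify.py", contents, alternative="_1")  # reads checks/reify_py3_1.py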
Example 17
def create_candidates_evaluations(datasets_file,
                                  args,
                                  command_obj,
                                  resume=False,
                                  random_candidates=DEFAULT_MIN_CANDIDATES):
    """ Create random candidates ensembles evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(
        u.check_dir(
            os.path.join(u"%s%s" % (args.output_dir, random_candidates),
                         "evaluation.json")))
    command = COMMANDS["random_candidates"] % (datasets_file,
                                               random_candidates, output_dir)
    command_args = command.split()
    # common_options_list = u.get_options_list(
    #     args, command_obj.common_options, prioritary=command_args)
    # command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)

    command_obj.propagate(
        command_args, exclude=["--dataset", "--datasets", "--dataset-file"])
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(
        os.path.join(output_dir, "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
Example 18
def create_candidates_evaluations(datasets_file, args, command_obj,
                                  resume=False,
                                  random_candidates=DEFAULT_MIN_CANDIDATES):
    """ Create random candidates ensembles evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(u.check_dir(
        os.path.join(u"%s%s" % (args.output_dir, random_candidates),
                     "evaluation.json")))
    command = COMMANDS["random_candidates"] % (
        datasets_file, random_candidates, output_dir)
    command_args = command.split()
    # common_options_list = u.get_options_list(
    #     args, command_obj.common_options, prioritary=command_args)
    # command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)

    command_obj.propagate(command_args, exclude=["--dataset",
                                                 "--datasets",
                                                 "--dataset-file"])
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
Example 19
def create_node_th_evaluations(datasets_file,
                               args,
                               common_options,
                               resume=False,
                               node_threshold=DEFAULT_MIN_NODES):
    """ Create node_threshold evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(
        u.check_dir(
            os.path.join(u"%s%s" % (args.output_dir, node_threshold),
                         "evaluation.json")))
    command = COMMANDS["node_threshold"] % (datasets_file, node_threshold,
                                            output_dir)
    command_args = command.split()
    common_options_list = u.get_options_list(args,
                                             common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(
        os.path.join(output_dir, "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
Example 20
def create_node_th_evaluations(datasets_file, args, common_options,
                               resume=False,
                               node_threshold=DEFAULT_MIN_NODES):
    """ Create node_threshold evaluations

    """
    global subcommand_list
    output_dir = os.path.normpath(u.check_dir(
        os.path.join(u"%s%s" % (args.output_dir, node_threshold),
                     "evaluation.json")))
    command = COMMANDS["node_threshold"] % (
        datasets_file, node_threshold, output_dir)
    command_args = command.split()
    common_options_list = u.get_options_list(args, common_options,
                                             prioritary=command_args)
    command_args.extend(common_options_list)
    command_args.append("--objective")
    command_args.append(args.objective_field)
    command_args = add_model_options(command_args, args)
    command = rebuild_command(command_args)
    if resume:
        next_command = subcommand_list.pop()
        if different_command(next_command, command):
            resume = False
            u.sys_log_message(command, log_file=subcommand_file)
            main_dispatcher(args=command_args)
        elif not subcommand_list:
            main_dispatcher(args=['main', '--resume'])
            resume = False
    else:
        u.sys_log_message(command, log_file=subcommand_file)
        main_dispatcher(args=command_args)
    evaluation_file = os.path.normpath(os.path.join(output_dir,
                                                    "evaluation.json"))
    try:
        with open(evaluation_file, u.open_mode("r")) as evaluation_handler:
            evaluation = json.loads(evaluation_handler.read())
        return evaluation, resume
    except (ValueError, IOError):
        sys.exit("Failed to retrieve evaluation.")
Example 21
def best_first_search(datasets_file, api, args, common_options,
                      staleness=None, penalty=None, objective_name=None,
                      resume=False):
    """Selecting the fields to be used in the model construction

    """
    counter = 0
    loop_counter = 0
    features_file = os.path.normpath(os.path.join(args.output_dir,
                                                  FEATURES_LOG))
    features_writer = UnicodeWriter(features_file).open_writer()
    features_header = FEATURES_HEADER
    if staleness is None:
        staleness = DEFAULT_STALENESS
    if penalty is None:
        penalty = DEFAULT_PENALTY
    # retrieving the first dataset in the file
    try:
        with open(datasets_file, u.open_mode("r")) as datasets_handler:
            dataset_id = datasets_handler.readline().strip()
    except IOError as exc:
        sys.exit("Could not read the generated datasets file: %s" %
                 str(exc))
Example 22
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, os.path.basename(output))
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
    # check_contents_lines = check_contents.split("\n")
    # for index, line in enumerate(check_contents_lines):
    #     if line:
    #         check_contents_lines[index] = INDENT + line
    # check_contents = "\n".join(check_contents_lines)
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace( \
            " u'", " '").replace("{u'", "{'").replace( \
            ' u"', ' "').replace('u\\\'', '\\\'')
        check_contents = re.sub(r'\n\s*', '\n', check_contents)
    with open(output_file, open_mode("r")) as output_file:
        output_file_contents = output_file.read()

    # strip the file header, up to and including the "def main():" line
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n',
                                  '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace( \
        '\nif __name__ == "__main__":\n    main()', '')

    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r'%s\'project\':\s%s\'project/[a-f0-9]{24}\',?\s?' \
        % (prefix, prefix)
    output_file_contents = re.sub(p_str, '', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r'/[a-f0-9]{24}'
    output_file_contents = re.sub(p_str, '', output_file_contents, flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r';;.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents, flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r'created by.*\n'
    output_file_contents = re.sub(p_str, '', output_file_contents, flags=re.S)
    check_contents = re.sub(p_str, '', check_contents, flags=re.S)
    p_str = r'    api = .*?\n'
    output_file_contents = re.sub(p_str,
                                  '    api = BigML()\n',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = re.sub(r'\n\s*', '\n', output_file_contents)
    check_contents = re.sub(r'\n\s*', '\n', check_contents)
    output_file_contents = output_file_contents.strip("\n").strip()
    check_contents = check_contents.strip("\n").strip()
    if check_contents != output_file_contents:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents( \
                check_file, check_contents)
            if check_contents != output_file_contents:
                check_contents = python3_contents(check_file,
                                                  check_contents,
                                                  alternative="_1")
        with open("%s_bck" % check_file, "w") as bck_file:
            bck_file.write(output_file_contents)
        eq_(check_contents, output_file_contents)
Example 23
def best_candidates_number(datasets_file, args, common_options,
                           penalty=None,
                           resume=False):
    """Selecting the best number of random candidates
       to be used in the ensemble construction

    """
    loop_counter = 0
    candidates_file = os.path.normpath(os.path.join(args.output_dir,
                                                    CANDIDATES_LOG))
    with open(candidates_file, u.open_mode("w")) as candidates_handler:
        candidates_writer = csv.writer(candidates_handler, lineterminator="\n")
        candidates_writer.writerow([
            "step", "random_candidates", "score", "metric_value",
            "best_score"])
        candidates_handler.flush()
        args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                        "random"))
        max_candidates = args.max_candidates + 1

        if args.nodes_step is None:
            args.nodes_step = DEFAULT_CANDIDATES_STEP
        random_candidates = args.min_candidates

        if penalty is None:
            penalty = DEFAULT_CANDIDATES_PENALTY
        best_score = - float('inf')
        metric = args.optimize
        score = best_score
        while random_candidates < max_candidates:
            loop_counter += 1
            (score,
             metric_value,
             metric,
             resume) = candidates_evaluate(datasets_file, args,
                                           random_candidates, common_options,
                                           penalty=penalty, resume=resume,
                                           metric=metric)
            candidates_writer.writerow([
                loop_counter, random_candidates, score, metric_value,
                best_score])
            candidates_handler.flush()
            if (score - EPSILON) > best_score:
                best_candidates = random_candidates
                best_score = score
                message = 'New best random candidates number is: %s\n' % \
                    best_candidates
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                if metric in PERCENT_EVAL_METRICS:
                    message = '%s = %0.2f%% (score = %s)\n' % (
                        metric.capitalize(), metric_value * 100, score)
                else:
                    message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                          metric_value,
                                                          score)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            random_candidates += DEFAULT_CANDIDATES_STEP

        message = ('The best random candidates number is: %s \n'
                   % best_candidates)
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                           (best_score * 100)))
        else:
            message = ('%s = %f\n' % (metric.capitalize(), best_score))
        u.log_message(message, log_file=session_file, console=1)
        return best_candidates
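The (score - EPSILON) > best_score test only promotes a new best candidates number when the score improves by more than EPSILON, which keeps the search from chasing floating-point noise. A tiny illustration with a made-up EPSILON value (the real constant is defined elsewhere in the module):

EPSILON = 0.001  # hypothetical value, for illustration only
best_score = 0.950
print((0.9505 - EPSILON) > best_score)  # False: within tolerance, no new best
print((0.9520 - EPSILON) > best_score)  # True: clears the tolerance margin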
Example 24
features_file = os.path.normpath(
    os.path.join(args.output_dir, FEATURES_LOG))
features_writer = UnicodeWriter(features_file).open_writer()
features_header = FEATURES_HEADER
if staleness is None:
    staleness = DEFAULT_STALENESS
if penalty is None:
    penalty = DEFAULT_PENALTY
# retrieving the first dataset in the file
try:
    with open(datasets_file, u.open_mode("r")) as datasets_handler:
        dataset_id = datasets_handler.readline().strip()
except IOError as exc:
    sys.exit("Could not read the generated datasets file: %s" % str(exc))
try:
    stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
    with open(stored_dataset, u.open_mode("r")) as dataset_handler:
        dataset = json.loads(dataset_handler.read())
except IOError:
    dataset = api.check_resource(dataset_id, query_string=ALL_FIELDS_QS)
# initial feature set
fields = Fields(dataset)
excluded_features = ([] if args.exclude_features is None else
                     args.exclude_features.split(args.args_separator))
try:
    excluded_ids = [
        fields.field_id(feature) for feature in excluded_features
    ]
    objective_id = fields.field_id(objective_name)
except ValueError as exc:
    sys.exit(exc)
field_ids = [
Example 25
def i_check_output_file(step, output=None, check_file=None):
    if check_file is None or output is None:
        assert False
    check_file = res_filename(check_file)
    output_file = os.path.join(world.directory, os.path.basename(output))
    with open(check_file, open_mode("r")) as check_file_handler:
        check_contents = check_file_handler.read().strip("\n")
    # check_contents_lines = check_contents.split("\n")
    # for index, line in enumerate(check_contents_lines):
    #     if line:
    #         check_contents_lines[index] = INDENT + line
    # check_contents = "\n".join(check_contents_lines)
    # remove unicode mark for strings if Python3
    if PYTHON3:
        check_contents = check_contents.replace( \
            " u'", " '").replace("{u'", "{'").replace( \
            ' u"', ' "').replace('u\\\'', '\\\'')
        check_contents = re.sub(r'\n\s*', '\n', check_contents)
    with open(output_file, open_mode("r")) as output_file:
        output_file_contents = output_file.read()

    # strip the file header, up to and including the "def main():" line
    output_file_contents = re.sub(r'#!.*def\smain\(\):\n', '',
                                  output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = output_file_contents.replace( \
        '\nif __name__ == "__main__":\n    main()', '')

    # strip internally added project id information
    prefix = "" if PYTHON3 else "u"
    p_str = r'%s\'project\':\s%s\'project/[a-f0-9]{24}\',?\s?' \
        % (prefix, prefix)
    output_file_contents = re.sub(p_str,
                                  '', output_file_contents,
                                  flags=re.S).strip("\n")
    p_str = r'/[a-f0-9]{24}'
    output_file_contents = re.sub(p_str,
                                  '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str,
                            '', check_contents,
                            flags=re.S)
    p_str = r';;.*\n'
    output_file_contents = re.sub(p_str,
                                  '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str,
                            '', check_contents,
                            flags=re.S)
    p_str = r'created by.*\n'
    output_file_contents = re.sub(p_str,
                                  '', output_file_contents,
                                  flags=re.S)
    check_contents = re.sub(p_str,
                            '', check_contents,
                            flags=re.S)
    p_str = r'    api = .*?\n'
    output_file_contents = re.sub(p_str,
                                  '    api = BigML()\n', output_file_contents,
                                  flags=re.S).strip("\n")
    output_file_contents = re.sub(r'\n\s*', '\n', output_file_contents)
    check_contents = re.sub(r'\n\s*', '\n', check_contents)
    output_file_contents = output_file_contents.strip("\n").strip()
    check_contents = check_contents.strip("\n").strip()
    if check_contents != output_file_contents:
        if PYTHON3:
            # look for an alternative in PYTHON3
            check_contents = python3_contents( \
                check_file, check_contents)
            if check_contents != output_file_contents:
                check_contents = python3_contents(
                    check_file, check_contents, alternative="_1")
        with open("%s_bck" % check_file, "w") as bck_file:
            bck_file.write(output_file_contents)
        eq_(check_contents, output_file_contents)
Example 26
features_writer = UnicodeWriter(features_file).open_writer()
features_header = FEATURES_HEADER
if staleness is None:
    staleness = DEFAULT_STALENESS
if penalty is None:
    penalty = DEFAULT_PENALTY
# retrieving the first dataset in the file
try:
    with open(datasets_file, u.open_mode("r")) as datasets_handler:
        dataset_id = datasets_handler.readline().strip()
except IOError as exc:
    sys.exit("Could not read the generated datasets file: %s" %
             str(exc))
try:
    stored_dataset = u.storage_file_name(args.output_dir, dataset_id)
    with open(stored_dataset, u.open_mode("r")) as dataset_handler:
        dataset = json.loads(dataset_handler.read())
except IOError:
    dataset = api.check_resource(dataset_id,
                                 query_string=ALL_FIELDS_QS)
# initial feature set
fields = Fields(dataset)
excluded_features = ([] if args.exclude_features is None else
                     args.exclude_features.split(
                         args.args_separator))
try:
    excluded_ids = [fields.field_id(feature) for
                    feature in excluded_features]
    objective_id = fields.field_id(objective_name)
except ValueError as exc:
    sys.exit(exc)
Example 27
def best_node_threshold(datasets_file, args, common_options,
                        staleness=None, penalty=None,
                        resume=False):
    """Selecting the node_limit to be used in the model construction

    """
    loop_counter = 0
    nodes_file = os.path.normpath(os.path.join(args.output_dir,
                                               NODES_LOG))
    with open(nodes_file, u.open_mode("w")) as nodes_handler:
        nodes_writer = csv.writer(nodes_handler, lineterminator="\n")
        nodes_writer.writerow([
            "step", "node_threshold", "score", "metric_value", "best_score"])
        nodes_handler.flush()
        args.output_dir = os.path.normpath(os.path.join(args.output_dir,
                                                        "node_th"))
        max_nodes = args.max_nodes + 1

        if args.min_nodes is None:
            args.min_nodes = DEFAULT_MIN_NODES
        if args.nodes_step is None:
            args.nodes_step = DEFAULT_NODES_STEP
        node_threshold = args.min_nodes
        if staleness is None:
            staleness = DEFAULT_STALENESS
        if penalty is None:
            penalty = DEFAULT_NODES_PENALTY
        best_score = - float('inf')
        best_unchanged_count = 0
        metric = args.optimize
        score = best_score
        while best_unchanged_count < staleness and node_threshold < max_nodes:
            loop_counter += 1
            (score,
             metric_value,
             metric,
             resume) = node_threshold_evaluate(datasets_file, args,
                                               node_threshold, common_options,
                                               penalty=penalty, resume=resume,
                                               metric=metric)
            nodes_writer.writerow([
                loop_counter, node_threshold, score, metric_value, best_score])
            nodes_handler.flush()
            if (score - EPSILON) > best_score:
                best_threshold = node_threshold
                best_score = score
                best_unchanged_count = 0
                message = 'New best node threshold: %s\n' % (best_threshold)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
                if metric in PERCENT_EVAL_METRICS:
                    message = '%s = %0.2f%% (score = %s)\n' % (
                        metric.capitalize(), metric_value * 100, score)
                else:
                    message = '%s = %f (score = %s)\n' % (metric.capitalize(),
                                                          metric_value,
                                                          score)
                u.log_message(message, log_file=session_file,
                              console=args.verbosity)
            else:
                best_unchanged_count += 1
            node_threshold += args.nodes_step

        message = ('The best node threshold is: %s \n'
                   % best_threshold)
        u.log_message(message, log_file=session_file, console=1)
        if metric in PERCENT_EVAL_METRICS:
            message = ('%s = %0.2f%%\n' % (metric.capitalize(),
                                           (best_score * 100)))
        else:
            message = ('%s = %f\n' % (metric.capitalize(), best_score))
        u.log_message(message, log_file=session_file, console=1)
        return best_threshold
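Stripped of the logging and the BigML calls, the loop above is a plain hill climb with a staleness cut-off: it stops after staleness consecutive non-improving evaluations or when the threshold range is exhausted. A schematic sketch of just that control flow (the evaluate argument stands in for node_threshold_evaluate):

def climb(evaluate, start, stop, step, staleness, epsilon=0.001):
    # evaluate maps a node threshold to a score; returns the best threshold
    best_score, best_threshold, unchanged = -float('inf'), None, 0
    threshold = start
    while unchanged < staleness and threshold < stop:
        score = evaluate(threshold)
        if (score - epsilon) > best_score:
            best_score, best_threshold, unchanged = score, threshold, 0
        else:
            unchanged += 1
        threshold += step
    return best_threshold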