Example #1
0
    def __init__(self, path, tables=None, enable_traces=True):
        """
        Open a connection to a database file, creating tables if requested.

        Arguments:
            path (str): The path to the database file.
            tables (dictionary of {str: tuple of str}, optional): A dictionary
              of {name: schema} pairs, where a schema is list of tuple pairs,
              of the form: (name, type). Defaults to no tables.
            enable_traces(bool, optional): Enable traces for user
              defined functions and aggregates.
        """
        # A None default avoids sharing one mutable dict object between calls.
        if tables is None:
            tables = {}

        self.path = fs.path(path)

        # Create directory if needed.
        parent_dir = fs.dirname(path)
        if parent_dir:
            fs.mkdir(parent_dir)

        self.connection = sql.connect(self.path)

        for name, schema in six.iteritems(tables):
            self.create_table(name, schema)

        io.debug("Opened connection to '{0}'".format(self.path))

        # Register exit handler
        atexit.register(self.close)

        # Enable traces for user defined functions and aggregates. See:
        #
        # https://docs.python.org/2/library/sqlite3.html#sqlite3.enable_callback_tracebacks
        if enable_traces:
            sql.enable_callback_tracebacks(True)
Example #2
0
def unpack_archive(*components, **kwargs):
    """
    Unpack a compressed archive into the directory that contains it.

    Arguments:
        *components (str[]): Absolute path.
        compression (str, optional): Archive compression type. Default: bz2.
    """
    path = fs.path(*components)
    compression = kwargs.get("compression", "bz2")

    # extract tar relative to its directory
    fs.cd(fs.dirname(path))
    try:
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with tarfile.open(path, "r:" + compression) as tar:
            tar.extractall()
    finally:
        # Always pair the fs.cd() above with fs.cdpop(), even on error.
        fs.cdpop()
Example #3
0
def unpack_archive(*components, **kwargs) -> str:
    """
    Unpack a compressed archive.

    Arguments:
        *components (str[]): Absolute path.
        **kwargs (dict, optional): Set "compression" to compression type.
            Default: bz2. Set "dir" to destination directory. Defaults to the
            directory of the archive.

    Returns:
        str: Path to directory.
    """
    path = fs.abspath(*components)
    compression = kwargs.get("compression", "bz2")
    # Named dest_dir rather than dir so the builtin is not shadowed.
    dest_dir = kwargs.get("dir", fs.dirname(path))

    fs.cd(dest_dir)
    try:
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with tarfile.open(path, "r:" + compression) as tar:
            tar.extractall()
    finally:
        # Always pair the fs.cd() above with fs.cdpop(), even on error.
        fs.cdpop()

    return dest_dir
Example #4
0
def unpack_archive(*components, **kwargs) -> str:
    """
    Unpack a compressed archive.

    Arguments:
        *components (str[]): Absolute path.
        **kwargs (dict, optional): Set "compression" to compression type.
            Default: bz2. Set "dir" to destination directory. Defaults to the
            directory of the archive.

    Returns:
        str: Path to directory.
    """
    path = fs.abspath(*components)
    compression = kwargs.get("compression", "bz2")
    # Named target_dir rather than dir so the builtin is not shadowed.
    target_dir = kwargs.get("dir", fs.dirname(path))

    fs.cd(target_dir)
    try:
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; only use this on archives from a trusted source.
        with tarfile.open(path, "r:" + compression) as tar:
            tar.extractall()
    finally:
        # Always pair the fs.cd() above with fs.cdpop(), even on error.
        fs.cdpop()

    return target_dir
Example #5
0
def evaluate(model, embeddings, folder_data, samples_per_class, folder_results, dense_layer_size, print_summary,
             num_epochs, batch_size):
    """
    Train (or restore) a classifier on '.rec' sequence files and compare its
    test predictions with the true class labels.

    File names are collected per class (1..104) from three sibling folders:
    folder_data + '_train', folder_data + '_val' and folder_data + '_test'.
    Trained models and predictions are cached below folder_results and are
    reused when the corresponding file already exists.

    Arguments:
        model: Object providing __name__, init(), train_gen(), predict_gen(),
            save(), restore() and a .model attribute with summary().
        embeddings: Embedding matrix; it is L2-normalized along axis 1.
        folder_data (str): Common prefix of the train/val/test folders.
        samples_per_class (int): Number of training files sampled per class.
        folder_results (str): Root folder for cached models and predictions.
        dense_layer_size: Forwarded to model.init().
        print_summary (bool): If true, call model.model.summary() after init.
        num_epochs (int): Forwarded to model.train_gen() as epochs.
        batch_size (int): Batch size for the embedding sequence generators.

    Returns:
        The result of comparing the predictions with y_test using ==
        (presumably an element-wise boolean array; confirm the type
        returned by model.predict_gen).
    """
    # Set seed for reproducibility
    seed = 204

    ####################################################################################################################
    # Get data
    # Number of validation files sampled per class; a value <= 0 means
    # "use every validation file" (see the branch further below).
    vsamples_per_class = FLAGS.vsamples

    # Data acquisition
    num_classes = 104
    y_train = np.empty(0)  # training
    X_train = list()
    folder_data_train = folder_data + '_train'
    y_val = np.empty(0)  # validation
    X_val = list()
    folder_data_val = folder_data + '_val'
    y_test = np.empty(0)  # testing
    X_test = list()
    folder_data_test = folder_data + '_test'
    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_train)
    print(folder_data_val)
    print(folder_data_test)
    for i in range(1, num_classes + 1):  # loop over classes

        # training: Read data file names
        folder = os.path.join(folder_data_train, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttraining  : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # training: Randomly pick programs
        assert len(seq_files) >= samples_per_class, "Cannot sample " + str(samples_per_class) + " from " + str(
            len(seq_files)) + " files found in " + folder
        # Sampling without replacement with a fixed random_state, so the
        # same files are selected on every run.
        X_train += resample(seq_files, replace=False, n_samples=samples_per_class, random_state=seed)
        y_train = np.concatenate([y_train, np.array([int(i)] * samples_per_class, dtype=np.int32)])

        # validation: Read data file names
        folder = os.path.join(folder_data_val, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\tvalidation: Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # validation: Randomly pick programs
        if vsamples_per_class > 0:
            assert len(seq_files) >= vsamples_per_class, "Cannot sample " + str(vsamples_per_class) + " from " + str(
                len(seq_files)) + " files found in " + folder
            X_val += resample(seq_files, replace=False, n_samples=vsamples_per_class, random_state=seed)
            y_val = np.concatenate([y_val, np.array([int(i)] * vsamples_per_class, dtype=np.int32)])
        else:
            # No sampling requested: keep every validation file of this class.
            assert len(seq_files) > 0, "No .rec files found in" + folder
            X_val += seq_files
            y_val = np.concatenate([y_val, np.array([int(i)] * len(seq_files), dtype=np.int32)])

        # test: Read data file names (all files are used, no sampling)
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest      : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']
        assert len(seq_files) > 0, "No .rec files found in" + folder
        X_test += seq_files
        y_test = np.concatenate([y_test, np.array([int(i)] * len(seq_files), dtype=np.int32)])

    # Load dictionary and cutoff statements
    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load dictionaries from a trusted location.
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    # Only the index of the unknown token is needed; the dictionary itself is
    # dropped right away.
    unk_index = dictionary[rgx.unknown_token]
    del dictionary

    # Encode source codes and get max. sequence length
    X_seq_train, maxlen_train = encode_srcs(X_train, 'training', unk_index)
    X_seq_val, maxlen_val = encode_srcs(X_val, 'validation', unk_index)
    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    # All three splits are padded to one common length.
    maxlen = max(maxlen_train, maxlen_test, maxlen_val)
    print('Max. sequence length overall:', maxlen)
    print('Padding sequences')
    X_seq_train = pad_src(X_seq_train, maxlen, unk_index)
    X_seq_val = pad_src(X_seq_val, maxlen, unk_index)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    # Get one-hot vectors for classification
    # (the test labels stay as class indices; they are compared directly with
    # the predictions at the end)
    print('YTRAIN\n', y_train)
    y_1hot_train = get_onehot(y_train, num_classes)
    y_1hot_val = get_onehot(y_val, num_classes)

    ####################################################################################################################
    # Setup paths

    # Set up names paths
    model_name = model.__name__
    model_path = os.path.join(folder_results,
                              "classifyapp/models/{}.model".format(model_name))
    predictions_path = os.path.join(folder_results,
                                    "classifyapp/predictions/{}.result".format(model_name))

    # If predictions have already been made with these embeddings, load them
    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            p = pickle.load(infile)

    else:  # could not find predictions already computed with these embeddings

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        # NOTE(review): vocabulary_size is unpacked but not used afterwards.
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        print('XSEQ:\n', X_seq_train)
        print('EMB:\n', embedding_matrix_normalized)

        gen_test = EmbeddingPredictionSequence(batch_size, X_seq_test, embedding_matrix_normalized)

        # If models have already been made with these embeddings, load them
        if fs.exists(model_path):
            print("\n\tFound trained model in", model_path, ", skipping...")
            model.restore(model_path)

        else:  # could not find models already computed with these embeddings

            gen_train = EmbeddingSequence(batch_size, X_seq_train, y_1hot_train, embedding_matrix_normalized)
            gen_val = EmbeddingSequence(batch_size, X_seq_val, y_1hot_val, embedding_matrix_normalized)

            ############################################################################################################
            # Train

            # Create a new model and train it
            print('\n--- Initializing model...')
            model.init(seed=seed,
                       maxlen=maxlen,
                       embedding_dim=int(embedding_dimension),
                       num_classes=num_classes,
                       dense_layer_size=dense_layer_size)
            if print_summary:
                model.model.summary()
            print('\n--- Training model...')
            model.train_gen(train_generator=gen_train,
                            validation_generator=gen_val,
                            verbose=True,
                            epochs=num_epochs)

            # Save the model
            fs.mkdir(fs.dirname(model_path))
            model.save(model_path)
            print('\tsaved model to', model_path)

        ################################################################################################################
        # Test

        # Test model
        print('\n--- Testing model...')
        # Only the first element of predict_gen's result is kept.
        p = model.predict_gen(generator=gen_test)[0]

        # cache the prediction
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(p, outfile)
        print('\tsaved predictions to', predictions_path)

    ####################################################################################################################
    # Return accuracy
    # NOTE(review): despite the name this is the raw comparison result, not
    # an aggregated accuracy value.
    accuracy = p == y_test  # prediction accuracy
    return accuracy
Example #6
0
 def test_dirname(self):
     """Check fs.dirname() on a bare file name and on an absolute path."""
     # (expected directory, input path) pairs, checked in order.
     cases = (
         ("", "foo"),
         ("/tmp", "/tmp/labm8.tmp"),
     )
     for expected, path in cases:
         self._test(expected, fs.dirname(path))
Example #7
0
def test_dirname():
    """Check fs.dirname() on a bare file name and on an absolute path."""
    # (expected directory, input path) pairs, checked in order.
    expectations = [
        ("", "foo"),
        ("/tmp", "/tmp/labm8.tmp"),
    ]
    for expected, path in expectations:
        assert expected == fs.dirname(path)
Example #8
0
def test_must_exist():
    """
    fs.must_exist() returns the path of an existing file, whether given as
    one path or as directory + file name components, and raises fs.File404
    for a path that does not exist.
    """
    with tempfile.NamedTemporaryFile(prefix='labm8_') as tmp:
        tmp_path = tmp.name

        # Single-argument form.
        assert fs.must_exist(tmp_path) == tmp_path

        # Component form: directory and file name passed separately.
        parent = fs.dirname(tmp_path)
        leaf = fs.basename(tmp_path)
        assert fs.must_exist(parent, leaf) == tmp_path

    with pytest.raises(fs.File404):
        fs.must_exist("/not/a/real/path")
Example #9
0
def inline_fs_headers(path: Path,
                      stack: List[str],
                      lang: clgen.Language = clgen.Language.OPENCL,
                      topdir: Path = None) -> str:
    """
    Recursively inline headers in file.

    Parameters
    ----------
    path : str
        File.
    stack : List[str]
        File stack. The current file is appended to it; files already on the
        stack are not inlined again.
    lang : clgen.Language
        Language used to select the include regexp and the comment format.
    topdir : Path
        The top level directory to stop searching for includes in. Defaults
        to the directory of `path`.

    Returns
    -------
    str
        Inlined file.
    """
    stack.append(path)

    if topdir is None:
        topdir = fs.dirname(path)
    topdir_str = str(topdir)

    include_re = clgen.include_regexp(lang)

    with open(path, encoding="utf-8") as infile:
        src = infile.read()

    outlines = []
    for line in src.split('\n'):
        match = re.match(include_re, line)
        if not match:
            outlines.append(line)
            continue

        # We have an import to inline!
        include = match.group("path")

        # Search for files with that name in the repository. The command is
        # passed as an argument list (no shell), so include names containing
        # spaces, quotes or shell metacharacters cannot break or inject into
        # the command line.
        include_basename = fs.basename(include)
        find_output = subprocess.check_output(
            ['find', topdir_str, '-type', 'f', '-name', include_basename],
            universal_newlines=True)
        candidates = [found for found in find_output.split('\n') if found]

        # Select which file to inline:
        if len(candidates) == 1:
            # If there's exactly one match, then we're done:
            file_to_inline = candidates[0]
        elif len(candidates) > 1:
            # We have multiple candidates to inline, so we'll compare the
            # full paths (relative to the top directory) to select the one
            # whose name is the closest match:
            rel_matches = [
                candidate[len(topdir_str) + 1:] for candidate in candidates
            ]
            distances = [
                editdistance.eval(include, rel_path)
                for rel_path in rel_matches
            ]
            min_distance = min(distances)
            file_to_inline = candidates[distances.index(min_distance)]
            log.debug(
                f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}"
            )
        else:
            # We didn't find anything suitable:
            file_to_inline = None

        # Process the inline file:
        if file_to_inline in stack:
            # We've already inlined this file, so ignore it:
            outlines.append(
                clgen.format_as_comment(
                    lang, f'[FETCH] ignored_include({line})'))
        elif file_to_inline:
            # Inline the file by recursively expanding its contents. The
            # language and top directory are forwarded so that nested
            # includes are parsed with the same language and searched for
            # under the same root as the outermost file.
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] begin_include({line})'))
            inline_src = inline_fs_headers(file_to_inline, stack,
                                           lang=lang, topdir=topdir)
            outlines.append(inline_src)
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] end_include({line})'))
        else:
            # We didn't find anything suitable, so keep the original
            # include:
            outlines.append(
                clgen.format_as_comment(lang,
                                        f'[FETCH] not_found({line})'))
            outlines.append(line)

    return '\n'.join(outlines)
Example #10
0
def write_file(path, contents):
    """
    Write `contents` to the text file at `path`.

    The parent directory is created first via fs.mkdir(). A bare file name
    (empty fs.dirname()) has no directory to create, so that step is skipped.

    Arguments:
        path (str): Destination file path.
        contents (str): Text to write.
    """
    parent_dir = fs.dirname(path)
    if parent_dir:
        fs.mkdir(parent_dir)
    with open(path, 'w') as outfile:
        outfile.write(contents)
Example #11
0
def evaluate(model, device, data_folder, out_folder, embeddings,
             dense_layer_size, print_summary, num_epochs, batch_size):
    """
    Evaluate a thread-coarsening-factor model with leave-one-out
    cross-validation on one or all platforms.

    For every platform the oracle and runtime CSV files are read from
    data_folder, the sources are encoded and looked up in the normalized
    embedding matrix, and one model per left-out sample is trained (or
    restored from out_folder). Predictions are cached in out_folder too.

    Arguments:
        model: Object providing __name__, __basename__, init(), train(),
            predict(), save(), restore() and a .model attribute with
            summary().
        device (str): 'all' for the four built-in platforms, otherwise the
            name of a single platform.
        data_folder (str): Folder containing "pact-2014-oracles.csv" and
            "pact-2014-runtimes.csv".
        out_folder (str): Root folder for cached models and predictions.
        embeddings: Embedding matrix; it is L2-normalized along axis 1.
        dense_layer_size: Forwarded to model.init().
        print_summary (bool): If true, call model.model.summary() after init.
        num_epochs (int): Forwarded to model.train() as epochs.
        batch_size (int): Forwarded to model.train() and model.predict().

    Returns:
        pd.DataFrame: One row per (platform, kernel) with the columns
        "Model", "Platform", "Kernel", "Oracle-CF", "Predicted-CF",
        "Speedup" and "Oracle".
    """

    data = []

    # Create device list
    if device == 'all':
        device_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
    else:
        device_list = [device]

    for i, platform in enumerate(device_list):
        print(
            '\n------------------------------------------------------------------'
        )
        # NOTE(review): the total "4" is hard-coded, so the header is
        # misleading when a single device was requested.
        print('--- Platform', platform, '[', i + 1, '/ 4 ]')
        print(
            '------------------------------------------------------------------'
        )
        platform_name = platform2str(platform)

        # Read data
        # NOTE(review): both CSV files are re-read on every platform
        # iteration although their paths do not depend on the platform.
        oracle_file = os.path.join(data_folder, "pact-2014-oracles.csv")
        oracles = pd.read_csv(oracle_file)
        runtimes_file = os.path.join(data_folder, "pact-2014-runtimes.csv")
        df = pd.read_csv(runtimes_file)
        print('\tRead data from', oracle_file, '\n\tand', runtimes_file)

        # Extract data
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        y_1hot = get_onehot(oracles, platform)

        # Encode source codes
        X_seq, maxlen = encode_srcs(data_folder, df)

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        # NOTE(review): vocabulary_size is unpacked but not used afterwards.
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.placeholder(dtype=tf.int32)

        # Tensor of shape (num_input_files, sequence length, embbedding dimension)
        embedding_input_ = tf.nn.embedding_lookup(embedding_matrix_normalized,
                                                  seq_)

        # Make tf block less gpu memory
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        # Evaluate the lookup once so that the cross-validation loop below
        # can slice a concrete array.
        with tf.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: X_seq})

        # Leave-one-out cross-validation
        kf = KFold(n_splits=len(y), shuffle=False)

        for j, (train_index, test_index) in enumerate(kf.split(y)):
            print('--- Cross validation step [', j + 1, '/ ', len(y), ']')
            # Each test fold holds exactly one sample; its position selects
            # the kernel name from the sorted set of kernels.
            kernel = sorted(set(df["kernel"]))[test_index[0]]
            X_cc, y_cc = get_magni_features(df, oracles, platform)

            model_name = model.__name__
            model_basename = model.__basename__

            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:

                if fs.exists(model_path):
                    # load a trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:

                    # Initialize model and print summary
                    print('\n--- Training model...')
                    # NOTE(review): `seed` is not defined in this function;
                    # presumably a module-level name - confirm.
                    model.init(seed, maxlen, int(embedding_dimension),
                               dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    model.train(sequences=embedding_input[train_index, :, :],
                                verbose=True,
                                y_1hot=y_1hot[train_index],
                                epochs=num_epochs,
                                batch_size=batch_size)

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model...')
                p = model.predict(sequences=embedding_input[test_index, :, :],
                                  batch_size=batch_size)[0]

                # The runtimes of some coarsening factors are not recorded in the data table. If that is the case for
                # the predicted cf, clamp it down to the highest cf for which the runtime is recorded
                p = min(p, 2**(len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            o = y[test_index[0]]  # oracle prediction (true value)
            # NOTE(review): `correct` is computed but never used or recorded.
            correct = p == o  # predictions' correctness

            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert (len(row) == 1)  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert (len(row) == 1)  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]

            # speedup and % oracle
            # NOTE(review): s_oracle is computed but not recorded below.
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

    return pd.DataFrame(data,
                        columns=[
                            "Model", "Platform", "Kernel", "Oracle-CF",
                            "Predicted-CF", "Speedup", "Oracle"
                        ])
Example #12
0
def evaluate(model, device, data_folder, out_folder, embeddings,
             dense_layer_size, print_summary, num_epochs,
             batch_size) -> pd.DataFrame:
    """
    Evaluate a CPU/GPU device-mapping model with stratified 10-fold
    cross-validation on one or both platforms.

    For every platform the file "cgo17-<platform>.csv" is read from
    data_folder, the sources are encoded and looked up in the normalized
    embedding matrix, and one model per fold is trained (or restored from
    out_folder). Predictions are cached in out_folder too.

    Arguments:
        model: Object providing __name__, __basename__, init(), train(),
            predict(), save(), restore() and a .model attribute with
            summary().
        device (str): 'all' for both "amd" and "nvidia", otherwise the name
            of a single platform.
        data_folder (str): Folder containing the "cgo17-<platform>.csv" files.
        out_folder (str): Root folder for cached models, predictions and logs.
        embeddings: Embedding matrix; it is L2-normalized along axis 1.
        dense_layer_size: Forwarded to model.init().
        print_summary (bool): If true, call model.model.summary() after init.
        num_epochs (int): Forwarded to model.train() as epochs.
        batch_size (int): Forwarded to model.train() and model.predict().

    Returns:
        pd.DataFrame: One row per benchmark (index starting at 1) with the
        columns "Model", "Platform", "Benchmark", "Benchmark Suite",
        "Oracle Mapping", "Predicted Mapping", "Correct?" and "Speedup".
    """
    from sklearn.model_selection import StratifiedKFold

    # Create device list
    if device == 'all':
        device_list = ["amd", "nvidia"]
    else:
        device_list = [device]

    data = []
    for i, platform in enumerate(device_list):
        platform_name = platform2str(platform)

        # Load runtime data
        data_file = os.path.join(data_folder, "cgo17-{}.csv".format(platform))
        print('\n--- Read data from', data_file)
        df = pd.read_csv(data_file)

        # Encode input source codes
        sequences, maxlen = encode_srcs(data_folder, df)

        # Load embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        # NOTE(review): vocabulary_size is unpacked but not used afterwards.
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.compat.v1.placeholder(dtype=tf.int32)
        # Tensor of shape (num_input_files, sequence length, embbedding dimension)
        embedding_input_ = tf.compat.v1.nn.embedding_lookup(
            params=embedding_matrix_normalized, ids=seq_)

        # Make tf block less gpu memory
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True

        # Evaluate the lookup once so that the cross-validation loop below
        # can slice a concrete array.
        with tf.compat.v1.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: sequences})

        # Values used for training & predictions
        aux_in = auxiliary_inputs(df)

        # Optimal mappings
        # Label 1 means the oracle device is "GPU", 0 means anything else.
        y = np.array([1 if x == "GPU" else 0 for x in df["oracle"].values])
        y_1hot = encode_1hot(y)

        # 10-fold cross-validation
        n_splits = 10
        # NOTE(review): `seed` is not defined in this function; presumably a
        # module-level name - confirm.
        kf = StratifiedKFold(n_splits=n_splits,
                             shuffle=True,
                             random_state=seed)
        for j, (train_index, test_index) in enumerate(kf.split(sequences, y)):
            print('--- Cross validation step [', j, '/ ', n_splits, ']')

            model_name = model.__name__
            model_basename = model.__basename__
            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))
            log_dir = os.path.join(out_folder, "logs")

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:

                if fs.exists(model_path):
                    # restore trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:

                    # Initialize model and print summary
                    model.init(seed=seed,
                               maxlen=maxlen,
                               embedding_dim=int(embedding_dimension),
                               dense_layer_size=dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    print('\n--- Training model... ')
                    model.train(df=df,
                                aux_in=aux_in[train_index],
                                sequences=embedding_input[train_index, :, :],
                                y=y[train_index],
                                y_1hot=y_1hot[train_index],
                                verbose=False,
                                epochs=num_epochs,
                                batch_size=batch_size,
                                log_dir=log_dir)
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model... ')
                p = model.predict(batch_size=batch_size,
                                  aux_in=aux_in[test_index],
                                  sequences=embedding_input[test_index, :, :],
                                  y=y[test_index],
                                  y_1hot=y_1hot[test_index],
                                  verbose=False)

                # cache results
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            benchmarks = df['benchmark'].values[test_index]  # benchmarks names
            o = y[test_index]  # oracle device mappings (true values)
            correct = p == o  # predictions' correctness
            # runtimes of baseline mapping (CPU on AMD, GPU on NVIDIA)
            zero_r_dev = "runtime_cpu" if platform == "amd" else "runtime_gpu"
            zer_r_runtimes = df[zero_r_dev][test_index]
            # speedups of predictions
            runtimes = df[['runtime_cpu', 'runtime_gpu']].values[test_index]
            # Each prediction (0 or 1) is used as the column index into
            # [runtime_cpu, runtime_gpu], matching the label encoding above.
            p_runtimes = [r[p_] for p_, r in zip(p, runtimes)]
            p_speedup = zer_r_runtimes / p_runtimes

            # sanity check
            assert (len(benchmarks) == len(o) == len(correct) == len(p) ==
                    len(p_speedup))

            # record results
            for benchmark_, o_, p_, correct_, p_speedup_ in zip(
                    benchmarks, o, p, correct, p_speedup):
                data.append({
                    "Model": model_basename,
                    "Platform": platform_name,
                    'Benchmark': escape_benchmark_name(benchmark_),
                    'Benchmark Suite': escape_suite_name(benchmark_),
                    "Oracle Mapping": o_,
                    "Predicted Mapping": p_,
                    "Correct?": correct_,
                    "Speedup": p_speedup_,
                })

    return pd.DataFrame(data,
                        index=range(1,
                                    len(data) + 1),
                        columns=[
                            "Model", "Platform", "Benchmark",
                            "Benchmark Suite", "Oracle Mapping",
                            "Predicted Mapping", "Correct?", "Speedup"
                        ])
Example #13
0
def write_file(path: str, contents: str) -> None:
    """
    Write `contents` to the text file at `path`.

    When the path has a directory component, that directory is created
    with fs.mkdir() before the file is opened for writing.
    """
    parent_dir = fs.dirname(path)
    if parent_dir:
        fs.mkdir(parent_dir)

    with open(path, 'w') as handle:
        handle.write(contents)
Example #14
0
def evaluate(model):
    """
    Evaluate a thread-coarsening-factor model with leave-one-out
    cross-validation on the platforms Cypress, Tahiti, Fermi and Kepler.

    Trained models and predictions are cached below "result_caseB/" and are
    reused when the corresponding file already exists.

    NOTE(review): `oracles`, `df`, `data_path` and `seed` are not defined in
    this function; presumably module-level names - confirm.

    Arguments:
        model: Object providing __name__, __basename__, init(), train(),
            predict(), save() and restore().

    Returns:
        pd.DataFrame: One row per (platform, kernel) with the columns
        "Model", "Platform", "Kernel", "Oracle-CF", "Predicted-CF",
        "Speedup" and "Oracle".
    """
    from progressbar import ProgressBar
    # [number of completed steps, progress bar widget]
    # NOTE(review): maxval=68 is hard-coded; presumably 4 platforms x 17
    # kernels - confirm against the data set.
    progressbar = [0, ProgressBar(maxval=68)]
    progressbar[1].start()
    data = []

    # NOTE(review): X_seq is assigned but never used in this function.
    X_seq = None  # defer sequence encoding (it's expensive)

    for i, platform in enumerate(["Cypress", "Tahiti", "Fermi", "Kepler"]):
        platform_name = platform2str(platform)

        # Read the oracle (label) runtimes for this platform
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        # Read the labels (coarsening factors) for this platform
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        # One-hot encode the labels (6 possible values)
        y_1hot = get_onehot(oracles, platform)
        X_cc, y_cc = get_features(df, oracles, platform)
        # NOTE(review): the same embedding file is re-loaded on every
        # platform iteration.
        embed = np.load(f"{data_path}caseb_128.npy")

        # Leave-one-out: one split per sample.
        kf = KFold(n_splits=len(y), shuffle=False)

        for j, (train_index, test_index) in enumerate(kf.split(y)):
            # Each test fold holds exactly one sample; its position selects
            # the kernel name from the sorted set of kernels.
            kernel = sorted(set(df["kernel"]))[test_index[0]]

            model_name = model.__name__
            model_basename = model.__basename__

            model_path = f"result_caseB/modelb_caseB/{model_basename}-{platform}-{j}.model"
            predictions_path = f"result_caseB/predictionb_caseB/{model_basename}-{platform}-{j}.result"

            if fs.exists(predictions_path):
                # load result from cache
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:
                if fs.exists(model_path):
                    # load a trained model from cache
                    model.restore(model_path)
                else:

                    # create a new model and train it
                    model.init(seed=seed)
                    model.train(
                        sequences=embed[train_index],
                        verbose=True,  # TODO
                        y_1hot=y_1hot[train_index])

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                # make prediction
                p = model.predict(sequences=np.array(embed[test_index[0]]))[0]

                # Clamp the prediction to 2**(number of feature entries - 1).
                # NOTE(review): presumably the highest coarsening factor with
                # a recorded runtime - confirm against get_features.
                p = min(p, 2**(len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)

            # oracle prediction
            o = y[test_index[0]]
            # NOTE(review): `correct` is computed but never used or recorded.
            correct = p == o
            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert (len(row) == 1)  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert (len(row) == 1)  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]
            # speedup and % oracle
            # NOTE(review): s_oracle is computed but not recorded below.
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

            progressbar[0] += 1  # update progress bar
            progressbar[1].update(progressbar[0])

    return pd.DataFrame(data,
                        columns=[
                            "Model", "Platform", "Kernel", "Oracle-CF",
                            "Predicted-CF", "Speedup", "Oracle"
                        ])
Example #15
0
 def features_dir(csv_path):
     """Return fs.basename() of the fs.dirname() of `csv_path`."""
     parent = fs.dirname(csv_path)
     return fs.basename(parent)
def test_accuracy(model, embeddings, folder_data, samples_per_class,
                  folder_results, dense_layer_size, print_summary, num_epochs,
                  batch_size):
    """Evaluate a trained classifier on the test set and plot a confusion matrix.

    The test files are read from ``<folder_data>/seq_test/<class>/*.rec`` for
    classes 1..104. Top-k predictions are cached as a pickle under
    ``<folder_results>/predictions/``; when that cache exists the model is
    not initialized or run again. The top-k accuracy is printed and a
    confusion matrix of the top-1 predictions is saved as a PNG under
    ``<folder_results>/models/``.

    Arguments:
        model: Classifier object providing ``init``, ``load_weights`` and
            ``predict_topk``, plus a ``model`` attribute with ``summary()``.
            Its ``__name__`` is overwritten with ``FLAGS.model_name``.
        embeddings: Embedding matrix; L2-normalized along axis 1 before
            being handed to ``model.init``.
        folder_data (str): Directory containing the ``seq_test`` folder.
        samples_per_class: Not used by the code shown here.
        folder_results (str): Directory for cached predictions and the plot.
        dense_layer_size: Forwarded to ``model.init``.
        print_summary (bool): If true, print the model summary after the
            weights are loaded.
        num_epochs: Not used by the code shown here.
        batch_size: Forwarded to ``model.predict_topk``.

    Raises:
        AssertionError: If a class folder is missing or has no ``.rec`` file.
    """
    seed = 204

    num_classes = 104
    X_test = list()
    labels = list()
    folder_data_test = os.path.join(folder_data, 'seq_test')
    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_test)
    for i in range(1, num_classes + 1):
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest      : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [
            os.path.join(folder, f) for f in listing if f.endswith('.rec')
        ]
        assert len(seq_files) > 0, "No .rec files found in " + folder
        X_test += seq_files
        # Every file in folder `i` carries class label `i`.
        labels += [i] * len(seq_files)
    # Build the label array once instead of re-concatenating per class.
    y_test = np.array(labels, dtype=np.int32)

    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    # NOTE: pickle.load executes arbitrary code; only use trusted files here.
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    unk_index = dictionary[rgx.unknown_token]
    # Only the unknown-token index is needed; release the dictionary early.
    del dictionary

    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    maxlen = maxlen_test
    print('Max. sequence length overall:', maxlen)
    # A positive FLAGS.maxlen overrides the length measured from the data.
    if FLAGS.maxlen > 0:
        maxlen = FLAGS.maxlen
    print('Padding sequences to length', maxlen)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    model.__name__ = FLAGS.model_name
    model_name = model.__name__
    # NOTE: model_path is not read in this part of the function; the weights
    # are loaded from FLAGS.out below.
    model_path = os.path.join(folder_results,
                              "models/{}.model".format(model_name))
    predictions_path = os.path.join(
        folder_results,
        "predictions/{}_top{}.result".format(model_name, FLAGS.topk))

    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            ind = pickle.load(infile)

    else:
        # Imported lazily so the cached-predictions path does not need it.
        import tensorflow as tf
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        _, embedding_dimension = embedding_matrix_normalized.shape
        print('EMB:\n', embedding_matrix_normalized)

        print('\n--- Initializing model...')
        model.init(seed=seed,
                   maxlen=maxlen,
                   embedding_dim=int(embedding_dimension),
                   num_classes=num_classes,
                   dense_layer_size=dense_layer_size,
                   embedding_matrix=embedding_matrix_normalized)
        model.load_weights(
            os.path.join(FLAGS.out, model.__name__ + '_weights.h5'))
        if print_summary:
            model.model.summary()

        print('\n--- Testing model...')
        ind, prob = model.predict_topk(X_seq_test, batch_size, FLAGS.topk)
        del prob
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(ind, outfile)
        print('\tsaved predictions to', predictions_path)

    accuracy = np.zeros_like(y_test)
    # After the transpose, ind[i] is indexed per test sample for rank i.
    ind = np.transpose(np.array(ind))
    for i in range(FLAGS.topk):
        accuracy += np.array(ind[i]) == y_test
    print('\nTest top{} accuracy:'.format(FLAGS.topk),
          accuracy.sum() * 100.0 / len(accuracy), '%')

    from sklearn.metrics import confusion_matrix
    # Rows of the matrix are true classes, columns are top-1 predictions.
    conf_matr = confusion_matrix(y_test, ind[0])

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    values = plt.imshow(conf_matr)
    ax.xaxis.tick_top()
    ax.xaxis.set_label_position('top')

    fig.colorbar(values)
    # imshow draws matrix columns along x and rows along y, so the x-axis
    # shows predicted classes and the y-axis shows true classes.
    ax.set_xlabel('Предсказанные классы')
    ax.set_ylabel('Настоящие классы')
    conf_png = os.path.join(folder_results,
                            "models/conf_matr_{}.png".format(model_name))
    # Nothing above creates the "models" directory, so make sure it exists.
    fs.mkdir(fs.dirname(conf_png))
    plt.savefig(conf_png)