Example #1
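These snippets are excerpted main() functions, so module preambles are omitted. A plausible preamble for Example #1 (and, give or take seaborn/PIL, for the similar excerpts below), assuming get_real_pred, header_filename, and seq_logo are helpers defined alongside dna_io in the same codebase:

import os
import random
import subprocess
from optparse import OptionParser

import h5py
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

import dna_io  # codebase-local module providing load_data_1hot, load_sequences, vecs2dna
# get_real_pred, header_filename, and seq_logo are assumed to come from
# the surrounding codebase as well.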
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '-d',
        dest='model_hdf5_file',
        default=None,
        help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option(
        '-g',
        dest='gain_height',
        default=False,
        action='store_true',
        help=
        'Nucleotide heights determined by the max of loss and gain [Default: %default]'
    )
    parser.add_option('-m',
                      dest='min_limit',
                      default=0.1,
                      type='float',
                      help='Minimum heatmap limit [Default: %default]')
    parser.add_option(
        '-n',
        dest='center_nt',
        default=200,
        type='int',
        help='Center nt to mutate and plot in the heat map [Default: %default]'
    )
    parser.add_option('-o',
                      dest='out_dir',
                      default='heat',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets',
        default='0',
        help=
        'Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)'
        )
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

            # read in target names
            target_labels = open(
                options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(
                random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # plain list, so index via comprehension
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(
                    random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5' % options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (
            options.center_nt, model_file, model_input_hdf5,
            options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start + delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]

    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack(
                [min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth': 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start),
                                       colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start),
                                      colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0),
                                       colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(
                    axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir,
                                              header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' %
                            (logo_eps, logo_png),
                            shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0],
                        c=rdbu[0],
                        label='loss',
                        linewidth=1)
            ax_sad.plot(minmax_matrix[1],
                        c=rdbu[-1],
                        label='gain',
                        linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top', 'bottom', 'left', 'right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix,
                        linewidths=0,
                        cmap='RdBu_r',
                        vmin=-vlim,
                        vmax=vlim,
                        xticklabels=False,
                        ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA',
                                         rotation='horizontal')  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' %
                        (options.out_dir, header.replace(':', '_'), ci),
                        dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [
                    header, delta_start + pos, ci, loss_matrix[pos],
                    gain_matrix[pos]
                ]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
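For reference, a minimal sketch (toy numbers, not from the script) of how the loss and gain columns in table.txt fall out of one sequence/cell slice of seq_mod_preds; real_pred stands in for get_real_pred's output:

import numpy as np

# 4 nucleotides x 3 positions of toy mutation predictions
seq_mod_preds_cell = np.array([[0.10, 0.50, 0.30],
                               [0.20, 0.55, 0.25],
                               [0.15, 0.60, 0.35],
                               [0.05, 0.45, 0.40]])
real_pred = 0.30  # prediction for the unmodified sequence

loss = real_pred - seq_mod_preds_cell.min(axis=0)  # drop caused by the worst mutation
gain = seq_mod_preds_cell.max(axis=0) - real_pred  # boost from the best mutation
print(loss)  # [ 0.25 -0.15  0.05]
print(gain)  # [-0.1  0.3  0.1]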
Example #2
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file> <out_file>'
    parser = OptionParser(usage)
    parser.add_option('-a',
                      dest='add_features_file',
                      default=None,
                      help='Table of additional features')
    parser.add_option('-b',
                      dest='batch_size',
                      default=None,
                      type='int',
                      help='Align sizes with batch size')
    parser.add_option(
        '-c',
        dest='counts',
        default=False,
        action='store_true',
        help=
        'Validation and training proportions are given as raw counts [Default: %default]'
    )
    parser.add_option(
        '-e',
        dest='extend_length',
        type='int',
        default=None,
        help='Extend all sequences to this length [Default: %default]')
    parser.add_option('-r',
                      dest='permute',
                      default=False,
                      action='store_true',
                      help='Permute sequences [Default: %default]')
    parser.add_option('-s',
                      dest='random_seed',
                      default=1,
                      type='int',
                      help='numpy.random seed [Default: %default]')
    parser.add_option('-t',
                      dest='test_pct',
                      default=0,
                      type='float',
                      help='Test % [Default: %default]')
    parser.add_option('-v',
                      dest='valid_pct',
                      default=0,
                      type='float',
                      help='Validation % [Default: %default]')
    parser.add_option('--vt',
                      dest='valid_test',
                      default=False,
                      action='store_true',
                      help='Use validation as test, too [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide fasta file, targets file, and an output prefix')
    else:
        fasta_file = args[0]
        targets_file = args[1]
        out_file = args[2]

    # seed rng before shuffle
    npr.seed(options.random_seed)

    #################################################################
    # load data
    #################################################################
    seqs, targets = dna_io.load_data_1hot(fasta_file,
                                          targets_file,
                                          extend_len=options.extend_length,
                                          mean_norm=False,
                                          whiten=False,
                                          permute=False,
                                          sort=False)

    # reshape sequences for torch
    seqs = seqs.reshape((seqs.shape[0], 4, 1, seqs.shape[1] / 4))

    # read headers
    headers = []
    for line in open(fasta_file):
        if line[0] == '>':
            headers.append(line[1:].rstrip())
    headers = np.array(headers)

    # read labels
    target_labels = open(targets_file).readline().strip().split('\t')

    # read additional features
    if options.add_features_file:
        df_add = pd.read_table(options.add_features_file, index_col=0)
        df_add = df_add.astype(np.float32, copy=False)

    # permute
    if options.permute:
        order = npr.permutation(seqs.shape[0])
        seqs = seqs[order]
        targets = targets[order]
        headers = headers[order]
        if options.add_features_file:
            df_add = df_add.iloc[order]

    # check proper sum
    if options.counts:
        assert (options.test_pct + options.valid_pct <= seqs.shape[0])
    else:
        assert (options.test_pct + options.valid_pct <= 1.0)

    #################################################################
    # divide data
    #################################################################
    if options.counts:
        test_count = int(options.test_pct)
        valid_count = int(options.valid_pct)
    else:
        test_count = int(0.5 + options.test_pct * seqs.shape[0])
        valid_count = int(0.5 + options.valid_pct * seqs.shape[0])

    train_count = seqs.shape[0] - test_count - valid_count
    train_count = batch_round(train_count, options.batch_size)
    print >> sys.stderr, '%d training sequences ' % train_count

    test_count = batch_round(test_count, options.batch_size)
    print >> sys.stderr, '%d test sequences ' % test_count

    valid_count = batch_round(valid_count, options.batch_size)
    print >> sys.stderr, '%d validation sequences ' % valid_count

    i = 0
    train_seqs, train_targets = seqs[i:i + train_count, :], targets[i:i + train_count, :]
    i += train_count
    valid_seqs, valid_targets, valid_headers = seqs[i:i + valid_count, :], targets[i:i + valid_count, :], headers[i:i + valid_count]
    i += valid_count
    test_seqs, test_targets, test_headers = seqs[i:i + test_count, :], targets[i:i + test_count, :], headers[i:i + test_count]

    if options.add_features_file:
        i = 0
        train_add = df_add.iloc[i:i + train_count]
        i += train_count
        valid_add = df_add.iloc[i:i + valid_count]
        i += valid_count
        test_add = df_add.iloc[i:i + test_count]

    #################################################################
    # construct hdf5 representation
    #################################################################
    h5f = h5py.File(out_file, 'w')

    h5f.create_dataset('target_labels', data=target_labels)

    if train_count > 0:
        h5f.create_dataset('train_in', data=train_seqs)
        h5f.create_dataset('train_out', data=train_targets)

    if valid_count > 0:
        h5f.create_dataset('valid_in', data=valid_seqs)
        h5f.create_dataset('valid_out', data=valid_targets)

    if test_count > 0:
        h5f.create_dataset('test_in', data=test_seqs)
        h5f.create_dataset('test_out', data=test_targets)
        h5f.create_dataset('test_headers', data=test_headers)
    elif options.valid_test:
        h5f.create_dataset('test_in', data=valid_seqs)
        h5f.create_dataset('test_out', data=valid_targets)
        h5f.create_dataset('test_headers', data=valid_headers)

    if options.add_features_file:
        h5f.create_dataset('add_labels', data=list(df_add.columns))

        if train_count > 0:
            h5f.create_dataset('train_add', data=train_add.as_matrix())
        if valid_count > 0:
            h5f.create_dataset('valid_add', data=valid_add.as_matrix())
        if test_count > 0:
            h5f.create_dataset('test_add', data=test_add.as_matrix())
        elif options.valid_test:
            h5f.create_dataset('test_add', data=valid_add.as_matrix())

    h5f.close()
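batch_round is used above but not shown in the excerpt. A minimal sketch consistent with the -b help text ('Align sizes with batch size'), rounding each count down to a multiple of the batch size; this is an assumption, not the codebase's definition:

def batch_round(count, batch_size):
    # hypothetical reconstruction: trim the count to a whole number of batches
    if batch_size is not None:
        count = batch_size * (count // batch_size)
    return count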
Example #3
    quit()

# filenames
fasta_file = args[0]
targets_file = args[1]
target_labels_file = args[2]
train_x_file = args[3]
train_y_file = args[4]
val_x_file = args[5]
val_y_file = args[6]
test_x_file = args[7]
test_y_file = args[8]

# get data
print "getting data"
seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file, extend_len=None, mean_norm=False, \
                                      whiten=False, permute=False, sort=False)

assert (seqs.shape[0] == targets.shape[0])

seqs = seqs.reshape(
    (seqs.shape[0], 4, seqs.shape[1] / 4))  # shape = (dataset_size, 4, 600)
seqs = np.transpose(seqs, (0, 2, 1))  # make shape = (dataset_size, 600, 4)

# get an array of the cell types
print "getting target labels"
target_labels = []
with open(targets_file, "r") as target_file:
    target_labels = target_file.readline().strip().split("\t")

# permute data if need be
if options.permute:
    # body restored from the matching permute block in Example #7
    order = npr.permutation(seqs.shape[0])
    seqs = seqs[order]
    targets = targets[order]
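A small sketch (toy input) of the reshape/transpose above, assuming dna_io's flat one-hot layout is channel-major (all A entries for every position, then C, G, T):

import numpy as np

# flat one-hot for the single sequence 'ACG'
flat = np.array([[1, 0, 0,   # A channel
                  0, 1, 0,   # C channel
                  0, 0, 1,   # G channel
                  0, 0, 0]]) # T channel
seqs = flat.reshape((flat.shape[0], 4, flat.shape[1] // 4))  # (1, 4, 3)
seqs = np.transpose(seqs, (0, 2, 1))                         # (1, 3, 4)
print(seqs[0])  # one row per position, one column per nucleotide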
Example #4
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='gain_height', default=False, action='store_true', help='Nucleotide heights determined by the max of loss and gain [Default: %default]')
    parser.add_option('-m', dest='min_limit', default=0.1, type='float', help='Minimum heatmap limit [Default: %default]')
    parser.add_option('-n', dest='center_nt', default=0, type='int', help='Center nt to mutate and plot in the heat map [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-p', dest='print_table_all', default=False, action='store_true', help='Print all targets to the table [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5'%options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # plain list, so index via comprehension
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,seqs_1hot.shape[1]/4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try: # TEMP
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5'%options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_sat_predict.lua -center_nt %d %s %s %s' % (options.center_nt, model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)


    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    seq_mod_preds = np.array(hdf5_in['seq_mod_preds'])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len)/2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start:delta_start+delta_len]

    # decide which cells to plot
    if options.targets == '-1':
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(',')]


    #################################################################
    # plot
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = 'ACGT'
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = 'seq%d' % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style='white', font_scale=0.5)
            sns.axes_style({'axes.linewidth':1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20,3))
            ax_logo = plt.subplot2grid((3,heat_cols), (0,logo_start), colspan=(logo_end-logo_start))
            ax_sad = plt.subplot2grid((3,heat_cols), (1,sad_start), colspan=(sad_end-sad_start))
            ax_heat = plt.subplot2grid((3,heat_cols), (2,0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75/vlim*(abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75/vlim*(-minmax_matrix[0])
            logo_eps = '%s/%s_c%d_seq.eps' % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = '%s.png' % logo_eps[:-4]
            subprocess.call('convert -density 300 %s %s' % (logo_eps, logo_png), shell=True)
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label='loss', linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label='gain', linewidth=1)
            ax_sad.set_xlim(0,minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ['top','bottom','left','right']:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap='RdBu_r', vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels('TGCA', rotation='horizontal') # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig('%s/%s_c%d_heat.pdf' % (options.out_dir,header.replace(':','_'), ci), dpi=300)
            plt.close()


        #################################################################
        # print table of nt variability for each cell
        #################################################################
        print_targets = plot_targets
        if options.print_table_all:
            print_targets = range(seq_mod_preds.shape[3])

        for ci in print_targets:
            seq_mod_preds_cell = seq_mod_preds[si,:,:,ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start+pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, '\t'.join([str(c) for c in cols])

    table_out.close()
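header_filename is another helper the excerpts leave undefined. Judging from the header.replace(':', '_') call used for the heat-map PDF path, a minimal sketch would sanitize FASTA headers for file names; this is a guess at its intent, not the original definition:

def header_filename(header):
    # hypothetical reconstruction: strip characters awkward in file names
    return header.replace(':', '_').replace('/', '_').replace(' ', '_')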
Example #5
def main():
    usage = 'usage: %prog [options] <model_file> <profile_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='input_activity_file',
        help='Optional activity table corresponding to an input FASTA file')
    parser.add_option(
        '--all',
        dest='all_data',
        default=False,
        action='store_true',
        help=
        'Search all training/valid/test sequences. By default we search only the test set. [Default: %default]'
    )
    parser.add_option('--cuda',
                      dest='cuda',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU [Default: %default]')
    parser.add_option('--cudnn',
                      dest='cudnn',
                      default=False,
                      action='store_true',
                      help='Run on GPGPU w/cuDNN [Default: %default]')
    parser.add_option(
        '-d',
        dest='model_out_file',
        default=None,
        help='Pre-computed model predictions output table [Default: %default]')
    parser.add_option(
        '-e',
        dest='norm_even',
        default=False,
        action='store_true',
        help=
        'Normalize the weights for the positive and negative datasets to be even [Default: %default]'
    )
    parser.add_option('-f',
                      dest='font_heat',
                      default=6,
                      type='int',
                      help='Heat map axis font size [Default: %default]')
    parser.add_option('-n',
                      dest='num_dissect',
                      default=10,
                      type='int',
                      help='Dissect the top n hits [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='profile',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '-r',
        dest='norm_preds',
        default=False,
        action='store_true',
        help='Normalize predictions to have equal frequency [Default: %default]'
    )
    parser.add_option(
        '-z',
        dest='weight_zero',
        default=1.0,
        type='float',
        help=
        'Adjust the weights for the zero samples by this value [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)'
        )
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = '%s/model_in.h5' % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file,
                options.input_activity_file,
                mean_norm=False,
                whiten=False,
                permute=False,
                sort=False)

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape(
            (seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')

            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            seq_headers = np.array(
                [h.decode('UTF-8') for h in hdf5_in['test_headers']])

            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ''
    if options.cudnn:
        gpgpu_str = '-cudnn'
    elif options.cuda:
        gpgpu_str = '-cuda'

    if options.model_out_file is None:
        options.model_out_file = '%s/preds.txt' % options.out_dir

        torch_cmd = 'basset_predict.lua -mc_n 10 -rc %s %s %s %s' % (
            gpgpu_str, model_file, model_input_hdf5, options.model_out_file)
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero)

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save('%s/pred_means' % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask],
                              weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            ratio_ti = pred_means[ti] / aim_mean
            if profile_mask[ti] and (ratio_ti < 1 / 4 or ratio_ti > 4):
                print(
                    'WARNING: target %d with mean %.4f differs 4-fold from the weighted mean %.3f'
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr)
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti],
                                      aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_clust.pdf' % options.out_dir)
    plt.close()

    # dimension reduction
    # model_pca = PCA(n_components=50)
    # spp_pca = model.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c='black', s=5)
    target_labels_prof_concise = [
        tl.split(':')[-1] for tl in target_labels[profile_mask]
    ]
    for label, x, y, activity in zip(target_labels_prof_concise, spp_dr[:, 0],
                                     spp_dr[:, 1],
                                     activity_profile[profile_mask]):
        plt.annotate(label,
                     xy=(x, y),
                     size=10,
                     color=sns.color_palette('deep')[int(activity)])
    plt.savefig('%s/dim_red.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(activity_profile[profile_mask],
                      seqs_preds[si, profile_mask],
                      sample_weight=profile_weights[profile_mask])
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] -
                             activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open('%s/table.txt' % options.out_dir, 'w')
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(
            seqs_preds[si, profile_mask])
        print('\t'.join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
                       col_cluster=False,
                       metric='cosine',
                       linewidths=0,
                       yticklabels=target_labels[profile_mask],
                       xticklabels=False)
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig('%s/heat_rank.pdf' % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ','.join(
        [str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    if gpgpu_str != '':
        gpgpu_str = '-%s' % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = '%s/seq%d.fa' % (options.out_dir, ni)
        fasta_out = open(fasta_file, 'w')
        print('>%s\n%s' % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = 'basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s' % (
            gpgpu_str, options.out_dir, ni, satmut_targets, model_file,
            fasta_file)
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([
            activity_profile[profile_mask], targets[si, profile_mask],
            seqs_preds_prof[si]
        ])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(np.transpose(heat_mat),
                         yticklabels=target_labels[profile_mask][profile_sort],
                         xticklabels=['Desired', 'Experiment', 'Prediction'])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=-0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig('%s/heat%d.pdf' % (options.out_dir, ni))
        plt.close()
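The ranking above scores each sequence by a weighted log loss between its predictions and the desired binary activity profile. A toy sketch with made-up numbers (three targets, one sequence):

import numpy as np
from sklearn.metrics import log_loss

activity_profile = np.array([1, 0, 1])       # desired activity per target
profile_weights = np.array([1.0, 0.5, 1.0])  # per-target weights

seq_preds = np.array([0.9, 0.2, 0.8])        # one sequence's predictions
sd = log_loss(activity_profile, seq_preds, sample_weight=profile_weights)
print(sd)  # small value: predictions track the desired profile closely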
Example #6
def main():
    usage = "usage: %prog [options] <model_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g",
        dest="gain_height",
        default=False,
        action="store_true",
        help="Nucleotide heights determined by the max of loss and gain [Default: %default]",
    )
    parser.add_option(
        "-m", dest="min_limit", default=0.1, type="float", help="Minimum heatmap limit [Default: %default]"
    )
    parser.add_option(
        "-n",
        dest="center_nt",
        default=200,
        type="int",
        help="Center nt to mutate and plot in the heat map [Default: %default]",
    )
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-s", dest="sample", default=None, type="int", help="Sample sequences from the test set [Default:%default]"
    )
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF file")
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split("\t")

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = [seq_headers[i] for i in sample_i]  # plain list, so index via comprehension
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] / 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            try:  # TEMP
                seq_headers = np.array(hdf5_in["test_headers"])
                target_labels = np.array(hdf5_in["target_labels"])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                if seq_headers is not None:
                    seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = "%s/model_in.h5" % options.out_dir
                h5f = h5py.File(model_input_hdf5, "w")
                h5f.create_dataset("test_in", data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = "%s/model_out.h5" % options.out_dir
        torch_cmd = "basset_sat_predict.lua -center_nt %d %s %s %s" % (
            options.center_nt,
            model_file,
            model_input_hdf5,
            options.model_hdf5_file,
        )
        if subprocess.call(torch_cmd, shell=True):
            message("Error running basset_sat_predict.lua", "error")

    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, "r")
    seq_mod_preds = np.array(hdf5_in["seq_mod_preds"])
    hdf5_in.close()

    # trim seqs to match seq_mod_preds length
    seq_len = len(seqs[0])
    delta_start = 0
    delta_len = seq_mod_preds.shape[2]
    if delta_len < seq_len:
        delta_start = (seq_len - delta_len) / 2
        for i in range(len(seqs)):
            seqs[i] = seqs[i][delta_start : delta_start + delta_len]

    # decide which cells to plot
    if options.targets == "-1":
        plot_targets = xrange(seq_mod_preds.shape[3])
    else:
        plot_targets = [int(ci) for ci in options.targets.split(",")]

    #################################################################
    # plot
    #################################################################
    table_out = open("%s/table.txt" % options.out_dir, "w")

    rdbu = sns.color_palette("RdBu_r", 10)

    nts = "ACGT"
    for si in range(seq_mod_preds.shape[0]):
        try:
            header = seq_headers[si]
        except TypeError:
            header = "seq%d" % si
        seq = seqs[si]

        # plot some descriptive heatmaps for each individual cell type
        for ci in plot_targets:
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            # compute matrices
            norm_matrix = seq_mod_preds_cell - real_pred_cell
            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)
            minmax_matrix = np.vstack([min_scores - real_pred_cell, max_scores - real_pred_cell])

            # prepare figure
            sns.set(style="white", font_scale=0.5)
            sns.axes_style({"axes.linewidth": 1})
            heat_cols = 400
            sad_start = 1
            sad_end = 323
            logo_start = 0
            logo_end = 324
            fig = plt.figure(figsize=(20, 3))
            ax_logo = plt.subplot2grid((3, heat_cols), (0, logo_start), colspan=(logo_end - logo_start))
            ax_sad = plt.subplot2grid((3, heat_cols), (1, sad_start), colspan=(sad_end - sad_start))
            ax_heat = plt.subplot2grid((3, heat_cols), (2, 0), colspan=heat_cols)

            # print a WebLogo of the sequence
            vlim = max(options.min_limit, abs(minmax_matrix).max())
            if options.gain_height:
                seq_heights = 0.25 + 1.75 / vlim * (abs(minmax_matrix).max(axis=0))
            else:
                seq_heights = 0.25 + 1.75 / vlim * (-minmax_matrix[0])
            logo_eps = "%s/%s_c%d_seq.eps" % (options.out_dir, header_filename(header), ci)
            seq_logo(seq, seq_heights, logo_eps)

            # add to figure
            logo_png = "%s.png" % logo_eps[:-4]
            logo_cmd = "convert -density 300 %s %s" % (logo_eps, logo_png)
            if subprocess.call(logo_cmd, shell=True):
                message("Error running convert", "error")
            logo = Image.open(logo_png)
            ax_logo.imshow(logo)
            ax_logo.set_axis_off()

            # plot loss and gain SAD scores
            ax_sad.plot(-minmax_matrix[0], c=rdbu[0], label="loss", linewidth=1)
            ax_sad.plot(minmax_matrix[1], c=rdbu[-1], label="gain", linewidth=1)
            ax_sad.set_xlim(0, minmax_matrix.shape[1])
            ax_sad.legend()
            # ax_sad.grid(True, linestyle=':')
            for axis in ["top", "bottom", "left", "right"]:
                ax_sad.spines[axis].set_linewidth(0.5)

            # plot real-normalized scores
            vlim = max(options.min_limit, abs(norm_matrix).max())
            sns.heatmap(norm_matrix, linewidths=0, cmap="RdBu_r", vmin=-vlim, vmax=vlim, xticklabels=False, ax=ax_heat)
            ax_heat.yaxis.set_ticklabels("TGCA", rotation="horizontal")  # , size=10)

            # save final figure
            plt.tight_layout()
            plt.savefig("%s/%s_c%d_heat.pdf" % (options.out_dir, header.replace(":", "_"), ci), dpi=300)
            plt.close()

        #################################################################
        # print table of nt variability for each cell
        #################################################################
        for ci in range(seq_mod_preds.shape[3]):
            seq_mod_preds_cell = seq_mod_preds[si, :, :, ci]
            real_pred_cell = get_real_pred(seq_mod_preds_cell, seq)

            min_scores = seq_mod_preds_cell.min(axis=0)
            max_scores = seq_mod_preds_cell.max(axis=0)

            loss_matrix = real_pred_cell - seq_mod_preds_cell.min(axis=0)
            gain_matrix = seq_mod_preds_cell.max(axis=0) - real_pred_cell

            for pos in range(seq_mod_preds_cell.shape[1]):
                cols = [header, delta_start + pos, ci, loss_matrix[pos], gain_matrix[pos]]
                print >> table_out, "\t".join([str(c) for c in cols])

    table_out.close()
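This variant reports subprocess failures through a message() helper that the excerpt does not define. A minimal sketch, assuming 'error' messages should abort the script:

import sys

def message(text, msg_type='status'):
    # hypothetical reconstruction: report to stderr, exit on errors
    sys.stderr.write('%s: %s\n' % (msg_type.upper(), text))
    if msg_type == 'error':
        sys.exit(1)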
Example #7
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file> <out_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_features_file', default=None, help='Table of additional features')
    parser.add_option('-b', dest='batch_size', default=None, type='int', help='Align sizes with batch size')
    parser.add_option('-c', dest='counts', default=False, action='store_true', help='Validation and training proportions are given as raw counts [Default: %default]')
    parser.add_option('-e', dest='extend_length', type='int', default=None, help='Extend all sequences to this length [Default: %default]')
    parser.add_option('-r', dest='permute', default=False, action='store_true', help='Permute sequences [Default: %default]')
    parser.add_option('-s', dest='random_seed', default=1, type='int', help='numpy.random seed [Default: %default]')
    parser.add_option('-t', dest='test_pct', default=0, type='float', help='Test % [Default: %default]')
    parser.add_option('-v', dest='valid_pct', default=0, type='float', help='Validation % [Default: %default]')
    parser.add_option('--vt', dest='valid_test', default=False, action='store_true', help='Use validation as test, too [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide fasta file, targets file, and an output prefix')
    else:
        fasta_file = args[0]
        targets_file = args[1]
        out_file = args[2]

    # seed rng before shuffle
    npr.seed(options.random_seed)

    #################################################################
    # load data
    #################################################################
    seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file, extend_len=options.extend_length, mean_norm=False, whiten=False, permute=False, sort=False)

    # reshape sequences for torch
    seqs = seqs.reshape((seqs.shape[0],4,1,seqs.shape[1]/4))

    # read headers
    headers = []
    for line in open(fasta_file):
        if line[0] == '>':
            headers.append(line[1:].rstrip())
    headers = np.array(headers)

    # read labels
    target_labels = open(targets_file).readline().strip().split('\t')

    # read additional features
    if options.add_features_file:
        df_add = pd.read_table(options.add_features_file, index_col=0)
        df_add = df_add.astype(np.float32, copy=False)

    # permute
    if options.permute:
        order = npr.permutation(seqs.shape[0])
        seqs = seqs[order]
        targets = targets[order]
        headers = headers[order]
        if options.add_features_file:
            df_add = df_add.iloc[order]

    # check proper sum
    if options.counts:
        assert(options.test_pct + options.valid_pct <= seqs.shape[0])
    else:
        assert(options.test_pct + options.valid_pct <= 1.0)

    #################################################################
    # divide data
    #################################################################
    if options.counts:
        test_count = int(options.test_pct)
        valid_count = int(options.valid_pct)
    else:
        test_count = int(0.5 + options.test_pct * seqs.shape[0])
        valid_count = int(0.5 + options.valid_pct * seqs.shape[0])

    train_count = seqs.shape[0] - test_count - valid_count
    train_count = batch_round(train_count, options.batch_size)
    print >> sys.stderr, '%d training sequences ' % train_count

    test_count = batch_round(test_count, options.batch_size)
    print >> sys.stderr, '%d test sequences ' % test_count

    valid_count = batch_round(valid_count, options.batch_size)
    print >> sys.stderr, '%d validation sequences ' % valid_count

    i = 0
    train_seqs, train_targets = seqs[i:i+train_count,:], targets[i:i+train_count,:]
    i += train_count
    valid_seqs, valid_targets, valid_headers = seqs[i:i+valid_count,:], targets[i:i+valid_count,:], headers[i:i+valid_count]
    i += valid_count
    test_seqs, test_targets, test_headers = seqs[i:i+test_count,:], targets[i:i+test_count,:], headers[i:i+test_count]

    if options.add_features_file:
        i = 0
        train_add = df_add.iloc[i:i+train_count]
        i += train_count
        valid_add = df_add.iloc[i:i+valid_count]
        i += valid_count
        test_add = df_add.iloc[i:i+test_count]

    #################################################################
    # construct hdf5 representation
    #################################################################
    h5f = h5py.File(out_file, 'w')

    h5f.create_dataset('target_labels', data=target_labels)

    if train_count > 0:
        h5f.create_dataset('train_in', data=train_seqs)
        h5f.create_dataset('train_out', data=train_targets)

    if valid_count > 0:
        h5f.create_dataset('valid_in', data=valid_seqs)
        h5f.create_dataset('valid_out', data=valid_targets)

    if test_count > 0:
        h5f.create_dataset('test_in', data=test_seqs)
        h5f.create_dataset('test_out', data=test_targets)
        h5f.create_dataset('test_headers', data=test_headers)
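    # no test partition requested: optionally reuse the validation set as test data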
    elif options.valid_test:
        h5f.create_dataset('test_in', data=valid_seqs)
        h5f.create_dataset('test_out', data=valid_targets)
        h5f.create_dataset('test_headers', data=valid_headers)

    if options.add_features_file:
        h5f.create_dataset('add_labels', data=list(df_add.columns))

        if train_count > 0:
            h5f.create_dataset('train_add', data=train_add.values)
        if valid_count > 0:
            h5f.create_dataset('valid_add', data=valid_add.values)
        if test_count > 0:
            h5f.create_dataset('test_add', data=test_add.values)
        elif options.valid_test:
            h5f.create_dataset('test_add', data=valid_add.values)

    h5f.close()
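Note: batch_round is defined elsewhere in this script. A minimal sketch consistent with its use above (trimming each partition down to a whole number of batches, with batch_size possibly None) might be:

def batch_round(count, batch_size):
    # hypothetical helper: trim the partition to a whole number of batches
    if batch_size is not None:
        count -= count % batch_size
    return count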
Example #8
0
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file> <out_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide FASTA file, targets file, and an output prefix')
    else:
        fasta_file = args[0]
        targets_file = args[1]
        out_file = args[2]

    seqs, targets = dna_io.load_data_1hot(fasta_file, targets_file)

    seqs = seqs.reshape((seqs.shape[0], 4, 1, seqs.shape[1] // 4))

    target_labels = open(targets_file).readline().strip().split('\t')

    # shuffle sequences and targets together
    order = npr.permutation(seqs.shape[0])
    seqs = seqs[order]
    targets = targets[order]

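    # split into 20 chunks for 20-fold cross-validation; the last chunk absorbs the remainder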
    seqsnum1 = int(seqs.shape[0] / 20)
    x = 0

    kseqs = []
    ktargets = []
    for i in range(19):
        kseqs.append(seqs[x:x + seqsnum1, :].tolist())
        ktargets.append(targets[x:x + seqsnum1, :].tolist())
        x += seqsnum1

    kseqs.append(seqs[x:seqs.shape[0], :].tolist())
    ktargets.append(targets[x:seqs.shape[0], :].tolist())

    for i in range(20):
        name = out_file + str(i + 1) + '.h5'
        valid_seqs = kseqs[i]
        valid_targets = ktargets[i]

        # train on every chunk except the held-out validation chunk
        train_seqs = []
        train_targets = []
        for j in range(20):
            if j != i:
                train_seqs += kseqs[j]
                train_targets += ktargets[j]
        h5f = h5py.File(name, 'w')
        h5f.create_dataset('target_labels', data=target_labels)
        h5f.create_dataset('train_in', data=train_seqs)
        h5f.create_dataset('train_out', data=train_targets)
        h5f.create_dataset('valid_in', data=valid_seqs)
        h5f.create_dataset('valid_out', data=valid_targets)
        h5f.close()
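The loop above writes 20 fold files named <out_file>1.h5 through <out_file>20.h5. A minimal sketch of reading the folds back (assuming only h5py and the naming scheme above, with out_file the same prefix):

import h5py

for i in range(1, 21):
    with h5py.File(out_file + str(i) + '.h5', 'r') as h5f:
        train_in, train_out = h5f['train_in'][:], h5f['train_out'][:]
        valid_in, valid_out = h5f['valid_in'][:], h5f['valid_out'][:]
        # train on (train_in, train_out), validate on (valid_in, valid_out)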
Example #9
0
def main():
    usage = 'usage: %prog [options] <model_file> <input_file>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='input_activity_file', help='Optional activity table corresponding to an input FASTA file')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='rng_seed', default=1, type='float', help='Random number generator seed [Default: %default]')
    parser.add_option('-s', dest='sample', default=None, type='int', help='Sample sequences from the test set [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide Basset model file and input sequences (as a FASTA file or test data in an HDF5 file)')
    else:
        model_file = args[0]
        input_file = args[1]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(options.rng_seed)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == '>':
                seq_headers.append(line[1:].rstrip())
                seqs.append('')
            else:
                seqs[-1] += line.rstrip()

        model_input_hdf5 = '%s/model_in.h5'%options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False)

            # read in target names
            target_labels = open(options.input_activity_file).readline().strip().split('\t')

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None
            target_labels = None

        # sample
        if options.sample:
            sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
            seqs_1hot = seqs_1hot[sample_i]
            # seq_headers is a plain list here, so index it element-wise
            seq_headers = [seq_headers[i] for i in sample_i]
            if targets is not None:
                targets = targets[sample_i]

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, 'w')
        h5f.create_dataset('test_in', data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, 'r')
            seqs_1hot = np.array(hdf5_in['test_in'])
            targets = np.array(hdf5_in['test_out'])
            try:  # older files may lack these optional datasets
                seq_headers = np.array(hdf5_in['test_headers'])
                target_labels = np.array(hdf5_in['target_labels'])
            except KeyError:
                seq_headers = None
                target_labels = None
            hdf5_in.close()

            # sample
            if options.sample:
                sample_i = np.array(random.sample(xrange(seqs_1hot.shape[0]), options.sample))
                seqs_1hot = seqs_1hot[sample_i]
                seq_headers = seq_headers[sample_i]
                targets = targets[sample_i]

                # write sampled data to a new HDF5 file
                model_input_hdf5 = '%s/model_in.h5'%options.out_dir
                h5f = h5py.File(model_input_hdf5, 'w')
                h5f.create_dataset('test_in', data=seqs_1hot)
                h5f.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error('Could not parse input file as FASTA or HDF5.')


    #################################################################
    # Torch predict modifications
    #################################################################
    if options.model_hdf5_file is None:
        options.model_hdf5_file = '%s/model_out.h5' % options.out_dir
        torch_cmd = 'basset_net_predict.lua %s %s %s' % (model_file, model_input_hdf5, options.model_hdf5_file)
        print torch_cmd
        subprocess.call(torch_cmd, shell=True)


    #################################################################
    # load modification predictions
    #################################################################
    hdf5_in = h5py.File(options.model_hdf5_file, 'r')
    reprs = []
    l = 1
    while 'reprs%d'%l in hdf5_in.keys():
        reprs.append(np.array(hdf5_in['reprs%d'%l]))
        l += 1
    hdf5_in.close()


    #################################################################
    # plot
    #################################################################
    print len(reprs)
    for l in range(len(reprs)):
        for si in range(len(seq_headers)):
            plt.figure()

            # TODO: annotate the figure with the sequence itself
            # (or an influence-weighted version of it)

            print reprs[l][si].shape
            sns.heatmap(reprs[l][si], linewidths=0, xticklabels=False)
            plt.savefig('%s/%s_l%d.pdf' % (options.out_dir, header_filename(seq_headers[si]), l))
            plt.close()
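Note: header_filename is not defined in this excerpt; it presumably turns a FASTA header into a filesystem-safe name. A plausible sketch (the exact sanitization rule is an assumption):

import re

def header_filename(header):
    # hypothetical helper: keep the first token of the header and
    # replace filesystem-unsafe characters
    return re.sub(r'[^A-Za-z0-9._-]', '_', header.split()[0])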
Example #10
0
def main():
    usage = "usage: %prog [options] <model_file> <profile_file> <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a", dest="input_activity_file", help="Optional activity table corresponding to an input FASTA file"
    )
    parser.add_option(
        "--all",
        dest="all_data",
        default=False,
        action="store_true",
        help="Search all training/valid/test sequences. By default we search only the test set. [Default: %default]",
    )
    parser.add_option(
        "--cuda", dest="cuda", default=False, action="store_true", help="Run on GPGPU [Default: %default]"
    )
    parser.add_option(
        "--cudnn", dest="cudnn", default=False, action="store_true", help="Run on GPGPU w/cuDNN [Default: %default]"
    )
    parser.add_option(
        "-d",
        dest="model_out_file",
        default=None,
        help="Pre-computed model predictions output table [Default: %default]",
    )
    parser.add_option(
        "-e",
        dest="norm_even",
        default=False,
        action="store_true",
        help="Normalize the weights for the positive and negative datasets to be even [Default: %default]",
    )
    parser.add_option("-f", dest="font_heat", default=6, type="int", help="Heat map axis font size [Default: %default]")
    parser.add_option(
        "-n", dest="num_dissect", default=10, type="int", help="Dissect the top n hits [Default: %default]"
    )
    parser.add_option("-o", dest="out_dir", default="profile", help="Output directory [Default: %default]")
    parser.add_option(
        "-r",
        dest="norm_preds",
        default=False,
        action="store_true",
        help="Normalize predictions to have equal frequency [Default: %default]",
    )
    parser.add_option(
        "-z",
        dest="weight_zero",
        default=1.0,
        type="float",
        help="Adjust the weights for the zero samples by this value [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            "Must provide Basset model file, activity profile file, and input sequences (as a FASTA file or test data in an HDF file)"
        )
    else:
        model_file = args[0]
        profile_file = args[1]
        input_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #################################################################
    # parse input file
    #################################################################
    try:
        # input_file is FASTA

        # load sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        model_input_hdf5 = "%s/model_in.h5" % options.out_dir

        if options.input_activity_file:
            # one hot code
            seqs_1hot, targets = dna_io.load_data_1hot(
                input_file, options.input_activity_file, mean_norm=False, whiten=False, permute=False, sort=False
            )

        else:
            # load sequences
            seqs_1hot = dna_io.load_sequences(input_file, permute=False)
            targets = None

        # reshape sequences for torch
        seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, seqs_1hot.shape[1] // 4))

        # write as test data to a HDF5 file
        h5f = h5py.File(model_input_hdf5, "w")
        h5f.create_dataset("test_in", data=seqs_1hot)
        h5f.close()

    except (IOError, IndexError, UnicodeDecodeError):
        # input_file is HDF5

        try:
            model_input_hdf5 = input_file

            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")

            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            seq_headers = np.array([h.decode("UTF-8") for h in hdf5_in["test_headers"]])

            hdf5_in.close()

            # convert to ACGT sequences
            seqs = dna_io.vecs2dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    #################################################################
    # Torch predict modifications
    #################################################################
    # GPU options (needed below, too)
    gpgpu_str = ""
    if options.cudnn:
        gpgpu_str = "-cudnn"
    elif options.cuda:
        gpgpu_str = "-cuda"

    if options.model_out_file is None:
        options.model_out_file = "%s/preds.txt" % options.out_dir

        torch_cmd = "basset_predict.lua -mc_n 10 -rc %s %s %s %s" % (
            gpgpu_str,
            model_file,
            model_input_hdf5,
            options.model_out_file,
        )
        print(torch_cmd)
        subprocess.call(torch_cmd, shell=True)

    # read in predictions
    seqs_preds = np.loadtxt(options.model_out_file)

    num_targets = seqs_preds.shape[1]

    #################################################################
    # parse profile file
    #################################################################
    activity_profile, profile_weights, profile_mask, target_labels = load_profile(
        profile_file, num_targets, options.norm_even, options.weight_zero
    )

    # normalize predictions
    if options.norm_preds:
        pred_means = seqs_preds.mean(axis=0)

        # save to file for basset_refine.py
        np.save("%s/pred_means" % options.out_dir, pred_means)

        # aim for profile weighted average
        aim_mean = np.average(pred_means[profile_mask], weights=profile_weights[profile_mask])

        # normalize
        for ti in range(seqs_preds.shape[1]):
            ratio_ti = pred_means[ti] / aim_mean
            if profile_mask[ti] and (ratio_ti < 1 / 4 or ratio_ti > 4):
                print(
                    "WARNING: target %d mean %.4f differs more than 4-fold from the profile mean %.3f"
                    % (ti, pred_means[ti], aim_mean),
                    file=sys.stderr,
                )
            seqs_preds[:, ti] = znorm(seqs_preds[:, ti], pred_means[ti], aim_mean)

    #################################################################
    # plot clustered heat map limited to relevant targets
    #################################################################
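    # rank sequences by prediction variance across the profiled targets;
    # the clustermap below shows the 1500 most variable sequences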
    seqs_preds_prof = seqs_preds[:, profile_mask]
    seqs_preds_var = seqs_preds_prof.var(axis=1)
    seqs_sort_var = np.argsort(seqs_preds_var)[::-1]

    # heat map
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_var[:1500]]),
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_clust.pdf" % options.out_dir)
    plt.close()

    # dimension reduction
    # model_pca = PCA(n_components=50)
    # spp_pca = model_pca.fit_transform(np.transpose(seqs_preds_prof))
    # model = TSNE(n_components=2, perplexity=5, metric='euclidean')
    # spp_dr = model.fit_transform(spp_pca)
    model = PCA(n_components=2)
    spp_dr = model.fit_transform(np.transpose(seqs_preds_prof))
    plt.figure()
    plt.scatter(spp_dr[:, 0], spp_dr[:, 1], c="black", s=5)
    target_labels_prof_concise = [tl.split(":")[-1] for tl in target_labels[profile_mask]]
    for label, x, y, activity in zip(
        target_labels_prof_concise, spp_dr[:, 0], spp_dr[:, 1], activity_profile[profile_mask]
    ):
        plt.annotate(label, xy=(x, y), size=10, color=sns.color_palette("deep")[int(activity)])
    plt.savefig("%s/dim_red.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # compute profile distances
    #################################################################
    # compute prediction distances
    seqs_pdists = []
    for si in range(seqs_preds.shape[0]):
        # sd = np.power(seqs_preds[si,profile_mask]-activity_profile[profile_mask], 2).sum()
        sd = log_loss(
            activity_profile[profile_mask], seqs_preds[si, profile_mask], sample_weight=profile_weights[profile_mask]
        )
        seqs_pdists.append(sd)
    seqs_pdists = np.array(seqs_pdists)

    # obtain sorted indexes
    seqs_sort_dist = np.argsort(seqs_pdists)

    # compute target distances
    seqs_tdists = []
    for si in range(seqs_preds.shape[0]):
        tdists = np.absolute(targets[si, profile_mask] - activity_profile[profile_mask])
        tdists_weight = np.multiply(tdists, profile_weights[profile_mask])
        td = tdists_weight.sum()
        seqs_tdists.append(td)
    seqs_tdists = np.array(seqs_tdists)

    # print as table
    table_out = open("%s/table.txt" % options.out_dir, "w")
    for si in seqs_sort_dist:
        cols = [si, seqs_pdists[si], seqs_tdists[si]] + list(seqs_preds[si, profile_mask])
        print("\t".join([str(c) for c in cols]), file=table_out)
    table_out.close()

    #################################################################
    # plot sorted heat map
    #################################################################
    plt.figure()
    g = sns.clustermap(
        np.transpose(seqs_preds_prof[seqs_sort_dist[:1000]]),
        col_cluster=False,
        metric="cosine",
        linewidths=0,
        yticklabels=target_labels[profile_mask],
        xticklabels=False,
    )
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    for label in g.ax_heatmap.yaxis.get_majorticklabels():
        label.set_fontsize(options.font_heat)
    plt.savefig("%s/heat_rank.pdf" % options.out_dir)
    plt.close()

    #################################################################
    # dissect the top hits
    #################################################################
    satmut_targets = ",".join([str(ti) for ti in range(len(activity_profile)) if profile_mask[ti]])

    if gpgpu_str != "":
        gpgpu_str = "-%s" % gpgpu_str

    for ni in range(options.num_dissect):
        si = seqs_sort_dist[ni]

        # print FASTA
        fasta_file = "%s/seq%d.fa" % (options.out_dir, ni)
        fasta_out = open(fasta_file, "w")
        print(">%s\n%s" % (seq_headers[si], seqs[si]), file=fasta_out)
        fasta_out.close()

        # saturated mutagenesis
        cmd = "basset_sat.py %s --mc_n 10 -n 500 -o %s/satmut%d -t %s %s %s" % (
            gpgpu_str,
            options.out_dir,
            ni,
            satmut_targets,
            model_file,
            fasta_file,
        )
        subprocess.call(cmd, shell=True)

        # predictions and targets heat
        profile_sort = np.argsort(activity_profile[profile_mask])
        heat_mat = np.array([activity_profile[profile_mask], targets[si, profile_mask], seqs_preds_prof[si]])
        heat_mat = heat_mat[:, profile_sort]

        plt.figure()
        ax = sns.heatmap(
            np.transpose(heat_mat),
            yticklabels=target_labels[profile_mask][profile_sort],
            xticklabels=["Desired", "Experiment", "Prediction"],
        )
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=-45)
        plt.setp(ax.yaxis.get_majorticklabels(), rotation=-0)
        for label in ax.yaxis.get_majorticklabels():
            label.set_fontsize(options.font_heat)
        plt.savefig("%s/heat%d.pdf" % (options.out_dir, ni))
        plt.close()
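Note: load_profile and znorm are helpers defined elsewhere in the source script. The sketches below are plausible reconstructions from how they are used above; the profile file format and the exact normalization rule are assumptions, not the real Basset implementations.

import numpy as np

def load_profile(profile_file, num_targets, norm_even=False, weight_zero=1.0):
    # assumed format: one tab-separated line per profiled target,
    # "target_index<TAB>activity[<TAB>label]"; NaN marks unprofiled targets
    activity_profile = np.full(num_targets, np.nan)
    target_labels = np.array(['t%d' % ti for ti in range(num_targets)], dtype=object)
    for line in open(profile_file):
        a = line.rstrip().split('\t')
        ti = int(a[0])
        activity_profile[ti] = float(a[1])
        if len(a) > 2:
            target_labels[ti] = a[2]
    profile_mask = ~np.isnan(activity_profile)

    # per-target weights: adjust the zero-activity targets by weight_zero,
    # and optionally balance positives against negatives
    profile_weights = np.ones(num_targets)
    profile_weights[activity_profile == 0] = weight_zero
    if norm_even:
        npos = (activity_profile == 1).sum()
        nneg = (activity_profile == 0).sum()
        if npos and nneg:
            profile_weights[activity_profile == 1] *= nneg / float(npos)

    return activity_profile, profile_weights, profile_mask, target_labels

def znorm(preds, pred_mean, aim_mean):
    # hypothetical normalization: rescale so this target's mean prediction
    # matches the profile-wide aim mean, clipped back into [0, 1]
    return np.clip(preds * (aim_mean / pred_mean), 0, 1)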