Example #1
0
def score_reads(k, readsf, par):
    """Score the reads file against each of the k cluster ICMs in parallel.

    Builds one simple-score shell command per cluster (output goes to
    icm-<c>.scores.tmp, stderr discarded) and hands the batch to
    util.exec_par with parallelism `par`.
    """
    cmd_fmt = '%s/simple-score -N cluster-%d.icm < %s > icm-%d.scores.tmp 2>/dev/null'
    cmds = [cmd_fmt % (bin_dir, ci, readsf, ci) for ci in range(k)]
    util.exec_par(cmds, par)
Example #2
0
def train_imm(k, soft_assign, par):
    """Train an ICM per cluster, in parallel.

    With soft_assign, em_build-icm consumes cluster-<i>.build.fa; otherwise
    build-icm consumes cluster-<i>.fa. Commands run via util.exec_par
    with parallelism `par`.
    """
    cmds = []
    for ci in range(k):
        if soft_assign:
            template = '%s/em_build-icm -p 1 cluster-%d.icm < cluster-%d.build.fa'
        else:
            template = '%s/build-icm -p 1 cluster-%d.icm < cluster-%d.fa'
        cmds.append(template % (bin_dir, ci, ci))

    util.exec_par(cmds, par)
Example #3
0
def main():
    """Build model training data from a FASTA genome and a coverage-targets table.

    Pipeline: define genomic contigs (minus assembly gaps), split them into
    train/valid/test sets or cross-validation folds, stride fixed-length model
    sequences across each contig, optionally annotate and filter unmappable
    regions, read per-target coverage into per-target HDF5 files (locally or
    via SLURM), write TFRecords per fold, and dump statistics.json.
    """
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=786432,
        type='int',
        help='Break in half contigs above length [Default: %default]')
    parser.add_option('-c',
                      '--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='folds',
                      default=None,
                      type='int',
                      help='Generate cross fold split [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-i',
                      dest='interp_nan',
                      default=False,
                      action='store_true',
                      help='Interpolate NaNs [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number parallel processes [Default: %default]')
    parser.add_option(
        '--peaks',
        dest='peaks_only',
        default=False,
        action='store_true',
        help='Create contigs only from peaks [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '--restart',
        dest='restart',
        default=False,
        action='store_true',
        help='Continue progress from midpoint. [Default: %default]')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--snap',
        dest='snap',
        default=1,
        type='int',
        help='Snap sequences to multiple of the given value [Default: %default]'
    )
    parser.add_option('--st',
                      '--split_test',
                      dest='split_test',
                      default=False,
                      action='store_true',
                      help='Exit after split. [Default: %default]')
    parser.add_option(
        '--stride',
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    # -t/-v are strings so they can hold either a proportion ("0.05") or a
    # comma-separated chromosome list ("chr1,chr2"); disambiguated below.
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.5,
        type='float',
        help=
        'Remove sequences with more than this unmappable bin % [Default: %default]'
    )
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for validation [Default: %default]')
    parser.add_option('--norm',
                      dest='norm',
                      default='',
                      type='str',
                      help='Normalize coverage values')
    parser.add_option('--step',
                      dest='step',
                      default=0,
                      type='int',
                      help='Stride using bp size [Default: %pool_window]')
    parser.add_option('--padding',
                      dest='padding',
                      default='valid',
                      type='str',
                      help='Padding method for sliding window approach')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide FASTA and sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    # seed both RNGs so shuffles/down-sampling are reproducible
    random.seed(options.seed)
    np.random.seed(options.seed)

    if options.break_t is not None and options.break_t < options.seq_length:
        print(
            'Maximum contig length --break cannot be less than sequence length.',
            file=sys.stderr)
        exit(1)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print('stride_train %.f' % options.stride_train, end='')
        options.stride_train = options.stride_train * options.seq_length
        print(' converted to %f' % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        if options.folds is None:
            print('stride_test %.f' % options.stride_test, end='')
            options.stride_test = options.stride_test * options.seq_length
            print(' converted to %f' % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))

    # check snap
    if options.snap is not None:
        if np.mod(options.seq_length, options.snap) != 0:
            raise ValueError('seq_length must be a multiple of snap')
        if np.mod(options.stride_train, options.snap) != 0:
            raise ValueError('stride_train must be a multiple of snap')
        if np.mod(options.stride_test, options.snap) != 0:
            raise ValueError('stride_test must be a multiple of snap')

    # setup output directory
    if os.path.isdir(options.out_dir) and not options.restart:
        print('Remove output directory %s or use --restart option.' %
              options.out_dir)
        exit(1)
    elif not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

    ################################################################
    # define genomic contigs
    ################################################################
    if not options.restart:
        chrom_contigs = genome.load_chromosomes(fasta_file)

        # remove gaps
        if options.gaps_file:
            chrom_contigs = genome.split_contigs(chrom_contigs,
                                                 options.gaps_file)

        # ditch the chromosomes for contigs
        # (skip alt/random scaffolds — names containing '_' — and chrM)
        contigs = []
        for chrom in chrom_contigs:
            if len(chrom.split('_')) == 1 and chrom != 'chrM':
                contigs += [
                    Contig(chrom, ctg_start, ctg_end)
                    for ctg_start, ctg_end in chrom_contigs[chrom]
                ]

        # limit to a BED file
        if options.limit_bed is not None:
            contigs = limit_contigs(contigs, options.limit_bed)

        # limit to peaks
        if options.peaks_only:
            peaks_bed = curate_peaks(targets_df, options.out_dir,
                                     options.pool_width, options.crop_bp)
            contigs = limit_contigs(contigs, peaks_bed)

        # filter for large enough
        contigs = [
            ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
        ]

        # break up large contigs
        if options.break_t is not None:
            contigs = break_large_contigs(contigs, options.break_t)

        # print contigs to BED file
        # ctg_bed_file = '%s/contigs.bed' % options.out_dir
        # write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    # label folds
    if options.folds is not None:
        fold_labels = ['fold%d' % fi for fi in range(options.folds)]
        num_folds = options.folds
    else:
        fold_labels = ['train', 'valid', 'test']
        num_folds = 3

    if not options.restart:
        if options.folds is not None:
            # divide by fold pct
            fold_contigs = divide_contigs_folds(contigs, options.folds)

        else:
            try:
                # convert to float pct
                valid_pct = float(options.valid_pct_or_chr)
                test_pct = float(options.test_pct_or_chr)
                assert (0 <= valid_pct <= 1)
                assert (0 <= test_pct <= 1)

                # divide by pct
                fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

            except (ValueError, AssertionError):
                # divide by chr
                valid_chrs = options.valid_pct_or_chr.split(',')
                test_chrs = options.test_pct_or_chr.split(',')
                fold_contigs = divide_contigs_chr(contigs, test_chrs,
                                                  valid_chrs)

        # rejoin broken contigs within set
        for fi in range(len(fold_contigs)):
            fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

        # write labeled contigs to BED file
        ctg_bed_file = '%s/contigs.bed' % options.out_dir
        ctg_bed_out = open(ctg_bed_file, 'w')
        for fi in range(len(fold_contigs)):
            for ctg in fold_contigs[fi]:
                line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end,
                                           fold_labels[fi])
                print(line, file=ctg_bed_out)
        ctg_bed_out.close()

    if options.split_test:
        exit()

    ################################################################
    # define model sequences
    ################################################################
    if not options.restart:
        fold_mseqs = []
        for fi in range(num_folds):
            if fold_labels[fi] in ['valid', 'test']:
                stride_fold = options.stride_test
            else:
                stride_fold = options.stride_train

            # stride sequences across contig
            fold_mseqs_fi = contig_sequences(fold_contigs[fi],
                                             options.seq_length, stride_fold,
                                             options.snap, fold_labels[fi])
            fold_mseqs.append(fold_mseqs_fi)

            # shuffle
            random.shuffle(fold_mseqs[fi])

            # down-sample
            if options.sample_pct < 1.0:
                fold_mseqs[fi] = random.sample(
                    fold_mseqs[fi],
                    int(options.sample_pct * len(fold_mseqs[fi])))

        # merge into one list
        mseqs = [ms for fm in fold_mseqs for ms in fm]

    ################################################################
    # mappability
    ################################################################
    if not options.restart:
        if options.umap_bed is not None:
            if shutil.which('bedtools') is None:
                print('Install Bedtools to annotate unmappable sites',
                      file=sys.stderr)
                exit(1)

            # annotate unmappable positions
            mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                         options.seq_length,
                                         options.pool_width, options.crop_bp)

            # filter unmappable: keep sequences whose unmappable fraction
            # is below the --umap_t threshold
            mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                              options.umap_t)
            mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
            mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

            # write to file
            unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
            np.save(unmap_npy, mseqs_unmap)

        # write sequences to BED
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        write_seqs_bed(seqs_bed_file, mseqs, True)

    else:
        # read from directory (restart path: reconstruct mseqs/fold_mseqs
        # from the sequences.bed written by a previous run)
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        mseqs = []
        fold_mseqs = []
        for fi in range(num_folds):
            fold_mseqs.append([])
        for line in open(seqs_bed_file):
            a = line.split()
            msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
            mseqs.append(msg)
            if a[3] == 'train':
                fi = 0
            elif a[3] == 'valid':
                fi = 1
            elif a[3] == 'test':
                fi = 2
            else:
                fi = int(a[3].replace('fold', ''))
            fold_mseqs[fi].append(msg)

    ################################################################
    # read sequence coverage values
    ################################################################
    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        # optional per-target clip / soft-clip / scale columns
        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        clipsoft_ti = None
        if 'clip_soft' in targets_df.columns:
            clipsoft_ti = targets_df['clip_soft'].iloc[ti]

        scale_ti = 1
        if 'scale' in targets_df.columns:
            scale_ti = targets_df['scale'].iloc[ti]

        if options.restart and os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            # NOTE(review): absolute, machine-specific script path — consider
            # making this configurable or resolving it relative to the install.
            cmd = '/home/shush/profile/tfprofile/bin/basenji_data_read.py'
            cmd += ' --crop %d' % options.crop_bp
            cmd += ' -w %d' % options.pool_width
            cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
            if clip_ti is not None:
                cmd += ' -c %f' % clip_ti
            if clipsoft_ti is not None:
                cmd += ' --clip_soft %f' % clipsoft_ti
            cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            if options.interp_nan:
                cmd += ' -i'
            if options.norm:
                cmd += ' --norm %s' % options.norm
            if options.step:
                cmd += ' --step %i' % options.step
            if options.padding:
                cmd += ' --padding %s' % options.padding
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for fold_set in fold_labels:
        fold_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == fold_set
        ]
        # NOTE(review): assumes every fold label has at least one sequence;
        # an empty fold raises IndexError here — confirm upstream guarantees.
        fold_set_start = fold_set_indexes[0]
        fold_set_end = fold_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = fold_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

        # NOTE(review): '<=' permits a final chunk with tfr_start == fold_set_end
        # (an empty start/end range) — confirm basenji_data_write.py tolerates it.
        while tfr_start <= fold_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

            cmd = '/home/shush/profile/tfprofile/bin/basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end
            cmd += ' --umap_clip %f' % options.umap_clip
            if options.umap_tfr:
                cmd += ' --umap_tfr'
            if options.umap_bed is not None:
                cmd += ' -u %s' % unmap_npy

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (fold_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # stats
    ################################################################
    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['seq_length'] = options.seq_length
    stats_dict['pool_width'] = options.pool_width
    stats_dict['crop_bp'] = options.crop_bp

    # model target length: sequence minus both crops, pooled
    target_length = options.seq_length - 2 * options.crop_bp
    target_length = target_length // options.pool_width
    stats_dict['target_length'] = target_length

    for fi in range(num_folds):
        stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

    for i in range(10):
        print('~~~')
    print('%s/statistics.json' % options.out_dir)
    for i in range(10):
        print('~~~')
    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
Example #4
0
def score_reads(k, readsf, par):
    """Score readsf against each of the k cluster ICMs, running jobs in parallel.

    One simple-score command per cluster; per-cluster output lands in
    icm-<i>.scores.tmp and stderr is discarded.
    """
    score_cmds = [
        '%s/simple-score -N cluster-%d.icm < %s > icm-%d.scores.tmp 2>/dev/null'
        % (bin_dir, i, readsf, i)
        for i in range(k)
    ]

    util.exec_par(score_cmds, par)
Example #5
0
def main():
    """Run saturation mutagenesis across cross-fold model replicates.

    For each cross/fold replicate: launch basenji_sat_bed(.py/_multi.py) to
    produce scores.h5, ensemble the per-fold scores, fit PhyloP regressors via
    basenji_bench_phylop.py (SLURM), then print PearsonR/R2 summaries and,
    when a reference directory is given, statistical comparisons against it.
    """
    usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir> <bed_file>'
    parser = OptionParser(usage)

    # sat options
    sat_options = OptionGroup(parser, 'basenji_sat_bed.py options')
    sat_options.add_option(
        '-d',
        dest='mut_down',
        default=0,
        type='int',
        help=
        'Nucleotides downstream of center sequence to mutate [Default: %default]'
    )
    sat_options.add_option(
        '-f',
        dest='genome_fasta',
        default=None,
        help='Genome FASTA for sequences [Default: %default]')
    sat_options.add_option(
        '-l',
        dest='mut_len',
        default=0,
        type='int',
        help='Length of center sequence to mutate [Default: %default]')
    sat_options.add_option('-o',
                           dest='out_dir',
                           default='sat_mut',
                           help='Output directory [Default: %default]')
    sat_options.add_option('--plots',
                           dest='plots',
                           default=False,
                           action='store_true',
                           help='Make heatmap plots [Default: %default]')
    sat_options.add_option('-p',
                           dest='processes',
                           default=None,
                           type='int',
                           help='Number of processes, passed by multi script')
    sat_options.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    sat_options.add_option(
        '--shifts',
        dest='shifts',
        default='0',
        help='Ensemble prediction shifts [Default: %default]')
    sat_options.add_option(
        '--stats',
        dest='sad_stats',
        default='sum',
        help='Comma-separated list of stats to save. [Default: %default]')
    sat_options.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    sat_options.add_option(
        '-u',
        dest='mut_up',
        default=0,
        type='int',
        help=
        'Nucleotides upstream of center sequence to mutate [Default: %default]'
    )
    parser.add_option_group(sat_options)

    phylop_options = OptionGroup(parser, 'basenji_bench_phylop.py options')
    # phylop_options.add_option('-e', dest='num_estimators',
    #   default=100, type='int',
    #   help='Number of random forest estimators [Default: %default]')
    phylop_options.add_option(
        '-g',
        dest='genome',
        default='ce11',
        help='PhyloP and FASTA genome [Default: %default]')
    # phylop_options.add_option('--pca', dest='n_components',
    #   default=None, type='int',
    #   help='PCA n_components [Default: %default]')
    parser.add_option_group(phylop_options)

    fold_options = OptionGroup(parser, 'cross-fold options')
    fold_options.add_option(
        '-a',
        '--alt',
        dest='alternative',
        default='two-sided',
        help='Statistical test alternative [Default: %default]')
    fold_options.add_option(
        '-c',
        dest='crosses',
        default=1,
        type='int',
        help='Number of cross-fold rounds [Default:%default]')
    fold_options.add_option('-e',
                            dest='conda_env',
                            default='tf2.4',
                            help='Anaconda environment [Default: %default]')
    fold_options.add_option('--label_exp',
                            dest='label_exp',
                            default='Experiment',
                            help='Experiment label [Default: %default]')
    fold_options.add_option('--label_ref',
                            dest='label_ref',
                            default='Reference',
                            help='Reference label [Default: %default]')
    fold_options.add_option(
        '--max_proc',
        dest='max_proc',
        default=None,
        type='int',
        help='Maximum concurrent processes [Default: %default]')
    fold_options.add_option('--name',
                            dest='name',
                            default='sat',
                            help='SLURM name prefix [Default: %default]')
    fold_options.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    fold_options.add_option('-r',
                            dest='ref_dir',
                            default=None,
                            help='Reference directory for statistical tests')
    parser.add_option_group(fold_options)

    (options, args) = parser.parse_args()

    if len(args) != 4:
        # NOTE(review): message names only two of the four required arguments
        # (<exp_dir> and <bed_file> are omitted) — consider updating the text.
        parser.error('Must provide parameters file and data directory')
    else:
        exp_dir = args[0]
        params_file = args[1]
        data_dir = args[2]
        bed_file = args[3]

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # count folds (statistics.json keys like 'fold0_seqs')
    num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

    # genome FASTA located via an environment variable named after the
    # genome (e.g. CE11); raises KeyError if the variable is unset
    genome_path = os.environ[options.genome.upper()]
    options.genome_fasta = '%s/assembly/%s.fa' % (genome_path, options.genome)

    ################################################################
    # saturation mutagenesis
    ################################################################
    jobs = []
    scores_files = []

    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)
            name = '%s-f%dc%d' % (options.name, fi, ci)

            # update output directory
            sat_dir = '%s/%s' % (it_dir, options.out_dir)

            # check if done
            scores_file = '%s/scores.h5' % sat_dir
            scores_files.append(scores_file)
            if os.path.isfile(scores_file):
                print('%s already generated.' % scores_file)
            else:
                basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                basenji_cmd += ' conda activate %s;' % options.conda_env
                basenji_cmd += ' echo $HOSTNAME;'

                # NOTE(review): options.processes defaults to None, and
                # 'None > 1' raises TypeError on Python 3 — confirm -p is
                # always supplied; same for max_proc with '//' below.
                if options.processes > 1:
                    basenji_cmd += ' basenji_sat_bed_multi.py'
                    basenji_cmd += ' --max_proc %d' % (options.max_proc //
                                                       num_folds)
                    basenji_cmd += ' -q %s' % options.queue
                    basenji_cmd += ' -n %s' % name
                    basenji_cmd += ' -r'
                else:
                    basenji_cmd += ' basenji_sat_bed.py'

                basenji_cmd += ' %s' % options_string(options, sat_options,
                                                      sat_dir)
                basenji_cmd += ' %s' % params_file
                basenji_cmd += ' %s/train/model_best.h5' % it_dir
                basenji_cmd += ' %s' % bed_file

                if options.processes > 1:
                    jobs.append(basenji_cmd)
                else:
                    basenji_job = slurm.Job(basenji_cmd,
                                            name,
                                            out_file='%s.out' % sat_dir,
                                            err_file='%s.err' % sat_dir,
                                            cpu=2,
                                            gpu=1,
                                            queue=options.queue,
                                            mem=30000,
                                            time='28-0:00:00')
                    jobs.append(basenji_job)

    if options.processes > 1:
        util.exec_par(jobs, verbose=True)
    else:
        slurm.multi_run(jobs, verbose=True)

    ################################################################
    # ensemble
    ################################################################
    ensemble_dir = '%s/ensemble' % exp_dir
    if not os.path.isdir(ensemble_dir):
        os.mkdir(ensemble_dir)

    sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
    if not os.path.isdir(sat_dir):
        os.mkdir(sat_dir)

    if not os.path.isfile('%s/scores.h5' % sat_dir):
        print('Generating ensemble scores.')
        ensemble_scores_h5(sat_dir, scores_files)
    else:
        print('Ensemble scores already generated.')

    ################################################################
    # PhyloP regressors
    ################################################################
    # num_pcs = int(data_stats['num_targets']**0.75)

    jobs = []
    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)
            sat_dir = '%s/%s' % (it_dir, options.out_dir)

            # skip folds whose regression stats already exist
            if not os.path.isfile('%s/stats.txt' % sat_dir):
                phylop_cmd = 'basenji_bench_phylop.py'
                phylop_cmd += ' -e 200 -p 4'
                # phylop_cmd += ' -d %d' % num_pcs
                phylop_cmd += ' -o %s' % sat_dir
                phylop_cmd += ' %s/scores.h5' % sat_dir

                name = '%s-f%dc%d' % (options.name, fi, ci)
                std_pre = '%s/phylop' % sat_dir
                j = slurm.Job(phylop_cmd,
                              name,
                              '%s.out' % std_pre,
                              '%s.err' % std_pre,
                              queue='standard',
                              cpu=4,
                              mem=90000,
                              time='1-0:0:0')
                jobs.append(j)

    # ensemble
    sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
    if not os.path.isfile('%s/stats.txt' % sat_dir):
        phylop_cmd = 'basenji_bench_phylop.py'
        phylop_cmd += ' -e 200 -p 4'
        # phylop_cmd += ' -d %d' % num_pcs
        phylop_cmd += ' -o %s' % sat_dir
        phylop_cmd += ' %s/scores.h5' % sat_dir

        name = '%s-ens' % options.name
        std_pre = '%s/phylop' % sat_dir
        j = slurm.Job(phylop_cmd,
                      name,
                      '%s.out' % std_pre,
                      '%s.err' % std_pre,
                      queue='standard',
                      cpu=4,
                      mem=90000,
                      time='1-0:0:0')
        jobs.append(j)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # compare
    ################################################################

    ref_sat_dirs = []
    exp_sat_dirs = []
    for ci in range(options.crosses):
        for fi in range(num_folds):
            exp_sat_dir = '%s/f%d_c%d/%s' % (exp_dir, fi, ci, options.out_dir)
            exp_sat_dirs.append(exp_sat_dir)
            if options.ref_dir is not None:
                ref_sat_dir = '%s/f%d_c%d/%s' % (options.ref_dir, fi, ci,
                                                 options.out_dir)
                ref_sat_dirs.append(ref_sat_dir)

    exp_pcor_folds, exp_r2_folds = read_metrics(exp_sat_dirs)
    exp_sat_dirs = ['%s/ensemble/%s' % (exp_dir, options.out_dir)]
    exp_pcor_ens, exp_r2_ens = read_metrics(exp_sat_dirs)
    if options.ref_dir is not None:
        ref_pcor_folds, ref_r2_folds = read_metrics(ref_sat_dirs)
        ref_sat_dirs = ['%s/ensemble/%s' % (options.ref_dir, options.out_dir)]
        ref_pcor_ens, ref_r2_ens = read_metrics(ref_sat_dirs)

    # summary statistics: mean and standard error of the mean across folds
    print('PearsonR')
    exp_mean = exp_pcor_folds.mean()
    exp_stdm = exp_pcor_folds.std() / np.sqrt(len(exp_pcor_folds))
    expe_mean = exp_pcor_ens.mean()
    expe_stdm = exp_pcor_ens.std() / np.sqrt(len(exp_pcor_ens))
    print('%12s:       %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
    print('%12s (ens): %.4f (%.4f)' %
          (options.label_exp, expe_mean, expe_stdm))
    if options.ref_dir is not None:
        ref_mean = ref_pcor_folds.mean()
        ref_stdm = ref_pcor_folds.std() / np.sqrt(len(ref_pcor_folds))
        refe_mean = ref_pcor_ens.mean()
        refe_stdm = ref_pcor_ens.std() / np.sqrt(len(ref_pcor_ens))
        print('%12s:       %.4f (%.4f)' %
              (options.label_ref, ref_mean, ref_stdm))
        print('%12s (ens): %.4f (%.4f)' %
              (options.label_ref, refe_mean, refe_stdm))

        mwp, tp = stat_tests(exp_pcor_folds, ref_pcor_folds,
                             options.alternative)
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)

    print('\nR2')
    exp_mean = exp_r2_folds.mean()
    exp_stdm = exp_r2_folds.std() / np.sqrt(len(exp_r2_folds))
    expe_mean = exp_r2_ens.mean()
    expe_stdm = exp_r2_ens.std() / np.sqrt(len(exp_r2_ens))
    print('%12s:       %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
    print('%12s (ens): %.4f (%.4f)' %
          (options.label_exp, expe_mean, expe_stdm))
    if options.ref_dir is not None:
        ref_mean = ref_r2_folds.mean()
        ref_stdm = ref_r2_folds.std() / np.sqrt(len(ref_r2_folds))
        refe_mean = ref_r2_ens.mean()
        refe_stdm = ref_r2_ens.std() / np.sqrt(len(ref_r2_ens))
        print('%12s:       %.4f (%.4f)' %
              (options.label_ref, ref_mean, ref_stdm))
        print('%12s (ens): %.4f (%.4f)' %
              (options.label_ref, refe_mean, refe_stdm))

        mwp, tp = stat_tests(exp_r2_folds, ref_r2_folds, options.alternative)
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)
Example #6
0
def main():
  """Build TFRecord training data from a genome FASTA and coverage targets.

  Pipeline (each stage writes intermediates under -o/out_dir):
    1. Define genomic contigs from the FASTA (splitting at assembly gaps),
       optionally limit to a BED file, filter short contigs, down-sample.
    2. Divide contigs into train/valid/test sets.
    3. Stride fixed-length model sequences across each contig set.
    4. Optionally annotate unmappable positions and drop sequences with
       too many unmappable bins.
    5. Read pooled coverage per sequence for each target, locally or as
       SLURM jobs (basenji_data_read.py).
    6. Shard sequences + coverage into TFRecord files (basenji_data_write.py).
  """
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  # parser.add_option('-c', dest='clip',
  #     default=None, type='float',
  #     help='Clip target values to have minimum [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.05, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='unmap_bed',
      help='Unmappable segments to set to NA')
  parser.add_option('--unmap_t', dest='unmap_t',
      default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.05, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  # seed both RNGs so contig sampling and sequence shuffling are reproducible
  random.seed(options.seed)
  np.random.seed(options.seed)

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  ################################################################
  # define genomic contigs
  ################################################################
  chrom_contigs = basenji.genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_contigs = basenji.genome.split_contigs(chrom_contigs,
                                                 options.gaps_file)

  # ditch the chromosomes for contigs
  contigs = []
  for chrom in chrom_contigs:
    contigs += [Contig(chrom, ctg_start, ctg_end)
                 for ctg_start, ctg_end in chrom_contigs[chrom]]

  # limit to a BED file
  if options.limit_bed is not None:
    contigs = limit_contigs(contigs, options.limit_bed)

  # filter for large enough; contigs shorter than one model sequence are useless
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # down-sample
  if options.sample_pct < 1.0:
    contigs = random.sample(contigs, int(options.sample_pct*len(contigs)))

  # print contigs to BED file
  ctg_bed_file = '%s/contigs.bed' % options.out_dir
  write_seqs_bed(ctg_bed_file, contigs)


  ################################################################
  # divide between train/valid/test
  ################################################################
  contig_sets = divide_contigs(contigs, options.test_pct, options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train)
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test)
  test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test)

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # merge; shuffling happens within each set, so the train/valid/test labels
  # remain contiguous runs -- the TFRecord writer below relies on that
  mseqs = train_mseqs + valid_mseqs + test_mseqs
  mseqs_labels = ['train']*len(train_mseqs) + ['valid']*len(valid_mseqs) + ['test']*len(test_mseqs)


  ################################################################
  # mappability
  ################################################################
  if options.unmap_bed is not None:
    # annotate unmappable positions
    mseqs_unmap = annotate_unmap(mseqs, options.unmap_bed,
                                 options.seq_length, options.pool_width)

    # filter out sequences whose fraction of unmappable bins exceeds unmap_t
    mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.unmap_t)
    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_labels = [mseqs_labels[i] for i in range(len(mseqs_labels)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

    # write to file for basenji_data_write.py (-u) to consume
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  # write sequences to BED
  seqs_bed_file = '%s/sequences.bed' % options.out_dir
  write_seqs_bed(seqs_bed_file, mseqs, mseqs_labels)


  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets; presumably one row per coverage track with a
  # 'file' column -- verify against the targets file schema
  targets_df = pd.read_table(targets_file)

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []

  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    # allow restarts: don't recompute coverage files that already exist
    if os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' -w %d' % options.pool_width
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        # '&>' redirection assumes a bash shell -- presumably util.exec_par
        # runs commands via bash; verify
        cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
            name='read_t%d' % ti,
            out_file='%s.out' % seqs_cov_stem,
            err_file='%s.err' % seqs_cov_stem,
            queue='standard,tbdisk', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True, sleep_time=1)

  ################################################################
  # write TF Records
  ################################################################
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []

  for tvt_set in ['train', 'valid', 'test']:
    # NOTE(review): raises IndexError if a set has zero sequences -- confirm
    # whether empty valid/test sets are possible here
    tvt_set_indexes = [i for i in range(len(mseqs_labels)) if mseqs_labels[i] == tvt_set]
    tvt_set_start = tvt_set_indexes[0]
    # NOTE(review): tvt_set_end is the LAST index (inclusive). If
    # basenji_data_write.py treats -e as an exclusive end, the final sequence
    # of each set is never written -- confirm whether this should be
    # tvt_set_indexes[-1] + 1.
    tvt_set_end = tvt_set_indexes[-1]

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      if options.unmap_bed is not None:
        cmd += ' -u %s' % unmap_npy

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        # '&>' redirection assumes a bash shell (see read_jobs note above)
        cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
              name='write_%s-%d' % (tvt_set, tfr_i),
              out_file='%s.out' % tfr_stem,
              err_file='%s.err' % tfr_stem,
              queue='standard,tbdisk', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # advance to the next TFRecord shard
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True, sleep_time=1)
Example #7
0
def main():
    """Cross-validate Pegasos SVM training over a grid of lambda values.

    For each replicate, divides the input data into k folds (in a fresh
    per-replicate working directory), trains one Pegasos model per
    (lambda, fold) in parallel, then aggregates recall/precision/F1 per
    lambda across folds and replicates and prints a summary table
    (NA columns when any fold failed). With -w, only summarizes
    previously trained weight vectors and exits.

    Note: this is Python 2 code (bare print statements).
    """
    usage = "usage: %prog [options] <input_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-k",
        dest="k_fold",
        type="int",
        default=10,
        help="Number of folds to use for cross-validation [Default: %default]",
    )
    parser.add_option(
        "--lambda_min",
        dest="lambda_min",
        type="float",
        default=0.01,
        help="Minimum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_max",
        dest="lambda_max",
        type="float",
        default=10.0,
        help="Maximum -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "--lambda_mult",
        dest="lambda_mult",
        type="float",
        default=2.0,
        help="Multiplier for next -lambda value to attempt [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="lesser_kmers",
        action="store_true",
        default=False,
        help="Use all kmers of length less than and equal to that given by -k [Default: %default]",
    )
    # parser.add_option('-m', dest='model_file', help='File to output model to')
    parser.add_option(
        "-p", dest="parallel", type="int", default=4, help="Number of parallel threads to run [Default: %default]"
    )
    parser.add_option(
        "-r",
        dest="replicates",
        type="int",
        default=1,
        help="Number of times to repeat the optimization for each fold [Default: %default]",
    )
    parser.add_option(
        "-w", dest="weights", action="store_true", default=False, help="Print a summary of the weight vectors"
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide input file")
    else:
        input_file = args[0]
    input_base = os.path.splitext(input_file)[0]

    # weights-only mode: summarize previously trained models and exit
    if options.weights:
        summarize_weights(input_base, options)
        exit()

    # determine % of positive examples
    input_pos, input_total = positive_percent(input_file)
    # positive-class fraction, used as a baseline subtracted from reported F1
    f1_base = input_pos / float(input_total)

    for r in range(options.replicates):
        # fresh working directory per replicate (clobbers any previous run)
        rep_dir = "%s_rep%d" % (input_base, r)
        if os.path.isdir(rep_dir):
            shutil.rmtree(rep_dir)
        os.mkdir(rep_dir)
        # NOTE(review): chdir into the replicate dir; restored via
        # os.chdir("..") below -- assumes nothing raises in between
        os.chdir(rep_dir)

        # divide data into folds; input path is relative to the parent dir
        divide_data("../" + input_file, options.k_fold)

        # collect pegasos commands for every (lambda, fold) pair
        cmds = []
        peg_lambda = options.lambda_min
        while peg_lambda <= options.lambda_max:
            # run on each fold
            for f in range(options.k_fold):
                cmds.append(
                    "pegasos -lambda %f -modelFile fold%d/train_%.1e.mod fold%d/train.dat &> /dev/null"
                    % (peg_lambda, f, peg_lambda, f)
                )

            # increase lambda
            peg_lambda *= options.lambda_mult

        # execute pegasos commands in parallel
        # ('&>' redirection assumes a bash shell -- verify util.exec_par)
        util.exec_par(cmds, options.parallel)

        # start to clean up space: training data is no longer needed
        for f in range(options.k_fold):
            os.remove("fold%d/train.dat" % f)

        os.chdir("..")

    # collect results; this loop regenerates the same lambda sequence as
    # the training loop above (identical float multiplications)
    peg_lambda = options.lambda_min
    while peg_lambda <= options.lambda_max:
        recalls = []
        precisions = []
        failed = False

        for r in range(options.replicates):
            if not failed:
                outcomes = {"tp": 0, "fp": 0, "fn": 0}

                # accumulate confusion counts across each fold
                for f in range(options.k_fold):
                    if not compute_accuracy(outcomes, "%s_rep%d/fold%d" % (input_base, r, f), peg_lambda):
                        failed = True
                        break

                # save this replicate's recall/precision
                if not failed:
                    recalls.append(float(outcomes["tp"]) / (outcomes["tp"] + outcomes["fn"]))
                    precisions.append(float(outcomes["tp"]) / (outcomes["tp"] + outcomes["fp"]))

        # summarize and print one table row per lambda
        if failed:
            print "%.1e %8s %7s %8s %7s %8s %8s" % (peg_lambda, "NA", "NA", "NA", "NA", "NA", "NA")
        else:
            # mean and standard error across replicates
            recall, rsd = stats.mean_sd(recalls)
            rsd /= math.sqrt(len(recalls))
            precision, psd = stats.mean_sd(precisions)
            psd /= math.sqrt(len(precisions))

            # null_p = 1.0-binom.cdf(int(recall*input_total+0.5)-1, int(recall*input_total/precision + 0.5), float(input_pos)/input_total)

            f1 = 2 * recall * precision / (recall + precision)

            # print '%.1e %8.3f %6.3f %8.3f %6.3f %8.3f %8.3f %8.1e' % (peg_lambda, recall, rsd, precision, psd, f1, (f1-f1_base), null_p)
            print "%.1e %8.4f %7.4f %8.4f %7.4f %8.4f %8.4f" % (
                peg_lambda,
                recall,
                rsd,
                precision,
                psd,
                f1,
                (f1 - f1_base),
            )

        peg_lambda *= options.lambda_mult