Example #1
  def test_train(self):
    if os.path.isdir(self.exp_dir):
      shutil.rmtree(self.exp_dir)
    os.mkdir(self.exp_dir)

    ################################################################
    # basenji test
    ################################################################
    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
    basenji_cmd += ' conda activate %s;' % self.conda_env
    basenji_cmd += ' basenji_test.py'
    basenji_cmd += ' -o %s' % self.exp_dir
    basenji_cmd += ' --rc'
    basenji_cmd += ' --shifts "1,0,-1"'
    basenji_cmd += ' %s' % self.params_file
    basenji_cmd += ' %s' % self.model_file
    basenji_cmd += ' %s' % self.data_dir

    basenji_job = slurm.Job(basenji_cmd,
                            name='test_test',
                            out_file='%s/test.out' % self.exp_dir,
                            err_file='%s/test.err' % self.exp_dir,
                            queue=self.queue,
                            cpu=1,
                            gpu=1,
                            mem=23000,
                            time='1:00:00')

    slurm.multi_run([basenji_job], verbose=True)

    ################################################################
    # compare
    ################################################################
    if os.path.isfile(self.ref_acc_file):
      ref_df = pd.read_csv(self.ref_acc_file, sep='\t', index_col=0)

      exp_acc_file = '%s/acc.txt' % self.exp_dir
      exp_df = pd.read_csv(exp_acc_file, sep='\t', index_col=0)

      np.testing.assert_allclose(ref_df.pearsonr, exp_df.pearsonr, atol=1e-3, rtol=1e-3)
      np.testing.assert_allclose(ref_df.r2, exp_df.r2, atol=1e-3, rtol=1e-3)

    else:
      print('Moving experiment to reference.')
      os.rename(self.exp_dir, os.path.split(self.ref_acc_file)[0])
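The comparison step relies on NumPy's tolerance check: assert_allclose(actual, desired, rtol, atol) passes when |actual - desired| <= atol + rtol * |desired| element-wise. A minimal, self-contained illustration with made-up correlation values:

import numpy as np

ref_pearsonr = np.array([0.7512, 0.6841])
exp_pearsonr = np.array([0.7510, 0.6838])

# differences of 2e-4 and 3e-4 fall within atol + rtol * |desired|
np.testing.assert_allclose(ref_pearsonr, exp_pearsonr, atol=1e-3, rtol=1e-3)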
Example #2
def main():
    usage = "usage: %prog [options] <fasta_file> <targets_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-b",
        dest="blacklist_bed",
        help="Set blacklist nucleotides to a baseline value.",
    )
    parser.add_option(
        "--break",
        dest="break_t",
        default=786432,
        type="int",
        help="Break in half contigs above length [Default: %default]",
    )
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option(
        "-d",
        dest="sample_pct",
        default=1.0,
        type="float",
        help="Down-sample the segments",
    )
    parser.add_option(
        "-g", dest="gaps_file", help="Genome assembly gaps BED [Default: %default]"
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=131072,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "--limit",
        dest="limit_bed",
        help="Limit to segments that overlap regions in a BED file",
    )
    parser.add_option(
        "--local",
        dest="run_local",
        default=False,
        action="store_true",
        help="Run jobs locally as opposed to on SLURM [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="data_out",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number parallel processes [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="seqs_per_tfr",
        default=256,
        type="int",
        help="Sequences per TFRecord file [Default: %default]",
    )
    parser.add_option(
        "--seed",
        dest="seed",
        default=44,
        type="int",
        help="Random seed [Default: %default]",
    )
    parser.add_option(
        "--stride_train",
        dest="stride_train",
        default=1.0,
        type="float",
        help="Stride to advance train sequences [Default: seq_length]",
    )
    parser.add_option(
        "--stride_test",
        dest="stride_test",
        default=1.0,
        type="float",
        help="Stride to advance valid and test sequences [Default: seq_length]",
    )
    parser.add_option(
        "--soft",
        dest="soft_clip",
        default=False,
        action="store_true",
        help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="test_pct_or_chr",
        default=0.05,
        type="str",
        help="Proportion of the data for testing [Default: %default]",
    )
    parser.add_option("-u", dest="umap_bed", help="Unmappable regions in BED format")
    parser.add_option(
        "--umap_t",
        dest="umap_t",
        default=0.3,
        type="float",
        help="Remove sequences with more than this unmappable bin % [Default: %default]",
    )
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help="Set unmappable regions to this percentile in the sequences' distribution of values",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        default=128,
        type="int",
        help="Sum pool width [Default: %default]",
    )
    parser.add_option(
        "-v",
        dest="valid_pct_or_chr",
        default=0.05,
        type="str",
        help="Proportion of the data for validation [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide FASTA and sample coverage labels and paths.")
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.stride_train <= 0 or options.stride_train > 1:
        parser.error("Train stride =%f must be in [0,1]" % options.stride_train)

    if options.stride_test <= 0 or options.stride_test > 1:
        parser.error("Test stride =%f must be in [0,1]" % options.stride_test)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = "%s/contigs.bed" % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert 0 <= valid_pct <= 1
        assert 0 <= test_pct <= 1

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chr = options.valid_pct_or_chr
        test_chr = options.test_pct_or_chr
        contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(
        train_contigs, options.seq_length, options.stride_train, label="train"
    )
    valid_mseqs = contig_sequences(
        valid_contigs, options.seq_length, options.stride_test, label="valid"
    )
    test_mseqs = contig_sequences(
        test_contigs, options.seq_length, options.stride_test, label="test"
    )

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(
            train_mseqs, int(options.sample_pct * len(train_mseqs))
        )
        valid_mseqs = random.sample(
            valid_mseqs, int(options.sample_pct * len(valid_mseqs))
        )
        test_mseqs = random.sample(
            test_mseqs, int(options.sample_pct * len(test_mseqs))
        )

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if options.umap_bed is not None:
        if shutil.which("bedtools") is None:
            print("Install Bedtools to annotate unmappable sites", file=sys.stderr)
            exit(1)

        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(
            mseqs, options.umap_bed, options.seq_length, options.pool_width
        )

        # filter unmappable
        mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = "%s/mseqs_unmap.npy" % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    seqs_bed_file = "%s/sequences.bed" % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_table(targets_file, index_col=0)

    seqs_cov_dir = "%s/seqs_cov" % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df["file"].iloc[ti]
        seqs_cov_stem = "%s/%d" % (seqs_cov_dir, ti)
        seqs_cov_file = "%s.h5" % seqs_cov_stem

        clip_ti = None
        if "clip" in targets_df.columns:
            clip_ti = targets_df["clip"].iloc[ti]

        scale_ti = 1
        if "scale" in targets_df.columns:
            scale_ti = targets_df["scale"].iloc[ti]

        if os.path.isfile(seqs_cov_file):
            print("Skipping existing %s" % seqs_cov_file, file=sys.stderr)
        else:
            cmd = "basenji_data_read.py"
            cmd += " -w %d" % options.pool_width
            cmd += " -u %s" % targets_df["sum_stat"].iloc[ti]
            if clip_ti is not None:
                cmd += " -c %f" % clip_ti
            if options.soft_clip:
                cmd += " --soft"
            cmd += " -s %f" % scale_ti
            if options.blacklist_bed:
                cmd += " -b %s" % options.blacklist_bed
            cmd += " %s" % genome_cov_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_file

            if options.run_local:
                cmd += " &> %s.err" % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="read_t%d" % ti,
                    out_file="%s.out" % seqs_cov_stem,
                    err_file="%s.err" % seqs_cov_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, "%s/targets.txt" % options.out_dir)

    # initialize TF Records dir
    tfr_dir = "%s/tfrecords" % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ["train", "valid", "test"]:
        tvt_set_indexes = [i for i in range(len(mseqs)) if mseqs[i].label == tvt_set]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1
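        # mseqs was built as the train/valid/test concatenation and filtered
        # in order, so each label occupies a contiguous block of indexes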

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start <= tvt_set_end:
            tfr_stem = "%s/%s-%d" % (tfr_dir, tvt_set, tfr_i)

            cmd = "basenji_data_write.py"
            cmd += " -s %d" % tfr_start
            cmd += " -e %d" % tfr_end
            if options.umap_bed is not None:
                cmd += " -u %s" % unmap_npy
            if options.umap_set is not None:
                cmd += " --umap_set %f" % options.umap_set

            cmd += " %s" % fasta_file
            cmd += " %s" % seqs_bed_file
            cmd += " %s" % seqs_cov_dir
            cmd += " %s.tfr" % tfr_stem

            if options.run_local:
                cmd += " &> %s.err" % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(
                    cmd,
                    name="write_%s-%d" % (tvt_set, tfr_i),
                    out_file="%s.out" % tfr_stem,
                    err_file="%s.err" % tfr_stem,
                    queue="standard",
                    mem=15000,
                    time="12:0:0",
                )
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )
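The per-target read jobs pull their settings from the targets table, which is read with its first column as the index and is expected to provide at least 'file' and 'sum_stat' columns, plus optional 'clip' and 'scale'. A sketch of building such a table with pandas; the identifiers, paths, and values below are purely illustrative:

import pandas as pd

# hypothetical targets table; column names match those referenced in main()
targets_df = pd.DataFrame(
    {
        "identifier": ["sample0", "sample1"],
        "file": ["/data/cov/sample0.bw", "/data/cov/sample1.bw"],
        "clip": [32, 32],
        "scale": [1.0, 1.0],
        "sum_stat": ["sum", "mean"],
    }
).set_index("identifier")
targets_df.to_csv("targets.txt", sep="\t")

# round-trips through the same call used in main()
targets_df = pd.read_table("targets.txt", index_col=0)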
Example #3
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=786432,
        type='int',
        help='Break in half contigs above length [Default: %default]')
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of parallel processes [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the excess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.3,
        type='float',
        help=
        'Remove sequences with more than this unmappable bin % [Default: %default]'
    )
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Set unmappable regions to this percentile in the sequences\' distribution of values'
    )
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for validation [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide a FASTA file and a targets file of sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.stride_train <= 0 or options.stride_train > 1:
        parser.error('Train stride %f must be in (0, 1]' %
                     options.stride_train)

    if options.stride_test <= 0 or options.stride_test > 1:
        parser.error('Test stride %f must be in (0, 1]' % options.stride_test)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [
        ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
    ]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chr = options.valid_pct_or_chr
        test_chr = options.test_pct_or_chr
        contig_sets = divide_contigs_chr(contigs, test_chr, valid_chr)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(train_contigs,
                                   options.seq_length,
                                   options.stride_train,
                                   label='train')
    valid_mseqs = contig_sequences(valid_contigs,
                                   options.seq_length,
                                   options.stride_test,
                                   label='valid')
    test_mseqs = contig_sequences(test_contigs,
                                  options.seq_length,
                                  options.stride_test,
                                  label='test')

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if options.umap_bed is not None:
        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                          options.umap_t)
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # down-sample
    if options.sample_pct < 1.0:
        mseqs = random.sample(mseqs, int(options.sample_pct * len(mseqs)))

    # write sequences to BED
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_table(targets_file, index_col=0)

    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        scale_ti = 1
        if 'scale' in targets_df.columns:
            scale_ti = targets_df['scale'].iloc[ti]

        if os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = 'basenji_data_read.py'
            cmd += ' -w %d' % options.pool_width
            cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
            if clip_ti is not None:
                cmd += ' -c %f' % clip_ti
            if options.soft_clip:
                cmd += ' --soft'
            cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard,tbdisk',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ['train', 'valid', 'test']:
        tvt_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
        ]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

        while tfr_start <= tvt_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

            cmd = 'basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end
            if options.umap_bed is not None:
                cmd += ' -u %s' % unmap_npy
            if options.umap_set is not None:
                cmd += ' --umap_set %f' % options.umap_set

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (tvt_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard,tbdisk',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)
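util.exec_par and slurm.multi_run are project helpers whose definitions are not shown in this example. As a rough stand-in for the local branch, which only needs to run the accumulated shell command strings with bounded parallelism, something like the following would do; exec_par_sketch is a hypothetical name and this is a sketch, not the project's implementation:

import subprocess
from concurrent.futures import ThreadPoolExecutor


def exec_par_sketch(cmds, max_proc=None, verbose=False):
    '''Run shell command strings in parallel, at most max_proc at a time.'''
    def run_cmd(cmd):
        if verbose:
            print(cmd, flush=True)
        return subprocess.run(cmd, shell=True).returncode

    with ThreadPoolExecutor(max_workers=max_proc) as pool:
        return list(pool.map(run_cmd, cmds))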
Example #4
def main():
    usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option('-a',
                      '--alt',
                      dest='alternative',
                      default='two-sided',
                      help='Statistical test alternative [Default: %default]')
    parser.add_option('-c',
                      dest='crosses',
                      default=1,
                      type='int',
                      help='Number of cross-fold rounds [Default: %default]')
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2-gpu',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('--l1',
                      dest='label1',
                      default='Reference',
                      help='Reference label [Default: %default]')
    parser.add_option('--l2',
                      dest='label2',
                      default='Experiment',
                      help='Experiment label [Default: %default]')
    parser.add_option('--name',
                      dest='name',
                      default='test',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('-o',
                      dest='out_stem',
                      default=None,
                      help='Output plot stem [Default: %default]')
    parser.add_option('-q', dest='queue', default='gtx1080ti')
    parser.add_option('-r',
                      dest='ref_dir',
                      default=None,
                      help='Reference directory for statistical tests')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option('--spec',
                      dest='specificity',
                      default=False,
                      action='store_true',
                      help='Test specificity [Default: %default]')
    parser.add_option('--train',
                      dest='train',
                      default=False,
                      action='store_true',
                      help='Test on the training set, too [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide experiment directory, parameters file, and data directory.')
    else:
        exp_dir = args[0]
        params_file = args[1]
        data_dir = args[2]

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # count folds
    num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])
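    # statistics.json is expected to contain one key per cross-validation
    # fold, each beginning with 'fold'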

    ################################################################
    # test check
    ################################################################
    jobs = []

    if options.train:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

                # check if done
                acc_file = '%s/test_train/acc.txt' % it_dir
                if os.path.isfile(acc_file):
                    print('%s already generated.' % acc_file)
                else:
                    # basenji test
                    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                    basenji_cmd += ' conda activate %s;' % options.conda_env
                    basenji_cmd += ' basenji_test.py'
                    basenji_cmd += ' -o %s/test_train' % it_dir
                    if options.rc:
                        basenji_cmd += ' --rc'
                    if options.shifts:
                        basenji_cmd += ' --shifts %s' % options.shifts
                    basenji_cmd += ' --split train'
                    basenji_cmd += ' %s' % params_file
                    basenji_cmd += ' %s/train/model_check.h5' % it_dir
                    basenji_cmd += ' %s/data' % it_dir

                    name = '%s-testtr-f%dc%d' % (options.name, fi, ci)
                    basenji_job = slurm.Job(
                        basenji_cmd,
                        name=name,
                        out_file='%s/test_train.out' % it_dir,
                        err_file='%s/test_train.err' % it_dir,
                        queue=options.queue,
                        cpu=1,
                        gpu=1,
                        mem=23000,
                        time='4:00:00')
                    jobs.append(basenji_job)

    ################################################################
    # test best
    ################################################################
    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

            # check if done
            acc_file = '%s/test/acc.txt' % it_dir
            if os.path.isfile(acc_file):
                print('%s already generated.' % acc_file)
            else:
                # basenji test
                basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                basenji_cmd += ' conda activate %s;' % options.conda_env
                basenji_cmd += ' basenji_test.py'
                basenji_cmd += ' -o %s/test' % it_dir
                if options.rc:
                    basenji_cmd += ' --rc'
                if options.shifts:
                    basenji_cmd += ' --shifts %s' % options.shifts
                basenji_cmd += ' %s' % params_file
                basenji_cmd += ' %s/train/model_best.h5' % it_dir
                basenji_cmd += ' %s/data' % it_dir

                name = '%s-test-f%dc%d' % (options.name, fi, ci)
                basenji_job = slurm.Job(basenji_cmd,
                                        name=name,
                                        out_file='%s/test.out' % it_dir,
                                        err_file='%s/test.err' % it_dir,
                                        queue=options.queue,
                                        cpu=1,
                                        gpu=1,
                                        mem=23000,
                                        time='4:00:00')
                jobs.append(basenji_job)

    ################################################################
    # test best specificity
    ################################################################
    if options.specificity:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

                # check if done
                acc_file = '%s/test_spec/acc.txt' % it_dir
                if os.path.isfile(acc_file):
                    print('%s already generated.' % acc_file)
                else:
                    # basenji test
                    basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                    basenji_cmd += ' conda activate %s;' % options.conda_env
                    basenji_cmd += ' basenji_test_specificity.py'
                    basenji_cmd += ' -o %s/test_spec' % it_dir
                    if options.rc:
                        basenji_cmd += ' --rc'
                    if options.shifts:
                        basenji_cmd += ' --shifts %s' % options.shifts
                    basenji_cmd += ' %s' % params_file
                    basenji_cmd += ' %s/train/model_best.h5' % it_dir
                    basenji_cmd += ' %s/data' % it_dir

                    name = '%s-spec-f%dc%d' % (options.name, fi, ci)
                    basenji_job = slurm.Job(
                        basenji_cmd,
                        name=name,
                        out_file='%s/test_spec.out' % it_dir,
                        err_file='%s/test_spec.err' % it_dir,
                        queue=options.queue,
                        cpu=1,
                        gpu=1,
                        mem=60000,
                        time='6:00:00')
                    jobs.append(basenji_job)

    slurm.multi_run(jobs, verbose=True)

    if options.ref_dir is not None:
        # classification or regression
        with open('%s/f0_c0/test/acc.txt' % exp_dir) as test0_open:
            header = test0_open.readline().split()
            if 'pearsonr' in header:
                metric = 'pearsonr'
            else:
                metric = 'auprc'

        ################################################################
        # compare checkpoint on training set
        ################################################################
        if options.train:
            ref_glob_str = '%s/*/test_train/acc.txt' % options.ref_dir
            ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric)

            exp_glob_str = '%s/*/test_train/acc.txt' % exp_dir
            exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric)

            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

            print('\nTrain:')
            print('%12s %s: %.4f (%.4f)' %
                  (options.label1, metric, ref_mean, ref_stdm))
            print('%12s %s: %.4f (%.4f)' %
                  (options.label2, metric, exp_mean, exp_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)

            if options.out_stem is not None:
                jointplot(ref_cors, exp_cors,
                          '%s_train.pdf' % options.out_stem, options.label1,
                          options.label2)

        ################################################################
        # compare best on test set
        ################################################################
        ref_glob_str = '%s/*/test/acc.txt' % options.ref_dir
        ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric)

        exp_glob_str = '%s/*/test/acc.txt' % exp_dir
        exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric)

        mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

        print('\nTest:')
        print('%12s %s: %.4f (%.4f)' %
              (options.label1, metric, ref_mean, ref_stdm))
        print('%12s %s: %.4f (%.4f)' %
              (options.label2, metric, exp_mean, exp_stdm))
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)

        if options.out_stem is not None:
            jointplot(ref_cors, exp_cors, '%s_test.pdf' % options.out_stem,
                      options.label1, options.label2)

        ################################################################
        # compare best on test set specificity
        ################################################################
        if options.specificity:
            ref_glob_str = '%s/*/test_spec/acc.txt' % options.ref_dir
            ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str, metric)

            exp_glob_str = '%s/*/test_spec/acc.txt' % exp_dir
            exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, metric)

            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

            print('\nSpecificity:')
            print('%12s %s: %.4f (%.4f)' %
                  (options.label1, metric, ref_mean, ref_stdm))
            print('%12s %s: %.4f (%.4f)' %
                  (options.label2, metric, exp_mean, exp_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)

            if options.out_stem is not None:
                jointplot(ref_cors, exp_cors, '%s_spec.pdf' % options.out_stem,
                          options.label1, options.label2)
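read_metrics, stat_tests, and jointplot are defined elsewhere in this script. For the statistical comparison alone, a sketch of what stat_tests could look like with SciPy is below; stat_tests_sketch is an illustrative name, it assumes scipy >= 1.6 for the alternative argument of ttest_ind, and it is not the script's actual definition:

import numpy as np
from scipy import stats


def stat_tests_sketch(ref_cors, exp_cors, alternative='two-sided'):
    '''Mann-Whitney U and Welch t-test p-values for two samples of metrics.'''
    ref_cors = np.asarray(ref_cors)
    exp_cors = np.asarray(exp_cors)
    _, mwp = stats.mannwhitneyu(ref_cors, exp_cors, alternative=alternative)
    _, tp = stats.ttest_ind(ref_cors, exp_cors, equal_var=False,
                            alternative=alternative)
    return mwp, tp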
Example #5
def main():
    usage = 'usage: %prog [options] <fasta_file> <sample_wigs_file> <hdf5_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-b',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '-c',
        dest='clip',
        default=None,
        type='float',
        help='Clip target values to have minimum [Default: %default]')
    parser.add_option('--cluster_dir',
                      dest='cluster_dir',
                      default='basenji_hdf5')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='fourier_dim',
                      default=None,
                      type='int',
                      help='Fourier transform dimension [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=1024,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--log2',
        dest='log10to2',
        default=False,
        action='store_true',
        help='Transform values from log10 to log2 [Default: %default]')
    parser.add_option(
        '--mult_cov',
        dest='cov_multiplier',
        default=1,
        type='float',
        help=
        'Coverage multiplier, useful when the read extension and pool width do not match [Default: %default]'
    )
    parser.add_option(
        '-n',
        dest='na_t',
        default=0.25,
        type='float',
        help=
        'Remove sequences with an NA% greater than this threshold [Default: %default]'
    )
    parser.add_option(
        '-o',
        dest='out_bed_file',
        help='Output the train/valid/test sequences as a BED file')
    parser.add_option(
        '-p',
        dest='processes',
        default=1,
        type='int',
        help='Number of parallel processes to load data [Default: %default]')
    parser.add_option('-s',
                      dest='stride',
                      type='int',
                      help='Stride to advance segments [Default: seq_length]')
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        type='str',
        default=0.05,
        help='Proportion of the data for testing [Default: %default]')
    parser.add_option('-u',
                      dest='unmap_bed',
                      help='Unmappable segments to set to NA')
    parser.add_option('-w',
                      dest='pool_width',
                      type='int',
                      default=1,
                      help='Average pooling width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        type='str',
        default=0.05,
        help='Proportion of the data for validation [Default: %default]')
    parser.add_option('-z',
                      dest='compression',
                      help='h5py compression [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide genome FASTA file, sample Wig/BigWig labels and paths, '
            'and model output file')
    else:
        fasta_file = args[0]
        sample_wigs_file = args[1]
        hdf5_file = args[2]

    random.seed(1)

    if options.stride is None:
        options.stride = options.seq_length

    ################################################################
    # assess bigwigs
    ################################################################
    # get wig files and labels
    target_wigs = OrderedDict()
    target_strands = []
    target_labels = []
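    # each line of sample_wigs_file is tab-delimited:
    #   <target_id>  <wig/bigwig/h5 path>  <strand>  [<label>]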
    for line in open(sample_wigs_file, encoding='UTF-8'):
        a = line.rstrip().split('\t')

        if a[0] in target_wigs:
            print('WARNING: duplicate target id %s' % a[0], file=sys.stderr)

        target_wigs[a[0]] = a[1]
        target_strands.append(a[2])
        if len(a) > 3:
            target_labels.append(a[3])
        else:
            target_labels.append('')

    if options.fourier_dim is not None and 2 * options.fourier_dim >= options.seq_length / options.pool_width:
        print(
            "Fourier transform to %d dims won't compress %d length sequences with %d pooling"
            % (options.fourier_dim, options.seq_length, options.pool_width),
            file=sys.stderr)
        exit(1)

    ################################################################
    # prepare genomic segments
    ################################################################
    chrom_segments = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_segments = genome.split_contigs(chrom_segments,
                                              options.gaps_file)

    # ditch the chromosomes
    segments = []
    for chrom in chrom_segments:
        segments += [(chrom, seg_start, seg_end)
                     for seg_start, seg_end in chrom_segments[chrom]]

    # standardize order
    segments.sort()

    # filter for large enough
    segments = [
        cse for cse in segments if cse[2] - cse[1] >= options.seq_length
    ]

    # down-sample
    if options.sample_pct < 1.0:
        segments = random.sample(segments,
                                 int(options.sample_pct * len(segments)))

    # limit to a BED file
    if options.limit_bed is not None:
        segments = limit_segments(segments, options.limit_bed)

    if not os.path.isdir(options.cluster_dir):
        os.mkdir(options.cluster_dir)

    # print segments to BED file
    seg_bed_file = '%s/segments.bed' % options.cluster_dir
    seg_bed_out = open(seg_bed_file, 'w')
    for chrom, seg_start, seg_end in segments:
        print('%s\t%d\t%d' % (chrom, seg_start, seg_end), file=seg_bed_out)
    seg_bed_out.close()

    ################################################################
    # bigwig read and process
    ################################################################
    print('Reading and pre-processing bigwigs for %d segments' % len(segments),
          flush=True)

    targets_real = []
    targets_imag = []
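    # note: targets_imag is only used when -f/--fourier_dim is set; the step
    # that would populate it is not shown in this excerpt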

    # generate numpy arrays on cluster
    jobs = []
    for target_label in target_wigs.keys():
        wig_file = target_wigs[target_label]
        npy_file = '%s/%s' % (options.cluster_dir, target_label)
        if not os.path.isfile(npy_file) and not os.path.isfile(
                '%s.npy' % npy_file):
            print(npy_file)

            if os.path.splitext(wig_file)[1] == '.h5':
                script = 'seqs_hdf5.py'
            else:
                script = 'bigwig_hdf5.py'

            cmd = 'echo $HOSTNAME; %s -l %d -s %d -w %d %s %s %s' % (
                script, options.seq_length, options.stride, options.pool_width,
                wig_file, seg_bed_file, npy_file)
            name = 'hdf5_%s' % target_label
            outf = '%s/%s.out' % (options.cluster_dir, target_label)
            errf = '%s/%s.err' % (options.cluster_dir, target_label)
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue='standard,tbdisk',
                          mem=15000,
                          time='12:0:0')
            jobs.append(j)

    slurm.multi_run(jobs)

    # load into targets_real, targets_imag
    for target_label in target_wigs.keys():
        npy_file = '%s/%s.npy' % (options.cluster_dir, target_label)
        wig_targets = np.load(npy_file)
        targets_real.append(wig_targets)

    # transpose from TxSxL to SxLxT
    targets_real = np.transpose(np.array(targets_real), axes=(1, 2, 0))

    print('%d target sequences' % targets_real.shape[0])

    ################################################################
    # one hot code sequences
    ################################################################
    seqs_1hot, seqs_segments = segments_1hot(fasta_file, segments,
                                             options.seq_length,
                                             options.stride)
    print('%d sequences one hot coded' % seqs_1hot.shape[0])

    ################################################################
    # correct for unmappable regions
    ################################################################
    if options.unmap_bed is not None:
        seqs_na = annotate_na(seqs_segments, options.unmap_bed,
                              options.seq_length, options.pool_width)

        # determine mappable sequences and update test indexes
        map_indexes = []

        for i in range(seqs_na.shape[0]):
            # mappable
            if seqs_na[i, :].mean(dtype='float64') < options.na_t:
                map_indexes.append(i)

            # unmappable
            else:
                # forget it
                pass

        # update data structures
        targets_real = targets_real[map_indexes]
        if options.fourier_dim is not None:
            targets_imag = targets_imag[map_indexes]

        seqs_1hot = seqs_1hot[map_indexes]
        seqs_segments = [seqs_segments[mi] for mi in map_indexes]
        seqs_na = seqs_na[map_indexes]

    ################################################################
    # write to train, valid, test HDF5
    ################################################################

    # choose test indexes
    if options.test_pct_or_chr.startswith('chr'):
        test_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.test_pct_or_chr
        ]
    else:
        test_pct = float(options.test_pct_or_chr)
        test_indexes = [
            twi for twi in range(len(seqs_segments))
            if random.random() < test_pct
        ]

    # choose valid indexes
    if options.valid_pct_or_chr.startswith('chr'):
        # valid_indexes = np.array([seq_seg[0] == options.valid_pct_or_chr for seq_seg in seqs_segments])
        valid_indexes = [
            si for si in range(len(seqs_segments))
            if seqs_segments[si][0] == options.valid_pct_or_chr
        ]
    else:
        valid_pct = float(options.valid_pct_or_chr)
        valid_n = int(valid_pct * len(seqs_segments))
        nontest_indexes = set(range(len(seqs_segments))) - set(test_indexes)
        valid_indexes = random.sample(sorted(nontest_indexes), valid_n)

    # remainder is training
    train_indexes = list(
        set(range(len(seqs_segments))) - set(valid_indexes) -
        set(test_indexes))

    # training may require shuffling
    random.shuffle(train_indexes)
    random.shuffle(valid_indexes)
    random.shuffle(test_indexes)

    # write to HDF5
    hdf5_out = h5py.File(hdf5_file, 'w')

    # store pooling
    hdf5_out.create_dataset('pool_width', data=options.pool_width, dtype='int')

    # store targets
    target_ids = np.array(list(target_wigs.keys()), dtype='S')
    hdf5_out.create_dataset('target_ids', data=target_ids)

    target_labels = np.array(target_labels, dtype='S')
    hdf5_out.create_dataset('target_labels', data=target_labels)

    target_strands = np.array(target_strands, dtype='S')
    hdf5_out.create_dataset('target_strands', data=target_strands)

    # HDF5 train
    hdf5_out.create_dataset('train_in',
                            data=seqs_1hot[train_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('train_out',
                            data=targets_real[train_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('train_out_imag',
                                data=targets_imag[train_indexes],
                                dtype='float16',
                                compression=options.compression)
    hdf5_out.create_dataset('train_na',
                            data=seqs_na[train_indexes],
                            dtype='bool',
                            compression=options.compression)

    # HDF5 valid
    hdf5_out.create_dataset('valid_in',
                            data=seqs_1hot[valid_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('valid_out',
                            data=targets_real[valid_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('valid_out_imag',
                                data=targets_imag[valid_indexes],
                                dtype='float16',
                                compression=options.compression)
    hdf5_out.create_dataset('valid_na',
                            data=seqs_na[valid_indexes],
                            dtype='bool',
                            compression=options.compression)

    # HDF5 test
    hdf5_out.create_dataset('test_in',
                            data=seqs_1hot[test_indexes],
                            dtype='bool',
                            compression=options.compression)
    hdf5_out.create_dataset('test_out',
                            data=targets_real[test_indexes],
                            dtype='float16',
                            compression=options.compression)
    if options.fourier_dim is not None:
        hdf5_out.create_dataset('test_out_imag',
                                data=targets_imag[test_indexes],
                                dtype='float16',
                                compression=options.compression)
    hdf5_out.create_dataset('test_na',
                            data=seqs_na[test_indexes],
                            dtype='bool',
                            compression=options.compression)

    hdf5_out.close()

    # output BED file
    if options.out_bed_file:
        out_bed_out = open(options.out_bed_file, 'w')
        for si in train_indexes:
            print('%s\t%d\t%d\ttrain' % seqs_segments[si], file=out_bed_out)
        for si in valid_indexes:
            print('%s\t%d\t%d\tvalid' % seqs_segments[si], file=out_bed_out)
        for si in test_indexes:
            print('%s\t%d\t%d\ttest' % seqs_segments[si], file=out_bed_out)
        out_bed_out.close()
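The writer above stores everything as flat datasets in a single HDF5 file. Reading a split back might look like this; the dataset names are taken from the writer, the file name is illustrative, and the *_na datasets presuppose that unmappable regions (-u) were annotated:

import h5py

with h5py.File('train_data.h5', 'r') as h5:
    pool_width = int(h5['pool_width'][()])
    target_ids = [tid.decode() for tid in h5['target_ids'][:]]
    train_in = h5['train_in'][:]    # bool one-hot sequences
    train_out = h5['train_out'][:]  # float16 coverage targets
    train_na = h5['train_na'][:]    # bool unmappable-bin mask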
Example #6
def main():
  usage = 'usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-a', dest='align_net',
      help='Alignment .net file')
  parser.add_option('-b', dest='blacklist_beds',
      help='Set blacklist nucleotides to a baseline value.')
  parser.add_option('--break', dest='break_t',
      default=None, type='int',
      help='Break in half contigs above length [Default: %default]')
  parser.add_option('-c','--crop', dest='crop_bp',
      default=0, type='int',
      help='Crop bp off each end [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gap_files',
      help='Comma-separated list of assembly gaps BED files [Default: %default]')
  parser.add_option('-i', dest='interp_nan',
      default=False, action='store_true',
      help='Interpolate NaNs [Default: %default]') 
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-n', dest='net_fill_min',
    default=100000, type='int',
    help='Alignment net fill size minimum [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('--restart', dest='restart',
      default=False, action='store_true',
      help='Skip already read HDF5 coverage values. [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--snap', dest='snap',
      default=None, type='int',
      help='Snap sequences to multiple of the given value [Default: %default]')
  parser.add_option('--stride', '--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: %default]')
  parser.add_option('--soft', dest='soft_clip',
      default=False, action='store_true',
      help='Soft clip values, applying sqrt to the excess above the threshold [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.1, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='umap_beds',
      help='Comma-separated genome unmappable segments to set to NA')
  parser.add_option('--umap_t', dest='umap_t',
      default=0.5, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('--umap_clip', dest='umap_clip',
      default=None, type='float',
      help='Clip unmappable regions to this percentile in the sequences\' distribution of values')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.1, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide two comma-separated FASTA files and a sample coverage labels/paths file.')
  else:
    fasta_files = args[0].split(',')
    targets_file = args[1]

  # there is still some source of stochasticity
  random.seed(options.seed)
  np.random.seed(options.seed)

  # transform proportion strides to base pairs
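  # (values <= 1 are interpreted as a fraction of seq_length; e.g. with the
  #  default seq_length of 131072, --stride_train 0.75 becomes 98304 bp)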
  if options.stride_train <= 1:
    print('stride_train %.f'%options.stride_train, end='')
    options.stride_train = options.stride_train*options.seq_length
    print(' converted to %f' % options.stride_train)
  options.stride_train = int(np.round(options.stride_train))
  if options.stride_test <= 1:
    print('stride_test %.f'%options.stride_test, end='')
    options.stride_test = options.stride_test*options.seq_length
    print(' converted to %f' % options.stride_test)
  options.stride_test = int(np.round(options.stride_test))

  # check snap
  if options.snap is not None:
    if np.mod(options.seq_length, options.snap) != 0: 
      raise ValueError('seq_length must be a multiple of snap')
    if np.mod(options.stride_train, options.snap) != 0: 
      raise ValueError('stride_train must be a multiple of snap')
    if np.mod(options.stride_test, options.snap) != 0:
      raise ValueError('stride_test must be a multiple of snap')

  if os.path.isdir(options.out_dir) and not options.restart:
    print('Remove output directory %s or use --restart option.' % options.out_dir)
    exit(1)
  elif not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  if options.gap_files is not None:
    options.gap_files = options.gap_files.split(',')

  if options.blacklist_beds is not None:
    options.blacklist_beds = options.blacklist_beds.split(',')

  # read targets
  targets_df = pd.read_table(targets_file, index_col=0)

  # verify genomes
  num_genomes = len(fasta_files)
  assert(len(set(targets_df.genome)) == num_genomes)

  ################################################################
  # define genomic contigs
  ################################################################
  genome_chr_contigs = []
  for gi in range(num_genomes):
    genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

    # remove gaps
    if options.gap_files is not None and options.gap_files[gi]:
      genome_chr_contigs[gi] = genome.split_contigs(genome_chr_contigs[gi],
                                                    options.gap_files[gi])

  # ditch the chromosomes
  contigs = []
  for gi in range(num_genomes):
    for chrom in genome_chr_contigs[gi]:
      contigs += [Contig(gi, chrom, ctg_start, ctg_end)
                  for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]]

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # break up large contigs
  if options.break_t is not None:
    contigs = break_large_contigs(contigs, options.break_t)

  # print contigs to BED file
  for gi in range(num_genomes):
    contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
    ctg_bed_file = '%s/contigs%d.bed' % (options.out_dir, gi)
    write_seqs_bed(ctg_bed_file, contigs_i)

  ################################################################
  # divide between train/valid/test
  ################################################################

  # connect contigs across genomes by alignment
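  # (contigs aligned between the two genomes are grouped into one component,
  #  so homologous regions land in the same train/valid/test split)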
  contig_components = connect_contigs(contigs, options.align_net, options.net_fill_min, options.out_dir)

  # divide contig connected components between train/valid/test
  contig_sets = divide_contig_components(contig_components, options.test_pct, options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  # rejoin broken contigs within set
  train_contigs = rejoin_large_contigs(train_contigs)
  valid_contigs = rejoin_large_contigs(valid_contigs)
  test_contigs = rejoin_large_contigs(test_contigs)

  # quantify leakage across sets
  quantify_leakage(options.align_net, train_contigs, valid_contigs, test_contigs, options.out_dir)

  ################################################################
  # define model sequences
  ################################################################

  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length,
                                 options.stride_train, options.snap, 'train')
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length,
                                 options.stride_test, options.snap, 'valid')
  test_mseqs = contig_sequences(test_contigs, options.seq_length,
                                options.stride_test, options.snap, 'test')

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # down-sample
  if options.sample_pct < 1.0:
    train_mseqs = random.sample(train_mseqs, int(options.sample_pct*len(train_mseqs)))
    valid_mseqs = random.sample(valid_mseqs, int(options.sample_pct*len(valid_mseqs)))
    test_mseqs = random.sample(test_mseqs, int(options.sample_pct*len(test_mseqs)))

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs

  ################################################################
  # separate sequences by genome
  ################################################################
  mseqs_genome = []
  for gi in range(num_genomes):
    mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
    mseqs_genome.append(mseqs_gi)

  ################################################################
  # mappability
  ################################################################

  if options.umap_beds is not None:
    options.umap_beds = options.umap_beds.split(',')
  else:
    options.umap_beds = [None]*num_genomes
  unmap_npys = [None, None]

  for gi in range(num_genomes):
    if options.umap_beds[gi] is not None:
      # annotate unmappable positions
      mseqs_unmap = annotate_unmap(mseqs_genome[gi], options.umap_beds[gi],
                                   options.seq_length, options.pool_width)

      # filter unmappable
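      # (umap_t is a fraction of bins; e.g. with seq_length 131072 and
      #  pool_width 128 there are 1024 bins per sequence, so the default 0.5
      #  keeps sequences with fewer than 512 unmappable bins)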
      mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.umap_t)
      mseqs_genome[gi] = [mseqs_genome[gi][si] for si in range(len(mseqs_genome[gi])) if mseqs_map_mask[si]]
      mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

      # write to file
      unmap_npys[gi] = '%s/mseqs%d_unmap.npy' % (options.out_dir, gi)
      np.save(unmap_npys[gi], mseqs_unmap)

  seqs_bed_files = []
  for gi in range(num_genomes):
    # write sequences to BED
    seqs_bed_files.append('%s/sequences%d.bed' % (options.out_dir, gi))
    write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)

  ################################################################
  # read sequence coverage values
  ################################################################
  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
  for gi in range(num_genomes):
    read_jobs += make_read_jobs(seqs_bed_files[gi], targets_df,
                                gi, seqs_cov_dir, options)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # write TF Records
  ################################################################

  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  # set genome target index starts
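  # (e.g. if genome 0 has 100 targets and genome 1 has 50, the starts are
  #  [0, 100] and sum_targets ends up 150)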
  sum_targets = 0
  genome_targets_start = []
  for gi in range(num_genomes):
    genome_targets_start.append(sum_targets)
    targets_df_gi = targets_df[targets_df.genome == gi]
    sum_targets += targets_df_gi.shape[0]

  write_jobs = []
  for gi in range(num_genomes):
    write_jobs += make_write_jobs(mseqs_genome[gi], fasta_files[gi], seqs_bed_files[gi],
                                  seqs_cov_dir, tfr_dir, gi, unmap_npys[gi],
                                  genome_targets_start[gi], sum_targets, options)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True,
                    launch_sleep=1, update_sleep=5)

  ################################################################
  # stats
  ################################################################
  stats_dict = {}
  # stats_dict['num_targets'] = targets_df.shape[0]
  # stats_dict['train_seqs'] = len(train_mseqs)
  # stats_dict['valid_seqs'] = len(valid_mseqs)
  # stats_dict['test_seqs'] = len(test_mseqs)
  stats_dict['seq_length'] = options.seq_length
  stats_dict['pool_width'] = options.pool_width
  stats_dict['crop_bp'] = options.crop_bp

  target_length = options.seq_length - 2*options.crop_bp
  target_length = target_length // options.pool_width
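  # (with the defaults, that is (131072 - 2*0) // 128 = 1024 bins)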
  stats_dict['target_length'] = target_length

  with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
    json.dump(stats_dict, stats_json_out, indent=4)
Example No. 7
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)

    # basenji_sat_bed.py options
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/assembly/hg19.fa' % os.environ['HG19'],
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option(
        '-l',
        dest='mut_len',
        default=200,
        type='int',
        help='Length of center sequence to mutate [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sat_mut',
                      help='Output directory [Default: %default]')
    parser.add_option('--plots',
                      dest='plots',
                      default=False,
                      action='store_true',
                      help='Make heatmap plots [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')

    # _multi.py options
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '-q',
        dest='queue',
        default='k80',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters and model files and BED file')
    else:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
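    # one worker per process index; each worker re-reads options.pkl and
    # handles its share of the BED sites (e.g. -p 8 launches sat_p0 .. sat_p7)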
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = 'source activate py3_gpu; basenji_sat_bed.py %s %s %d' % (
                options_pkl_file, ' '.join(args), pi)
            name = 'sat_p%d' % pi
            outf = '%s/job%d.out' % (options.out_dir, pi)
            errf = '%s/job%d.err' % (options.out_dir, pi)
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue=options.queue,
                          gpu=1,
                          mem=30000,
                          time='14-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    collect_h5(options.out_dir, options.processes)
Example No. 8
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
    parser = OptionParser(usage)

    # sad
    parser.add_option(
        '-c',
        dest='center_pct',
        default=0.25,
        type='float',
        help='Require clustered SNPs lie in center region [Default: %default]')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/data/human.hg19.genome' %
                      os.environ['BASENJIDIR'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option('--h5',
                      dest='out_h5',
                      default=False,
                      action='store_true',
                      help='Output stats to sad.h5 [Default: %default]')
    parser.add_option('--local',
                      dest='local',
                      default=1024,
                      type='int',
                      help='Local SAD score [Default: %default]')
    parser.add_option('-n',
                      dest='norm_file',
                      default=None,
                      help='Normalize SAD scores')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sad',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('--pseudo',
                      dest='log_pseudo',
                      default=1,
                      type='float',
                      help='Log2 pseudocount [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '--stats',
        dest='sad_stats',
        default='SAD',
        help='Comma-separated list of stats to save. [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        default=None,
        type='str',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '-u',
        dest='penultimate',
        default=False,
        action='store_true',
        help='Compute SED in the penultimate layer [Default: %default]')
    parser.add_option('-z',
                      dest='out_zarr',
                      default=False,
                      action='store_true',
                      help='Output stats to sad.zarr [Default: %default]')

    # multi
    parser.add_option('--name',
                      dest='name',
                      default='sad',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '-q',
        dest='queue',
        default='k80',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters and model files and VCF file')
    else:
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate tf1.13-gpu;'
            cmd += ' echo $HOSTNAME;'

            cmd += ' basenji_sad_ref.py %s %s %d' % (options_pkl_file,
                                                     ' '.join(args), pi)

            name = '%s_p%d' % (options.name, pi)
            outf = '%s/job%d.out' % (options.out_dir, pi)
            errf = '%s/job%d.err' % (options.out_dir, pi)

            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue=options.queue,
                          gpu=1,
                          mem=37000,
                          time='7-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output
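    # merge the per-process outputs into a single file, matching the requested
    # output format (--h5, -z zarr, or the default text table)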

    if options.out_h5:
        collect_h5('sad.h5', options.out_dir, options.processes)

    elif options.out_zarr:
        collect_zarr('sad.zarr', options.out_dir, options.processes)

    else:
        collect_table('sad_table.txt', options.out_dir, options.processes)
Example No. 9
0
def main():
  usage = ('usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>'
           ' <vcf_file>')
  parser = OptionParser(usage)
  parser.add_option(
      '-a',
      dest='all_sed',
      default=False,
      action='store_true',
      help=
      'Print all variant-gene pairs, as opposed to only nonzero [Default: %default]'
  )
  parser.add_option(
      '-b',
      dest='batch_size',
      default=None,
      type='int',
      help='Batch size [Default: %default]')
  parser.add_option(
      '-c',
      dest='csv',
      default=False,
      action='store_true',
      help='Print table as CSV [Default: %default]')
  parser.add_option(
      '-g',
      dest='genome_file',
      default='%s/data/human.hg19.genome' % os.environ['BASENJIDIR'],
      help='Chromosome lengths file [Default: %default]')
  parser.add_option(
      '-o',
      dest='out_dir',
      default='sed',
      help='Output directory for tables and plots [Default: %default]')
  parser.add_option(
      '-p',
      dest='processes',
      default=2,
      type='int',
      help='Number of parallel processes to run [Default: %default]')
  parser.add_option(
      '--pseudo',
      dest='log_pseudo',
      default=0.125,
      type='float',
      help='Log2 pseudocount [Default: %default]')
  parser.add_option(
      '-q',
      dest='queue',
      default='k80',
      help='SLURM queue on which to run the jobs [Default: %default]')
  parser.add_option(
      '-r',
      dest='tss_radius',
      default=0,
      type='int',
      help='Radius of bins considered to quantify TSS transcription [Default: %default]')
  parser.add_option(
      '--rc',
      dest='rc',
      default=False,
      action='store_true',
      help=
      'Average the forward and reverse complement predictions when testing [Default: %default]'
  )
  parser.add_option(
      '--shifts',
      dest='shifts',
      default='0',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option(
      '-t',
      dest='targets_file',
      default=None,
      help='File specifying target indexes and labels in table format.')
  parser.add_option(
      '--ti',
      dest='track_indexes',
      help='Comma-separated list of target indexes to output BigWig tracks')
  parser.add_option(
      '-u',
      dest='penultimate',
      default=False,
      action='store_true',
      help='Compute SED in the penultimate layer [Default: %default]')
  parser.add_option(
      '-x',
      dest='tss_table',
      default=False,
      action='store_true',
      help='Print TSS table in addition to gene [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 4:
    parser.error(
        'Must provide parameters and model files, genes HDF5 file, and QTL VCF'
        ' file')
  else:
    params_file = args[0]
    model_file = args[1]
    genes_hdf5_file = args[2]
    vcf_file = args[3]

  #######################################################
  # prep work

  # output directory
  if os.path.isdir(options.out_dir):
    shutil.rmtree(options.out_dir)
  os.mkdir(options.out_dir)

  # pickle options
  options_pkl_file = '%s/options.pkl' % options.out_dir
  options_pkl = open(options_pkl_file, 'wb')
  pickle.dump(options, options_pkl)
  options_pkl.close()

  #######################################################
  # launch worker threads
  jobs = []
  for pi in range(options.processes):
    cmd = 'source activate py3_gpu; basenji_sed.py %s %s %d' % (
        options_pkl_file, ' '.join(args), pi)
    name = 'sed_p%d' % pi
    outf = '%s/job%d.out' % (options.out_dir, pi)
    errf = '%s/job%d.err' % (options.out_dir, pi)
    j = slurm.Job(
        cmd,
        name,
        outf,
        errf,
        queue=options.queue,
        mem=30000,
        time='4:0:0',
        gpu=1)
    jobs.append(j)

  slurm.multi_run(jobs, max_proc=options.processes, verbose=True, sleep_time=60)

  #######################################################
  # collect output

  collect_table_multi('sed_gene.txt', options.out_dir, options.processes, options.log_pseudo)
  if options.tss_table:
    collect_table('sed_tss.txt', options.out_dir, options.processes)

  if options.track_indexes is not None:
    if not os.path.isdir('%s/tracks' % options.out_dir):
      os.mkdir('%s/tracks' % options.out_dir)

    for track_file in glob.glob('%s/job*/tracks/*' % options.out_dir):
      track_base = os.path.split(track_file)[1]
      os.rename(track_file, '%s/tracks/%s' % (options.out_dir, track_base))

  for pi in range(options.processes):
    shutil.rmtree('%s/job%d' % (options.out_dir, pi))
Example No. 10
0
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <bed_file>"
    parser = OptionParser(usage)

    # basenji_sat_bed.py options
    parser.add_option(
        "-f",
        dest="genome_fasta",
        default=None,
        help="Genome FASTA for sequences [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="mut_len",
        default=200,
        type="int",
        help="Length of center sequence to mutate [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="sat_mut",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "--plots",
        dest="plots",
        default=False,
        action="store_true",
        help="Make heatmap plots [Default: %default]",
    )
    parser.add_option(
        "--rc",
        dest="rc",
        default=False,
        action="store_true",
        help=
        "Ensemble forward and reverse complement predictions [Default: %default]",
    )
    parser.add_option(
        "--shifts",
        dest="shifts",
        default="0",
        help="Ensemble prediction shifts [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="targets_file",
        default=None,
        type="str",
        help="File specifying target indexes and labels in table format",
    )

    # _multi.py options
    parser.add_option(
        "-n",
        dest="name",
        default="sat",
        help="SLURM job name prefix [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number of processes, passed by multi script",
    )
    parser.add_option(
        "-q",
        dest="queue",
        default="k80",
        help="SLURM queue on which to run the jobs [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="restart",
        default=False,
        action="store_true",
        help="Restart a partially completed job [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print(args)
        parser.error("Must provide parameters and model files and BED file")
    else:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print("Please remove %s" % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = "%s/options.pkl" % options.out_dir
    options_pkl = open(options_pkl_file, "wb")
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = "source activate tf1.12-gpu; basenji_sat_bed.py %s %s %d" % (
                options_pkl_file,
                " ".join(args),
                pi,
            )
            name = "%s_p%d" % (options.name, pi)
            outf = "%s/job%d.out" % (options.out_dir, pi)
            errf = "%s/job%d.err" % (options.out_dir, pi)
            j = slurm.Job(
                cmd,
                name,
                outf,
                errf,
                queue=options.queue,
                gpu=1,
                mem=30000,
                time="14-0:0:0",
            )
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    collect_h5(options.out_dir, options.processes)
Example No. 11
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)

    # basenji_predict_bed.py options
    parser.add_option(
        '-b',
        dest='bigwig_indexes',
        default=None,
        help='Comma-separated list of target indexes to write BigWigs')
    parser.add_option('-e',
                      dest='embed_layer',
                      default=None,
                      type='int',
                      help='Embed sequences using the specified layer index.')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default=None,
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default=None,
                      help='Chromosome length information [Default: %default]')
    parser.add_option(
        '-l',
        dest='site_length',
        default=None,
        type='int',
        help='Prediction site length. [Default: params.seq_length]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='pred_out',
                      help='Output directory [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('-s',
                      dest='sum',
                      default=False,
                      action='store_true',
                      help='Sum site predictions [Default: %default]')
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')

    # _multi.py options
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print(args)
        parser.error('Must provide parameters and model files and BED file')
    else:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate tf1.15-gpu;'
            cmd += ' basenji_predict_bed.py %s %s %d' % (options_pkl_file,
                                                         ' '.join(args), pi)
            name = 'pred_p%d' % pi
            outf = '%s/job%d.out' % (options.out_dir, pi)
            errf = '%s/job%d.err' % (options.out_dir, pi)
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue=options.queue,
                          gpu=1,
                          mem=60000,
                          time='14-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    collect_h5(options.out_dir, options.processes)
Example No. 12
0
def main():
  usage = 'usage: %prog [options] <fasta_file> <targets_file>'
  parser = OptionParser(usage)
  parser.add_option('-b', dest='limit_bed',
      help='Limit to segments that overlap regions in a BED file')
  # parser.add_option('-c', dest='clip',
  #     default=None, type='float',
  #     help='Clip target values to have minimum [Default: %default]')
  parser.add_option('-d', dest='sample_pct',
      default=1.0, type='float',
      help='Down-sample the segments')
  parser.add_option('-g', dest='gaps_file',
      help='Genome assembly gaps BED [Default: %default]')
  parser.add_option('-l', dest='seq_length',
      default=131072, type='int',
      help='Sequence length [Default: %default]')
  parser.add_option('--local', dest='run_local',
      default=False, action='store_true',
      help='Run jobs locally as opposed to on SLURM [Default: %default]')
  parser.add_option('-o', dest='out_dir',
      default='data_out',
      help='Output directory [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number parallel processes [Default: %default]')
  parser.add_option('--seed', dest='seed',
      default=44, type='int',
      help='Random seed [Default: %default]')
  parser.add_option('--stride_train', dest='stride_train',
      default=1., type='float',
      help='Stride to advance train sequences [Default: seq_length]')
  parser.add_option('--stride_test', dest='stride_test',
      default=1., type='float',
      help='Stride to advance valid and test sequences [Default: seq_length]')
  parser.add_option('-r', dest='seqs_per_tfr',
      default=256, type='int',
      help='Sequences per TFRecord file [Default: %default]')
  parser.add_option('-t', dest='test_pct',
      default=0.05, type='float',
      help='Proportion of the data for testing [Default: %default]')
  parser.add_option('-u', dest='unmap_bed',
      help='Unmappable segments to set to NA')
  parser.add_option('--unmap_t', dest='unmap_t',
      default=0.3, type='float',
      help='Remove sequences with more than this unmappable bin % [Default: %default]')
  parser.add_option('-w', dest='pool_width',
      default=128, type='int',
      help='Sum pool width [Default: %default]')
  parser.add_option('-v', dest='valid_pct',
      default=0.05, type='float',
      help='Proportion of the data for validation [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 2:
    parser.error('Must provide FASTA and sample coverage labels and paths.')
  else:
    fasta_file = args[0]
    targets_file = args[1]

  random.seed(options.seed)
  np.random.seed(options.seed)

  if not os.path.isdir(options.out_dir):
    os.mkdir(options.out_dir)

  ################################################################
  # define genomic contigs
  ################################################################
  chrom_contigs = basenji.genome.load_chromosomes(fasta_file)

  # remove gaps
  if options.gaps_file:
    chrom_contigs = basenji.genome.split_contigs(chrom_contigs,
                                                 options.gaps_file)

  # ditch the chromosomes for contigs
  contigs = []
  for chrom in chrom_contigs:
    contigs += [Contig(chrom, ctg_start, ctg_end)
                 for ctg_start, ctg_end in chrom_contigs[chrom]]

  # limit to a BED file
  if options.limit_bed is not None:
    contigs = limit_contigs(contigs, options.limit_bed)

  # filter for large enough
  contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

  # down-sample
  if options.sample_pct < 1.0:
    contigs = random.sample(contigs, int(options.sample_pct*len(contigs)))

  # print contigs to BED file
  ctg_bed_file = '%s/contigs.bed' % options.out_dir
  write_seqs_bed(ctg_bed_file, contigs)


  ################################################################
  # divide between train/valid/test
  ################################################################
  contig_sets = divide_contigs(contigs, options.test_pct, options.valid_pct)
  train_contigs, valid_contigs, test_contigs = contig_sets

  ################################################################
  # define model sequences
  ################################################################
  # stride sequences across contig
  train_mseqs = contig_sequences(train_contigs, options.seq_length, options.stride_train)
  valid_mseqs = contig_sequences(valid_contigs, options.seq_length, options.stride_test)
  test_mseqs = contig_sequences(test_contigs, options.seq_length, options.stride_test)

  # shuffle
  random.shuffle(train_mseqs)
  random.shuffle(valid_mseqs)
  random.shuffle(test_mseqs)

  # merge
  mseqs = train_mseqs + valid_mseqs + test_mseqs
  mseqs_labels = ['train']*len(train_mseqs) + ['valid']*len(valid_mseqs) + ['test']*len(test_mseqs)


  ################################################################
  # mappability
  ################################################################
  if options.unmap_bed is not None:
    # annotate unmappable positions
    mseqs_unmap = annotate_unmap(mseqs, options.unmap_bed,
                                 options.seq_length, options.pool_width)

    # filter unmappable
    mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') < options.unmap_t)
    mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
    mseqs_labels = [mseqs_labels[i] for i in range(len(mseqs_labels)) if mseqs_map_mask[i]]
    mseqs_unmap = mseqs_unmap[mseqs_map_mask,:]

    # write to file
    unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
    np.save(unmap_npy, mseqs_unmap)

  # write sequences to BED
  seqs_bed_file = '%s/sequences.bed' % options.out_dir
  write_seqs_bed(seqs_bed_file, mseqs, mseqs_labels)


  ################################################################
  # read sequence coverage values
  ################################################################
  # read target datasets
  targets_df = pd.read_table(targets_file)

  seqs_cov_dir = '%s/seqs_cov' % options.out_dir
  if not os.path.isdir(seqs_cov_dir):
    os.mkdir(seqs_cov_dir)

  read_jobs = []
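  # one read job per target row; e.g. target 0 with the default pool_width 128
  # runs: basenji_data_read.py -w 128 <coverage_file> <out_dir>/sequences.bed
  #       <out_dir>/seqs_cov/0.h5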

  for ti in range(targets_df.shape[0]):
    genome_cov_file = targets_df['file'].iloc[ti]
    seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
    seqs_cov_file = '%s.h5' % seqs_cov_stem

    if os.path.isfile(seqs_cov_file):
      print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
    else:
      cmd = 'basenji_data_read.py'
      cmd += ' -w %d' % options.pool_width
      cmd += ' %s' % genome_cov_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_file

      if options.run_local:
        cmd += ' &> %s.err' % seqs_cov_stem
        read_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
            name='read_t%d' % ti,
            out_file='%s.out' % seqs_cov_stem,
            err_file='%s.err' % seqs_cov_stem,
            queue='standard,tbdisk', mem=15000, time='12:0:0')
        read_jobs.append(j)

  if options.run_local:
    util.exec_par(read_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(read_jobs, options.processes, verbose=True, sleep_time=1)

  ################################################################
  # write TF Records
  ################################################################
  tfr_dir = '%s/tfrecords' % options.out_dir
  if not os.path.isdir(tfr_dir):
    os.mkdir(tfr_dir)

  write_jobs = []
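  # chunk each split into TFRecord files of seqs_per_tfr sequences; e.g. 1000
  # train sequences with the default 256 per file yield train-0.tfr .. train-3.tfr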

  for tvt_set in ['train', 'valid', 'test']:
    tvt_set_indexes = [i for i in range(len(mseqs_labels)) if mseqs_labels[i] == tvt_set]
    tvt_set_start = tvt_set_indexes[0]
    tvt_set_end = tvt_set_indexes[-1]

    tfr_i = 0
    tfr_start = tvt_set_start
    tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end)

    while tfr_start <= tvt_set_end:
      tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

      cmd = 'basenji_data_write.py'
      cmd += ' -s %d' % tfr_start
      cmd += ' -e %d' % tfr_end
      if options.unmap_bed is not None:
        cmd += ' -u %s' % unmap_npy

      cmd += ' %s' % fasta_file
      cmd += ' %s' % seqs_bed_file
      cmd += ' %s' % seqs_cov_dir
      cmd += ' %s.tfr' % tfr_stem

      if options.run_local:
        cmd += ' &> %s.err' % tfr_stem
        write_jobs.append(cmd)
      else:
        j = slurm.Job(cmd,
              name='write_%s-%d' % (tvt_set, tfr_i),
              out_file='%s.out' % tfr_stem,
              err_file='%s.err' % tfr_stem,
              queue='standard,tbdisk', mem=15000, time='12:0:0')
        write_jobs.append(j)

      # update
      tfr_i += 1
      tfr_start += options.seqs_per_tfr
      tfr_end = min(tfr_start+options.seqs_per_tfr, tvt_set_end)

  if options.run_local:
    util.exec_par(write_jobs, options.processes, verbose=True)
  else:
    slurm.multi_run(write_jobs, options.processes, verbose=True, sleep_time=1)
Example No. 13
0
def main():
    usage = 'usage: %prog [options] <params_file> <data1_dir> ...'
    parser = OptionParser(usage)

    # train
    train_options = OptionGroup(parser, 'basenji_train.py options')
    train_options.add_option(
        '-k',
        dest='keras_fit',
        default=False,
        action='store_true',
        help='Train with Keras fit method [Default: %default]')
    train_options.add_option(
        '-o',
        dest='out_dir',
        default='train_out',
        help='Output directory for test statistics [Default: %default]')
    train_options.add_option(
        '--restore',
        dest='restore',
        help=
        'Restore model and continue training, from existing fold train dir [Default: %default]'
    )
    train_options.add_option(
        '--trunk',
        dest='trunk',
        default=False,
        action='store_true',
        help='Restore only model trunk [Default: %default]')
    train_options.add_option(
        '--tfr_train',
        dest='tfr_train_pattern',
        default=None,
        help=
        'Training TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    train_options.add_option(
        '--tfr_eval',
        dest='tfr_eval_pattern',
        default=None,
        help=
        'Evaluation TFR pattern string appended to data_dir/tfrecords for subsetting [Default: %default]'
    )
    parser.add_option_group(train_options)

    # test
    test_options = OptionGroup(parser, 'basenji_test.py options')
    test_options.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    test_options.add_option(
        '--shifts',
        dest='shifts',
        default='0',
        type='str',
        help='Ensemble prediction shifts [Default: %default]')
    parser.add_option_group(test_options)

    # multi
    rep_options = OptionGroup(parser, 'replication options')
    rep_options.add_option(
        '-c',
        dest='crosses',
        default=1,
        type='int',
        help='Number of cross-fold rounds [Default:%default]')
    rep_options.add_option('-e',
                           dest='conda_env',
                           default='tf2.4',
                           help='Anaconda environment [Default: %default]')
    rep_options.add_option('-f',
                           dest='fold_subset',
                           default=None,
                           type='int',
                           help='Run a subset of folds [Default:%default]')
    rep_options.add_option('--name',
                           dest='name',
                           default='fold',
                           help='SLURM name prefix [Default: %default]')
    rep_options.add_option('-p',
                           dest='processes',
                           default=None,
                           type='int',
                           help='Number of processes, passed by multi script')
    rep_options.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    rep_options.add_option('-r',
                           dest='restart',
                           default=False,
                           action='store_true')
    rep_options.add_option('--spec_off',
                           dest='spec_off',
                           default=False,
                           action='store_true')
    rep_options.add_option('--test_off',
                           dest='test_off',
                           default=False,
                           action='store_true')
    rep_options.add_option('--test_train_off',
                           dest='test_train_off',
                           default=False,
                           action='store_true')
    parser.add_option_group(rep_options)

    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide parameters and data directory.')
    else:
        params_file = os.path.abspath(args[0])
        data_dirs = [os.path.abspath(arg) for arg in args[1:]]

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_train = params['train']

    #######################################################
    # prep work

    if not options.restart and os.path.isdir(options.out_dir):
        print('Output directory %s exists. Please remove.' % options.out_dir)
        exit(1)
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # read data parameters
    num_data = len(data_dirs)
    data_stats_file = '%s/statistics.json' % data_dirs[0]
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # count folds
    num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

    # subset folds
    if options.fold_subset is not None:
        num_folds = min(options.fold_subset, num_folds)

    #######################################################
    # train

    jobs = []
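    # one training job per (fold, cross) pair; e.g. 8 folds with -c 2 launch
    # 16 jobs in replicate directories f0_c0 .. f7_c1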

    for ci in range(options.crosses):
        for fi in range(num_folds):
            rep_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)
            if options.restart and os.path.isdir(rep_dir):
                print('%s found and skipped.' % rep_dir)
            else:
                # make rep dir
                os.mkdir(rep_dir)

                # make rep data
                rep_data_dirs = []
                for di in range(num_data):
                    rep_data_dirs.append('%s/data%d' % (rep_dir, di))
                    make_rep_data(data_dirs[di], rep_data_dirs[-1], fi, ci)

                # train command
                cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                cmd += ' conda activate %s;' % options.conda_env
                cmd += ' echo $HOSTNAME;'

                cmd += ' basenji_train.py'
                cmd += ' %s' % options_string(options, train_options, rep_dir)
                cmd += ' %s %s' % (params_file, ' '.join(rep_data_dirs))

                name = '%s-train-f%dc%d' % (options.name, fi, ci)
                sbf = os.path.abspath('%s/train.sb' % rep_dir)
                outf = os.path.abspath('%s/train.out' % rep_dir)
                errf = os.path.abspath('%s/train.err' % rep_dir)

                j = slurm.Job(cmd,
                              name,
                              outf,
                              errf,
                              sbf,
                              queue=options.queue,
                              cpu=4,
                              gpu=params_train.get('num_gpu', 1),
                              mem=37000,
                              time='28-0:0:0')
                jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # test train

    jobs = []

    if not options.test_train_off:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

                for di in range(num_data):
                    if num_data == 1:
                        out_dir = '%s/test_train' % it_dir
                        model_file = '%s/train/model_check.h5' % it_dir
                    else:
                        out_dir = '%s/test%d_train' % (it_dir, di)
                        model_file = '%s/train/model%d_check.h5' % (it_dir, di)

                    # check if done
                    acc_file = '%s/acc.txt' % out_dir
                    if os.path.isfile(acc_file):
                        print('%s already generated.' % acc_file)
                    else:
                        # basenji test
                        basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                        basenji_cmd += ' conda activate %s;' % options.conda_env
                        basenji_cmd += ' basenji_test.py'
                        basenji_cmd += ' --head %d' % di
                        basenji_cmd += ' -o %s' % out_dir
                        if options.rc:
                            basenji_cmd += ' --rc'
                        if options.shifts:
                            basenji_cmd += ' --shifts %s' % options.shifts
                        basenji_cmd += ' --split train'
                        basenji_cmd += ' %s' % params_file
                        basenji_cmd += ' %s' % model_file
                        basenji_cmd += ' %s/data%d' % (it_dir, di)

                        name = '%s-testtr-f%dc%d' % (options.name, fi, ci)
                        basenji_job = slurm.Job(basenji_cmd,
                                                name=name,
                                                out_file='%s.out' % out_dir,
                                                err_file='%s.err' % out_dir,
                                                queue=options.queue,
                                                cpu=1,
                                                gpu=1,
                                                mem=23000,
                                                time='8:00:00')
                        jobs.append(basenji_job)

    #######################################################
    # test best

    if not options.test_off:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

                for di in range(num_data):
                    if num_data == 1:
                        out_dir = '%s/test' % it_dir
                        model_file = '%s/train/model_best.h5' % it_dir
                    else:
                        out_dir = '%s/test%d' % (it_dir, di)
                        model_file = '%s/train/model%d_best.h5' % (it_dir, di)

                    # check if done
                    acc_file = '%s/acc.txt' % out_dir
                    if os.path.isfile(acc_file):
                        print('%s already generated.' % acc_file)
                    else:
                        # basenji test
                        basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                        basenji_cmd += ' conda activate %s;' % options.conda_env
                        basenji_cmd += ' basenji_test.py'
                        basenji_cmd += ' --head %d' % di
                        basenji_cmd += ' -o %s' % out_dir
                        if options.rc:
                            basenji_cmd += ' --rc'
                        if options.shifts:
                            basenji_cmd += ' --shifts %s' % options.shifts
                        basenji_cmd += ' %s' % params_file
                        basenji_cmd += ' %s' % model_file
                        basenji_cmd += ' %s/data%d' % (it_dir, di)

                        name = '%s-test-f%dc%d' % (options.name, fi, ci)
                        basenji_job = slurm.Job(basenji_cmd,
                                                name=name,
                                                out_file='%s.out' % out_dir,
                                                err_file='%s.err' % out_dir,
                                                queue=options.queue,
                                                cpu=1,
                                                gpu=1,
                                                mem=23000,
                                                time='4:00:00')
                        jobs.append(basenji_job)

    #######################################################
    # test best specificity

    if not options.spec_off:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (options.out_dir, fi, ci)

                for di in range(num_data):
                    if num_data == 1:
                        out_dir = '%s/test_spec' % it_dir
                        model_file = '%s/train/model_best.h5' % it_dir
                    else:
                        out_dir = '%s/test%d_spec' % (it_dir, di)
                        model_file = '%s/train/model%d_best.h5' % (it_dir, di)

                    # check if done
                    acc_file = '%s/acc.txt' % out_dir
                    if os.path.isfile(acc_file):
                        print('%s already generated.' % acc_file)
                    else:
                        # basenji test
                        basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                        basenji_cmd += ' conda activate %s;' % options.conda_env
                        basenji_cmd += ' basenji_test_specificity.py'
                        basenji_cmd += ' --head %d' % di
                        basenji_cmd += ' -o %s' % out_dir
                        if options.rc:
                            basenji_cmd += ' --rc'
                        if options.shifts:
                            basenji_cmd += ' --shifts %s' % options.shifts
                        basenji_cmd += ' %s' % params_file
                        basenji_cmd += ' %s' % model_file
                        basenji_cmd += ' %s/data%d' % (it_dir, di)

                        name = '%s-spec-f%dc%d' % (options.name, fi, ci)
                        basenji_job = slurm.Job(basenji_cmd,
                                                name=name,
                                                out_file='%s.out' % out_dir,
                                                err_file='%s.err' % out_dir,
                                                queue=options.queue,
                                                cpu=1,
                                                gpu=1,
                                                mem=90000,
                                                time='6:00:00')
                        jobs.append(basenji_job)

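    # launch all accumulated test jobs, throttled to options.processes concurrent jobs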
    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)
Ejemplo n.º 14
0
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=8388608,
        type='int',
        help='Break in half contigs above length [Default: %default]')
    parser.add_option('--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option(
        '-d',
        dest='diagonal_offset',
        default=2,
        type='int',
        help='Positions on the diagonal to ignore [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option(
        '-k',
        dest='kernel_stddev',
        default=0,
        type='int',
        help='Gaussian kernel stddev to smooth values [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of parallel processes [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=128,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '--restart',
        dest='restart',
        default=False,
        action='store_true',
        help='Skip already read HDF5 coverage values. [Default: %default]')
    parser.add_option('--sample',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '--soft',
        dest='soft_clip',
        default=False,
        action='store_true',
        help=
        'Soft clip values, applying sqrt to the excess above the threshold [Default: %default]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for testing, or comma-separated test chromosomes [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_midpoints',
        dest='umap_midpoints',
        help='Regions with midpoints to exclude in BED format. Used for 4C/HiC.'
    )
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.3,
        type='float',
        help=
        'Remove sequences with more than this unmappable bin % [Default: %default]'
    )
    parser.add_option(
        '--umap_set',
        dest='umap_set',
        default=None,
        type='float',
        help=
        'Set unmappable regions to this percentile in the sequences\' distribution of values'
    )
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for validation, or comma-separated validation chromosomes [Default: %default]')
    parser.add_option(
        '--snap',
        dest='snap',
        default=None,
        type='int',
        help=
        'Snap stride to a multiple of this value in bp for binned targets; if set, seq_length and both strides must be multiples of snap'
    )
    parser.add_option('--as_obsexp',
                      dest='as_obsexp',
                      action="store_true",
                      default=False,
                      help='save targets as obsexp profiles')
    parser.add_option('--global_obsexp',
                      dest='global_obsexp',
                      action="store_true",
                      default=False,
                      help='use pre-calculated by-chromosome obs/exp')
    parser.add_option('--no_log',
                      dest='no_log',
                      action="store_true",
                      default=False,
                      help='do not take log for obs/exp')

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide a FASTA file and a targets file of sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print('stride_train %g' % options.stride_train, end='')
        options.stride_train = options.stride_train * options.seq_length
        print(' converted to %.0f' % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        print('stride_test %g' % options.stride_test, end='')
        options.stride_test = options.stride_test * options.seq_length
        print(' converted to %.0f' % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))
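    # e.g., a hypothetical stride_train of 0.5 with the default seq_length of
    # 131072 becomes a 65536 bp stride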

    if options.snap is not None:
        if np.mod(options.seq_length, options.snap) != 0:
            raise ValueError('seq_length must be a multiple of snap')
        if np.mod(options.stride_train, options.snap) != 0:
            raise ValueError('stride_train must be a multiple of snap')
        if np.mod(options.stride_test, options.snap) != 0:
            raise ValueError('stride_test must be a multiple of snap')

    if os.path.isdir(options.out_dir) and not options.restart:
        print('Remove output directory %s or use --restart option.' %
              options.out_dir)
        exit(1)
    elif not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # dump options
    with open('%s/options.json' % options.out_dir, 'w') as options_json_out:
        json.dump(options.__dict__, options_json_out, sort_keys=True, indent=4)

    ################################################################
    # define genomic contigs
    ################################################################
    chrom_contigs = genome.load_chromosomes(fasta_file)

    # remove gaps
    if options.gaps_file:
        chrom_contigs = genome.split_contigs(chrom_contigs, options.gaps_file)

    # ditch the chromosomes for contigs
    contigs = []
    for chrom in chrom_contigs:
        contigs += [
            Contig(chrom, ctg_start, ctg_end)
            for ctg_start, ctg_end in chrom_contigs[chrom]
        ]

    # limit to a BED file
    if options.limit_bed is not None:
        contigs = limit_contigs(contigs, options.limit_bed)

    # filter for large enough
    contigs = [
        ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
    ]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    ctg_bed_file = '%s/contigs.bed' % options.out_dir
    write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    try:
        # convert to float pct
        valid_pct = float(options.valid_pct_or_chr)
        test_pct = float(options.test_pct_or_chr)
        assert (0 <= valid_pct <= 1)
        assert (0 <= test_pct <= 1)

        # divide by pct
        contig_sets = divide_contigs_pct(contigs, test_pct, valid_pct)

    except (ValueError, AssertionError):
        # divide by chr
        valid_chrs = options.valid_pct_or_chr.split(',')
        test_chrs = options.test_pct_or_chr.split(',')
        contig_sets = divide_contigs_chr(contigs, test_chrs, valid_chrs)

    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################
    # stride sequences across contig
    train_mseqs = contig_sequences(train_contigs,
                                   options.seq_length,
                                   options.stride_train,
                                   options.snap,
                                   label='train')
    valid_mseqs = contig_sequences(valid_contigs,
                                   options.seq_length,
                                   options.stride_test,
                                   options.snap,
                                   label='valid')
    test_mseqs = contig_sequences(test_contigs,
                                  options.seq_length,
                                  options.stride_test,
                                  options.snap,
                                  label='test')

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(train_mseqs,
                                    int(options.sample_pct * len(train_mseqs)))
        valid_mseqs = random.sample(valid_mseqs,
                                    int(options.sample_pct * len(valid_mseqs)))
        test_mseqs = random.sample(test_mseqs,
                                   int(options.sample_pct * len(test_mseqs)))

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # mappability
    ################################################################
    if (options.umap_bed is not None) or (options.umap_midpoints is not None):
        if shutil.which('bedtools') is None:
            print('Install Bedtools to annotate unmappable sites',
                  file=sys.stderr)
            exit(1)

    if options.umap_bed is not None:
        # annotate unmappable positions
        mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                          options.umap_t)
        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    if options.umap_midpoints is not None:
        # annotate unmappable midpoints for 4C/HiC
        mseqs_unmap = annotate_unmap(mseqs, options.umap_midpoints,
                                     options.seq_length, options.pool_width)

        # filter unmappable
        # midpoint bin index, i.e. int(options.seq_length / options.pool_width / 2)
        seqmid = mseqs_unmap.shape[1] // 2
        mseqs_map_mask = (np.sum(mseqs_unmap[:, seqmid - 1:seqmid + 1],
                                 axis=1) == 0)
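        # keep only sequences whose two central bins (the 4C/Hi-C viewpoint) are fully mappable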

        mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
        mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

        # write to file
        unmap_npy = '%s/mseqs_unmap_midpoints.npy' % options.out_dir
        np.save(unmap_npy, mseqs_unmap)

    # write sequences to BED
    print('writing sequences to BED')
    seqs_bed_file = '%s/sequences.bed' % options.out_dir
    write_seqs_bed(seqs_bed_file, mseqs, True)

    ################################################################
    # read sequence coverage values
    ################################################################
    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        # scale_ti = 1
        # if 'scale' in targets_df.columns:
        #   scale_ti = targets_df['scale'].iloc[ti]

        if options.restart and os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = 'akita_data_read.py'
            cmd += ' --crop %d' % options.crop_bp
            cmd += ' -k %d' % options.kernel_stddev
            cmd += ' -w %d' % options.pool_width
            if clip_ti is not None:
                cmd += ' --clip %f' % clip_ti
            if options.soft_clip:
                cmd += ' --soft'
            # cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            if options.as_obsexp:
                cmd += ' --as_obsexp'
                if options.global_obsexp:
                    cmd += ' --global_obsexp'
                if options.no_log:
                    cmd += ' --no_log'
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for tvt_set in ['train', 'valid', 'test']:
        tvt_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == tvt_set
        ]
        tvt_set_start = tvt_set_indexes[0]
        tvt_set_end = tvt_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = tvt_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)
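        # write this split in chunks of at most seqs_per_tfr sequences per TFRecord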

        while tfr_start <= tvt_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, tvt_set, tfr_i)

            cmd = 'basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end

            # do not use
            # if options.umap_bed is not None:
            #   cmd += ' -u %s' % unmap_npy
            # if options.umap_set is not None:
            #   cmd += ' --umap_set %f' % options.umap_set

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (tvt_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, tvt_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # stats
    ################################################################
    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['train_seqs'] = len(train_mseqs)
    stats_dict['valid_seqs'] = len(valid_mseqs)
    stats_dict['test_seqs'] = len(test_mseqs)
    stats_dict['seq_length'] = options.seq_length
    stats_dict['pool_width'] = options.pool_width
    stats_dict['crop_bp'] = options.crop_bp
    stats_dict['diagonal_offset'] = options.diagonal_offset

    target1_length = options.seq_length - 2 * options.crop_bp
    target1_length = target1_length // options.pool_width
    target1_length = target1_length - options.diagonal_offset
    target_length = target1_length * (target1_length + 1) // 2
    stats_dict['target_length'] = target_length
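    # worked example with the defaults above: 131072 bp / 128 bp bins = 1024, minus
    # diagonal_offset 2 = 1022, and 1022 * 1023 // 2 = 522753 upper-triangular entries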

    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
Ejemplo n.º 15
0
def main():
    usage = 'usage: %prog [options] <exp_dir> <params_file> <data1_dir> ...'
    parser = OptionParser(usage)
    parser.add_option('-a',
                      '--alt',
                      dest='alternative',
                      default='two-sided',
                      help='Statistical test alternative [Default: %default]')
    parser.add_option('-c',
                      dest='crosses',
                      default=1,
                      type='int',
                      help='Number of cross-fold rounds [Default: %default]')
    parser.add_option('-d',
                      dest='dataset_i',
                      default=None,
                      type='int',
                      help='Dataset index [Default: %default]')
    parser.add_option('--d_ref',
                      dest='dataset_ref_i',
                      default=None,
                      type='int',
                      help='Reference dataset index [Default: %default]')
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2-gpu',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('-f',
                      dest='fold_subset',
                      default=None,
                      type='int',
                      help='Run a subset of folds [Default: %default]')
    parser.add_option('--label_exp',
                      dest='label_exp',
                      default='Experiment',
                      help='Experiment label [Default: %default]')
    parser.add_option('--label_ref',
                      dest='label_ref',
                      default='Reference',
                      help='Reference label [Default: %default]')
    parser.add_option('-m',
                      dest='metric',
                      default=None,
                      help='Train/test metric [Default: Pearsonr or AUPRC]')
    parser.add_option('--name',
                      dest='name',
                      default='test',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('-o',
                      dest='out_stem',
                      default=None,
                      help='Output plot stem [Default: %default]')
    parser.add_option('-q', dest='queue', default='gtx1080ti',
                      help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option('-r',
                      dest='ref_dir',
                      default=None,
                      help='Reference directory for statistical tests')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option('--spec',
                      dest='specificity',
                      default=False,
                      action='store_true',
                      help='Test specificity [Default: %default]')
    parser.add_option('--train',
                      dest='train',
                      default=False,
                      action='store_true',
                      help='Test on the training set, too [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) < 3:
        parser.error(
            'Must provide experiment directory, parameters file, and data directory(ies)')
        exp_dir = args[0]
        params_file = args[1]
        data_dirs = [os.path.abspath(arg) for arg in args[2:]]

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dirs[0]
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    if options.dataset_i is None:
        head_i = 0
    else:
        head_i = options.dataset_i

    # count folds
    num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

    # subset folds
    if options.fold_subset is not None:
        num_folds = min(options.fold_subset, num_folds)

    ################################################################
    # test check
    ################################################################
    jobs = []

    if options.train:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

                if options.dataset_i is None:
                    out_dir = '%s/test_train' % it_dir
                    model_file = '%s/train/model_check.h5' % it_dir
                else:
                    out_dir = '%s/test%d_train' % (it_dir, options.dataset_i)
                    model_file = '%s/train/model%d_check.h5' % (
                        it_dir, options.dataset_i)

                # check if done
                acc_file = '%s/acc.txt' % out_dir
                if os.path.isfile(acc_file):
                    # print('%s already generated.' % acc_file)
                    pass
                else:
                    cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                    cmd += ' conda activate %s;' % options.conda_env
                    cmd += ' basenji_test.py'
                    cmd += ' --head %d' % head_i
                    cmd += ' -o %s' % out_dir
                    if options.rc:
                        cmd += ' --rc'
                    if options.shifts:
                        cmd += ' --shifts %s' % options.shifts
                    cmd += ' --split train'
                    cmd += ' %s' % params_file
                    cmd += ' %s' % model_file
                    cmd += ' %s/data%d' % (it_dir, head_i)

                    name = '%s-testtr-f%dc%d' % (options.name, fi, ci)
                    j = slurm.Job(cmd,
                                  name=name,
                                  out_file='%s.out' % out_dir,
                                  err_file='%s.err' % out_dir,
                                  queue=options.queue,
                                  cpu=1,
                                  gpu=1,
                                  mem=23000,
                                  time='4:00:00')
                    jobs.append(j)

    ################################################################
    # test best
    ################################################################
    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

            if options.dataset_i is None:
                out_dir = '%s/test' % it_dir
                model_file = '%s/train/model_best.h5' % it_dir
            else:
                out_dir = '%s/test%d' % (it_dir, options.dataset_i)
                model_file = '%s/train/model%d_best.h5' % (it_dir,
                                                           options.dataset_i)

            # check if done
            acc_file = '%s/acc.txt' % out_dir
            if os.path.isfile(acc_file):
                # print('%s already generated.' % acc_file)
                pass
            else:
                # basenji test
                cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                cmd += ' conda activate %s;' % options.conda_env
                cmd += ' basenji_test.py'
                cmd += ' --head %d' % head_i
                cmd += ' -o %s' % out_dir
                if options.rc:
                    cmd += ' --rc'
                if options.shifts:
                    cmd += ' --shifts %s' % options.shifts
                cmd += ' %s' % params_file
                cmd += ' %s' % model_file
                cmd += ' %s/data%d' % (it_dir, head_i)

                name = '%s-test-f%dc%d' % (options.name, fi, ci)
                j = slurm.Job(cmd,
                              name=name,
                              out_file='%s.out' % out_dir,
                              err_file='%s.err' % out_dir,
                              queue=options.queue,
                              cpu=1,
                              gpu=1,
                              mem=23000,
                              time='4:00:00')
                jobs.append(j)

    ################################################################
    # test best specificity
    ################################################################
    if options.specificity:
        for ci in range(options.crosses):
            for fi in range(num_folds):
                it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

                if options.dataset_i is None:
                    out_dir = '%s/test_spec' % it_dir
                    model_file = '%s/train/model_best.h5' % it_dir
                else:
                    out_dir = '%s/test%d_spec' % (it_dir, options.dataset_i)
                    model_file = '%s/train/model%d_best.h5' % (
                        it_dir, options.dataset_i)

                # check if done
                acc_file = '%s/acc.txt' % out_dir
                if os.path.isfile(acc_file):
                    # print('%s already generated.' % acc_file)
                    pass
                else:
                    # basenji test
                    cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                    cmd += ' conda activate %s;' % options.conda_env
                    cmd += ' basenji_test_specificity.py'
                    cmd += ' --head %d' % head_i
                    cmd += ' -o %s' % out_dir
                    if options.rc:
                        cmd += ' --rc'
                    if options.shifts:
                        cmd += ' --shifts %s' % options.shifts
                    cmd += ' %s' % params_file
                    cmd += ' %s' % model_file
                    cmd += ' %s/data%d' % (it_dir, head_i)

                    name = '%s-spec-f%dc%d' % (options.name, fi, ci)
                    j = slurm.Job(cmd,
                                  name=name,
                                  out_file='%s.out' % out_dir,
                                  err_file='%s.err' % out_dir,
                                  queue=options.queue,
                                  cpu=1,
                                  gpu=1,
                                  mem=75000,
                                  time='6:00:00')
                    jobs.append(j)

    slurm.multi_run(jobs, verbose=True)

    if options.dataset_i is None:
        test_prefix = 'test'
    else:
        test_prefix = 'test%d' % options.dataset_i

    if options.dataset_ref_i is None:
        test_ref_prefix = 'test'
    else:
        test_ref_prefix = 'test%d' % options.dataset_ref_i

    # classification or regression
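    # infer the metric from the fold 0 / cross 0 acc.txt header: pearsonr for
    # regression heads, auprc for classification heads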
    if options.metric is None:
        with open('%s/f0_c0/%s/acc.txt' %
                  (exp_dir, test_prefix)) as test0_open:
            header = test0_open.readline().split()
            if 'pearsonr' in header:
                options.metric = 'pearsonr'
            else:
                options.metric = 'auprc'

    ################################################################
    # compare checkpoint on training set
    ################################################################
    if options.train:
        exp_glob_str = '%s/*/%s_train/acc.txt' % (exp_dir, test_prefix)
        exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str,
                                                    options.metric)

        if options.ref_dir is not None:
            ref_glob_str = '%s/*/%s_train/acc.txt' % (options.ref_dir,
                                                      test_ref_prefix)
            ref_cors, ref_mean, ref_stdm = read_metrics(
                ref_glob_str, options.metric)
            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

        print('\nTrain:')
        print('%12s %s: %.4f (%.4f)' %
              (options.label_exp, options.metric, exp_mean, exp_stdm))
        if options.ref_dir is not None:
            print('%12s %s: %.4f (%.4f)' %
                  (options.label_ref, options.metric, ref_mean, ref_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)

            if options.out_stem is not None:
                jointplot(ref_cors, exp_cors,
                          '%s_train.pdf' % options.out_stem, options.label_ref,
                          options.label_exp)

    ################################################################
    # compare best on test set
    ################################################################
    exp_glob_str = '%s/*/%s/acc.txt' % (exp_dir, test_prefix)
    exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str, options.metric)

    if options.ref_dir is not None:
        ref_glob_str = '%s/*/%s/acc.txt' % (options.ref_dir, test_ref_prefix)
        ref_cors, ref_mean, ref_stdm = read_metrics(ref_glob_str,
                                                    options.metric)

        mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

    print('\nTest:')
    print('%12s %s: %.4f (%.4f)' %
          (options.label_exp, options.metric, exp_mean, exp_stdm))
    if options.ref_dir is not None:
        print('%12s %s: %.4f (%.4f)' %
              (options.label_ref, options.metric, ref_mean, ref_stdm))
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)

        if options.out_stem is not None:
            jointplot(ref_cors, exp_cors, '%s_test.pdf' % options.out_stem,
                      options.label_ref, options.label_exp)

    ################################################################
    # compare best on test set specificity
    ################################################################
    if options.specificity:
        exp_glob_str = '%s/*/%s_spec/acc.txt' % (exp_dir, test_prefix)
        exp_cors, exp_mean, exp_stdm = read_metrics(exp_glob_str,
                                                    options.metric)

        if options.ref_dir is not None:
            ref_glob_str = '%s/*/%s_spec/acc.txt' % (options.ref_dir,
                                                     test_ref_prefix)
            ref_cors, ref_mean, ref_stdm = read_metrics(
                ref_glob_str, options.metric)

            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

        print('\nSpecificity:')
        print('%12s %s: %.4f (%.4f)' %
              (options.label_exp, options.metric, exp_mean, exp_stdm))
        if options.ref_dir is not None:
            print('%12s %s: %.4f (%.4f)' %
                  (options.label_ref, options.metric, ref_mean, ref_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)

            if options.out_stem is not None:
                jointplot(ref_cors, exp_cors, '%s_spec.pdf' % options.out_stem,
                          options.label_ref, options.label_exp)
Ejemplo n.º 16
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='batch_size',
                      default=256,
                      type='int',
                      help='Batch size [Default: %default]')
    parser.add_option('-c',
                      dest='csv',
                      default=False,
                      action='store_true',
                      help='Print table as CSV [Default: %default]')
    parser.add_option(
        '-e',
        dest='heatmaps',
        default=False,
        action='store_true',
        help='Draw score heatmaps, grouped by index SNP [Default: %default]')
    parser.add_option(
        '-f',
        dest='genome_fasta',
        default='%s/assembly/hg19.fa' % os.environ['HG19'],
        help=
        'Genome FASTA from which sequences will be drawn [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option(
        '-l',
        dest='seq_len',
        type='int',
        default=131072,
        help='Sequence length provided to the model [Default: %default]')
    parser.add_option('--local',
                      dest='local',
                      default=1024,
                      type='int',
                      help='Local SAD score [Default: %default]')
    parser.add_option('-n',
                      dest='norm_file',
                      default=None,
                      help='Normalize SAD scores')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sad',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=2,
                      type='int',
                      help='Number of parallel processes to run.')
    parser.add_option('--pseudo',
                      dest='log_pseudo',
                      default=1,
                      type='float',
                      help='Log2 pseudocount [Default: %default]')
    parser.add_option(
        '-q',
        dest='queue',
        default='p100',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average the forward and reverse complement predictions when testing [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '-u',
        dest='penultimate',
        default=False,
        action='store_true',
        help='Compute SED in the penultimate layer [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters and model files and VCF file')
    else:
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    #######################################################
    # prep work

    # output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()
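    # worker jobs re-load these pickled options; the trailing index tells each one
    # which slice of the variants to process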

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        cmd = 'source activate py3_gpu; basenji_sad.py %s %s %d' % (
            options_pkl_file, ' '.join(args), pi)
        name = 'sad_p%d' % pi
        outf = '%s/job%d.out' % (options.out_dir, pi)
        errf = '%s/job%d.err' % (options.out_dir, pi)
        j = slurm.Job(cmd,
                      name,
                      outf,
                      errf,
                      queue=options.queue,
                      mem=15000,
                      time='7-0:0:0',
                      gpu=1)
        jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    sleep_time=60)

    #######################################################
    # collect output

    collect_table('sad_table.txt', options.out_dir, options.processes)
Ejemplo n.º 17
0
  def test_train(self):
    exp_dir = 'train_full/exp'
    if os.path.isdir(exp_dir):
      shutil.rmtree(exp_dir)
    os.mkdir(exp_dir)

    ################################################################
    # train
    ################################################################
    jobs = []
    for i in range(self.iterations):
      it_dir = '%s/%d' % (exp_dir, i)
      os.mkdir(it_dir)

      # basenji train
      basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
      basenji_cmd += ' conda activate %s;' % self.conda_env
      basenji_cmd += ' %s/basenji_train.py' % self.basenji_path
      basenji_cmd += ' -o %s/train' % it_dir
      basenji_cmd += ' %s' % self.params_file
      basenji_cmd += ' %s' % self.data_dir

      basenji_job = slurm.Job(basenji_cmd,
                      name='train%d' % i,
                      out_file='%s/train.out'%it_dir,
                      err_file='%s/train.err'%it_dir,
                      queue=self.queue,
                      cpu=1,
                      gpu=1,
                      mem=23000,
                      time='12-00:00:00')
      jobs.append(basenji_job)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # test check
    ################################################################
    jobs = []
    for i in range(self.iterations):
      it_dir = '%s/%d' % (exp_dir, i)

      # basenji test
      basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
      basenji_cmd += ' conda activate %s;' % self.conda_env
      basenji_cmd += ' %s/basenji_test.py' % self.basenji_path
      basenji_cmd += ' -o %s/test_train' % it_dir
      basenji_cmd += ' --tfr "train-*.tfr"'
      basenji_cmd += ' %s' % self.params_file
      basenji_cmd += ' %s/train/model_check.h5' % it_dir
      basenji_cmd += ' %s' % self.data_dir

      basenji_job = slurm.Job(basenji_cmd,
                      name='test%d' % i,
                      out_file='%s/test_train.out'%it_dir,
                      err_file='%s/test_train.err'%it_dir,
                      queue=self.queue,
                      cpu=1,
                      gpu=1,
                      mem=23000,
                      time='4:00:00')
      jobs.append(basenji_job)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # test best
    ################################################################
    jobs = []
    for i in range(self.iterations):
      it_dir = '%s/%d' % (exp_dir, i)

      # basenji test
      basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
      basenji_cmd += ' conda activate %s;' % self.conda_env
      basenji_cmd += ' %s/basenji_test.py' % self.basenji_path
      basenji_cmd += ' -o %s/test' % it_dir
      basenji_cmd += ' %s' % self.params_file
      basenji_cmd += ' %s/train/model_best.h5' % it_dir
      basenji_cmd += ' %s' % self.data_dir

      basenji_job = slurm.Job(basenji_cmd,
                      name='test%d' % i,
                      out_file='%s/test.out'%it_dir,
                      err_file='%s/test.err'%it_dir,
                      queue=self.queue,
                      cpu=1,
                      gpu=1,
                      mem=23000,
                      time='4:00:00')
      jobs.append(basenji_job)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # compare checkpoint on training set
    ################################################################
    ref_cors = []
    for acc_file in glob.glob('%s/*/test_train/acc.txt' % self.ref_dir):
      acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
      ref_cors.append(acc_df.pearsonr.mean())

    exp_cors = []
    for acc_file in glob.glob('%s/*/test_train/acc.txt' % exp_dir):
      acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
      exp_cors.append(acc_df.pearsonr.mean())

    _, mwp = mannwhitneyu(ref_cors, exp_cors, alternative='two-sided')
    _, tp = ttest_ind(ref_cors, exp_cors)
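    # compare reference vs. experiment fold correlations with a non-parametric
    # (Mann-Whitney U) and a parametric (t) test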
    print('\nTrain:')
    print('Reference  PearsonR: %.4f (%.4f)' % (np.mean(ref_cors), np.std(ref_cors)))
    print('Experiment PearsonR: %.4f (%.4f)' % (np.mean(exp_cors), np.std(exp_cors)))
    print('Mann-Whitney U p-value: %.3g' % mwp)
    print('T-test p-value: %.3g' % tp)

    # self.assertGreater(mwp, 0.05)
    # self.assertGreater(tp, 0.05)
    
    ################################################################
    # compare best on test set
    ################################################################
    ref_cors = []
    for acc_file in glob.glob('%s/*/test/acc.txt' % self.ref_dir):
      acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
      ref_cors.append(acc_df.pearsonr.mean())

    exp_cors = []
    for acc_file in glob.glob('%s/*/test/acc.txt' % exp_dir):
      acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
      exp_cors.append(acc_df.pearsonr.mean())

    _, mwp = mannwhitneyu(ref_cors, exp_cors, alternative='two-sided')
    _, tp = ttest_ind(ref_cors, exp_cors)
    print('\nTest:')
    print('Reference  PearsonR: %.4f (%.4f)' % (np.mean(ref_cors), np.std(ref_cors)))
    print('Experiment PearsonR: %.4f (%.4f)' % (np.mean(exp_cors), np.std(exp_cors)))
    print('Mann-Whitney U p-value: %.3g' % mwp)
    print('T-test p-value: %.3g' % tp)
Ejemplo n.º 18
0
def main():
    usage = 'usage: %prog [options] <model> <vcf_file>'
    parser = OptionParser(usage)

    # sad
    parser.add_option('-b',
                      dest='batch_size',
                      default=4,
                      type='int',
                      help='Batch size [Default: %default]')
    parser.add_option('-c',
                      dest='slice_center',
                      default=None,
                      type='int',
                      help='Slice center positions [Default: %default]')
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sad',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('--pseudo',
                      dest='log_pseudo',
                      default=1,
                      type='float',
                      help='Log2 pseudocount [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option('--species', dest='species', default='human')
    parser.add_option(
        '--stats',
        dest='sad_stats',
        default='SAD',
        help='Comma-separated list of stats to save. [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')

    # multi
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2.2-gpu',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('--name',
                      dest='name',
                      default='sad',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('--max_proc',
                      dest='max_proc',
                      default=None,
                      type='int',
                      help='Maximum concurrent processes [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide model and VCF file')
    else:
        model_file = args[0]
        vcf_file = args[1]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()
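    # workers re-read these options from the pickle; with -r/--restart, only
    # shards whose output is missing are re-launched (see job_completed below)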

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env

            cmd += ' sonnet_sad.py %s %s %d' % (options_pkl_file,
                                                ' '.join(args), pi)

            name = '%s_p%d' % (options.name, pi)
            outf = '%s/job%d.out' % (options.out_dir, pi)
            errf = '%s/job%d.err' % (options.out_dir, pi)

            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue=options.queue,
                          gpu=1,
                          mem=22000,
                          time='14-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.max_proc,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    collect_h5('sad.h5', options.out_dir, options.processes)
Ejemplo n.º 19
0
def main():
    usage = "usage: %prog [options] <fasta0_file,fasta1_file> <targets_file>"
    parser = OptionParser(usage)
    parser.add_option("-a", dest="align_net", help="Alignment .net file")
    parser.add_option(
        "-b",
        dest="blacklist_beds",
        help="Set blacklist nucleotides to a baseline value.",
    )
    parser.add_option(
        "--break",
        dest="break_t",
        default=None,
        type="int",
        help="Break in half contigs above length [Default: %default]",
    )
    # parser.add_option('-c', dest='clip',
    #     default=None, type='float',
    #     help='Clip target values to have minimum [Default: %default]')
    parser.add_option(
        "-d",
        dest="sample_pct",
        default=1.0,
        type="float",
        help="Down-sample the segments",
    )
    parser.add_option(
        "-f",
        dest="fill_min",
        default=100000,
        type="int",
        help="Alignment net fill size minimum [Default: %default]",
    )
    parser.add_option(
        "-g",
        dest="gap_files",
        help="Comma-separated list of assembly gaps BED files [Default: %default]",
    )
    parser.add_option(
        "-l",
        dest="seq_length",
        default=131072,
        type="int",
        help="Sequence length [Default: %default]",
    )
    parser.add_option(
        "--local",
        dest="run_local",
        default=False,
        action="store_true",
        help="Run jobs locally as opposed to on SLURM [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="data_out",
        help="Output directory [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number parallel processes [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="seqs_per_tfr",
        default=256,
        type="int",
        help="Sequences per TFRecord file [Default: %default]",
    )
    parser.add_option(
        "--seed",
        dest="seed",
        default=44,
        type="int",
        help="Random seed [Default: %default]",
    )
    parser.add_option(
        "--stride_train",
        dest="stride_train",
        default=1.0,
        type="float",
        help="Stride to advance train sequences [Default: %default]",
    )
    parser.add_option(
        "--stride_test",
        dest="stride_test",
        default=1.0,
        type="float",
        help="Stride to advance valid and test sequences [Default: %default]",
    )
    parser.add_option(
        "--soft",
        dest="soft_clip",
        default=False,
        action="store_true",
        help="Soft clip values, applying sqrt to the execess above the threshold [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="test_pct",
        default=0.1,
        type="float",
        help="Proportion of the data for testing [Default: %default]",
    )
    parser.add_option(
        "-u",
        dest="umap_beds",
        help="Comma-separated genome unmappable segments to set to NA",
    )
    parser.add_option(
        "--umap_t",
        dest="umap_t",
        default=0.5,
        type="float",
        help="Remove sequences with more than this unmappable bin % [Default: %default]",
    )
    parser.add_option(
        "--umap_set",
        dest="umap_set",
        default=None,
        type="float",
        help="Set unmappable regions to this percentile in the sequences' distribution of values",
    )
    parser.add_option(
        "-w",
        dest="pool_width",
        default=128,
        type="int",
        help="Sum pool width [Default: %default]",
    )
    parser.add_option(
        "-v",
        dest="valid_pct",
        default=0.1,
        type="float",
        help="Proportion of the data for validation [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            "Must provide FASTA and sample coverage label and path files for two genomes."
        )
    else:
        fasta_files = args[0].split(",")
        targets_file = args[1]

    # there is still some source of stochasticity
    random.seed(options.seed)
    np.random.seed(options.seed)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print("stride_train %g" % options.stride_train, end="")
        options.stride_train = options.stride_train * options.seq_length
        print(" converted to %.0f" % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        print("stride_test %g" % options.stride_test, end="")
        options.stride_test = options.stride_test * options.seq_length
        print(" converted to %.0f" % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.gap_files is not None:
        options.gap_files = options.gap_files.split(",")

    if options.blacklist_beds is not None:
        options.blacklist_beds = options.blacklist_beds.split(",")

    # read targets
    targets_df = pd.read_table(targets_file, index_col=0)

    # verify genomes
    num_genomes = len(fasta_files)
    assert len(set(targets_df.genome)) == num_genomes
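    # sanity check: the targets table must reference exactly num_genomes distinct genomes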

    ################################################################
    # define genomic contigs
    ################################################################
    genome_chr_contigs = []
    for gi in range(num_genomes):
        genome_chr_contigs.append(genome.load_chromosomes(fasta_files[gi]))

        # remove gaps
        if options.gap_files is not None and options.gap_files[gi]:
            genome_chr_contigs[gi] = genome.split_contigs(
                genome_chr_contigs[gi], options.gap_files[gi]
            )

    # ditch the chromosomes
    contigs = []
    for gi in range(num_genomes):
        for chrom in genome_chr_contigs[gi]:
            contigs += [
                Contig(gi, chrom, ctg_start, ctg_end)
                for ctg_start, ctg_end in genome_chr_contigs[gi][chrom]
            ]

    # filter for large enough
    contigs = [ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length]

    # break up large contigs
    if options.break_t is not None:
        contigs = break_large_contigs(contigs, options.break_t)

    # print contigs to BED file
    for gi in range(num_genomes):
        contigs_i = [ctg for ctg in contigs if ctg.genome == gi]
        ctg_bed_file = "%s/contigs%d.bed" % (options.out_dir, gi)
        write_seqs_bed(ctg_bed_file, contigs_i)

    ################################################################
    # divide between train/valid/test
    ################################################################

    # connect contigs across genomes by alignment
    contig_components = connect_contigs(
        contigs, options.align_net, options.fill_min, options.out_dir
    )

    # divide contig connected components between train/valid/test
    contig_sets = divide_contig_components(
        contig_components, options.test_pct, options.valid_pct
    )
    train_contigs, valid_contigs, test_contigs = contig_sets

    # rejoin broken contigs within set
    train_contigs = rejoin_large_contigs(train_contigs)
    valid_contigs = rejoin_large_contigs(valid_contigs)
    test_contigs = rejoin_large_contigs(test_contigs)

    ################################################################
    # define model sequences
    ################################################################

    # stride sequences across contig
    train_mseqs = contig_sequences(
        train_contigs, options.seq_length, options.stride_train, label="train"
    )
    valid_mseqs = contig_sequences(
        valid_contigs, options.seq_length, options.stride_test, label="valid"
    )
    test_mseqs = contig_sequences(
        test_contigs, options.seq_length, options.stride_test, label="test"
    )

    # shuffle
    random.shuffle(train_mseqs)
    random.shuffle(valid_mseqs)
    random.shuffle(test_mseqs)

    # down-sample
    if options.sample_pct < 1.0:
        train_mseqs = random.sample(
            train_mseqs, int(options.sample_pct * len(train_mseqs))
        )
        valid_mseqs = random.sample(
            valid_mseqs, int(options.sample_pct * len(valid_mseqs))
        )
        test_mseqs = random.sample(
            test_mseqs, int(options.sample_pct * len(test_mseqs))
        )

    # merge
    mseqs = train_mseqs + valid_mseqs + test_mseqs

    ################################################################
    # separate sequences by genome
    ################################################################
    mseqs_genome = []
    for gi in range(num_genomes):
        mseqs_gi = [mseqs[si] for si in range(len(mseqs)) if mseqs[si].genome == gi]
        mseqs_genome.append(mseqs_gi)

    ################################################################
    # mappability
    ################################################################

    if options.umap_beds is None:
        options.umap_beds = [None] * num_genomes
    else:
        options.umap_beds = [ub if ub else None for ub in options.umap_beds.split(",")]
    unmap_npys = [None] * num_genomes

    for gi in range(num_genomes):
        if options.umap_beds[gi] is not None:
            # annotate unmappable positions
            mseqs_unmap = annotate_unmap(
                mseqs_genome[gi],
                options.umap_beds[gi],
                options.seq_length,
                options.pool_width,
            )

            # filter unmappable
            mseqs_map_mask = mseqs_unmap.mean(axis=1, dtype="float64") < options.umap_t
            mseqs_genome[gi] = [
                mseqs_genome[gi][si]
                for si in range(len(mseqs_genome[gi]))
                if mseqs_map_mask[si]
            ]
            mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

            # write to file
            unmap_npys[gi] = "%s/mseqs%d_unmap.npy" % (options.out_dir, gi)
            np.save(unmap_npys[gi], mseqs_unmap)

    seqs_bed_files = []
    for gi in range(num_genomes):
        # write sequences to BED
        seqs_bed_files.append("%s/sequences%d.bed" % (options.out_dir, gi))
        write_seqs_bed(seqs_bed_files[gi], mseqs_genome[gi], True)

    ################################################################
    # read sequence coverage values
    ################################################################
    seqs_cov_dir = "%s/seqs_cov" % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []
    for gi in range(num_genomes):
        read_jobs += make_read_jobs(
            seqs_bed_files[gi], targets_df, gi, seqs_cov_dir, options
        )

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            read_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )

    ################################################################
    # write TF Records
    ################################################################

    tfr_dir = "%s/tfrecords" % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    # set genome target index starts
    sum_targets = 0
    genome_targets_start = []
    for gi in range(num_genomes):
        genome_targets_start.append(sum_targets)
        targets_df_gi = targets_df[targets_df.genome == gi]
        sum_targets += targets_df_gi.shape[0]

    write_jobs = []
    for gi in range(num_genomes):
        write_jobs += make_write_jobs(
            mseqs_genome[gi],
            fasta_files[gi],
            seqs_bed_files[gi],
            seqs_cov_dir,
            tfr_dir,
            gi,
            unmap_npys[gi],
            genome_targets_start[gi],
            sum_targets,
            options,
        )

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(
            write_jobs, options.processes, verbose=True, launch_sleep=1, update_sleep=5
        )
Ejemplo n.º 20
0
def main():
  usage = 'usage: %prog [options] <params_file> <model_file> <vcf_file>'
  parser = OptionParser(usage)

  # scd
  parser.add_option('-f', dest='genome_fasta',
      default='%s/data/hg19.fa' % os.environ['BASENJIDIR'],
      help='Genome FASTA for sequences [Default: %default]')
  parser.add_option('-m', dest='plot_map',
      default=False, action='store_true',
      help='Plot contact map for each allele [Default: %default]')
  parser.add_option('-o',dest='out_dir',
      default='scd',
      help='Output directory for tables and plots [Default: %default]')
  parser.add_option('--rc', dest='rc',
      default=False, action='store_true',
      help='Average forward and reverse complement predictions [Default: %default]')
  parser.add_option('--shifts', dest='shifts',
      default='0', type='str',
      help='Ensemble prediction shifts [Default: %default]')
  parser.add_option('--stats', dest='scd_stats',
      default='SCD',
      help='Comma-separated list of stats to save. [Default: %default]')
  parser.add_option('-t', dest='targets_file',
      default=None, type='str',
      help='File specifying target indexes and labels in table format')

  # multi
  parser.add_option('--cpu', dest='cpu',
      default=False, action='store_true',
      help='Run without a GPU [Default: %default]')
  parser.add_option('--name', dest='name',
      default='scd', help='SLURM name prefix [Default: %default]')
  parser.add_option('--max_proc', dest='max_proc',
      default=None, type='int',
      help='Maximum concurrent processes [Default: %default]')
  parser.add_option('-p', dest='processes',
      default=None, type='int',
      help='Number of processes, passed by multi script')
  parser.add_option('-q', dest='queue',
      default='gtx1080ti',
      help='SLURM queue on which to run the jobs [Default: %default]')
  parser.add_option('-r', dest='restart',
      default=False, action='store_true',
      help='Restart a partially completed job [Default: %default]')
  (options, args) = parser.parse_args()

  if len(args) != 3:
    parser.error('Must provide parameters and model files and VCF file')
  else:
    params_file = args[0]
    model_file = args[1]
    vcf_file = args[2]

  #######################################################
  # prep work

  # output directory
  if not options.restart:
    if os.path.isdir(options.out_dir):
      print('Please remove %s' % options.out_dir, file=sys.stderr)
      exit(1)
    os.mkdir(options.out_dir)

  # pickle options
  options_pkl_file = '%s/options.pkl' % options.out_dir
  options_pkl = open(options_pkl_file, 'wb')
  pickle.dump(options, options_pkl)
  options_pkl.close()

  #######################################################
  # launch worker threads
  jobs = []
  for pi in range(options.processes):
    if not options.restart or not job_completed(options, pi):
      if options.cpu:
        cmd = ''
      else:
        cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
        cmd += 'conda activate tf1.15-gpu;'

      cmd += ' akita_scd.py %s %s %d' % (
          options_pkl_file, ' '.join(args), pi)

      name = '%s_p%d' % (options.name, pi)
      outf = '%s/job%d.out' % (options.out_dir, pi)
      errf = '%s/job%d.err' % (options.out_dir, pi)

      num_gpu = 1*(not options.cpu)

      j = slurm.Job(cmd, name,
          outf, errf,
          queue=options.queue, gpu=num_gpu,
          mem=15000, time='14-0:0:0')
      jobs.append(j)

  slurm.multi_run(jobs, max_proc=options.max_proc, verbose=True,
                  launch_sleep=10, update_sleep=60)

  #######################################################
  # collect output

  collect_h5('scd.h5', options.out_dir, options.processes)
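# The worker side of this launcher is not shown. A rough, illustrative sketch of how a
# worker invoked as 'worker.py <options.pkl> <params> <model> <vcf> <process_index>'
# might recover the pickled options and its shard index (names here are hypothetical,
# not the actual akita_scd.py code):
import pickle
import sys

def worker_main():
  options_pkl_file = sys.argv[1]
  params_file, model_file, vcf_file = sys.argv[2:5]
  worker_index = int(sys.argv[5])

  # reload the options pickled by the launcher
  with open(options_pkl_file, 'rb') as options_pkl:
    options = pickle.load(options_pkl)

  # each worker would then score only its slice of the variants, e.g.
  # variants[worker_index::options.processes], and write under out_dir/job<worker_index>
  print('worker %d writing under %s/job%d' % (worker_index, options.out_dir, worker_index))

if __name__ == '__main__':
  worker_main()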
Ejemplo n.º 21
0
def main():
    usage = (
        "usage: %prog [options] <params_file> <model_file> <genes_hdf5_file>"
        " <vcf_file>")
    parser = OptionParser(usage)
    parser.add_option(
        "-a",
        dest="all_sed",
        default=False,
        action="store_true",
        help=
        "Print all variant-gene pairs, as opposed to only nonzero [Default: %default]",
    )
    parser.add_option(
        "-b",
        dest="batch_size",
        default=None,
        type="int",
        help="Batch size [Default: %default]",
    )
    parser.add_option(
        "-c",
        dest="csv",
        default=False,
        action="store_true",
        help="Print table as CSV [Default: %default]",
    )
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"],
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option(
        "-o",
        dest="out_dir",
        default="sed",
        help="Output directory for tables and plots [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=2,
        type="int",
        help="Number of parallel processes to run [Default: %default]",
    )
    parser.add_option(
        "--pseudo",
        dest="log_pseudo",
        default=0.125,
        type="float",
        help="Log2 pseudocount [Default: %default]",
    )
    parser.add_option(
        "-q",
        dest="queue",
        default="k80",
        help="SLURM queue on which to run the jobs [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="tss_radius",
        default=0,
        type="int",
        help=
        "Radius of bins considered to quantify TSS transcription [Default: %default]",
    )
    parser.add_option(
        "--rc",
        dest="rc",
        default=False,
        action="store_true",
        help=
        "Average the forward and reverse complement predictions when testing [Default: %default]",
    )
    parser.add_option(
        "--shifts",
        dest="shifts",
        default="0",
        help="Ensemble prediction shifts [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="targets_file",
        default=None,
        help="File specifying target indexes and labels in table format.",
    )
    parser.add_option(
        "--ti",
        dest="track_indexes",
        help="Comma-separated list of target indexes to output BigWig tracks",
    )
    parser.add_option(
        "-u",
        dest="penultimate",
        default=False,
        action="store_true",
        help="Compute SED in the penultimate layer [Default: %default]",
    )
    parser.add_option(
        "-x",
        dest="tss_table",
        default=False,
        action="store_true",
        help="Print TSS table in addition to gene [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error(
            "Must provide parameters and model files, genes HDF5 file, and QTL VCF"
            " file")
    else:
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]
        vcf_file = args[3]

    #######################################################
    # prep work

    # output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = "%s/options.pkl" % options.out_dir
    options_pkl = open(options_pkl_file, "wb")
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        cmd = "source activate py3_gpu; basenji_sed.py %s %s %d" % (
            options_pkl_file,
            " ".join(args),
            pi,
        )
        name = "sed_p%d" % pi
        outf = "%s/job%d.out" % (options.out_dir, pi)
        errf = "%s/job%d.err" % (options.out_dir, pi)
        j = slurm.Job(cmd,
                      name,
                      outf,
                      errf,
                      queue=options.queue,
                      mem=30000,
                      time="4:0:0",
                      gpu=1)
        jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    sleep_time=60)

    #######################################################
    # collect output

    collect_table_multi("sed_gene.txt", options.out_dir, options.processes,
                        options.log_pseudo)
    if options.tss_table:
        collect_table("sed_tss.txt", options.out_dir, options.processes)

    if options.track_indexes is not None:
        if not os.path.isdir("%s/tracks" % options.out_dir):
            os.mkdir("%s/tracks" % options.out_dir)

        for track_file in glob.glob("%s/job*/tracks/*" % options.out_dir):
            track_base = os.path.split(track_file)[1]
            os.rename(track_file,
                      "%s/tracks/%s" % (options.out_dir, track_base))

    for pi in range(options.processes):
        shutil.rmtree("%s/job%d" % (options.out_dir, pi))
Ejemplo n.º 22
0
def main():
    usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option('-a',
                      '--alt',
                      dest='alternative',
                      default='two-sided',
                      help='Statistical test alternative [Default: %default]')
    parser.add_option('-d',
                      dest='dataset_i',
                      default=None,
                      type='int',
                      help='Dataset index [Default:%default]')
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2-gpu',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('--name',
                      dest='name',
                      default='test',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('-q', dest='queue', default='gtx1080ti')
    parser.add_option('-r',
                      dest='ref_dir',
                      default=None,
                      help='Reference directory for statistical tests')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option('--spec',
                      dest='specificity',
                      default=False,
                      action='store_true',
                      help='Test specificity [Default: %default]')
    parser.add_option('--train',
                      dest='train',
                      default=False,
                      action='store_true',
                      help='Test on the training set, too [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide experiment directory, parameters file, and data directory')
    else:
        exp_dir = args[0]
        params_file = args[1]
        data_dirs = [os.path.abspath(arg) for arg in args[2:]]

    if options.dataset_i is None:
        head_i = 0
    else:
        head_i = options.dataset_i

    iterations = len(glob.glob('%s/*' % exp_dir))

    ################################################################
    # test check
    ################################################################
    jobs = []

    if options.train:
        for i in range(iterations):
            it_dir = '%s/%d' % (exp_dir, i)

            if options.dataset_i is None:
                out_dir = '%s/test_train' % it_dir
                model_file = '%s/train/model_check.h5' % it_dir
                data_dir = data_dirs[0]
            else:
                out_dir = '%s/test%d_train' % (it_dir, options.dataset_i)
                model_file = '%s/train/model%d_check.h5' % (it_dir,
                                                            options.dataset_i)
                data_dir = data_dirs[options.dataset_i]

            # check if done
            acc_file = '%s/acc.txt' % out_dir
            if os.path.isfile(acc_file):
                print('%s already generated.' % acc_file)
            else:
                cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                cmd += ' conda activate %s;' % options.conda_env
                cmd += ' basenji_test.py'
                cmd += ' --head %d' % head_i
                cmd += ' -o %s' % out_dir
                if options.rc:
                    cmd += ' --rc'
                if options.shifts:
                    cmd += ' --shifts %s' % options.shifts
                cmd += ' --split train'
                cmd += ' %s' % params_file
                cmd += ' %s' % model_file
                cmd += ' %s' % data_dir

                name = '%s-testtr%d' % (options.name, i)
                j = slurm.Job(cmd,
                              name=name,
                              out_file='%s.out' % out_dir,
                              err_file='%s.err' % out_dir,
                              queue=options.queue,
                              cpu=1,
                              gpu=1,
                              mem=23000,
                              time='4:00:00')
                jobs.append(j)

    ################################################################
    # test best
    ################################################################
    for i in range(iterations):
        it_dir = '%s/%d' % (exp_dir, i)

        if options.dataset_i is None:
            out_dir = '%s/test' % it_dir
            model_file = '%s/train/model_best.h5' % it_dir
            data_dir = data_dirs[0]
        else:
            out_dir = '%s/test%d' % (it_dir, options.dataset_i)
            model_file = '%s/train/model%d_best.h5' % (it_dir,
                                                       options.dataset_i)
            data_dir = data_dirs[options.dataset_i]

        # check if done
        acc_file = '%s/acc.txt' % out_dir
        if os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
        else:
            # basenji test
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env
            cmd += ' basenji_test.py'
            cmd += ' --head %d' % head_i
            cmd += ' -o %s' % out_dir
            if options.rc:
                cmd += ' --rc'
            if options.shifts:
                cmd += ' --shifts %s' % options.shifts
            cmd += ' %s' % params_file
            cmd += ' %s' % model_file
            cmd += ' %s' % data_dir

            name = '%s-test%d' % (options.name, i)
            j = slurm.Job(cmd,
                          name=name,
                          out_file='%s.out' % out_dir,
                          err_file='%s.err' % out_dir,
                          queue=options.queue,
                          cpu=1,
                          gpu=1,
                          mem=23000,
                          time='4:00:00')
            jobs.append(j)

    ################################################################
    # test best specificity
    ################################################################
    if options.specificity:
        for i in range(iterations):
            it_dir = '%s/%d' % (exp_dir, i)

            if options.dataset_i is None:
                out_dir = '%s/test_spec' % it_dir
                model_file = '%s/train/model_best.h5' % it_dir
                data_dir = data_dirs[0]
            else:
                out_dir = '%s/test%d_spec' % (it_dir, options.dataset_i)
                model_file = '%s/train/model%d_best.h5' % (it_dir,
                                                            options.dataset_i)
                data_dir = data_dirs[options.dataset_i]

            # check if done
            acc_file = '%s/acc.txt' % out_dir
            if os.path.isfile(acc_file):
                print('%s already generated.' % acc_file)
            else:
                # basenji test
                cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                cmd += ' conda activate %s;' % options.conda_env
                cmd += ' basenji_test_specificity.py'
                cmd += ' --head %d' % head_i
                cmd += ' -o %s' % out_dir
                if options.rc:
                    cmd += ' --rc'
                if options.shifts:
                    cmd += ' --shifts %s' % options.shifts
                cmd += ' %s' % params_file
                cmd += ' %s' % model_file
                cmd += ' %s' % data_dir

                name = '%s-spec%d' % (options.name, i)
                j = slurm.Job(cmd,
                              name=name,
                              out_file='%s.out' % out_dir,
                              err_file='%s.err' % out_dir,
                              queue=options.queue,
                              cpu=1,
                              gpu=1,
                              mem=75000,
                              time='6:00:00')
                jobs.append(j)

    slurm.multi_run(jobs, verbose=True)

    if options.ref_dir is not None:
        ################################################################
        # compare checkpoint on training set
        ################################################################
        if options.train:
            ref_glob_str = '%s/*/test_train/acc.txt' % options.ref_dir
            ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str)

            exp_glob_str = '%s/*/test_train/acc.txt' % exp_dir
            exp_cors, exp_mean, exp_stdm = read_cors(exp_glob_str)

            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

            print('\nTrain:')
            print('Reference  PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm))
            print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)

        ################################################################
        # compare best on test set
        ################################################################
        ref_glob_str = '%s/*/test/acc.txt' % options.ref_dir
        ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str)

        exp_glob_str = '%s/*/test/acc.txt' % exp_dir
        exp_cors, exp_mean, exp_stdm = read_cors(exp_glob_str)

        mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

        print('\nTest:')
        print('Reference  PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm))
        print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm))
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)

        ################################################################
        # compare best on test set specificity
        ################################################################
        if options.specificity:
            ref_glob_str = '%s/*/test_spec/acc.txt' % options.ref_dir
            ref_cors, ref_mean, ref_stdm = read_cors(ref_glob_str)

            exp_glob_str = '%s/*/test_spec/acc.txt' % exp_dir
            exp_cors, exp_mean, exp_stdm = read_cors(exp_glob_str)

            mwp, tp = stat_tests(ref_cors, exp_cors, options.alternative)

            print('\nSpecificity:')
            print('Reference  PearsonR: %.4f (%.4f)' % (ref_mean, ref_stdm))
            print('Experiment PearsonR: %.4f (%.4f)' % (exp_mean, exp_stdm))
            print('Mann-Whitney U p-value: %.3g' % mwp)
            print('T-test p-value: %.3g' % tp)
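# read_cors and stat_tests are defined elsewhere in this script. A hedged sketch of what
# they might do, assuming each acc.txt is the tab-delimited table written by basenji_test.py
# (with a 'pearsonr' column) and that the reported tests are the Mann-Whitney U and t-test:
import glob

import numpy as np
import pandas as pd
from scipy import stats

def read_cors_sketch(acc_glob_str):
    cors = []
    for acc_file in glob.glob(acc_glob_str):
        acc_df = pd.read_csv(acc_file, sep='\t', index_col=0)
        cors.append(acc_df.pearsonr.mean())
    cors = np.array(cors)
    stdm = cors.std() / np.sqrt(len(cors))
    return cors, cors.mean(), stdm

def stat_tests_sketch(cors1, cors2, alternative):
    _, mwp = stats.mannwhitneyu(cors1, cors2, alternative=alternative)
    _, tp = stats.ttest_ind(cors1, cors2)
    return mwp, tp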
Ejemplo n.º 23
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file>'
    parser = OptionParser(usage)

    # sad
    parser.add_option('-f',
                      dest='genome_fasta',
                      default='%s/data/hg38.fa' % os.environ['BASENJIDIR'],
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option('--local',
                      dest='local',
                      default=1024,
                      type='int',
                      help='Local SAD score [Default: %default]')
    parser.add_option('-n',
                      dest='norm_file',
                      default=None,
                      help='Normalize SAD scores')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sad_gtex',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option('--pseudo',
                      dest='log_pseudo',
                      default=1,
                      type='float',
                      help='Log2 pseudocount [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      type='str',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '--stats',
        dest='sad_stats',
        default='SAD',
        help='Comma-separated list of stats to save. [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        default=None,
        type='str',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '--threads',
        dest='threads',
        default=False,
        action='store_true',
        help='Run CPU math and output in a separate thread [Default: %default]'
    )
    parser.add_option(
        '-u',
        dest='penultimate',
        default=False,
        action='store_true',
        help='Compute SED in the penultimate layer [Default: %default]')

    # multi
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2.2-gpu',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('-g',
                      dest='gtex_vcf_dir',
                      default='/home/drk/seqnn/data/gtex_fine/susie_pip90')
    parser.add_option('--name',
                      dest='name',
                      default='gtex',
                      help='SLURM name prefix [Default: %default]')
    parser.add_option('--max_proc',
                      dest='max_proc',
                      default=None,
                      type='int',
                      help='Maximum concurrent processes [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script. '
                           '(Unused, but needs to appear as dummy.)')
    parser.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide parameters and model files')
    else:
        params_file = args[0]
        model_file = args[1]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # predict

    cmd_base = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
    cmd_base += ' conda activate %s;' % options.conda_env
    cmd_base += ' basenji_sad.py %s %s %s' % (options_pkl_file, params_file,
                                              model_file)

    jobs = []
    for gtex_pos_vcf in glob.glob('%s/*_pos.vcf' % options.gtex_vcf_dir):
        # positive job
        job_base = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0]
        out_dir = '%s/%s' % (options.out_dir, job_base)
        if not options.restart or not os.path.isfile('%s/sad.h5' % out_dir):
            cmd = '%s -o %s %s' % (cmd_base, out_dir, gtex_pos_vcf)
            name = '%s_%s' % (options.name, job_base)
            j = slurm.Job(cmd,
                          name,
                          '%s.out' % out_dir,
                          '%s.err' % out_dir,
                          queue=options.queue,
                          gpu=1,
                          mem=22000,
                          time='1-0:0:0')
            jobs.append(j)

        # negative job
        gtex_neg_vcf = gtex_pos_vcf.replace('_pos.', '_neg.')
        job_base = os.path.splitext(os.path.split(gtex_neg_vcf)[1])[0]
        out_dir = '%s/%s' % (options.out_dir, job_base)
        if not options.restart or not os.path.isfile('%s/sad.h5' % out_dir):
            cmd = '%s -o %s %s' % (cmd_base, out_dir, gtex_neg_vcf)
            name = '%s_%s' % (options.name, job_base)
            j = slurm.Job(cmd,
                          name,
                          '%s.out' % out_dir,
                          '%s.err' % out_dir,
                          queue=options.queue,
                          gpu=1,
                          mem=22000,
                          time='1-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.max_proc,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # classify

    cmd_base = 'basenji_bench_classify.py -i 100 -p 2 -r 44 -s'

    jobs = []
    for gtex_pos_vcf in glob.glob('%s/*_pos.vcf' % options.gtex_vcf_dir):
        tissue = os.path.splitext(os.path.split(gtex_pos_vcf)[1])[0][:-4]
        sad_pos = '%s/%s_pos/sad.h5' % (options.out_dir, tissue)
        sad_neg = '%s/%s_neg/sad.h5' % (options.out_dir, tissue)
        out_dir = '%s/%s_class' % (options.out_dir, tissue)
        if not options.restart or not os.path.isfile('%s/stats.txt' % out_dir):
            cmd = '%s -o %s %s %s' % (cmd_base, out_dir, sad_pos, sad_neg)
            j = slurm.Job(cmd,
                          tissue,
                          '%s.out' % out_dir,
                          '%s.err' % out_dir,
                          queue='standard',
                          cpu=2,
                          mem=22000,
                          time='1-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs, verbose=True)
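# basenji_bench_classify.py is treated as a black box above. A rough, hypothetical sketch
# of the kind of benchmark it runs on each (positive, negative) sad.h5 pair: use the
# per-variant scores as features and measure how well causal variants separate from
# matched negatives. The 'SAD' dataset name, classifier, and cross-validation scheme here
# are assumptions, not the script's actual internals.
import h5py
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def classify_sketch(sad_pos_h5, sad_neg_h5, seed=44):
    with h5py.File(sad_pos_h5, 'r') as h5_pos, h5py.File(sad_neg_h5, 'r') as h5_neg:
        x_pos = np.abs(h5_pos['SAD'][:])
        x_neg = np.abs(h5_neg['SAD'][:])
    x = np.concatenate([x_pos, x_neg], axis=0)
    y = np.concatenate([np.ones(len(x_pos)), np.zeros(len(x_neg))])
    model = RandomForestClassifier(n_estimators=100, random_state=seed)
    aurocs = cross_val_score(model, x, y, cv=8, scoring='roc_auc')
    return aurocs.mean()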
Ejemplo n.º 24
0
def main():
    usage = "usage: %prog [options] <params_file> <model_file> <vcf_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-b",
        dest="batch_size",
        default=256,
        type="int",
        help="Batch size [Default: %default]",
    )
    parser.add_option(
        "-c",
        dest="csv",
        default=False,
        action="store_true",
        help="Print table as CSV [Default: %default]",
    )
    parser.add_option(
        "-f",
        dest="genome_fasta",
        default="%s/data/hg19.fa" % os.environ["BASENJIDIR"],
        help="Genome FASTA for sequences [Default: %default]",
    )
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/data/human.hg19.genome" % os.environ["BASENJIDIR"],
        help="Chromosome lengths file [Default: %default]",
    )
    parser.add_option(
        "--h5",
        dest="out_h5",
        default=False,
        action="store_true",
        help="Output stats to sad.h5 [Default: %default]",
    )
    parser.add_option(
        "--local",
        dest="local",
        default=1024,
        type="int",
        help="Local SAD score [Default: %default]",
    )
    parser.add_option("-n",
                      dest="norm_file",
                      default=None,
                      help="Normalize SAD scores")
    parser.add_option(
        "-o",
        dest="out_dir",
        default="sad",
        help="Output directory for tables and plots [Default: %default]",
    )
    parser.add_option(
        "-p",
        dest="processes",
        default=None,
        type="int",
        help="Number of processes, passed by multi script",
    )
    parser.add_option(
        "--pseudo",
        dest="log_pseudo",
        default=1,
        type="float",
        help="Log2 pseudocount [Default: %default]",
    )
    parser.add_option(
        "-q",
        dest="queue",
        default="k80",
        help="SLURM queue on which to run the jobs [Default: %default]",
    )
    parser.add_option(
        "-r",
        dest="restart",
        default=False,
        action="store_true",
        help="Restart a partially completed job [Default: %default]",
    )
    parser.add_option(
        "--rc",
        dest="rc",
        default=False,
        action="store_true",
        help=
        "Average forward and reverse complement predictions [Default: %default]",
    )
    parser.add_option(
        "--shifts",
        dest="shifts",
        default="0",
        type="str",
        help="Ensemble prediction shifts [Default: %default]",
    )
    parser.add_option(
        "--stats",
        dest="sad_stats",
        default="SAD,xSAR",
        help="Comma-separated list of stats to save. [Default: %default]",
    )
    parser.add_option(
        "-t",
        dest="targets_file",
        default=None,
        type="str",
        help="File specifying target indexes and labels in table format",
    )
    parser.add_option(
        "--ti",
        dest="track_indexes",
        default=None,
        type="str",
        help="Comma-separated list of target indexes to output BigWig tracks",
    )
    parser.add_option(
        "-u",
        dest="penultimate",
        default=False,
        action="store_true",
        help="Compute SED in the penultimate layer [Default: %default]",
    )
    parser.add_option(
        "-z",
        dest="out_zarr",
        default=False,
        action="store_true",
        help="Output stats to sad.zarr [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error("Must provide parameters and model files and VCF file")
    else:
        params_file = args[0]
        model_file = args[1]
        vcf_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print("Please remove %s" % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = "%s/options.pkl" % options.out_dir
    options_pkl = open(options_pkl_file, "wb")
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = "source activate tf1.12-gpu; basenji_sadf.py %s %s %d" % (
                options_pkl_file,
                " ".join(args),
                pi,
            )
            name = "sad_p%d" % pi
            outf = "%s/job%d.out" % (options.out_dir, pi)
            errf = "%s/job%d.err" % (options.out_dir, pi)
            j = slurm.Job(
                cmd,
                name,
                outf,
                errf,
                queue=options.queue,
                gpu=1,
                mem=15000,
                time="7-0:0:0",
            )
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    if options.out_h5:
        collect_h5("sad.h5", options.out_dir, options.processes)

    elif options.out_zarr:
        collect_zarr("sad.zarr", options.out_dir, options.processes)

    else:
        collect_table("sad_table.txt", options.out_dir, options.processes)
Ejemplo n.º 25
0
def main():
    usage = 'usage: %prog [options] <params_file> <seed_model> <data_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-e',
        dest='num_epochs',
        default=4,
        type='int',
        help='Number of epochs to train models [Default: %default]')
    parser.add_option('-n',
                      dest='num_models',
                      default=3,
                      type='int',
                      help='Number of models to train [Default: %default]')
    parser.add_option(
        '-o',
        dest='out_dir',
        default='seqnn_avg',
        help='Output directory in which to train [Default: %default]')
    parser.add_option(
        '-s',
        dest='num_steps',
        default=None,
        type='int',
        help='Number of steps to train models [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide parameters, seed model, and data')
    else:
        params_file = args[0]
        seed_model = args[1]
        data_file = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    jobs = []

    for mi in range(options.num_models):
        model_dir = '%s/m%d' % (options.out_dir, mi)

        cmd = 'source activate py3_gpu;'
        cmd += ' basenji_train.py'
        cmd += ' --rc --shifts "3,2,1,0,-1,-2,-3"'
        cmd += ' --logdir %s' % model_dir
        cmd += ' --check_all'
        cmd += ' --num_train_epochs %d' % options.num_epochs
        cmd += ' --restart %s' % seed_model
        cmd += ' --params %s' % params_file
        cmd += ' --data %s' % data_file

        j = slurm.Job(cmd,
                      name=model_dir,
                      out_file='%s.out' % model_dir,
                      err_file='%s.err' % model_dir,
                      queue='gtx1080ti',
                      gpu=1,
                      cpu=1,
                      time='4-0:0:0',
                      mem=30000)

        jobs.append(j)

    slurm.multi_run(jobs, verbose=True)
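# The models trained above are intended for averaging (hence the 'seqnn_avg' output
# directory). A generic sketch of that averaging at prediction time, assuming each trained
# model exposes a predict() method returning arrays of the same shape (illustrative only):
import numpy as np

def ensemble_predict(models, x):
    # average the per-model predictions elementwise
    preds = [model.predict(x) for model in models]
    return np.mean(preds, axis=0)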
Ejemplo n.º 26
0
def main():
    usage = 'usage: %prog [options] <exp_dir> <params_file> <data_dir> <bed_file>'
    parser = OptionParser(usage)

    # sat options
    sat_options = OptionGroup(parser, 'basenji_sat_bed.py options')
    sat_options.add_option(
        '-d',
        dest='mut_down',
        default=0,
        type='int',
        help=
        'Nucleotides downstream of center sequence to mutate [Default: %default]'
    )
    sat_options.add_option(
        '-f',
        dest='genome_fasta',
        default=None,
        help='Genome FASTA for sequences [Default: %default]')
    sat_options.add_option(
        '-l',
        dest='mut_len',
        default=0,
        type='int',
        help='Length of center sequence to mutate [Default: %default]')
    sat_options.add_option('-o',
                           dest='out_dir',
                           default='sat_mut',
                           help='Output directory [Default: %default]')
    sat_options.add_option('--plots',
                           dest='plots',
                           default=False,
                           action='store_true',
                           help='Make heatmap plots [Default: %default]')
    sat_options.add_option('-p',
                           dest='processes',
                           default=None,
                           type='int',
                           help='Number of processes, passed by multi script')
    sat_options.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    sat_options.add_option(
        '--shifts',
        dest='shifts',
        default='0',
        help='Ensemble prediction shifts [Default: %default]')
    sat_options.add_option(
        '--stats',
        dest='sad_stats',
        default='sum',
        help='Comma-separated list of stats to save. [Default: %default]')
    sat_options.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    sat_options.add_option(
        '-u',
        dest='mut_up',
        default=0,
        type='int',
        help=
        'Nucleotides upstream of center sequence to mutate [Default: %default]'
    )
    parser.add_option_group(sat_options)

    phylop_options = OptionGroup(parser, 'basenji_bench_phylop.py options')
    # phylop_options.add_option('-e', dest='num_estimators',
    #   default=100, type='int',
    #   help='Number of random forest estimators [Default: %default]')
    phylop_options.add_option(
        '-g',
        dest='genome',
        default='ce11',
        help='PhyloP and FASTA genome [Default: %default]')
    # phylop_options.add_option('--pca', dest='n_components',
    #   default=None, type='int',
    #   help='PCA n_components [Default: %default]')
    parser.add_option_group(phylop_options)

    fold_options = OptionGroup(parser, 'cross-fold options')
    fold_options.add_option(
        '-a',
        '--alt',
        dest='alternative',
        default='two-sided',
        help='Statistical test alternative [Default: %default]')
    fold_options.add_option(
        '-c',
        dest='crosses',
        default=1,
        type='int',
        help='Number of cross-fold rounds [Default:%default]')
    fold_options.add_option('-e',
                            dest='conda_env',
                            default='tf2-gpu',
                            help='Anaconda environment [Default: %default]')
    fold_options.add_option('--label_exp',
                            dest='label_exp',
                            default='Experiment',
                            help='Experiment label [Default: %default]')
    fold_options.add_option('--label_ref',
                            dest='label_ref',
                            default='Reference',
                            help='Reference label [Default: %default]')
    fold_options.add_option('--name',
                            dest='name',
                            default='sat',
                            help='SLURM name prefix [Default: %default]')
    fold_options.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    fold_options.add_option('-r',
                            dest='ref_dir',
                            default=None,
                            help='Reference directory for statistical tests')
    parser.add_option_group(fold_options)

    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error('Must provide experiment directory, parameters file, data directory, and BED file')
    else:
        exp_dir = args[0]
        params_file = args[1]
        data_dir = args[2]
        bed_file = args[3]

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # count folds
    num_folds = len([dkey for dkey in data_stats if dkey.startswith('fold')])

    # genome
    genome_path = os.environ[options.genome.upper()]
    options.genome_fasta = '%s/assembly/%s.fa' % (genome_path, options.genome)

    ################################################################
    # saturation mutagenesis
    ################################################################
    jobs = []
    scores_files = []

    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)

            # update output directory
            sat_dir = '%s/%s' % (it_dir, options.out_dir)

            # check if done
            scores_file = '%s/scores.h5' % sat_dir
            scores_files.append(scores_file)
            if os.path.isfile(scores_file):
                print('%s already generated.' % scores_file)
            else:
                basenji_cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
                basenji_cmd += ' conda activate %s;' % options.conda_env
                basenji_cmd += ' echo $HOSTNAME;'

                basenji_cmd += ' basenji_sat_bed.py'
                basenji_cmd += ' %s' % options_string(options, sat_options,
                                                      sat_dir)
                basenji_cmd += ' %s' % params_file
                basenji_cmd += ' %s/train/model_best.h5' % it_dir
                basenji_cmd += ' %s' % bed_file

                name = '%s-f%dc%d' % (options.name, fi, ci)
                basenji_job = slurm.Job(basenji_cmd,
                                        name,
                                        out_file='%s.out' % sat_dir,
                                        err_file='%s.err' % sat_dir,
                                        cpu=2,
                                        gpu=1,
                                        queue=options.queue,
                                        mem=30000,
                                        time='7-0:00:00')
                jobs.append(basenji_job)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # ensemble
    ################################################################
    ensemble_dir = '%s/ensemble' % exp_dir
    if not os.path.isdir(ensemble_dir):
        os.mkdir(ensemble_dir)

    sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
    if not os.path.isdir(sat_dir):
        os.mkdir(sat_dir)

    if not os.path.isfile('%s/scores.h5' % sat_dir):
        print('Generating ensemble scores.')
        ensemble_scores_h5(sat_dir, scores_files)
    else:
        print('Ensemble scores already generated.')

    ################################################################
    # PhyloP regressors
    ################################################################
    # num_pcs = int(data_stats['num_targets']**0.75)

    jobs = []
    for ci in range(options.crosses):
        for fi in range(num_folds):
            it_dir = '%s/f%d_c%d' % (exp_dir, fi, ci)
            sat_dir = '%s/%s' % (it_dir, options.out_dir)

            if not os.path.isfile('%s/stats.txt' % sat_dir):
                phylop_cmd = 'basenji_bench_phylop.py'
                phylop_cmd += ' -e 200 -p 4'
                # phylop_cmd += ' -d %d' % num_pcs
                phylop_cmd += ' -o %s' % sat_dir
                phylop_cmd += ' %s/scores.h5' % sat_dir

                name = '%s-f%dc%d' % (options.name, fi, ci)
                std_pre = '%s/phylop' % sat_dir
                j = slurm.Job(phylop_cmd,
                              name,
                              '%s.out' % std_pre,
                              '%s.err' % std_pre,
                              queue='standard',
                              cpu=4,
                              mem=45000,
                              time='1-0:0:0')
                jobs.append(j)

    # ensemble
    sat_dir = '%s/%s' % (ensemble_dir, options.out_dir)
    if not os.path.isfile('%s/stats.txt' % sat_dir):
        phylop_cmd = 'basenji_bench_phylop.py'
        phylop_cmd += ' -e 200 -p 4'
        # phylop_cmd += ' -d %d' % num_pcs
        phylop_cmd += ' -o %s' % sat_dir
        phylop_cmd += ' %s/scores.h5' % sat_dir

        name = '%s-ens' % options.name
        std_pre = '%s/phylop' % sat_dir
        j = slurm.Job(phylop_cmd,
                      name,
                      '%s.out' % std_pre,
                      '%s.err' % std_pre,
                      queue='standard',
                      cpu=4,
                      mem=45000,
                      time='1-0:0:0')
        jobs.append(j)

    slurm.multi_run(jobs, verbose=True)

    ################################################################
    # compare
    ################################################################

    ref_sat_dirs = []
    exp_sat_dirs = []
    for ci in range(options.crosses):
        for fi in range(num_folds):
            exp_sat_dir = '%s/f%d_c%d/%s' % (exp_dir, fi, ci, options.out_dir)
            exp_sat_dirs.append(exp_sat_dir)
            if options.ref_dir is not None:
                ref_sat_dir = '%s/f%d_c%d/%s' % (options.ref_dir, fi, ci,
                                                 options.out_dir)
                ref_sat_dirs.append(ref_sat_dir)

    exp_pcor_folds, exp_r2_folds = read_metrics(exp_sat_dirs)
    exp_sat_dirs = ['%s/ensemble/%s' % (exp_dir, options.out_dir)]
    exp_pcor_ens, exp_r2_ens = read_metrics(exp_sat_dirs)
    if options.ref_dir is not None:
        ref_pcor_folds, ref_r2_folds = read_metrics(ref_sat_dirs)
        ref_sat_dirs = ['%s/ensemble/%s' % (options.ref_dir, options.out_dir)]
        ref_pcor_ens, ref_r2_ens = read_metrics(ref_sat_dirs)

    print('PearsonR')
    exp_mean = exp_pcor_folds.mean()
    exp_stdm = exp_pcor_folds.std() / np.sqrt(len(exp_pcor_folds))
    expe_mean = exp_pcor_ens.mean()
    expe_stdm = exp_pcor_ens.std() / np.sqrt(len(exp_pcor_ens))
    print('%12s:       %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
    print('%12s (ens): %.4f (%.4f)' %
          (options.label_exp, expe_mean, expe_stdm))
    if options.ref_dir is not None:
        ref_mean = ref_pcor_folds.mean()
        ref_stdm = ref_pcor_folds.std() / np.sqrt(len(ref_pcor_folds))
        refe_mean = ref_pcor_ens.mean()
        refe_stdm = ref_pcor_ens.std() / np.sqrt(len(ref_pcor_ens))
        print('%12s:       %.4f (%.4f)' %
              (options.label_ref, ref_mean, ref_stdm))
        print('%12s (ens): %.4f (%.4f)' %
              (options.label_ref, refe_mean, refe_stdm))

        mwp, tp = stat_tests(exp_pcor_folds, ref_pcor_folds,
                             options.alternative)
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)

    print('\nR2')
    exp_mean = exp_r2_folds.mean()
    exp_stdm = exp_r2_folds.std() / np.sqrt(len(exp_r2_folds))
    expe_mean = exp_r2_ens.mean()
    expe_stdm = exp_r2_ens.std() / np.sqrt(len(exp_r2_ens))
    print('%12s:       %.4f (%.4f)' % (options.label_exp, exp_mean, exp_stdm))
    print('%12s (ens): %.4f (%.4f)' %
          (options.label_exp, expe_mean, expe_stdm))
    if options.ref_dir is not None:
        ref_mean = ref_r2_folds.mean()
        ref_stdm = ref_r2_folds.std() / np.sqrt(len(ref_r2_folds))
        refe_mean = ref_r2_ens.mean()
        refe_stdm = ref_r2_ens.std() / np.sqrt(len(ref_r2_ens))
        print('%12s:       %.4f (%.4f)' %
              (options.label_ref, ref_mean, ref_stdm))
        print('%12s (ens): %.4f (%.4f)' %
              (options.label_ref, refe_mean, refe_stdm))

        mwp, tp = stat_tests(exp_r2_folds, ref_r2_folds, options.alternative)
        print('Mann-Whitney U p-value: %.3g' % mwp)
        print('T-test p-value: %.3g' % tp)
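# options_string is used above to turn one OptionGroup's parsed values back into
# command-line flags for the worker command, overriding the output directory per fold.
# A hedged sketch with names inferred from the call sites (not the actual helper):
def options_string_sketch(options, group, group_out_dir):
    flags = []
    for opt in group.option_list:
        opt_str = opt.get_opt_string()          # e.g. '-o' or '--rc'
        opt_value = getattr(options, opt.dest)

        # redirect output into the per-fold directory
        if opt.dest == 'out_dir':
            opt_value = group_out_dir

        if opt.action == 'store_true':
            if opt_value:
                flags.append(opt_str)
        elif opt_value is not None:
            flags.append('%s %s' % (opt_str, opt_value))

    return ' '.join(flags)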
Ejemplo n.º 27
0
def main():
    usage = 'usage: %prog [options] <params_file> <data_dir>'
    parser = OptionParser(usage)

    # train
    train_options = OptionGroup(parser, 'basenji_train.py options')
    train_options.add_option(
        '-k',
        dest='keras_fit',
        default=False,
        action='store_true',
        help='Train with Keras fit method [Default: %default]')
    train_options.add_option(
        '-o',
        dest='out_dir',
        default='train_out',
        help='Output directory for test statistics [Default: %default]')
    train_options.add_option(
        '--restore',
        dest='restore',
        help='Restore model and continue training [Default: %default]')
    train_options.add_option(
        '--trunk',
        dest='trunk',
        default=False,
        action='store_true',
        help='Restore only model trunk [Default: %default]')
    train_options.add_option(
        '--tfr_train',
        dest='tfr_train_pattern',
        default='train-*.tfr',
        help=
        'Training TFRecord pattern string appended to data_dir [Default: %default]'
    )
    train_options.add_option(
        '--tfr_eval',
        dest='tfr_eval_pattern',
        default='valid-*.tfr',
        help=
        'Evaluation TFRecord pattern string appended to data_dir [Default: %default]'
    )
    parser.add_option_group(train_options)

    # test
    test_options = OptionGroup(parser, 'basenji_test.py options')
    test_options.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average forward and reverse complement predictions [Default: %default]'
    )
    test_options.add_option(
        '--shifts',
        dest='shifts',
        default='0',
        type='str',
        help='Ensemble prediction shifts [Default: %default]')
    parser.add_option_group(test_options)

    # multi
    rep_options = OptionGroup(parser, 'replication options')
    rep_options.add_option('-e',
                           dest='conda_env',
                           default='tf2-gpu',
                           help='Anaconda environment [Default: %default]')
    rep_options.add_option('--name',
                           dest='name',
                           default='reps',
                           help='SLURM name prefix [Default: %default]')
    rep_options.add_option('-p',
                           dest='processes',
                           default=None,
                           type='int',
                           help='Number of processes, passed by multi script')
    rep_options.add_option(
        '-q',
        dest='queue',
        default='gtx1080ti',
        help='SLURM queue on which to run the jobs [Default: %default]')
    rep_options.add_option('-r',
                           dest='restart',
                           default=False,
                           action='store_true')
    parser.add_option_group(rep_options)

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide parameters and data directory.')
    else:
        params_file = os.path.abspath(args[0])
        data_dir = os.path.abspath(args[1])

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_train = params['train']

    #######################################################
    # prep work

    if not options.restart and os.path.isdir(options.out_dir):
        print('Output directory %s exists. Please remove.' % options.out_dir)
        exit(1)
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #######################################################
    # train

    jobs = []
    for pi in range(options.processes):
        rep_dir = '%s/%d' % (options.out_dir, pi)
        if options.restart and os.path.isdir(rep_dir):
            print('%s found and skipped.' % rep_dir)
        else:
            os.mkdir(rep_dir)

            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env
            cmd += ' echo $HOSTNAME;'

            cmd += ' basenji_train.py'
            cmd += ' %s' % options_string(options, train_options,
                                          '%s/train' % rep_dir)
            cmd += ' %s %s' % (params_file, data_dir)

            name = '%s-train%d' % (options.name, pi)
            sbf = os.path.abspath('%s/train.sb' % rep_dir)
            outf = os.path.abspath('%s/train.out' % rep_dir)
            errf = os.path.abspath('%s/train.err' % rep_dir)

            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          sbf,
                          queue=options.queue,
                          gpu=params_train.get('num_gpu', 1),
                          mem=23000,
                          time='28-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # test train

    jobs = []
    for pi in range(options.processes):
        rep_dir = '%s/%d' % (options.out_dir, pi)
        test_dir = '%s/test_train' % rep_dir

        # check if done
        acc_file = '%s/acc.txt' % test_dir
        if options.restart and os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
        else:
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env
            cmd += ' echo $HOSTNAME;'

            cmd += ' basenji_test.py'
            if options.rc:
                cmd += ' --rc'
            if options.shifts:
                cmd += ' --shifts %s' % options.shifts
            cmd += ' -o %s' % test_dir
            cmd += ' --tfr "train-*.tfr"'
            cmd += ' %s %s/train/model_check.h5 %s' % (params_file, rep_dir,
                                                       data_dir)

            name = '%s-testtr%d' % (options.name, pi)
            sbf = os.path.abspath('%s/test_train.sb' % rep_dir)
            outf = os.path.abspath('%s/test_train.out' % rep_dir)
            errf = os.path.abspath('%s/test_train.err' % rep_dir)

            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          sbf,
                          queue=options.queue,
                          gpu=params_train.get('num_gpu', 1),
                          mem=23000,
                          time='4:0:0')
            jobs.append(j)

    #######################################################
    # test best

    for pi in range(options.processes):
        rep_dir = '%s/%d' % (options.out_dir, pi)
        test_dir = '%s/test' % rep_dir

        # check if done
        acc_file = '%s/acc.txt' % test_dir
        if options.restart and os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
        else:
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env
            cmd += ' echo $HOSTNAME;'

            cmd += ' basenji_test.py'
            if options.rc:
                cmd += ' --rc'
            if options.shifts:
                cmd += ' --shifts %s' % options.shifts

            cmd += ' -o %s' % test_dir
            cmd += ' %s %s/train/model_best.h5 %s' % (params_file, rep_dir,
                                                      data_dir)

            name = '%s-test%d' % (options.name, pi)
            sbf = os.path.abspath('%s/test.sb' % rep_dir)
            outf = os.path.abspath('%s/test.out' % rep_dir)
            errf = os.path.abspath('%s/test.err' % rep_dir)

            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          sbf,
                          queue=options.queue,
                          gpu=params_train.get('num_gpu', 1),
                          mem=23000,
                          time='4:0:0')
            jobs.append(j)

    #######################################################
    # test best specificity

    for pi in range(options.processes):
        rep_dir = '%s/%d' % (options.out_dir, pi)
        test_dir = '%s/test_spec' % rep_dir

        # check if done
        acc_file = '%s/acc.txt' % test_dir
        if options.restart and os.path.isfile(acc_file):
            print('%s already generated.' % acc_file)
        else:
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env
            cmd += ' echo $HOSTNAME;'

            cmd += ' basenji_test_specificity.py'
            if options.rc:
                cmd += ' --rc'
            if options.shifts:
                cmd += ' --shifts %s' % options.shifts
            cmd += ' -o %s' % test_dir
            cmd += ' %s %s/train/model_best.h5 %s' % (params_file, rep_dir,
                                                      data_dir)

            name = '%s-spec%d' % (options.name, pi)
            sbf = os.path.abspath('%s/test_spec.sb' % rep_dir)
            outf = os.path.abspath('%s/test_spec.out' % rep_dir)
            errf = os.path.abspath('%s/test_spec.err' % rep_dir)

            # sticking to one gpu because the normalization time dominates
            # better would be to save predictions above.
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          sbf,
                          queue=options.queue,
                          gpu=1,
                          mem=45000,
                          time='8:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)
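
# The options_string helper used above to forward the basenji_train.py options
# is not defined in this excerpt. The sketch below assumes it re-serializes
# every option in the given OptionGroup into command-line flags, substituting
# the replicate train directory for out_dir; the actual helper may format
# values differently.
def options_string(options, train_options, rep_out_dir):
    options_str = ''
    for opt in train_options.option_list:
        opt_str = opt.get_opt_string()           # e.g. '-o' or '--restore'
        opt_value = options.__dict__[opt.dest]

        # redirect training output into the replicate directory (assumption)
        if opt.dest == 'out_dir':
            opt_value = rep_out_dir

        if isinstance(opt_value, bool):
            if opt_value:
                options_str += ' %s' % opt_str   # store_true flags carry no value
        elif opt_value is not None:
            options_str += ' %s %s' % (opt_str, opt_value)
    return options_str.strip()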
Ejemplo n.º 28
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <genes_hdf5_file> <vcf_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='all_sed',
        default=False,
        action='store_true',
        help=
        'Print all variant-gene pairs, as opposed to only nonzero [Default: %default]'
    )
    parser.add_option('-b',
                      dest='batch_size',
                      default=None,
                      type='int',
                      help='Batch size [Default: %default]')
    parser.add_option('-c',
                      dest='csv',
                      default=False,
                      action='store_true',
                      help='Print table as CSV [Default: %default]')
    parser.add_option('-g',
                      dest='genome_file',
                      default='%s/assembly/human.hg19.genome' %
                      os.environ['HG19'],
                      help='Chromosome lengths file [Default: %default]')
    parser.add_option(
        '-i',
        dest='index_snp',
        default=False,
        action='store_true',
        help=
        'SNPs are labeled with their index SNP as column 6 [Default: %default]'
    )
    parser.add_option(
        '-o',
        dest='out_dir',
        default='sed',
        help='Output directory for tables and plots [Default: %default]')
    parser.add_option(
        '-p',
        dest='processes',
        default=2,
        type='int',
        help='Number of parallel processes to run [Default: %default]')
    parser.add_option(
        '-q',
        dest='queue',
        default='p100',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Average the forward and reverse complement predictions when testing [Default: %default]'
    )
    parser.add_option(
        '-s',
        dest='score',
        default=False,
        action='store_true',
        help='SNPs are labeled with scores as column 7 [Default: %default]')
    parser.add_option(
        '-t',
        dest='target_wigs_file',
        default=None,
        help='Store target values, extracted from this list of WIG files')
    parser.add_option(
        '--ti',
        dest='track_indexes',
        help='Comma-separated list of target indexes to output BigWig tracks')
    parser.add_option(
        '-x',
        dest='transcript_table',
        default=False,
        action='store_true',
        help='Print transcript table in addition to gene [Default: %default]')
    parser.add_option(
        '-w',
        dest='tss_width',
        default=1,
        type='int',
        help=
        'Width of bins considered to quantify TSS transcription [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 4:
        parser.error(
            'Must provide parameters and model files, genes HDF5 file, and QTL VCF file'
        )
    else:
        params_file = args[0]
        model_file = args[1]
        genes_hdf5_file = args[2]
        vcf_file = args[3]

    #######################################################
    # prep work

    # output directory
    if os.path.isdir(options.out_dir):
        shutil.rmtree(options.out_dir)
    os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        cmd = 'source activate py3_gpu; basenji_sed.py %s %s %d' % (
            options_pkl_file, ' '.join(args), pi)
        name = 'sed_p%d' % pi
        outf = '%s/job%d.out' % (options.out_dir, pi)
        errf = '%s/job%d.err' % (options.out_dir, pi)
        j = slurm.Job(cmd,
                      name,
                      outf,
                      errf,
                      queue=options.queue,
                      mem=16000,
                      time='4:0:0',
                      gpu=1)
        jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.processes,
                    verbose=True,
                    sleep_time=60)

    #######################################################
    # collect output

    collect_table_multi('sed_gene.txt', options.out_dir, options.processes)
    if options.transcript_table:
        collect_table('sed_tx.txt', options.out_dir, options.processes)

    if options.track_indexes is not None:
        if not os.path.isdir('%s/tracks' % options.out_dir):
            os.mkdir('%s/tracks' % options.out_dir)

        for track_file in glob.glob('%s/job*/tracks/*' % options.out_dir):
            track_base = os.path.split(track_file)[1]
            os.rename(track_file,
                      '%s/tracks/%s' % (options.out_dir, track_base))

    for pi in range(options.processes):
        shutil.rmtree('%s/job%d' % (options.out_dir, pi))
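
# collect_table_multi and collect_table, which merge the per-process SED tables
# above, are not defined in this excerpt. The sketch below covers the simple
# single-table case, assuming each worker writes out_dir/job<pi>/<file_name>
# and that only the first header line should be kept in the merged table.
import os

def collect_table(file_name, out_dir, num_procs):
    # promote the first job's table to the merged location
    os.rename('%s/job0/%s' % (out_dir, file_name), '%s/%s' % (out_dir, file_name))

    # append the remaining jobs' rows, skipping their repeated headers
    with open('%s/%s' % (out_dir, file_name), 'a') as collect_out:
        for pi in range(1, num_procs):
            with open('%s/job%d/%s' % (out_dir, pi, file_name)) as job_in:
                job_in.readline()  # header
                for line in job_in:
                    collect_out.write(line)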
Ejemplo n.º 29
0
def main():
    usage = 'usage: %prog [options] <fasta_file> <targets_file>'
    parser = OptionParser(usage)
    parser.add_option('-b',
                      dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option(
        '--break',
        dest='break_t',
        default=786432,
        type='int',
        help='Break in half contigs above length [Default: %default]')
    parser.add_option('-c',
                      '--crop',
                      dest='crop_bp',
                      default=0,
                      type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-d',
                      dest='sample_pct',
                      default=1.0,
                      type='float',
                      help='Down-sample the segments')
    parser.add_option('-f',
                      dest='folds',
                      default=None,
                      type='int',
                      help='Generate cross fold split [Default: %default]')
    parser.add_option('-g',
                      dest='gaps_file',
                      help='Genome assembly gaps BED [Default: %default]')
    parser.add_option('-i',
                      dest='interp_nan',
                      default=False,
                      action='store_true',
                      help='Interpolate NaNs [Default: %default]')
    parser.add_option('-l',
                      dest='seq_length',
                      default=131072,
                      type='int',
                      help='Sequence length [Default: %default]')
    parser.add_option(
        '--limit',
        dest='limit_bed',
        help='Limit to segments that overlap regions in a BED file')
    parser.add_option(
        '--local',
        dest='run_local',
        default=False,
        action='store_true',
        help='Run jobs locally as opposed to on SLURM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='data_out',
                      help='Output directory [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number parallel processes [Default: %default]')
    parser.add_option(
        '--peaks',
        dest='peaks_only',
        default=False,
        action='store_true',
        help='Create contigs only from peaks [Default: %default]')
    parser.add_option('-r',
                      dest='seqs_per_tfr',
                      default=256,
                      type='int',
                      help='Sequences per TFRecord file [Default: %default]')
    parser.add_option(
        '--restart',
        dest='restart',
        default=False,
        action='store_true',
        help='Continue progress from midpoint. [Default: %default]')
    parser.add_option('--seed',
                      dest='seed',
                      default=44,
                      type='int',
                      help='Random seed [Default: %default]')
    parser.add_option(
        '--snap',
        dest='snap',
        default=1,
        type='int',
        help='Snap sequences to multiple of the given value [Default: %default]'
    )
    parser.add_option('--st',
                      '--split_test',
                      dest='split_test',
                      default=False,
                      action='store_true',
                      help='Exit after split. [Default: %default]')
    parser.add_option(
        '--stride',
        '--stride_train',
        dest='stride_train',
        default=1.,
        type='float',
        help='Stride to advance train sequences [Default: seq_length]')
    parser.add_option(
        '--stride_test',
        dest='stride_test',
        default=1.,
        type='float',
        help='Stride to advance valid and test sequences [Default: seq_length]'
    )
    parser.add_option(
        '-t',
        dest='test_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for testing, or comma-separated test chromosome(s) [Default: %default]')
    parser.add_option('-u',
                      dest='umap_bed',
                      help='Unmappable regions in BED format')
    parser.add_option(
        '--umap_t',
        dest='umap_t',
        default=0.5,
        type='float',
        help=
        'Remove sequences with more than this proportion of unmappable bins [Default: %default]'
    )
    parser.add_option(
        '--umap_clip',
        dest='umap_clip',
        default=1,
        type='float',
        help=
        'Clip values at unmappable positions to distribution quantiles, eg 0.25. [Default: %default]'
    )
    parser.add_option(
        '--umap_tfr',
        dest='umap_tfr',
        default=False,
        action='store_true',
        help='Save umap array into TFRecords [Default: %default]')
    parser.add_option('-w',
                      dest='pool_width',
                      default=128,
                      type='int',
                      help='Sum pool width [Default: %default]')
    parser.add_option(
        '-v',
        dest='valid_pct_or_chr',
        default=0.05,
        type='str',
        help='Proportion of the data for validation, or comma-separated validation chromosome(s) [Default: %default]')
    parser.add_option('--norm',
                      dest='norm',
                      default='',
                      type='str',
                      help='Normalize coverage values')
    parser.add_option('--step',
                      dest='step',
                      default=0,
                      type='int',
                      help='Stride in bp [Default: pool_width]')
    parser.add_option('--padding',
                      dest='padding',
                      default='valid',
                      type='str',
                      help='Padding method for sliding window approach')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error(
            'Must provide a FASTA file and a targets file of sample coverage labels and paths.')
    else:
        fasta_file = args[0]
        targets_file = args[1]

    random.seed(options.seed)
    np.random.seed(options.seed)

    if options.break_t is not None and options.break_t < options.seq_length:
        print(
            'Maximum contig length --break cannot be less than sequence length.',
            file=sys.stderr)
        exit(1)

    # transform proportion strides to base pairs
    if options.stride_train <= 1:
        print('stride_train %.f' % options.stride_train, end='')
        options.stride_train = options.stride_train * options.seq_length
        print(' converted to %f' % options.stride_train)
    options.stride_train = int(np.round(options.stride_train))
    if options.stride_test <= 1:
        if options.folds is None:
            print('stride_test %.f' % options.stride_test, end='')
            options.stride_test = options.stride_test * options.seq_length
            print(' converted to %f' % options.stride_test)
    options.stride_test = int(np.round(options.stride_test))

    # check snap
    if options.snap is not None:
        if np.mod(options.seq_length, options.snap) != 0:
            raise ValueError('seq_length must be a multiple of snap')
        if np.mod(options.stride_train, options.snap) != 0:
            raise ValueError('stride_train must be a multiple of snap')
        if np.mod(options.stride_test, options.snap) != 0:
            raise ValueError('stride_test must be a multiple of snap')

    # setup output directory
    if os.path.isdir(options.out_dir) and not options.restart:
        print('Remove output directory %s or use --restart option.' %
              options.out_dir)
        exit(1)
    elif not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # read target datasets
    targets_df = pd.read_csv(targets_file, index_col=0, sep='\t')

    ################################################################
    # define genomic contigs
    ################################################################
    if not options.restart:
        chrom_contigs = genome.load_chromosomes(fasta_file)

        # remove gaps
        if options.gaps_file:
            chrom_contigs = genome.split_contigs(chrom_contigs,
                                                 options.gaps_file)

        # ditch the chromosomes for contigs
        contigs = []
        for chrom in chrom_contigs:
            if len(chrom.split('_')) == 1 and chrom != 'chrM':
                contigs += [
                    Contig(chrom, ctg_start, ctg_end)
                    for ctg_start, ctg_end in chrom_contigs[chrom]
                ]

        # limit to a BED file
        if options.limit_bed is not None:
            contigs = limit_contigs(contigs, options.limit_bed)

        # limit to peaks
        if options.peaks_only:
            peaks_bed = curate_peaks(targets_df, options.out_dir,
                                     options.pool_width, options.crop_bp)
            contigs = limit_contigs(contigs, peaks_bed)

        # filter for large enough
        contigs = [
            ctg for ctg in contigs if ctg.end - ctg.start >= options.seq_length
        ]

        # break up large contigs
        if options.break_t is not None:
            contigs = break_large_contigs(contigs, options.break_t)

        # print contigs to BED file
        # ctg_bed_file = '%s/contigs.bed' % options.out_dir
        # write_seqs_bed(ctg_bed_file, contigs)

    ################################################################
    # divide between train/valid/test
    ################################################################
    # label folds
    if options.folds is not None:
        fold_labels = ['fold%d' % fi for fi in range(options.folds)]
        num_folds = options.folds
    else:
        fold_labels = ['train', 'valid', 'test']
        num_folds = 3

    if not options.restart:
        if options.folds is not None:
            # divide by fold pct
            fold_contigs = divide_contigs_folds(contigs, options.folds)

        else:
            try:
                # convert to float pct
                valid_pct = float(options.valid_pct_or_chr)
                test_pct = float(options.test_pct_or_chr)
                assert (0 <= valid_pct <= 1)
                assert (0 <= test_pct <= 1)

                # divide by pct
                fold_contigs = divide_contigs_pct(contigs, test_pct, valid_pct)

            except (ValueError, AssertionError):
                # divide by chr
                valid_chrs = options.valid_pct_or_chr.split(',')
                test_chrs = options.test_pct_or_chr.split(',')
                fold_contigs = divide_contigs_chr(contigs, test_chrs,
                                                  valid_chrs)

        # rejoin broken contigs within set
        for fi in range(len(fold_contigs)):
            fold_contigs[fi] = rejoin_large_contigs(fold_contigs[fi])

        # write labeled contigs to BED file
        ctg_bed_file = '%s/contigs.bed' % options.out_dir
        ctg_bed_out = open(ctg_bed_file, 'w')
        for fi in range(len(fold_contigs)):
            for ctg in fold_contigs[fi]:
                line = '%s\t%d\t%d\t%s' % (ctg.chr, ctg.start, ctg.end,
                                           fold_labels[fi])
                print(line, file=ctg_bed_out)
        ctg_bed_out.close()

    if options.split_test:
        exit()

    ################################################################
    # define model sequences
    ################################################################
    if not options.restart:
        fold_mseqs = []
        for fi in range(num_folds):
            if fold_labels[fi] in ['valid', 'test']:
                stride_fold = options.stride_test
            else:
                stride_fold = options.stride_train

            # stride sequences across contig
            fold_mseqs_fi = contig_sequences(fold_contigs[fi],
                                             options.seq_length, stride_fold,
                                             options.snap, fold_labels[fi])
            fold_mseqs.append(fold_mseqs_fi)

            # shuffle
            random.shuffle(fold_mseqs[fi])

            # down-sample
            if options.sample_pct < 1.0:
                fold_mseqs[fi] = random.sample(
                    fold_mseqs[fi],
                    int(options.sample_pct * len(fold_mseqs[fi])))

        # merge into one list
        mseqs = [ms for fm in fold_mseqs for ms in fm]

    ################################################################
    # mappability
    ################################################################
    if not options.restart:
        if options.umap_bed is not None:
            if shutil.which('bedtools') is None:
                print('Install Bedtools to annotate unmappable sites',
                      file=sys.stderr)
                exit(1)

            # annotate unmappable positions
            mseqs_unmap = annotate_unmap(mseqs, options.umap_bed,
                                         options.seq_length,
                                         options.pool_width, options.crop_bp)

            # filter unmappable
            mseqs_map_mask = (mseqs_unmap.mean(axis=1, dtype='float64') <
                              options.umap_t)
            mseqs = [mseqs[i] for i in range(len(mseqs)) if mseqs_map_mask[i]]
            mseqs_unmap = mseqs_unmap[mseqs_map_mask, :]

            # write to file
            unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
            np.save(unmap_npy, mseqs_unmap)

        # write sequences to BED
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        write_seqs_bed(seqs_bed_file, mseqs, True)

    else:
        # read from directory
        seqs_bed_file = '%s/sequences.bed' % options.out_dir
        unmap_npy = '%s/mseqs_unmap.npy' % options.out_dir
        mseqs = []
        fold_mseqs = []
        for fi in range(num_folds):
            fold_mseqs.append([])
        for line in open(seqs_bed_file):
            a = line.split()
            msg = ModelSeq(a[0], int(a[1]), int(a[2]), a[3])
            mseqs.append(msg)
            if a[3] == 'train':
                fi = 0
            elif a[3] == 'valid':
                fi = 1
            elif a[3] == 'test':
                fi = 2
            else:
                fi = int(a[3].replace('fold', ''))
            fold_mseqs[fi].append(msg)

    ################################################################
    # read sequence coverage values
    ################################################################
    seqs_cov_dir = '%s/seqs_cov' % options.out_dir
    if not os.path.isdir(seqs_cov_dir):
        os.mkdir(seqs_cov_dir)

    read_jobs = []

    for ti in range(targets_df.shape[0]):
        genome_cov_file = targets_df['file'].iloc[ti]
        seqs_cov_stem = '%s/%d' % (seqs_cov_dir, ti)
        seqs_cov_file = '%s.h5' % seqs_cov_stem

        clip_ti = None
        if 'clip' in targets_df.columns:
            clip_ti = targets_df['clip'].iloc[ti]

        clipsoft_ti = None
        if 'clip_soft' in targets_df.columns:
            clipsoft_ti = targets_df['clip_soft'].iloc[ti]

        scale_ti = 1
        if 'scale' in targets_df.columns:
            scale_ti = targets_df['scale'].iloc[ti]

        if options.restart and os.path.isfile(seqs_cov_file):
            print('Skipping existing %s' % seqs_cov_file, file=sys.stderr)
        else:
            cmd = '/home/shush/profile/tfprofile/bin/basenji_data_read.py'
            cmd += ' --crop %d' % options.crop_bp
            cmd += ' -w %d' % options.pool_width
            cmd += ' -u %s' % targets_df['sum_stat'].iloc[ti]
            if clip_ti is not None:
                cmd += ' -c %f' % clip_ti
            if clipsoft_ti is not None:
                cmd += ' --clip_soft %f' % clipsoft_ti
            cmd += ' -s %f' % scale_ti
            if options.blacklist_bed:
                cmd += ' -b %s' % options.blacklist_bed
            if options.interp_nan:
                cmd += ' -i'
            if options.norm:
                cmd += ' --norm %s' % options.norm
            if options.step:
                cmd += ' --step %i' % options.step
            if options.padding:
                cmd += ' --padding %s' % options.padding
            cmd += ' %s' % genome_cov_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_file

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % seqs_cov_stem
                read_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='read_t%d' % ti,
                              out_file='%s.out' % seqs_cov_stem,
                              err_file='%s.err' % seqs_cov_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                read_jobs.append(j)

    if options.run_local:
        util.exec_par(read_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(read_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # write TF Records
    ################################################################
    # copy targets file
    shutil.copy(targets_file, '%s/targets.txt' % options.out_dir)

    # initialize TF Records dir
    tfr_dir = '%s/tfrecords' % options.out_dir
    if not os.path.isdir(tfr_dir):
        os.mkdir(tfr_dir)

    write_jobs = []

    for fold_set in fold_labels:
        fold_set_indexes = [
            i for i in range(len(mseqs)) if mseqs[i].label == fold_set
        ]
        fold_set_start = fold_set_indexes[0]
        fold_set_end = fold_set_indexes[-1] + 1

        tfr_i = 0
        tfr_start = fold_set_start
        tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

        while tfr_start <= fold_set_end:
            tfr_stem = '%s/%s-%d' % (tfr_dir, fold_set, tfr_i)

            cmd = '/home/shush/profile/tfprofile/bin/basenji_data_write.py'
            cmd += ' -s %d' % tfr_start
            cmd += ' -e %d' % tfr_end
            cmd += ' --umap_clip %f' % options.umap_clip
            if options.umap_tfr:
                cmd += ' --umap_tfr'
            if options.umap_bed is not None:
                cmd += ' -u %s' % unmap_npy

            cmd += ' %s' % fasta_file
            cmd += ' %s' % seqs_bed_file
            cmd += ' %s' % seqs_cov_dir
            cmd += ' %s.tfr' % tfr_stem

            if options.run_local:
                # breaks on some OS
                # cmd += ' &> %s.err' % tfr_stem
                write_jobs.append(cmd)
            else:
                j = slurm.Job(cmd,
                              name='write_%s-%d' % (fold_set, tfr_i),
                              out_file='%s.out' % tfr_stem,
                              err_file='%s.err' % tfr_stem,
                              queue='standard',
                              mem=15000,
                              time='12:0:0')
                write_jobs.append(j)

            # update
            tfr_i += 1
            tfr_start += options.seqs_per_tfr
            tfr_end = min(tfr_start + options.seqs_per_tfr, fold_set_end)

    if options.run_local:
        util.exec_par(write_jobs, options.processes, verbose=True)
    else:
        slurm.multi_run(write_jobs,
                        options.processes,
                        verbose=True,
                        launch_sleep=1,
                        update_sleep=5)

    ################################################################
    # stats
    ################################################################
    stats_dict = {}
    stats_dict['num_targets'] = targets_df.shape[0]
    stats_dict['seq_length'] = options.seq_length
    stats_dict['pool_width'] = options.pool_width
    stats_dict['crop_bp'] = options.crop_bp

    target_length = options.seq_length - 2 * options.crop_bp
    target_length = target_length // options.pool_width
    stats_dict['target_length'] = target_length

    for fi in range(num_folds):
        stats_dict['%s_seqs' % fold_labels[fi]] = len(fold_mseqs[fi])

    print('~~~\n' * 10, end='')
    print('%s/statistics.json' % options.out_dir)
    print('~~~\n' * 10, end='')
    with open('%s/statistics.json' % options.out_dir, 'w') as stats_json_out:
        json.dump(stats_dict, stats_json_out, indent=4)
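
# contig_sequences, called above to stride model sequences across each fold's
# contigs, is not defined in this excerpt. The sketch below matches the call
# signature used above and assumes ModelSeq is a (chr, start, end, label)
# namedtuple; coordinate handling in the actual implementation may differ.
def contig_sequences(contigs, seq_length, stride, snap, label):
    mseqs = []
    for ctg in contigs:
        seq_start = ctg.start
        if snap is not None and snap > 1:
            # snap the start coordinate up to the next multiple of snap
            seq_start = ((seq_start + snap - 1) // snap) * snap
        seq_end = seq_start + seq_length

        # slide fixed-length windows across the contig
        while seq_end <= ctg.end:
            mseqs.append(ModelSeq(ctg.chr, seq_start, seq_end, label))
            seq_start += stride
            seq_end += stride

    return mseqs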
Ejemplo n.º 30
0
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <bed_file>'
    parser = OptionParser(usage)

    # basenji_sat_bed.py options
    parser.add_option(
        '-d',
        dest='mut_down',
        default=0,
        type='int',
        help=
        'Nucleotides downstream of center sequence to mutate [Default: %default]'
    )
    parser.add_option('-f',
                      dest='genome_fasta',
                      default=None,
                      help='Genome FASTA for sequences [Default: %default]')
    parser.add_option(
        '-l',
        dest='mut_len',
        default=200,
        type='int',
        help='Length of center sequence to mutate [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='sat_mut',
                      help='Output directory [Default: %default]')
    parser.add_option('--plots',
                      dest='plots',
                      default=False,
                      action='store_true',
                      help='Make heatmap plots [Default: %default]')
    parser.add_option(
        '--rc',
        dest='rc',
        default=False,
        action='store_true',
        help=
        'Ensemble forward and reverse complement predictions [Default: %default]'
    )
    parser.add_option('--shifts',
                      dest='shifts',
                      default='0',
                      help='Ensemble prediction shifts [Default: %default]')
    parser.add_option(
        '--stats',
        dest='sad_stats',
        default='sum',
        help='Comma-separated list of stats to save. [Default: %default]')
    parser.add_option(
        '-t',
        dest='targets_file',
        default=None,
        type='str',
        help='File specifying target indexes and labels in table format')
    parser.add_option(
        '-u',
        dest='mut_up',
        default=0,
        type='int',
        help=
        'Nucleotides upstream of center sequence to mutate [Default: %default]'
    )

    # _multi.py options
    parser.add_option('-e',
                      dest='conda_env',
                      default='tf2.4',
                      help='Anaconda environment [Default: %default]')
    parser.add_option('--max_proc',
                      dest='max_proc',
                      default=None,
                      type='int',
                      help='Maximum concurrent processes [Default: %default]')
    parser.add_option('-n',
                      dest='name',
                      default='sat',
                      help='SLURM job name prefix [Default: %default]')
    parser.add_option('-p',
                      dest='processes',
                      default=None,
                      type='int',
                      help='Number of processes, passed by multi script')
    parser.add_option(
        '-q',
        dest='queue',
        default='k80',
        help='SLURM queue on which to run the jobs [Default: %default]')
    parser.add_option(
        '-r',
        dest='restart',
        default=False,
        action='store_true',
        help='Restart a partially completed job [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        print(args)
        parser.error('Must provide parameters file, model file, and BED file.')
    else:
        params_file = args[0]
        model_file = args[1]
        bed_file = args[2]

    #######################################################
    # prep work

    # output directory
    if not options.restart:
        if os.path.isdir(options.out_dir):
            print('Please remove %s' % options.out_dir, file=sys.stderr)
            exit(1)
        os.mkdir(options.out_dir)

    # pickle options
    options_pkl_file = '%s/options.pkl' % options.out_dir
    options_pkl = open(options_pkl_file, 'wb')
    pickle.dump(options, options_pkl)
    options_pkl.close()

    #######################################################
    # launch worker threads
    jobs = []
    for pi in range(options.processes):
        if not options.restart or not job_completed(options, pi):
            cmd = '. /home/drk/anaconda3/etc/profile.d/conda.sh;'
            cmd += ' conda activate %s;' % options.conda_env

            cmd += ' basenji_sat_bed.py %s %s %d' % (options_pkl_file,
                                                     ' '.join(args), pi)
            name = '%s_p%d' % (options.name, pi)
            outf = '%s/job%d.out' % (options.out_dir, pi)
            errf = '%s/job%d.err' % (options.out_dir, pi)
            j = slurm.Job(cmd,
                          name,
                          outf,
                          errf,
                          queue=options.queue,
                          cpu=2,
                          gpu=1,
                          mem=30000,
                          time='14-0:0:0')
            jobs.append(j)

    slurm.multi_run(jobs,
                    max_proc=options.max_proc,
                    verbose=True,
                    launch_sleep=10,
                    update_sleep=60)

    #######################################################
    # collect output

    sad_stat = options.sad_stats.split(',')[0]
    collect_h5(options.out_dir, options.processes, sad_stat)
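
# job_completed (used for the --restart check above) and collect_h5 are not
# defined in this excerpt. The sketch below covers job_completed, assuming each
# worker writes its scores to out_dir/job<pi>/scores.h5; the actual output
# filename may differ.
import os

def job_completed(options, pi):
    scores_file = '%s/job%d/scores.h5' % (options.out_dir, pi)
    return os.path.isfile(scores_file)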