Example 1
    def test_seqs(self):
        """Test that the one hot coded sequences match."""
        for gi in range(2):
            # read sequence coordinates
            seqs_bed_file = '%s/sequences%d.bed' % (self.out_dir, gi)
            seq_coords = read_seq_coords(seqs_bed_file)

            # read one hot coding from TF Records
            train_tfrs_str = '%s/tfrecords/train-%d-0.tfr' % (self.out_dir, gi)
            seqs_1hot, _, genomes = self.read_tfrecords(train_tfrs_str)

            # check genome
            self.assertEqual(len(np.unique(genomes)), 1)
            self.assertEqual(genomes[0], gi)

            # open FASTA
            fasta_open = pysam.Fastafile(self.fasta_files[gi])

            # check random sequences
            seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
            for si in seq_indexes:
                sc = seq_coords[si]

                seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()
                seq_1hot_dna = hot1_dna(seqs_1hot[si])
                self.assertEqual(seq_fasta, seq_1hot_dna)
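
The assertion above relies on hot1_dna exactly inverting the one-hot encoding written to the TFRecords. For reference, a minimal sketch of that round trip (illustrative only, not Basenji's dna_io implementation; handling of N bases and soft-masking is ignored here):

import numpy as np

ALPHABET = np.array(list('ACGT'))

def dna_1hot_sketch(seq):
    """Encode an ACGT string as an (L, 4) one-hot matrix."""
    idx = ['ACGT'.index(nt) for nt in seq.upper()]
    seq_1hot = np.zeros((len(seq), 4), dtype='float16')
    seq_1hot[np.arange(len(seq)), idx] = 1
    return seq_1hot

def hot1_dna_sketch(seq_1hot):
    """Decode an (L, 4) one-hot matrix back to an ACGT string."""
    return ''.join(ALPHABET[seq_1hot.argmax(axis=-1)])

assert hot1_dna_sketch(dna_1hot_sketch('GATTACA')) == 'GATTACA'
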
Example 2
def global_align(seq1_1hot, seq2_1hot):
    """Align two 1-hot encoded sequences."""

    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))
    seq_align = global_pairwise_align_nucleotide(seq1_dna, seq2_dna,
                                                 **align_opts)[0]
    seq1_align = str(seq_align[0])
    seq2_align = str(seq_align[1])
    return seq1_align, seq2_align
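
A hedged usage sketch, assuming scikit-bio provides DNA and global_pairwise_align_nucleotide (as imported by the surrounding module) and that dna_io.dna_1hot one-hot encodes a DNA string:

seq1_1hot = dna_io.dna_1hot('ACGTACGTAC')
seq2_1hot = dna_io.dna_1hot('ACGTCGTAC')   # one base deleted

seq1_align, seq2_align = global_align(seq1_1hot, seq2_1hot)
print(seq1_align)   # 'ACGTACGTAC'
print(seq2_align)   # gapped, e.g. 'ACGT-CGTAC'
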
Example 3
    def test_seqs(self):
        """Test that the one hot coded sequences match."""
        # read sequence coordinates
        seqs_bed_file = '%s/sequences.bed' % self.out_dir
        seq_coords = read_seq_coords(seqs_bed_file)

        # read one hot coding from TF Records
        train_tfrs_str = '%s/tfrecords/train-0.tfr' % self.out_dir
        seqs_1hot, _ = self.read_tfrecords(train_tfrs_str)

        # open FASTA
        fasta_open = pysam.Fastafile(self.fasta_file)

        # check random sequences
        seq_indexes = random.sample(range(seqs_1hot.shape[0]), 32)
        for si in seq_indexes:
            sc = seq_coords[si]

            seq_fasta = fasta_open.fetch(sc.chr, sc.start, sc.end).upper()
            seq_1hot_dna = hot1_dna(seqs_1hot[si])
            self.assertEqual(seq_fasta, seq_1hot_dna)
Example 4
def main():
    usage = "usage: %prog [options] <tfr_dir> <out_bw>"
    parser = OptionParser(usage)
    parser.add_option("-f",
                      dest="fasta_file",
                      default="%s/assembly/ucsc/hg38.fa" % os.environ["HG38"])
    parser.add_option(
        "-g",
        dest="genome_file",
        default="%s/assembly/ucsc/hg38.human.genome" % os.environ["HG38"],
    )
    parser.add_option(
        "-l",
        dest="target_length",
        default=1024,
        type="int",
        help="TFRecord target length [Default: %default]",
    )
    parser.add_option("-s", dest="data_split", default="train")
    parser.add_option(
        "-t",
        dest="target_i",
        default=0,
        type="int",
        help="Target index [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide TF Records directory and output BigWig")
    else:
        tfr_dir = args[0]
        out_bw_file = args[1]

    # initialize output BigWig
    out_bw_open = pyBigWig.open(out_bw_file, "w")

    # construct header
    header = []
    for line in open(options.genome_file):
        a = line.split()
        header.append((a[0], int(a[1])))

    # write header
    out_bw_open.addHeader(header)

    # initialize chr dictionary
    chr_values = {}
    for chrm, clen in header:
        chr_values[chrm] = np.zeros(clen, dtype="float16")

    # open sequences BED
    seq_bed_open = open("%s/../sequences0.bed" % tfr_dir)

    # open FASTA
    fasta_open = pysam.Fastafile(options.fasta_file)

    # initialize one shot iterator
    # next_op = make_next_op('%s/%s-0-0.tfr' % (tfr_dir, options.data_split))
    next_op = make_next_op("%s/%s-0-*.tfr" % (tfr_dir, options.data_split))

    # read sequence values
    with tf.Session() as sess:
        next_datum = sess.run(next_op)
        while next_datum:
            # read sequence
            seq_bed_line = seq_bed_open.readline()
            a = seq_bed_line.rstrip().split("\t")
            while a[-1] != options.data_split:
                seq_bed_line = seq_bed_open.readline()
                a = seq_bed_line.rstrip().split("\t")
            chrm = a[0]
            start = int(a[1])
            end = int(a[2])
            target_pool = (end - start) // options.target_length

            # check sequence
            seq_1hot = next_datum["sequence"].reshape((-1, 4))
            seq_1hot_dna = hot1_dna(seq_1hot)
            seq_fasta = fasta_open.fetch(chrm, start, end).upper()
            if seq_1hot_dna != seq_fasta:
                seq_diff = np.array([
                    seq_1hot_dna[i] != seq_fasta[i]
                    for i in range(len(seq_fasta))
                ], dtype="bool")
                print("WARNING: %s:%d-%d differs by %d nts (%.4f)" %
                      (chrm, start, end, seq_diff.sum(), seq_diff.mean()))

            # read targets
            targets = next_datum["target"].reshape(options.target_length, -1)
            targets_ti = targets[:, options.target_i]

            # set values
            chr_values[chrm][start:end] = np.repeat(targets_ti, target_pool)

            try:
                next_datum = sess.run(next_op)
            except tf.errors.OutOfRangeError:
                next_datum = False

    fasta_open.close()

    # write chr values
    for chrm, _ in header:
        print(chrm)
        out_bw_open.addEntries(chrm,
                               0,
                               values=chr_values[chrm],
                               span=1,
                               step=1)

    # close files
    out_bw_open.close()
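
The np.repeat call above expands the pooled targets back to base resolution: each of the target_length values covers target_pool consecutive bases. A small worked example with hypothetical numbers:

import numpy as np

# a 16 bp sequence with target_length=4 gives target_pool = 16 // 4 = 4
targets_ti = np.array([0.5, 2.0, 1.0, 0.0], dtype='float16')
base_values = np.repeat(targets_ti, 4)
print(base_values.shape)   # (16,)
print(base_values[:8])     # [0.5 0.5 0.5 0.5 2.  2.  2.  2. ]
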
Example 5
def alleles_1hot(gene_seq, seq_1hot, seq_snps):
    ''' One hot code for gene sequence alleles. '''

    # initialize one hot coding
    aseqs_1hot = []

    # add reference allele sequence
    aseqs_1hot.append(np.copy(seq_1hot))

    # set all reference alleles
    for snp in seq_snps:

        # determine SNP position wrt sequence
        snp_seq_pos = snp.pos - 1 - gene_seq.start

        # verify that the reference allele matches the reference
        seq_ref = dna_io.hot1_dna(aseqs_1hot[0][snp_seq_pos:snp_seq_pos +
                                                len(snp.ref_allele), :])
        if seq_ref != snp.ref_allele:
            print(
                'WARNING: %s - ref allele %s does not match reference genome %s; changing reference genome to match.'
                % (snp.rsid, snp.ref_allele, seq_ref),
                file=sys.stderr)

            if len(seq_ref) == len(snp.ref_allele):
                # SNP
                dna_io.hot1_set(aseqs_1hot[0], snp_seq_pos, snp.ref_allele)

            # not confident in these operations

            # elif len(seq_ref) > len(snp.ref_allele):
            #   # deletion
            #   delete_len = len(seq_ref) - len(snp.ref_allele)
            #   dna_io.hot1_delete(aseqs_1hot[0], snp_seq_pos + 1, delete_len)

            # else:
            #   # insertion
            #   dna_io.hot1_insert(aseqs_1hot[0], snp_seq_pos + 1, snp.ref_allele[1:])

            else:
                raise Exception(
                    'ERROR: reference mismatch indels cannot yet be handled.')

    # for each SNP
    for snp in seq_snps:

        # determine SNP position wrt sequence
        snp_seq_pos = snp.pos - 1 - gene_seq.start

        # add minor allele sequence
        aseqs_1hot.append(np.copy(aseqs_1hot[0]))
        if len(snp.ref_allele) == len(snp.alt_alleles[0]):
            # SNP
            dna_io.hot1_set(aseqs_1hot[-1], snp_seq_pos, snp.alt_alleles[0])

        elif len(snp.ref_allele) > len(snp.alt_alleles[0]):
            # deletion
            delete_len = len(snp.ref_allele) - len(snp.alt_alleles[0])
            assert (snp.ref_allele[0] == snp.alt_alleles[0][0])
            dna_io.hot1_delete(aseqs_1hot[-1], snp_seq_pos + 1, delete_len)

        else:
            # insertion
            assert (snp.ref_allele[0] == snp.alt_alleles[0][0])
            dna_io.hot1_insert(aseqs_1hot[-1], snp_seq_pos + 1,
                               snp.alt_alleles[0][1:])

    # finalize
    aseqs_1hot = np.array(aseqs_1hot)

    return aseqs_1hot
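
Conceptually, dna_io.hot1_set overwrites the one-hot rows at the SNP position with the allele's bases. A sketch of that operation (not Basenji's implementation; hot1_delete and hot1_insert are more involved because the encoded sequence length must stay fixed):

import numpy as np

def hot1_set_sketch(seq_1hot, pos, allele):
    """Overwrite one-hot rows starting at pos with the given allele."""
    for i, nt in enumerate(allele.upper()):
        seq_1hot[pos + i, :] = 0
        seq_1hot[pos + i, 'ACGT'.index(nt)] = 1

seq_1hot = np.eye(4, dtype='float16')   # encodes 'ACGT'
hot1_set_sketch(seq_1hot, 1, 'TT')      # now encodes 'ATTT'
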
Example 6
def parse_input(input_file, sample):
    """ Parse an input file that might be FASTA or HDF5. """

    try:
        # input_file is FASTA

        # read sequences and headers
        seqs = []
        seq_headers = []
        for line in open(input_file):
            if line[0] == ">":
                seq_headers.append(line[1:].rstrip())
                seqs.append("")
            else:
                seqs[-1] += line.rstrip()

        # convert to arrays
        seqs = np.array(seqs)
        seq_headers = np.array(seq_headers)

        # one hot code sequences
        seqs_1hot = []
        for seq in seqs:
            seqs_1hot.append(dna_io.dna_1hot(seq))
        seqs_1hot = np.array(seqs_1hot)

        # sample
        if sample:
            sample_i = np.array(
                random.sample(range(seqs_1hot.shape[0]), sample))
            seqs_1hot = seqs_1hot[sample_i]
            seq_headers = seq_headers[sample_i]
            seqs = seqs[sample_i]

        # initialize targets variable
        targets = None

    except UnicodeDecodeError:
        # input_file is HDF5

        try:
            # load (sampled) test data from HDF5
            hdf5_in = h5py.File(input_file, "r")
            seqs_1hot = np.array(hdf5_in["test_in"])
            targets = np.array(hdf5_in["test_out"])
            hdf5_in.close()

            # sample
            if sample:
                sample_i = np.array(
                    random.sample(range(seqs_1hot.shape[0]), sample))
                seqs_1hot = seqs_1hot[sample_i]
                targets = targets[sample_i]

            # convert to ACGT sequences
            seqs = dna_io.hot1_dna(seqs_1hot)

        except IOError:
            parser.error("Could not parse input file as FASTA or HDF5.")

    return seqs, seqs_1hot, targets
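
The FASTA/HDF5 dispatch works because iterating over a binary HDF5 file in text mode raises UnicodeDecodeError (the HDF5 signature begins with the byte 0x89, which is invalid as a UTF-8 lead byte), which the except clause catches before retrying with h5py. The same probe in isolation, as a minimal sketch:

def sniff_format(input_file):
    """Guess 'fasta' or 'hdf5' using the text-decode probe above."""
    try:
        with open(input_file) as probe:
            probe.readline()
        return 'fasta'
    except UnicodeDecodeError:
        return 'hdf5'
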
Example 7
def main():
    usage = 'usage: %prog [options] <params_file> <model_file> <data_dir>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='act_t',
        default=0.5,
        type='float',
        help=
        'Activation threshold (as proportion of max) to consider for PWM [Default: %default]'
    )
    parser.add_option(
        '-d',
        dest='plot_density',
        default=False,
        action='store_true',
        help='Plot filter activation density [Default: %default]')
    parser.add_option(
        '--heat',
        dest='plot_heats',
        default=False,
        action='store_true',
        help=
        'Plot heat maps describing filter activations in the test sequences [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='seq_length_crop',
        default=None,
        type='int',
        help='Crop sequences to shorter length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='basenji_motifs')
    parser.add_option('-m',
                      dest='meme_db',
                      default='%s/cisbp/Homo_sapiens.meme' %
                      os.environ['HG38'],
                      help='MEME database used to annotate motifs')
    parser.add_option(
        '-p',
        dest='parallel_threads',
        default=1,
        type='int',
        help='Generate weblogos in parallel threads [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=None,
        type='int',
        help='Sample sequences from the test set [Default: %default]')
    parser.add_option(
        '-t',
        dest='trim_filters',
        default=False,
        action='store_true',
        help=
        'Trim uninformative positions off the filter ends [Default: %default]')
    parser.add_option(
        '--tfr',
        dest='tfr_pattern',
        default='test-*.tfr',
        help='TFR pattern string appended to data_dir [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error(
            'Must provide Basenji params and model files and data directory')
    else:
        params_file = args[0]
        model_file = args[1]
        data_dir = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    #######################################################
    # inputs

    # read model parameters
    with open(params_file) as params_open:
        params = json.load(params_open)
    params_model = params['model']
    params_train = params['train']
    if options.seq_length_crop is not None:
        params_model['seq_length'] = options.seq_length_crop

    # read data parameters
    data_stats_file = '%s/statistics.json' % data_dir
    with open(data_stats_file) as data_stats_open:
        data_stats = json.load(data_stats_open)

    # construct data ops
    tfr_pattern_path = '%s/tfrecords/%s' % (data_dir, options.tfr_pattern)
    eval_data = dataset.SeqDataset(tfr_pattern_path,
                                   seq_length=data_stats['seq_length'],
                                   seq_length_crop=options.seq_length_crop,
                                   target_length=data_stats['target_length'],
                                   batch_size=params_train['batch_size'],
                                   mode=tf.estimator.ModeKeys.EVAL)

    # obtain sequences
    eval_seqs_1hot = eval_data.numpy(return_inputs=True, return_outputs=False)
    eval_seqs_dna = dna_io.hot1_dna(eval_seqs_1hot)
    del eval_seqs_1hot

    #################################################################
    # model

    # initialize model
    seqnn_model = seqnn.SeqNN(params_model)
    seqnn_model.restore(model_file)

    # first layer embedding
    seqnn_model.build_embed(0)
    _, preds_length, preds_depth = seqnn_model.embed.output.shape

    # get weights
    filter_weights = seqnn_model.get_conv_weights()
    print(filter_weights.shape)
    num_filters, _, filter_size = filter_weights.shape

    # compute filter activations
    filter_outs = seqnn_model.predict(eval_data)
    print(filter_outs.shape)

    #################################################################
    # individual filter plots

    # save information contents
    filters_ic = []
    meme_out = meme_intro('%s/filters_meme.txt' % options.out_dir,
                          eval_seqs_dna)

    # plot weblogo of high scoring outputs (in parallel)
    if options.parallel_threads > 1:
        pfl_args = []
        for f in range(num_filters):
            pfl_args.append(
                (filter_outs[:, :, f], filter_size, eval_seqs_dna,
                 '%s/filter%d_logo' % (options.out_dir, f), options.act_t))
        with multiprocessing.get_context('spawn').Pool(
                options.parallel_threads) as pool:
            pool.starmap(plot_filter_logo, pfl_args)

    for f in range(num_filters):
        print('Filter %d' % f)

        # plot filter parameters as a heatmap
        plot_filter_heat(filter_weights[f, :, :],
                         '%s/filter%d_heat.pdf' % (options.out_dir, f))

        if options.parallel_threads == 1:
            plot_filter_logo(filter_outs[:, :, f], filter_size, eval_seqs_dna,
                             '%s/filter%d_logo' % (options.out_dir, f),
                             options.act_t)

        # write possum motif file
        # filter_possum(filter_weights[f, :, :], 'filter%d' % f,
        #               '%s/filter%d_possum.txt' % (options.out_dir,
        #                                           f), options.trim_filters)

        # make a PWM for the filter
        filter_pwm, nsites = make_filter_pwm('%s/filter%d_logo.fa' %
                                             (options.out_dir, f))

        if nsites < 10:
            # no information
            filters_ic.append(0)
        else:
            # compute and save information content
            filters_ic.append(info_content(filter_pwm))

            # add to the meme motif file
            meme_add(meme_out, f, filter_pwm, nsites, options.trim_filters)

    meme_out.close()

    #################################################################
    # annotate filters
    #################################################################
    # run tomtom
    subprocess.call(
        'tomtom -dist pearson -thresh 0.1 -oc %s/tomtom %s/filters_meme.txt %s'
        % (options.out_dir, options.out_dir, options.meme_db),
        shell=True)

    # read in annotations
    filter_names = name_filters(num_filters,
                                '%s/tomtom/tomtom.tsv' % options.out_dir,
                                options.meme_db)

    #################################################################
    # print a table of information
    #################################################################
    table_out = open('%s/table.txt' % options.out_dir, 'w')

    # print header for later pandas reading
    header_cols = ('', 'consensus', 'annotation', 'ic', 'mean', 'std')
    print('%3s  %19s  %10s  %5s  %6s  %6s' % header_cols, file=table_out)

    for f in range(num_filters):
        # collapse to a consensus motif
        consensus = filter_motif(filter_weights[f, :, :])

        # grab annotation
        annotation = '.'
        name_pieces = filter_names[f].split('_')
        if len(name_pieces) > 1:
            annotation = name_pieces[1]

        f_scores = np.ravel(filter_outs[:, :, f])
        fmean, fstd = f_scores.mean(), f_scores.std()
        if options.plot_density:
            # plot density of filter output scores
            plot_score_density(f_scores,
                               '%s/filter%d_dens.pdf' % (options.out_dir, f))

        row_cols = (f, consensus, annotation, filters_ic[f], fmean, fstd)
        print('%-3d  %19s  %10s  %5.2f  %6.4f  %6.4f' % row_cols,
              file=table_out)

    table_out.close()

    #################################################################
    # global filter plots
    #################################################################

    # these methods make less sense for longer sequences;
    # I should fragment the sequences first.

    if options.plot_heats:
        # plot filter-sequence heatmap
        plot_filter_seq_heat(filter_outs,
                             '%s/filter_seqs.pdf' % options.out_dir)

        # plot filter-segment heatmap
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs.pdf' % options.out_dir)
        plot_filter_seg_heat(filter_outs,
                             '%s/filter_segs_raw.pdf' % options.out_dir,
                             whiten=False)

        # plot filter-target correlation heatmap
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_mean.pdf' % options.out_dir,
                         'mean')
        plot_target_corr(filter_outs, seq_targets, filter_names, target_names,
                         '%s/filter_target_cors_max.pdf' % options.out_dir,
                         'max')