Example #1
def summarize_metrics(args):
    import numpy as np
    import h5py
    import pandas as pd
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        c = filename.split('/')
        d = {'dataset': c[2],
            'cv_index': c[4].split('_')[-1],
            'model_name': c[-1].split('.')[1],
            'featureset': c[-1].split('.')[2],
            'icshape_dataset': c[-1].split('.')[3]
        }
        return d

    summary = []
    for input_file in args.input_files:
        d = parse_filename(input_file)
        with h5py.File(input_file, 'r') as f:
            d['accuracy'] = f['metrics/accuracy'][()]
            d['roc_auc'] = f['metrics/roc_auc'][()]
            summary.append(d)
    summary = pd.DataFrame.from_records(summary)
    summary = summary[['dataset', 'icshape_dataset', 'model_name', 'featureset', 'cv_index', 'accuracy', 'roc_auc']]
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
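Every command in this collection writes its result through open_file_or_stdout from the local ioutils module, which is not shown here. A minimal sketch of what such a helper could look like, assuming '-' or None selects standard output (the real ioutils implementation may differ):

import sys

def open_file_or_stdout(filename):
    # hypothetical stand-in: the commands here use the returned object both
    # directly (fout = ...; fout.close()) and as a context manager, so it
    # returns a file-like object; a fuller version would avoid closing stdout
    if (filename is None) or (filename == '-'):
        return sys.stdout
    return open(filename, 'w')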
Example #2
def read_duplicate_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    bin_size = args.bin_size
    max_length = args.max_length
    n = max_length // bin_size
    bounds = np.arange(0, (n + 1) * bin_size, bin_size)

    logger.info('read chrom sizes: ' + args.chrom_sizes_file)
    chrom_sizes = {}
    with open(args.chrom_sizes_file, 'r') as f:
        for line in f:
            c = line.strip().split('\t')
            chrom_sizes[c[0]] = int(c[1])
    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    dup_counts = np.zeros(n + 1, dtype=np.int64)
    tot_counts = np.zeros(n + 1, dtype=np.int64)
    for read in sam:
        # skip unmapped reads (no reference name)
        if read.is_unmapped:
            continue
        index = min(chrom_sizes[read.reference_name] // bin_size, n)
        if read.is_duplicate:
            dup_counts[index] += 1
        tot_counts[index] += 1

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        f.write('bin\tduplicates\ttotal\n')
        for i in range(n + 1):
            f.write('{}\t{}\t{}\n'.format(bounds[i], dup_counts[i],
                                          tot_counts[i]))
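These functions also call a module-level logger that is never defined inside them. A typical setup that would make those calls work, with a placeholder logger name chosen purely for illustration:

import logging

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
# 'preprocess' is a placeholder; the real module may use __name__ or another name
logger = logging.getLogger('preprocess')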
Example #3
def extract_feature_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split(
                '|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(
                    os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()
Example #4
def extract_mature_mirna_location(args):
    from utils import read_gff, GFFRecord
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict, defaultdict

    logger.info('read input GFF file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    # key: precursor_id, value: precursor record
    precursors = OrderedDict()
    # key: precursor_id, value: list of mature records
    matures = defaultdict(list)
    # read features from GFF file
    for record in read_gff(fin):
        if record.feature == 'miRNA_primary_transcript':
            precursors[record.attr['ID']] = record
        elif record.feature == 'miRNA':
            matures[record.attr['Derives_from']].append(record)
    # get locations of mature miRNAs
    for precursor_id, precursor in precursors.items():
        for mature in matures[precursor_id]:
            if mature.strand == '+':
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], mature.start - precursor.start,
                    mature.end - precursor.start + 1, mature.attr['Name']))
            else:
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], precursor.end - mature.end,
                    precursor.end - mature.start + 1, mature.attr['Name']))
    fin.close()
    fout.close()
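extract_mature_mirna_location depends on read_gff and GFFRecord from the local utils module. Judging only from the attributes used above (feature, start, end, strand and an attr mapping), a compatible sketch could look like the following; the real utils code may differ:

from collections import namedtuple

# fields guessed from the attributes accessed above
GFFRecord = namedtuple('GFFRecord', ['seqid', 'source', 'feature', 'start',
                                     'end', 'score', 'strand', 'frame', 'attr'])

def read_gff(fin):
    # yield one GFFRecord per non-comment line, with GFF3 attributes
    # parsed from the key=value pairs of column 9
    for line in fin:
        if line.startswith('#'):
            continue
        c = line.strip().split('\t')
        attr = dict(a.split('=', 1) for a in c[8].split(';') if '=' in a)
        yield GFFRecord(c[0], c[1], c[2], int(c[3]), int(c[4]),
                        c[5], c[6], c[7], attr)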
Example #5
def merge_data_frames(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdout

    if (not args.on_index) and (args.on is None):
        raise ValueError(
            'argument --on is required if --on-index is not specified')
    merged = None
    for input_file in args.input_file:
        logger.info('read input file: ' + input_file)
        df = pd.read_table(input_file, sep=args.sep)
        if merged is None:
            merged = df
        else:
            if args.on_index:
                merged = pd.merge(merged,
                                  df,
                                  how=args.how,
                                  left_index=True,
                                  right_index=True)
            else:
                merged = pd.merge(merged, df, how=args.how, on=args.on)
    if args.fillna is not None:
        merged.fillna(args.fillna, inplace=True)
    logger.info('open output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)
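A small illustration of the two merge modes used above, on made-up frames: with --on-index the join uses the row index, otherwise it uses the column(s) named by --on.

import pandas as pd

a = pd.DataFrame({'gene': ['g1', 'g2'], 'x': [1, 2]}).set_index('gene')
b = pd.DataFrame({'gene': ['g2', 'g3'], 'y': [3, 4]}).set_index('gene')
# what --on-index does (join on the row index)
print(pd.merge(a, b, how='outer', left_index=True, right_index=True))
# what --on gene does (join on a named column)
print(pd.merge(a.reset_index(), b.reset_index(), how='outer', on='gene'))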
Example #6
def fragment_length_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    max_length = args.max_length
    counts = np.zeros(max_length + 1, dtype=np.int64)
    read1 = None
    for read in sam:
        if (not read.is_paired) or (not read.is_proper_pair):
            continue
        if read.is_read1:
            read1 = read
        elif read.is_read2:
            # skip a read2 whose mate has not been seen yet
            if (read1 is None) or (read.query_name != read1.query_name):
                continue
            length = read.reference_end - read1.reference_start
            counts[min(length, max_length)] += 1

    with open_file_or_stdout(args.output_file) as f:
        f.write('fragment_length\tcounts\n')
        for i in range(max_length + 1):
            f.write('{}\t{}\n'.format(i, counts[i]))
Example #7
def count_transcript(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout
    from collections import OrderedDict

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts = OrderedDict()
    min_mapping_quality = args.min_mapping_quality
    strandness = {'no': 0, 'forward': 1, 'reverse': 2}.get(args.strandness, 0)
    for read in sam:
        if read.is_unmapped:
            continue
        if read.mapping_quality < min_mapping_quality:
            continue
        if (strandness == 1) and read.is_reverse:
            continue
        if (strandness == 2) and (not read.is_reverse):
            continue
        if read.reference_name not in counts:
            counts[read.reference_name] = 0
        counts[read.reference_name] += 1

    with open_file_or_stdout(args.output_file) as f:
        if sam.header is not None:
            for sq in sam.header['SQ']:
                name = sq['SN']
                f.write('{}\t{}\n'.format(name, counts.get(name, 0)))
        else:
            for name, count in counts.items():
                f.write('{}\t{}\n'.format(name, count))
Example #8
def chrom_sizes(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
Example #9
def gtf_to_transcript_table(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict

    feature = args.feature
    default_transcript_type = args.transcript_type
    default_gene_type = args.gene_type

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        transcripts = OrderedDict()
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            if 'transcript_name' not in attrs:
                attrs['transcript_name'] = attrs['transcript_id']
            if 'gene_name' not in attrs:
                attrs['gene_name'] = attrs['gene_id']
            if default_transcript_type is not None:
                attrs['transcript_type'] = default_transcript_type
            else:
                if 'transcript_type' not in attrs:
                    attrs['transcript_type'] = 'unknown'
            if default_gene_type is not None:
                attrs['gene_type'] = default_gene_type
            else:
                if 'gene_type' not in attrs:
                    attrs['gene_type'] = 'unknown'
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
                attrs['gene_id'], attrs['transcript_id'], 
                attrs['gene_name'], attrs['transcript_name'],
                attrs['gene_type'], attrs['transcript_type'], c[1]]
            transcript = transcripts.get(attrs['transcript_id'])
            if transcript is None:
                transcripts[attrs['transcript_id']] = exon
            else:
                if c[2] == 'exon':
                    transcript[1] = min(transcript[1], exon[1])
                    transcript[2] = max(transcript[2], exon[2])
        header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
            'gene_id', 'transcript_id', 
            'gene_name', 'transcript_name',
            'gene_type', 'transcript_type', 'source'
        ]
        print('\t'.join(header), file=fout)
        for transcript in transcripts.values():
            print('\t'.join(str(a) for a in transcript), file=fout)
    fout.close()
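The attribute-parsing loop above splits the ninth GTF column on ';' and then on the first space of each entry. A stand-alone check of that logic on a made-up attribute string:

attr_str = 'gene_id "ENSG000001"; transcript_id "ENST000001"; gene_name "ABC";'
attrs = {}
for a in attr_str.split(';')[:-1]:
    a = a.strip()
    i = a.find(' ')
    attrs[a[:i]] = a[(i + 1):].strip('"')
print(attrs)
# {'gene_id': 'ENSG000001', 'transcript_id': 'ENST000001', 'gene_name': 'ABC'}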
Example #10
def rfam(args):
    import numpy as np
    import subprocess
    from Bio import SeqIO
    from io import StringIO
    import re
    from utils import random_sequences
    from ioutils import open_file_or_stdout

    alphabet = 'AUCG'

    # read CM file
    motif_name = 'RFAM'
    with open(args.input_file, 'r') as f:
        for line in f:
            c = line.strip().split()
            if c[0] == 'NAME':
                motif_name = c[1]
                break

    n_motif_seqs = args.n - round(args.bg_percent*0.01*args.n)
    n_bg_seqs = round(args.bg_percent*0.01*args.n)
    # generate motif sequences
    p = subprocess.Popen(['cmemit', '--nohmmonly', '-e', str(args.length), '-N', str(n_motif_seqs),
        args.input_file], stdout=subprocess.PIPE)
    out, _ = p.communicate()
    sequences = []
    starts = np.zeros(n_motif_seqs, dtype=np.int32)
    ends = np.zeros(n_motif_seqs, dtype=np.int32)
    labels = np.zeros(args.n, dtype=np.int32)
    labels[:n_motif_seqs] = 1
    pat_cmemit = re.compile(r'^[^/]+/([0-9]+)\-([0-9]+)$')
    for i, record in enumerate(SeqIO.parse(StringIO(str(out, encoding='ascii')), 'fasta')):
        start, end = pat_cmemit.match(record.id).groups()
        sequences.append(str(record.seq))
        starts[i] = int(start) + 1
        ends[i] = int(end)
    # generate background sequences
    if n_bg_seqs > 0:
        sequences += random_sequences(args.length, alphabet=alphabet, size=n_bg_seqs)
        starts = np.append(starts, np.zeros(n_bg_seqs, dtype=np.int32))
        ends = np.append(ends, np.zeros(n_bg_seqs, dtype=np.int32))
    
    # shuffle orders
    logger.info('generate {} motif sequences and {} background sequences'.format(n_motif_seqs, n_bg_seqs))
    seq_indices = np.random.permutation(args.n)
    sequences = [sequences[i] for i in seq_indices]
    labels = labels[seq_indices]
    starts = starts[seq_indices]
    ends = ends[seq_indices]

    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d},{},{},{}\n'.format(motif_name, i + 1, labels[i], starts[i], ends[i]))
        fout.write(seq)
        fout.write('\n')
    fout.close()
Example #11
def extract_longest_transcript(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import defaultdict
    from functools import partial

    feature = args.feature
    genes = defaultdict(partial(defaultdict, int))
    lines = []
    logger.info('read gtf file: ' + args.input_file)
    with open_file_or_stdin(args.input_file) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                lines.append(('#other#', line))
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(
                        lineno))
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
    kept_transcripts = set()
    kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        kept_transcripts.add(max_transcript)

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('number of transcripts: {}'.format(
        sum(map(len, genes.values()))))
    logger.info(
        'number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
    logger.info('write output gtf file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id in kept_transcripts:
                fout.write(line)
Example #12
def normalize(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    import pandas as pd

    with open_file_or_stdin(args.input_file) as f:
        matrix = pd.read_table(f, sep='\t', index_col=0)
    if args.method == 'cpm':
        matrix = 1e6 * matrix.astype('float') / matrix.sum(axis=0)
    with open_file_or_stdout(args.output_file) as f:
        matrix.to_csv(f, sep='\t', header=True, index=True, na_rep='NA')
Example #13
def background(args):
    from utils import random_sequences
    from ioutils import open_file_or_stdout

    sequences = random_sequences(args.length, alphabet=args.alphabet, size=args.n)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>RANDOM_{:06d}:0\n'.format(i + 1))
        fout.write(seq)
        fout.write('\n')
    fout.close()
Example #14
def sequential_mapping(args):
    from jinja2 import Template, Environment
    from ioutils import open_file_or_stdout

    rna_types = []
    if len(args.rna_types) > 0:
        rna_types = args.rna_types.split(',')

    logger.info('load template: ' + args.template)
    env = Environment(lstrip_blocks=True, trim_blocks=True)
    with open(args.template, 'r') as f:
        template = env.from_string(f.read())
    with open_file_or_stdout(args.output_file) as f:
        f.write(template.render(rna_types=rna_types, aligner=args.aligner))
Example #15
def gtf_to_bed(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout

    exon_feature = 'exon'
    # use transcript_id attribute as key
    transcripts = {}
    logger.info('read input GTF file: ' + args.input_file)
    for lineno, record in enumerate(read_gtf(args.input_file)):
        c, attrs, line = record
        if c[2] == exon_feature:
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id attribute not found in GTF file {}:{}'.format(
                        args.input_file, lineno))
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id attribute not found in GTF file {}:{}'.
                    format(args.input_file, lineno))
            transcript = transcripts.get(transcript_id)
            if transcript is None:
                # new transcript
                transcript = {
                    'chrom': c[0],
                    'strand': c[6],
                    'gene_id': gene_id,
                    'gene_name': attrs.get('gene_name', gene_id),
                    'transcript_name': attrs.get('transcript_name',
                                                 transcript_id),
                    'exons': []
                }
                transcripts[transcript_id] = transcript
            # add a new exon
            transcript['exons'].append((int(c[3]) - 1, int(c[4])))

    fout = open_file_or_stdout(args.output_file)
    bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
    for transcript_id, transcript in transcripts.items():
        # sort exons by start position
        transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
        transcript['n_exons'] = len(transcript['exons'])
        transcript['start'] = transcript['exons'][0][0]
        transcript['end'] = transcript['exons'][-1][1]
        transcript['exon_starts'] = ','.join(
            str(e[0] - transcript['start']) for e in transcript['exons'])
        transcript['exon_sizes'] = ','.join(
            str(e[1] - e[0]) for e in transcript['exons'])
        transcript['name'] = '{gene_id}'.format(**transcript)
        fout.write(bed_template.format(**transcript))
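gtf_to_bed (and, further down, fix_gtf and extract_gene) iterate over read_gtf, a module-level helper that is not shown in these examples and that yields the split columns, the parsed attributes and the raw line. A plausible sketch, reusing the same attribute parsing as gtf_to_transcript_table; the real helper may differ:

def read_gtf(filename):
    # hypothetical sketch: yield (columns, attrs, raw_line) for every GTF record
    with open(filename, 'r') as fin:
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                attrs[a[:i]] = a[(i + 1):].strip('"')
            yield c, attrs, line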
Example #16
def transcript_counts(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout
    from collections import defaultdict

    logger.info('read input transcript BAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts = defaultdict(int)
    for read in sam:
        # skip unmapped reads (reference_name is None)
        if read.is_unmapped:
            continue
        counts[read.reference_name] += 1

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for key, val in counts.items():
            fout.write('{}\t{}\n'.format(key, val))
Example #17
def filter_circrna_reads(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout, open_file_or_stdin
    from collections import defaultdict
    from copy import deepcopy

    logger.info('read input SAM file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam_in = pysam.AlignmentFile(fin, "r")
    if sam_in.header is None:
        raise ValueError('requires SAM header to get junction positions')
    # get junction positions (middle of the sequences)
    junction_positions = {}
    for sq in sam_in.header['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2

    logger.info('create output SAM file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)

    sam_filtered = None
    if args.filtered_file is not None:
        logger.info('create filtered SAM file: ' + args.filtered_file)
        sam_filtered = pysam.AlignmentFile(args.filtered_file,
                                           'w',
                                           template=sam_in)

    for read in sam_in:
        filtered = False
        if read.is_unmapped:
            filtered = True
        elif read.is_reverse:
            filtered = True
        else:
            pos = junction_positions[read.reference_name]
            if not (read.reference_start < pos <= read.reference_end):
                filtered = True
        if not filtered:
            sam_out.write(read)
        elif sam_filtered is not None:
            sam_filtered.write(read)

    fin.close()
    fout.close()
    if sam_filtered is not None:
        sam_filtered.close()
Example #18
def flagstat(args):
    import pysam
    from ioutils import open_file_or_stdin, open_file_or_stdout

    logger.info('read input file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam = pysam.AlignmentFile(fin, 'rb')
    counts = [0] * 4096
    for read in sam:
        counts[read.flag] += 1
    sam.close()

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('flag\tcounts\n')
        for flag, count in enumerate(counts):
            if count > 0:
                fout.write('{}\t{}\n'.format(flag, count))
Example #19
def extract_feature_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    def pad_range(start, end, chrom_size, max_length):
        if (end - start) >= max_length:
            return start, end
        padding_left = (max_length - (end - start)) // 2
        new_start = max(0, start - padding_left)
        new_end = min(new_start + max_length, chrom_size)
        return new_start, new_end

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            if lineno == 0:
                continue
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split(
                '|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            # load FASTA file
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(
                    os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()
Example #20
def extract_circrna_junction(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    anchor_size = args.anchor_size
    logger.info('read sequence file: ' + args.input_file)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            seq = str(record.seq)
            if len(seq) < args.min_length:
                continue
            s = min(len(seq), anchor_size)
            seq_id = record.id.split('|')[0]
            fout.write('>{}\n'.format(seq_id))
            fout.write(seq[-s:] + seq[:s])
            fout.write('\n')
    fout.close()
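The expression seq[-s:] + seq[:s] above builds the back-splice junction of a circular RNA: the last s bases joined to the first s bases. A tiny worked example with a made-up 10 nt sequence and an anchor_size of 3:

seq = 'ACGUACGUAC'   # made-up 10 nt circular transcript
s = 3                # anchor_size
junction = seq[-s:] + seq[:s]
print(junction)      # 'UACACG': last 3 nt joined to first 3 nt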
Example #21
def pwm(args):
    import numpy as np
    from utils import read_transfac
    from utils import sample_pwm, random_sequences, embed_pwm
    from ioutils import open_file_or_stdout

    logger.info('read motif file: ' + args.input_file)
    motif = read_transfac(args.input_file)
    pwm = motif['PWM']/np.sum(motif['PWM'], axis=1, keepdims=True)
    alphabet = motif['PO']
    pwm_name = motif['ID']
    motif_length = motif['PWM'].shape[0]

    if args.length is None:
        args.length = motif_length
    if args.length < motif_length:
        raise ValueError('cannot embed motif of length {} into sequence of length {}'.format(motif_length, args.length))

    n_motif_seqs = args.n - round(args.bg_percent*0.01*args.n)
    n_bg_seqs = round(args.bg_percent*0.01*args.n)
    sequences, starts = embed_pwm(pwm, alphabet=alphabet, size=n_motif_seqs, length=args.length)
    ends = starts + motif_length
    if n_bg_seqs > 0:
        sequences += random_sequences(args.length, alphabet=alphabet, size=n_bg_seqs)
        starts = np.append(starts, np.zeros(n_bg_seqs, dtype=np.int32))
        ends = np.append(ends, np.zeros(n_bg_seqs, dtype=np.int32))
    labels = np.zeros(args.n, dtype=np.int32)
    labels[:n_motif_seqs] = 1

    # shuffle orders
    logger.info('generate {} motif sequences and {} background sequences'.format(n_motif_seqs, n_bg_seqs))
    seq_indices = np.random.permutation(args.n)
    sequences = [sequences[i] for i in seq_indices]
    labels = labels[seq_indices]
    starts = starts[seq_indices]
    ends = ends[seq_indices]

    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d},{},{},{}\n'.format(pwm_name, i + 1, labels[i], starts[i], ends[i]))
        fout.write(seq)
        fout.write('\n')
    fout.close()
Example #22
def sample_pwm(args):
    import numpy as np
    from utils import read_transfac
    from utils import sample_pwm as _sample_pwm
    from ioutils import open_file_or_stdout

    logger.info('read motif file: ' + args.input_file)
    motif = read_transfac(args.input_file)
    pwm = motif['PWM'] / np.sum(motif['PWM'], axis=1, keepdims=True)
    alphabet = motif['PO'].replace('T', 'U')
    sequences = _sample_pwm(pwm, alphabet=alphabet, size=args.n)
    pwm_name = motif['ID']

    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d}\n'.format(pwm_name, i + 1))
        fout.write(seq)
        fout.write('\n')
    fout.close()
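Both pwm and sample_pwm rely on utils.sample_pwm, which is assumed rather than shown. Assuming each PWM row holds the per-position probabilities over the alphabet, sampling could be sketched as follows (not the actual utils implementation):

import numpy as np

def sample_pwm(pwm, alphabet='AUCG', size=1):
    # hypothetical sketch: pwm is assumed to be a (length, len(alphabet))
    # array of per-position probabilities; draw one letter per position
    letters = list(alphabet)
    sequences = []
    for _ in range(size):
        indices = [np.random.choice(len(alphabet), p=row) for row in pwm]
        sequences.append(''.join(letters[i] for i in indices))
    return sequences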
Example #23
def read_length_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts_ref = np.zeros(args.max_length, dtype=np.int64)
    counts_query = np.zeros(args.max_length, dtype=np.int64)
    max_length = args.max_length
    for read in sam:
        counts_query[min(read.query_length, max_length - 1)] += 1
        # reference_length is None for unmapped reads
        if read.reference_length is not None:
            counts_ref[min(read.reference_length, max_length - 1)] += 1

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        f.write('length\tquery\treference\n')
        for i in range(args.max_length):
            f.write('{}\t{}\t{}\n'.format(i, counts_query[i], counts_ref[i]))
Example #24
def fix_gtf(args):
    from ioutils import open_file_or_stdout
    from collections import defaultdict
    # strand of exons grouped by transcript_id
    strands = defaultdict(list)

    feature = args.feature
    lines = []
    logger.info('read GTF file: ' + args.input_file)
    for lineno, (c, attrs, line) in enumerate(read_gtf(args.input_file)):
        if c[2] in ('transcript', 'exon'):
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(
                        lineno))
            lines.append((transcript_id, line))
            # collect the strand of each exon, grouped by transcript_id
            if c[2] == 'exon':
                strands[transcript_id].append(c[6])
    invalid_transcripts = set()
    for transcript_id, strands_tx in strands.items():
        strands_tx = set(strands_tx)
        # remove transcripts without strand information
        if '.' in strands_tx:
            invalid_transcripts.add(transcript_id)
        # remove transcripts with exon on different strands
        elif len(strands_tx) != 1:
            invalid_transcripts.add(transcript_id)

    logger.info('number of transcripts: {}'.format(len(strands)))
    logger.info('number of invalid transcripts: {}'.format(
        len(invalid_transcripts)))
    logger.info('write GTF file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id not in invalid_transcripts:
                fout.write(line)
Example #25
def calculate_clustering_score(args):
    import numpy as np
    import pandas as pd
    from evaluation import uca_score, knn_score
    from ioutils import open_file_or_stdout

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')

    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.25)

    logger.info('calculate clustering score')
    if args.method == 'uca_score':
        if args.sample_classes is None:
            raise ValueError(
                'argument --sample-classes is required for uca_score')
        logger.info('read sample classes: ' + args.sample_classes)
        sample_classes = pd.read_table(args.sample_classes,
                                       index_col=0,
                                       sep='\t').iloc[:, 0]
        y = sample_classes[X.index.values].values
        score = uca_score(X, y)
    elif args.method == 'knn_score':
        if args.batch is None:
            raise ValueError('argument --batch is required for knn_score')
        if args.batch_index is None:
            raise ValueError(
                'argument --batch-index is required for knn_score')
        logger.info('read batch information: ' + args.batch)
        batch = pd.read_table(args.batch, index_col=0,
                              sep='\t').iloc[:, args.batch_index - 1]
        batch = batch[X.index.values].values
        score = knn_score(X, batch)
    else:
        raise ValueError('unknown clustering score method: ' + args.method)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('{}'.format(score))
Example #26
def render(args):
    from jinja2 import Template, Environment, StrictUndefined
    from ioutils import open_file_or_stdout
    from collections import defaultdict
    import yaml
    import json

    env = Environment(lstrip_blocks=True,
                      trim_blocks=True,
                      undefined=StrictUndefined)
    with open(args.input_file, 'r') as f:
        template = env.from_string(f.read())

    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)
    config['tracks'] = dict(
        sorted(config['tracks'].items(), key=lambda x: x[1]['order']))
    config['tracks_json'] = json.dumps(config['tracks'], indent=4)
    config['options_json'] = json.dumps(config['options'], indent=4)
    with open_file_or_stdout(args.output_file) as f:
        f.write(template.render(**config))
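render expects a YAML config with a tracks mapping whose entries carry an order key (used for sorting) and an options mapping. A made-up config illustrating that structure:

import yaml

# made-up config with the shape render() expects
config = yaml.safe_load('''
tracks:
  coverage: {order: 2, type: wig}
  annotation: {order: 1, type: bed}
options:
  width: 800
''')
print(sorted(config['tracks'].items(), key=lambda x: x[1]['order']))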
Example #27
def calc_rpkm(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout

    matrix = pd.read_table(open_file_or_stdin(args.input_file),
                           index_col=0,
                           sep='\t')
    feature_info = matrix.index.to_series().str.split('|', expand=True)
    feature_info.columns = [
        'gene_id', 'gene_type', 'gene_name', 'feature_id', 'transcript_id',
        'start', 'end'
    ]
    feature_info['start'] = feature_info['start'].astype('int')
    feature_info['end'] = feature_info['end'].astype('int')
    feature_info['length'] = feature_info['end'] - feature_info['start']
    matrix = 1000.0 * matrix.div(feature_info['length'], axis=0)
    matrix.to_csv(open_file_or_stdout(args.output_file),
                  index=True,
                  header=True,
                  sep='\t',
                  na_rep='NA')
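Note that calc_rpkm above only scales counts by feature length in kilobases (reads per kilobase). If full RPKM with per-million library-size scaling is wanted, one extra step would be needed; a hypothetical illustration on made-up data:

import pandas as pd

# made-up counts: two features (500 nt and 2000 nt) in two samples
counts = pd.DataFrame({'s1': [10, 40], 's2': [5, 20]}, index=['f1', 'f2'])
length = pd.Series([500, 2000], index=['f1', 'f2'])
rpk = 1000.0 * counts.div(length, axis=0)   # what calc_rpkm computes
rpkm = 1e6 * rpk / counts.sum(axis=0)       # with per-million scaling added
print(rpkm)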
Example #28
def summarize_metrics(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        d = {}
        keymap = {
            'd': 'dataset',
            'w': 'window_size',
            'b': 'binarization_method',
            'm': 'model',
            'i': 'cv_index'
        }
        for v in filename.split(','):
            c = v.split('=')
            if len(c) == 1:
                d[keymap[c[0]]] = None
            elif len(c) == 2:
                d[keymap[c[0]]] = c[1]
            else:
                raise ValueError('cannot parse filename: ' + filename)
        return d

    logger.info('read input directory: ' + args.input_dir)
    summary = []
    for input_file in os.listdir(args.input_dir):
        d = parse_filename(input_file)
        with h5py.File(os.path.join(args.input_dir, input_file), 'r') as f:
            d['accuracy'] = f['metrics/accuracy'][()]
            d['roc_auc'] = f['metrics/roc_auc'][()]
            summary.append(d)
    summary = pd.DataFrame.from_records(summary)
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
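parse_filename above assumes file names are comma-separated key=value pairs with single-letter keys. A made-up name showing the expected shape and the resulting mapping:

# hypothetical example of the file name format parse_filename expects
filename = 'd=dataset1,w=100,b=threshold,m=svm,i=0'
keymap = {'d': 'dataset', 'w': 'window_size', 'b': 'binarization_method',
          'm': 'model', 'i': 'cv_index'}
print({keymap[k]: v for k, v in (p.split('=') for p in filename.split(','))})
# {'dataset': 'dataset1', 'window_size': '100',
#  'binarization_method': 'threshold', 'model': 'svm', 'cv_index': '0'}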
Example #29
def summarize_metrics_by_rna(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        d = {}
        keymap = {
            'd': 'dataset',
            'w': 'window_size',
            'b': 'binarization_method',
            'm': 'model',
            'i': 'cv_index'
        }
        for v in filename.split(','):
            c = v.split('=')
            if len(c) == 1:
                d[keymap[c[0]]] = None
            elif len(c) == 2:
                d[keymap[c[0]]] = c[1]
            else:
                raise ValueError('cannot parse filename: ' + filename)
        return d

    logger.info('read input directory: ' + args.input_dir)
    summary = []
    for input_file in os.listdir(args.input_dir):
        d = parse_filename(input_file)
        metrics = pd.read_table(os.path.join(args.input_dir, input_file))
        for key, value in d.items():
            metrics[key] = value
        summary.append(metrics)
    summary = pd.concat(summary, axis=0)
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
Example #30
def extract_gene(args):
    from ioutils import open_file_or_stdout

    feature = args.feature
    genes = {}
    logger.info('read GTF file: ' + args.input_file)
    for c, attrs, line in read_gtf(args.input_file):
        if (feature is not None) and (c[2] != feature):
            continue
        gene_id = attrs.get('gene_id')
        gene = genes.get(gene_id)
        if gene is None:
            gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
            genes[gene_id] = gene
        else:
            gene[1] = min(gene[1], int(c[3]) - 1)
            gene[2] = max(gene[2], int(c[4]))

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('write BED file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, gene in genes.items():
            fout.write('\t'.join(map(str, gene)))
            fout.write('\n')