def summarize_metrics(args):
    import numpy as np
    import h5py
    import pandas as pd
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        c = filename.split('/')
        d = {'dataset': c[2],
             'cv_index': c[4].split('_')[-1],
             'model_name': c[-1].split('.')[1],
             'featureset': c[-1].split('.')[2],
             'icshape_dataset': c[-1].split('.')[3]
             }
        return d

    summary = []
    for input_file in args.input_files:
        d = parse_filename(input_file)
        with h5py.File(input_file, 'r') as f:
            d['accuracy'] = f['metrics/accuracy'][()]
            d['roc_auc'] = f['metrics/roc_auc'][()]
        summary.append(d)
    summary = pd.DataFrame.from_records(summary)
    summary = summary[['dataset', 'icshape_dataset', 'model_name', 'featureset',
                       'cv_index', 'accuracy', 'roc_auc']]
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
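# Every subcommand in this module writes through open_file_or_stdout (and reads
# through open_file_or_stdin) from the project's ioutils module, which is not
# shown here. As a rough, hypothetical sketch of the assumed contract -- return
# a writable file-like object, falling back to stdout when no real path is
# given -- it could look like the helper below (the _sketch suffix marks it as
# an illustration, not the project's actual implementation):
def _open_file_or_stdout_sketch(filename):
    import sys
    if (filename is None) or (filename == '-'):
        return sys.stdout
    return open(filename, 'w')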
def read_duplicate_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    bin_size = args.bin_size
    max_length = args.max_length
    n = max_length // bin_size
    bounds = np.arange(0, (n + 1) * bin_size, bin_size)
    logger.info('read chrom sizes: ' + args.chrom_sizes_file)
    chrom_sizes = {}
    with open(args.chrom_sizes_file, 'r') as f:
        for line in f:
            c = line.strip().split('\t')
            chrom_sizes[c[0]] = int(c[1])
    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    dup_counts = np.zeros(n + 1, dtype=np.int64)
    tot_counts = np.zeros(n + 1, dtype=np.int64)
    for read in sam:
        index = min(chrom_sizes[read.reference_name] // bin_size, n)
        if read.is_duplicate:
            dup_counts[index] += 1
        tot_counts[index] += 1
    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        f.write('bin\tduplicates\ttotal\n')
        for i in range(n + 1):
            f.write('{}\t{}\t{}\n'.format(bounds[i], dup_counts[i], tot_counts[i]))
def extract_feature_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(
                    os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()
def extract_mature_mirna_location(args):
    from utils import read_gff, GFFRecord
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict, defaultdict

    logger.info('read input GFF file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    # key: precursor_id, value: precursor record
    precursors = OrderedDict()
    # key: precursor_id, value: list of mature records
    matures = defaultdict(list)
    # read features from GFF file
    for record in read_gff(fin):
        if record.feature == 'miRNA_primary_transcript':
            precursors[record.attr['ID']] = record
        elif record.feature == 'miRNA':
            matures[record.attr['Derives_from']].append(record)
    # get locations of mature miRNAs relative to their precursor
    for precursor_id, precursor in precursors.items():
        for mature in matures[precursor_id]:
            if mature.strand == '+':
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], mature.start - precursor.start,
                    mature.end - precursor.start + 1, mature.attr['Name']))
            else:
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'], precursor.end - mature.end,
                    precursor.end - mature.start + 1, mature.attr['Name']))
    fin.close()
    fout.close()
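# extract_mature_mirna_location relies on read_gff/GFFRecord from the project's
# utils module (not shown). Judging from the attribute access above (feature,
# strand, start, end, attr), the record is roughly a named tuple over the nine
# GFF3 columns with the attribute column parsed into a dict. A hypothetical
# sketch of that interface (names with a _sketch/Sketch suffix are illustrative
# only, not the project's actual code):
from collections import namedtuple

GFFRecordSketch = namedtuple('GFFRecordSketch', [
    'seqid', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attr'])

def _read_gff_sketch(fileobj):
    # yield one record per non-comment GFF3 line; attributes are 'key=value' pairs
    for line in fileobj:
        if line.startswith('#'):
            continue
        c = line.strip().split('\t')
        if len(c) < 9:
            continue
        attr = {}
        for field in c[8].split(';'):
            field = field.strip()
            if '=' in field:
                key, value = field.split('=', 1)
                attr[key] = value
        yield GFFRecordSketch(c[0], c[1], c[2], int(c[3]), int(c[4]),
                              c[5], c[6], c[7], attr)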
def merge_data_frames(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdout

    if (not args.on_index) and (args.on is None):
        raise ValueError('argument --on is required if --on-index is not specified')
    merged = None
    for input_file in args.input_file:
        logger.info('read input file: ' + input_file)
        df = pd.read_table(input_file, sep=args.sep)
        if merged is None:
            merged = df
        else:
            if args.on_index:
                merged = pd.merge(merged, df, how=args.how,
                                  left_index=True, right_index=True)
            else:
                merged = pd.merge(merged, df, how=args.how, on=args.on)
    if args.fillna is not None:
        merged.fillna(args.fillna, inplace=True)
    logger.info('open output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        merged.to_csv(f, sep=args.sep, header=True, index=args.on_index)
def fragment_length_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    max_length = args.max_length
    counts = np.zeros(max_length + 1, dtype=np.int64)
    read1 = None
    for read in sam:
        if (not read.is_paired) or (not read.is_proper_pair):
            continue
        if read.is_read1:
            read1 = read
        elif read.is_read2:
            # only count a pair once its read1 mate has been seen
            if (read1 is None) or (read.query_name != read1.query_name):
                continue
            length = read.reference_end - read1.reference_start
            counts[min(length, max_length)] += 1
    with open_file_or_stdout(args.output_file) as f:
        f.write('fragment_length\tcounts\n')
        for i in range(max_length + 1):
            f.write('{}\t{}\n'.format(i, counts[i]))
def count_transcript(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout
    from collections import OrderedDict

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts = OrderedDict()
    min_mapping_quality = args.min_mapping_quality
    strandness = {'no': 0, 'forward': 1, 'reverse': 2}.get(args.strandness, 0)
    for read in sam:
        if read.is_unmapped:
            continue
        if read.mapping_quality < min_mapping_quality:
            continue
        if (strandness == 1) and read.is_reverse:
            continue
        if (strandness == 2) and (not read.is_reverse):
            continue
        if read.reference_name not in counts:
            counts[read.reference_name] = 0
        counts[read.reference_name] += 1
    with open_file_or_stdout(args.output_file) as f:
        if sam.header is not None:
            for sq in sam.header['SQ']:
                name = sq['SN']
                f.write('{}\t{}\n'.format(name, counts.get(name, 0)))
        else:
            for name, count in counts.items():
                f.write('{}\t{}\n'.format(name, count))
def chrom_sizes(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
def gtf_to_transcript_table(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict

    feature = args.feature
    default_transcript_type = args.transcript_type
    default_gene_type = args.gene_type
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        transcripts = OrderedDict()
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            if 'transcript_name' not in attrs:
                attrs['transcript_name'] = attrs['transcript_id']
            if 'gene_name' not in attrs:
                attrs['gene_name'] = attrs['gene_id']
            if default_transcript_type is not None:
                attrs['transcript_type'] = default_transcript_type
            else:
                if 'transcript_type' not in attrs:
                    attrs['transcript_type'] = 'unknown'
            if default_gene_type is not None:
                attrs['gene_type'] = default_gene_type
            else:
                if 'gene_type' not in attrs:
                    attrs['gene_type'] = 'unknown'
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
                    attrs['gene_id'], attrs['transcript_id'],
                    attrs['gene_name'], attrs['transcript_name'],
                    attrs['gene_type'], attrs['transcript_type'], c[1]]
            transcript = transcripts.get(attrs['transcript_id'])
            if transcript is None:
                transcripts[attrs['transcript_id']] = exon
            else:
                if c[2] == 'exon':
                    transcript[1] = min(transcript[1], exon[1])
                    transcript[2] = max(transcript[2], exon[2])
    header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
              'gene_id', 'transcript_id', 'gene_name', 'transcript_name',
              'gene_type', 'transcript_type', 'source']
    print('\t'.join(header), file=fout)
    for transcript in transcripts.values():
        print('\t'.join(str(a) for a in transcript), file=fout)
    fout.close()
def rfam(args):
    import numpy as np
    import subprocess
    from Bio import SeqIO
    from io import StringIO
    import re
    from utils import random_sequences
    from ioutils import open_file_or_stdout

    alphabet = 'AUCG'
    # read motif name from the CM file
    motif_name = 'RFAM'
    with open(args.input_file, 'r') as f:
        for line in f:
            c = line.strip().split()
            if c[0] == 'NAME':
                motif_name = c[1]
                break
    n_motif_seqs = args.n - round(args.bg_percent * 0.01 * args.n)
    n_bg_seqs = round(args.bg_percent * 0.01 * args.n)
    # generate motif sequences
    p = subprocess.Popen(['cmemit', '--nohmmonly', '-e', str(args.length),
                          '-N', str(n_motif_seqs), args.input_file],
                         stdout=subprocess.PIPE)
    out, _ = p.communicate()
    sequences = []
    starts = np.zeros(n_motif_seqs, dtype=np.int32)
    ends = np.zeros(n_motif_seqs, dtype=np.int32)
    labels = np.zeros(args.n, dtype=np.int32)
    labels[:n_motif_seqs] = 1
    pat_cmemit = re.compile(r'^[^/]+/([0-9]+)\-([0-9]+)$')
    for i, record in enumerate(SeqIO.parse(StringIO(str(out, encoding='ascii')), 'fasta')):
        start, end = pat_cmemit.match(record.id).groups()
        sequences.append(str(record.seq))
        starts[i] = int(start) + 1
        ends[i] = int(end)
    # generate background sequences
    if n_bg_seqs > 0:
        sequences += random_sequences(args.length, alphabet=alphabet, size=n_bg_seqs)
        starts = np.append(starts, np.zeros(n_bg_seqs, dtype=np.int32))
        ends = np.append(ends, np.zeros(n_bg_seqs, dtype=np.int32))
    # shuffle orders
    logger.info('generate {} motif sequences and {} background sequences'.format(
        n_motif_seqs, n_bg_seqs))
    seq_indices = np.random.permutation(args.n)
    sequences = [sequences[i] for i in seq_indices]
    labels = labels[seq_indices]
    starts = starts[seq_indices]
    ends = ends[seq_indices]
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d},{},{},{}\n'.format(
            motif_name, i + 1, labels[i], starts[i], ends[i]))
        fout.write(seq)
        fout.write('\n')
    fout.close()
def extract_longest_transcript(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import defaultdict
    from functools import partial

    feature = args.feature
    genes = defaultdict(partial(defaultdict, int))
    lines = []
    logger.info('read gtf file: ' + args.input_file)
    with open_file_or_stdin(args.input_file) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                lines.append(('#other#', line))
                continue
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(lineno))
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1
    kept_transcripts = set()
    kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        # keep only the longest transcript of each gene
        kept_transcripts.add(max_transcript)
    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('number of transcripts: {}'.format(sum(map(len, genes.values()))))
    logger.info('number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
    logger.info('write output gtf file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id in kept_transcripts:
                fout.write(line)
def normalize(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    import pandas as pd

    with open_file_or_stdin(args.input_file) as f:
        matrix = pd.read_table(f, sep='\t', index_col=0)
    if args.method == 'cpm':
        matrix = 1e6 * matrix.astype('float') / matrix.sum(axis=0)
    with open_file_or_stdout(args.output_file) as f:
        matrix.to_csv(f, sep='\t', header=True, index=True, na_rep='NA')
def background(args):
    from utils import random_sequences
    from ioutils import open_file_or_stdout

    sequences = random_sequences(args.length, alphabet=args.alphabet, size=args.n)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>RANDOM_{:06d}:0\n'.format(i + 1))
        fout.write(seq)
        fout.write('\n')
    fout.close()
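# background, pwm and rfam all call random_sequences from the project's utils
# module (not shown). A minimal sketch of the assumed behaviour -- draw `size`
# independent sequences of the given length over `alphabet`, uniform base
# frequencies assumed -- could look like this (illustrative only):
def _random_sequences_sketch(length, alphabet='AUCG', size=1):
    import numpy as np
    letters = np.asarray(list(alphabet))
    return [''.join(np.random.choice(letters, size=length)) for _ in range(size)]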
def sequential_mapping(args):
    from jinja2 import Template, Environment
    from ioutils import open_file_or_stdout

    rna_types = []
    if len(args.rna_types) > 0:
        rna_types = args.rna_types.split(',')
    logger.info('load template: ' + args.template)
    env = Environment(lstrip_blocks=True, trim_blocks=True)
    with open(args.template, 'r') as f:
        template = env.from_string(f.read())
    with open_file_or_stdout(args.output_file) as f:
        f.write(template.render(rna_types=rna_types, aligner=args.aligner))
def gtf_to_bed(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout

    exon_feature = 'exon'
    # use transcript_id attribute as key
    transcripts = {}
    logger.info('read input GTF file: ' + args.input_file)
    for lineno, record in enumerate(read_gtf(args.input_file)):
        c, attrs, line = record
        if c[2] == exon_feature:
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id attribute not found in GTF file {}:{}'.format(
                        args.input_file, lineno))
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id attribute not found in GTF file {}:{}'.format(
                        args.input_file, lineno))
            transcript = transcripts.get(transcript_id)
            if transcript is None:
                # new transcript
                transcript = {
                    'chrom': c[0],
                    'strand': c[6],
                    'gene_id': gene_id,
                    'gene_name': attrs.get('gene_name', gene_id),
                    'transcript_name': attrs.get('transcript_name', transcript_id),
                    'exons': []
                }
                transcripts[transcript_id] = transcript
            # add a new exon
            transcript['exons'].append((int(c[3]) - 1, int(c[4])))
    fout = open_file_or_stdout(args.output_file)
    bed_template = '{chrom}\t{start}\t{end}\t{name}\t0\t{strand}\t0\t0\t0\t{n_exons}\t{exon_sizes}\t{exon_starts}\n'
    for transcript_id, transcript in transcripts.items():
        # sort exons by start position
        transcript['exons'] = sorted(transcript['exons'], key=lambda x: x[0])
        transcript['n_exons'] = len(transcript['exons'])
        transcript['start'] = transcript['exons'][0][0]
        transcript['end'] = transcript['exons'][-1][1]
        transcript['exon_starts'] = ','.join(
            str(e[0] - transcript['start']) for e in transcript['exons'])
        transcript['exon_sizes'] = ','.join(
            str(e[1] - e[0]) for e in transcript['exons'])
        transcript['name'] = '{gene_id}'.format(**transcript)
        fout.write(bed_template.format(**transcript))
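# gtf_to_bed, fix_gtf and extract_gene iterate over read_gtf(filename), which is
# defined elsewhere in this module. From its usage it yields, for every feature
# line, a tuple of (tab-split fields, attribute dict, raw line). A hypothetical
# sketch of such a generator, using the same attribute parsing as
# gtf_to_transcript_table (illustrative only, not the actual implementation):
def _read_gtf_sketch(filename):
    with open(filename, 'r') as fin:
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#') or (len(c) < 9):
                continue
            attrs = {}
            # GTF attributes look like: key "value"; key "value";
            for a in c[8].split(';'):
                a = a.strip()
                if not a:
                    continue
                i = a.find(' ')
                attrs[a[:i]] = a[(i + 1):].strip('"')
            yield c, attrs, line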
def transcript_counts(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout
    from collections import defaultdict

    logger.info('read input transcript BAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    counts = defaultdict(int)
    for read in sam:
        counts[read.reference_name] += 1
    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for key, val in counts.items():
            fout.write('{}\t{}\n'.format(key, val))
def filter_circrna_reads(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout, open_file_or_stdin
    from collections import defaultdict
    from copy import deepcopy

    logger.info('read input SAM file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam_in = pysam.AlignmentFile(fin, "r")
    if sam_in.header is None:
        raise ValueError('requires SAM header to get junction positions')
    # get junction positions (middle of the sequences)
    junction_positions = {}
    for sq in sam_in.header['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2
    logger.info('create output SAM file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)
    sam_filtered = None
    if args.filtered_file is not None:
        logger.info('create filtered SAM file: ' + args.filtered_file)
        sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)
    for read in sam_in:
        filtered = False
        if read.is_unmapped:
            filtered = True
        elif read.is_reverse:
            filtered = True
        else:
            pos = junction_positions[read.reference_name]
            # keep only reads that span the junction position
            if not (read.reference_start < pos <= read.reference_end):
                filtered = True
        if not filtered:
            sam_out.write(read)
        elif sam_filtered is not None:
            sam_filtered.write(read)
    fin.close()
    fout.close()
    if sam_filtered is not None:
        sam_filtered.close()
def flagstat(args):
    import pysam
    from ioutils import open_file_or_stdin, open_file_or_stdout

    logger.info('read input file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam = pysam.AlignmentFile(fin, 'rb')
    counts = [0] * 4096
    for read in sam:
        counts[read.flag] += 1
    sam.close()
    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('flag\tcounts\n')
        for flag, count in enumerate(counts):
            if count > 0:
                fout.write('{}\t{}\n'.format(flag, count))
def extract_feature_sequence(args):
    from pyfaidx import Fasta
    from Bio.Seq import Seq
    from ioutils import open_file_or_stdout

    def pad_range(start, end, chrom_size, max_length):
        # nothing to pad if the feature already reaches max_length
        if (end - start) >= max_length:
            return start, end
        padding_left = (max_length - (end - start)) // 2
        new_start = max(0, start - padding_left)
        new_end = min(new_start + max_length, chrom_size)
        return new_start, new_end

    fout = open_file_or_stdout(args.output_file)
    fastas = {}
    with open(args.input_file, 'r') as fin:
        for lineno, line in enumerate(fin):
            # skip header line
            if lineno == 0:
                continue
            feature = line.split('\t')[0]
            gene_id, gene_type, gene_name, domain_id, transcript_id, start, end = feature.split('|')
            start = int(start)
            end = int(end)
            if gene_type == 'genomic':
                gene_type = 'genome'
            # load FASTA file
            if gene_type not in fastas:
                fastas[gene_type] = Fasta(
                    os.path.join(args.genome_dir, 'fasta', gene_type + '.fa'))
            if gene_type == 'genome':
                chrom, gstart, gend, strand = gene_id.split('_')
                gstart = int(gstart)
                gend = int(gend)
                seq = fastas[gene_type][chrom][gstart:gend].seq
                if strand == '-':
                    seq = str(Seq(seq).reverse_complement())
            else:
                seq = fastas[gene_type][transcript_id][start:end].seq
            seq = seq.upper()
            fout.write('>{}\n'.format(feature))
            fout.write(seq)
            fout.write('\n')
    fout.close()
def extract_circrna_junction(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    anchor_size = args.anchor_size
    logger.info('read sequence file: ' + args.input_file)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            seq = str(record.seq)
            if len(seq) < args.min_length:
                continue
            s = min(len(seq), anchor_size)
            seq_id = record.id.split('|')[0]
            fout.write('>{}\n'.format(seq_id))
            fout.write(seq[-s:] + seq[:s])
            fout.write('\n')
    fout.close()
def pwm(args):
    import numpy as np
    from utils import read_transfac
    from utils import sample_pwm, random_sequences, embed_pwm
    from ioutils import open_file_or_stdout

    logger.info('read motif file: ' + args.input_file)
    motif = read_transfac(args.input_file)
    pwm = motif['PWM'] / np.sum(motif['PWM'], axis=1, keepdims=True)
    alphabet = motif['PO']
    pwm_name = motif['ID']
    motif_length = motif['PWM'].shape[0]
    if args.length is None:
        args.length = motif_length
    if args.length < motif_length:
        raise ValueError('cannot embed motif of length {} into sequence of length {}'.format(
            motif_length, args.length))
    n_motif_seqs = args.n - round(args.bg_percent * 0.01 * args.n)
    n_bg_seqs = round(args.bg_percent * 0.01 * args.n)
    sequences, starts = embed_pwm(pwm, alphabet=alphabet, size=n_motif_seqs, length=args.length)
    ends = starts + motif_length
    if n_bg_seqs > 0:
        sequences += random_sequences(args.length, alphabet=alphabet, size=n_bg_seqs)
        starts = np.append(starts, np.zeros(n_bg_seqs, dtype=np.int32))
        ends = np.append(ends, np.zeros(n_bg_seqs, dtype=np.int32))
    labels = np.zeros(args.n, dtype=np.int32)
    labels[:n_motif_seqs] = 1
    # shuffle orders
    logger.info('generate {} motif sequences and {} background sequences'.format(
        n_motif_seqs, n_bg_seqs))
    seq_indices = np.random.permutation(args.n)
    sequences = [sequences[i] for i in seq_indices]
    labels = labels[seq_indices]
    starts = starts[seq_indices]
    ends = ends[seq_indices]
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d},{},{},{}\n'.format(
            pwm_name, i + 1, labels[i], starts[i], ends[i]))
        fout.write(seq)
        fout.write('\n')
    fout.close()
def sample_pwm(args):
    import numpy as np
    from utils import read_transfac
    from utils import sample_pwm as _sample_pwm
    from ioutils import open_file_or_stdout

    logger.info('read motif file: ' + args.input_file)
    motif = read_transfac(args.input_file)
    pwm = motif['PWM'] / np.sum(motif['PWM'], axis=1, keepdims=True)
    # sample RNA sequences (replace T with U in the alphabet)
    alphabet = motif['PO'].replace('T', 'U')
    sequences = _sample_pwm(pwm, alphabet=alphabet, size=args.n)
    pwm_name = motif['ID']
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    for i, seq in enumerate(sequences):
        fout.write('>{}_{:06d}\n'.format(pwm_name, i + 1))
        fout.write(seq)
        fout.write('\n')
    fout.close()
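# pwm and sample_pwm depend on utils.sample_pwm (and utils.embed_pwm), which are
# not shown here. sample_pwm presumably draws each sequence column-by-column
# from the position weight matrix. A hypothetical sketch, assuming pwm has shape
# (motif_length, len(alphabet)) with rows summing to 1 (illustrative only):
def _sample_pwm_sketch(pwm, alphabet='AUCG', size=1):
    import numpy as np
    letters = list(alphabet)
    sequences = []
    for _ in range(size):
        # pick one letter per motif position according to that row's probabilities
        seq = ''.join(np.random.choice(letters, p=row) for row in pwm)
        sequences.append(seq)
    return sequences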
def read_length_hist(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout

    logger.info('read input BAM/SAM file: ' + args.input_file)
    sam = pysam.AlignmentFile(args.input_file, "rb")
    max_length = args.max_length
    counts_ref = np.zeros(max_length, dtype=np.int64)
    counts_query = np.zeros(max_length, dtype=np.int64)
    for read in sam:
        counts_query[min(read.query_length, max_length - 1)] += 1
        counts_ref[min(read.reference_length, max_length - 1)] += 1
    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as f:
        f.write('length\tquery\treference\n')
        for i in range(max_length):
            f.write('{}\t{}\t{}\n'.format(i, counts_query[i], counts_ref[i]))
def fix_gtf(args):
    from ioutils import open_file_or_stdout
    from collections import defaultdict

    # strand of exons grouped by transcript_id
    strands = defaultdict(list)
    feature = args.feature
    lines = []
    logger.info('read GTF file: ' + args.input_file)
    for lineno, (c, attrs, line) in enumerate(read_gtf(args.input_file)):
        if c[2] in ('transcript', 'exon'):
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
        else:
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(lineno))
            strands[transcript_id].append(c[6])
    invalid_transcripts = set()
    for transcript_id, strands_tx in strands.items():
        strands_tx = set(strands_tx)
        # remove transcripts without strand information
        if '.' in strands_tx:
            invalid_transcripts.add(transcript_id)
        # remove transcripts with exons on different strands
        elif len(strands_tx) != 1:
            invalid_transcripts.add(transcript_id)
    logger.info('number of transcripts: {}'.format(len(strands)))
    logger.info('number of invalid transcripts: {}'.format(len(invalid_transcripts)))
    logger.info('write GTF file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id not in invalid_transcripts:
                fout.write(line)
def calculate_clustering_score(args):
    import numpy as np
    import pandas as pd
    from evaluation import uca_score, knn_score
    from ioutils import open_file_or_stdout

    logger.info('read feature matrix: ' + args.matrix)
    X = pd.read_table(args.matrix, index_col=0, sep='\t')
    if args.transpose:
        logger.info('transpose feature matrix')
        X = X.T
    if args.use_log:
        logger.info('apply log2 to feature matrix')
        X = np.log2(X + 0.25)
    logger.info('calculate clustering score')
    if args.method == 'uca_score':
        if args.sample_classes is None:
            raise ValueError(
                'argument --sample-classes is required for uca_score')
        logger.info('read sample classes: ' + args.sample_classes)
        sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t').iloc[:, 0]
        y = sample_classes[X.index.values].values
        score = uca_score(X, y)
    elif args.method == 'knn_score':
        if args.batch is None:
            raise ValueError('argument --batch is required for knn_score')
        if args.batch_index is None:
            raise ValueError('argument --batch-index is required for knn_score')
        logger.info('read batch information: ' + args.batch)
        batch = pd.read_table(args.batch, index_col=0, sep='\t').iloc[:, args.batch_index - 1]
        batch = batch[X.index.values].values
        score = knn_score(X, batch)
    else:
        raise ValueError('unknown clustering score method: ' + args.method)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('{}'.format(score))
def render(args):
    from jinja2 import Template, Environment, StrictUndefined
    from ioutils import open_file_or_stdout
    from collections import defaultdict
    import yaml
    import json

    env = Environment(lstrip_blocks=True, trim_blocks=True, undefined=StrictUndefined)
    with open(args.input_file, 'r') as f:
        template = env.from_string(f.read())
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)
    # sort tracks by their 'order' field
    config['tracks'] = dict(
        sorted(config['tracks'].items(), key=lambda x: x[1]['order']))
    config['tracks_json'] = json.dumps(config['tracks'], indent=4)
    config['options_json'] = json.dumps(config['options'], indent=4)
    with open_file_or_stdout(args.output_file) as f:
        f.write(template.render(**config))
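# render expects a YAML config with a 'tracks' mapping (each track carrying an
# integer 'order' used for sorting) and an 'options' mapping. A hypothetical
# minimal example of such a config (track names and fields other than 'order'
# are made up for illustration):
#
#   tracks:
#     coverage:
#       order: 1
#       url: tracks/coverage.bigWig
#     genes:
#       order: 2
#       url: tracks/genes.bed
#   options:
#     genome: hg38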
def calc_rpkm(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout

    matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
    # feature names are of the form:
    # gene_id|gene_type|gene_name|feature_id|transcript_id|start|end
    feature_info = matrix.index.to_series().str.split('|', expand=True)
    feature_info.columns = ['gene_id', 'gene_type', 'gene_name',
                            'feature_id', 'transcript_id', 'start', 'end']
    feature_info['start'] = feature_info['start'].astype('int')
    feature_info['end'] = feature_info['end'].astype('int')
    feature_info['length'] = feature_info['end'] - feature_info['start']
    # normalize counts by feature length in kb (reads per kilobase)
    matrix = 1000.0 * matrix.div(feature_info['length'], axis=0)
    matrix.to_csv(open_file_or_stdout(args.output_file), index=True, header=True,
                  sep='\t', na_rep='NA')
def summarize_metrics(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        d = {}
        keymap = {
            'd': 'dataset',
            'w': 'window_size',
            'b': 'binarization_method',
            'm': 'model',
            'i': 'cv_index'
        }
        for v in filename.split(','):
            c = v.split('=')
            if len(c) == 1:
                d[keymap[c[0]]] = None
            elif len(c) == 2:
                d[keymap[c[0]]] = c[1]
            else:
                raise ValueError('cannot parse filename: ' + filename)
        return d

    logger.info('read input directory: ' + args.input_dir)
    summary = []
    for input_file in os.listdir(args.input_dir):
        d = parse_filename(input_file)
        with h5py.File(os.path.join(args.input_dir, input_file), 'r') as f:
            d['accuracy'] = f['metrics/accuracy'][()]
            d['roc_auc'] = f['metrics/roc_auc'][()]
        summary.append(d)
    summary = pd.DataFrame.from_records(summary)
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
def summarize_metrics_by_rna(args):
    import numpy as np
    import h5py
    import pandas as pd
    from tqdm import tqdm
    from ioutils import open_file_or_stdout

    def parse_filename(filename):
        d = {}
        keymap = {
            'd': 'dataset',
            'w': 'window_size',
            'b': 'binarization_method',
            'm': 'model',
            'i': 'cv_index'
        }
        for v in filename.split(','):
            c = v.split('=')
            if len(c) == 1:
                d[keymap[c[0]]] = None
            elif len(c) == 2:
                d[keymap[c[0]]] = c[1]
            else:
                raise ValueError('cannot parse filename: ' + filename)
        return d

    logger.info('read input directory: ' + args.input_dir)
    summary = []
    for input_file in os.listdir(args.input_dir):
        d = parse_filename(input_file)
        metrics = pd.read_table(os.path.join(args.input_dir, input_file))
        for key, value in d.items():
            metrics[key] = value
        summary.append(metrics)
    summary = pd.concat(summary, axis=0)
    with open_file_or_stdout(args.output_file) as fout:
        summary.to_csv(fout, sep='\t', index=False)
def extract_gene(args):
    from ioutils import open_file_or_stdout

    feature = args.feature
    genes = {}
    logger.info('read GTF file: ' + args.input_file)
    for c, attrs, line in read_gtf(args.input_file):
        if (feature is not None) and (c[2] != feature):
            continue
        gene_id = attrs.get('gene_id')
        gene = genes.get(gene_id)
        if gene is None:
            gene = [c[0], int(c[3]) - 1, int(c[4]), gene_id, 0, c[6]]
            genes[gene_id] = gene
        else:
            gene[1] = min(gene[1], int(c[3]) - 1)
            gene[2] = max(gene[2], int(c[4]))
    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('write BED file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for gene_id, gene in genes.items():
            fout.write('\t'.join(map(str, gene)))
            fout.write('\n')