def extract_mature_mirna_location(args):
    from utils import read_gff, GFFRecord
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict, defaultdict

    logger.info('read input GFF file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    logger.info('open output BED file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)

    # key: precursor_id, value: precursor record
    precursors = OrderedDict()
    # key: precursor_id, value: list of mature records
    matures = defaultdict(list)
    # read features from GFF file
    for record in read_gff(fin):
        if record.feature == 'miRNA_primary_transcript':
            precursors[record.attr['ID']] = record
        elif record.feature == 'miRNA':
            matures[record.attr['Derives_from']].append(record)
    # get locations of mature miRNAs relative to their precursor sequences
    for precursor_id, precursor in precursors.items():
        for mature in matures[precursor_id]:
            if mature.strand == '+':
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'],
                    mature.start - precursor.start,
                    mature.end - precursor.start + 1,
                    mature.attr['Name']))
            else:
                # on the '-' strand the precursor sequence is reverse-complemented,
                # so positions are measured from the precursor end
                fout.write('{}\t{}\t{}\t{}\t0\t+\n'.format(
                    precursor.attr['Name'],
                    precursor.end - mature.end,
                    precursor.end - mature.start + 1,
                    mature.attr['Name']))
    fin.close()
    fout.close()
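# A worked example of the coordinate conversion above (an illustration added here,
# not part of the original pipeline): with 1-based inclusive GFF coordinates, a
# precursor spanning 1000-1099 on the '-' strand that contains a mature miRNA at
# 1080-1099 maps to the BED interval [0, 20) on the reverse-complemented
# precursor sequence.
def _mature_interval_on_precursor(precursor_start, precursor_end,
                                  mature_start, mature_end, strand):
    """Illustrative helper: convert genomic (1-based, inclusive) mature miRNA
    coordinates to 0-based, half-open coordinates on the precursor sequence."""
    if strand == '+':
        return mature_start - precursor_start, mature_end - precursor_start + 1
    return precursor_end - mature_end, precursor_end - mature_start + 1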
def chrom_sizes(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            fout.write('{}\t{}\n'.format(record.id, len(record.seq)))
def extract_longest_transcript(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import defaultdict
    from functools import partial

    feature = args.feature
    # key: gene_id, value: dict of transcript_id -> total feature length
    genes = defaultdict(partial(defaultdict, int))
    lines = []
    logger.info('read gtf file: ' + args.input_file)
    with open_file_or_stdin(args.input_file) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                lines.append(('#other#', line))
                continue
            # parse the attribute column into a dict
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            transcript_id = attrs.get('transcript_id')
            if transcript_id is None:
                raise ValueError(
                    'transcript_id not found in GTF file at line {}'.format(lineno))
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError(
                    'gene_id not found in GTF file at line {}'.format(lineno))
            lines.append((transcript_id, line))
            genes[gene_id][transcript_id] += int(c[4]) - int(c[3]) + 1

    # keep only the longest transcript of each gene
    kept_transcripts = set()
    kept_transcripts.add('#other#')
    for gene_id, gene in genes.items():
        max_length = 0
        max_transcript = None
        for transcript_id, length in gene.items():
            if length > max_length:
                max_length = length
                max_transcript = transcript_id
        kept_transcripts.add(max_transcript)

    logger.info('number of genes: {}'.format(len(genes)))
    logger.info('number of transcripts: {}'.format(
        sum(map(len, genes.values()))))
    logger.info(
        'number of longest transcripts: {}'.format(len(kept_transcripts) - 1))
    logger.info('write output gtf file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        for transcript_id, line in lines:
            if transcript_id in kept_transcripts:
                fout.write(line)
def gtf_to_transcript_table(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    from collections import OrderedDict

    feature = args.feature
    default_transcript_type = args.transcript_type
    default_gene_type = args.gene_type

    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        # key: transcript_id, value: BED-like record of the transcript
        transcripts = OrderedDict()
        for line in fin:
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            if c[2] != feature:
                continue
            # parse the attribute column into a dict
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            if 'transcript_name' not in attrs:
                attrs['transcript_name'] = attrs['transcript_id']
            if 'gene_name' not in attrs:
                attrs['gene_name'] = attrs['gene_id']
            if default_transcript_type is not None:
                attrs['transcript_type'] = default_transcript_type
            elif 'transcript_type' not in attrs:
                attrs['transcript_type'] = 'unknown'
            if default_gene_type is not None:
                attrs['gene_type'] = default_gene_type
            elif 'gene_type' not in attrs:
                attrs['gene_type'] = 'unknown'
            exon = [c[0], int(c[3]) - 1, int(c[4]), attrs['gene_id'], 0, c[6],
                    attrs['gene_id'], attrs['transcript_id'],
                    attrs['gene_name'], attrs['transcript_name'],
                    attrs['gene_type'], attrs['transcript_type'], c[1]]
            transcript = transcripts.get(attrs['transcript_id'])
            if transcript is None:
                transcripts[attrs['transcript_id']] = exon
            elif c[2] == 'exon':
                # extend the transcript interval to cover all of its exons
                transcript[1] = min(transcript[1], exon[1])
                transcript[2] = max(transcript[2], exon[2])
    header = ['chrom', 'start', 'end', 'name', 'score', 'strand',
              'gene_id', 'transcript_id', 'gene_name', 'transcript_name',
              'gene_type', 'transcript_type', 'source']
    print('\t'.join(header), file=fout)
    for transcript in transcripts.values():
        print('\t'.join(str(a) for a in transcript), file=fout)
    fout.close()
def normalize(args):
    from ioutils import open_file_or_stdin, open_file_or_stdout
    import pandas as pd

    with open_file_or_stdin(args.input_file) as f:
        matrix = pd.read_table(f, sep='\t', index_col=0)
    if args.method == 'cpm':
        # counts per million: scale each column (sample) to sum to 1e6
        matrix = 1e6 * matrix.astype('float') / matrix.sum(axis=0)
    with open_file_or_stdout(args.output_file) as f:
        matrix.to_csv(f, sep='\t', header=True, index=True, na_rep='NA')
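# A minimal worked example of the CPM formula used above (toy numbers, not part
# of the pipeline):
#
#   toy = pd.DataFrame({'sample1': [2, 8]}, index=['geneA', 'geneB'])
#   1e6 * toy.astype('float') / toy.sum(axis=0)
#   # geneA -> 200000.0, geneB -> 800000.0 (each column sums to one million)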
def calculate_gene_length(args):
    import HTSeq
    from collections import defaultdict
    from functools import partial
    import numpy as np
    from ioutils import open_file_or_stdin
    from tqdm import tqdm

    fin = open_file_or_stdin(args.input_file)
    gff = HTSeq.GFF_Reader(fin)
    # key: gene_id, value: dict of transcript_id -> summed exon length
    exons = defaultdict(partial(defaultdict, int))
    for feature in tqdm(gff, unit='feature'):
        if feature.type == 'exon':
            exons[feature.attr['gene_id']][
                feature.attr['transcript_id']] += feature.iv.length
def print_fasta(args):
    from ioutils import open_file_or_stdin
    from Bio import SeqIO

    with open_file_or_stdin(args.input_file) as f:
        for record in SeqIO.parse(f, 'fasta'):
            seq_id, label, start, end = record.id.split(',')
            seq = str(record.seq)
            start = int(start)
            end = int(end)
            print('>{}'.format(record.id))
            if label == '1':
                # highlight the [start, end) region in red using ANSI escape codes
                print('{}\x1B[1;31m{}\x1B[0m{}'.format(
                    seq[:start], seq[start:end], seq[end:]))
            else:
                print(seq)
def filter_circrna_reads(args):
    import pysam
    import numpy as np
    from ioutils import open_file_or_stdout, open_file_or_stdin
    from collections import defaultdict
    from copy import deepcopy

    logger.info('read input SAM file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam_in = pysam.AlignmentFile(fin, "r")
    if sam_in.header is None:
        raise ValueError('requires SAM header to get junction positions')
    # get junction positions (middle of the sequences)
    junction_positions = {}
    for sq in sam_in.header['SQ']:
        junction_positions[sq['SN']] = sq['LN'] // 2

    logger.info('create output SAM file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    sam_out = pysam.AlignmentFile(fout, 'w', template=sam_in)
    sam_filtered = None
    if args.filtered_file is not None:
        logger.info('create filtered SAM file: ' + args.filtered_file)
        sam_filtered = pysam.AlignmentFile(args.filtered_file, 'w', template=sam_in)

    for read in sam_in:
        filtered = False
        if read.is_unmapped:
            filtered = True
        elif read.is_reverse:
            filtered = True
        else:
            # keep only reads whose alignment spans the junction position
            pos = junction_positions[read.reference_name]
            if not (read.reference_start < pos <= read.reference_end):
                filtered = True
        if not filtered:
            sam_out.write(read)
        elif sam_filtered is not None:
            sam_filtered.write(read)
    fin.close()
    fout.close()
    if sam_filtered is not None:
        sam_filtered.close()
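# A minimal restatement of the filtering criterion above (illustrative only,
# not called by the pipeline): a read is kept when its alignment spans the
# back-splice junction placed at the middle of each pseudo-reference sequence.
def _spans_junction(reference_start, reference_end, junction):
    """Return True if a 0-based, half-open alignment interval covers the
    junction point (reference_start < junction <= reference_end)."""
    return reference_start < junction <= reference_end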
def flagstat(args):
    import pysam
    from ioutils import open_file_or_stdin, open_file_or_stdout

    logger.info('read input file: ' + args.input_file)
    fin = open_file_or_stdin(args.input_file)
    sam = pysam.AlignmentFile(fin, 'rb')
    # count reads by SAM flag value (flags use 12 bits, so at most 4096 values)
    counts = [0] * 4096
    for read in sam:
        counts[read.flag] += 1
    sam.close()

    logger.info('create output file: ' + args.output_file)
    with open_file_or_stdout(args.output_file) as fout:
        fout.write('flag\tcounts\n')
        for flag, count in enumerate(counts):
            if count > 0:
                fout.write('{}\t{}\n'.format(flag, count))
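# A hedged sketch (not part of the original module) of how the per-flag table
# written by flagstat could be collapsed into a few named totals by testing
# individual SAM flag bits (0x4 unmapped, 0x10 reverse strand, 0x100 secondary).
def _summarize_flag_counts(counts):
    """Illustrative only: aggregate per-flag counts into named categories."""
    return {
        'total': sum(counts),
        'unmapped': sum(c for flag, c in enumerate(counts) if flag & 0x4),
        'reverse': sum(c for flag, c in enumerate(counts) if flag & 0x10),
        'secondary': sum(c for flag, c in enumerate(counts) if flag & 0x100),
    }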
def extract_circrna_junction(args):
    from Bio import SeqIO
    from ioutils import open_file_or_stdin, open_file_or_stdout

    anchor_size = args.anchor_size
    logger.info('read sequence file: ' + args.input_file)
    logger.info('create output file: ' + args.output_file)
    fout = open_file_or_stdout(args.output_file)
    with open_file_or_stdin(args.input_file) as fin:
        for record in SeqIO.parse(fin, 'fasta'):
            seq = str(record.seq)
            if len(seq) < args.min_length:
                continue
            # concatenate the last and first anchor_size bases to form
            # the back-splice junction sequence
            s = min(len(seq), anchor_size)
            seq_id = record.id.split('|')[0]
            fout.write('>{}\n'.format(seq_id))
            fout.write(seq[-s:] + seq[:s])
            fout.write('\n')
    fout.close()
def read_gtf(filename):
    from ioutils import open_file_or_stdin

    with open_file_or_stdin(filename) as fin:
        lineno = 0
        for line in fin:
            lineno += 1
            c = line.strip().split('\t')
            if c[0].startswith('#'):
                continue
            # parse the attribute column into a dict
            attrs = {}
            for a in c[8].split(';')[:-1]:
                a = a.strip()
                i = a.find(' ')
                key = a[:i]
                val = a[(i + 1):].strip('"')
                attrs[key] = val
            gene_id = attrs.get('gene_id')
            if gene_id is None:
                raise ValueError('gene_id not found in GTF file at line {}'.format(lineno))
            yield (c, attrs, line)
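# A hedged usage sketch for read_gtf (illustrative helper, not used by the
# pipeline): tally GTF records per gene_id from the (fields, attrs, line)
# tuples that the generator yields.
def _count_records_per_gene(filename):
    """Illustrative only: count GTF records per gene_id using read_gtf."""
    from collections import defaultdict
    counts = defaultdict(int)
    for c, attrs, line in read_gtf(filename):
        counts[attrs['gene_id']] += 1
    return counts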
def calc_rpkm(args):
    import pandas as pd
    import numpy as np
    from ioutils import open_file_or_stdin, open_file_or_stdout

    matrix = pd.read_table(open_file_or_stdin(args.input_file), index_col=0, sep='\t')
    # feature names are expected in the form:
    # gene_id|gene_type|gene_name|feature_id|transcript_id|start|end
    feature_info = matrix.index.to_series().str.split('|', expand=True)
    feature_info.columns = ['gene_id', 'gene_type', 'gene_name',
                            'feature_id', 'transcript_id', 'start', 'end']
    feature_info['start'] = feature_info['start'].astype('int')
    feature_info['end'] = feature_info['end'].astype('int')
    feature_info['length'] = feature_info['end'] - feature_info['start']
    # scale counts by feature length in kilobases
    matrix = 1000.0 * matrix.div(feature_info['length'], axis=0)
    matrix.to_csv(open_file_or_stdout(args.output_file), index=True,
                  header=True, sep='\t', na_rep='NA')