def bed12_to_fasta(gene_feature_file, reference_seq_file):
    """Yield a FASTA record for every gene whose exons are all retrievable.

    Genes are parsed from ``gene_feature_file`` with ``GeneFeature.parse``.
    For each gene the exon sequences are fetched from the indexed reference
    ``reference_seq_file``, joined in the order given by ``g.exons`` and,
    for genes on the '-' strand, passed through ``DNA_reverse_complement``.
    A gene is skipped (with a logged warning) as soon as one of its exon
    fetches returns an empty sequence.

    The ``range=`` field of the header reports the transcript span
    (``g.tx_start``-``g.tx_end``).

    Yields:
        A string holding the FASTA header line followed by the sequence as
        formatted by ``split_seq(seq, BASES_PER_LINE)``.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        # 'with' guarantees the gene file is closed even when the consumer
        # stops iterating before the generator is exhausted
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    if not seq:
                        logging.warning(
                            "gene %s exon %s:%d-%d not found in reference",
                            g.tx_name, g.chrom, start, end)
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # use the transcript bounds here: the exon loop variables
                # only hold the coordinates of the last exon visited
                yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                       (GENE_REF_PREFIX + g.tx_name, g.chrom,
                        g.tx_start, g.tx_end,
                        g.gene_name, g.strand, seqlines))
    finally:
        # runs on normal exhaustion, on errors and on generator close
        ref_fa.close()
def genepred_to_fasta(gene_feature_file, reference_seq_file):
    """Yield ``(gene, fasta_record)`` pairs for genes found in the reference.

    Genes are parsed from ``gene_feature_file`` with ``GeneFeature.parse``.
    Every exon is fetched from the indexed reference ``reference_seq_file``;
    a gene is skipped (with a logged warning) when a fetch returns nothing
    or returns fewer bases than the requested ``end - start`` span.  The
    exon sequences are joined in ``g.exons`` order and passed through
    ``DNA_reverse_complement`` for '-' strand genes.

    The ``range=`` field of the header reports the transcript span
    (``g.tx_start``-``g.tx_end``).  After the last gene has been processed
    the number of genes used out of the total is logged at INFO level.

    Yields:
        A 2-tuple of the parsed gene object and a string holding the FASTA
        header line followed by the sequence as formatted by
        ``split_seq(seq, BASES_PER_LINE)``.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    total = 0
    used = 0
    try:
        # 'with' guarantees the gene file is closed even when the consumer
        # stops iterating before the generator is exhausted
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                total += 1
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    # reject both missing and truncated exon sequences
                    if (not seq) or (len(seq) < (end - start)):
                        logging.warning(
                            "gene %s exon %s:%d-%d not found in reference",
                            g.tx_name, g.chrom, start, end)
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                used += 1
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # use the transcript bounds here: the exon loop variables
                # only hold the coordinates of the last exon visited
                fa_record = (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                             (GENE_REF_PREFIX + g.tx_name, g.chrom,
                              g.tx_start, g.tx_end,
                              g.gene_name, g.strand, seqlines))
                yield g, fa_record
        logging.info("Used %d/%d gene features", used, total)
    finally:
        # runs on normal exhaustion, on errors and on generator close
        ref_fa.close()
def build_gene_maps(genefile):
    """Build lookup structures for the genes listed in ``genefile``.

    Returns:
        A 2-tuple ``(gene_genome_map, gene_trees)``:

        * ``gene_genome_map`` maps ``tx_name`` to the parsed gene object; a
          later gene with the same ``tx_name`` replaces an earlier one.
        * ``gene_trees`` is a ``defaultdict`` keyed by chromosome name whose
          values are ``IntervalTree`` objects holding one ``Interval`` per
          gene, spanning ``tx_start``-``tx_end`` and carrying the gene's
          strand and ``tx_name`` (as ``value``).  Looking up an unseen
          chromosome creates an empty tree.
    """
    gene_genome_map = {}
    # the class itself is the factory; no wrapper lambda needed
    gene_trees = collections.defaultdict(IntervalTree)
    # build gene and genome data structures for fast lookup
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            gene_genome_map[g.tx_name] = g
            # add gene to interval tree
            gene_interval = Interval(g.tx_start, g.tx_end,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[g.chrom].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def create_fragment_size_index(output_dir, gene_feature_file,
                               reference_seq_file, bowtie_build_bin,
                               max_fragment_size):
    """Build a bowtie index of long exons for fragment-size estimation.

    Make an alignment index containing sequences that can be used to assess
    the fragment size distribution.  Only exon intervals returned by
    ``find_unambiguous_exon_intervals`` whose length is at least
    ``max_fragment_size`` are kept.

    The selected exon sequences are written to ``frag_size_seq.fa`` inside
    ``output_dir`` ('-' strand exons are passed through
    ``DNA_reverse_complement``); exons for which the reference returns no
    sequence are skipped with a warning.  ``bowtie_build_bin`` is then run
    on that FASTA file to create the index named ``FRAG_SIZE_INDEX`` in
    ``output_dir``.

    Returns:
        The return code of the ``bowtie_build_bin`` subprocess.
    """
    # parse genes file
    with open(gene_feature_file) as gene_fh:
        genes = list(GeneFeature.parse(gene_fh))
    # find all exons that are larger than the maximum estimated fragment size
    exons = set(coord for coord in find_unambiguous_exon_intervals(genes)
                if (coord[2] - coord[1]) >= max_fragment_size)
    logging.info("Found %d exons larger than %d",
                 len(exons), max_fragment_size)
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 " size distribution")
    ref_fa = pysam.Fastafile(reference_seq_file)
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    try:
        # the FASTA must be flushed and closed before bowtie-build reads it,
        # and must not stay open if a fetch or write fails
        with open(frag_size_fa_file, 'w') as fh:
            for chrom, start, end, strand in exons:
                seq = ref_fa.fetch(chrom, start, end)
                if not seq:
                    logging.warning("exon %s:%d-%d not found in reference",
                                    chrom, start, end)
                    continue
                # make fasta record
                if strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                record = (">%s:%d-%d strand=%s\n%s" %
                          (chrom, start, end, strand, seqlines))
                fh.write(record + "\n")
    finally:
        ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
def build_exon_trees(samfh, genefile):
    """Build per-chromosome interval trees of exons for known references.

    ``samfh.references`` is enumerated to map each reference name to its
    index.  A gene from ``genefile`` is used only when both its prefixed
    transcript name (``config.GENE_REF_PREFIX + tx_name``) and its
    chromosome name appear among those references.

    Returns:
        A plain ``dict`` mapping the chromosome's reference index to an
        ``IntervalTree``.  Each inserted ``Interval`` carries the chromosome
        index (``chrom``), the gene strand and the transcript's reference
        index (``value``).
    """
    rname_tid_map = dict((rname, i)
                         for i, rname in enumerate(samfh.references))
    # the class itself is the factory; no wrapper lambda needed
    exon_trees = collections.defaultdict(IntervalTree)
    # build gene and genome data structures for fast lookup
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map:
                continue
            if g.chrom not in rname_tid_map:
                continue
            gene_tid = rname_tid_map[name]
            # get reference index in sam file
            chrom_tid = rname_tid_map[g.chrom]
            # add gene to interval tree
            # NOTE(review): the slice [1::-1] yields only exons[1] followed
            # by exons[0], i.e. at most the first two exons.  If every exon
            # was meant to be inserted this should probably be [::-1];
            # behaviour is left unchanged pending confirmation.
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start, end, chrom=chrom_tid,
                                         strand=g.strand, value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)
def create_fragment_size_index(output_dir, gene_feature_file,
                               reference_seq_file, bowtie_build_bin,
                               max_fragment_size):
    """Create the bowtie index used to estimate the fragment size.

    The index contains exon sequences that can be used to assess the
    fragment size distribution.  Candidate exons come from
    ``find_unambiguous_exon_intervals`` and must span at least
    ``max_fragment_size`` bases to be included.

    Steps performed:

    1. Parse all genes from ``gene_feature_file``.
    2. Write the sequence of each qualifying exon to ``frag_size_seq.fa``
       in ``output_dir``.  Exons on the '-' strand are passed through
       ``DNA_reverse_complement``; exons the reference returns no sequence
       for are skipped with a warning.
    3. Invoke ``bowtie_build_bin`` on the FASTA file, producing the index
       ``FRAG_SIZE_INDEX`` in ``output_dir``.

    Returns:
        The exit status reported by ``subprocess.call`` for the
        ``bowtie_build_bin`` invocation.
    """
    # parse genes file; the handle is closed as soon as parsing is done
    with open(gene_feature_file) as gene_fh:
        genes = [g for g in GeneFeature.parse(gene_fh)]
    # find all exons that are larger than the maximum estimated fragment size
    exons = set()
    for coord in find_unambiguous_exon_intervals(genes):
        if (coord[2] - coord[1]) >= max_fragment_size:
            exons.add(coord)
    logging.info("Found %d exons larger than %d",
                 len(exons), max_fragment_size)
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 " size distribution")
    ref_fa = pysam.Fastafile(reference_seq_file)
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    try:
        # close the output on every exit path so bowtie-build never sees a
        # partially flushed file and no handle leaks on error
        with open(frag_size_fa_file, 'w') as fh:
            for chrom, start, end, strand in exons:
                seq = ref_fa.fetch(chrom, start, end)
                if not seq:
                    logging.warning("exon %s:%d-%d not found in reference",
                                    chrom, start, end)
                    continue
                # make fasta record
                if strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                record = (">%s:%d-%d strand=%s\n%s" %
                          (chrom, start, end, strand, seqlines))
                fh.write(record + "\n")
    finally:
        ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
def bed12_to_fasta(gene_feature_file, reference_seq_file):
    """Yield a FASTA record for every gene whose exons are all retrievable.

    Genes are parsed from ``gene_feature_file`` with ``GeneFeature.parse``.
    For each gene the exon sequences are fetched from the indexed reference
    ``reference_seq_file``, joined in the order given by ``g.exons`` and,
    for genes on the '-' strand, passed through ``DNA_reverse_complement``.
    A gene is silently skipped as soon as one of its exon fetches returns
    an empty sequence.

    The record name is ``config.GENE_REF_PREFIX + g.tx_name`` and the
    ``range=`` field of the header reports the transcript span
    (``g.tx_start``-``g.tx_end``).

    Yields:
        A string holding the FASTA header line followed by the sequence as
        formatted by ``split_seq(seq, BASES_PER_LINE)``.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        # 'with' guarantees the gene file is closed even when the consumer
        # stops iterating before the generator is exhausted
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    if not seq:
                        # exon missing from the reference: drop the gene
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # use the transcript bounds here: the exon loop variables
                # only hold the coordinates of the last exon visited
                yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                       (config.GENE_REF_PREFIX + g.tx_name, g.chrom,
                        g.tx_start, g.tx_end,
                        g.gene_name, g.strand, seqlines))
    finally:
        # runs on normal exhaustion, on errors and on generator close
        ref_fa.close()
def build_exon_trees(samfh, genefile):
    """Index gene exons by chromosome for references present in ``samfh``.

    The names in ``samfh.references`` are mapped to their positional index.
    Genes parsed from ``genefile`` are skipped unless both
    ``config.GENE_REF_PREFIX + tx_name`` and the gene's chromosome are
    among those names.

    Returns:
        A plain ``dict`` keyed by the chromosome's reference index; each
        value is an ``IntervalTree`` of ``Interval`` objects that record the
        chromosome index (``chrom``), the gene strand and the transcript's
        reference index (``value``).
    """
    rname_tid_map = dict((rname, i)
                         for i, rname in enumerate(samfh.references))
    # the class itself is the factory; no wrapper lambda needed
    exon_trees = collections.defaultdict(IntervalTree)
    # build gene and genome data structures for fast lookup
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map:
                continue
            if g.chrom not in rname_tid_map:
                continue
            gene_tid = rname_tid_map[name]
            # get reference index in sam file
            chrom_tid = rname_tid_map[g.chrom]
            # add gene to interval tree
            # NOTE(review): g.exons[1::-1] walks backwards from index 1, so
            # only exons[1] and exons[0] are inserted.  This looks
            # unintended (perhaps [::-1] was meant) but is preserved as-is
            # until the intent is confirmed.
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start, end, chrom=chrom_tid,
                                         strand=g.strand, value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)