def bed12_to_fasta(gene_feature_file, reference_seq_file):
    """
    generate one fasta record per gene feature by concatenating the
    reference sequence of each of its exons.

    gene_feature_file: path to a file that GeneFeature.parse can read
    reference_seq_file: path to a reference opened with pysam.Fastafile

    yields a string made of a '>' header line followed by the output
    of split_seq for the joined exon sequence (reverse complemented
    when the feature is on the '-' strand).  a feature with an exon
    that cannot be fetched from the reference is logged and skipped.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    if not seq:
                        logging.warning(
                            "gene %s exon %s:%d-%d not found in reference",
                            g.tx_name, g.chrom, start, end)
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # report the transcript span in the header; the exon loop
                # variables would only describe the last exon visited
                yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                       (GENE_REF_PREFIX + g.tx_name, g.chrom,
                        g.tx_start, g.tx_end, g.gene_name, g.strand,
                        seqlines))
    finally:
        # runs even when the caller stops iterating early
        ref_fa.close()
def genepred_to_fasta(gene_feature_file, reference_seq_file):
    """
    generate one fasta record per gene feature by concatenating the
    reference sequence of each of its exons.

    gene_feature_file: path to a file that GeneFeature.parse can read
    reference_seq_file: path to a reference opened with pysam.Fastafile

    yields (g, fa_record) tuples where g is the parsed feature and
    fa_record is a string made of a '>' header line followed by the
    output of split_seq for the joined exon sequence (reverse
    complemented when the feature is on the '-' strand).  a feature
    with an exon that is missing from the reference, or shorter than
    the requested interval, is logged and skipped.  once every feature
    has been processed the number of features used is logged.
    """
    ref_fa = pysam.Fastafile(reference_seq_file)
    total = 0
    used = 0
    try:
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                total += 1
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    # a truncated fetch means the exon runs past the
                    # end of the reference sequence
                    if (not seq) or (len(seq) < (end - start)):
                        logging.warning(
                            "gene %s exon %s:%d-%d not found in reference",
                            g.tx_name, g.chrom, start, end)
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                used += 1
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # report the transcript span in the header; the exon loop
                # variables would only describe the last exon visited
                fa_record = (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                             (GENE_REF_PREFIX + g.tx_name, g.chrom,
                              g.tx_start, g.tx_end, g.gene_name, g.strand,
                              seqlines))
                yield g, fa_record
        logging.info("Used %d/%d gene features", used, total)
    finally:
        # runs even when the caller stops iterating early
        ref_fa.close()
# Example #3
# 0
def build_gene_maps(genefile):
    """
    build gene and genome data structures for fast lookup.

    genefile: path to a file that GeneFeature.parse can read

    returns a tuple (gene_genome_map, gene_trees).  gene_genome_map
    maps each tx_name to its parsed feature (a later feature with the
    same tx_name replaces an earlier one).  gene_trees is a defaultdict
    mapping each chrom to an IntervalTree that receives one Interval
    per feature, spanning tx_start to tx_end and carrying the strand
    and the tx_name as its value.
    """
    gene_genome_map = {}
    gene_trees = collections.defaultdict(IntervalTree)
    # the with block closes the gene file once parsing is complete
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            gene_genome_map[g.tx_name] = g
            # add gene to interval tree
            gene_interval = Interval(g.tx_start, g.tx_end,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[g.chrom].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def create_fragment_size_index(output_dir, gene_feature_file,
                               reference_seq_file, bowtie_build_bin,
                               max_fragment_size):
    """
    make an alignment index containing sequences that can be used to
    assess the fragment size distribution.  these sequences must be
    at least 'max_fragment_size' long in order to be viable for use
    in characterizing the fragment size distribution.

    the selected exon sequences are written to 'frag_size_seq.fa'
    inside 'output_dir' and 'bowtie_build_bin' is then run on that
    file to produce the index named by FRAG_SIZE_INDEX.

    returns the exit code of the bowtie build command.
    """
    # parse genes file
    with open(gene_feature_file) as gene_fh:
        genes = list(GeneFeature.parse(gene_fh))
    # find all exons that are at least as long as the maximum estimated
    # fragment size
    exons = set(coord for coord in find_unambiguous_exon_intervals(genes)
                if (coord[2] - coord[1]) >= max_fragment_size)
    logging.info("Found %d exons larger than %d",
                 len(exons), max_fragment_size)
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 " size distribution")
    ref_fa = pysam.Fastafile(reference_seq_file)
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    try:
        with open(frag_size_fa_file, 'w') as fh:
            for chrom, start, end, strand in exons:
                seq = ref_fa.fetch(chrom, start, end)
                if not seq:
                    logging.warning("exon %s:%d-%d not found in reference",
                                    chrom, start, end)
                    continue
                # only the reverse complement depends on the strand; the
                # record itself is written for exons on either strand
                if strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                fh.write(">%s:%d-%d strand=%s\n%s\n" %
                         (chrom, start, end, strand, seqlines))
    finally:
        ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
def build_exon_trees(samfh, genefile):
    """
    build per-chromosome interval trees of exons for fast lookup.

    samfh: object whose 'references' attribute lists reference names;
           the position of a name in that list is used as its id
    genefile: path to a file that GeneFeature.parse can read

    a feature is used only when both its prefixed transcript name
    (config.GENE_REF_PREFIX + tx_name) and its chrom appear in
    samfh.references.

    returns a plain dict mapping the chromosome reference id to an
    IntervalTree of exon Intervals, each carrying the chromosome id,
    the strand and the transcript reference id as its value.
    """
    rname_tid_map = dict((rname, i)
                         for i, rname in enumerate(samfh.references))
    exon_trees = collections.defaultdict(IntervalTree)
    # the with block closes the gene file once parsing is complete
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map:
                continue
            if g.chrom not in rname_tid_map:
                continue
            gene_tid = rname_tid_map[name]
            # get reference index in sam file
            chrom_tid = rname_tid_map[g.chrom]
            # add gene to interval tree
            # NOTE(review): for a list, exons[1::-1] selects at most the
            # first two exons, in reverse order; confirm whether
            # exons[::-1] or exons[1:-1] was intended
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start, end, chrom=chrom_tid,
                                         strand=g.strand, value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)
def create_fragment_size_index(output_dir, gene_feature_file, 
                               reference_seq_file, bowtie_build_bin, 
                               max_fragment_size):
    """
    make an alignment index containing sequences that can be used to
    assess the fragment size distribution.  these sequences must be
    at least 'max_fragment_size' long in order to be viable for use
    in characterizing the fragment size distribution.

    the selected exon sequences are written to 'frag_size_seq.fa'
    inside 'output_dir' and 'bowtie_build_bin' is then run on that
    file to produce the index named by FRAG_SIZE_INDEX.

    returns the exit code of the bowtie build command.
    """
    # parse genes file
    with open(gene_feature_file) as gene_fh:
        genes = list(GeneFeature.parse(gene_fh))
    # find all exons that are at least as long as the maximum estimated
    # fragment size
    exons = set(coord for coord in find_unambiguous_exon_intervals(genes)
                if (coord[2] - coord[1]) >= max_fragment_size)
    logging.info("Found %d exons larger than %d",
                 len(exons), max_fragment_size)
    # extract the nucleotide sequence of the exons
    logging.info("Extracting sequences to use for estimating the fragment "
                 " size distribution")
    ref_fa = pysam.Fastafile(reference_seq_file)
    frag_size_fa_file = os.path.join(output_dir, "frag_size_seq.fa")
    try:
        with open(frag_size_fa_file, 'w') as fh:
            for chrom, start, end, strand in exons:
                seq = ref_fa.fetch(chrom, start, end)
                if not seq:
                    logging.warning("exon %s:%d-%d not found in reference",
                                    chrom, start, end)
                    continue
                # only the reverse complement depends on the strand; the
                # record itself is written for exons on either strand
                if strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                fh.write(">%s:%d-%d strand=%s\n%s\n" %
                         (chrom, start, end, strand, seqlines))
    finally:
        ref_fa.close()
    # build bowtie alignment index from the fragment size exons
    logging.info("Building bowtie index")
    frag_size_index = os.path.join(output_dir, FRAG_SIZE_INDEX)
    args = [bowtie_build_bin, frag_size_fa_file, frag_size_index]
    return subprocess.call(args)
def bed12_to_fasta(gene_feature_file, reference_seq_file):
    """
    generate one fasta record per gene feature by concatenating the
    reference sequence of each of its exons.

    gene_feature_file: path to a file that GeneFeature.parse can read
    reference_seq_file: path to a reference opened with pysam.Fastafile

    yields a string made of a '>' header line followed by the output
    of split_seq for the joined exon sequence (reverse complemented
    when the feature is on the '-' strand).  a feature with an exon
    that cannot be fetched from the reference is logged and skipped.
    """
    # imported here so the block does not rely on a module level import
    import logging
    ref_fa = pysam.Fastafile(reference_seq_file)
    try:
        with open(gene_feature_file) as gene_fh:
            for g in GeneFeature.parse(gene_fh):
                exon_seqs = []
                error_occurred = False
                for start, end in g.exons:
                    seq = ref_fa.fetch(g.chrom, start, end)
                    if not seq:
                        # make the skipped feature visible instead of
                        # dropping it silently
                        logging.warning(
                            "gene %s exon %s:%d-%d not found in reference",
                            g.tx_name, g.chrom, start, end)
                        error_occurred = True
                        break
                    exon_seqs.append(seq)
                if error_occurred:
                    continue
                # make fasta record
                seq = ''.join(exon_seqs)
                if g.strand == '-':
                    seq = DNA_reverse_complement(seq)
                # break seq onto multiple lines
                seqlines = split_seq(seq, BASES_PER_LINE)
                # report the transcript span in the header; the exon loop
                # variables would only describe the last exon visited
                yield (">%s range=%s:%d-%d gene=%s strand=%s\n%s" %
                       (config.GENE_REF_PREFIX + g.tx_name, g.chrom,
                        g.tx_start, g.tx_end, g.gene_name, g.strand,
                        seqlines))
    finally:
        # runs even when the caller stops iterating early
        ref_fa.close()
def build_exon_trees(samfh, genefile):
    """
    build per-chromosome interval trees of exons for fast lookup.

    samfh: object whose 'references' attribute lists reference names;
           the position of a name in that list is used as its id
    genefile: path to a file that GeneFeature.parse can read

    a feature is used only when both its prefixed transcript name
    (config.GENE_REF_PREFIX + tx_name) and its chrom appear in
    samfh.references.

    returns a plain dict mapping the chromosome reference id to an
    IntervalTree of exon Intervals, each carrying the chromosome id,
    the strand and the transcript reference id as its value.
    """
    rname_tid_map = dict(
        (rname, i) for i, rname in enumerate(samfh.references))
    exon_trees = collections.defaultdict(IntervalTree)
    # the with block closes the gene file once parsing is complete
    with open(genefile) as gene_fh:
        for g in GeneFeature.parse(gene_fh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map:
                continue
            if g.chrom not in rname_tid_map:
                continue
            gene_tid = rname_tid_map[name]
            # get reference index in sam file
            chrom_tid = rname_tid_map[g.chrom]
            # add gene to interval tree
            # NOTE(review): for a list, exons[1::-1] selects at most the
            # first two exons, in reverse order; confirm whether
            # exons[::-1] or exons[1:-1] was intended
            for start, end in g.exons[1::-1]:
                exon_interval = Interval(start,
                                         end,
                                         chrom=chrom_tid,
                                         strand=g.strand,
                                         value=gene_tid)
                exon_trees[chrom_tid].insert_interval(exon_interval)
    return dict(exon_trees)