Example #1
0
def create_inttree_from_file(infile):
    """Build one interval tree per chromosome from BED annotations.

    Blank lines, ``#`` comment lines and ``track``/``browser`` header lines
    are skipped so BED files carrying the usual headers load without error.

    Args:
    infile: iterable of BED lines (e.g. an open file handle); every data
        line must provide chromosome, start, stop and name columns

    Return:
    dictionary {chromosome name : interval tree with coordinates}

    Raises:
    IndexError: a data line has fewer than four whitespace-separated columns
    ValueError: the start or stop column is not an integer
    """
    genome = {}
    for line in infile:
        parts = line.split()
        # nothing to index on empty lines or BED header/comment lines
        if not parts or parts[0].startswith("#") or parts[0] in ("track", "browser"):
            continue
        chrom, start, stop = parts[0], int(parts[1]), int(parts[2])
        name = parts[3]
        tree = genome.get(chrom)
        if tree is None:
            # first time we encounter this chromosome, create its tree
            tree = IntervalTree()
            genome[chrom] = tree
        tree.add(start, stop, name)
    return genome
    def __init__(self, bed_file_path):
        """Load a tab-separated BED file into one interval tree per chromosome.

        Each record is stored in ``self.interval_tree_dict[chromosome]`` with
        the tuple of its first four columns as the payload. Lines with fewer
        than four columns are reported on stdout and skipped.

        :param bed_file_path: path of the BED file to read
        :raises ValueError: if a start or end column is not an integer
        """
        self.interval_tree_dict = dict()
        error_message = "Skipping line {0} - too short, only {1} column(s):\n{2}"

        with open(bed_file_path, 'r') as bed_file:
            for count, line in enumerate(bed_file):
                # drop the line terminator first so that it does not end up
                # inside the last stored column
                split_line = line.rstrip("\r\n").split("\t")
                record = tuple(split_line[:4])

                if len(record) < 4:
                    # single-argument print() behaves the same on Python 2 and 3
                    print(error_message.format(count + 1, len(split_line), line.strip()))
                    continue

                chromosome = record[0]
                start, end = int(record[1]), int(record[2])

                tree = self.interval_tree_dict.get(chromosome)
                if tree is None:
                    tree = IntervalTree()
                    self.interval_tree_dict[chromosome] = tree

                tree.add(start, end, record)
Example #3
0
def index_gff3(gff3_file_path):
    """Index the features of a GFF3 file in one interval tree per sequence.

    Follows the example from
    https://malariageninformatics.wordpress.com/2011/07/07/using-interval-trees-to-query-genome-annotations-by-position/

    Reads the module-level ``args`` namespace:
    - ``args.tag``: when not None, only rows whose third column equals it
      are indexed;
    - ``args.attribute`` / ``args.join``: when both are None the whole row is
      stored as the payload; otherwise the payload is the comma-joined values
      of the matching ``key=value`` entries of the attributes column.

    Args:
        gff3_file_path: path of the tab-separated GFF3 file

    Returns:
        dictionary {sequence id : interval tree of its features}
    """
    # dictionary mapping chromosome names to interval trees
    genome = dict()
    store_whole_row = args.attribute is None and args.join is None

    # parse the annotations file (GFF3) and build the interval trees
    gff = pd.read_csv(gff3_file_path, sep="\t", header=None, comment="#")
    for idx, row in gff.iterrows():
        if args.tag is not None and row[2] != args.tag:
            continue
        seqid = row[0]
        start = int(row[3])
        end = int(row[4])

        # one interval tree per chromosome
        tree = genome.get(seqid)
        if tree is None:
            tree = IntervalTree()
            genome[seqid] = tree

        if store_whole_row:
            tree.add(start, end, row)
            continue

        selected = []
        for field in row[8].split(";"):
            # partition keeps any further '=' inside the value and lets us
            # skip empty fields, e.g. the one produced by a trailing ';'
            key, separator, value = field.partition("=")
            if not separator:
                continue
            if key == args.attribute or key == args.join:
                selected.append(value)
        tree.add(start, end, ",".join(selected))
    return genome
    def __init__(self, bed_file):
        """Build per-chromosome interval trees from a tab-separated BED file.

        Every record is added to ``self.interval_tree_dict[chromosome]`` with
        the tuple of its first four columns as the payload. Lines with fewer
        than four columns are reported on stdout and skipped.

        :param bed_file: path of the BED file to read
        :raises ValueError: if a start or end column is not an integer
        """
        self.interval_tree_dict = dict()
        error_message = "Skipping line {0} too short with only {1} column(s).Check that it conforms to bed format:\n{2}"

        with open(bed_file, 'r') as bed_file_Handler:
            for count, line in enumerate(bed_file_Handler):
                # strip the line terminator so it is not stored as part of
                # the last column
                segmentProperties = line.rstrip("\r\n").split("\t")
                segment = tuple(segmentProperties[:4])

                if len(segment) < 4:
                    # single-argument print() behaves the same on Python 2 and 3
                    print(error_message.format(count + 1, len(segmentProperties), line.strip()))
                    continue

                chromosome = segment[0]
                segment_start, segment_end = int(segment[1]), int(segment[2])

                tree = self.interval_tree_dict.get(chromosome)
                if tree is None:
                    tree = IntervalTree()
                    self.interval_tree_dict[chromosome] = tree

                tree.add(segment_start, segment_end, segment)
Example #5
0
def read_in_somatic_vcf_file(somatic_snv_files, clonal_percs, query_chr, truth_set_cn_calls, output_dir, subsample_somatic_snvs):
    """Read clonal somatic SNV VCF files for one chromosome.

    Each VCF is paired with its clonal percentage. Records are randomly
    subsampled, restricted to ``query_chr`` and assigned to one of the two
    haplotypes at random. The forced frequency of every kept record is
    ``assign_freq_based_on_ploidy(region copy number) * clonal_perc``.

    Side effects: prints progress to stdout and writes the per-position
    records to ``forced_somatic_snv_frequencies_<query_chr>.json`` in
    ``output_dir``. The process exits via ``sys.exit`` if a record has an
    unexpected FORMAT column.

    Args:
        somatic_snv_files: paths of the somatic SNV VCF files
        clonal_percs: clonal percentage for each file, in the same order
        query_chr: chromosome whose records are kept
        truth_set_cn_calls: {chromosome: list of region dicts} with 'start',
            'end', 'firsthaplotype_CN' and 'secondhaplotype_CN' keys
        output_dir: directory receiving the JSON summary
        subsample_somatic_snvs: records whose random draw exceeds this value
            are dropped

    Returns:
        IntervalTree holding one entry per kept record at (pos, pos)
    """
    expected_format = 'DP:FDP:SDP:SUBDP:AU:CU:GU:TU'
    frequencies_path = os.path.join(output_dir, 'forced_somatic_snv_frequencies_' + str(query_chr) + '.json')

    # context managers guarantee the JSON output is flushed and every VCF
    # handle is released, also when we bail out through sys.exit
    with open(frequencies_path, 'w') as fsf:
        print("ri ",query_chr)

        h = IntervalTree()
        h2 = {}

        for (somatic_snv_file, clonal_perc) in zip(somatic_snv_files, clonal_percs):
            # for now just do SNVs as adding in indels involve increasing the size of reads which could cause issues;
            # thinking about it probably wouldnt - quite faffy though
            with open(somatic_snv_file, 'r') as FH:
                for line in FH:
                    if line.startswith('#'):
                        continue
                    if random.random() > subsample_somatic_snvs:
                        continue
                    (chrom, pos, _snv_id, ref, alt, _qual, _filter, _info, sample_format, _normal, tumor) = line.strip().split()
                    pos = int(pos)
                    if chrom != query_chr:
                        continue

                    if sample_format != expected_format:
                        sys.exit('vcf format not the usual' + expected_format)
                    print("tumor ",tumor)
                    (DP, FDP, SDP, SUBDP, AU, CU, GU, TU) = tumor.strip().split(':')

                    # non-reference allele counts for this reference base;
                    # None for anything other than a single A/C/G/T, where the
                    # previous code hit an undefined or stale variable
                    non_ref_counts = {
                        'A': (CU, GU, TU),
                        'C': (AU, GU, TU),
                        'G': (CU, AU, TU),
                        'T': (CU, GU, AU),
                    }.get(ref)
                    if non_ref_counts is not None:
                        # the counts are not used further yet; parsing the
                        # first-tier value still rejects malformed fields
                        for allele_counts in non_ref_counts:
                            int(allele_counts.split(',')[0])

                    if random.random() > 0.5:
                        somatic_haplotypeCN = 'firsthaplotype_CN'
                    else:
                        somatic_haplotypeCN = 'secondhaplotype_CN'

                    print("pos ",pos, "shcn ", somatic_haplotypeCN , " r ")

                    # default copy number when no truth-set region contains
                    # pos; the last containing region wins
                    region_CN = 2
                    for region in truth_set_cn_calls[query_chr]:
                        if region['start'] <= pos <= region['end']:
                            region_CN = region[somatic_haplotypeCN]
                    somatic_mutation_freq = float(assign_freq_based_on_ploidy(region_CN))
                    somatic_mutation_freq *= float(clonal_perc)

                    # theoretically bug: could have snv and indel at same pos
                    # also bug if snp or indel last/first on read
                    h.add(pos, pos, {'pos':pos, 'ref':ref, 'alt':alt, 'line':line, 'freq':somatic_mutation_freq, 'somatic_haplotype':somatic_haplotypeCN})
                    h2[pos] = {'pos':pos, 'ref':ref, 'alt':alt, 'line':line, 'freq':somatic_mutation_freq, 'somatic_haplotype':somatic_haplotypeCN}

        pprint.pprint(h2)
        json.dump(h2, fsf, indent=4, sort_keys=True)
    return h
Example #6
0
    def setUp(self):
        """Fill an interval tree through all four insertion entry points.

        For every anchor in ``range(1, 1000, 80)`` four intervals are stored,
        one each via ``insert``, ``add``, ``insert_interval`` and
        ``add_interval``. The tree is kept as ``self.intervals`` / ``self.iv``
        and the number of stored intervals as ``self.nintervals``.
        """
        anchors = range(1, 1000, 80)
        tree = IntervalTree()
        for anchor in anchors:
            squared = anchor * anchor

            # coordinate-based insertion; add is used as a synonym for insert
            tree.insert(anchor, anchor + 10, dict(value=squared))
            tree.add(anchor + 20, anchor + 30, dict(astr=str(squared)))

            # object-based insertion of Interval instances
            third = Interval(anchor + 40, anchor + 50, value=dict(astr=str(squared)))
            tree.insert_interval(third)
            fourth = Interval(anchor + 60, anchor + 70, value=dict(astr=str(squared)))
            tree.add_interval(fourth)

        self.intervals = self.iv = tree
        # four intervals are stored per anchor
        self.nintervals = 4 * len(anchors)
Example #7
0
    def setUp(self):
        """Create the interval tree fixture shared by the tests.

        Four intervals per step of ``range(1, 1000, 80)`` are stored, using
        ``insert``, ``add``, ``insert_interval`` and ``add_interval`` in turn.
        The tree is exposed as ``self.intervals`` and ``self.iv``, and the
        running total of stored intervals as ``self.nintervals``.
        """
        fixture = IntervalTree()
        stored = 0
        for base in range(1, 1000, 80):
            sq = base * base

            # plain start/end insertion (add is used as a synonym for insert)
            fixture.insert(base, base + 10, dict(value=sq))
            stored += 1
            fixture.add(base + 20, base + 30, dict(astr=str(sq)))
            stored += 1

            # insertion of ready-made Interval objects
            fixture.insert_interval(
                Interval(base + 40, base + 50, value=dict(astr=str(sq))))
            stored += 1
            fixture.add_interval(
                Interval(base + 60, base + 70, value=dict(astr=str(sq))))
            stored += 1

        self.intervals = self.iv = fixture
        self.nintervals = stored
Example #8
0
def plot_coverage(coords, bams):
    '''Given the name of a DNA coordinates file and a list of bam file names,
    plot the read alignment coverage for each bam file for each coordinate.
    One graph per coordinate will be generated. The coverage for each
    BAM file for a given coordinate will be plotted on the same graph.
    The coordinates file should be in TSV format.

    Input coordinates are treated as 1-based and inclusive at both ends.
    A read is counted over the whole span between its first and last
    aligned position.'''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # Collect all the reads from the BAM file which lie in
                # the region of interest.
                # fetch takes a 0-based, half-open region, so the 1-based
                # inclusive range [start, end] is [start - 1, end). Passing
                # end - 1 as the stop would leave out the last position.
                reads = bam.fetch(chrom, start - 1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    if len(read.positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = read.positions[0] + 1
                        last_pos = read.positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            counts = [
                len(interval_tree.find(pos, pos)) for pos in coords_range
            ]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
def make_intervals(hindiii_genome):
    '''
    Index HindIII fragments in one interval tree per chromosome.

    Fragment starts are shifted by -1 (conversion to 0-based coordinates for
    bx-python overlaps); each fragment is stored under its fragment_id.

    Returns a dictionary {chromosome name : interval tree of its fragments}.
    '''
    trees_by_chrom = {}
    for fragment in hindiii_genome.values():
        chrom = fragment.chrom
        if chrom not in trees_by_chrom:
            # first fragment seen on this chromosome
            trees_by_chrom[chrom] = IntervalTree()
        zero_based_start = int(fragment.start) - 1
        trees_by_chrom[chrom].add(zero_based_start, int(fragment.end),
                                  fragment.fragment_id)
    return trees_by_chrom
Example #10
0
def plot_coverage(coords, bams):
    '''Given the name of a DNA coordinates file and a list of bam file names,
    plot the read alignment coverage for each bam file for each coordinate.
    One graph per coordinate will be generated. The coverage for each
    BAM file for a given coordinate will be plotted on the same graph.
    The coordinates file should be in TSV format.

    Input coordinates are treated as 1-based and inclusive at both ends.
    A read is counted over the whole span between its first and last
    aligned position.'''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end+1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # Collect all the reads from the BAM file which lie in
                # the region of interest.
                # fetch takes a 0-based, half-open region, so the 1-based
                # inclusive range [start, end] is [start-1, end). Passing
                # end-1 as the stop would leave out the last position.
                reads = bam.fetch(chrom, start-1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    if len(read.positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = read.positions[0] + 1
                        last_pos = read.positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            counts = [len(interval_tree.find(pos, pos))
                      for pos in coords_range]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
Example #11
0
def index_gtf(gtf_file_path):
    """Index the records of a GTF file in one interval tree per sequence.

    Only tab-separated rows with exactly nine columns whose first column does
    not start with '##' are indexed. Columns four and five give the interval
    and the whole row, as a tuple, is the payload.

    Returns a dictionary {sequence name : interval tree of its records}.
    """
    trees_by_seq = {}
    with open(gtf_file_path, "r") as handle:
        for fields in csv.reader(handle, delimiter='\t'):
            # skip header lines and anything that is not a 9-column record
            if len(fields) != 9 or fields[0].startswith('##'):
                continue
            seq_name = fields[0]
            begin, finish = int(fields[3]), int(fields[4])
            if seq_name not in trees_by_seq:
                # first record seen for this sequence
                trees_by_seq[seq_name] = IntervalTree()
            trees_by_seq[seq_name].add(begin, finish, tuple(fields))
    return trees_by_seq
Example #12
0
class IntervalTreeOverlapDetector(OverlapDetector):
    """Overlap detector that stores segments in a bx-python ``IntervalTree``."""

    def __init__(self, excludedSegments=None):
        """Create the detector, optionally pre-loaded with segments.

        :param excludedSegments: optional iterable of ``(start, end)`` pairs
        """
        from bx.intervals.intersection import IntervalTree
        self._intervalTree = IntervalTree()
        if excludedSegments:
            for start, end in excludedSegments:
                # same retrying insertion as addSegment, so that the initial
                # load gets the same workaround for the bx-python error
                self._addElementHandleBxPythonZeroDivisionException(start, end)

    def overlaps(self, start, end):
        """Return True if the tree finds any stored segment for ``start``-``end``."""
        return bool(self._intervalTree.find(start, end))

    def addSegment(self, start, end):
        """Store the segment ``start``-``end``."""
        self._addElementHandleBxPythonZeroDivisionException(start, end)

    def _addElementHandleBxPythonZeroDivisionException(self,
                                                       start,
                                                       end,
                                                       nrTries=10):
        """
        DivisionByZero error is caused by a bug in the bx-python library.
        It happens rarely, so we just execute the add command again when it
        does. Every failure is logged as a warning. Once the add has failed
        more than nrTries times, we assume something else is wrong and
        re-raise the last error.
        """
        cnt = 0
        while True:
            cnt += 1
            try:
                self._intervalTree.add(start, end)
            except Exception as e:
                from gold.application.LogSetup import logMessage, logging
                logMessage("Try nr %i. %s" % (cnt, str(e)), level=logging.WARN)
                if cnt > nrTries:
                    # bare raise keeps the original traceback
                    raise
            else:
                break
def index_annotation_file(annotation_file_path, annotation_type):
    """Parse an annotation file and build per-chromosome interval trees.

    Only ``annotation_type == 'ref_gene'`` lines are processed; every line is
    handed to ``ref_gene_parser``. When ``args.genome_reference`` is set, the
    coding-region sequence of each transcript is read with
    ``read_fasta_file`` and an mRNA string is assembled from the exon slices
    of '+' strand transcripts. Without a genome reference only the interval
    trees are filled and every 'mRNA' entry is the empty string.

    Returns:
        tuple ``(genome, transcriptome, exome, transcripts_info,
        coding_region_info)``: three {chromosome: IntervalTree} dictionaries
        holding gene ids, transcript ids and exon keys, followed by the two
        per-transcript dictionaries.
    """

    #dictionary mapping chromosome names to interval trees, collecting geneID info
    genome = dict()
    #dictionary mapping chromosome names to interval trees, collecting transcriptID info
    transcriptome = dict()
    #dictionary mapping transcript info
    transcripts_info = dict()
    #dictionary mapping chromosome names to interval trees, collecting exon number info
    exome = dict()
    #dictionary mapping coding region info
    coding_region_info = defaultdict(dict)

    with open(annotation_file_path,  'r') as annotation_file:

        reader = csv.reader(annotation_file, delimiter='\t')

        for line in reader:

            if annotation_type != 'ref_gene':
                continue

            gene = ref_gene_parser(line)
            chrom = gene['chrom']
            transcript_id = gene['transcript_id']
            strand = gene['strand']
            cds_start = gene['cds_start']
            cds_stop = gene['cds_stop']

            #one interval tree per chromosome
            if chrom in genome:
                tree_gene = genome[chrom]
                tree_transcript = transcriptome[chrom]
                tree_exon = exome[chrom]
            else:
                #Chromosome not seen previously, create interval tree key
                tree_gene = IntervalTree()
                tree_transcript = IntervalTree()
                tree_exon = IntervalTree()
                genome[chrom] = tree_gene
                transcriptome[chrom] = tree_transcript
                exome[chrom] = tree_exon

            #index the feature
            tree_gene.add(gene['start'], gene['stop'], gene['gene_id'])
            tree_transcript.add(gene['start'], gene['stop'], transcript_id)

            #Fasta file exists
            if args.genome_reference:

                transcripts_info[transcript_id] = transcript_id

                #Collect fasta sequence and coding region
                coding_region_info[transcript_id]['fasta'] = read_fasta_file(fasta_path, chrom, int(cds_start), int(cds_stop))
                coding_region_info[transcript_id]['cds_start'] = cds_start
                coding_region_info[transcript_id]['cds_stop'] = cds_stop
                coding_region_info[transcript_id]['strand'] = strand

            mrna_fasta = []
            position_mrna = 0
            for exon in gene['exon_start']:

                exon_start = int(gene['exon_start'][exon])
                exon_stop = int(gene['exon_stop'][exon])
                tree_exon.add(exon_start, exon_stop, exon)

                # .get(): without a genome reference no 'fasta' entry exists,
                # and a plain ['fasta'] lookup raised KeyError on every exon
                if coding_region_info[transcript_id].get('fasta'):

                    # slice of the CDS sequence contributed by this exon;
                    # (0, 0) contributes nothing
                    start_fasta = 0
                    stop_fasta = 0

                    # NOTE(review): only '+' strand transcripts contribute to
                    # the mRNA; confirm that this is intended.
                    if "+" in strand:

                        #Within coding region
                        if (exon_start > cds_start) and (exon_stop < cds_stop):

                            start_fasta = exon_start - cds_start
                            stop_fasta = exon_stop - cds_start
                            position_exon = range(exon_start, exon_stop)

                            position_mrna = map_genomic_position_to_mrna_position(coding_region_info, strand, transcript_id, position_mrna, position_exon, exon_start)

                        #Upstream of coding region
                        elif exon_stop < cds_start:

                            start_fasta = 0
                            stop_fasta = 0

                        #Downstream of coding region
                        elif exon_start > cds_stop:

                            start_fasta = 0
                            stop_fasta = 0

                        #Exon starts upstream of the cds start
                        elif exon_start < cds_start:

                            start_fasta = 0

                            #Exon encompasses whole cds
                            # NOTE(review): position_mrna is not advanced in
                            # this branch - confirm that this is intended.
                            if (exon_stop > cds_start) and (exon_stop > cds_stop):

                                stop_fasta = cds_stop - cds_start
                                position_exon = range(cds_start, cds_stop)

                            #Exon ends inside the cds
                            elif exon_stop > cds_start:

                                stop_fasta = exon_stop - cds_start
                                position_exon = range(cds_start, exon_stop)

                                position_mrna = map_genomic_position_to_mrna_position(coding_region_info, strand, transcript_id, position_mrna, position_exon, exon_start)

                            # NOTE(review): this branch needs exon_stop <=
                            # cds_start and exon_stop > cds_stop, so it only
                            # runs when cds_stop < cds_start. Exons starting
                            # inside the cds and ending at or after cds_stop
                            # match no branch and contribute nothing. The
                            # branch looks mis-nested; logic left unchanged.
                            elif exon_stop > cds_stop:

                                stop_fasta = cds_stop - cds_start

                                if exon_start < cds_stop:

                                    start_fasta = exon_start - cds_start
                                    position_exon = range(exon_start, cds_stop)

                                    position_mrna = map_genomic_position_to_mrna_position(coding_region_info, strand, transcript_id, position_mrna, position_exon, exon_start)

                        mrna_fasta.append(coding_region_info[transcript_id]['fasta'][start_fasta:stop_fasta])

                # refreshed after every exon, so the final value covers all of them
                coding_region_info[transcript_id]['mRNA'] = ''.join(mrna_fasta)

    return genome, transcriptome, exome, transcripts_info, coding_region_info
def _bx(es):
    """Add every entry of ``es`` to a fresh IntervalTree and query it back.

    Each entry supplies its start and end as its first two items and is
    stored as its own payload. The hit count is computed and discarded, and
    nothing is returned.
    """
    tree = IntervalTree()
    for entry in es:
        tree.add(entry[0], entry[1], entry)
        hits = tree.find(entry[0], entry[1])
        c = len(hits)
def load_macs2(chip_data, hindiii_genome, motif_data=None):
    '''Load a macs2 narrowPeak bed file and annotate every peak.

    Each peak is annotated with the HindIII fragments it overlaps and, when
    motif_data is given, with the orientations of the motifs it overlaps.
    Peaks on a chromosome without fragments get an empty overlap list and
    peaks without overlapping motifs get '.' as orientations. Blank lines
    in chip_data are ignored.

    chip_data:      path of the tab-separated peak file; 'chr' is prepended
                    to the chromosome column and 1 is added to the start
    hindiii_genome: fragments, indexed with make_intervals
    motif_data:     optional path of a 6-column tab-separated motif file
                    (chrom, start, end, name, score, orientation)

    Returns the list of macs2 peak objects in file order.

    NOTE(review): peak starts are shifted to 1-based before querying trees
    that make_intervals builds in 0-based coordinates, and size is computed
    from the shifted start - confirm the intended coordinate convention.
    '''
    class macs2():
        def __init__(self, overlap_IDs, peak_ID, chrom, start, end,
                     fold_enrichment, size, orientations):
            self.overlap_IDs = overlap_IDs
            self.peak_ID = peak_ID
            self.chrom = chrom
            self.start = start
            self.end = end
            self.fold_enrichment = fold_enrichment
            self.size = size
            self.orientations = orientations

    #make genome hindiii fragments into intervals
    genome = make_intervals(hindiii_genome)

    #make motifs into intervals; stays None when no motif file is given
    motif = None
    if motif_data is not None:
        motif = dict()
        with open(motif_data, 'r') as in_motif:
            for line in in_motif:
                if line.startswith('#'):
                    continue
                chrom, start, end, _motif_name, _score, orientation = line.rstrip(
                    '\n').split('\t')
                # one interval tree per chromosome
                tree = motif.get(chrom)
                if tree is None:
                    tree = IntervalTree()
                    motif[chrom] = tree
                # index the feature
                tree.add(int(start), int(end), orientation)

    all_peaks = []

    with open(chip_data, 'r') as in_data:
        for line in in_data:
            if not line.strip():
                continue
            sp_line = line.rstrip('\n').split('\t')
            chrom = 'chr' + sp_line[0]
            start = int(sp_line[1]) + 1  # convert to 1-based coordinate system
            end = int(sp_line[2])
            peak_ID = sp_line[3]
            fold_enrichment = sp_line[6]

            size = end - start

            # '.' marks "no motif information" for this peak; a chromosome
            # without any motif is treated like a peak without motif overlap
            orientations = '.'
            if motif is not None and chrom in motif:
                motif_hits = motif[chrom].find(start, end)
                if len(motif_hits) > 0:
                    orientations = motif_hits

            # find annotations overlapping an interval; a chromosome without
            # fragments yields no overlaps instead of raising KeyError
            if chrom in genome:
                overlap_ID = genome[chrom].find(start, end)
            else:
                overlap_ID = []
            all_peaks.append(
                macs2(overlap_ID, peak_ID, chrom, start, end, fold_enrichment,
                      size, orientations))

    return all_peaks
def _bx(es):
    """Insert each item of ``es`` into a new IntervalTree, then look it up.

    The first two items of every element are used as start and end, and the
    element itself is the stored value. The hit count is computed and
    discarded, and the function returns nothing.
    """
    index = IntervalTree()
    for item in es:
        index.add(item[0], item[1], item)
        found = index.find(item[0], item[1])
        c = len(found)