예제 #1
0
def get_acceptable_species():
    """Scan the clipper data directory for available genomes.

    :return: set of genome name strings (file stems, excluding __init__)
    """
    # each data file's stem (text before the first ".") names a genome
    stems = (name.split(".")[0] for name in os.listdir(clipper.data_dir()))
    return set(stem for stem in stems if stem != "__init__")
예제 #2
0
    def make_features(self):
        """Build an HTSeq GenomicArrayOfSets tagging intervals with (region, gene_id)."""
        Region = collections.namedtuple("Region", ["region", "gene_id"])

        bedtracks = {}
        for region in self.assigned_regions:
            bed_path = os.path.join(clipper.data_dir(), "regions",
                                    "%s_%s.bed" % (self.species, region))
            bedtracks[region] = pybedtools.BedTool(bed_path)

        features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for region, bedtrack in bedtracks.items():
            genomic_ivs = CLIP_analysis.bed_to_genomic_interval(bedtrack)
            # walk the genomic intervals alongside the raw BED records so
            # each interval can be labeled with its gene name
            for iv, interval in izip(genomic_ivs, bedtrack):
                features[iv] = set([Region(region, interval.name)])
        return features
예제 #3
0
def count_genomic_region_sizes(regions, species="hg19"):
    """Return a dict mapping each region name to its total genomic coverage.

    Sizes are read from the packaged BED files in the clipper regions
    directory (should be changed to GTF).
    """
    #TODO update this to work of GFF file, because something isn't matching up...
    sizes = {}
    for region in regions:
        bed_path = os.path.join(clipper.data_dir(), "regions",
                                species + "_" + region + ".bed")
        sizes[region] = pybedtools.BedTool(bed_path).total_coverage()
    return sizes
예제 #4
0
    def make_features(self):
        """Assemble region BED tracks into an HTSeq GenomicArrayOfSets."""
        Region = collections.namedtuple("Region", ["region", "gene_id"])

        # one BedTool per assigned region, loaded from the packaged data dir
        bedtracks = dict(
            (region,
             pybedtools.BedTool(
                 os.path.join(clipper.data_dir(), "regions",
                              "%s_%s.bed" % (self.species, region))))
            for region in self.assigned_regions)

        features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
        for region, bedtrack in bedtracks.items():
            paired = izip(CLIP_analysis.bed_to_genomic_interval(bedtrack),
                          bedtrack)
            for iv, interval in paired:
                features[iv] = set([Region(region, interval.name)])
        return features
예제 #5
0
def get_acceptable_species():
    """Find all species in the data directory.

    Each file's stem (text before the first ".") names a genome; the
    package's __init__ marker is skipped.

    :return: set of available genome name strings
    """
    # set() is the idiomatic empty set (set([]) builds a throwaway list)
    acceptable_species = set()
    for fn in os.listdir(clipper.data_dir()):
        stem = fn.split(".")[0]

        # __init__ is the package marker, not a genome
        if stem == "__init__":
            continue

        acceptable_species.add(stem)

    return acceptable_species
예제 #6
0
    def __init__(self, species, db=None, regions_dir=None):
        """
        Creates genomic features object; chooses how to direct creation
        of features based on _species.

        regions_dir : str location to create regions in
                      (defaults to <data_dir>/regions)
        species : str species name (hg19, mm9, ce10)
        db : gffutils FeatureDB object, or None

        Raises ValueError for an unrecognized species.
        """

        # identity test for None, not equality (PEP 8)
        if regions_dir is None:
            regions_dir = os.path.join(data_dir(), "regions")
        self._regions_dir = regions_dir
        self._db = db
        self._species = species

        if species in ["hg19", "mm9"]:
            # GENCODE-style annotations: lowercase UTR feature names
            self._feature_names = {
                "five_prime_utr": "five_prime_utr",
                "three_prime_utr": "three_prime_utr",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "gene_id",
                "transcript": "transcript",
            }
            self._fix_chrom = self._fix_chrom_null

        elif species in ["ce10"]:
            # WormBase-style annotations: uppercase UTRs, mRNA transcripts
            self._feature_names = {
                "five_prime_utr": "five_prime_UTR",
                "three_prime_utr": "three_prime_UTR",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "ID",
                "transcript": "mRNA",
            }
            self._fix_chrom = self._fix_chrom_ce10
        else:
            # Fail fast: previously an unknown species left _feature_names
            # and _fix_chrom unset, deferring the crash to an AttributeError
            # at first use.
            raise ValueError("unsupported species: %s" % species)

        if self._db is not None:
            self.featuretypes = list(self._db.featuretypes())
        else:
            self.featuretypes = None
예제 #7
0
    def __init__(self, species, db=None, regions_dir=None):
        """
        Creates genomic features object, choosing how feature creation
        is directed based on the species.

        regions_dir : str location to create region
        species : str species (hg19, mm9, ce10)
        db : gffutils FeatureDb object
        """

        self._regions_dir = (os.path.join(data_dir(), "regions")
                             if regions_dir is None else regions_dir)
        self._db = db
        self._species = species

        if species in ["hg19", "mm9"]:
            self._feature_names = {
                "five_prime_utr": "five_prime_utr",
                "three_prime_utr": "three_prime_utr",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "gene_id",
                "transcript": "transcript",
            }
            self._fix_chrom = self._fix_chrom_null

        elif species in ["ce10"]:
            self._feature_names = {
                "five_prime_utr": "five_prime_UTR",
                "three_prime_utr": "three_prime_UTR",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "ID",
                "transcript": "mRNA",
            }
            self._fix_chrom = self._fix_chrom_ce10

        self.featuretypes = (list(self._db.featuretypes())
                             if self._db is not None else None)
예제 #8
0
    def get_uORF_start_stop_gff(
            self,
            db_path="/nas3/yeolab/Genome/ensembl/gtf/gencode.v17.annotation.gtf.db.old",
            genome_fasta="/nas3/yeolab/Genome/ucsc/hg19/chromosomes/all.fa"):
        """
        Returns hg19 uORF start/stop locations.

        db_path : str path to a gffutils annotation database
                  (default: the lab's gencode v17 db)
        genome_fasta : str path to the genome fasta used to pull
                       5' UTR sequences

        The paths were previously hard-coded; they are now keyword
        parameters with the original values as defaults, so existing
        callers are unaffected.
        """

        db = gffutils.FeatureDB(db_path)

        transcript_gene_dict = self._create_transcript_map(db)

        # get all genic regions; only the 5' UTRs are used below
        (UTR3, UTR5, exons, genes, introns,
         CDS) = CLIP_analysis.get_genomic_regions(
             os.path.join(clipper.data_dir(), "regions"), "hg19", db).values()

        five_prime_utr_dict = self._get_five_prime_utr_sequences(
            UTR5, genome_fasta)

        return self._get_uorf_start_stop(five_prime_utr_dict)
예제 #9
0
 def get_uORF_start_stop_gff(self):
     """Return hg19 uORF start/stop locations derived from 5' UTR sequence."""

     annotation_db = gffutils.FeatureDB(
         "/nas3/yeolab/Genome/ensembl/gtf/gencode.v17.annotation.gtf.db.old")

     transcript_gene_dict = self._create_transcript_map(annotation_db)

     # pull every genic region; only the 5' UTRs are needed below
     genomic_regions = CLIP_analysis.get_genomic_regions(
         os.path.join(clipper.data_dir(), "regions"), "hg19", annotation_db)
     (UTR3, UTR5, exons, genes, introns, CDS) = genomic_regions.values()

     five_prime_utr_dict = self._get_five_prime_utr_sequences(
         UTR5, "/nas3/yeolab/Genome/ucsc/hg19/chromosomes/all.fa")

     return self._get_uorf_start_stop(five_prime_utr_dict)
예제 #10
0
def get_exon_bed(species):
    """Locate the packaged exon BED file for the given species.

    species : str genome name, possibly with a version suffix
              (e.g. "hg19_v19"); only the part before the first "_" is used.
    """
    short_species = species.split("_")[0]
    bed_name = "%s_%s.bed" % (short_species, "exons")
    return os.path.join(clipper.data_dir(), "regions", bed_name)
예제 #11
0
    def __init__(self, species, db=None, regions_dir=None):
        """
        Creates genomic features object; chooses how to direct creation
        of features based on _species.

        regions_dir : str location to create regions in
                      (defaults to <data_dir>/regions)
        species : str species name (hg19, mm9, ce10, ...)
        db : gffutils FeatureDb object, or None
        """

        if regions_dir == None:
            regions_dir = os.path.join(data_dir(), "regions")
        self._regions_dir = regions_dir
        self._db = db
        self._species = species

        # I'm going to be lazy and leave this here, its needed to make a new genomic features for human genomes
        # engineering so it doesn't take too much time on load will be slightly annoying so just uncomment when you need it
        # result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        # for x, feature in enumerate(db.all_features()):
        #     gene_ids = feature.attributes['gene_id']
        #     transcript_ids = feature.attributes['transcript_id']
        #     feature_type = feature.featuretype
        #
        #
        #     if feature_type == "gene":
        #         if len(gene_ids) != 1:
        #             print gene_ids[0]
        #             break
        #
        #         result[gene_ids[0]]['gene'] = feature
        #     else:
        #         for gene_id in gene_ids:
        #             for transcript_id in transcript_ids:
        #                 result[gene_id][transcript_id][feature_type].append(feature)
        #
        # self._feature_hash = result

        # GENCODE-style annotations: lowercase UTR feature names
        if species in ["hg19", "mm9", "hg19_v19", "GRCh38_v24"]:
            self._feature_names = {
                "five_prime_utr": "five_prime_utr",
                "three_prime_utr": "three_prime_utr",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "gene_id",
                "transcript": "transcript",
            }
            self._fix_chrom = self._fix_chrom_null

        # WormBase-style annotations: uppercase UTRs, mRNA transcripts
        elif species in ["ce10"]:
            self._feature_names = {
                "five_prime_utr": "five_prime_UTR",
                "three_prime_utr": "three_prime_UTR",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "ID",
                "transcript": "mRNA",
            }
            self._fix_chrom = self._fix_chrom_ce10
        # NOTE(review): an unrecognized species falls through with
        # _feature_names/_fix_chrom unset — callers crash later; confirm
        if self._db is not None:
            self.featuretypes = list(self._db.featuretypes())
        else:
            self.featuretypes = None
예제 #12
0
def assign_to_regions(tool,
                      clusters=None,
                      assigned_dir=".",
                      species="hg19",
                      nrand=3):
    """

    Assigns each cluster to a genic region,
    finally saves all generated bed and fasta files for future analysis...

    tool - a bed tool (each line representing a cluster)
    clusters - name of cluster file (optional, defaults to the stem of tool.fn)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number of times to shuffle for null hypothesis

    Returns a dict keyed by region name; each value holds a 'real'
    BedTool of assigned clusters and a 'rand' dict of shuffled controls.
    An 'all' entry aggregates every region; an 'uncatagorized' entry
    collects any clusters that matched no region.

    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))
    bedtracks = {}

    regions, assigned_regions = regions_generator()
    # shuffle() wants a UCSC genome build name: strip any annotation
    # version suffix and map GRCh38 to its UCSC alias
    short_species = species.split("_")[0]
    if short_species == "GRCh38":
        short_species = "hg38"

    for region in regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(clipper.data_dir(), "regions",
                         "%s_%s.bed" % (species, region)))
    #creates the basics of bed dict
    bed_dict = {'all': {'rand': {}}}

    # NOTE(review): genes is unused below except by the commented-out
    # gene-name hack — presumably kept for that; confirm before removing
    genes = pybedtools.BedTool(
        os.path.join(clipper.data_dir(), "regions",
                     "%s_genes.bed" % (species)))

    # remember BED12 offsets so they can be re-applied after merging
    offsets = get_offsets_bed12(tool)
    # NOTE(review): in the <=5-field branch the merged result is not
    # assigned back to tool — confirm this is intentional
    if tool.field_count() <= 5:
        tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        #Hack to get around not having gene name assigned by peak caller, due to overlapping genes this won't be perfect
        #move_name_real = functools.partial(move_name, original_length=len(tool[0].fields))
        #tool = tool.intersect(genes, wo=True, s=True).each(move_name_real).saveas()
        #fix_strand_ok = functools.partial(fix_strand, warn=False)
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand_v26).saveas()
    #elif not tool[0][7].isdigit():
    #    tool = tool.sort().merge(s=True, c="4,5,6", o="collapse,collapse,collapse").each(fix_strand).each(fix_name).saveas()
    else:  #Clipper, this is ideal we like this technique
        tool = tool.sort().merge(s=True,
                                 c="4,5,6,7,8",
                                 o="collapse,collapse,collapse,min,min").each(
                                     fix_strand_v26).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)

    # print "There are a total %d clusters I'll examine" % (len(tool))
    # regions are tried in priority order: a cluster claimed by one region
    # is removed from remaining_clusters and never reaches later regions
    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters,
                                                       b=bedtracks[region])

        #if for some reason there isn't a peak in the region skip it
        if len(overlapping) == 0:
            # print "ignoring %s " % region
            continue

        #sets up bed dict for this region
        bed_dict[region] = {
            'real': overlapping.sort(stream=True).saveas(),
            'rand': {}
        }

        no_overlapping_count = len(remaining_clusters)
        overlapping_count = len(bed_dict[region]['real'])
        # print "For region: %s found %d that overlap and %d that don't" % (region,
        #                                                                   overlapping_count,
        #                                                                   no_overlapping_count)

        # accumulate this region's real clusters into the 'all' aggregate
        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(
                bed_dict[region]['real'], stream=True,
                postmerge=False).saveas()

        #saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])
        # build nrand shuffled (null-hypothesis) copies of the real clusters,
        # restricted to this region's intervals
        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(
                genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals,
                                                   bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            # fold shuffle i into the 'all' aggregate as well
            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(
                    bed_dict[region]['rand'][i], stream=True, postmerge=False)

        #if there are no more clusters to assign stop trying
        if no_overlapping_count == 0:
            break

    # print "After assigning %d un-categorized regions" % len(remaining_clusters)

    # anything never claimed by a region is kept under 'uncatagorized'
    if len(remaining_clusters) > 0:
        bed_dict['uncatagorized'] = {
            'real': remaining_clusters.sort(stream=True).saveas()
        }

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)
    return bed_dict
예제 #13
0
    def __init__(self, species, db=None, regions_dir=None, gencode=False):
        """
        Creates genomic features object; chooses how to direct creation
        of features based on _species.

        regions_dir : str location to create regions in
                      (defaults to <data_dir>/regions)
        species : str species name (hg19, mm9, ce10, ...)
        db : gffutils FeatureDB object, or None
        gencode : bool, treat any species as a GENCODE-style annotation
        """

        # identity test for None, not equality (PEP 8)
        if regions_dir is None:
            regions_dir = os.path.join(data_dir(), "regions")
        self._regions_dir = regions_dir
        self._db = db
        self._species = species

        # Index every feature by gene id (and transcript id for non-gene
        # features) so later lookups don't have to rescan the database.
        # Only done when a db is supplied: the original called
        # db.all_features() unconditionally and crashed on db=None, even
        # though the featuretypes logic below explicitly supports it.
        result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        if db is not None:
            for x, feature in enumerate(db.all_features()):
                gene_ids = feature.attributes['gene_id']
                transcript_ids = feature.attributes['transcript_id']
                feature_type = feature.featuretype

                if feature_type == "gene":
                    if len(gene_ids) != 1:
                        # a gene record should carry exactly one gene_id;
                        # print() form works under both Python 2 and 3
                        print(gene_ids[0])
                        break

                    result[gene_ids[0]]['gene'] = feature
                else:
                    for gene_id in gene_ids:
                        for transcript_id in transcript_ids:
                            result[gene_id][transcript_id][feature_type].append(
                                feature)

        self._feature_hash = result

        # GENCODE-style annotations: lowercase UTR feature names
        if species in ["hg19", "mm9", "hg19_v19", "GRCh38_v24", "hb27"
                       ] or gencode:
            self._feature_names = {
                "five_prime_utr": "five_prime_utr",
                "three_prime_utr": "three_prime_utr",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "gene_id",
                "transcript": "transcript",
            }
            self._fix_chrom = self._fix_chrom_null

        # WormBase-style annotations: uppercase UTRs, mRNA transcripts
        elif species in ["ce10"]:
            self._feature_names = {
                "five_prime_utr": "five_prime_UTR",
                "three_prime_utr": "three_prime_UTR",
                "exon": "exon",
                "CDS": "CDS",
                "gene_id": "ID",
                "transcript": "mRNA",
            }
            self._fix_chrom = self._fix_chrom_ce10
        if self._db is not None:
            self.featuretypes = list(self._db.featuretypes())
        else:
            self.featuretypes = None
예제 #14
0
def assign_to_regions(tool, clusters=None, assigned_dir=".", species="hg19", nrand=3):
    """
    Assigns each cluster to a genic region,
    finally saves all generated bed and fasta files for future analysis...

    tool - a bed tool (each line representing a cluster)
    clusters - name of cluster file (optional, defaults to the stem of tool.fn)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number of times to shuffle for null hypothesis

    Returns a dict keyed by region name; each value holds a 'real'
    BedTool of assigned clusters and a 'rand' dict of shuffled controls.
    An 'all' entry aggregates every region; an 'uncatagorized' entry
    collects any clusters that matched no region.
    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))
    bedtracks = {}

    regions, assigned_regions = regions_generator()
    # shuffle() wants a UCSC genome build name, without annotation version
    short_species = species.split("_")[0]

    for region in regions:
        bedtracks[region] = pybedtools.BedTool(os.path.join(clipper.data_dir(), "regions", "%s_%s.bed" % (species,
                                                                                                          region)))
    #creates the basics of bed dict
    bed_dict = {'all': {'rand': {}}}

    # only referenced by a (removed) commented-out gene-name hack; kept so
    # the packaged genes file is still validated to exist
    genes = pybedtools.BedTool(os.path.join(clipper.data_dir(), "regions", "%s_genes.bed" % (species)))

    # remember BED12 offsets so they can be re-applied after merging
    offsets = get_offsets_bed12(tool)
    if tool.field_count() <= 5:
        tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        #Hack to get around not having gene name assigned by peak caller, due to overlapping genes this won't be perfect
        tool = tool.sort().merge(s=True, c="4,5,6", o="collapse,collapse,collapse").each(fix_strand).saveas()
    elif not tool[0][7].isdigit():
        tool = tool.sort().merge(s=True, c="4,5,6", o="collapse,collapse,collapse").each(fix_strand).saveas()
    else: #Clipper, this is ideal we like this technique
        tool = tool.sort().merge(s=True, c="4,5,6,7,8", o="collapse,collapse,collapse,min,min").each(fix_strand).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)

    # regions are tried in priority order: a cluster claimed by one region
    # is removed from remaining_clusters and never reaches later regions
    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters, b=bedtracks[region])

        #if for some reason there isn't a peak in the region skip it
        if len(overlapping) == 0:
            continue

        #sets up bed dict for this region
        bed_dict[region] = {'real': overlapping.sort(stream=True).saveas(),
                            'rand': {}}

        no_overlapping_count = len(remaining_clusters)

        # accumulate this region's real clusters into the 'all' aggregate
        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(bed_dict[region]['real'], stream=True, postmerge=False).saveas()

        #saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])
        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals, bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            # BUG FIX: this accumulation belongs INSIDE the shuffle loop; it
            # was previously dedented one level, so only the final shuffle of
            # each region reached bed_dict['all']['rand'] (and nrand=0 would
            # have raised NameError on i)
            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(bed_dict[region]['rand'][i], stream=True, postmerge=False)

        #if there are no more clusters to assign stop trying
        if no_overlapping_count == 0:
            break

    # anything never claimed by a region is kept under 'uncatagorized'
    if len(remaining_clusters) > 0:
        bed_dict['uncatagorized'] = {'real': remaining_clusters.sort(stream=True).saveas()}

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)
    return bed_dict
예제 #15
0
def get_exon_bed(species):
    """Return the path of the exons BED file shipped with clipper for *species*."""
    short_species = species.split("_")[0]
    regions_dir = os.path.join(clipper.data_dir(), "regions")
    return os.path.join(regions_dir, "%s_%s.bed" % (short_species, "exons"))