def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):
    """
    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files, but not both at once

    species - the species to run on (selects the predefined files), or None
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations), or None when species is given
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, a true value indicates use pre-mRNA lengths instead of
               mRNA lengths

    returns genes and lengths, as produced by build_geneinfo(gene_bed) and
    build_lengths(<selected length file>)

    raises ValueError if neither species nor gene_bed is set, if both are set,
    or if looking up the predefined files for species raises ValueError
    raises IOError if the selected length file is None
    """

    #error checking
    acceptable_species = get_acceptable_species()

    # Without a species or a gene bed there is nothing to build the genes from,
    # even when both length files were supplied, so reject that case up front
    # instead of handing None to build_geneinfo.
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or "
            "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        # one-element tuple so the formatting works whatever container
        # get_acceptable_species() returns
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species,))

    #Now actually assign values
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")
        except ValueError:
            raise ValueError(
                ("Defaults don't exist for your species: %s. "
                 "Please choose from: %s or supply "
                 "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")
                % (species, acceptable_species))

    #Selects mRNA or preMRNA lengths (any true flag value selects pre-mRNA)
    lenfile = gene_pre_mrna if pre_mrna else gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option with given length file""")

    #builds dict to do processing on,
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)
    return genes, lengths
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Builds a BedTool of "mRNA" intervals carrying each gene's effective length

    Reads the data file "<species>.AS.STRUCTURE.COMPILED.gff"
    (gtf file generated from AS_STRUCTURE_gtf ipython notebook) and, for every
    record in it, emits one interval whose attributes hold gene_id,
    transcript_ids (only when the record has them) and effective_length

    species - species name used to locate the data file
    pre_mrna - if true uses pre mRNA length instead of mRNA length

    returns pybedtools.BedTool built from the generated intervals
    """
    bedtoolintervals = []
    gff_path = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    gtf_file = pybedtools.BedTool(gff_path)
    for gene in gtf_file:
        effective_length = gene.attrs['premrna_length'] if pre_mrna else gene.attrs['mrna_length']
        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        fields = [
            gene['chrom'],
            "AS_STRUCTURE",
            "mRNA",
            str(gene.start + 1),
            str(gene.stop + 1),
            "0",
            gene['strand'],
            ".",
            attrs,
        ]
        # create_interval_from_list needs a real list; on Python 3 a bare
        # map() is a lazy iterator, so materialise the stringified fields.
        bedtoolintervals.append(
            pybedtools.create_interval_from_list([str(field) for field in fields]))

    return pybedtools.BedTool(bedtoolintervals)
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Converts the compiled AS_STRUCTURE annotation of a species into a BedTool
    of "mRNA" intervals annotated with an effective length

    The input is the data file "<species>.AS.STRUCTURE.COMPILED.gff"
    (gtf file generated from AS_STRUCTURE_gtf ipython notebook)

    species - species name used to locate the data file
    pre_mrna - if true uses pre mRNA length instead of mRNA length

    returns pybedtools.BedTool with one interval per input record; each
    interval's attributes hold gene_id, transcript_ids (when present on the
    record) and effective_length
    """
    results = []
    annotation_path = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")
    for gene in pybedtools.BedTool(annotation_path):
        effective_length = gene.attrs['premrna_length'] if pre_mrna else gene.attrs['mrna_length']

        attrs = "gene_id=%s;" % (gene.attrs['gene_id'])
        if "transcript_ids" in gene.attrs:
            attrs += "transcript_ids=%s;" % (gene.attrs['transcript_ids'])
        attrs += "effective_length=%s" % (str(effective_length))

        # Wrap map() in list(): on Python 3 map() returns an iterator, while
        # create_interval_from_list expects an actual list of strings.
        columns = list(map(str, [gene['chrom'],
                                 "AS_STRUCTURE",
                                 "mRNA",
                                 str(gene.start + 1),
                                 str(gene.stop + 1),
                                 "0",
                                 gene['strand'],
                                 ".",
                                 attrs]))
        results.append(pybedtools.create_interval_from_list(columns))

    return pybedtools.BedTool(results)
def build_transcript_data_gtf_as_structure(species, pre_mrna):
    """
    Produce a BedTool of "mRNA" intervals annotated with effective lengths,
    read from the pre-created "<species>.AS.STRUCTURE.COMPILED.gff" file
    located through clipper.data_file

    :param species: (str) genome name
    :param pre_mrna: (bool) if true uses pre-mRNA length instead of mRNA length
    :return: BedTool with one interval per record of the input file; the
        attributes column carries gene_id, transcript_ids (only when the
        record has them) and effective_length
    :rtype: pybedtools.BedTool
    """
    source_path = clipper.data_file(species + ".AS.STRUCTURE.COMPILED.gff")

    intervals = []
    for record in pybedtools.BedTool(source_path):
        record_attrs = record.attrs

        # pick the length matching the requested transcript type
        if pre_mrna:
            length = record_attrs['premrna_length']
        else:
            length = record_attrs['mrna_length']

        # assemble the attributes column piece by piece
        pieces = ["gene_id=%s;" % (record_attrs['gene_id'])]
        if "transcript_ids" in record_attrs:
            pieces.append("transcript_ids=%s;" % (record_attrs['transcript_ids']))
        pieces.append("effective_length=%s" % (str(length)))

        raw_fields = [
            record['chrom'],
            "AS_STRUCTURE",
            "mRNA",
            str(record.start + 1),
            str(record.stop + 1),
            "0",
            record['strand'],
            ".",
            "".join(pieces),
        ]
        # every field is handed over as a string, in a concrete list
        intervals.append(
            pybedtools.create_interval_from_list([str(value) for value in raw_fields]))

    return pybedtools.BedTool(intervals)
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):
    """
    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files, but not both at once

    species - the species to run on (selects the predefined files), or None
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations), or None when species is given
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, a true value indicates use pre-mRNA lengths instead of
               mRNA lengths

    returns a pybedtools.BedTool with one "mRNA" interval per gene returned by
    build_geneinfo; each interval's attributes hold gene_id and
    effective_length (looked up in the result of build_lengths)

    raises ValueError if neither species nor gene_bed is set, if both are set,
    or if looking up the predefined files for species raises ValueError
    raises IOError if the selected length file is None
    """

    #error checking
    acceptable_species = get_acceptable_species()

    # Without a species or a gene bed there is nothing to build the genes from,
    # even when both length files were supplied, so reject that case up front
    # instead of handing None to build_geneinfo.
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or "
            "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        # one-element tuple so the formatting works whatever container
        # get_acceptable_species() returns
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species,))

    #Now actually assign values
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")
        except ValueError:
            raise ValueError(
                ("Defaults don't exist for your species: %s. "
                 "Please choose from: %s or supply "
                 "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")
                % (species, acceptable_species))

    #Selects mRNA or preMRNA lengths (any true flag value selects pre-mRNA)
    lenfile = gene_pre_mrna if pre_mrna else gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option with given length file""")

    #builds dict to do processing on,
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    #this is a stopgap until it can be fully factored out, returning a gtf file of
    #genes and effective lengths, eventually this is the file we want to pass in
    gtf_list = []
    for gene in genes:
        gene_info = genes[gene]
        # entries 0, 2, 3 and 4 of the gene record fill the seqname, start,
        # end and strand columns of the generated line
        gtf_list.append(pybedtools.create_interval_from_list([
            gene_info[0],
            "AS_STRUCTURE",
            "mRNA",
            str(gene_info[2]),
            str(gene_info[3]),
            ".",
            str(gene_info[4]),
            ".",
            "gene_id=" + gene + "; effective_length=" + str(lengths[gene]),
        ]))

    return pybedtools.BedTool(gtf_list)
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):
    """
    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files, but not both at once

    species - the species to run on (selects the predefined files), or None
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations), or None when species is given
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, a true value indicates use pre-mRNA lengths instead of
               mRNA lengths

    returns a pybedtools.BedTool holding one "mRNA" interval for every gene
    returned by build_geneinfo, with gene_id and effective_length (taken from
    the result of build_lengths) in the attributes column

    raises ValueError if neither species nor gene_bed is set, if both are set,
    or if looking up the predefined files for species raises ValueError
    raises IOError if the selected length file is None
    """

    #error checking
    acceptable_species = get_acceptable_species()

    # A run needs either a species or a gene bed; length files alone are not
    # enough, so fail here rather than passing None on to build_geneinfo.
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or "
            "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        # one-element tuple keeps the formatting valid for any container type
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species,))

    #Now actually assign values
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")
        except ValueError:
            raise ValueError(
                ("Defaults don't exist for your species: %s. "
                 "Please choose from: %s or supply "
                 "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")
                % (species, acceptable_species))

    #Selects mRNA or preMRNA lengths (any true flag value selects pre-mRNA)
    if pre_mrna:
        lenfile = gene_pre_mrna
    else:
        lenfile = gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option with given length file""")

    #builds dict to do processing on,
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)

    #this is a stopgap until it can be fully factored out, returning a gtf file of
    #genes and effective lengths, eventually this is the file we want to pass in
    gtf_list = []
    for gene in genes:
        record = genes[gene]
        attributes = "gene_id=" + gene + "; effective_length=" + str(lengths[gene])
        # record entries 0, 2, 3 and 4 become the seqname, start, end and
        # strand columns respectively
        gtf_list.append(pybedtools.create_interval_from_list(
            [record[0], "AS_STRUCTURE", "mRNA",
             str(record[2]), str(record[3]),
             ".", str(record[4]), ".",
             attributes]))

    return pybedtools.BedTool(gtf_list)
def build_transcript_data(species, gene_bed, gene_mrna, gene_pre_mrna, pre_mrna):
    """
    Generates transcript data structures to call peaks on

    Allows for either predefined files (from the data directory)
    or custom files, but not both at once

    species - the species to run on (selects the predefined files), or None
    gene_bed - an arbitrary bed file of locations to search for peaks
               (should be gene locations), or None when species is given
    gene_mrna - the effective length of the mrna of a gene
                (unmappable regions removed)
    gene_pre_mrna - the effective length of the pre-mrna
                    (unmappable regions removed)
    pre_mrna - flag, a true value indicates use pre-mRNA lengths instead of
               mRNA lengths

    returns genes and lengths, the results of build_geneinfo(gene_bed) and
    build_lengths(<selected length file>)

    raises ValueError if neither species nor gene_bed is set, if both are set,
    or if looking up the predefined files for species raises ValueError
    raises IOError if the selected length file is None
    """

    #error checking
    acceptable_species = get_acceptable_species()

    # A run needs either a species or a gene bed; length files alone are not
    # enough, so fail here rather than passing None on to build_geneinfo.
    if species is None and gene_bed is None:
        raise ValueError(
            "You must set either \"species\" or "
            "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")

    if species is not None and gene_bed is not None:
        # one-element tuple keeps the formatting valid for any container type
        raise ValueError(
            "You shouldn't set both geneBed and species, defaults exist for %s"
            % (acceptable_species,))

    #Now actually assign values
    if species is not None:
        try:
            gene_bed = clipper.data_file(species + ".AS.STRUCTURE_genes.BED.gz")
            gene_mrna = clipper.data_file(species + ".AS.STRUCTURE_mRNA.lengths")
            gene_pre_mrna = clipper.data_file(species + ".AS.STRUCTURE_premRNA.lengths")
        except ValueError:
            raise ValueError(
                ("Defaults don't exist for your species: %s. "
                 "Please choose from: %s or supply "
                 "\"geneBed\"+\"geneMRNA\"+\"genePREMRNA\"")
                % (species, acceptable_species))

    #Selects mRNA or preMRNA lengths (any true flag value selects pre-mRNA)
    if pre_mrna:
        lenfile = gene_pre_mrna
    else:
        lenfile = gene_mrna

    if lenfile is None:
        raise IOError("""didn't pass correct mRNA length file option with given length file""")

    #builds dict to do processing on,
    genes = build_geneinfo(gene_bed)
    lengths = build_lengths(lenfile)
    return genes, lengths