def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations( species, 'RNASeq', 'key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations( species) export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, 'RNASeq') print 'Annotations for', len(gene_location_db), 'genes imported' sorted_list = [] protein_coding = 0 for gene in gene_location_db: chr, strand, start, end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding += 1 else: biotype = 'NA' if len(chr) < 7: sorted_list.append( [chr, strand, int(start), int(end), gene, biotype]) #else: print chr;sys.exit() print len(sorted_list), 'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr, strand, start, end, gene, biotype = values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene, symbol, chr, strand, str(start), str(end), biotype] export_data.write(string.join(values, '\t') + '\n') export_data.close() print species, 'chromosome locations exported to:\n', export_path
def exportChromosomeStrandCoordinates(species): import EnsemblImport gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array') import ExpressionBuilder gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species) export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt' export_data = export.ExportFile(export_path) import ExonAnalyze_module gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt" annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq') print 'Annotations for',len(gene_location_db),'genes imported' sorted_list=[]; protein_coding=0 for gene in gene_location_db: chr,strand,start,end = gene_location_db[gene] if gene in gene_biotype_db: biotype = gene_biotype_db[gene][-1] if biotype == 'protein_coding': protein_coding+=1 else: biotype = 'NA' if len(chr)<7: sorted_list.append([chr,strand,int(start),int(end),gene,biotype]) #else: print chr;sys.exit() print len(sorted_list),'genes for typical chromosomes present' print protein_coding, 'protein coding genes present' sorted_list.sort() for values in sorted_list: chr,strand,start,end,gene,biotype=values try: symbol = annotate_db[gene].Symbol() except Exception: symbol = '' values = [gene,symbol,chr,strand,str(start),str(end),biotype] export_data.write(string.join(values,'\t')+'\n') export_data.close() print species, 'chromosome locations exported to:\n',export_path
eo.close() if __name__ == '__main__': ################ Comand-line arguments ################ import getopt CLIP_dir = None species = 'Hs' """ Usage: bedtools intersect -wb -a /Clip_merged_reproducible_ENCODE/K562/AARS-human.bed -b /annotations/combined/hg19_annotations-full.bed > /test.bed """ if len( sys.argv[1:] ) <= 1: ### Indicates that there are insufficient number of command-line arguments print 'WARNING!!!! Too commands supplied.' else: options, remainder = getopt.getopt(sys.argv[1:], '', ['species=', 'clip=']) #print sys.argv[1:] for opt, arg in options: if opt == '--species': species = arg elif opt == '--clip': CLIP_dir = arg import ExpressionBuilder coding_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species) dataset_peaks = eCLIPimport(CLIP_dir)