def crossmap(source_compressed_gtf):
    """Create a sorted hg19 version of an hg38 GTF file using CrossMap.

    The UCSC ``hg38ToHg19`` chain file is downloaded into ``data/`` when it
    is not already present. The converted file is written next to the source
    with ``.gtf.gz`` replaced by ``.hg19_converted.gtf``.

    Args:
        source_compressed_gtf: Path of the compressed source GTF
            (expected to contain ``.gtf.gz`` in its name).

    Raises:
        SystemExit: with status 1 when the chain file cannot be downloaded.
    """
    chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')

    try:
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
    except AttributeError:
        # urllib3 2.x no longer exposes DEFAULT_CIPHERS; the tweak is
        # best-effort, so carry on with the library defaults.
        pass

    # only download if necessary
    if not os.path.exists(chain_path):
        sys.stdout.write('Downloading UCSC database... ')
        sys.stdout.flush()
        url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
        try:
            # NOTE(review): verify=False disables TLS certificate checks;
            # kept from the original, confirm it is still required.
            response = requests.get(url, verify=False)
            # Without this an HTTP error page would be saved and then reused
            # as the chain file on every later run.
            response.raise_for_status()
            with open(chain_path, 'wb') as chain_out:
                chain_out.write(response.content)
        except Exception as e:
            print(
                '\n\nCannot connect to UCSC FTP site. No internet connection?\n'
            )
            print(f'Exception: {e}')
            # quit() is a site-module convenience and exits with status 0;
            # report the failure to the caller's shell instead.
            sys.exit(1)

    sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
    mapTree, targetChromSizes, sourceChromSizes = read_chain_file(chain_path)
    converted_gtf = source_compressed_gtf.replace('.gtf.gz',
                                                  '.hg19_converted.gtf')
    crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

    # Note this file is not sorted! Sort it, drop invalid features and
    # replace the unsorted file with the result.
    unsorted = pybedtools.BedTool(converted_gtf)
    unsorted.sort().remove_invalid().saveas('tmp.txt')
    os.rename('tmp.txt', converted_gtf)
args = parser.parse_args() if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(0) if len(sys.argv) >= 2: command = sys.argv[1].lower() args = parser.parse_args() if command == 'bed': chain_file = args.chain in_file = args.in_bed out_file = args.out_bed (mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) crossmap_bed_file(mapTree, in_file, out_file, unmapfile=args.unmap_file, cstyle=args.cstyle) elif command == 'bam': chain_file = args.chain in_file = args.in_bam out_file = args.out_bam if out_file in ["STDOUT", "-"]: out_file = None print(args.add_tags) print("Insert size = %f" % (args.insert_size), file=sys.stderr) print("Insert size stdev = %f" % (args.insert_size_stdev),
def process_data(options, genome_build):
    """Build the transcript database from an Ensembl GTF release.

    Steps, in order:
      1. Optionally restrict to the transcript IDs listed in ``options.input``.
      2. Download the Ensembl GTF into ``data/`` unless it is already there.
      3. Unless ``options.no_hg19`` is ``False``, lift the GTF over to hg19
         with CrossMap (downloading the UCSC chain file if needed).
      4. Parse the GTF, write and sort the records, and write the output
         under ``options.output_dir`` / ``options.output``.
      5. Repeat the parse/write for the hg19-converted GTF when requested and
         pickle ``failed_conversions``.

    Args:
        options: Parsed options; reads ``input``, ``ensembl``, ``no_hg19``,
            ``output_dir`` and ``output``.
        genome_build: Genome build name used in the Ensembl file name.

    Returns:
        Tuple ``(number of native-build records, number of hg19 records)``;
        the second value is 0 when the hg19 step is skipped.

    Raises:
        SystemExit: with status 1 on download failure or when no transcript
            is found.
        AssertionError: when sorting yields no records for the native build.
    """
    # Dictionary of Gene objects
    genesdata = dict()

    # Load custom transcript IDs
    transIDs = None
    if options.input is not None:
        transIDs = readTranscriptIDs(options.input)
        print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' +
              options.input + ' are considered\n')
    else:
        print('\nAll transcripts from the Ensembl release are considered\n')

    # Load candidate and CCDS data for Ensembl <75
    # NOTE(review): `candidates` is filled here but not used again in this
    # function — confirm whether it is still needed.
    candidates = dict()
    if int(options.ensembl) < 75:
        datadir = os.path.dirname(os.path.realpath(__file__)) + '/data'
        with open(datadir + '/info' + options.ensembl + '.txt') as info:
            for line in info:
                line = line.strip()
                if line == '':
                    continue
                cols = line.split('\t')
                candidates.setdefault(cols[0], dict())[cols[1]] = int(cols[2])

    ######################################################################

    # Download Ensembl data if necessary
    gtf_name = ('Homo_sapiens.' + genome_build + '.' + options.ensembl +
                '.gtf.gz')
    source_compressed_gtf = os.path.join('data', gtf_name)

    if not os.path.exists(source_compressed_gtf):
        sys.stdout.write('Downloading Ensembl database... ')
        sys.stdout.flush()

        url = ('ftp://ftp.ensembl.org/pub/release-' + options.ensembl +
               '/gtf/homo_sapiens/' + gtf_name)
        try:
            # wget saves into the working directory; move it under data/.
            wget.download(url)
            os.rename(gtf_name, source_compressed_gtf)
        except Exception as e:
            print(
                '\n\nCannot connect to Ensembl FTP site. No internet connection?\n'
            )
            print(f'{e}\n{url}')
            # Exit with a failure status (quit() would exit with 0).
            sys.exit(1)

    ################################################################
    # Use crossmap to get hg19 if desired
    #################################################################
    want_hg19 = options.no_hg19 is not False
    if want_hg19:
        chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')
        try:
            requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
        except AttributeError:
            # urllib3 2.x no longer exposes DEFAULT_CIPHERS; the tweak is
            # best-effort, so carry on with the library defaults.
            pass

        # only download if necessary
        if not os.path.exists(chain_path):
            sys.stdout.write('Downloading UCSC database... ')
            sys.stdout.flush()
            url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
            try:
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, confirm it is still required.
                response = requests.get(url, verify=False)
                # Do not cache an HTTP error page as the chain file.
                response.raise_for_status()
                with open(chain_path, 'wb') as chain_out:
                    chain_out.write(response.content)
            except Exception as e:
                print(
                    '\n\nCannot connect to UCSC FTP site. No internet connection?\n'
                )
                print(f'Exception: {e}')
                sys.exit(1)

        sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
        mapTree, targetChromSizes, sourceChromSizes = read_chain_file(
            chain_path)
        converted_gtf = os.path.join(
            'data', 'Homo_sapiens.hg19_converted' + options.ensembl + '.gtf')
        crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

        # Note this file is not sorted! Sort it, drop invalid features and
        # replace the unsorted file with the result.
        unsorted = pybedtools.BedTool(converted_gtf)
        unsorted.sort().remove_invalid().saveas('tmp.txt')
        os.rename('tmp.txt', converted_gtf)

    ################################################################
    #
    #################################################################

    # Iterate through the lines in the ensembl data file
    sys.stdout.write('Extracting transcript data from Ensembl...')
    transcript, prevenst, first, genesdata = parse_GTF(
        filename=source_compressed_gtf,
        options=options,
        genesdata=genesdata,
        transIDs=transIDs)
    sys.stdout.write('Done\n')
    sys.stdout.flush()

    # Finalize last transcript and add to Gene object if candidate
    if transcript is not None:
        transcript.finalize()
        if transcript.isCandidate():
            if transcript.ENSG not in genesdata:
                genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                  transcript.ENSG)
            genesdata[transcript.ENSG].TRANSCRIPTS[
                transcript.ENST] = transcript

    # If no transcript ID from the input file was found in the Ensembl release
    if len(genesdata) == 0:
        # options.input is None when all transcripts are considered, so it
        # cannot be concatenated into the message unconditionally.
        if options.input is not None:
            print('\n\nNo transcripts from ' + options.input +
                  ' found in Ensembl release.')
        else:
            print('\n\nNo transcripts found in Ensembl release.')
        print('\nNo transcript database created.')
        print(
            "-----------------------------------------------------------------\n"
        )
        sys.exit(1)

    write_temp(os.path.join(options.output_dir, options.output + '.txt'),
               options, transIDs, genesdata)
    enst_records = sort_tmpfile('temp.txt')
    assert (len(enst_records) > 0)
    writeToFile(enst_records, os.path.join(options.output_dir, options.output))

    # Reset the conversion-failure bookkeeping before the hg19 pass.
    failed_conversions['GENE'] = set()
    failed_conversions['GENETYPE'] = set()
    failed_conversions['TRANSTYPE'] = set()
    failed_conversions['ENST'] = set()

    # ################################################################
    # Begin converted GTF conversion
    # ################################################################
    hg19_records = []
    if want_hg19:
        sys.stdout.write('Extracting transcript data for hg19 version...')
        sys.stdout.flush()
        transcript, prevenst, first, genesdata = parse_GTF(
            filename=converted_gtf,
            options=options,
            genesdata=genesdata,
            transIDs=transIDs)

        # Finalize last transcript and add to Gene object if candidate
        if transcript is not None:
            try:
                transcript.finalize()
            except Exception:
                # Converted transcripts may fail to finalize; warn and go on.
                # (Narrowed from a bare except so Ctrl-C still interrupts.)
                warn(transcript)
            if transcript.isCandidate():
                if transcript.ENSG not in genesdata:
                    genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                      transcript.ENSG)
                genesdata[transcript.ENSG].TRANSCRIPTS[
                    transcript.ENST] = transcript

        # If no transcript ID from the input file was found in the Ensembl release
        if len(genesdata) == 0:
            if options.input is not None:
                print('\n\nNo transcripts from ' + options.input +
                      ' found in Ensembl release.')
            else:
                print('\n\nNo transcripts found in Ensembl release.')
            print('\nNo transcript database created.')
            print(
                "-----------------------------------------------------------------\n"
            )
            sys.exit(1)

        write_temp(
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted.txt'), options,
            transIDs, genesdata)
        sortedRecords = sort_tmpfile('temp.txt')
        writeToFile(
            sortedRecords,
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted'))
        sys.stdout.write('Completed hg19 version...')
        sys.stdout.flush()

        failed_path = os.path.join(options.output_dir,
                                   options.output + '_failed_conversions.pkl')
        with open(failed_path, 'wb') as failed_out:
            pickle.dump(failed_conversions, failed_out)
        hg19_records = sortedRecords
    # ################################################################
    # END converted GTF conversion
    # ################################################################

    # Remove temporary files
    sys.stdout.write('OK\n')
    sys.stdout.write('Removing temporary files... ')
    sys.stdout.flush()
    os.remove('temp.txt')
    # The downloaded GTF is deliberately kept so later runs can reuse it.
    print(
        f"Failed {len(failed_conversions['GENE'])} Genes and {len(failed_conversions['ENST'])} transcripts"
    )
    # Return the number of records written for each build
    return len(enst_records), len(hg19_records)
# Command-line dispatch: with no arguments show the help text; otherwise the
# first argument selects the sub-command.
if len(sys.argv) == 1:
    # Handled before parsing so the help text is always reachable.
    parser.print_help(sys.stderr)
    sys.exit(0)
if len(sys.argv) >= 2:
    command = sys.argv[1].lower()
    args = parser.parse_args()
    if command == 'bed':
        chain_file = args.chain
        in_file = args.in_bed
        out_file = args.out_bed
        (mapTree, targetChromSizes,
         sourceChromSizes) = read_chain_file(chain_file)
        crossmap_bed_file(mapTree,
                          in_file,
                          out_file,
                          unmapfile=args.unmap_file,
                          cstyle=args.cstyle)
    elif command == 'bam':
        chain_file = args.chain
        in_file = args.in_bam
        out_file = args.out_bam
        # "STDOUT" or "-" selects standard output, represented as None.
        if out_file in ["STDOUT", "-"]:
            out_file = None
        # Diagnostics go to stderr so they cannot mix with data written to
        # stdout when the output target is STDOUT.
        print(args.add_tags, file=sys.stderr)
        print("Insert size = %f" % (args.insert_size), file=sys.stderr)
        print("Insert size stdev = %f" % (args.insert_size_stdev),
              file=sys.stderr)
        print("Number of stdev from the mean = %f" % (args.insert_size_fold),
              file=sys.stderr)
        if args.add_tags:
            print("Add tags to each alignment = %s" % (args.add_tags),
                  file=sys.stderr)
} kwds = list(commands.keys()) if len(sys.argv) == 1: general_help(commands) sys.exit(0) elif len(sys.argv) >= 2: # deal with bed input if sys.argv[1].lower() == 'bed': if len(sys.argv) == 4: chain_file = sys.argv[2] in_file = sys.argv[3] out_file = None (mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file, print_table=False) crossmap_bed_file(mapTree, in_file, out_file) elif len(sys.argv) == 5: chain_file = sys.argv[2] in_file = sys.argv[3] out_file = sys.argv[4] (mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file) crossmap_bed_file(mapTree, in_file, out_file) else: bed_help() sys.exit(0) elif sys.argv[1].lower() == 'region': usage = ( "\nCrossMap.py region <chain_file> <regions.bed> [output_file] [options]\n\nExamples:\nCrossMap.py "
def process_data(options):
    """Build the transcript database from a RefSeq GTF release.

    Steps, in order:
      1. Optionally restrict to the transcript IDs listed in ``options.input``.
      2. Unless the processed GTF already exists under ``data/``, download
         the RefSeq GTF, rewrite each line with ``replace_chrom_names`` and
         recompress the result with ``bgzip``.
      3. Unless ``options.no_hg19`` is ``False``, lift the GTF over to hg19
         with CrossMap (downloading the UCSC chain file if needed); an
         existing converted file is reused.
      4. Parse the GTF, write and sort the records, and write the output
         under ``options.output_dir`` / ``options.output``.
      5. Repeat the parse/write for the hg19-converted GTF when requested and
         pickle ``failed_conversions``.

    Args:
        options: Parsed options; reads ``input``, ``nm_only``, ``refseq``,
            ``no_hg19``, ``output_dir`` and ``output``.

    Returns:
        Tuple ``(number of native-build records, number of hg19 records)``;
        the second value is 0 when the hg19 step is skipped.

    Raises:
        SystemExit: with status 1 on download/processing failure or when no
            transcript is found.
        AssertionError: when sorting yields no records for the native build.
    """
    # Dictionary of Gene objects
    genesdata = dict()

    # Load custom transcript IDs
    transIDs = None
    if options.input is not None:
        transIDs = readTranscriptIDs(options.input)
        print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' +
              options.input + ' are considered\n')
    else:
        nm = 'All transcripts from the release are considered'
        if options.nm_only:
            nm = 'All NM transcripts from the release are considered'
        print(f'\n{nm}\n')

    ######################################################################

    # Download RefSeq data if necessary
    # Example of a full URL:
    # https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gtf.gz
    refseq_gtf_gz = options.refseq + '_genomic.gtf.gz'
    refseq_gtf = options.refseq + '_genomic.gtf'
    source_compressed_gtf = os.path.join('data', refseq_gtf_gz)

    if not os.path.exists(source_compressed_gtf):
        import subprocess

        sys.stdout.write('Downloading RefSeq database... ')
        sys.stdout.flush()

        url = ('https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/'
               + options.refseq + '/' + refseq_gtf_gz)

        try:
            wget.download(url)
            sys.stdout.flush()

            # Convert chromosome names
            # Note we will lose unmapped transcripts here!
            print(f'\nUnzipping {refseq_gtf_gz}')
            # Argument list instead of a shell string: no quoting problems
            # with the release name, and a failing bgzip is not ignored.
            subprocess.run(['bgzip', '-d', refseq_gtf_gz], check=True)

            print(f'Parsing {refseq_gtf}')
            with open(refseq_gtf, 'r') as gtf_in, \
                    open('temp.txt', 'w') as renamed_out:
                for line in gtf_in:
                    if line.startswith('#'):
                        continue
                    try:
                        new_line = replace_chrom_names(line)
                    except Exception:
                        print(f'Failed: {line}')
                        sys.exit(1)
                    # Lines for which no replacement is returned are dropped.
                    if new_line:
                        renamed_out.write(new_line)

            print(f'Compressing the GTF into: {source_compressed_gtf}')
            try:
                with open(source_compressed_gtf, 'wb') as compressed_out:
                    subprocess.run(['bgzip', '-c', 'temp.txt'],
                                   stdout=compressed_out,
                                   check=True)
            except Exception:
                # Do not leave a partial file that a later run would treat
                # as an already-downloaded release.
                if os.path.exists(source_compressed_gtf):
                    os.remove(source_compressed_gtf)
                raise
            os.remove('temp.txt')

        except Exception as e:
            print('\n\nCannot connect to RefSeq FTP site. No internet connection?\n')
            print(f'{e}\n{url}')
            # Exit with a failure status (quit() would exit with 0).
            sys.exit(1)

    ################################################################
    # Use crossmap to get hg19 if desired
    #################################################################
    want_hg19 = options.no_hg19 is not False
    if want_hg19:
        chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')
        try:
            requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
        except AttributeError:
            # urllib3 2.x no longer exposes DEFAULT_CIPHERS; the tweak is
            # best-effort, so carry on with the library defaults.
            pass

        # only download if necessary
        if not os.path.exists(chain_path):
            sys.stdout.write('Downloading UCSC database... ')
            sys.stdout.flush()
            url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
            try:
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, confirm it is still required.
                response = requests.get(url, verify=False)
                # Do not cache an HTTP error page as the chain file.
                response.raise_for_status()
                with open(chain_path, 'wb') as chain_out:
                    chain_out.write(response.content)
            except Exception as e:
                print('\n\nCannot connect to UCSC FTP site. No internet connection?\n')
                print(f'Exception: {e}')
                sys.exit(1)

        converted_gtf = os.path.join(
            'data',
            'Homo_sapiens.RefSeq.hg19_converted.' + options.refseq + '.gtf')
        # Reuse a previously converted file when there is one.
        if not os.path.exists(converted_gtf):
            sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
            mapTree, targetChromSizes, sourceChromSizes = read_chain_file(
                chain_path)
            crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

            # Note this file is not sorted! Sort it, drop invalid features
            # and replace the unsorted file with the result.
            unsorted = pybedtools.BedTool(converted_gtf)
            unsorted.sort().remove_invalid().saveas('tmp.txt')
            os.rename('tmp.txt', converted_gtf)

    ################################################################
    #
    #################################################################

    # Iterate through the lines in the refseq data file
    sys.stdout.write('Extracting transcript data from RefSeq...')
    transcript, prevenst, first, genesdata = parse_GTF(
        filename=source_compressed_gtf,
        options=options,
        genesdata=genesdata,
        transIDs=transIDs)
    sys.stdout.write('Done\n')
    sys.stdout.flush()

    # Finalize last transcript and add to Gene object if candidate
    if transcript is not None:
        transcript.finalize()
        if transcript.isCandidate():
            if transcript.ENSG not in genesdata:
                genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                  transcript.ENSG)
            genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript

    # If no transcript was found in the release
    if len(genesdata) == 0:
        print('\n\nNo transcripts found in this release.')
        print('\nNo transcript database created.')
        print("-----------------------------------------------------------------\n")
        sys.exit(1)

    write_temp(os.path.join(options.output_dir, options.output + '.txt'),
               options, transIDs, genesdata)
    enst_records = sort_tmpfile('temp.txt')
    assert (len(enst_records) > 0)
    writeToFile(enst_records, os.path.join(options.output_dir, options.output))

    # Reset the conversion-failure bookkeeping before the hg19 pass.
    failed_conversions['GENE'] = set()
    failed_conversions['GENETYPE'] = set()
    failed_conversions['TRANSTYPE'] = set()
    failed_conversions['ENST'] = set()

    # ################################################################
    # Begin converted GTF conversion
    # ################################################################
    hg19_records = []
    if want_hg19:
        sys.stdout.write('Extracting transcript data for hg19 version...')
        sys.stdout.flush()
        transcript, prevenst, first, genesdata = parse_GTF(
            filename=converted_gtf,
            options=options,
            genesdata=genesdata,
            transIDs=transIDs)

        # Finalize last transcript and add to Gene object if candidate
        if transcript is not None:
            try:
                transcript.finalize()
            except Exception:
                # Converted transcripts may fail to finalize; warn and go on.
                # (Narrowed from a bare except so Ctrl-C still interrupts.)
                warn(transcript)
            if transcript.isCandidate():
                if transcript.ENSG not in genesdata:
                    genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                      transcript.ENSG)
                genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript

        # If no transcript ID from the input file was found in the release
        if len(genesdata) == 0:
            # options.input is None when all transcripts are considered, so
            # it cannot be concatenated into the message unconditionally.
            if options.input is not None:
                print('\n\nNo transcripts from ' + options.input +
                      ' found in the release.')
            else:
                print('\n\nNo transcripts found in the release.')
            print('\nNo transcript database created.')
            print("-----------------------------------------------------------------\n")
            sys.exit(1)

        write_temp(
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted.txt'), options,
            transIDs, genesdata)
        sortedRecords = sort_tmpfile('temp.txt')
        writeToFile(
            sortedRecords,
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted'))
        sys.stdout.write('Completed hg19 version...')
        sys.stdout.flush()

        failed_path = os.path.join(options.output_dir,
                                   options.output + '_failed_conversions.pkl')
        with open(failed_path, 'wb') as failed_out:
            pickle.dump(failed_conversions, failed_out)
        hg19_records = sortedRecords
    # ################################################################
    # END converted GTF conversion
    # ################################################################

    # Remove temporary files
    sys.stdout.write('OK\n')
    sys.stdout.write('Removing temporary files... ')
    sys.stdout.flush()
    os.remove('temp.txt')
    # NOTE(review): deleting the processed GTF means the next run downloads
    # and re-processes it again — confirm this is intended.
    os.remove(source_compressed_gtf)
    print(f"Failed {len(failed_conversions['GENE'])} Genes and {len(failed_conversions['ENST'])} transcripts")
    # Return the number of records written for each build
    return len(enst_records), len(hg19_records)