Ejemplo n.º 1
0
def crossmap(source_compressed_gtf):
    """Lift a GTF over with the UCSC hg38->hg19 chain and sort the result.

    The chain file ``data/hg38ToHg19.over.chain.gz`` is downloaded on first
    use. The GTF is then converted with ``read_chain_file`` and
    ``crossmap_gff_file`` and finally rewritten through pybedtools
    ``sort()`` / ``remove_invalid()``.

    Args:
        source_compressed_gtf: Path of the input GTF. The output path is
            obtained by replacing ``.gtf.gz`` with ``.hg19_converted.gtf``.

    Raises:
        SystemExit: With status 1 when the chain file cannot be downloaded
            or stored.
    """
    # NOTE(review): the appended text has no leading ':' separator and is
    # appended again on every call; the attribute may also be missing in
    # newer urllib3 releases -- confirm against the pinned version.
    requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
    chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')

    # only download if necessary
    if not os.path.exists(chain_path):
        sys.stdout.write('Downloading UCSC database... ')
        sys.stdout.flush()
        url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
        partial_path = chain_path + '.part'
        try:
            os.makedirs('data', exist_ok=True)
            # NOTE(review): verify=False turns off TLS certificate checks;
            # confirm this is still required for the UCSC host.
            p = requests.get(url, verify=False)
            # Without this an HTTP error page would be stored as the chain file.
            p.raise_for_status()
            # Write next to the target and move into place only when complete,
            # so a failed write is not mistaken for a finished download later.
            with open(partial_path, 'wb') as o:
                o.write(p.content)
            os.replace(partial_path, chain_path)
        except Exception as e:
            print(
                '\n\nCannot connect to UCSC FTP site. No internet connection?\n'
            )
            print(f'Exception: {e}')
            sys.exit(1)

    sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
    mapTree, targetChromSizes, sourceChromSizes = read_chain_file(chain_path)

    converted_gtf = source_compressed_gtf.replace('.gtf.gz',
                                                  '.hg19_converted.gtf')
    crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

    # Note this file is not sorted!
    # The sorted copy is written beside the target and swapped in with
    # os.replace, which overwrites an existing destination on every platform.
    sorted_tmp = converted_gtf + '.sorted.tmp'
    a = pybedtools.BedTool(converted_gtf)
    a.sort().remove_invalid().saveas(sorted_tmp)
    os.replace(sorted_tmp, converted_gtf)
Ejemplo n.º 2
0
                              infile=in_file,
                              outfile_prefix=out_file,
                              chrom_size=targetChromSizes,
                              IS_size=args.insert_size,
                              IS_std=args.insert_size_stdev,
                              fold=args.insert_size_fold,
                              addtag=args.add_tags,
                              cstyle=args.cstyle)

        elif command == 'gff':
            chain_file = args.chain
            in_file = args.in_gff
            out_file = args.out_gff
            (mapTree, targetChromSizes,
             sourceChromSizes) = read_chain_file(chain_file)
            crossmap_gff_file(mapTree, in_file, out_file, cstyle=args.cstyle)

        elif command == 'wig':
            chain_file = args.chain
            in_file = args.in_wig
            out_file = args.out_wig
            (mapTree, targetChromSizes,
             sourceChromSizes) = read_chain_file(chain_file)
            crossmap_wig_file(mapTree,
                              in_file,
                              out_file,
                              targetChromSizes,
                              in_format='wiggle',
                              cstyle=args.cstyle)

        elif command == 'bigwig':
Ejemplo n.º 3
0
def process_data(options, genome_build):
    """Build the Ensembl transcript database and, optionally, its hg19 version.

    Steps, in order: read the optional transcript ID list, load the
    per-release info file for Ensembl releases below 75, download the Ensembl
    GTF into ``data/`` if missing, optionally lift it over to hg19, parse the
    GTF(s) with ``parse_GTF`` and write the sorted records with
    ``write_temp`` / ``sort_tmpfile`` / ``writeToFile``.

    Args:
        options: Parsed command-line options. Attributes read here are
            ``input``, ``ensembl``, ``no_hg19``, ``output_dir`` and ``output``.
        genome_build: Genome build label used in the Ensembl GTF file name.

    Returns:
        tuple: ``(number of records written, number of hg19 records written)``;
        the second value is 0 when the hg19 branch is skipped.

    Raises:
        SystemExit: With status 1 when a download fails or no transcript
            ends up in the database.
    """
    # Dictionary of Gene objects
    genesdata = dict()

    # Load custom transcript IDs
    transIDs = None
    if options.input is not None:
        transIDs = readTranscriptIDs(options.input)
        print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' +
              options.input + ' are considered\n')
        not_found_msg = ('\n\nNo transcripts from ' + options.input +
                         ' found in Ensembl release.')
    else:
        print('\nAll transcripts from the Ensembl release are considered\n')
        # options.input is None here, so it must not be concatenated into
        # the message (that would raise TypeError).
        not_found_msg = '\n\nNo transcripts found in Ensembl release.'

    # Load candidate and CCDS data for Ensembl <75
    # NOTE(review): `candidates` is filled here but not read again in this
    # function -- confirm whether it should be handed to parse_GTF.
    candidates = dict()
    if int(options.ensembl) < 75:
        datadir = os.path.dirname(os.path.realpath(__file__)) + '/data'
        with open(datadir + '/info' + options.ensembl + '.txt') as info_file:
            for line in info_file:
                line = line.strip()
                if line == '':
                    continue
                cols = line.split('\t')
                candidates.setdefault(cols[0], dict())[cols[1]] = int(cols[2])

    ######################################################################

    # Download Ensembl data if necessary
    gtf_name = 'Homo_sapiens.' + genome_build + '.' + options.ensembl + '.gtf.gz'
    source_compressed_gtf = os.path.join('data', gtf_name)
    if not os.path.exists(source_compressed_gtf):
        sys.stdout.write('Downloading Ensembl database... ')
        sys.stdout.flush()

        url = ('ftp://ftp.ensembl.org/pub/release-' + options.ensembl +
               '/gtf/homo_sapiens/' + gtf_name)
        try:
            # wget saves into the current directory; move it under data/.
            wget.download(url)
            os.rename(gtf_name, source_compressed_gtf)
        except Exception as e:
            print(
                '\n\nCannot connect to Ensembl FTP site. No internet connection?\n'
            )
            print(f'{e}\n{url}')
            sys.exit(1)

    ################################################################
    # Use crossmap to get hg19 if desired
    #################################################################
    # NOTE(review): the branch runs when no_hg19 is anything other than
    # False, which reads inverted relative to the option name -- confirm
    # against the argument parser.
    if options.no_hg19 is not False:
        # NOTE(review): the appended text has no leading ':' separator and
        # is appended again on every call -- confirm against the pinned
        # urllib3 version.
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
        chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')
        # only download if necessary
        if not os.path.exists(chain_path):
            sys.stdout.write('Downloading UCSC database... ')
            sys.stdout.flush()
            url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
            partial_path = chain_path + '.part'
            try:
                os.makedirs('data', exist_ok=True)
                # NOTE(review): verify=False turns off TLS certificate checks.
                p = requests.get(url, verify=False)
                # Without this an HTTP error page would be stored as the chain file.
                p.raise_for_status()
                # Move into place only once fully written.
                with open(partial_path, 'wb') as o:
                    o.write(p.content)
                os.replace(partial_path, chain_path)
            except Exception as e:
                print(
                    '\n\nCannot connect to UCSC FTP site. No internet connection?\n'
                )
                print(f'Exception: {e}')
                sys.exit(1)

        sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
        mapTree, targetChromSizes, sourceChromSizes = read_chain_file(
            chain_path)
        converted_gtf = os.path.join(
            'data', 'Homo_sapiens.hg19_converted' + options.ensembl + '.gtf')
        crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

        # Note this file is not sorted!
        # os.replace overwrites the existing unsorted file on every platform.
        sorted_tmp = converted_gtf + '.sorted.tmp'
        a = pybedtools.BedTool(converted_gtf)
        a.sort().remove_invalid().saveas(sorted_tmp)
        os.replace(sorted_tmp, converted_gtf)

    ################################################################
    #
    #################################################################
    # Iterate through the lines in the ensembl data file
    sys.stdout.write('Extracting transcript data from Ensembl...')

    transcript, prevenst, first, genesdata = parse_GTF(
        filename=source_compressed_gtf,
        options=options,
        genesdata=genesdata,
        transIDs=transIDs)

    sys.stdout.write('Done\n')
    sys.stdout.flush()

    # Finalize last transcript and add to Gene object if candidate
    if transcript is not None:
        transcript.finalize()
        if transcript.isCandidate():
            if transcript.ENSG not in genesdata:
                genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                  transcript.ENSG)
            genesdata[transcript.ENSG].TRANSCRIPTS[
                transcript.ENST] = transcript

    # If no transcript ID from the input file was found in the Ensembl release
    if len(genesdata) == 0:
        print(not_found_msg)
        print('\nNo transcript database created.')
        print(
            "-----------------------------------------------------------------\n"
        )
        sys.exit(1)

    write_temp(os.path.join(options.output_dir, options.output + '.txt'),
               options, transIDs, genesdata)
    enst_records = sort_tmpfile('temp.txt')
    assert (len(enst_records) > 0)
    writeToFile(enst_records, os.path.join(options.output_dir, options.output))

    # Reset the failure bookkeeping before the hg19 pass.
    failed_conversions['GENE'] = set()
    failed_conversions['GENETYPE'] = set()
    failed_conversions['TRANSTYPE'] = set()
    failed_conversions['ENST'] = set()
    # ################################################################
    # Begin converted GTF conversion
    # ################################################################
    hg19_records = []
    if options.no_hg19 is not False:
        sys.stdout.write('Extracting transcript data for hg19 version...')
        sys.stdout.flush()
        transcript, prevenst, first, genesdata = parse_GTF(
            filename=converted_gtf,
            options=options,
            genesdata=genesdata,
            transIDs=transIDs)

        # Finalize last transcript and add to Gene object if candidate
        if transcript is not None:
            try:
                transcript.finalize()
            except Exception:
                # Best effort: report the transcript and carry on.
                warn(transcript)

            if transcript.isCandidate():
                if transcript.ENSG not in genesdata:
                    genesdata[transcript.ENSG] = Gene(transcript.GENE,
                                                      transcript.ENSG)
                genesdata[transcript.ENSG].TRANSCRIPTS[
                    transcript.ENST] = transcript

        # If no transcript ID from the input file was found in the Ensembl release
        if len(genesdata) == 0:
            print(not_found_msg)
            print('\nNo transcript database created.')
            print(
                "-----------------------------------------------------------------\n"
            )
            sys.exit(1)

        write_temp(
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted.txt'), options,
            transIDs, genesdata)
        sortedRecords = sort_tmpfile('temp.txt')
        writeToFile(
            sortedRecords,
            os.path.join(options.output_dir,
                         options.output + '.hg19_converted'))
        sys.stdout.write('Completed hg19 version...')
        sys.stdout.flush()
        failed_path = os.path.join(options.output_dir,
                                   options.output + '_failed_conversions.pkl')
        # Context manager so the pickle is flushed and closed deterministically.
        with open(failed_path, 'wb') as failed_file:
            pickle.dump(failed_conversions, failed_file)
        hg19_records = sortedRecords
    # ################################################################
    # END converted GTF conversion
    # ################################################################

    # Remove temporary files
    sys.stdout.write('OK\n')
    sys.stdout.write('Removing temporary files... ')
    sys.stdout.flush()
    os.remove('temp.txt')

    print(
        f"Failed {len(failed_conversions['GENE'])} Genes and {len(failed_conversions['ENST'])} transcripts"
    )

    # Return sorted records
    return len(enst_records), len(hg19_records)
Ejemplo n.º 4
0
				chain_file = args[1]
				in_file = args[2]
				out_file = args[3]
				(mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
				crossmap_region_file(mapTree, in_file, out_file, min_ratio = options.min_map_ratio)
			else:
				region_help()
				parser.print_usage()
				sys.exit(0)

		elif sys.argv[1].lower() == 'gff':
			if len(sys.argv) == 4:
				chain_file = sys.argv[2]
				in_file = sys.argv[3]
				(mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
				crossmap_gff_file(mapTree, in_file, None)
			elif len(sys.argv) == 5:
				chain_file = sys.argv[2]
				in_file = sys.argv[3]
				out_file = sys.argv[4]
				(mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
				crossmap_gff_file(mapTree, in_file, out_file)
			else:
				gff_help()
				sys.exit(0)
		elif sys.argv[1].lower() == 'wig':
			if len(sys.argv) == 5:
				chain_file = sys.argv[2]
				in_file = sys.argv[3]
				out_file = sys.argv[4]
				(mapTree, targetChromSizes, sourceChromSizes) = read_chain_file(chain_file)
Ejemplo n.º 5
0
def process_data(options):
    """Build the RefSeq transcript database and, optionally, its hg19 version.

    Steps, in order: read the optional transcript ID list, download and
    re-compress the RefSeq GTF (rewriting each line with
    ``replace_chrom_names``) if it is not already under ``data/``, optionally
    lift it over to hg19, parse the GTF(s) with ``parse_GTF`` and write the
    sorted records with ``write_temp`` / ``sort_tmpfile`` / ``writeToFile``.

    Args:
        options: Parsed command-line options. Attributes read here are
            ``input``, ``nm_only``, ``refseq``, ``no_hg19``, ``output_dir``
            and ``output``.

    Returns:
        tuple: ``(number of records written, number of hg19 records written)``;
        the second value is 0 when the hg19 branch is skipped.

    Raises:
        SystemExit: With status 1 when a download or conversion step fails
            or no transcript ends up in the database.
    """
    # Local import: used only for the bgzip calls below.
    import subprocess

    # Dictionary of Gene objects
    genesdata = dict()

    # Load custom transcript IDs
    transIDs = None
    if options.input is not None:
        transIDs = readTranscriptIDs(options.input)
        print('\nOnly ' + str(len(transIDs)) + ' transcripts read from ' + options.input + ' are considered\n')
    else:
        nm = 'All transcripts from the release are considered'
        if options.nm_only:
            nm = 'All NM transcripts from the release are considered'
        print(f'\n{nm}\n')

    ######################################################################
    # Download RefSeq data if necessary
    gz_name = options.refseq + '_genomic.gtf.gz'
    gtf_name = options.refseq + '_genomic.gtf'
    # https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gtf.gz
    source_compressed_gtf = os.path.join('data', gz_name)

    if not os.path.exists(source_compressed_gtf):
        sys.stdout.write('Downloading RefSeq database... ')
        sys.stdout.flush()
        url = 'https://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Homo_sapiens/reference/' + options.refseq + '/' + gz_name
        try:
            wget.download(url)
            sys.stdout.flush()
            # Convert chromosome names #Note we will lose unmapped transcripts here!
            print(f'\nUnzipping {gz_name}')
            # Argument list instead of a shell string: the release name is
            # never interpreted by a shell, and a failing bgzip raises.
            subprocess.run(['bgzip', '-d', gz_name], check=True)
            print(f'Parsing {gtf_name}')
            with open('temp.txt', 'w') as out, open(gtf_name, 'r') as g:
                for line in g:
                    if line.startswith('#'):
                        continue
                    try:
                        new_line = replace_chrom_names(line)
                    except Exception:
                        print(f'Failed: {line}')
                        sys.exit(1)
                    if new_line:
                        out.write(new_line)
            print(f'Compressing the GTF into: {source_compressed_gtf}')
            # Compress into a side file and move it into place only on
            # success, so a failed run is not mistaken for a finished one.
            partial_gtf = source_compressed_gtf + '.part'
            with open(partial_gtf, 'wb') as gz_out:
                subprocess.run(['bgzip', '-c', 'temp.txt'], stdout=gz_out, check=True)
            os.replace(partial_gtf, source_compressed_gtf)
            os.remove('temp.txt')
        except Exception as e:
            print('\n\nCannot connect to RefSeq FTP site. No internet connection?\n')
            print(f'{e}\n{url}')
            sys.exit(1)
    ################################################################
    # Use crossmap to get hg19 if desired
    #################################################################
    # NOTE(review): the branch runs when no_hg19 is anything other than
    # False, which reads inverted relative to the option name -- confirm
    # against the argument parser.
    if options.no_hg19 is not False:
        # NOTE(review): the appended text has no leading ':' separator and
        # is appended again on every call -- confirm against the pinned
        # urllib3 version.
        requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS += 'HIGH:!DH:!aNULL'  # Needed for UCSC
        chain_path = os.path.join('data', 'hg38ToHg19.over.chain.gz')
        # only download if necessary
        if not os.path.exists(chain_path):
            sys.stdout.write('Downloading UCSC database... ')
            sys.stdout.flush()
            url = 'https://hgdownload.soe.ucsc.edu/goldenPath/hg38/liftOver/hg38ToHg19.over.chain.gz'
            partial_chain = chain_path + '.part'
            try:
                os.makedirs('data', exist_ok=True)
                # NOTE(review): verify=False turns off TLS certificate checks.
                p = requests.get(url, verify=False)
                # Without this an HTTP error page would be stored as the chain file.
                p.raise_for_status()
                with open(partial_chain, 'wb') as o:
                    o.write(p.content)
                os.replace(partial_chain, chain_path)
            except Exception as e:
                print('\n\nCannot connect to UCSC FTP site. No internet connection?\n')
                print(f'Exception: {e}')
                sys.exit(1)

        converted_gtf = os.path.join('data', 'Homo_sapiens.RefSeq.hg19_converted.' + options.refseq + '.gtf')
        if not os.path.exists(converted_gtf):
            sys.stdout.write('\nMaking a hg19-conveterted GTF file\n')
            mapTree, targetChromSizes, sourceChromSizes = read_chain_file(chain_path)
            crossmap_gff_file(mapTree, source_compressed_gtf, converted_gtf)

            # Note this file is not sorted!
            # os.replace overwrites the existing unsorted file on every platform.
            sorted_tmp = converted_gtf + '.sorted.tmp'
            a = pybedtools.BedTool(converted_gtf)
            a.sort().remove_invalid().saveas(sorted_tmp)
            os.replace(sorted_tmp, converted_gtf)

    ################################################################
    #
    #################################################################
    # Iterate through the lines in the refseq data file
    sys.stdout.write('Extracting transcript data from RefSeq...')

    transcript, prevenst, first, genesdata = parse_GTF(filename=source_compressed_gtf,
                                                       options=options,
                                                       genesdata=genesdata,
                                                       transIDs=transIDs)

    sys.stdout.write('Done\n')
    sys.stdout.flush()
    # Finalize last transcript and add to Gene object if candidate
    if transcript is not None:
        transcript.finalize()
        if transcript.isCandidate():
            if transcript.ENSG not in genesdata:
                genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG)
            genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript

    # If no transcript ID from the input file was found in the release
    if len(genesdata) == 0:
        print('\n\nNo transcripts found in this release.')
        print('\nNo transcript database created.')
        print("-----------------------------------------------------------------\n")
        sys.exit(1)

    write_temp(os.path.join(options.output_dir, options.output + '.txt'), options, transIDs, genesdata)
    enst_records = sort_tmpfile('temp.txt')
    assert (len(enst_records) > 0)
    writeToFile(enst_records, os.path.join(options.output_dir, options.output))

    # Reset the failure bookkeeping before the hg19 pass.
    failed_conversions['GENE'] = set()
    failed_conversions['GENETYPE'] = set()
    failed_conversions['TRANSTYPE'] = set()
    failed_conversions['ENST'] = set()
    # ################################################################
    # Begin converted GTF conversion
    # ################################################################
    hg19_records = []
    if options.no_hg19 is not False:
        sys.stdout.write('Extracting transcript data for hg19 version...')
        sys.stdout.flush()
        transcript, prevenst, first, genesdata = parse_GTF(filename=converted_gtf,
                                                           options=options,
                                                           genesdata=genesdata,
                                                           transIDs=transIDs)

        # Finalize last transcript and add to Gene object if candidate
        if transcript is not None:
            try:
                transcript.finalize()
            except Exception:
                # Best effort: report the transcript and carry on.
                warn(transcript)

            if transcript.isCandidate():
                if transcript.ENSG not in genesdata:
                    genesdata[transcript.ENSG] = Gene(transcript.GENE, transcript.ENSG)
                genesdata[transcript.ENSG].TRANSCRIPTS[transcript.ENST] = transcript

        # If no transcript ID from the input file was found in the release
        if len(genesdata) == 0:
            # options.input may be None (no ID list given); concatenating it
            # would raise TypeError instead of printing the message.
            if options.input is not None:
                print('\n\nNo transcripts from ' + options.input + ' found in the release.')
            else:
                print('\n\nNo transcripts found in the release.')
            print('\nNo transcript database created.')
            print("-----------------------------------------------------------------\n")
            sys.exit(1)

        write_temp(os.path.join(options.output_dir, options.output + '.hg19_converted.txt'), options, transIDs,
                   genesdata)
        sortedRecords = sort_tmpfile('temp.txt')
        writeToFile(sortedRecords, os.path.join(options.output_dir, options.output + '.hg19_converted'))
        sys.stdout.write('Completed hg19 version...')
        sys.stdout.flush()
        failed_path = os.path.join(options.output_dir, options.output + '_failed_conversions.pkl')
        # Context manager so the pickle is flushed and closed deterministically.
        with open(failed_path, 'wb') as failed_file:
            pickle.dump(failed_conversions, failed_file)
        hg19_records = sortedRecords
    # ################################################################
    # END converted GTF conversion
    # ################################################################

    # Remove temporary files
    sys.stdout.write('OK\n')
    sys.stdout.write('Removing temporary files... ')
    sys.stdout.flush()
    os.remove('temp.txt')
    os.remove(source_compressed_gtf)

    print(f"Failed {len(failed_conversions['GENE'])} Genes and {len(failed_conversions['ENST'])} transcripts")

    # Return sorted records
    return len(enst_records), len(hg19_records)