Example #1
def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    '''

	Description
	-----------
	Convert genome coordinates (in BAM/SAM format) between assemblies.
	BAM/SAM format: http://samtools.sourceforge.net/
	chrom_size is target chromosome size

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	chainfile : file
		Input chain format file.

	infile : file
		Input BAM, SAM, or CRAM format file.

	outfile_prefix : str
		Output prefix.

	chrom_size : dict
		Chromosome size of the *target* assembly, used to build bam header.

	IS_size : int
		Average insert size of paired-end sequencing.

	IS_std : float
		Standard deviation of insert size.

	fold : float
		A mapped pair is considered a "proper pair" if both ends are mapped to
		different strands and the distance between them is less than
		fold * stdev from the mean insert size.

	addtag : bool
		If addtag is set to True, a tag will be added to each alignment:
			Q = QC failed
			N = unmapped (originally unmapped, or originally mapped but failed
			    to lift over to the new assembly)
			M = multiple mapped (alignment can be lifted over to multiple places)
			U = unique mapped (alignment can be lifted over to only one place)

		tags for paired-end sequencing include:
			QF: QC failed
			NN: both read1 and read2 unmapped
			NU: read1 unmapped, read2 uniquely mapped
			NM: read1 unmapped, read2 multiple mapped
			UN: read1 uniquely mapped, read2 unmapped
			UU: both read1 and read2 uniquely mapped
			UM: read1 uniquely mapped, read2 multiple mapped
			MN: read1 multiple mapped, read2 unmapped
			MU: read1 multiple mapped, read2 uniquely mapped
			MM: both read1 and read2 multiple mapped

		tags for single-end sequencing include:
			QF: QC failed
			SN: unmapped
			SM: multiple mapped
			SU: uniquely mapped
	'''

    # determine the input file format (BAM, CRAM or SAM)
    file_type = ''
    if infile.lower().endswith('.bam'):
        file_type = 'BAM'
        comments = ['ORIGINAL_BAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rb')
        if len(samfile.header) == 0:
            print("BAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.cram'):
        file_type = 'CRAM'
        comments = ['ORIGINAL_CRAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rc')
        if len(samfile.header) == 0:
            print("CRAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.sam'):
        file_type = 'SAM'
        comments = ['ORIGINAL_SAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'r')
        if len(samfile.header) == 0:
            print("SAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    else:
        print(
            "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
            file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    # write to file
    if outfile_prefix is not None:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog(
                ["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog([
                "Liftover CRAM file:", infile, '==>', outfile_prefix + '.bam'
            ])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.sam',
                                     "wh",
                                     header=new_header)
            printlog(
                ["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    # write to screen
    else:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover BAM file:", infile])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover CRAM file:", infile])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile('-', "w", header=new_header)
            printlog(["Liftover SAM file:", infile])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    QF = 0
    NN = 0
    NU = 0
    NM = 0
    UN = 0
    UU = 0
    UM = 0
    MN = 0
    MU = 0
    MM = 0
    SN = 0
    SM = 0
    SU = 0
    total_item = 0
    try:
        while True:
            total_item += 1
            old_alignment = next(samfile)
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
            new_alignment.set_tags(old_alignment.get_tags())  # 12 - columns

            # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
            # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            ## Pair-end sequencing
            if old_alignment.is_paired:
                new_alignment.flag = 0x1  #pair-end in sequencing
                if old_alignment.is_read1:
                    new_alignment.flag = new_alignment.flag | 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag = new_alignment.flag | 0x80

                if old_alignment.is_qcfail:
                    new_alignment.flag = new_alignment.flag | 0x200
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6
                    new_alignment.next_reference_id = -1  #7
                    new_alignment.next_reference_start = 0  #8
                    new_alignment.template_length = 0  #9

                    QF += 1
                    if addtag: new_alignment.set_tag(tag="QF", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                #==================================
                # R1 originally unmapped
                #==================================
                elif old_alignment.is_unmapped:
                    new_alignment.flag = new_alignment.flag | 0x4  #2
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6

                    # R1 & R2 originally unmapped
                    if old_alignment.mate_is_unmapped:
                        new_alignment.next_reference_id = -1  #7
                        new_alignment.next_reference_start = 0  #8
                        new_alignment.template_length = 0  #9

                        NN += 1
                        if addtag: new_alignment.set_tag(tag="NN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
                    # R1 unmap, R2 is mapped
                    else:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None

                        #------------------------------------
                        # R1 unmapped, R2 failed to liftover
                        #------------------------------------
                        if read2_maps is None:
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            NN += 1
                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 unique
                        #------------------------------------
                        elif len(read2_maps) == 2:
                            # 2-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 multiple
                        #------------------------------------
                        else:
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2-9
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                #==================================
                # R1 is originally mapped
                #==================================
                else:
                    try:
                        read1_chr = samfile.get_reference_name(
                            old_alignment.reference_id)
                        read1_strand = '-' if old_alignment.is_reverse else '+'
                        read1_start = old_alignment.reference_start
                        read1_end = old_alignment.reference_end
                        read1_maps = map_coordinates(mapping, read1_chr,
                                                     read1_start, read1_end,
                                                     read1_strand)
                    except:
                        read1_maps = None

                    if not old_alignment.mate_is_unmapped:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None
                    #------------------------------------
                    # R1 failed to liftover
                    #------------------------------------
                    if read1_maps is None:
                        # read2 is unmapped or failed to convert
                        if old_alignment.mate_is_unmapped or (read2_maps is
                                                              None):
                            # col2 - col9
                            new_alignment.flag = new_alignment.flag | 0x4  #2
                            new_alignment.reference_id = -1  #3
                            new_alignment.reference_start = 0  #4
                            new_alignment.mapping_quality = 255  #5
                            new_alignment.cigartuples = old_alignment.cigartuples  #6
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            NN += 1
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is multiple mapped
                        else:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = 255  # mapq not available
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 uniquely mapped
                    #------------------------------------
                    elif len(read1_maps) == 2:
                        # col2 - col5
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        new_alignment.reference_id = name_to_id[read1_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read1_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # R2 unmapped before or after conversion
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UN += 1
                            if addtag: new_alignment.set_tag(tag="UN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = abs(
                                new_alignment.reference_start -
                                new_alignment.next_reference_start
                            ) + old_alignment.reference_length
                            # 2
                            if (read2_maps[1][3] != read1_maps[1][3]) and (
                                    new_alignment.template_length <=
                                    IS_size + fold * IS_std) and (
                                        new_alignment.template_length >=
                                        IS_size - fold * IS_std):
                                new_alignment.flag = new_alignment.flag | 0x2

                            UU += 1
                            if addtag: new_alignment.set_tag(tag="UU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # 2 (strand)
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100

                            #7-9
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UM += 1
                            if addtag: new_alignment.set_tag(tag="UM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                    #------------------------------------
                    # R1 multiple mapped
                    #-----------------------------------
                    elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                        # 2
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        # 3-5
                        new_alignment.tid = name_to_id[read1_maps[1]
                                                       [0]]  #chrom
                        new_alignment.pos = read1_maps[1][1]  #start
                        new_alignment.mapq = 255

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # (1) R2 is unmapped
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MN += 1
                            if addtag: new_alignment.set_tag(tag="MN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (2) read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MU += 1
                            if addtag: new_alignment.set_tag(tag="MU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (3) R2 is multiple mapped
                        else:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MM += 1
                            if addtag: new_alignment.set_tag(tag="MM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

            # Single-end sequencing
            else:
                # 7-9
                new_alignment.next_reference_id = -1
                new_alignment.next_reference_start = 0
                new_alignment.template_length = 0

                # (1) originally unmapped
                if old_alignment.is_unmapped:
                    # 2-6
                    new_alignment.flag = new_alignment.flag | 0x4
                    new_alignment.reference_id = -1
                    new_alignment.reference_start = 0
                    new_alignment.mapping_quality = 255
                    new_alignment.cigartuples = old_alignment.cigartuples

                    SN += 1
                    if addtag: new_alignment.set_tag(tag="SN", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                else:
                    new_alignment.flag = 0x0
                    read_chr = samfile.get_reference_name(
                        old_alignment.reference_id)
                    read_strand = '-' if old_alignment.is_reverse else '+'
                    read_start = old_alignment.reference_start
                    read_end = old_alignment.reference_end
                    read_maps = map_coordinates(mapping, read_chr, read_start,
                                                read_end, read_strand)

                    # (2) unmapped after liftover
                    if read_maps is None:
                        new_alignment.flag = new_alignment.flag | 0x4
                        new_alignment.reference_id = -1
                        new_alignment.reference_start = 0
                        new_alignment.mapping_quality = 255

                        SN += 1
                        if addtag: new_alignment.set_tag(tag="SN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (3) unique mapped
                    if len(read_maps) == 2:
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            try:
                                new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                              -1]  #reverse quality string
                            except:
                                new_alignment.query_qualities = []
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.reference_id = name_to_id[read_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        SU += 1
                        if addtag: new_alignment.set_tag(tag="SU", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (4) multiple mapped
                    if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.tid = name_to_id[read_maps[1][0]]
                        new_alignment.pos = read_maps[1][1]
                        new_alignment.mapq = old_alignment.mapq

                        SM += 1
                        if addtag: new_alignment.set_tag(tag="SM", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
    except StopIteration:
        printlog(["Done!"])
    OUT_FILE.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            try:
                printlog([
                    'Sort "%s" and save as "%s"' %
                    (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')
                ])
                pysam.sort("-o", outfile_prefix + '.sorted.bam',
                           outfile_prefix + '.bam')
            except:
                printlog(["Warning: ", "output BAM file was NOT sorted"])
            try:
                printlog(['Index "%s" ...' % (outfile_prefix + '.sorted.bam')])
                pysam.index(outfile_prefix + '.sorted.bam',
                            outfile_prefix + '.sorted.bam.bai')
            except:
                printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item - 1))
    print("	 QC failed: " + str(QF))
    if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0:
        print("	 Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(UU))
        print("\tR1 unique, R2 unmapp (UN): " + str(UN))
        print("\tR1 unique, R2 multiple (UM): " + str(UM))

        print("\tR1 multiple, R2 multiple (MM): " + str(MM))
        print("\tR1 multiple, R2 unique (MU): " + str(MU))
        print("\tR1 multiple, R2 unmapped (MN): " + str(MN))

        print("\tR1 unmap, R2 unmap (NN): " + str(NN))
        print("\tR1 unmap, R2 unique (NU): " + str(NU))
        print("\tR1 unmap, R2 multiple (NM): " + str(NM))
    if max(SN, SU, SM) > 0:
        print("	 Single-end reads:")
        print("\tUniquley mapped (SU): " + str(SU))
        print("\tMultiple mapped (SM): " + str(SM))
        print("\tUnmapped (SN): " + str(SN))
Example #2
def crossmap_vcf_file(mapping,
                      infile,
                      outfile,
                      liftoverfile,
                      refgenome,
                      noCompAllele=False,
                      compress=False):
    '''
	Convert genome coordinates in VCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in VCF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz,
		*.bz2, *.bzip2) file, a local file, or a URL (http://, https://, ftp://) pointing
		to a remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	noCompAllele : bool
		A logical value indicating whether to skip comparing the REF allele to the ALT
		allele after liftover. If False (default), a variant whose REF allele equals its
		ALT allele is written to the unmap file; if True, such variants are kept.

	compress : bool
		If True, compress the output file with gzip.
	'''

    if noCompAllele:
        printlog(
            ["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog([
            "Filter out variants [reference_allele == alternative_allele] ..."
        ])

    # index the reference genome file if it hasn't been done already
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # deal with meta-information lines
        # meta-information lines needed in both mapped and unmapped files
        if line.startswith(('##fileformat', '##INFO', '##FILTER', '##FORMAT',
                            '##ALT', '##SAMPLE', '##PEDIGREE')):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)

        #meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True

        #update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(
                list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id.replace('chr', ''), target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              ('chr' + chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

            print(
                "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                % __version__,
                file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" %
                  datetime.date.today().strftime("%B%d,%Y"),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'): continue
            fields = str.split(line, maxsplit=7)
            total += 1

            chrom = fields[0]
            start = int(fields[1]) - 1  # 0 based
            end = start + len(fields[3])

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                # update chrom
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr

                # update start coordinate
                fields[1] = target_start + 1

                # update ref allele
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()
                except:
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = re.sub(r'END=\d+', 'END=' + str(target_end),
                                   fields[7])

                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                # check if ref_allele is the same as alt_allele
                if noCompAllele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    if fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        fail += 1
            else:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        try:
            printlog(["Compressing \"%s\" ..." % outfile])
            subprocess.call("gzip " + outfile, shell=True)
        except:
            pass
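A minimal usage sketch for the VCF converter, not part of the original example; read_chain_file, its import path and return order, and all file names are assumptions.

# Hypothetical usage sketch -- read_chain_file and all paths are assumed.
from cmmodule.utils import read_chain_file  # assumed import path

chain = 'hg19ToHg38.over.chain.gz'
(mapping, target_chrom_sizes, source_chrom_sizes) = read_chain_file(chain)

crossmap_vcf_file(mapping,
                  'variants.hg19.vcf.gz',  # input VCF; may be compressed or a URL
                  'variants.hg38.vcf',     # mapped records; unmapped go to variants.hg38.vcf.unmap
                  chain,
                  refgenome='hg38.fa',     # target-assembly FASTA, indexed on demand
                  noCompAllele=False,      # drop variants where REF == ALT after liftover
                  compress=False)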
Example #3
def crossmap_bed_file(mapping,
                      inbed,
                      outfile=None,
                      unmapfile=None,
                      cstyle='a'):
    '''
	Convert genome coordinates (in bed format) between assemblies.
	BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	inbed : file
		Input BED file.

	outfile : str, optional
		Prefix of output files.

	unmapfile: str, optional
		Name of file to save unmapped entries. This option will be ignored if outfile is None.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome IDs of the output file are in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    # check if 'outfile' was set. If not set, print to screen, if set, print to file
    if outfile is not None:
        FILE_OUT = open(outfile, 'w')
        if unmapfile is not None:
            UNMAP = open(unmapfile, 'w')
        else:
            UNMAP = open(outfile + '.unmap', 'w')
    else:
        pass

    for line in ireader.reader(inbed):
        if line.startswith(('#', 'track', 'browser')): continue
        if not line.strip(): continue
        line = line.strip()
        fields = line.split()
        strand = '+'

        # filter out lines with fewer than 3 columns
        if len(fields) < 3:
            print("Fewer than 3 fields. Skip " + line, file=sys.stderr)
            if outfile:
                print(line + '\tInvalidBedFormat', file=UNMAP)
            continue
        try:
            int(fields[1])
        except:
            print("Start coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidStartPosition', file=UNMAP)
            continue
        try:
            int(fields[2])
        except:
            print("End coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidEndPosition', file=UNMAP)
            continue
        if int(fields[1]) > int(fields[2]):
            print(
                "\"Start\" coordinate is larger than \"End\" coordinate. Skip "
                + line,
                file=sys.stderr)
            if outfile:
                print(line + '\tStart>End', file=UNMAP)
            continue

        # deal with bed less than 12 columns
        if len(fields) < 12:

            # try to reset strand
            try:
                for f in fields:
                    if f in ['+', '-']:
                        strand = f
            except:
                pass

            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])

            a = map_coordinates(mapping,
                                chrom,
                                start,
                                end,
                                strand,
                                chrom_style=cstyle)

            try:
                if (a is None) or (len(a) % 2 != 0):
                    if outfile is None:
                        print(line + '\tUnmap')
                    else:
                        print(line + '\tUnmap', file=UNMAP)
                    continue

                if len(a) == 2:
                    #reset fields
                    fields[0] = a[1][0]
                    fields[1] = a[1][1]
                    fields[2] = a[1][2]
                    for i in range(
                            0, len(fields)):  #update the strand information
                        if fields[i] in ['+', '-']:
                            fields[i] = a[1][3]

                    if outfile is None:
                        print(line + '\t->\t' +
                              '\t'.join([str(i) for i in fields]))
                    else:
                        print('\t'.join([str(i) for i in fields]),
                              file=FILE_OUT)
                if len(a) > 2:
                    count = 0
                    for j in range(1, len(a), 2):
                        count += 1
                        fields[0] = a[j][0]
                        fields[1] = a[j][1]
                        fields[2] = a[j][2]
                        for i in range(
                                0,
                                len(fields)):  #update the strand information
                            if fields[i] in ['+', '-']:
                                fields[i] = a[j][3]

                        if outfile is None:
                            print(line + '\t' + '(split.' + str(count) + ':' +
                                  ':'.join([str(i) for i in a[j - 1]]) +
                                  ')\t' + '\t'.join([str(i) for i in fields]))
                        else:
                            print('\t'.join([str(i) for i in fields]),
                                  file=FILE_OUT)
            except:
                if outfile is None:
                    print(line + '\tFail')
                else:
                    print(line + '\tFail', file=UNMAP)
                continue

        # deal with bed12 and bed12+8 (genePred format)
        if len(fields) == 12 or len(fields) == 20:
            strand = fields[5]
            if strand not in ['+', '-']:
                raise Exception("Unknown strand: %s. Can only be '+' or '-'." %
                                strand)
            fail_flag = False
            exons_old_pos = annoGene.getExonFromLine(
                line)  #[[chr,st,end],[chr,st,end],...]
            #print exons_old_pos
            exons_new_pos = []
            for e_chr, e_start, e_end in exons_old_pos:
                # a has two elements, first is query, 2nd is target. # [('chr1', 246974830, 246974833,'+'), ('chr1', 248908207, 248908210,'+')]
                a = map_coordinates(mapping,
                                    e_chr,
                                    e_start,
                                    e_end,
                                    strand,
                                    chrom_style=cstyle)
                if a is None:
                    fail_flag = True
                    break

                if len(a) == 2:
                    exons_new_pos.append(a[1])
                else:
                    fail_flag = True
                    break

            if not fail_flag:
                # check if all exons were mapped to the same chromosome and the same strand
                chr_id = set()
                exon_strand = set()

                for e_chr, e_start, e_end, e_strand in exons_new_pos:
                    chr_id.add(e_chr)
                    exon_strand.add(e_strand)
                if len(chr_id) != 1 or len(exon_strand) != 1:
                    fail_flag = True

                if not fail_flag:
                    # build new bed
                    cds_start_offset = int(fields[6]) - int(fields[1])
                    cds_end_offset = int(fields[2]) - int(fields[7])
                    new_chrom = exons_new_pos[0][0]
                    new_chrom_st = exons_new_pos[0][1]
                    new_chrom_end = exons_new_pos[-1][2]
                    new_name = fields[3]
                    new_score = fields[4]
                    new_strand = exons_new_pos[0][3]
                    new_thickStart = new_chrom_st + cds_start_offset
                    new_thickEnd = new_chrom_end - cds_end_offset
                    new_itemRgb = fields[8]
                    new_blockCount = len(exons_new_pos)
                    new_blockSizes = ','.join(
                        [str(o - n) for m, n, o, p in exons_new_pos])
                    new_blockStarts = ','.join([
                        str(n - new_chrom_st) for m, n, o, p in exons_new_pos
                    ])

                    new_bedline = '\t'.join(
                        str(i)
                        for i in (new_chrom, new_chrom_st, new_chrom_end,
                                  new_name, new_score, new_strand,
                                  new_thickStart, new_thickEnd, new_itemRgb,
                                  new_blockCount, new_blockSizes,
                                  new_blockStarts))
                    if check_bed12(new_bedline) is False:
                        fail_flag = True
                    else:
                        if outfile is None:
                            print(line + '\t->\t' + new_bedline)
                        else:
                            print(new_bedline, file=FILE_OUT)

            if fail_flag:
                if outfile is None:
                    print(line + '\tFail')
                else:
                    print(line, file=UNMAP)
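A minimal usage sketch for the BED converter, under the same assumptions about read_chain_file; file names are placeholders.

# Hypothetical usage sketch -- read_chain_file and all paths are assumed.
from cmmodule.utils import read_chain_file  # assumed import path

(mapping, target_chrom_sizes, source_chrom_sizes) = read_chain_file('hg19ToHg38.over.chain.gz')

crossmap_bed_file(mapping,
                  'regions.hg19.bed',
                  outfile='regions.hg38.bed',
                  unmapfile='regions.hg38.bed.unmap',
                  cstyle='a')  # keep the input chromosome ID style
# With outfile=None the converted (or 'Unmap'/'Fail') lines are printed to stdout instead.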
Example #4
def crossmap_gff_file(mapping, ingff, outfile=None, cstyle='a'):
    '''
	Description
	-----------
	Convert genome coordinates (in GFF/GTF format) between assemblies.
	GFF (General Feature Format) lines have nine required fields that must be Tab-separated:

	1. seqname - The name of the sequence. Must be a chromosome or scaffold.
	2. source - The program that generated this feature.
	3. feature - The name of this type of feature. Some examples of standard feature types
	   are "CDS", "start_codon", "stop_codon", and "exon".
	4. start - The starting position of the feature in the sequence. The first base is numbered 1.
	5. end - The ending position of the feature (inclusive).
	6. score - A score between 0 and 1000. If the track line useScore attribute is set to 1
	   for this annotation data set, the score value will determine the level of gray in
	   which this feature is displayed (higher numbers = darker gray). If there is no score
	   value, enter ".".
	7. strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
	8. frame - If the feature is a coding exon, frame should be a number between 0-2 that
	   represents the reading frame of the first base. If the feature is not a coding exon,
	   the value should be '.'.
	9. group - All lines with the same group are linked together into a single item.

	GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3

	GTF (Gene Transfer Format) is a refinement to GFF that tightens the specification. The
	first eight GTF fields are the same as GFF. The group field has been expanded into a
	list of attributes. Each attribute consists of a type/value pair. Attributes must end
	in a semi-colon, and be separated from any following attribute by exactly one space.

	GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4

	We do NOT check whether features (exon, CDS, etc.) that originally belong to the same
	gene were converted to the same chromosome/strand.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	ingff : file
		Input GFF/GTF file.

	outfile : str, optional
		Prefix of output files.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome IDs of the output file are in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    if outfile is not None:
        rand_str = ''.join(
            random.choices(string.ascii_uppercase + string.digits, k=8))
        FILE_OUT = open(outfile, 'w')
        UNMAP = open(outfile + '.' + rand_str + '.unmap', 'w')

    for line in ireader.reader(ingff):
        if line.startswith(('#', 'track', 'browser', 'visibility')): continue
        if not line.strip(): continue

        line = line.strip()
        fields = line.split('\t')
        try:
            start = int(fields[3]) - 1  # 0-based
            end = int(fields[4])
            feature_size = end - start
        except:
            print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' +
                  line,
                  file=sys.stderr)
            if outfile:
                print(line, file=UNMAP)
            continue
        if fields[6] not in ['+', '-', '.']:
            print('Cannot recognize \"strand\". Skip ' + line, file=sys.stderr)
            if outfile:
                print(line, file=UNMAP)
            continue

        strand = '-' if fields[6] == '-' else '+'

        chrom = fields[0]
        a = map_coordinates(mapping,
                            chrom,
                            start,
                            end,
                            strand,
                            chrom_style=cstyle)

        if a is None:
            if outfile is None:
                print(line + '\tfail (no match to target assembly)')
            else:
                print(line, file=UNMAP)
            continue
        if len(a) != 2:
            if outfile is None:
                print(line + '\tfail (multiple matches to target assembly)')
            else:
                print(line, file=UNMAP)
        else:
            if (int(a[1][2]) - int(
                    a[1][1])) != feature_size:  # check if it is an exact match
                if outfile is None:
                    print(line + '\tfail (not exact match)')
                else:
                    print(line, file=UNMAP)
                continue
            fields[0] = a[1][0]  # chrom
            fields[3] = int(a[1][1]) + 1  # start, 1-based
            fields[4] = int(a[1][2])
            fields[6] = a[1][3]

            if outfile is None:
                print(line + '\t->\t' + '\t'.join([str(i) for i in fields]))
            else:
                print('\t'.join([str(i) for i in fields]), file=FILE_OUT)
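A minimal usage sketch for the GFF/GTF converter, again assuming read_chain_file and using placeholder file names.

# Hypothetical usage sketch -- read_chain_file and all paths are assumed.
from cmmodule.utils import read_chain_file  # assumed import path

(mapping, target_chrom_sizes, source_chrom_sizes) = read_chain_file('hg19ToHg38.over.chain.gz')

crossmap_gff_file(mapping, 'genes.hg19.gtf', outfile='genes.hg38.gtf', cstyle='a')
# Unmapped or non-exact features are written to 'genes.hg38.gtf.<random>.unmap';
# with outfile=None, results are printed to stdout.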
Example #5
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome,
                      ref_name):
    '''
	Convert genome coordinates in MAF (Mutation Annotation Format).

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in MAF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz,
		*.bz2, *.bzip2) file, a local file, or a URL (http://, https://, ftp://) pointing
		to a remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	ref_name : str
		The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".
	'''

    # index the reference genome if it hasn't been done already
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('#'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            continue
        elif line.startswith('Hugo_Symbol'):
            print(
                "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                % ("CrossMap", __version__,
                   datetime.date.today().strftime("%B%d,%Y"), liftoverfile,
                   refgenome),
                file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")
        else:

            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])
            #strand = fields[7]

            a = map_coordinates(mapping, chrom, start, end, '+')

            if a is None:
                print(line, file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                target_chr = str(
                    a[1][0]
                )  #target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr

                # update start coordinate
                fields[5] = target_start + 1

                # update end
                fields[6] = target_end

                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start,
                                                target_end).upper()
                except:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)

            else:
                print(line, file=UNMAP)
                fail += 1
                continue
    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
Example #6
0
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele = False, compress = False, cstyle = 'a'):
	'''
	Convert genome coordinates in GVCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	noCompAllele : bool
		Whether to skip comparing ref_allele to alt_allele after liftover. If False
		(default), a lifted variant is written to the ".unmap" file when
		ref_allele == alt_allele; if True, such variants are kept.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

	if noCompAllele:
		logging.info("Keep variants [reference_allele == alternative_allele] ...")
	else:
		logging.info("Filter out variants [reference_allele == alternative_allele] ...")

	# index the reference genome if it hasn't been done already
	if not os.path.exists(refgenome + '.fai'):
		logging.info("Creating index for: %s" % refgenome)
		pysam.faidx(refgenome)
	if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
		logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
		pysam.faidx(refgenome)

	refFasta = pysam.Fastafile(refgenome)

	FILE_OUT = open(outfile ,'w')
	UNMAP = open(outfile + '.unmap','w')

	total_var = 0
	failed_var = 0
	total_region = 0
	failed_region = 0
	chr_template = '1'  # default chrom-ID template; reset below if the '##contig' lines use 'chr1'-style IDs

	for line in ireader.reader(infile):
		if not line.strip():
			continue
		line=line.strip()

		#deal with meta-information lines.
		#meta-information lines needed in both mapped and unmapped files
		if line.startswith('##fileformat'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##INFO'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##FILTER'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##FORMAT'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##ALT'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##SAMPLE'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##PEDIGREE'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##GVCFBlock'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##GATKCommandLine'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
		elif line.startswith('##source'):
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)

		#meta-information lines needed in unmapped files
		elif line.startswith('##assembly'):
			print(line, file=UNMAP)
		elif line.startswith('##contig'):
			print(line, file=UNMAP)
			if 'ID=chr' in line:
				chr_template = 'chr1'
			else:
				chr_template = '1'

		#update contig information
		elif line.startswith('#CHROM'):
			logging.info("Updating contig field ... ")
			target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
			for chr_id in sorted(target_gsize):
				if chr_id.startswith('chr'):
					#if withChr is True:
					print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

			print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
			print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
			print("##originalFile=<%s>" % infile, file=FILE_OUT)
			print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
			print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
			print(line, file=FILE_OUT)
			print(line, file=UNMAP)
			logging.info("Lifting over ... ")

		else:
			if line.startswith('#'):continue

			# process non-variant region
			if 'END=' in line:
				fields = str.split(line,maxsplit=8)
				total_region += 1
				chrom = fields[0]
				start = int(fields[1])-1	 # 0 based
				try:
					m = re.search(r"END\=(\d+)", line)
					end = int(m[1])
				except:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue

				a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
				if a is None:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue
				if len(a) == 2:
					# update chrom
					target_chr = str(a[1][0])	#target_chr is from chain file, could be 'chr1' or '1'
					target_start = a[1][1]
					target_end = a[1][2]
					fields[0] = target_chr

					# update start coordinate
					fields[1] = target_start + 1

					# update END
					fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
					print('\t'.join(map(str, fields)), file=FILE_OUT)

			# process variant line
			else:

				fields = str.split(line,maxsplit=7)
				total_var += 1
				chrom = fields[0]
				start = int(fields[1])-1	 	# 0 based, ref_allele start
				end = start + len(fields[3])	# ref_allele end
				alt_allele = fields[4].replace(' ','').split(',')[0]	# 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;

				a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
				if a is None:
					print (line + "\tFail(Unmap)", file=UNMAP)
					failed_var += 1
					continue

				if len(a) == 2:
					# update chrom
					target_chr = str(a[1][0])	#target_chr is from chain file, could be 'chr1' or '1'
					target_start = a[1][1]
					target_end = a[1][2]
					fields[0] = target_chr

					# update start coordinate
					fields[1] = target_start + 1

					# update ref allele
					try:
						target_chr = update_chromID(refFasta.references[0], target_chr)
						fields[3] = refFasta.fetch(target_chr,target_start,target_end).upper()
					except:
						print(line + "\tFail(No_targetRef)", file=UNMAP)
						failed_var += 1
						continue

					if a[1][3] == '-':
						fields[4] = revcomp_DNA(alt_allele, True) + ',<NON_REF>'

					# check if ref_allele is the same as alt_allele
					if noCompAllele:
						print('\t'.join(map(str, fields)), file=FILE_OUT)
					else:
						if fields[3] != fields[4].split(',')[0]:  # compare lifted REF to the first ALT allele
							print('\t'.join(map(str, fields)), file=FILE_OUT)
						else:
							print (line + "\tFail(REF==ALT)", file=UNMAP)
							failed_var += 1

				else:
					print (line + "\tFail(Multiple_hits)", file=UNMAP)
					failed_var += 1
					continue
	FILE_OUT.close()
	UNMAP.close()
	logging.info ("Total variants: %d" % total_var)
	logging.info ("Variants failed to map: %d" % failed_var)
	logging.info ("Total non-variant regions: %d" % total_region)
	logging.info ("Non-variant regions failed to map: %d" % failed_region)

	if compress:
		try:
			logging.info("Compressing \"%s\" ..." % outfile)
			subprocess.call("gzip " + outfile, shell=True)
		except:
			pass
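
A usage sketch for this GVCF lifter (same assumption: read_chain_file() from cmmodule.utils supplies the mapping; file names are placeholders). With compress=True the mapped output would additionally be gzipped.

from cmmodule.utils import read_chain_file

chain_file = 'GRCh37_to_GRCh38.chain.gz'
mapping, target_chrom_sizes, source_chrom_sizes = read_chain_file(chain_file)

crossmap_gvcf_file(mapping, 'sample.grch37.g.vcf', 'sample.grch38.g.vcf',
                   chain_file, 'GRCh38.fa',
                   noCompAllele=False, compress=False, cstyle='a')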
Example #7
0
def crossmap_region_file(mapping,
                         inbed,
                         outfile=None,
                         min_ratio=0.85,
                         cstyle='a'):
    '''
	Convert large genomic regions (in bed format) between assemblies.
	BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	inbed : file
		Input BED file.

	outfile : str, optional
		Prefix of output files.

	min_ratio : float, optional
		Minimum ratio of query bases that must remap.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

    # check if 'outfile' was set. If not set, print to screen, if set, print to file
    if outfile is not None:
        FILE_OUT = open(outfile, 'w')
        UNMAP = open(outfile + '.unmap', 'w')

    for line in ireader.reader(inbed):
        if line.startswith(('#', 'track', 'browser')): continue
        if not line.strip(): continue
        line = line.strip()
        fields = line.split()
        strand = '+'

        # filter out lines with fewer than 3 columns
        if len(fields) < 3:
            print("Fewer than 3 fields. skip " + line, file=sys.stderr)
            if outfile:
                print(line + '\tInvalidBedFormat', file=UNMAP)
            continue
        try:
            int(fields[1])
        except:
            print("Start coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidStartPosition', file=UNMAP)
            continue
        try:
            int(fields[2])
        except:
            print("End coordinate is not an integer. skip " + line,
                  file=sys.stderr)
            if outfile:
                print(line + '\tInvalidEndPosition', file=UNMAP)
            continue
        if int(fields[1]) > int(fields[2]):
            print(
                "\"Start\" is larger than \"End\" coordinate. skip " + line,
                file=sys.stderr)
            if outfile:
                print(line + '\tStart>End', file=UNMAP)
            continue

        # reset strand if an explicit strand column is present
        for f in fields:
            if f in ['+', '-']:
                strand = f

        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        total_query_length = end - start  #used to calculate q_map_ratio

        a = map_coordinates(mapping,
                            chrom,
                            start,
                            end,
                            strand,
                            chrom_style=cstyle)
        # input: 'chr1',246974830,247024835
        # output: [('chr1', 246974830, 246974833, '+' ), ('chr1', 248908207, 248908210, '+' ), ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212,'+')]
        # [('chr1', 246974830, 246974833), ('chr1', 248908207, 248908210)]

        if (a is None) or (len(a) % 2 != 0):
            if outfile is None:
                print(line + '\tFail\tUnmap')
            else:
                print(line + '\tFail\tUnmap', file=UNMAP)
            continue

        # when len(a) == 2, there is a one-to-one match (i.e., 100% match)
        if len(a) == 2:
            #reset fields to target assembly
            fields[0] = a[1][0]
            fields[1] = a[1][1]
            fields[2] = a[1][2]
            for i in range(0, len(fields)):  #update the strand information
                if fields[i] in ['+', '-']:
                    fields[i] = a[1][3]

            if outfile is None:
                print(line + '\t->\t' + '\t'.join([str(i) for i in fields]) +
                      "\tmap_ratio=1.0000")
            else:
                print('\t'.join([str(i)
                                 for i in fields]) + "\tmap_ratio=1.0000",
                      file=FILE_OUT)

        # when len(a) is even but greater than 2, each segment is a 100% match,
        # but the whole region is not. In this case, check *min_ratio* of the query.
        if len(a) > 2:
            # even indices are query segments, e.g. [('chr1', 246974830, 246974833, '+'), ('chr1', 247024833, 247024835, '+')]
            a_query = a[::2]
            a_query_mapped_nt = sum([i[2] - i[1] for i in a_query])  # e.g. sum([3, 2])
            # odd indices are target segments, e.g. [('chr1', 248908207, 248908210, '+'), ('chr1', 249058210, 249058212, '+')]
            a_target = a[1::2]
            a_target_chroms = set([i[0] for i in a_target])
            a_target_starts = [i[1] for i in a_target]
            a_target_ends = [i[2] for i in a_target]
            map_ratio = a_query_mapped_nt / total_query_length
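            # e.g. for the sample region in the comments above, the query spans
            # 247024835 - 246974830 = 50005 bases, of which only 3 + 2 = 5 lift
            # over, so map_ratio is roughly 0.0001, far below the default 0.85.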

            #map_ratio > cutoff
            if map_ratio >= min_ratio:
                if len(a_target_chroms) == 1:
                    t_chrom = a_target_chroms.pop()
                    fields[0] = t_chrom
                    fields[1] = min(a_target_starts)
                    fields[2] = max(a_target_ends)
                    if outfile is None:
                        print(line + '\t->\t' +
                              '\t'.join([str(i) for i in fields]) +
                              ("\tmap_ratio=%.4f" % map_ratio))
                    else:
                        print('\t'.join([str(i) for i in fields]) +
                              ("\tmap_ratio=%.4f" % map_ratio),
                              file=FILE_OUT)
                else:
                    if outfile is None: print(line + '\tFail\tCrossChroms')
                    else: print(line + '\tFail\tCrossChroms', file=UNMAP)
            # map_ratio > 0 but < cutoff
            elif map_ratio > 0 and map_ratio < min_ratio:
                if outfile is None:
                    print(line + '\tFail' + ("\tmap_ratio=%.4f" % map_ratio))
                else:
                    print(line + '\tFail' + ("\tmap_ratio=%.4f" % map_ratio),
                          file=UNMAP)
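
A usage sketch for the large-region lifter above; read_chain_file() is again assumed to provide the mapping, and the BED path is a placeholder. Regions whose mapped fraction falls below min_ratio end up in the '.unmap' file.

from cmmodule.utils import read_chain_file

mapping, target_chrom_sizes, source_chrom_sizes = read_chain_file('hg19ToHg38.over.chain.gz')

# lift large regions, requiring at least 85% of the query bases to map
crossmap_region_file(mapping, 'peaks.hg19.bed', outfile='peaks.hg38.bed',
                     min_ratio=0.85, cstyle='a')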
Example #8
0
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
	Convert genome coordinates in GVCF format.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		prefix of output files.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.
	'''

    # index the reference genome if it hasn't been done already
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        #deal with meta-information lines.
        #meta-information lines needed in both mapped and unmapped files
        if line.startswith('##fileformat'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##INFO'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FILTER'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##FORMAT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##ALT'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##SAMPLE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##PEDIGREE'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##GVCFBlock'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##GATKCommandLine'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        elif line.startswith('##source'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)

        #meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True

        #update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(
                list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id.replace('chr', ''), target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              ('chr' + chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (chr_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

            print(
                "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                % __version__,
                file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" %
                  datetime.date.today().strftime("%B%d,%Y"),
                  file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'): continue

            # process non-variant region
            if 'END=' in line:
                fields = str.split(line, maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                try:
                    m = re.search(r"END\=(\d+)", line)
                    end = int(m[1])
                except:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(
                        a[1][0]
                    )  #target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)),
                                                  ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)

            # process variant line
            else:

                fields = str.split(line, maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                alt_allele = fields[4].replace(' ', '').split(
                    ','
                )[0]  # 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue

                if len(a) == 2:
                    # update chrom
                    target_chr = str(
                        a[1][0]
                    )  #target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr

                    # update start coordinate
                    fields[1] = target_start + 1

                    # update ref allele
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()

                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(alt_allele,
                                                True) + ',<NON_REF>'

                    # ref_allele and alt_allele are different
                    if fields[3] != alt_allele:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        failed_var += 1
                else:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue
    FILE_OUT.close()
    UNMAP.close()
    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])
Example #9
0
def crossmap_wig_file(mapping,
                      in_file,
                      out_prefix,
                      taget_chrom_size,
                      in_format,
                      binSize=100000):
    '''
	Description
	-----------
	Convert genome coordinates (in wiggle/bigwig format) between assemblies.
	wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
	bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	in_file : file
		Input file in wig or bigwig format. Both "variableStep" and "fixedStep" wiggle
		lines are supported.

	out_prefix : str
		Prefix of output files.

	taget_chrom_size : dict
		Chromosome size of the target genome assembly. Key is chromosome ID, value is the
		length of the chromosome. Note, the chromosome ID and length information were
		extracted from the chain file, therefore, the chrom_IDs can be with or without
		the leading "chr".

	in_format : str
		Either "wiggle" or "bigwig"

	binSize : int
		The chunk size when reading bigwig file in each iteration.
	'''

    OUT_FILE1 = open(out_prefix + '.bgr', 'w')  # original bgr file
    OUT_FILE2 = open(out_prefix + '.sorted.bgr', 'w')  # sorted bgr file
    OUT_FILE3 = pyBigWig.open(out_prefix + '.bw', "w")  # bigwig file

    chrom_style = 'chr1'

    if in_format.upper() == "WIGGLE":
        logging.info("Liftover wiggle file \"%s\" to bedGraph file \"%s\"" %
                     (in_file, out_prefix + '.bgr'))

        for chrom, start, end, strand, score in wiggleReader(in_file):
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            if maps is None:
                continue
            if len(maps) == 2:
                print('\t'.join([
                    str(i)
                    for i in [maps[1][0], maps[1][1], maps[1][2], score]
                ]),
                      file=OUT_FILE1)
            else:
                continue
            maps[:] = []
        OUT_FILE1.close()

        logging.info("Merging overlapped entries in bedGraph file")
        for (chrom, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
            print('\t'.join([str(i) for i in (chrom, start, end, score)]),
                  file=OUT_FILE2)
        OUT_FILE2.close()

        os.remove(out_prefix + '.bgr')  #remove .bgr, keep .sorted.bgr

        # make bigwig header
        target_chroms_sorted = []
        for k in sorted(taget_chrom_size.keys()):
            i_chrom = update_chromID(chrom_style, k)
            i_value = taget_chrom_size[k]
            target_chroms_sorted.append((i_chrom, i_value))

        # add bigwig header
        logging.info("Writing header to \"%s\" ..." % (out_prefix + '.bw'))
        OUT_FILE3.addHeader(target_chroms_sorted)

        # add entries to bigwig file
        logging.info("Writing entries to \"%s\" ..." % (out_prefix + '.bw'))
        for line in ireader.reader(out_prefix + '.sorted.bgr'):
            r_chr, r_st, r_end, r_value = line.split()
            OUT_FILE3.addEntries([r_chr], [int(r_st)],
                                 ends=[int(r_end)],
                                 values=[float(r_value)])

        OUT_FILE3.close()

    elif in_format.upper() == "BIGWIG":
        logging.info("Liftover bigwig file %s to bedGraph file %s:" %
                     (in_file, out_prefix + '.bgr'))
        for chrom, start, end, score in bigwigReader(in_file):
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            try:
                if maps is None: continue
                if len(maps) == 2:
                    print('\t'.join([
                        str(i)
                        for i in [maps[1][0], maps[1][1], maps[1][2], score]
                    ]),
                          file=OUT_FILE1)
                else:
                    continue
            except:
                continue
            maps[:] = []
        OUT_FILE1.close()

        logging.info("Merging overlapped entries in bedGraph file")
        for (chrom, start, end, score) in bgrMerge.merge(out_prefix + '.bgr'):
            print('\t'.join([str(i) for i in (chrom, start, end, score)]),
                  file=OUT_FILE2)
        OUT_FILE2.close()
        os.remove(out_prefix + '.bgr')  #remove .bgr, keep .sorted.bgr

        logging.info("Writing header to \"%s\" ..." % (out_prefix + '.bw'))

        # make bigwig header
        target_chroms_sorted = []
        for k in sorted(taget_chrom_size.keys()):
            i_chrom = update_chromID(chrom_style, k)
            i_value = taget_chrom_size[k]
            target_chroms_sorted.append((i_chrom, i_value))

        # add bigwig header
        OUT_FILE3.addHeader(target_chroms_sorted)

        # add entries to bigwig file
        logging.info("Writing entries to \"%s\" ..." % (out_prefix + '.bw'))
        for line in ireader.reader(out_prefix + '.sorted.bgr'):
            r_chr, r_st, r_end, r_value = line.split()
            OUT_FILE3.addEntries([r_chr], [int(r_st)], [int(r_end)],
                                 [float(r_value)])
        OUT_FILE3.close()
    else:
        raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'")
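
A usage sketch for the wiggle/bigwig lifter, again assuming read_chain_file() returns the mapping together with the target chromosome sizes needed for the bigwig header; file names are placeholders.

from cmmodule.utils import read_chain_file

mapping, target_chrom_sizes, source_chrom_sizes = read_chain_file('hg19ToHg38.over.chain.gz')

# writes <out_prefix>.sorted.bgr and <out_prefix>.bw
crossmap_wig_file(mapping, 'signal.hg19.bw', 'signal.hg38', target_chrom_sizes,
                  in_format='bigwig')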