예제 #1
0
def crossmap_vcf_file(mapping,
                      infile,
                      outfile,
                      liftoverfile,
                      refgenome,
                      noCompAllele=False,
                      compress=False):
    '''
    Convert genome coordinates in VCF format.

    Converted records are written to ``outfile``. Records that cannot be
    converted are written to ``outfile + '.unmap'`` with an extra trailing
    column naming the reason (``Fail(Unmap)``, ``Fail(Multiple_hits)``,
    ``Fail(KeyError)`` or ``Fail(REF==ALT)``).

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Path of the output VCF file. The unmapped records go to the same
        path with ".unmap" appended.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file. Only its name
        is used here (recorded in the output header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format. A
        ".fai" index is created next to it if it does not exist yet.

    noCompAllele : bool
        If False (default), a variant whose reference allele equals its
        alternative allele after liftover is written to the unmap file as
        "Fail(REF==ALT)". If True, no comparison is made and such variants
        are kept in the output.

    compress : bool
        If True, run "gzip" on ``outfile`` once the conversion is finished.
        Compression is best-effort: a failure to launch gzip is logged and
        otherwise ignored.
    '''

    if noCompAllele:
        printlog(
            ["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog([
            "Filter out variants [reference_allele == alternative_allele] ..."
        ])

    # index refgenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines needed in both mapped and unmapped files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE')
    # The word boundary keeps keys that merely end in "END" (e.g. CIEND)
    # from being rewritten.
    end_pattern = re.compile(r'\bEND=\d+')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for raw_line in ireader.reader(infile):
                line = raw_line.strip()
                if not line:
                    continue

                if line.startswith(shared_meta):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)

                # meta-information lines needed in unmapped files only
                elif line.startswith('##assembly'):
                    print(line, file=UNMAP)
                elif line.startswith('##contig'):
                    print(line, file=UNMAP)
                    if 'ID=chr' in line:
                        withChr = True

                # column header: emit the target contigs and liftover
                # provenance right before it
                elif line.startswith('#CHROM'):
                    printlog(["Updating contig field ... "])
                    assembly = os.path.basename(refgenome)
                    target_gsize = dict(
                        zip(refFasta.references, refFasta.lengths))
                    for chr_id in sorted(target_gsize):
                        # Follow the naming style ('chr1' vs '1') seen in
                        # the input ##contig lines.
                        has_prefix = chr_id.startswith('chr')
                        if withChr and not has_prefix:
                            contig_id = 'chr' + chr_id
                        elif has_prefix and not withChr:
                            # strip only the leading 'chr'
                            contig_id = chr_id[3:]
                        else:
                            contig_id = chr_id
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (contig_id, target_gsize[chr_id], assembly),
                              file=FILE_OUT)

                    print(
                        "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                        % __version__,
                        file=FILE_OUT)
                    print("##liftOverChainFile=<%s>" % liftoverfile,
                          file=FILE_OUT)
                    print("##originalFile=<%s>" % infile, file=FILE_OUT)
                    print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                    print("##liftOverDate=<%s>" %
                          datetime.date.today().strftime("%B%d,%Y"),
                          file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    printlog(["Lifting over ... "])

                # any other header/comment line is dropped
                elif line.startswith('#'):
                    continue

                else:
                    # Split off the first 7 columns; fields[7] holds INFO
                    # plus everything after it, untouched.
                    fields = line.split(maxsplit=7)
                    total += 1

                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based
                    end = start + len(fields[3])

                    a = map_coordinates(mapping, chrom, start, end, '+')
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        fail += 1
                        continue

                    # exactly one (source, target) pair means a unique hit
                    if len(a) != 2:
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        fail += 1
                        continue

                    # target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]

                    # update chrom and (1-based) start coordinate
                    fields[0] = target_chr
                    fields[1] = target_start + 1

                    # update ref allele from the target genome, using the
                    # naming style of the FASTA file for the lookup
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    try:
                        fields[3] = refFasta.fetch(target_chr, target_start,
                                                   target_end).upper()
                    except Exception:
                        # Any fetch failure (unknown contig, bad region)
                        # sends the record to the unmap file; unlike a bare
                        # except this lets Ctrl-C / SystemExit through.
                        print(line + "\tFail(KeyError)", file=UNMAP)
                        fail += 1
                        continue

                    # update END if any
                    fields[7] = end_pattern.sub('END=' + str(target_end),
                                                fields[7])

                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(fields[4], True)

                    # check if ref_allele is the same as alt_allele
                    if noCompAllele or fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        fail += 1
    finally:
        refFasta.close()

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        printlog(["Compressing \"%s\" ..." % outfile])
        try:
            # Argument list instead of a shell string: file names with
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # best-effort: e.g. gzip is not installed
            printlog(["Failed to compress \"%s\": %s" % (outfile, err)])
예제 #2
0
def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    '''

	Description
	-----------
	Convert genome coordinates (in BAM/SAM format) between assemblies.
	BAM/SAM format: http://samtools.sourceforge.net/
	chrom_size is target chromosome size

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	chainfile : file
		Input chain format file.

	infile : file
		Input BAM, SAM or CRAM foramt file.

	outfile_prefix : str
		Output prefix.

	chrom_size : dict
		Chromosome size of the *target* assembly, used to build bam header.

	IS_size : int
		Average insert size of pair-end sequencing.

	IS_std : float
		Stanadard deviation of insert size.

	fold : float
		A mapped pair is considered as \"proper pair\" if both ends mapped to
		different strand and the distance between them is less then fold * stdev
		from the mean.

	addtag : bool
		if addtag is set to True, will add tags to each alignmnet:
			Q = QC (QC failed)
			N = unmapped (originally unmapped or originally mapped but failed
			    to liftover to new assembly)
			M = multiple mapped (alignment can be liftover to multiple places)
			U = unique mapped (alignment can be liftover to only 1 place)

		tags for pair-end sequencing include:
			QF: QC failed
			NN: both read1 and read2 unmapped
			NU: read1 unmapped, read2 unique mapped
			NM: read1 unmapped, multiple mapped
			UN: read1 uniquely mapped, read2 unmap
			UU: both read1 and read2 uniquely mapped
			UM: read1 uniquely mapped, read2 multiple mapped
			MN: read1 multiple mapped, read2 unmapped
			MU: read1 multiple mapped, read2 unique mapped
			MM: both read1 and read2 multiple mapped

		tags for single-end sequencing include:
			QF: QC failed
			SN: unmaped
			SM: multiple mapped
			SU: uniquely mapped
	'''

    # determine the input file format (BAM, CRAM or SAM)
    file_type = ''
    if infile.lower().endswith('.bam'):
        file_type = 'BAM'
        comments = ['ORIGINAL_BAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rb')
        if len(samfile.header) == 0:
            print("BAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.cram'):
        file_type = 'CRAM'
        comments = ['ORIGINAL_CRAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rc')
        if len(samfile.header) == 0:
            print("CRAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.sam'):
        file_type = 'SAM'
        comments = ['ORIGINAL_SAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'r')
        if len(samfile.header) == 0:
            print("SAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    else:
        print(
            "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
            file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    # write to file
    if outfile_prefix is not None:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog(
                ["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam',
                                     "wb",
                                     header=new_header)
            printlog([
                "Liftover CRAM file:", infile, '==>', outfile_prefix + '.bam'
            ])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.sam',
                                     "wh",
                                     header=new_header)
            printlog(
                ["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    # write to screen
    else:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover BAM file:", infile])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover CRAM file:", infile])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile('-', "w", header=new_header)
            printlog(["Liftover SAM file:", infile])
        else:
            print(
                "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
                file=sys.stderr)
            sys.exit(1)
    QF = 0
    NN = 0
    NU = 0
    NM = 0
    UN = 0
    UU = 0
    UM = 0
    MN = 0
    MU = 0
    MM = 0
    SN = 0
    SM = 0
    SU = 0
    total_item = 0
    try:
        while (1):
            total_item += 1
            old_alignment = next(samfile)
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
            new_alignment.set_tags(old_alignment.get_tags())  # 12 - columns

            # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
            # Thanks Wolfgang Resch <*****@*****.**> identified this bug and provided solution.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            ## Pair-end sequencing
            if old_alignment.is_paired:
                new_alignment.flag = 0x1  #pair-end in sequencing
                if old_alignment.is_read1:
                    new_alignment.flag = new_alignment.flag | 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag = new_alignment.flag | 0x80

                if old_alignment.is_qcfail:
                    new_alignment.flag = new_alignment.flag | 0x200
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6
                    new_alignment.next_reference_id = -1  #7
                    new_alignment.next_reference_start = 0  #8
                    new_alignment.template_length = 0  #9

                    QF += 1
                    if addtag: new_alignment.set_tag(tag="QF", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                #==================================
                # R1 originally unmapped
                #==================================
                elif old_alignment.is_unmapped:
                    new_alignment.flag = new_alignment.flag | 0x4  #2
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6

                    # R1 & R2 originally unmapped
                    if old_alignment.mate_is_unmapped:
                        new_alignment.next_reference_id = -1  #7
                        new_alignment.next_reference_start = 0  #8
                        new_alignment.template_length = 0  #9

                        NN += 1
                        if addtag: new_alignment.set_tag(tag="NN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
                    # R1 unmap, R2 is mapped
                    else:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None

                        #------------------------------------
                        # R1 unmapped, R2 failed to liftover
                        #------------------------------------
                        if read2_maps is None:
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            NN += 1
                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 unique
                        #------------------------------------
                        elif len(read2_maps) == 2:
                            # 2-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 multiple
                        #------------------------------------
                        else:
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2-9
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                #==================================
                # R1 is originally mapped
                #==================================
                else:
                    try:
                        read1_chr = samfile.get_reference_name(
                            old_alignment.reference_id)
                        read1_strand = '-' if old_alignment.is_reverse else '+'
                        read1_start = old_alignment.reference_start
                        read1_end = old_alignment.reference_end
                        read1_maps = map_coordinates(mapping, read1_chr,
                                                     read1_start, read1_end,
                                                     read1_strand)
                    except:
                        read1_maps = None

                    if not old_alignment.mate_is_unmapped:
                        try:
                            read2_chr = samfile.get_reference_name(
                                old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(
                                mapping, read2_chr, read2_start, read2_end,
                                read2_strand)
                        except:
                            read2_maps = None
                    #------------------------------------
                    # R1 failed to liftover
                    #------------------------------------
                    if read1_maps is None:
                        # read2 is unmapped or failed to convertion
                        if old_alignment.mate_is_unmapped or (read2_maps is
                                                              None):
                            # col2 - col9
                            new_alignment.flag = new_alignment.flag | 0x4  #2
                            new_alignment.reference_id = -1  #3
                            new_alignment.reference_start = 0  #4
                            new_alignment.mapping_quality = 255  #5
                            new_alignment.cigartuples = old_alignment.cigartuples  #6
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            if addtag: new_alignment.set_tag(tag="NN", value=0)
                            NN += 1
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1]
                                [0]]  #recommend to set the RNAME of unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][
                                1]  #recommend to set the POS of unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag: new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # read2 is multiple mapped
                        else:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = 255  # mapq not available
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]  #start
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag: new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 uniquely mapped
                    #------------------------------------
                    elif len(read1_maps) == 2:
                        # col2 - col5
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        new_alignment.reference_id = name_to_id[read1_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read1_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # R2 unmapped before or after conversion
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UN += 1
                            if addtag: new_alignment.set_tag(tag="UN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = abs(
                                new_alignment.reference_start -
                                new_alignment.next_reference_start
                            ) + old_alignment.reference_length
                            # 2
                            if (read2_maps[1][3] != read1_maps[1][3]) and (
                                    new_alignment.template_length <=
                                    IS_size + fold * IS_std) and (
                                        new_alignment.template_length >=
                                        IS_size - fold * IS_std):
                                new_alignment.flag = new_alignment.flag | 0x2

                            UU += 1
                            if addtag: new_alignment.set_tag(tag="UU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # 2 (strand)
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100

                            #7-9
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            UM += 1
                            if addtag: new_alignment.set_tag(tag="UM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue
                    #------------------------------------
                    # R1 multiple mapped
                    #-----------------------------------
                    elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                        # 2
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        # 3-5
                        new_alignment.tid = name_to_id[read1_maps[1]
                                                       [0]]  #chrom
                        new_alignment.pos = read1_maps[1][1]  #start
                        new_alignment.mapq = 255

                        if read1_maps[0][3] != read1_maps[1][
                                3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][
                                3]:  #  same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # (1) R2 is unmapped
                        if (old_alignment.mate_is_unmapped) or (read2_maps is
                                                                None):
                            #2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[
                                read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MN += 1
                            if addtag: new_alignment.set_tag(tag="MN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (2) read2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MU += 1
                            if addtag: new_alignment.set_tag(tag="MU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (3) R2 is multiple mapped
                        else:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.next_reference_id = name_to_id[
                                read2_maps[1][0]]  #chrom
                            new_alignment.next_reference_start = read2_maps[1][
                                1]
                            new_alignment.template_length = 0

                            MM += 1
                            if addtag: new_alignment.set_tag(tag="MM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

            # Singel end sequencing
            else:
                # 7-9
                new_alignment.next_reference_id = -1
                new_alignment.next_reference_start = 0
                new_alignment.template_length = 0

                # (1) originally unmapped
                if old_alignment.is_unmapped:
                    # 2-6
                    new_alignment.flag = new_alignment.flag | 0x4
                    new_alignment.reference_id = -1
                    new_alignment.reference_start = 0
                    new_alignment.mapping_quality = 255
                    new_alignment.cigartuples = old_alignment.cigartuples

                    SN += 1
                    if addtag: new_alignment.set_tag(tag="SN", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                else:
                    new_alignment.flag = 0x0
                    read_chr = samfile.get_reference_name(
                        old_alignment.reference_id)
                    read_strand = '-' if old_alignment.is_reverse else '+'
                    read_start = old_alignment.reference_start
                    read_end = old_alignment.reference_end
                    read_maps = map_coordinates(mapping, read_chr, read_start,
                                                read_end, read_strand)

                    # (2) unmapped afte liftover
                    if read_maps is None:
                        new_alignment.flag = new_alignment.flag | 0x4
                        new_alignment.reference_id = -1
                        new_alignment.reference_start = 0
                        new_alignment.mapping_quality = 255

                        SN += 1
                        if addtag: new_alignment.set_tag(tag="SN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (3) unique mapped
                    if len(read_maps) == 2:
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            try:
                                new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                              -1]  #reverse quality string
                            except:
                                new_alignment.query_qualities = []
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.reference_id = name_to_id[read_maps[1]
                                                                [0]]
                        new_alignment.reference_start = read_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        SU += 1
                        if addtag: new_alignment.set_tag(tag="SU", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (4) multiple mapped
                    if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::
                                                                                  -1]  #reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(
                                old_alignment.query_sequence
                            )  #reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::
                                                                                          -1]  #reverse quality string
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.tid = name_to_id[read_maps[1][0]]
                        new_alignment.pos = read_maps[1][1]
                        new_alignment.mapq = old_alignment.mapq

                        SM += 1
                        if addtag: new_alignment.set_tag(tag="SM", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
    except StopIteration:
        printlog(["Done!"])
    OUT_FILE.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            try:
                printlog([
                    'Sort "%s" and save as "%s"' %
                    (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')
                ])
                pysam.sort("-o", outfile_prefix + '.sorted.bam',
                           outfile_prefix + '.bam')
            except:
                printlog(["Warning: ", "output BAM file was NOT sorted"])
            try:
                printlog(['Index "%s" ...' % (outfile_prefix + '.sorted.bam')])
                pysam.index(outfile_prefix + '.sorted.bam',
                            outfile_prefix + '.sorted.bam.bai')
            except:
                printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item - 1))
    print("	 QC failed: " + str(QF))
    if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0:
        print("	 Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(UU))
        print("\tR1 unique, R2 unmapp (UN): " + str(UN))
        print("\tR1 unique, R2 multiple (UM): " + str(UM))

        print("\tR1 multiple, R2 multiple (MM): " + str(MM))
        print("\tR1 multiple, R2 unique (MU): " + str(MU))
        print("\tR1 multiple, R2 unmapped (MN): " + str(MN))

        print("\tR1 unmap, R2 unmap (NN): " + str(NN))
        print("\tR1 unmap, R2 unique (NU): " + str(NU))
        print("\tR1 unmap, R2 multiple (NM): " + str(NM))
    if max(SN, SU, SM) > 0:
        print("	 Single-end reads:")
        print("\tUniquley mapped (SU): " + str(SU))
        print("\tMultiple mapped (SM): " + str(SM))
        print("\tUnmapped (SN): " + str(SN))
예제 #3
0
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele = False, compress = False, cstyle = 'a'):
	'''
	Convert genome coordinates in GVCF format.

	Lifted-over records are written to `outfile`; records that cannot be
	lifted over are written to `outfile + '.unmap'` with a trailing
	"Fail(<reason>)" column.

	Parameters
	----------
	mapping : dict
		Dictionary with source chrom name as key, IntervalTree object as value.

	infile : file
		Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
		*.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
		remote file.

	outfile : str
		Path of the output file. It is also used as the prefix of the
		".unmap" file.

	liftoverfile : file
		Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
		regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
		URL (http://, https://, ftp://) pointing to remote file.
		Only its name is used here (recorded in the output header).

	refgenome : file
		The genome sequence file of 'target' assembly in FASTA format.

	noCompAllele : bool
		If True, variants are kept even when ref_allele == alt_allele after
		liftover. If False (default), such variants are written to the
		".unmap" file as "Fail(REF==ALT)".

	compress : bool
		If True, run "gzip" on `outfile` once it has been written.

	cstyle : str, optional
		Chromosome ID style. Must be one of ['a', 's', 'l'], where
		'a' : as-is. The chromosome ID of the output file is in the same style of the input file.
		's' : short ID, such as "1", "2", "X".
		'l' : long ID, such as "chr1", "chr2", "chrX".
	'''

	if noCompAllele:
		logging.info("Keep variants [reference_allele == alternative_allele] ...")
	else:
		logging.info("Filter out variants [reference_allele == alternative_allele] ...")

	# index the reference genome file if it hasn't been done, or if the index is stale
	if not os.path.exists(refgenome + '.fai'):
		logging.info("Creating index for: %s" % refgenome)
		pysam.faidx(refgenome)
	if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
		logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
		pysam.faidx(refgenome)

	refFasta = pysam.Fastafile(refgenome)

	# meta-information lines that are copied verbatim to both output files
	shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT', '##SAMPLE', '##PEDIGREE', '##GVCFBlock', '##GATKCommandLine', '##source')

	total_var = 0
	failed_var = 0
	total_region = 0
	failed_region = 0
	# Style template handed to update_chromID(). Defaults to the short style so
	# that a header without any "##contig" line no longer raises NameError.
	chr_template = '1'

	# "with" guarantees both handles are closed even if a record raises.
	with open(outfile, 'w') as FILE_OUT, open(outfile + '.unmap', 'w') as UNMAP:
		for line in ireader.reader(infile):
			line = line.strip()
			if not line:
				continue

			# meta-information lines needed in both mapped and unmapped files
			if line.startswith(shared_meta):
				print(line, file=FILE_OUT)
				print(line, file=UNMAP)
				continue

			# meta-information lines needed in unmapped files only
			if line.startswith('##assembly'):
				print(line, file=UNMAP)
				continue
			if line.startswith('##contig'):
				print(line, file=UNMAP)
				chr_template = 'chr1' if 'ID=chr' in line else '1'
				continue

			# column header: emit the contigs of the target genome plus provenance lines
			if line.startswith('#CHROM'):
				logging.info("Updating contig field ... ")
				target_gsize = dict(zip(refFasta.references, refFasta.lengths))
				# Every target contig is listed; update_chromID() converts the
				# ID to the requested style whether or not it starts with "chr".
				for chr_id in sorted(target_gsize):
					print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

				print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
				print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
				print("##originalFile=<%s>" % infile, file=FILE_OUT)
				print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
				print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
				print(line, file=FILE_OUT)
				print(line, file=UNMAP)
				logging.info("Lifting over ... ")
				continue

			# any other header/comment line is dropped
			if line.startswith('#'):
				continue

			# process non-variant region
			if 'END=' in line:
				fields = str.split(line, maxsplit=8)
				total_region += 1
				chrom = fields[0]
				start = int(fields[1]) - 1	# 0 based
				m = re.search(r"END\=(\d+)", line)
				if m is None:
					# "END=" is present but not followed by digits
					print(line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue
				end = int(m[1])

				a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
				if a is None:
					print(line + "\tFail(Unmap)", file=UNMAP)
					failed_region += 1
					continue
				if len(a) != 2:
					# Region maps to more than one target interval. Report it
					# instead of silently dropping the record.
					print(line + "\tFail(Multiple_hits)", file=UNMAP)
					failed_region += 1
					continue

				target_chr = str(a[1][0])	# target_chr is from chain file, could be 'chr1' or '1'
				target_start = a[1][1]
				target_end = a[1][2]

				# update chrom, start coordinate (back to 1-based) and END
				fields[0] = target_chr
				fields[1] = target_start + 1
				fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
				print('\t'.join(map(str, fields)), file=FILE_OUT)
				continue

			# process variant line
			fields = str.split(line, maxsplit=7)
			total_var += 1
			chrom = fields[0]
			start = int(fields[1]) - 1		# 0 based, ref_allele start
			end = start + len(fields[3])	# ref_allele end
			# only the first ALT allele is considered, e.g.
			# 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;
			alt_allele = fields[4].replace(' ', '').split(',')[0]

			a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
			if a is None:
				print(line + "\tFail(Unmap)", file=UNMAP)
				failed_var += 1
				continue
			if len(a) != 2:
				print(line + "\tFail(Multiple_hits)", file=UNMAP)
				failed_var += 1
				continue

			target_chr = str(a[1][0])	# target_chr is from chain file, could be 'chr1' or '1'
			target_start = a[1][1]
			target_end = a[1][2]

			# update chrom and start coordinate (back to 1-based)
			fields[0] = target_chr
			fields[1] = target_start + 1

			# update ref allele from the target genome
			try:
				target_chr = update_chromID(refFasta.references[0], target_chr)
				fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
			except Exception:
				print(line + "\tFail(No_targetRef)", file=UNMAP)
				failed_var += 1
				# The record has been reported as failed; it must not also be
				# written to the mapped output with its old REF allele.
				continue

			if a[1][3] == '-':
				# NOTE(review): on minus-strand hits only the first ALT allele
				# is kept and ",<NON_REF>" is always appended, as before.
				alt_allele = revcomp_DNA(alt_allele, True)
				fields[4] = alt_allele + ',<NON_REF>'

			# Compare REF with the (lifted) first ALT allele rather than with
			# the whole ALT column, which never equals REF once it carries
			# ",<NON_REF>".
			if noCompAllele or fields[3] != alt_allele:
				print('\t'.join(map(str, fields)), file=FILE_OUT)
			else:
				print(line + "\tFail(REF==ALT)", file=UNMAP)
				failed_var += 1

	logging.info ("Total variants: %d" % total_var)
	logging.info ("Variants failed to map: %d" % failed_var)
	logging.info ("Total non-variant regions: %d" % total_region)
	logging.info ("Non-variant regions failed to map: %d" % failed_region)

	if compress:
		logging.info("Compressing \"%s\" ..." % outfile)
		try:
			# argument list instead of a shell string: no quoting or injection issues
			status = subprocess.call(["gzip", outfile])
		except OSError as err:
			# compression is best-effort; the uncompressed output is still valid
			logging.warning("Failed to run gzip on \"%s\": %s" % (outfile, err))
		else:
			if status != 0:
				logging.warning("gzip exited with status %d for \"%s\"" % (status, outfile))
예제 #4
0
파일: mapmaf.py 프로젝트: yiwei011/CrossMap
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome,
                      ref_name):
    '''
    Convert genome coordinates in MAF (mutation annotation format) format.

    Lifted-over records are written to `outfile`; records that cannot be
    lifted over are written unchanged to `outfile + '.unmap'`.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Path of the output file. It is also used as the prefix of the
        ".unmap" file.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file.
        Only its name is used here (recorded in the output header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    ref_name : str
        The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".
        It is written into column 4 of every lifted-over record.
    '''

    # Index the reference genome if it hasn't been done, or if the index is
    # older than the FASTA. Modification time is used (as in the GVCF
    # converter): on Unix, ctime is the inode-change time, not the content age.
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    total = 0
    fail = 0

    # "with" guarantees both handles are closed even if a record raises.
    with open(outfile, 'w') as FILE_OUT, \
            open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            # meta-information lines needed in both mapped and unmapped files
            if line.startswith('#'):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue

            # column header: prepend a provenance line to the mapped output
            if line.startswith('Hugo_Symbol'):
                print(
                    "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                    % ("CrossMap", __version__,
                       datetime.date.today().strftime("%B%d,%Y"), liftoverfile,
                       refgenome),
                    file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                logging.info("Lifting over ... ")
                continue

            # data line
            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])

            a = map_coordinates(mapping, chrom, start, end, '+')

            # unmapped (None) or mapped to more than one target interval
            if a is None or len(a) != 2:
                print(line, file=UNMAP)
                fail += 1
                continue

            # target_chr is from chain file, could be 'chr1' or '1'
            target_chr = str(a[1][0])
            target_start = a[1][1]
            target_end = a[1][2]

            # update chrom, start coordinate (back to 1-based) and end
            fields[4] = target_chr
            fields[5] = target_start + 1
            fields[6] = target_end

            # update ref allele from the target genome
            try:
                target_chr = update_chromID(refFasta.references[0],
                                            target_chr)
                fields[10] = refFasta.fetch(target_chr, target_start,
                                            target_end).upper()
            except Exception:
                # e.g. contig missing from the target FASTA or a line with
                # fewer than 11 columns; KeyboardInterrupt is no longer
                # swallowed here.
                print(line, file=UNMAP)
                fail += 1
                continue

            # NOTE(review): the allele just fetched from the target genome is
            # reverse-complemented for minus-strand hits -- behavior kept
            # as-is, confirm this is intended.
            if a[1][3] == '-':
                fields[10] = revcomp_DNA(fields[10], True)
            print('\t'.join(map(str, fields)), file=FILE_OUT)

    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
예제 #5
0
파일: mapgvcf.py 프로젝트: roryk/CrossMap
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in GVCF format.

    Lifted-over records are written to `outfile`; records that cannot be
    lifted over are written to `outfile + '.unmap'` with a trailing
    "Fail(<reason>)" column.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,*.z, *.bz,
        *.bz2, *.bzip2) file, local file or URL (http://, https://, ftp://) pointing to
        remote file.

    outfile : str
        Path of the output file. It is also used as the prefix of the
        ".unmap" file.

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a
        regular or compressed (*.gz, *.Z,*.z, *.bz, *.bz2, *.bzip2) file, local file or
        URL (http://, https://, ftp://) pointing to remote file.
        Only its name is used here (recorded in the output header).

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.
    '''

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines that are copied verbatim to both output files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # True once a "##contig" line with a 'chr'-style ID is seen

    # "with" guarantees both handles are closed even if a record raises.
    with open(outfile, 'w') as FILE_OUT, \
            open(outfile + '.unmap', 'w') as UNMAP:
        for line in ireader.reader(infile):
            line = line.strip()
            if not line:
                continue

            # meta-information lines needed in both mapped and unmapped files
            if line.startswith(shared_meta):
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                continue

            # meta-information lines needed in unmapped files only
            if line.startswith('##assembly'):
                print(line, file=UNMAP)
                continue
            if line.startswith('##contig'):
                print(line, file=UNMAP)
                if 'ID=chr' in line:
                    withChr = True
                continue

            # column header: emit the contigs of the target genome plus
            # provenance lines
            if line.startswith('#CHROM'):
                printlog(["Updating contig field ... "])
                target_gsize = dict(zip(refFasta.references, refFasta.lengths))
                for chr_id in sorted(target_gsize):
                    has_prefix = chr_id.startswith('chr')
                    if withChr:
                        contig_id = chr_id if has_prefix else 'chr' + chr_id
                    else:
                        # strip only the leading 'chr', not every occurrence
                        contig_id = chr_id[3:] if has_prefix else chr_id
                    print("##contig=<ID=%s,length=%d,assembly=%s>" %
                          (contig_id, target_gsize[chr_id],
                           os.path.basename(refgenome)),
                          file=FILE_OUT)

                print(
                    "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                    % __version__,
                    file=FILE_OUT)
                print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                print("##originalFile=<%s>" % infile, file=FILE_OUT)
                print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                print("##liftOverDate=<%s>" %
                      datetime.date.today().strftime("%B%d,%Y"),
                      file=FILE_OUT)
                print(line, file=FILE_OUT)
                print(line, file=UNMAP)
                printlog(["Lifting over ... "])
                continue

            # any other header/comment line is dropped
            if line.startswith('#'):
                continue

            # process non-variant region
            if 'END=' in line:
                fields = str.split(line, maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                m = re.search(r"END\=(\d+)", line)
                if m is None:
                    # "END=" is present but not followed by digits
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                end = int(m[1])

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) != 2:
                    # Region maps to more than one target interval. Report it
                    # instead of silently dropping the record.
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_region += 1
                    continue

                # target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom, start coordinate (back to 1-based) and END
                fields[0] = target_chr
                fields[1] = target_start + 1
                fields[7] = fields[7].replace(('END=' + str(end)),
                                              ('END=' + str(target_end)))
                print('\t'.join(map(str, fields)), file=FILE_OUT)
                continue

            # process variant line
            fields = str.split(line, maxsplit=7)
            total_var += 1
            chrom = fields[0]
            start = int(fields[1]) - 1  # 0 based, ref_allele start
            end = start + len(fields[3])  # ref_allele end
            # only the first ALT allele is considered, e.g.
            # 20  10000598    .   T   A,<NON_REF> 1754.77 .   DP=54;
            alt_allele = fields[4].replace(' ', '').split(',')[0]

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                failed_var += 1
                continue
            if len(a) != 2:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                failed_var += 1
                continue

            # target_chr is from chain file, could be 'chr1' or '1'
            target_chr = str(a[1][0])
            target_start = a[1][1]
            target_end = a[1][2]

            # update chrom and start coordinate (back to 1-based)
            fields[0] = target_chr
            fields[1] = target_start + 1

            # Update ref allele from the target genome. A contig missing from
            # the target FASTA is reported per record instead of aborting the
            # whole run.
            try:
                target_chr = update_chromID(refFasta.references[0],
                                            target_chr)
                fields[3] = refFasta.fetch(target_chr, target_start,
                                           target_end).upper()
            except Exception:
                print(line + "\tFail(No_targetRef)", file=UNMAP)
                failed_var += 1
                continue

            if a[1][3] == '-':
                # NOTE(review): on minus-strand hits only the first ALT allele
                # is kept and ",<NON_REF>" is always appended, as before.
                alt_allele = revcomp_DNA(alt_allele, True)
                fields[4] = alt_allele + ',<NON_REF>'

            # Compare the new REF with the ALT allele as it will be written:
            # for minus-strand hits that is the reverse complement, not the
            # original ALT.
            if fields[3] != alt_allele:
                print('\t'.join(map(str, fields)), file=FILE_OUT)
            else:
                print(line + "\tFail(REF==ALT)", file=UNMAP)
                failed_var += 1

    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])