def crossmap_vcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele=False, compress=False):
    '''
    Convert genome coordinates in VCF format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.
    infile : file
        Input file in VCF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2)
        file, a local file, or a URL (http://, https://, ftp://) pointing to a remote file.
    outfile : str
        Prefix of output files.
    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a regular or
        compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, a local file, or a URL (http://,
        https://, ftp://) pointing to a remote file.
    refgenome : file
        The genome sequence file of the 'target' assembly in FASTA format.
    noCompAllele : bool
        Whether to compare ref_allele to alt_allele after liftover. If False (default), a variant whose
        lifted ref_allele equals its alt_allele is written to the unmapped file as "Fail(REF==ALT)";
        if True, such variants are kept.
    '''
    if noCompAllele:
        printlog(["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog(["Filter out variants [reference_allele == alternative_allele] ..."])

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # meta-information lines needed in both mapped and unmapped files
        if line.startswith(('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT', '##SAMPLE', '##PEDIGREE')):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        # meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True
        # update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id.replace('chr', ''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

            print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        # data lines
        else:
            if line.startswith('#'):
                continue
            fields = str.split(line, maxsplit=7)
            total += 1

            chrom = fields[0]
            start = int(fields[1]) - 1  # 0 based
            end = start + len(fields[3])

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line + "\tFail(Unmap)", file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                # update chrom
                target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr

                # update start coordinate
                fields[1] = target_start + 1

                # update ref allele
                target_chr = update_chromID(refFasta.references[0], target_chr)
                try:
                    fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                except:
                    print(line + "\tFail(KeyError)", file=UNMAP)
                    fail += 1
                    continue

                # update END if any
                fields[7] = re.sub(r'END\=\d+', 'END=' + str(target_end), fields[7])

                if a[1][3] == '-':
                    fields[4] = revcomp_DNA(fields[4], True)

                # check if ref_allele is the same as alt_allele
                if noCompAllele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    if fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        fail += 1
            else:
                print(line + "\tFail(Multiple_hits)", file=UNMAP)
                fail += 1
                continue

    FILE_OUT.close()
    UNMAP.close()
    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        try:
            printlog(["Compressing \"%s\" ..." % outfile])
            subprocess.call("gzip " + outfile, shell=True)
        except:
            pass


def crossmap_bam_file(mapping, chainfile, infile, outfile_prefix, chrom_size, IS_size=200, IS_std=30.0, fold=3, addtag=True):
    '''
    Description
    -----------
    Convert genome coordinates (in BAM/SAM format) between assemblies.
    BAM/SAM format: http://samtools.sourceforge.net/
    chrom_size is the target chromosome size.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.
    chainfile : file
        Input chain format file.
    infile : file
        Input BAM, SAM or CRAM format file.
    outfile_prefix : str
        Output prefix.
    chrom_size : dict
        Chromosome sizes of the *target* assembly, used to build the bam header.
    IS_size : int
        Average insert size of paired-end sequencing.
    IS_std : float
        Standard deviation of insert size.
    fold : float
        A mapped pair is considered a "proper pair" if both ends map to different strands and
        the distance between them is less than fold * stdev from the mean.
    addtag : bool
        If addtag is set to True, add a tag to each alignment:
            Q = QC (QC failed)
            N = unmapped (originally unmapped, or originally mapped but failed to lift over to the new assembly)
            M = multiple mapped (alignment can be lifted over to multiple places)
            U = unique mapped (alignment can be lifted over to only 1 place)

        Tags for paired-end sequencing include:
            QF: QC failed
            NN: both read1 and read2 unmapped
            NU: read1 unmapped, read2 unique mapped
            NM: read1 unmapped, read2 multiple mapped
            UN: read1 uniquely mapped, read2 unmapped
            UU: both read1 and read2 uniquely mapped
            UM: read1 uniquely mapped, read2 multiple mapped
            MN: read1 multiple mapped, read2 unmapped
            MU: read1 multiple mapped, read2 unique mapped
            MM: both read1 and read2 multiple mapped

        Tags for single-end sequencing include:
            QF: QC failed
            SN: unmapped
            SM: multiple mapped
            SU: uniquely mapped
    '''

    # determine the input file format (BAM, CRAM or SAM)
    file_type = ''
    if infile.lower().endswith('.bam'):
        file_type = 'BAM'
        comments = ['ORIGINAL_BAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rb')
        if len(samfile.header) == 0:
            print("BAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.cram'):
        file_type = 'CRAM'
        comments = ['ORIGINAL_CRAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'rc')
        if len(samfile.header) == 0:
            print("CRAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    elif infile.lower().endswith('.sam'):
        file_type = 'SAM'
        comments = ['ORIGINAL_SAM_FILE=' + infile]
        samfile = pysam.Samfile(infile, 'r')
        if len(samfile.header) == 0:
            print("SAM file has no header section. Exit!", file=sys.stderr)
            sys.exit(1)
    else:
        print("Unknown file type! Input file must have suffix '.bam', '.cram', or '.sam'.", file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file
    chrom_style = sam_ori_header['SQ'][0]['SN']  # either 'chr1' or '1'

    # update chrom_size of target genome
    target_chrom_sizes = {}
    for n, l in chrom_size.items():
        target_chrom_sizes[update_chromID(chrom_style, n)] = l

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    # write to file
    if outfile_prefix is not None:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam', "wb", header=new_header)
            printlog(["Liftover BAM file:", infile, '==>', outfile_prefix + '.bam'])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.bam', "wb", header=new_header)
            printlog(["Liftover CRAM file:", infile, '==>', outfile_prefix + '.bam'])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile(outfile_prefix + '.sam', "wh", header=new_header)
            printlog(["Liftover SAM file:", infile, '==>', outfile_prefix + '.sam'])
        else:
            print("Unknown file type! Input file must have suffix '.bam', '.cram', or '.sam'.", file=sys.stderr)
            sys.exit(1)
    # write to screen
    else:
        if file_type == 'BAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover BAM file:", infile])
        elif file_type == 'CRAM':
            OUT_FILE = pysam.Samfile('-', "wb", header=new_header)
            printlog(["Liftover CRAM file:", infile])
        elif file_type == 'SAM':
            OUT_FILE = pysam.Samfile('-', "w", header=new_header)
            printlog(["Liftover SAM file:", infile])
        else:
            print("Unknown file type! Input file must have suffix '.bam', '.cram', or '.sam'.", file=sys.stderr)
            sys.exit(1)

    QF = NN = NU = NM = UN = UU = UM = MN = MU = MM = SN = SM = SU = 0
    total_item = 0
    try:
        while True:
            total_item += 1
            old_alignment = next(samfile)
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column. read name.
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th column. read sequence. all bases.
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th column. read sequence quality. all bases.
            new_alignment.set_tags(old_alignment.get_tags())  # 12th - columns

            # by default pysam will change RG:Z to RG:A, which can cause downstream failures with GATK and freebayes
            # Thanks to Wolfgang Resch <*****@*****.**>, who identified this bug and provided a solution.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            ## Paired-end sequencing
            if old_alignment.is_paired:
                new_alignment.flag = 0x1  # paired in sequencing
                if old_alignment.is_read1:
                    new_alignment.flag = new_alignment.flag | 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag = new_alignment.flag | 0x80

                if old_alignment.is_qcfail:
                    new_alignment.flag = new_alignment.flag | 0x200
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6
                    new_alignment.next_reference_id = -1  #7
                    new_alignment.next_reference_start = 0  #8
                    new_alignment.template_length = 0  #9

                    QF += 1
                    if addtag:
                        new_alignment.set_tag(tag="QF", value=0)
                    OUT_FILE.write(new_alignment)
                    continue

                #==================================
                # R1 originally unmapped
                #==================================
                elif old_alignment.is_unmapped:
                    new_alignment.flag = new_alignment.flag | 0x4  #2
                    new_alignment.reference_id = -1  #3
                    new_alignment.reference_start = 0  #4
                    new_alignment.mapping_quality = 255  #5
                    new_alignment.cigartuples = old_alignment.cigartuples  #6

                    # R1 & R2 originally unmapped
                    if old_alignment.mate_is_unmapped:
                        new_alignment.next_reference_id = -1  #7
                        new_alignment.next_reference_start = 0  #8
                        new_alignment.template_length = 0  #9

                        NN += 1
                        if addtag:
                            new_alignment.set_tag(tag="NN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
                    # R1 unmapped, R2 is mapped
                    else:
                        try:
                            read2_chr = samfile.get_reference_name(old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
                        except:
                            read2_maps = None

                        #------------------------------------
                        # R1 unmapped, R2 failed to liftover
                        #------------------------------------
                        if read2_maps is None:
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            NN += 1
                            if addtag:
                                new_alignment.set_tag(tag="NN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 unique
                        #------------------------------------
                        elif len(read2_maps) == 2:
                            # 2-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[read2_maps[1][0]]  # recommended to set the RNAME of an unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][1]  # recommended to set the POS of an unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag:
                                new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        #------------------------------------
                        # R1 unmapped, R2 multiple
                        #------------------------------------
                        else:
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2-9
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag:
                                new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                #==================================
                # R1 is originally mapped
                #==================================
                else:
                    try:
                        read1_chr = samfile.get_reference_name(old_alignment.reference_id)
                        read1_strand = '-' if old_alignment.is_reverse else '+'
                        read1_start = old_alignment.reference_start
                        read1_end = old_alignment.reference_end
                        read1_maps = map_coordinates(mapping, read1_chr, read1_start, read1_end, read1_strand)
                    except:
                        read1_maps = None

                    if not old_alignment.mate_is_unmapped:
                        try:
                            read2_chr = samfile.get_reference_name(old_alignment.next_reference_id)
                            read2_strand = '-' if old_alignment.mate_is_reverse else '+'
                            read2_start = old_alignment.next_reference_start
                            read2_end = read2_start + 1
                            read2_maps = map_coordinates(mapping, read2_chr, read2_start, read2_end, read2_strand)
                        except:
                            read2_maps = None

                    #------------------------------------
                    # R1 failed to liftover
                    #------------------------------------
                    if read1_maps is None:
                        # R2 is unmapped or failed to convert
                        if old_alignment.mate_is_unmapped or (read2_maps is None):
                            # col2 - col9
                            new_alignment.flag = new_alignment.flag | 0x4  #2
                            new_alignment.reference_id = -1  #3
                            new_alignment.reference_start = 0  #4
                            new_alignment.mapping_quality = 255  #5
                            new_alignment.cigartuples = old_alignment.cigartuples  #6
                            new_alignment.next_reference_id = -1  #7
                            new_alignment.next_reference_start = 0  #8
                            new_alignment.template_length = 0  #9

                            if addtag:
                                new_alignment.set_tag(tag="NN", value=0)
                            NN += 1
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.reference_id = name_to_id[read2_maps[1][0]]  # recommended to set the RNAME of an unmapped read to its mate's
                            new_alignment.reference_start = read2_maps[1][1]  # recommended to set the POS of an unmapped read to its mate's
                            new_alignment.mapping_quality = old_alignment.mapping_quality
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][1]  # start
                            new_alignment.template_length = 0

                            NU += 1
                            if addtag:
                                new_alignment.set_tag(tag="NU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # col2 - col9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.reference_start = read2_maps[1][1]
                            new_alignment.mapping_quality = 255  # mapq not available
                            new_alignment.cigartuples = old_alignment.cigartuples
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][1]  # start
                            new_alignment.template_length = 0

                            NM += 1
                            if addtag:
                                new_alignment.set_tag(tag="NM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 uniquely mapped
                    #------------------------------------
                    elif len(read1_maps) == 2:
                        # col2 - col5
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        new_alignment.reference_id = name_to_id[read1_maps[1][0]]
                        new_alignment.reference_start = read1_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        if read1_maps[0][3] != read1_maps[1][3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::-1]  # reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)  # reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::-1]  # reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][3]:  # same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # R2 unmapped before or after conversion
                        if (old_alignment.mate_is_unmapped) or (read2_maps is None):
                            # 2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][1]
                            new_alignment.template_length = 0

                            UN += 1
                            if addtag:
                                new_alignment.set_tag(tag="UN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]  # chrom
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = abs(new_alignment.reference_start - new_alignment.next_reference_start) + old_alignment.reference_length
                            # 2
                            if (read2_maps[1][3] != read1_maps[1][3]) and (new_alignment.template_length <= IS_size + fold * IS_std) and (new_alignment.template_length >= IS_size - fold * IS_std):
                                new_alignment.flag = new_alignment.flag | 0x2

                            UU += 1
                            if addtag:
                                new_alignment.set_tag(tag="UU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # R2 is multiple mapped
                        else:
                            # 2 (strand)
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            # 7-9
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = 0

                            UM += 1
                            if addtag:
                                new_alignment.set_tag(tag="UM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                    #------------------------------------
                    # R1 multiple mapped
                    #------------------------------------
                    elif len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                        # 2
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read1_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        # 3-5
                        new_alignment.tid = name_to_id[read1_maps[1][0]]  # chrom
                        new_alignment.pos = read1_maps[1][1]  # start
                        new_alignment.mapq = 255

                        if read1_maps[0][3] != read1_maps[1][3]:  # opposite strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::-1]  # reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)  # reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::-1]  # reverse quality string
                        elif read1_maps[0][3] == read1_maps[1][3]:  # same strand
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # (1) R2 is unmapped
                        if (old_alignment.mate_is_unmapped) or (read2_maps is None):
                            # 2,7-9
                            new_alignment.flag = new_alignment.flag | 0x8
                            new_alignment.next_reference_id = name_to_id[read1_maps[1][0]]
                            new_alignment.next_reference_start = read1_maps[1][1]
                            new_alignment.template_length = 0

                            MN += 1
                            if addtag:
                                new_alignment.set_tag(tag="MN", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (2) R2 is unique mapped
                        elif len(read2_maps) == 2:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]  # chrom
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = 0

                            MU += 1
                            if addtag:
                                new_alignment.set_tag(tag="MU", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

                        # (3) R2 is multiple mapped
                        else:
                            # 2,7-9
                            if read2_maps[1][3] == '-':
                                new_alignment.flag = new_alignment.flag | 0x20
                            # 2 (secondary alignment)
                            new_alignment.flag = new_alignment.flag | 0x100
                            new_alignment.next_reference_id = name_to_id[read2_maps[1][0]]  # chrom
                            new_alignment.next_reference_start = read2_maps[1][1]
                            new_alignment.template_length = 0

                            MM += 1
                            if addtag:
                                new_alignment.set_tag(tag="MM", value=0)
                            OUT_FILE.write(new_alignment)
                            continue

            ## Single-end sequencing
            else:
                # 7-9
                new_alignment.next_reference_id = -1
                new_alignment.next_reference_start = 0
                new_alignment.template_length = 0

                # (1) originally unmapped
                if old_alignment.is_unmapped:
                    # 2-6
                    new_alignment.flag = new_alignment.flag | 0x4
                    new_alignment.reference_id = -1
                    new_alignment.reference_start = 0
                    new_alignment.mapping_quality = 255
                    new_alignment.cigartuples = old_alignment.cigartuples

                    SN += 1
                    if addtag:
                        new_alignment.set_tag(tag="SN", value=0)
                    OUT_FILE.write(new_alignment)
                    continue
                else:
                    new_alignment.flag = 0x0
                    read_chr = samfile.get_reference_name(old_alignment.reference_id)
                    read_strand = '-' if old_alignment.is_reverse else '+'
                    read_start = old_alignment.reference_start
                    read_end = old_alignment.reference_end
                    read_maps = map_coordinates(mapping, read_chr, read_start, read_end, read_strand)

                    # (2) unmapped after liftover
                    if read_maps is None:
                        new_alignment.flag = new_alignment.flag | 0x4
                        new_alignment.reference_id = -1
                        new_alignment.reference_start = 0
                        new_alignment.mapping_quality = 255

                        SN += 1
                        if addtag:
                            new_alignment.set_tag(tag="SN", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (3) uniquely mapped
                    if len(read_maps) == 2:
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::-1]  # reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)  # reverse complement read sequence
                            # 11
                            try:
                                new_alignment.query_qualities = old_alignment.query_qualities[::-1]  # reverse quality string
                            except:
                                new_alignment.query_qualities = []
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.reference_id = name_to_id[read_maps[1][0]]
                        new_alignment.reference_start = read_maps[1][1]
                        new_alignment.mapping_quality = old_alignment.mapping_quality

                        SU += 1
                        if addtag:
                            new_alignment.set_tag(tag="SU", value=0)
                        OUT_FILE.write(new_alignment)
                        continue

                    # (4) multiple mapped
                    if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                        new_alignment.flag = new_alignment.flag | 0x100
                        if read_maps[1][3] == '-':
                            new_alignment.flag = new_alignment.flag | 0x10
                        if read_maps[0][3] != read_maps[1][3]:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples[::-1]  # reverse cigar tuple
                            # 10
                            new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)  # reverse complement read sequence
                            # 11
                            new_alignment.query_qualities = old_alignment.query_qualities[::-1]  # reverse quality string
                        else:
                            # 6
                            new_alignment.cigartuples = old_alignment.cigartuples

                        # 3-5
                        new_alignment.tid = name_to_id[read_maps[1][0]]
                        new_alignment.pos = read_maps[1][1]
                        new_alignment.mapq = old_alignment.mapq

                        SM += 1
                        if addtag:
                            new_alignment.set_tag(tag="SM", value=0)
                        OUT_FILE.write(new_alignment)
                        continue
    except StopIteration:
        printlog(["Done!"])
    OUT_FILE.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            try:
                printlog(['Sort "%s" and save as "%s"' % (outfile_prefix + '.bam', outfile_prefix + '.sorted.bam')])
                pysam.sort("-o", outfile_prefix + '.sorted.bam', outfile_prefix + '.bam')
            except:
                printlog(["Warning: ", "output BAM file was NOT sorted"])
            try:
                printlog(['Index "%s" ...' % (outfile_prefix + '.sorted.bam')])
                pysam.index(outfile_prefix + '.sorted.bam', outfile_prefix + '.sorted.bam.bai')
            except:
                printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item - 1))
    print(" QC failed: " + str(QF))
    if max(NN, NU, NM, UN, UU, UM, MN, MU, MM) > 0:
        print(" Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(UU))
        print("\tR1 unique, R2 unmapped (UN): " + str(UN))
        print("\tR1 unique, R2 multiple (UM): " + str(UM))
        print("\tR1 multiple, R2 multiple (MM): " + str(MM))
        print("\tR1 multiple, R2 unique (MU): " + str(MU))
        print("\tR1 multiple, R2 unmapped (MN): " + str(MN))
        print("\tR1 unmapped, R2 unmapped (NN): " + str(NN))
        print("\tR1 unmapped, R2 unique (NU): " + str(NU))
        print("\tR1 unmapped, R2 multiple (NM): " + str(NM))
    if max(SN, SU, SM) > 0:
        print(" Single-end reads:")
        print("\tUniquely mapped (SU): " + str(SU))
        print("\tMultiple mapped (SM): " + str(SM))
        print("\tUnmapped (SN): " + str(SN))
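

# Illustrative helper, not called by crossmap_bam_file() itself: the "proper pair" test above sets
# flag 0x2 only when the lifted mates land on opposite strands and the new template length stays
# within fold * IS_std of the expected insert size. This sketch restates that condition.
def _is_proper_pair_sketch(read1_strand, read2_strand, template_length, IS_size=200, IS_std=30.0, fold=3):
    """Return True if a lifted pair would receive the 0x2 (proper pair) flag."""
    opposite_strands = read1_strand != read2_strand
    within_insert_range = (IS_size - fold * IS_std) <= template_length <= (IS_size + fold * IS_std)
    return opposite_strands and within_insert_range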


def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele=False, compress=False, cstyle='a'):
    '''
    Convert genome coordinates in GVCF format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.
    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2)
        file, a local file, or a URL (http://, https://, ftp://) pointing to a remote file.
    outfile : str
        Prefix of output files.
    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a regular or
        compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, a local file, or a URL (http://,
        https://, ftp://) pointing to a remote file.
    refgenome : file
        The genome sequence file of the 'target' assembly in FASTA format.
    noCompAllele : bool
        Whether to compare ref_allele to alt_allele after liftover. If False (default), a variant whose
        lifted ref_allele equals its alt_allele is written to the unmapped file as "Fail(REF==ALT)";
        if True, such variants are kept.
    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same style as the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''
    if noCompAllele:
        logging.info("Keep variants [reference_allele == alternative_allele] ...")
    else:
        logging.info("Filter out variants [reference_allele == alternative_allele] ...")

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # meta-information lines needed in both mapped and unmapped files
        if line.startswith(('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT', '##SAMPLE',
                            '##PEDIGREE', '##GVCFBlock', '##GATKCommandLine', '##source')):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        # meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                chr_template = 'chr1'
            else:
                chr_template = '1'
        # update contig information
        elif line.startswith('#CHROM'):
            logging.info("Updating contig field ... ")
            target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    # if withChr is True:
                    print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

            print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")

        else:
            if line.startswith('#'):
                continue

            # process non-variant region
            if 'END=' in line:
                fields = str.split(line, maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                try:
                    m = re.search(r"END\=(\d+)", line)
                    end = int(m[1])
                except:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue

                a = map_coordinates(mapping, chrom, start, end, '+', chrom_style=cstyle)
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    # update start coordinate
                    fields[1] = target_start + 1
                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)

            # process variant line
            else:
                fields = str.split(line, maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                # e.g. "20  10000598  .  T  A,<NON_REF>  1754.77  .  DP=54;..."
                alt_allele = fields[4].replace(' ', '').split(',')[0]

                a = map_coordinates(mapping, chrom, start, end, '+', chrom_style=cstyle)
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    # update start coordinate
                    fields[1] = target_start + 1
                    # update ref allele
                    try:
                        target_chr = update_chromID(refFasta.references[0], target_chr)
                        fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                    except:
                        print(line + "\tFail(No_targetRef)", file=UNMAP)
                        failed_var += 1
                        continue
                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(alt_allele, True) + ',<NON_REF>'

                    # check if ref_allele is the same as alt_allele
                    if noCompAllele:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        if fields[3] != fields[4]:
                            print('\t'.join(map(str, fields)), file=FILE_OUT)
                        else:
                            print(line + "\tFail(REF==ALT)", file=UNMAP)
                            failed_var += 1
                else:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue

    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total variants: %d" % total_var)
    logging.info("Variants failed to map: %d" % failed_var)
    logging.info("Total non-variant regions: %d" % total_region)
    logging.info("Non-variant regions failed to map: %d" % failed_region)

    if compress:
        try:
            logging.info("Compressing \"%s\" ..." % outfile)
            subprocess.call("gzip " + outfile, shell=True)
        except:
            pass


def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome, ref_name):
    '''
    Convert genome coordinates in MAF (Mutation Annotation Format) format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.
    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2)
        file, a local file, or a URL (http://, https://, ftp://) pointing to a remote file.
    outfile : str
        Prefix of output files.
    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a regular or
        compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, a local file, or a URL (http://,
        https://, ftp://) pointing to a remote file.
    refgenome : file
        The genome sequence file of the 'target' assembly in FASTA format.
    ref_name : str
        The NCBI build name of the target assembly, for example, "GRCh37", "GRCh38".
    '''
    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getctime(refgenome + '.fai') < os.path.getctime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total = 0
    fail = 0

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # meta-information lines needed in both mapped and unmapped files
        if line.startswith('#'):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            continue
        elif line.startswith('Hugo_Symbol'):
            print("#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s" % ("CrossMap", __version__, datetime.date.today().strftime("%B%d,%Y"), liftoverfile, refgenome), file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            logging.info("Lifting over ... ")
        else:
            fields = str.split(line, sep='\t')
            total += 1

            fields[3] = ref_name
            chrom = fields[4]
            start = int(fields[5]) - 1  # 0 based
            end = int(fields[6])
            # strand = fields[7]

            a = map_coordinates(mapping, chrom, start, end, '+')
            if a is None:
                print(line, file=UNMAP)
                fail += 1
                continue

            if len(a) == 2:
                target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                target_start = a[1][1]
                target_end = a[1][2]

                # update chrom
                fields[4] = target_chr
                # update start coordinate
                fields[5] = target_start + 1
                # update end
                fields[6] = target_end
                # update ref allele
                try:
                    target_chr = update_chromID(refFasta.references[0], target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start, target_end).upper()
                except:
                    print(line, file=UNMAP)
                    fail += 1
                    continue
                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)
            else:
                print(line, file=UNMAP)
                fail += 1
                continue

    FILE_OUT.close()
    UNMAP.close()
    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
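

# Coordinate-convention sketch for the MAF conversion above: MAF Start_Position / End_Position are
# 1-based and inclusive, while map_coordinates() is handed a 0-based half-open interval, hence the
# "- 1" on the way in and the "target_start + 1" on the way out. All numbers below are hypothetical.
def _maf_coordinate_sketch():
    maf_start, maf_end = 10000, 10000        # 1-based, inclusive (a single-base variant)
    zero_based_start = maf_start - 1         # 9999, what the liftover interval starts at
    zero_based_end = maf_end                 # 10000, half-open end of the same interval
    # ... after liftover, a target interval (target_start, target_end) comes back ...
    target_start, target_end = 10500, 10501  # placeholder values
    new_maf_start = target_start + 1         # back to 1-based for the output MAF
    new_maf_end = target_end
    return (zero_based_start, zero_based_end, new_maf_start, new_maf_end)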


def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in GVCF format.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.
    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2)
        file, a local file, or a URL (http://, https://, ftp://) pointing to a remote file.
    outfile : str
        Prefix of output files.
    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format file. Can be a regular or
        compressed (*.gz, *.Z, *.z, *.bz, *.bz2, *.bzip2) file, a local file, or a URL (http://,
        https://, ftp://) pointing to a remote file.
    refgenome : file
        The genome sequence file of the 'target' assembly in FASTA format.
    '''
    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    FILE_OUT = open(outfile, 'w')
    UNMAP = open(outfile + '.unmap', 'w')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    for line in ireader.reader(infile):
        if not line.strip():
            continue
        line = line.strip()

        # meta-information lines needed in both mapped and unmapped files
        if line.startswith(('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT', '##SAMPLE',
                            '##PEDIGREE', '##GVCFBlock', '##GATKCommandLine', '##source')):
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
        # meta-information lines needed in unmapped files
        elif line.startswith('##assembly'):
            print(line, file=UNMAP)
        elif line.startswith('##contig'):
            print(line, file=UNMAP)
            if 'ID=chr' in line:
                withChr = True
        # update contig information
        elif line.startswith('#CHROM'):
            printlog(["Updating contig field ... "])
            target_gsize = dict(list(zip(refFasta.references, refFasta.lengths)))
            for chr_id in sorted(target_gsize):
                if chr_id.startswith('chr'):
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id.replace('chr', ''), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                else:
                    if withChr is True:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % ('chr' + chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)
                    else:
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (chr_id, target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

            print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
            print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
            print("##originalFile=<%s>" % infile, file=FILE_OUT)
            print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
            print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
            print(line, file=FILE_OUT)
            print(line, file=UNMAP)
            printlog(["Lifting over ... "])

        else:
            if line.startswith('#'):
                continue

            # process non-variant region
            if 'END=' in line:
                fields = str.split(line, maxsplit=8)
                total_region += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based
                try:
                    m = re.search(r"END\=(\d+)", line)
                    end = int(m[1])
                except:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_region += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    # update start coordinate
                    fields[1] = target_start + 1
                    # update END
                    fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)

            # process variant line
            else:
                fields = str.split(line, maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                # e.g. "20  10000598  .  T  A,<NON_REF>  1754.77  .  DP=54;..."
                alt_allele = fields[4].replace(' ', '').split(',')[0]

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue
                if len(a) == 2:
                    # update chrom
                    target_chr = str(a[1][0])  # target_chr is from chain file, could be 'chr1' or '1'
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    # update start coordinate
                    fields[1] = target_start + 1
                    # update ref allele
                    target_chr = update_chromID(refFasta.references[0], target_chr)
                    fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(alt_allele, True) + ',<NON_REF>'

                    # ref_allele and alt_allele are different
                    if fields[3] != alt_allele:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        failed_var += 1
                else:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue

    FILE_OUT.close()
    UNMAP.close()
    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])