def _bam_clear_position(alignment):
    '''Reset RNAME/POS/MAPQ (columns 3-5) to the "no placement" values.'''
    alignment.reference_id = -1
    alignment.reference_start = 0
    alignment.mapping_quality = 255


def _bam_clear_mate(alignment):
    '''Reset RNEXT/PNEXT/TLEN (columns 7-9) to the "no mate placement" values.'''
    alignment.next_reference_id = -1
    alignment.next_reference_start = 0
    alignment.template_length = 0


def _bam_set_mate(alignment, name_to_id, mate_hit):
    '''Record the lifted-over mate (chrom, start, end, strand) in columns 2 and 7-8.'''
    if mate_hit[3] == '-':
        alignment.flag |= 0x20
    alignment.next_reference_id = name_to_id[mate_hit[0]]
    alignment.next_reference_start = mate_hit[1]


def _bam_place_at_mate(new_alignment, old_alignment, name_to_id, mate_hit,
                       mapq):
    '''
    Give a read without its own placement the RNAME/POS of its lifted-over
    mate, keep the original CIGAR, and set TLEN to 0.
    '''
    _bam_set_mate(new_alignment, name_to_id, mate_hit)
    new_alignment.reference_id = name_to_id[mate_hit[0]]
    new_alignment.reference_start = mate_hit[1]
    new_alignment.mapping_quality = mapq
    new_alignment.cigartuples = old_alignment.cigartuples
    new_alignment.template_length = 0


def _bam_copy_oriented(new_alignment, old_alignment, strand_flipped):
    '''
    Copy the CIGAR from old to new. When the liftover switched strand, the
    CIGAR and qualities are reversed and the sequence reverse-complemented.
    '''
    cigar = old_alignment.cigartuples
    if not strand_flipped:
        new_alignment.cigartuples = cigar
        return
    new_alignment.cigartuples = cigar[::-1] if cigar is not None else None
    # Keep this order: the sequence is assigned before the qualities, exactly
    # as in every branch of the original code.
    new_alignment.query_sequence = revcomp_DNA(old_alignment.query_sequence)
    quals = old_alignment.query_qualities
    # Qualities can be absent (None); slicing None would raise TypeError.
    new_alignment.query_qualities = quals[::-1] if quals is not None else None


def _bam_liftover_mate(mapping, samfile, old_alignment):
    '''
    Lift over the mate position stored on ``old_alignment``.

    Only the mate start is available on the read, so the 1-bp interval
    [start, start + 1) is converted. Returns the map_coordinates() result,
    or None when the lookup or the conversion raises.
    '''
    try:
        mate_chr = samfile.get_reference_name(old_alignment.next_reference_id)
        mate_strand = '-' if old_alignment.mate_is_reverse else '+'
        mate_start = old_alignment.next_reference_start
        return map_coordinates(mapping, mate_chr, mate_start, mate_start + 1,
                               mate_strand)
    except Exception:
        return None


def crossmap_bam_file(mapping,
                      chainfile,
                      infile,
                      outfile_prefix,
                      chrom_size,
                      IS_size=200,
                      IS_std=30.0,
                      fold=3,
                      addtag=True):
    '''
    Description
    -----------
    Convert genome coordinates (in BAM/SAM/CRAM format) between assemblies.
    BAM/SAM format: http://samtools.sourceforge.net/

    The input type is chosen from the file suffix ('.bam', '.cram', '.sam');
    any other suffix prints an error and exits with status 1, as does an
    input without a header. CRAM input is written out as BAM. When
    outfile_prefix is given and the output is BAM, the function then tries to
    sort it into '<prefix>.sorted.bam' and index that file; a failure of
    either step only logs a warning. A summary of the per-category counts is
    printed to stdout at the end.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.
    chainfile : file
        Input chain format file (only recorded in the output header).
    infile : file
        Input BAM, SAM or CRAM format file.
    outfile_prefix : str
        Output prefix. If None, alignments are written to stdout.
    chrom_size : dict
        Chromosome size of the *target* assembly, used to build bam header.
    IS_size : int
        Average insert size of pair-end sequencing.
    IS_std : float
        Standard deviation of insert size.
    fold : float
        A mapped pair is considered as "proper pair" if both ends mapped to
        different strand and the distance between them is less than
        fold * stdev from the mean.
    addtag : bool
        If True, a two-letter tag (value 0) is added to each alignment.

        tags for pair-end sequencing include:
            QF: QC failed
            NN: both read1 and read2 unmapped
            NU: read1 unmapped, read2 unique mapped
            NM: read1 unmapped, read2 multiple mapped
            UN: read1 uniquely mapped, read2 unmapped
            UU: both read1 and read2 uniquely mapped
            UM: read1 uniquely mapped, read2 multiple mapped
            MN: read1 multiple mapped, read2 unmapped
            MU: read1 multiple mapped, read2 unique mapped
            MM: both read1 and read2 multiple mapped
        tags for single-end sequencing include:
            SN: unmapped
            SM: multiple mapped
            SU: uniquely mapped

        "unmapped" covers both originally unmapped reads and reads that
        failed to lift over.
    '''
    # determine the input file format (BAM, CRAM or SAM) from the suffix
    lowered = infile.lower()
    if lowered.endswith('.bam'):
        file_type, read_mode = 'BAM', 'rb'
    elif lowered.endswith('.cram'):
        file_type, read_mode = 'CRAM', 'rc'
    elif lowered.endswith('.sam'):
        file_type, read_mode = 'SAM', 'r'
    else:
        print(
            "Unknown file type! Input file must have suffix '.bam','.cram', or '.sam'.",
            file=sys.stderr)
        sys.exit(1)

    comments = ['ORIGINAL_%s_FILE=%s' % (file_type, infile)]
    samfile = pysam.Samfile(infile, read_mode)
    if len(samfile.header) == 0:
        print("%s file has no header section. Exit!" % file_type,
              file=sys.stderr)
        sys.exit(1)
    comments.append('CHAIN_FILE=' + chainfile)

    sam_ori_header = samfile.header.to_dict()

    # chromosome ID style of the original BAM file: either 'chr1' or '1'
    chrom_style = sam_ori_header['SQ'][0]['SN']

    # rename the target chromosomes to the style used by the input file
    target_chrom_sizes = {
        update_chromID(chrom_style, name): size
        for name, size in chrom_size.items()
    }

    (new_header, name_to_id) = sam_header.bam_header_generator(
        orig_header=sam_ori_header,
        chrom_size=target_chrom_sizes,
        prog_name="CrossMap",
        prog_ver=__version__,
        format_ver=1.0,
        sort_type='coordinate',
        co=comments)

    if outfile_prefix is not None:
        # write to file; CRAM input is converted to BAM output
        if file_type == 'SAM':
            out_name, write_mode = outfile_prefix + '.sam', "wh"
        else:
            out_name, write_mode = outfile_prefix + '.bam', "wb"
        OUT_FILE = pysam.Samfile(out_name, write_mode, header=new_header)
        printlog(
            ["Liftover %s file:" % file_type, infile, '==>', out_name])
    else:
        # write to screen (SAM uses mode "w" here, unlike "wh" for files)
        write_mode = "w" if file_type == 'SAM' else "wb"
        OUT_FILE = pysam.Samfile('-', write_mode, header=new_header)
        printlog(["Liftover %s file:" % file_type, infile])

    counts = dict.fromkeys(('QF', 'NN', 'NU', 'NM', 'UN', 'UU', 'UM', 'MN',
                            'MU', 'MM', 'SN', 'SM', 'SU'), 0)

    def emit(alignment, code):
        # count the category, optionally tag the read, then write it
        counts[code] += 1
        if addtag:
            alignment.set_tag(tag=code, value=0)
        OUT_FILE.write(alignment)

    total_item = 0
    try:
        for old_alignment in samfile:
            total_item += 1
            new_alignment = pysam.AlignedRead()  # create AlignedRead object

            new_alignment.query_name = old_alignment.query_name  # 1st column
            new_alignment.query_sequence = old_alignment.query_sequence  # 10th
            new_alignment.query_qualities = old_alignment.query_qualities  # 11th
            new_alignment.set_tags(old_alignment.get_tags())  # 12th and up

            # by default pysam will change RG:Z to RG:A, which can cause
            # downstream failures with GATK and freebayes, so the RG tag is
            # re-set with its original value type.
            try:
                rg, rgt = old_alignment.get_tag("RG", with_value_type=True)
            except KeyError:
                pass
            else:
                new_alignment.set_tag("RG", str(rg), rgt)

            # ==================================
            # Pair-end sequencing
            # ==================================
            if old_alignment.is_paired:
                new_alignment.flag = 0x1
                if old_alignment.is_read1:
                    new_alignment.flag |= 0x40
                elif old_alignment.is_read2:
                    new_alignment.flag |= 0x80

                # ---- QC failed: written without any placement
                if old_alignment.is_qcfail:
                    new_alignment.flag |= 0x200
                    _bam_clear_position(new_alignment)
                    new_alignment.cigartuples = old_alignment.cigartuples
                    _bam_clear_mate(new_alignment)
                    emit(new_alignment, 'QF')
                    continue

                if old_alignment.mate_is_unmapped:
                    read2_maps = None
                else:
                    read2_maps = _bam_liftover_mate(mapping, samfile,
                                                    old_alignment)

                # ---- R1 originally unmapped
                if old_alignment.is_unmapped:
                    new_alignment.flag |= 0x4
                    _bam_clear_position(new_alignment)
                    new_alignment.cigartuples = old_alignment.cigartuples

                    if read2_maps is None:
                        # R2 unmapped too, or R2 failed to liftover
                        _bam_clear_mate(new_alignment)
                        emit(new_alignment, 'NN')
                    elif len(read2_maps) == 2:
                        # R2 unique: it is recommended to set RNAME/POS of
                        # an unmapped read to its mate's
                        _bam_place_at_mate(new_alignment, old_alignment,
                                           name_to_id, read2_maps[1],
                                           old_alignment.mapping_quality)
                        emit(new_alignment, 'NU')
                    else:
                        # R2 multiple
                        new_alignment.flag |= 0x100
                        _bam_place_at_mate(new_alignment, old_alignment,
                                           name_to_id, read2_maps[1],
                                           old_alignment.mapping_quality)
                        emit(new_alignment, 'NM')
                    continue

                # ---- R1 originally mapped
                try:
                    read1_chr = samfile.get_reference_name(
                        old_alignment.reference_id)
                    read1_strand = '-' if old_alignment.is_reverse else '+'
                    read1_maps = map_coordinates(
                        mapping, read1_chr, old_alignment.reference_start,
                        old_alignment.reference_end, read1_strand)
                except Exception:
                    read1_maps = None

                # R1 failed to liftover
                if read1_maps is None:
                    # The read no longer has a placement of its own, so it is
                    # flagged unmapped (0x4) in all three cases, matching the
                    # "originally unmapped" branch and the NN/NU/NM tags.
                    new_alignment.flag |= 0x4
                    if read2_maps is None:
                        _bam_clear_position(new_alignment)
                        new_alignment.cigartuples = old_alignment.cigartuples
                        _bam_clear_mate(new_alignment)
                        emit(new_alignment, 'NN')
                    elif len(read2_maps) == 2:
                        _bam_place_at_mate(new_alignment, old_alignment,
                                           name_to_id, read2_maps[1],
                                           old_alignment.mapping_quality)
                        emit(new_alignment, 'NU')
                    else:
                        new_alignment.flag |= 0x100
                        # mapq not available
                        _bam_place_at_mate(new_alignment, old_alignment,
                                           name_to_id, read2_maps[1], 255)
                        emit(new_alignment, 'NM')
                    continue

                # R1 uniquely mapped
                if len(read1_maps) == 2:
                    read1_hit = read1_maps[1]
                    if read1_hit[3] == '-':
                        new_alignment.flag |= 0x10
                    new_alignment.reference_id = name_to_id[read1_hit[0]]
                    new_alignment.reference_start = read1_hit[1]
                    new_alignment.mapping_quality = old_alignment.mapping_quality
                    _bam_copy_oriented(new_alignment, old_alignment,
                                       read1_maps[0][3] != read1_hit[3])

                    if read2_maps is None:
                        # R2 unmapped before or after conversion: the mate
                        # columns point at R1's own placement
                        new_alignment.flag |= 0x8
                        new_alignment.next_reference_id = name_to_id[
                            read1_hit[0]]
                        new_alignment.next_reference_start = read1_hit[1]
                        new_alignment.template_length = 0
                        emit(new_alignment, 'UN')
                    elif len(read2_maps) == 2:
                        read2_hit = read2_maps[1]
                        _bam_set_mate(new_alignment, name_to_id, read2_hit)
                        new_alignment.template_length = abs(
                            new_alignment.reference_start -
                            new_alignment.next_reference_start
                        ) + old_alignment.reference_length
                        # proper pair: opposite strands and insert size
                        # within fold * IS_std of IS_size
                        if (read2_hit[3] != read1_hit[3]) and (
                                new_alignment.template_length <=
                                IS_size + fold * IS_std) and (
                                    new_alignment.template_length >=
                                    IS_size - fold * IS_std):
                            new_alignment.flag |= 0x2
                        emit(new_alignment, 'UU')
                    else:
                        new_alignment.flag |= 0x100
                        _bam_set_mate(new_alignment, name_to_id,
                                      read2_maps[1])
                        new_alignment.template_length = 0
                        emit(new_alignment, 'UM')
                    continue

                # R1 multiple mapped
                if len(read1_maps) > 2 and len(read1_maps) % 2 == 0:
                    read1_hit = read1_maps[1]
                    new_alignment.flag |= 0x100
                    if read1_hit[3] == '-':
                        new_alignment.flag |= 0x10
                    new_alignment.reference_id = name_to_id[read1_hit[0]]
                    new_alignment.reference_start = read1_hit[1]
                    new_alignment.mapping_quality = 255
                    _bam_copy_oriented(new_alignment, old_alignment,
                                       read1_maps[0][3] != read1_hit[3])

                    if read2_maps is None:
                        new_alignment.flag |= 0x8
                        new_alignment.next_reference_id = name_to_id[
                            read1_hit[0]]
                        new_alignment.next_reference_start = read1_hit[1]
                        new_alignment.template_length = 0
                        emit(new_alignment, 'MN')
                    elif len(read2_maps) == 2:
                        _bam_set_mate(new_alignment, name_to_id,
                                      read2_maps[1])
                        new_alignment.template_length = 0
                        emit(new_alignment, 'MU')
                    else:
                        # 0x100 is already set above for the multi-mapped R1
                        _bam_set_mate(new_alignment, name_to_id,
                                      read2_maps[1])
                        new_alignment.template_length = 0
                        emit(new_alignment, 'MM')
                    continue

                # Any other shape of read1_maps: the read is neither written
                # nor counted (unchanged behaviour).
                continue

            # ==================================
            # Single-end sequencing
            # ==================================
            _bam_clear_mate(new_alignment)

            # (1) originally unmapped
            if old_alignment.is_unmapped:
                new_alignment.flag |= 0x4
                _bam_clear_position(new_alignment)
                new_alignment.cigartuples = old_alignment.cigartuples
                emit(new_alignment, 'SN')
                continue

            new_alignment.flag = 0x0
            read_chr = samfile.get_reference_name(old_alignment.reference_id)
            read_strand = '-' if old_alignment.is_reverse else '+'
            # Not wrapped in try/except: an error here aborts the run, as in
            # the original code.
            read_maps = map_coordinates(mapping, read_chr,
                                        old_alignment.reference_start,
                                        old_alignment.reference_end,
                                        read_strand)

            # (2) unmapped after liftover
            if read_maps is None:
                new_alignment.flag |= 0x4
                _bam_clear_position(new_alignment)
                emit(new_alignment, 'SN')
                continue

            # (3) unique mapped
            if len(read_maps) == 2:
                read_hit = read_maps[1]
                if read_hit[3] == '-':
                    new_alignment.flag |= 0x10
                _bam_copy_oriented(new_alignment, old_alignment,
                                   read_maps[0][3] != read_hit[3])
                new_alignment.reference_id = name_to_id[read_hit[0]]
                new_alignment.reference_start = read_hit[1]
                new_alignment.mapping_quality = old_alignment.mapping_quality
                emit(new_alignment, 'SU')
                continue

            # (4) multiple mapped
            if len(read_maps) > 2 and len(read_maps) % 2 == 0:
                read_hit = read_maps[1]
                new_alignment.flag |= 0x100
                if read_hit[3] == '-':
                    new_alignment.flag |= 0x10
                _bam_copy_oriented(new_alignment, old_alignment,
                                   read_maps[0][3] != read_hit[3])
                new_alignment.reference_id = name_to_id[read_hit[0]]
                new_alignment.reference_start = read_hit[1]
                new_alignment.mapping_quality = old_alignment.mapping_quality
                emit(new_alignment, 'SM')
                continue

        printlog(["Done!"])
    finally:
        # release both handles even when a record aborts the conversion
        OUT_FILE.close()
        samfile.close()

    if outfile_prefix is not None:
        if file_type == "BAM" or file_type == "CRAM":
            unsorted_bam = outfile_prefix + '.bam'
            sorted_bam = outfile_prefix + '.sorted.bam'
            # sorting and indexing are best-effort: failures only warn
            try:
                printlog([
                    'Sort "%s" and save as "%s"' % (unsorted_bam, sorted_bam)
                ])
                pysam.sort("-o", sorted_bam, unsorted_bam)
            except Exception:
                printlog(["Warning: ", "output BAM file was NOT sorted"])
            try:
                printlog(['Index "%s" ...' % sorted_bam])
                pysam.index(sorted_bam, sorted_bam + '.bai')
            except Exception:
                printlog(["Warning: ", "output BAM file was NOT indexed."])

    print("Total alignments:" + str(total_item))
    print(" QC failed: " + str(counts['QF']))

    paired_codes = ('NN', 'NU', 'NM', 'UN', 'UU', 'UM', 'MN', 'MU', 'MM')
    if max(counts[code] for code in paired_codes) > 0:
        print(" Paired-end reads:")
        print("\tR1 unique, R2 unique (UU): " + str(counts['UU']))
        print("\tR1 unique, R2 unmapp (UN): " + str(counts['UN']))
        print("\tR1 unique, R2 multiple (UM): " + str(counts['UM']))
        print("\tR1 multiple, R2 multiple (MM): " + str(counts['MM']))
        print("\tR1 multiple, R2 unique (MU): " + str(counts['MU']))
        print("\tR1 multiple, R2 unmapped (MN): " + str(counts['MN']))
        print("\tR1 unmap, R2 unmap (NN): " + str(counts['NN']))
        print("\tR1 unmap, R2 unique (NU): " + str(counts['NU']))
        print("\tR1 unmap, R2 multiple (NM): " + str(counts['NM']))
    if max(counts['SN'], counts['SU'], counts['SM']) > 0:
        print(" Single-end reads:")
        print("\tUniquley mapped (SU): " + str(counts['SU']))
        print("\tMultiple mapped (SM): " + str(counts['SM']))
        print("\tUnmapped (SN): " + str(counts['SN']))
def crossmap_vcf_file(mapping,
                      infile,
                      outfile,
                      liftoverfile,
                      refgenome,
                      noCompAllele=False,
                      compress=False):
    '''
    Convert genome coordinates in VCF format.

    Lifted-over records are written to ``outfile``; records that cannot be
    converted are written to ``outfile + '.unmap'`` with a trailing
    "Fail(...)" column (Unmap, Multiple_hits, KeyError, REF==ALT). The
    ##contig header lines of the output are regenerated from the target
    reference genome.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.
    infile : file
        Input file in VCF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://,
        https://, ftp://) pointing to remote file.
    outfile : str
        Name of the output VCF file.
    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Only recorded in the output header by this function.
    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format. It is
        indexed first if '<refgenome>.fai' does not exist.
    noCompAllele : bool
        If False (default), a variant whose updated ref_allele equals its
        alt_allele after liftover is written to the unmap file as
        "Fail(REF==ALT)". If True, no such comparison is made and the
        variant is kept.
    compress : bool
        If True, run "gzip" on ``outfile`` after it has been written.
    '''
    if noCompAllele:
        printlog(
            ["Keep variants [reference_allele == alternative_allele] ..."])
    else:
        printlog([
            "Filter out variants [reference_allele == alternative_allele] ..."
        ])

    # index refgenome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines needed in both mapped and unmapped files
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE')
    # \b keeps other INFO keys that merely end in "END" (e.g. CIEND) intact
    end_pattern = re.compile(r'\bEND=\d+')

    total = 0
    fail = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for line in ireader.reader(infile):
                line = line.strip()
                if not line:
                    continue

                if line.startswith(shared_meta):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)

                # meta-information lines needed in unmapped files only
                elif line.startswith('##assembly'):
                    print(line, file=UNMAP)
                elif line.startswith('##contig'):
                    print(line, file=UNMAP)
                    if 'ID=chr' in line:
                        withChr = True

                # column header: emit the regenerated contig lines first
                elif line.startswith('#CHROM'):
                    printlog(["Updating contig field ... "])
                    target_gsize = dict(
                        zip(refFasta.references, refFasta.lengths))
                    assembly = os.path.basename(refgenome)
                    for chr_id in sorted(target_gsize):
                        # follow the 'chr' style seen in the input contigs
                        if chr_id.startswith('chr'):
                            out_id = chr_id if withChr else chr_id.replace(
                                'chr', '')
                        else:
                            out_id = 'chr' + chr_id if withChr else chr_id
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (out_id, target_gsize[chr_id], assembly),
                              file=FILE_OUT)

                    print(
                        "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                        % __version__,
                        file=FILE_OUT)
                    print("##liftOverChainFile=<%s>" % liftoverfile,
                          file=FILE_OUT)
                    print("##originalFile=<%s>" % infile, file=FILE_OUT)
                    print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                    print("##liftOverDate=<%s>" %
                          datetime.date.today().strftime("%B%d,%Y"),
                          file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    printlog(["Lifting over ... "])

                else:
                    # any other header line is dropped
                    if line.startswith('#'):
                        continue

                    # columns 1-7 are split out; the rest stays in fields[7]
                    fields = str.split(line, maxsplit=7)
                    total += 1

                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based
                    end = start + len(fields[3])
                    a = map_coordinates(mapping, chrom, start, end, '+')

                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        fail += 1
                        continue

                    if len(a) != 2:
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        fail += 1
                        continue

                    # target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]

                    # update chrom and start coordinate
                    fields[0] = target_chr
                    fields[1] = target_start + 1

                    # update ref allele from the target genome sequence
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    try:
                        fields[3] = refFasta.fetch(target_chr, target_start,
                                                   target_end).upper()
                    except Exception:
                        print(line + "\tFail(KeyError)", file=UNMAP)
                        fail += 1
                        continue

                    # update END if any
                    if len(fields) > 7:
                        fields[7] = end_pattern.sub('END=' + str(target_end),
                                                    fields[7])

                    if a[1][3] == '-':
                        fields[4] = revcomp_DNA(fields[4], True)

                    # check if ref_allele is the same as alt_allele
                    if noCompAllele or fields[3] != fields[4]:
                        print('\t'.join(map(str, fields)), file=FILE_OUT)
                    else:
                        print(line + "\tFail(REF==ALT)", file=UNMAP)
                        fail += 1
    finally:
        refFasta.close()

    printlog(["Total entries:", str(total)])
    printlog(["Failed to map:", str(fail)])

    if compress:
        printlog(["Compressing \"%s\" ..." % outfile])
        try:
            # argument list instead of a shell string: file names containing
            # spaces or shell metacharacters are passed through verbatim
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # compression stays best-effort
            printlog(["Warning: ", "failed to run gzip: %s" % err])
def crossmap_bed_file(mapping,
                      inbed,
                      outfile=None,
                      unmapfile=None,
                      cstyle='a'):
    '''
    Convert genome coordinates (in bed format) between assemblies.
    BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    Lines with fewer than 12 columns are converted as a single interval
    (and may be split into several output lines). Lines with exactly 12 or
    20 columns are converted exon by exon and fail as a whole unless every
    exon maps uniquely to the same chromosome and strand.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value.
    inbed : file
        Input BED file.
    outfile : str, optional
        Name of the output file. If None, results (mapped and unmapped) are
        printed to stdout.
    unmapfile : str, optional
        Name of file to save unmapped entries; defaults to
        ``outfile + '.unmap'``. This option will be ignored if outfile is
        None.
    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
            style of the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".

    Raises
    ------
    Exception
        If a 12- or 20-column line has a strand other than '+' or '-'.
    '''
    FILE_OUT = None
    UNMAP = None

    def reject(bed_line, message, reason):
        # malformed line: complain on stderr and, when writing to files,
        # record the line together with the reason in the unmap file
        print(message + bed_line, file=sys.stderr)
        if UNMAP is not None:
            print(bed_line + '\t' + reason, file=UNMAP)

    try:
        # If 'outfile' is not set, print to screen; if set, print to file
        if outfile is not None:
            FILE_OUT = open(outfile, 'w')
            if unmapfile is not None:
                UNMAP = open(unmapfile, 'w')
            else:
                UNMAP = open(outfile + '.unmap', 'w')

        for line in ireader.reader(inbed):
            if line.startswith(('#', 'track', 'browser')):
                continue
            if not line.strip():
                continue
            line = line.strip()
            fields = line.split()
            strand = '+'

            # filter out line less than 3 columns
            if len(fields) < 3:
                reject(line, "Less than 3 fields. skip ", 'InvalidBedFormat')
                continue
            try:
                int(fields[1])
            except ValueError:
                reject(line, "Start coordinate is not an integer. skip ",
                       'InvalidStartPosition')
                continue
            try:
                int(fields[2])
            except ValueError:
                reject(line, "End coordinate is not an integer. skip ",
                       'InvalidEndPosition')
                continue
            if int(fields[1]) > int(fields[2]):
                reject(line,
                       "\"Start\" is larger than \"End\" coordinate. skip ",
                       'Start>End')
                continue

            # deal with bed less than 12 columns
            if len(fields) < 12:
                # any column that is exactly '+' or '-' is taken as the
                # strand (the last such column wins)
                for f in fields:
                    if f in ['+', '-']:
                        strand = f

                chrom = fields[0]
                start = int(fields[1])
                end = int(fields[2])

                a = map_coordinates(mapping,
                                    chrom,
                                    start,
                                    end,
                                    strand,
                                    chrom_style=cstyle)

                try:
                    if (a is None) or (len(a) % 2 != 0):
                        if outfile is None:
                            print(line + '\tUnmap')
                        else:
                            print(line + '\tUnmap', file=UNMAP)
                        continue

                    if len(a) == 2:
                        # reset fields
                        fields[0] = a[1][0]
                        fields[1] = a[1][1]
                        fields[2] = a[1][2]
                        # update the strand information
                        for i, value in enumerate(fields):
                            if value in ['+', '-']:
                                fields[i] = a[1][3]

                        if outfile is None:
                            print(line + '\t->\t' +
                                  '\t'.join([str(i) for i in fields]))
                        else:
                            print('\t'.join([str(i) for i in fields]),
                                  file=FILE_OUT)

                    if len(a) > 2:
                        # split hit: odd indices hold the target intervals,
                        # the preceding even index the matching source part
                        count = 0
                        for j in range(1, len(a), 2):
                            count += 1
                            fields[0] = a[j][0]
                            fields[1] = a[j][1]
                            fields[2] = a[j][2]
                            # update the strand information
                            for i, value in enumerate(fields):
                                if value in ['+', '-']:
                                    fields[i] = a[j][3]

                            if outfile is None:
                                print(line + '\t' + '(split.' + str(count) +
                                      ':' +
                                      ':'.join([str(i) for i in a[j - 1]]) +
                                      ')\t' +
                                      '\t'.join([str(i) for i in fields]))
                            else:
                                print('\t'.join([str(i) for i in fields]),
                                      file=FILE_OUT)
                except Exception:
                    if outfile is None:
                        print(line + '\tFail')
                    else:
                        print(line + '\tFail', file=UNMAP)
                    continue

            # deal with bed12 and bed12+8 (genePred format)
            # NOTE(review): lines with 13-19 or more than 20 columns match
            # neither branch and are skipped without any output.
            if len(fields) == 12 or len(fields) == 20:
                strand = fields[5]
                if strand not in ['+', '-']:
                    raise Exception(
                        "Unknown strand: %s. Can only be '+' or '-'." % strand)
                fail_flag = False
                # [[chr,st,end],[chr,st,end],...]
                exons_old_pos = annoGene.getExonFromLine(line)
                exons_new_pos = []
                for e_chr, e_start, e_end in exons_old_pos:
                    # a has two elements, first is query, 2nd is target, e.g.
                    # [('chr1', 246974830, 246974833,'+'),
                    #  ('chr1', 248908207, 248908210,'+')]
                    a = map_coordinates(mapping,
                                        e_chr,
                                        e_start,
                                        e_end,
                                        strand,
                                        chrom_style=cstyle)
                    # every exon must map, and map to exactly one place
                    if a is None or len(a) != 2:
                        fail_flag = True
                        break
                    exons_new_pos.append(a[1])

                if not fail_flag:
                    # check if all exons were mapped to the same chromosome
                    # and the same strand
                    chr_id = {exon[0] for exon in exons_new_pos}
                    exon_strand = {exon[3] for exon in exons_new_pos}
                    if len(chr_id) != 1 or len(exon_strand) != 1:
                        fail_flag = True

                if not fail_flag:
                    # build new bed: the CDS offsets from both transcript
                    # ends are carried over unchanged
                    cds_start_offset = int(fields[6]) - int(fields[1])
                    cds_end_offset = int(fields[2]) - int(fields[7])
                    new_chrom = exons_new_pos[0][0]
                    new_chrom_st = exons_new_pos[0][1]
                    new_chrom_end = exons_new_pos[-1][2]
                    new_name = fields[3]
                    new_score = fields[4]
                    new_strand = exons_new_pos[0][3]
                    new_thickStart = new_chrom_st + cds_start_offset
                    new_thickEnd = new_chrom_end - cds_end_offset
                    new_ittemRgb = fields[8]
                    new_blockCount = len(exons_new_pos)
                    new_blockSizes = ','.join(
                        [str(o - n) for m, n, o, p in exons_new_pos])
                    new_blockStarts = ','.join([
                        str(n - new_chrom_st) for m, n, o, p in exons_new_pos
                    ])
                    new_bedline = '\t'.join(
                        str(i)
                        for i in (new_chrom, new_chrom_st, new_chrom_end,
                                  new_name, new_score, new_strand,
                                  new_thickStart, new_thickEnd, new_ittemRgb,
                                  new_blockCount, new_blockSizes,
                                  new_blockStarts))
                    if check_bed12(new_bedline) is False:
                        fail_flag = True
                    else:
                        if outfile is None:
                            print(line + '\t->\t' + new_bedline)
                        else:
                            print(new_bedline, file=FILE_OUT)

                if fail_flag:
                    # unlike the short-BED branch, the unmap file gets the
                    # bare line without a reason column
                    if outfile is None:
                        print(line + '\tFail')
                    else:
                        print(line, file=UNMAP)
    finally:
        # make sure buffered output reaches disk and handles are released
        if FILE_OUT is not None:
            FILE_OUT.close()
        if UNMAP is not None:
            UNMAP.close()
def crossmap_gff_file(mapping, ingff, outfile=None, cstyle='a'):
    '''
    Description
    -----------
    Convert genome coordinates (in GFF/GTF format) between assemblies.

    GFF (General Feature Format) lines have nine required fields that must be
    Tab-separated:

    1. seqname - The name of the sequence. Must be a chromosome or scaffold.
    2. source - The program that generated this feature.
    3. feature - The name of this type of feature. Some examples of standard
       feature types are "CDS", "start_codon", "stop_codon", and "exon".
    4. start - The starting position of the feature in the sequence. The first
       base is numbered 1.
    5. end - The ending position of the feature (inclusive).
    6. score - A score between 0 and 1000. If the track line useScore
       attribute is set to 1 for this annotation data set, the score value
       will determine the level of gray in which this feature is displayed
       (higher numbers = darker gray). If there is no score value, enter ".".
    7. strand - Valid entries include '+', '-', or '.' (for don't know/don't
       care).
    8. frame - If the feature is a coding exon, frame should be a number
       between 0-2 that represents the reading frame of the first base. If the
       feature is not a coding exon, the value should be '.'.
    9. group - All lines with the same group are linked together into a single
       item.

    GFF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format3

    GTF (Gene Transfer Format) is a refinement to GFF that tightens the
    specification. The first eight GTF fields are the same as GFF. The group
    field has been expanded into a list of attributes. Each attribute consists
    of a type/value pair. Attributes must end in a semi-colon, and be separated
    from any following attribute by exactly one space.

    GTF format: http://genome.ucsc.edu/FAQ/FAQformat.html#format4

    We do NOT check if features (exon, CDS, etc) originally belonging to the
    same gene were converted into the same chromosome/strand.

    A feature is reported as converted only when it maps to exactly one target
    interval whose length equals the length of the original feature. Every
    other record is reported as failed.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    ingff : file
        Input GFF/GTF file.

    outfile : str, optional
        Name of the output file. Converted records are written to *outfile*
        and failed records to "<outfile>.<8 random characters>.unmap". When
        None, everything is printed to stdout instead.

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
              style of the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''
    mapped_out = None
    unmap_out = None

    def report_failure(record, reason):
        # On stdout the reason is appended to the record; the unmap file
        # receives the record unchanged.
        if unmap_out is None:
            print(record + '\t' + reason)
        else:
            print(record, file=unmap_out)

    try:
        if outfile is not None:
            rand_str = ''.join(
                random.choices(string.ascii_uppercase + string.digits, k=8))
            mapped_out = open(outfile, 'w')
            unmap_out = open(outfile + '.' + rand_str + '.unmap', 'w')

        for line in ireader.reader(ingff):
            if line.startswith(('#', 'track', 'browser', 'visibility')):
                continue
            if not line.strip():
                continue

            line = line.strip()
            fields = line.split('\t')

            try:
                start = int(fields[3]) - 1  # 0-based
                # Keep "end" an int: a float here would leak into the
                # coordinates handed to map_coordinates().
                end = int(fields[4])
            except (IndexError, ValueError):
                print('Cannot recognize \"start\" and \"end\" coordinates. Skip ' + line,
                      file=sys.stderr)
                if unmap_out is not None:
                    print(line, file=unmap_out)
                continue
            feature_size = end - start

            # A record too short to carry a strand column is handled like an
            # invalid strand instead of aborting the run with an IndexError.
            if len(fields) < 7 or fields[6] not in ['+', '-', '.']:
                print('Cannot recognize \"strand\". Skip ' + line,
                      file=sys.stderr)
                if unmap_out is not None:
                    print(line, file=unmap_out)
                continue

            # '.' (unknown strand) is lifted over as if it were '+'.
            strand = '-' if fields[6] == '-' else '+'
            chrom = fields[0]

            a = map_coordinates(mapping, chrom, start, end, strand,
                                chrom_style=cstyle)

            if a is None:
                report_failure(line, 'fail (no match to target assembly)')
                continue

            if len(a) != 2:
                report_failure(line,
                               'fail (multpile match to target assembly)')
                continue

            # a[0] is the query interval, a[1] the target interval.
            target = a[1]
            if (int(target[2]) - int(target[1])) != feature_size:
                # Not an exact match: reject the record and do NOT also
                # emit it as converted.
                report_failure(line, 'fail (not exact match)')
                continue

            fields[0] = target[0]  # chrom
            fields[3] = int(target[1]) + 1  # start, 1-based
            fields[4] = int(target[2])
            fields[6] = target[3]

            converted = '\t'.join([str(i) for i in fields])
            if mapped_out is None:
                print(line + '\t->\t' + converted)
            else:
                print(converted, file=mapped_out)
    finally:
        # Close whatever was opened, even if the conversion aborted.
        for handle in (mapped_out, unmap_out):
            if handle is not None:
                handle.close()
def crossmap_maf_file(mapping, infile, outfile, liftoverfile, refgenome, ref_name):
    '''
    Convert genome coordinates in MAF (mutation annotation format) format.

    For every data line, column 4 is set to *ref_name*, columns 5-7
    (chromosome, 1-based start, end) are replaced by the target coordinates
    and column 11 (reference allele) is re-read from *refgenome*. Lines that
    do not map to exactly one target interval, or whose reference allele
    cannot be fetched, are written unchanged to the unmap file.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in MAF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://,
        ftp://) pointing to remote file.

    outfile : str
        Name of the output file. Unmapped lines go to "<outfile>.unmap".

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2,
        *.bzip2) file, local file or URL (http://, https://, ftp://) pointing
        to remote file. Only used here to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    ref_name : str
        The NCBI build name of the target assembly, for example, "GRCh37",
        "GRCh38".
    '''
    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    # Compare modification times: ctime changes on metadata-only updates
    # (chmod, rename) and is not a reliable "content is newer" signal.
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info(
            "Index file is older than reference genome. Re-creating index for: %s"
            % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)
    total = 0
    fail = 0

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for line in ireader.reader(infile):
                if not line.strip():
                    continue
                line = line.strip()

                # meta-information lines needed in both mapped and unmapped files
                if line.startswith('#'):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    continue

                if line.startswith('Hugo_Symbol'):
                    # Column header: record the liftover provenance just
                    # above it in the mapped file.
                    print(
                        "#liftOver: Program=%sv%s, Time=%s, ChainFile=%s, NewRefGenome=%s"
                        % ("CrossMap", __version__,
                           datetime.date.today().strftime("%B%d,%Y"),
                           liftoverfile, refgenome),
                        file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    logging.info("Lifting over ... ")
                    continue

                fields = line.split('\t')
                total += 1

                fields[3] = ref_name
                chrom = fields[4]
                start = int(fields[5]) - 1  # 0 based
                end = int(fields[6])

                a = map_coordinates(mapping, chrom, start, end, '+')

                # Only a unique (one query, one target) hit is accepted.
                if a is None or len(a) != 2:
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                # target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]

                fields[4] = target_chr  # chrom
                fields[5] = target_start + 1  # start, back to 1-based
                fields[6] = target_end  # end

                # update ref allele from the target assembly
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[10] = refFasta.fetch(target_chr, target_start,
                                                target_end).upper()
                except Exception:
                    # e.g. the contig is missing from the reference FASTA
                    print(line, file=UNMAP)
                    fail += 1
                    continue

                # NOTE(review): the allele was just read from the target
                # reference; confirm that reverse-complementing it for
                # minus-strand hits is intended.
                if a[1][3] == '-':
                    fields[10] = revcomp_DNA(fields[10], True)
                print('\t'.join(map(str, fields)), file=FILE_OUT)
    finally:
        refFasta.close()

    logging.info("Total entries: %d", total)
    logging.info("Failed to map: %d", fail)
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome, noCompAllele = False, compress = False, cstyle = 'a'):
    '''
    Convert genome coordinates in GVCF format.

    Lines containing "END=" are treated as non-variant regions (block
    records); every other data line is treated as a variant. Records that
    cannot be lifted to exactly one target interval are written to
    "<outfile>.unmap" with a "Fail(...)" reason appended.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://,
        ftp://) pointing to remote file.

    outfile : str
        Name of the output file. Unmapped records go to "<outfile>.unmap".

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2,
        *.bzip2) file, local file or URL (http://, https://, ftp://) pointing
        to remote file. Only used here to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.

    noCompAllele : bool
        If True, ref_allele is NOT compared to alt_allele after liftover and
        the variant is always kept. If False (default), a variant whose
        lifted reference allele equals its first alternative allele is
        written to the unmap file as "Fail(REF==ALT)".

    compress : bool
        If True, run "gzip" on *outfile* after conversion (best effort).

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
              style of the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''
    # NOTE(review): this file appears to contain a second, later definition
    # of crossmap_gvcf_file(); if both are module-level, the later one is
    # the one callers get. Confirm which is meant to survive.

    if noCompAllele:
        logging.info("Keep variants [reference_allele == alternative_allele] ...")
    else:
        logging.info("Filter out variants [reference_allele == alternative_allele] ...")

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        logging.info("Creating index for: %s" % refgenome)
        pysam.faidx(refgenome)
    if os.path.getmtime(refgenome + '.fai') < os.path.getmtime(refgenome):
        logging.info("Index file is older than reference genome. Re-creating index for: %s" % refgenome)
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines copied to both the mapped and the unmapped file
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    # Style of the input contig IDs ('chr1' or '1'). Defaults to the short
    # style so that an input without any ##contig line still works.
    chr_template = '1'

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for line in ireader.reader(infile):
                if not line.strip():
                    continue
                line = line.strip()

                if line.startswith(shared_meta):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    continue

                # meta-information lines needed in unmapped files only
                if line.startswith('##assembly'):
                    print(line, file=UNMAP)
                    continue
                if line.startswith('##contig'):
                    print(line, file=UNMAP)
                    chr_template = 'chr1' if 'ID=chr' in line else '1'
                    continue

                # update contig information
                if line.startswith('#CHROM'):
                    logging.info("Updating contig field ... ")
                    target_gsize = dict(zip(refFasta.references,
                                            refFasta.lengths))
                    # Emit every contig of the target reference;
                    # update_chromID() takes care of the ID style.
                    for chr_id in sorted(target_gsize):
                        print("##contig=<ID=%s,length=%d,assembly=%s>" % (update_chromID(chr_template, chr_id, cstyle), target_gsize[chr_id], os.path.basename(refgenome)), file=FILE_OUT)

                    print("##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>" % __version__, file=FILE_OUT)
                    print("##liftOverChainFile=<%s>" % liftoverfile, file=FILE_OUT)
                    print("##originalFile=<%s>" % infile, file=FILE_OUT)
                    print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                    print("##liftOverDate=<%s>" % datetime.date.today().strftime("%B%d,%Y"), file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    logging.info("Lifting over ... ")
                    continue

                # any other header line is dropped
                if line.startswith('#'):
                    continue

                # process non-variant region
                if 'END=' in line:
                    fields = line.split(maxsplit=8)
                    total_region += 1
                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based
                    m = re.search(r"END\=(\d+)", line)
                    if m is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    end = int(m[1])

                    a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    if len(a) != 2:
                        # Report split regions instead of dropping them.
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        failed_region += 1
                        continue

                    # target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    fields[1] = target_start + 1  # back to 1-based
                    fields[7] = fields[7].replace(('END=' + str(end)), ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                    continue

                # process variant line
                fields = line.split(maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                # first ALT allele, e.g. "A" from: 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
                alt_allele = fields[4].replace(' ', '').split(',')[0]

                a = map_coordinates(mapping, chrom, start, end, '+', chrom_style = cstyle)
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue
                if len(a) != 2:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue

                # target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr
                fields[1] = target_start + 1  # back to 1-based

                # update ref allele from the target assembly
                try:
                    target_chr = update_chromID(refFasta.references[0], target_chr)
                    fields[3] = refFasta.fetch(target_chr, target_start, target_end).upper()
                except Exception:
                    print(line + "\tFail(No_targetRef)", file=UNMAP)
                    failed_var += 1
                    # Already reported as failed; do not also emit it.
                    continue

                # Symbolic alleles such as <NON_REF> are not sequence, so
                # they are left untouched on minus-strand hits.
                if a[1][3] == '-' and not alt_allele.startswith('<'):
                    alt_allele = revcomp_DNA(alt_allele, True)
                    fields[4] = alt_allele + ',<NON_REF>'

                # Compare REF with the strand-adjusted first ALT allele; the
                # whole ALT column ("X,<NON_REF>") would never equal REF.
                if noCompAllele or fields[3] != alt_allele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    failed_var += 1
    finally:
        refFasta.close()

    logging.info ("Total variants: %d" % total_var)
    logging.info ("Variants failed to map: %d" % failed_var)
    logging.info ("Total non-variant regions: %d" % total_region)
    logging.info ("Non-variant regions failed to map: %d" % failed_region)

    if compress:
        logging.info("Compressing \"%s\" ..." % outfile)
        try:
            # Argument list, no shell: safe for names with spaces or
            # shell metacharacters.
            subprocess.call(["gzip", outfile])
        except OSError as err:
            # Compression is best effort (e.g. gzip not installed).
            logging.warning("Failed to compress \"%s\": %s", outfile, err)
def crossmap_region_file(mapping, inbed, outfile=None, min_ratio=0.85, cstyle='a'):
    '''
    Convert large genomic regions (in bed format) between assemblies.
    BED format: http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    A region that maps as one piece is reported with "map_ratio=1.0000". A
    region that is split into several pieces is reported as the span from the
    smallest target start to the largest target end, provided that all pieces
    land on one chromosome and the fraction of query bases that mapped is at
    least *min_ratio*.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    inbed : file
        Input BED file.

    outfile : str, optional
        Name of the output file. Failed records go to "<outfile>.unmap".
        When None, everything is printed to stdout instead.

    min_ratio : float, optional
        Minimum ratio of query bases that must remap

    cstyle : str, optional
        Chromosome ID style. Must be one of ['a', 's', 'l'], where
        'a' : as-is. The chromosome ID of the output file is in the same
              style of the input file.
        's' : short ID, such as "1", "2", "X".
        'l' : long ID, such as "chr1", "chr2", "chrX".
    '''
    mapped_out = None
    unmap_out = None

    def report_invalid(record, message, tag):
        # Malformed input: complain on stderr, keep a copy in the unmap file.
        print(message + record, file=sys.stderr)
        if unmap_out is not None:
            print(record + '\t' + tag, file=unmap_out)

    def report_failure(text):
        # Liftover failure: stdout when no output file was requested.
        if unmap_out is None:
            print(text)
        else:
            print(text, file=unmap_out)

    try:
        # if 'outfile' was not set, print to screen; if set, print to file
        if outfile is not None:
            mapped_out = open(outfile, 'w')
            unmap_out = open(outfile + '.unmap', 'w')

        for line in ireader.reader(inbed):
            if line.startswith(('#', 'track', 'browser')):
                continue
            if not line.strip():
                continue
            line = line.strip()
            fields = line.split()

            # filter out line less than 3 columns
            if len(fields) < 3:
                report_invalid(line, "Less than 3 fields. skip ",
                               'InvalidBedFormat')
                continue
            try:
                start = int(fields[1])
            except ValueError:
                report_invalid(line,
                               "Start coordinate is not an integer. skip ",
                               'InvalidStartPosition')
                continue
            try:
                end = int(fields[2])
            except ValueError:
                report_invalid(line,
                               "End coordinate is not an integer. skip ",
                               'InvalidEndPosition')
                continue
            if start > end:
                report_invalid(
                    line,
                    "\"Start\" is larger than \"End\" coordinate. skip ",
                    'Start>End')
                continue

            # Strand defaults to '+'; the last '+'/'-' column wins.
            strand = '+'
            for f in fields:
                if f in ['+', '-']:
                    strand = f

            chrom = fields[0]
            total_query_length = end - start  # used to calculate map_ratio

            # input:  'chr1', 246974830, 247024835
            # output: alternating (query, target) 4-tuples, e.g.
            #   [('chr1', 246974830, 246974833, '+'), ('chr1', 248908207, 248908210, '+'),
            #    ('chr1', 247024833, 247024835, '+'), ('chr1', 249058210, 249058212, '+')]
            a = map_coordinates(mapping, chrom, start, end, strand,
                                chrom_style=cstyle)

            # None, empty or unpaired results all count as unmapped.
            if not a or len(a) % 2 != 0:
                report_failure(line + '\tFail\tUnmap')
                continue

            # one query/target pair: one-to-one match (i.e. 100% match)
            if len(a) == 2:
                # reset fields to target assembly
                fields[0] = a[1][0]
                fields[1] = a[1][1]
                fields[2] = a[1][2]
                for i in range(len(fields)):
                    # update the strand information
                    if fields[i] in ['+', '-']:
                        fields[i] = a[1][3]
                converted = '\t'.join([str(i) for i in fields]) + "\tmap_ratio=1.0000"
                if mapped_out is None:
                    print(line + '\t->\t' + converted)
                else:
                    print(converted, file=mapped_out)
                continue

            # Several pairs: each segment is a 100% match, but the whole
            # region is not. Check *min_ratio* of the query.
            a_query = a[::2]    # even positions: query segments
            a_target = a[1::2]  # odd positions: target segments
            a_query_mapped_nt = sum(i[2] - i[1] for i in a_query)
            a_target_chroms = set(i[0] for i in a_target)
            a_target_starts = [i[1] for i in a_target]
            a_target_ends = [i[2] for i in a_target]

            # Guard against a zero-length query region.
            if total_query_length:
                map_ratio = a_query_mapped_nt / total_query_length
            else:
                map_ratio = 0.0

            if map_ratio < min_ratio:
                report_failure(line + '\tFail' + ("\tmap_ratio=%.4f" % map_ratio))
                continue

            if len(a_target_chroms) != 1:
                report_failure(line + '\tFail\tCrossChroms')
                continue

            fields[0] = a_target_chroms.pop()
            fields[1] = min(a_target_starts)
            fields[2] = max(a_target_ends)
            converted = '\t'.join([str(i) for i in fields]) + ("\tmap_ratio=%.4f" % map_ratio)
            if mapped_out is None:
                print(line + '\t->\t' + converted)
            else:
                print(converted, file=mapped_out)
    finally:
        # Close whatever was opened, even if the conversion aborted.
        for handle in (mapped_out, unmap_out):
            if handle is not None:
                handle.close()
def crossmap_gvcf_file(mapping, infile, outfile, liftoverfile, refgenome):
    '''
    Convert genome coordinates in GVCF format.

    Lines containing "END=" are treated as non-variant regions (block
    records); every other data line is treated as a variant. Records that
    cannot be lifted to exactly one target interval, and variants whose
    lifted reference allele equals their first alternative allele, are
    written to "<outfile>.unmap" with a "Fail(...)" reason appended.

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    infile : file
        Input file in GVCF format. Can be a regular or compressed (*.gz, *.Z,
        *.z, *.bz, *.bz2, *.bzip2) file, local file or URL (http://, https://,
        ftp://) pointing to remote file.

    outfile : str
        Name of the output file. Unmapped records go to "<outfile>.unmap".

    liftoverfile : file
        Chain (https://genome.ucsc.edu/goldenPath/help/chain.html) format
        file. Can be a regular or compressed (*.gz, *.Z, *.z, *.bz, *.bz2,
        *.bzip2) file, local file or URL (http://, https://, ftp://) pointing
        to remote file. Only used here to annotate the output header.

    refgenome : file
        The genome sequence file of 'target' assembly in FASTA format.
    '''
    # NOTE(review): an earlier definition of crossmap_gvcf_file() with extra
    # keyword parameters appears in this file; if both are module-level,
    # this later definition replaces it. Confirm which is meant to survive.

    # index the reference genome file if it hasn't been done
    if not os.path.exists(refgenome + '.fai'):
        printlog(["Creating index for", refgenome])
        pysam.faidx(refgenome)

    refFasta = pysam.Fastafile(refgenome)

    # meta-information lines copied to both the mapped and the unmapped file
    shared_meta = ('##fileformat', '##INFO', '##FILTER', '##FORMAT', '##ALT',
                   '##SAMPLE', '##PEDIGREE', '##GVCFBlock',
                   '##GATKCommandLine', '##source')

    total_var = 0
    failed_var = 0
    total_region = 0
    failed_region = 0
    withChr = False  # check if the VCF data lines use 'chr1' or '1'

    try:
        with open(outfile, 'w') as FILE_OUT, \
                open(outfile + '.unmap', 'w') as UNMAP:
            for line in ireader.reader(infile):
                if not line.strip():
                    continue
                line = line.strip()

                if line.startswith(shared_meta):
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    continue

                # meta-information lines needed in unmapped files only
                if line.startswith('##assembly'):
                    print(line, file=UNMAP)
                    continue
                if line.startswith('##contig'):
                    print(line, file=UNMAP)
                    if 'ID=chr' in line:
                        withChr = True
                    continue

                # update contig information
                if line.startswith('#CHROM'):
                    printlog(["Updating contig field ... "])
                    target_gsize = dict(zip(refFasta.references,
                                            refFasta.lengths))
                    for chr_id in sorted(target_gsize):
                        has_prefix = chr_id.startswith('chr')
                        if withChr and not has_prefix:
                            contig_id = 'chr' + chr_id
                        elif has_prefix and not withChr:
                            # Strip only the leading "chr", not every
                            # occurrence inside the name.
                            contig_id = chr_id[3:]
                        else:
                            contig_id = chr_id
                        print("##contig=<ID=%s,length=%d,assembly=%s>" %
                              (contig_id, target_gsize[chr_id],
                               os.path.basename(refgenome)),
                              file=FILE_OUT)

                    print(
                        "##liftOverProgram=<CrossMap,version=%s,website=https://sourceforge.net/projects/crossmap>"
                        % __version__,
                        file=FILE_OUT)
                    print("##liftOverChainFile=<%s>" % liftoverfile,
                          file=FILE_OUT)
                    print("##originalFile=<%s>" % infile, file=FILE_OUT)
                    print("##targetRefGenome=<%s>" % refgenome, file=FILE_OUT)
                    print("##liftOverDate=<%s>" %
                          datetime.date.today().strftime("%B%d,%Y"),
                          file=FILE_OUT)
                    print(line, file=FILE_OUT)
                    print(line, file=UNMAP)
                    printlog(["Lifting over ... "])
                    continue

                # any other header line is dropped
                if line.startswith('#'):
                    continue

                # process non-variant region
                if 'END=' in line:
                    fields = line.split(maxsplit=8)
                    total_region += 1
                    chrom = fields[0]
                    start = int(fields[1]) - 1  # 0 based
                    m = re.search(r"END\=(\d+)", line)
                    if m is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    end = int(m[1])

                    a = map_coordinates(mapping, chrom, start, end, '+')
                    if a is None:
                        print(line + "\tFail(Unmap)", file=UNMAP)
                        failed_region += 1
                        continue
                    if len(a) != 2:
                        # Report split regions instead of dropping them.
                        print(line + "\tFail(Multiple_hits)", file=UNMAP)
                        failed_region += 1
                        continue

                    # target_chr is from chain file, could be 'chr1' or '1'
                    target_chr = str(a[1][0])
                    target_start = a[1][1]
                    target_end = a[1][2]
                    fields[0] = target_chr
                    fields[1] = target_start + 1  # back to 1-based
                    fields[7] = fields[7].replace(('END=' + str(end)),
                                                  ('END=' + str(target_end)))
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                    continue

                # process variant line
                fields = line.split(maxsplit=7)
                total_var += 1
                chrom = fields[0]
                start = int(fields[1]) - 1  # 0 based, ref_allele start
                end = start + len(fields[3])  # ref_allele end
                # first ALT allele, e.g. "A" from: 20 10000598 . T A,<NON_REF> 1754.77 . DP=54;
                alt_allele = fields[4].replace(' ', '').split(',')[0]

                a = map_coordinates(mapping, chrom, start, end, '+')
                if a is None:
                    print(line + "\tFail(Unmap)", file=UNMAP)
                    failed_var += 1
                    continue
                if len(a) != 2:
                    print(line + "\tFail(Multiple_hits)", file=UNMAP)
                    failed_var += 1
                    continue

                # target_chr is from chain file, could be 'chr1' or '1'
                target_chr = str(a[1][0])
                target_start = a[1][1]
                target_end = a[1][2]
                fields[0] = target_chr
                fields[1] = target_start + 1  # back to 1-based

                # update ref allele from the target assembly; a contig that
                # is missing from the FASTA fails the record, not the run
                try:
                    target_chr = update_chromID(refFasta.references[0],
                                                target_chr)
                    fields[3] = refFasta.fetch(target_chr, target_start,
                                               target_end).upper()
                except Exception:
                    print(line + "\tFail(No_targetRef)", file=UNMAP)
                    failed_var += 1
                    continue

                # Symbolic alleles such as <NON_REF> are not sequence, so
                # they are left untouched on minus-strand hits.
                if a[1][3] == '-' and not alt_allele.startswith('<'):
                    alt_allele = revcomp_DNA(alt_allele, True)
                    fields[4] = alt_allele + ',<NON_REF>'

                # Compare REF with the strand-adjusted ALT allele, i.e. the
                # allele that is actually written out.
                if fields[3] != alt_allele:
                    print('\t'.join(map(str, fields)), file=FILE_OUT)
                else:
                    print(line + "\tFail(REF==ALT)", file=UNMAP)
                    failed_var += 1
    finally:
        refFasta.close()

    printlog(["Total variants:", str(total_var)])
    printlog(["Variants failed to map:", str(failed_var)])
    printlog(["Total non-variant regions:", str(total_region)])
    printlog(["Non-variant regions failed to map:", str(failed_region)])
def crossmap_wig_file(mapping, in_file, out_prefix, taget_chrom_size, in_format, binSize=100000):
    '''
    Description
    -----------
    Convert genome coordinates (in wiggle/bigwig format) between assemblies.
    wiggle format: http://genome.ucsc.edu/goldenPath/help/wiggle.html
    bigwig format: http://genome.ucsc.edu/goldenPath/help/bigWig.html

    The conversion runs in three steps: every input interval that maps to
    exactly one target interval is written to "<out_prefix>.bgr"; that file
    is merged into "<out_prefix>.sorted.bgr" (the unsorted file is then
    removed); finally the merged bedGraph is written to "<out_prefix>.bw".

    Parameters
    ----------
    mapping : dict
        Dictionary with source chrom name as key, IntervalTree object as value.

    in_file : file
        Input file in wig or bigwig format. Both "variableStep" and
        "fixedStep" wiggle lines are supported.

    out_prefix : str
        Prefix of output files.

    taget_chrom_size : dict
        Chromosome size of the target genome assembly. Key is chromosome ID,
        value is the length of the chromosome. Note, the chromosome ID and
        length information were extracted from the chain file, therefore, the
        chrom_IDs can be with or without the leading "chr".

    in_format : str
        Either "wiggle" or "bigwig" (case-insensitive).

    binSize : int
        The chunk size when reading bigwig file in each iteration.
        Accepted for interface compatibility; not used by this function.

    Raises
    ------
    Exception
        If *in_format* is neither "wiggle" nor "bigwig". The format is
        checked before any output file is created.
    '''
    bgr_path = out_prefix + '.bgr'  # original bgr file
    sorted_bgr_path = out_prefix + '.sorted.bgr'  # sorted bgr file
    bw_path = out_prefix + '.bw'  # bigwig file

    # Select the reader first, so an unknown format neither leaves empty
    # output files behind nor leaks open handles.
    fmt = in_format.upper()
    if fmt == "WIGGLE":
        logging.info("Liftover wiggle file \"%s\" to bedGraph file \"%s\"" %
                     (in_file, bgr_path))
        # wiggleReader yields a strand column that is not needed here
        records = ((chrom, start, end, score)
                   for chrom, start, end, _strand, score in wiggleReader(in_file))
    elif fmt == "BIGWIG":
        logging.info("Liftover bigwig file %s to bedGraph file %s:" %
                     (in_file, bgr_path))
        records = bigwigReader(in_file)
    else:
        raise Exception("Unknown foramt. Must be 'wiggle' or 'bigwig'")

    # Step 1: lift every interval over, keeping unique hits only.
    # chrom_style remembers the ID style of the input (last chromosome
    # seen) and is used below to style the bigwig header.
    chrom_style = 'chr1'
    with open(bgr_path, 'w') as bgr_out:
        for chrom, start, end, score in records:
            chrom_style = chrom
            maps = map_coordinates(mapping, chrom, start, end, '+')
            if maps is None or len(maps) != 2:
                continue
            target = maps[1]
            print('\t'.join(
                str(i) for i in (target[0], target[1], target[2], score)),
                  file=bgr_out)

    # Step 2: merge overlapping entries into the sorted bedGraph.
    logging.info("Merging overlapped entries in bedGraph file")
    with open(sorted_bgr_path, 'w') as sorted_out:
        for (chrom, start, end, score) in bgrMerge.merge(bgr_path):
            print('\t'.join([str(i) for i in (chrom, start, end, score)]),
                  file=sorted_out)
    os.remove(bgr_path)  # remove .bgr, keep .sorted.bgr

    # Step 3: write the bigwig file.
    target_chroms_sorted = [(update_chromID(chrom_style, k),
                             taget_chrom_size[k])
                            for k in sorted(taget_chrom_size.keys())]

    bw_out = pyBigWig.open(bw_path, "w")
    try:
        logging.info("Writing header to \"%s\" ..." % bw_path)
        bw_out.addHeader(target_chroms_sorted)

        logging.info("Writing entries to \"%s\" ..." % bw_path)
        for line in ireader.reader(sorted_bgr_path):
            r_chr, r_st, r_end, r_value = line.split()
            bw_out.addEntries([r_chr], [int(r_st)],
                              ends=[int(r_end)],
                              values=[float(r_value)])
    finally:
        bw_out.close()