def setup(self):
    """Download CIViC variants and index them by lifted-over coordinates.

    Fetches up to 5000 variant records from the CIViC API, lifts each
    record's chromosome/start through the chain file registered under
    ``constants.liftover_chain_paths['hg19']`` and stores the result in
    ``self.civicdata`` as a dict keyed by ``"chrom:pos:ref:alt"``.

    Records are skipped when the chromosome, start, reference bases or
    variant bases are missing, or when the position cannot be lifted.
    """
    # A timeout keeps a stalled connection from hanging the annotator forever.
    r = requests.get('https://civicdb.org/api/variants?count=5000&page=1',
                     timeout=30)
    variants = json.loads(r.text)['records']
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    vdict = {}
    for variant in variants:
        coords = variant['coordinates']
        chrom_37 = coords['chromosome']
        pos_37 = coords['start']
        if chrom_37 is None or pos_37 is None:
            continue
        # pyliftover works on 0-based positions.
        # NOTE(review): assumes the CIViC start is 1-based -- confirm.
        new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                               int(pos_37) - 1)
        # None means the chromosome is absent from the chain file, an empty
        # list means the position has no mapping; both are skipped.
        if not new_coords:
            continue
        chrom_38 = new_coords[0][0].replace('chr', '')
        pos_38 = new_coords[0][1] + 1  # back to 1-based
        toks = [chrom_38, pos_38,
                coords['reference_bases'], coords['variant_bases']]
        if None in toks:
            continue
        vdict[':'.join(map(str, toks))] = variant
    self.civicdata = vdict
def setup_conv(in_build):
    """Set the module-level build settings and load the matching chain file.

    Builds 19 and 37 share the b37 settings and load the GRCh37->GRCh38
    chain into ``lo_37to38``; they differ only in contig naming
    ('chrY'/'chrM' for 19, 'Y'/'MT' for 37). Any other value selects the
    b38 settings and loads the GRCh38->GRCh37 chain into ``lo_38to37``.
    """
    global b3x, str_db_file, contig, contigmt, pos_triplet_fn
    global lo_37to38, lo_38to37

    print("Loading LiftOver conversion chain file for build %d..." % in_build)

    if in_build == 19 or in_build == 37:
        b3x = 'b37'
        str_db_file = 'str_hg19.gff3'
        if in_build == 19:
            contig, contigmt = 'chrY', 'chrM'
        else:
            contig, contigmt = 'Y', 'MT'
        pos_triplet_fn = pos_triplet_37
        lo_37to38 = LiftOver('crossmap/GRCh37_to_GRCh38.chain.gz')
        return

    b3x = 'b38'
    str_db_file = 'str_hg38.gff3'
    contig, contigmt = 'chrY', 'chrM'
    pos_triplet_fn = pos_triplet_38
    lo_38to37 = LiftOver('crossmap/GRCh38_to_GRCh37.chain.gz')
def __init__(self, args):
    """Keep the parsed arguments and create the hg19 -> hg38 converter.

    Also starts three empty lists: ``lengths_orig``, ``lengths_filtered``
    and ``oldVsNew``.
    """
    self.args = args
    self.doLiftOver = LiftOver('hg19', 'hg38')
    self.lengths_orig, self.lengths_filtered, self.oldVsNew = [], [], []
def lift_pos(posvec, chrvec, chainFile):
    """Lift 1-based SNP positions through a chain file.

    :param posvec: numpy array of 1-based positions
    :param chrvec: sequence of integer chromosome numbers, parallel to posvec
    :param chainFile: path of the chain file handed to ``LiftOver``
    :return: tuple ``(pos_lifted, pos_indi, chr_lifted)`` where
        ``pos_lifted`` holds 1-based lifted positions, ``pos_indi`` is one
        of ``miss``/``multi``/``unchanged``/``lifted`` per SNP, and
        ``chr_lifted`` holds the integer target chromosome.

    A SNP is reported as ``miss`` (position and chromosome left unchanged)
    when it has no mapping or when its first hit lands on a contig whose
    name is not a plain chromosome number; such a name cannot be stored in
    the int32 output and previously aborted the whole run with ValueError.
    """
    logging.info("Lifting genomic positions...")
    nsnps = len(posvec)
    posvec = posvec - 1  # pyliftover works on 0-based positions
    pos_lifted = np.empty((nsnps, ), dtype='int32')
    chr_lifted = np.empty((nsnps, ), dtype='int32')
    pos_indi = np.empty((nsnps, ), dtype='|S10')
    lift = LiftOver(chainFile)
    for i in range(nsnps):
        if (i + 1) % 200000 == 0:
            logging.info("{} SNPs done".format(i + 1))
        pos = posvec[i]
        hits = lift.convert_coordinate('chr%d' % (chrvec[i], ), pos)

        target_chrom = None
        if hits:
            try:
                target_chrom = int(hits[0][0].replace('chr', ''))
            except ValueError:
                # e.g. an unplaced/random contig: not representable here
                target_chrom = None

        if target_chrom is None:
            pos_lifted[i] = pos
            pos_indi[i] = 'miss'
            chr_lifted[i] = chrvec[i]
            continue

        pos_lifted[i] = hits[0][1]
        chr_lifted[i] = target_chrom
        if len(hits) > 1:
            pos_indi[i] = 'multi'  # only the first hit is kept
        elif pos == hits[0][1]:
            pos_indi[i] = 'unchanged'
        else:
            pos_indi[i] = 'lifted'
    # shift back to 1-based positions
    return pos_lifted + 1, pos_indi, chr_lifted
def __init__(self, regionsFileName, hg):
    """Load the regions JSON and prepare a converter to hg38 if needed.

    :param regionsFileName: path of a JSON file; its parsed content is
        stored in ``self.regionsDict``
    :param hg: assembly name of the input; when it is not ``'hg38'`` a
        ``LiftOver(hg, 'hg38')`` converter is stored in ``self.lo``,
        otherwise ``self.lo`` is None
    """
    # The with-block closes the file; no explicit close() is needed.
    with open(regionsFileName, 'r') as f:
        self.regionsDict = json.load(f)
    self.lo = None
    if hg != 'hg38':
        self.lo = LiftOver(hg, 'hg38')
def liftover_to_19(loc, build):
    """Lift a ``"chrom:pos"`` location using the chain registered for *build*.

    :param loc: location string; the first two ``:``-separated fields are
        used as chromosome and position
    :param build: key into ``chains`` selecting the chain file under
        ``chainpath``
    :return: the position of the first hit as int, or ``NaN`` when the
        location cannot be lifted
    """
    parts = loc.split(':')
    chrom, pos = parts[0], parts[1]
    lo = LiftOver(os.path.join(chainpath, chains.get(build)))
    # convert_coordinate needs an integer position; the string taken from
    # the split cannot be compared against the chain intervals.
    con_pos = lo.convert_coordinate(chrom, int(pos))
    if con_pos:
        return int(con_pos[0][1])
    return NaN
def setup(self):
    """Page through the CIViC variants API and index records by lifted key.

    Follows the ``_meta.links.next`` URL of each page until it is None.
    Every record's chromosome/start is lifted through the chain file
    registered under ``constants.liftover_chain_paths['hg19']`` and the
    record is stored in ``self.civicdata`` under ``"chrom:pos:ref:alt"``.

    A failed request (connection error, timeout or HTTP error status) is
    logged and ends the load early, leaving whatever was gathered so far.
    """
    self.civicdata = {}
    lifter = LiftOver(constants.liftover_chain_paths['hg19'])
    page_url = 'https://civicdb.org/api/variants?count=500&page=1'
    while page_url is not None:
        try:
            r = requests.get(page_url, timeout=5)
            r.raise_for_status()
        except requests.exceptions.RequestException:
            # RequestException also covers read timeouts and HTTP errors,
            # which ConnectionError alone does not.
            msg = 'ERROR: Incomplete CIVIC data load'
            print(msg)
            self.logger.error(msg)
            break
        d = json.loads(r.text)
        records = d['records']
        page_url = d['_meta']['links']['next']
        for variant in records:
            coords = variant['coordinates']
            chrom_37 = coords['chromosome']
            pos_37 = coords['start']
            if chrom_37 is None or pos_37 is None:
                continue
            # pyliftover works on 0-based positions.
            # NOTE(review): assumes the CIViC start is 1-based -- confirm.
            new_coords = lifter.convert_coordinate("chr" + chrom_37,
                                                   int(pos_37) - 1)
            # None: chromosome not in the chain file; []: no mapping.
            if not new_coords:
                continue
            chrom_38 = new_coords[0][0].replace('chr', '')
            pos_38 = new_coords[0][1] + 1  # back to 1-based
            toks = [chrom_38, pos_38,
                    coords['reference_bases'], coords['variant_bases']]
            if None in toks:
                continue
            self.civicdata[':'.join(map(str, toks))] = variant
def __init__(self, from_db, to_db):
    """Initialise the underlying pyliftover ``LiftOver`` for one assembly pair.

    from_db -- 'hg19','hg38','mm9','mm10'
    to_db -- 'hg19','hg38','mm9','mm10'
    """
    from pyliftover import LiftOver
    LiftOver.__init__(self, from_db, to_db)
def ancestral_fasta(args):
    """Subroutine for the ancestor subcommand.

    Copies ``args.reference`` to ``args.output`` and edits the copy in
    place: regions outside the optional BED mask become ``N``, variants
    that are not biallelic SNPs become ``N``, and each biallelic SNP is
    polarised with the outgroup allele found through the chain file.

    Reads ``args.reference``, ``args.output``, ``args.outgroup``,
    ``args.chain``, ``args.vcf`` and ``args.bed`` (``'-'`` means stdin).

    Raises ValueError when a variant's REF allele disagrees with the
    reference sequence, or when a BED mask is given but has no intervals.
    """
    # single chromosome fasta file for reference genome
    ref = pyfaidx.Fasta(args.reference, read_ahead=10000)
    # make a copy to build our ancestor for this chromosome
    copyfile(args.reference, args.output)
    anc = pyfaidx.Fasta(args.output, read_ahead=10000, mutable=True)
    # reference genome for outgroup species (all chromosomes)
    out = pyfaidx.Fasta(args.outgroup, read_ahead=10000)
    # outgroup to reference alignment chain file
    lo = LiftOver(args.chain)
    # snps database for the same chromosome
    vcf = cyvcf2.VCF(args.vcf)

    # change regions outside of callability mask to all N bases
    if args.bed:
        bed = sys.stdin if args.bed == '-' else open(args.bed, 'r')
        chrom = None
        last_end = 0
        try:
            for line in bed:
                chrom, start, end = line.rstrip().split('\t')[:3]
                start = int(start)
                anc[chrom][last_end:start] = 'N' * (start - last_end)
                last_end = int(end)
        finally:
            # close the file we opened, but never stdin
            if bed is not sys.stdin:
                bed.close()
        if chrom is None:
            # without any interval the chromosome to mask is unknown
            raise ValueError('callability mask contains no intervals')
        anc[chrom][last_end:len(anc[chrom])] = 'N' * (len(anc[chrom])
                                                      - last_end)

    for variant in vcf:
        # change variants that are not biallelic SNPs to N bases
        if not (variant.is_snp and len(variant.ALT) == 1):
            anc[variant.CHROM][variant.start:variant.end] = 'N' * (
                variant.end - variant.start)
            continue

        out_coords = lo.convert_coordinate(variant.CHROM, variant.start)
        # change ambiguously aligning sites to N bases
        if out_coords is None or len(out_coords) != 1:
            anc[variant.CHROM][variant.start] = 'N'
            continue

        if variant.REF != ref[variant.CHROM][variant.start].seq.upper():
            raise ValueError(f'variant reference allele {variant.REF} '
                             f'mismatches reference sequence '
                             f'{ref[variant.CHROM][variant.start]}')
        out_chromosome, out_position, out_strand = out_coords[0][:3]
        out_allele = out[out_chromosome][out_position].seq
        # if negative strand, take reverse complement base
        if out_strand == '-':
            out_allele = reverse_complement(out_allele)
        # and finally, polarize
        if out_allele.upper() == variant.ALT[0]:
            anc[variant.CHROM][variant.start] = out_allele
        elif out_allele.upper() != variant.REF:
            # triallelic
            anc[variant.CHROM][variant.start] = 'N'
def main():
    """Lift intron coordinates in a TSV through a chain file.

    Command line: ``<introns.tsv> <chain.gz>``. The header line is echoed
    unchanged. For every other row the first column (``chr:lend-rend``,
    1-based) is lifted and the row is printed to stdout with the new
    coordinates. Rows that cannot be lifted, or whose ends land on a
    different chromosome, are reported on stderr and dropped.
    """
    usage = "\n\n\tusage: {} cancer_introns.b38.annot_ready.tsv hg38ToHg19.over.chain.gz > cancer_introns.b37.annot_ready.tsv\n\n".format(
        sys.argv[0])
    if len(sys.argv) < 3:
        print(usage, file=sys.stderr)
        sys.exit(1)

    cancer_introns_file = sys.argv[1]
    hg_chain_file = sys.argv[2]

    # Use the chain file given on the command line rather than a fixed path.
    lo = LiftOver(hg_chain_file)

    with open(cancer_introns_file, 'rt') as fh:
        header = next(fh).rstrip()
        print(header)
        for line in fh:
            line = line.rstrip()
            vals = line.split("\t")
            chrom, coordset = vals[0].split(":")
            lend, rend = (int(x) for x in coordset.split("-"))

            # pyliftover is 0-based: shift down on input, up on output
            new_lend = lo.convert_coordinate(chrom, lend - 1)
            new_rend = lo.convert_coordinate(chrom, rend - 1)
            if not (new_lend and new_rend):
                sys.stderr.write(
                    "-failed conversion of {} --> no mapping\n".format(line))
                continue

            new_lend_chr = new_lend[0][0]
            new_lend_coord = new_lend[0][1] + 1
            new_rend_chr = new_rend[0][0]
            new_rend_coord = new_rend[0][1] + 1

            if new_lend_chr != new_rend_chr or new_lend_chr != chrom:
                sys.stderr.write("-failed conversion of {}".format(line) +
                                 " --> {} {}, {} {}\n".format(
                                     new_lend_chr, new_lend_coord,
                                     new_rend_chr, new_rend_coord))
                continue

            # a strand flip can swap the ends; keep lend <= rend
            if new_lend_coord > new_rend_coord:
                new_lend_coord, new_rend_coord = (new_rend_coord,
                                                  new_lend_coord)

            vals[0] = "{}:{}-{}".format(chrom, new_lend_coord,
                                        new_rend_coord)
            print("\t".join(vals))

    sys.exit(0)
def main(args):
    """Annotate every variant of a VCF with lifted hg19 coordinates.

    Reads ``args['inputfile']``, lifts CHROM/POS (and INFO ``END`` when
    present) through ``args['chainfile']`` and adds ``hg19_chr``,
    ``hg19_pos`` and ``hg19_end`` INFO tags where a mapping exists. All
    variants are written to ``args['outputfile']``, which is then
    compressed with bgzip and indexed with tabix.

    Raises subprocess.CalledProcessError if bgzip or tabix fails.
    """
    # open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])

    # add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)

    # get chain file for liftover
    lo = LiftOver(args['chainfile'])

    # write header and then loop variants, adding liftover coordinates to
    # INFO fields when appropriate. write all variants.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():
            # generate hg19 LO coordinates based on CHROM and POS
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS - 1)
            # hits is None when CHROM is not in the chain file
            if hits:
                vnt_obj.add_tag_info(
                    'hg19_chr=' + hits[0][0].split('chr')[1])
                vnt_obj.add_tag_info('hg19_pos=' + str(hits[0][1] + 1))

            # also lift the END position used by SV and CNV records
            END = _parse_info_end(vnt_obj.INFO)
            if END is not None:
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, END - 1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it
                        vnt_obj.get_tag_value("hg19_chr")
                    except Exception:
                        # lookup failed, so hg19_chr is not defined yet
                        vnt_obj.add_tag_info(
                            'hg19_chr=' + hits_end[0][0].split('chr')[1])
                    vnt_obj.add_tag_info(
                        'hg19_end=' + str(hits_end[0][1] + 1))

            vcf.write_variant(fo, vnt_obj)

    # check=True surfaces a failed compression or indexing step
    subprocess.run(["bgzip", args['outputfile']], check=True)
    subprocess.run(["tabix", args['outputfile'] + ".gz"], check=True)


def _parse_info_end(info):
    """Return the integer value of the ``END`` key of a VCF INFO string.

    Matches the key exactly, so ``CIEND=...`` is not mistaken for ``END``.
    Returns None when the key is absent or its value is not an integer.
    """
    for field in info.split(';'):
        if field.startswith('END='):
            try:
                return int(field[len('END='):])
            except ValueError:
                return None
    return None
def try_find_build(rs, pos):
    """Print, per SNP, its position in several builds next to the source one.

    ``fetch_snps(rs)`` is expected to yield ``(rsId, build, chromosome,
    position)`` tuples, e.g. ``('rs3737728', 'GRCh38.p2', '1',
    '1086035')``; the position is treated as a 1-based hg38 coordinate.
    It is lifted to hg19 and from there to hg18 and hg17. Every printed
    position is 0-based, and a ``*`` marks the builds whose position
    equals the (0-based) source position.

    :param rs: identifiers handed to ``fetch_snps``
    :param pos: 1-based source positions, parallel to the fetched SNPs

    SNPs that cannot be lifted or parsed are skipped with a debug log entry.
    """
    snps_info = fetch_snps(rs)

    logging.info("Loading liftover chain files...")
    lift38_19 = LiftOver('pyliftover/hg38ToHg19.over.chain.gz')
    lift19_18 = LiftOver('pyliftover/hg19ToHg18.over.chain.gz')
    lift19_17 = LiftOver('pyliftover/hg19ToHg17.over.chain.gz')
    logging.info("Done")

    for (rsId, build, true_chr, pos_hg38), source_pos in zip(snps_info, pos):
        chrom = 'chr{}'.format(true_chr)
        source_pos -= 1
        try:
            # Work on an int: comparing the raw string with source_pos
            # could never match.
            pos_hg38 = int(pos_hg38) - 1
            pos_hg19 = lift38_19.convert_coordinate(chrom, pos_hg38)[0][1]
            pos_hg18 = lift19_18.convert_coordinate(chrom, pos_hg19)[0][1]
            pos_hg17 = lift19_17.convert_coordinate(chrom, pos_hg19)[0][1]
        except (IndexError, TypeError, ValueError) as err:
            # [] -> IndexError, None -> TypeError, bad position -> ValueError
            logging.debug("Skipping %s: %s", rsId, err)
            continue

        print(
            "build={} {} chr{} source={} hg38={}{} hg19={}{} hg18={}{} hg17={}{}"
            .format(build, rsId, true_chr, source_pos,
                    pos_hg38, '*' if pos_hg38 == source_pos else '',
                    pos_hg19, '*' if pos_hg19 == source_pos else '',
                    pos_hg18, '*' if pos_hg18 == source_pos else '',
                    pos_hg17, '*' if pos_hg17 == source_pos else ''))
def liftover(self):
    """Replace this object's chromosome/position with their lifted values.

    Lifts from hg38 to ``self.build`` and keeps the first hit only.
    """
    # todo
    # The failure mode of this tool is unclear; a try/except will probably
    # be needed eventually.
    # Changing the chromosome and position also invalidates the key. That
    # could be fixed, but the ref and alt alleles are not on hand and
    # parsing them out of chromosomeHgvsName is undesirable.
    from pyliftover import LiftOver
    first_hit = LiftOver('hg38', self.build).convert_coordinate(
        self.chromosome, self.position)[0]
    new_chromosome, new_position = first_hit[0], first_hit[1]
    self.chromosome = new_chromosome
    self.position = new_position
def from_hg18_to_hg19(chr, coord):
    """Convert a coordinate from hg18 to hg19.

    LiftOver coordinates are 0-based, so the lifted value gets +1 to
    express it as a 1-based coordinate. The shift is applied to the
    result, not to the input: shifting the input only agrees with this on
    forward-strand alignments and away from chain-block edges.

    :param chr: chromosome name, e.g. 'chr6'
    :param coord: integer coordinate, e.g. 10000
    :return: the first lifted hg19 coordinate, 1-based
    :raises TypeError, IndexError: if the coordinate cannot be lifted
    """
    lo = LiftOver('hg18', 'hg19')
    conv = lo.convert_coordinate(chr, int(coord))
    return conv[0][1] + 1
def _parse_cmd_args(self, args):
    """Parse the converter's command-line arguments into attributes.

    Sets ``input_paths``, ``input_dir``, ``output_dir`` (created if
    missing), ``output_base_fname``, ``input_assembly``, ``do_liftover``,
    ``lifter`` and ``status_fpath``; ``input_format`` is set only when
    ``-f`` is given.
    """
    assemblies = ['hg38'] + list(constants.liftover_chain_paths.keys())

    parser = argparse.ArgumentParser()
    parser.add_argument('path',
                        help='Path to this converter\'s python module')
    parser.add_argument('inputs', nargs='+',
                        help='Files to be converted to .crv')
    parser.add_argument('-f', dest='format',
                        help='Specify an input format')
    parser.add_argument('-n', '--name', dest='name',
                        help='Name of job. Default is input file name.')
    parser.add_argument('-d', '--output-dir', dest='output_dir',
                        help='Output directory. '
                             'Default is input file directory.')
    parser.add_argument('-l', '--liftover', dest='liftover',
                        choices=assemblies, default='hg38',
                        help='Input gene assembly. Will be lifted over to hg38')
    opts = parser.parse_args(args)

    self.input_paths = [os.path.abspath(path) for path in opts.inputs]
    first_input = self.input_paths[0]
    if opts.format:
        self.input_format = opts.format

    # output goes next to the first input unless a directory was given
    self.input_dir = os.path.dirname(first_input)
    self.output_dir = opts.output_dir or self.input_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)

    self.output_base_fname = opts.name or os.path.basename(first_input)

    self.input_assembly = opts.liftover
    self.do_liftover = self.input_assembly != 'hg38'
    self.lifter = (
        LiftOver(constants.liftover_chain_paths[self.input_assembly])
        if self.do_liftover else None)

    self.status_fpath = os.path.join(
        self.output_dir, self.output_base_fname + '.status.json')
def pyliftover(hg38_chrom, hg38_coord):
    """Return the lifted location of an hg38 coordinate, with caching.

    Results are memoised in the module-level ``pyliftover_dict`` under
    ``"chrom:coord"``. The chain file under ``config.input_dir`` is parsed
    once and kept on this function object instead of being re-read on
    every cache miss.

    :param hg38_chrom: chromosome name as used in the chain file
    :param hg38_coord: position; cast to int for the lookup
    :return: dict with ``'chrom'`` and ``'coord'`` (coord as str) of the
        first hit
    :raises KeyError: if the coordinate has no mapping
    """
    hg38_key = '%s:%s' % (hg38_chrom, hg38_coord)
    if hg38_key not in pyliftover_dict:
        lo = getattr(pyliftover, '_chain', None)
        if lo is None:
            lo = LiftOver(config.input_dir + 'hg38ToHg19.over.chain.gz')
            pyliftover._chain = lo
        result = lo.convert_coordinate(hg38_chrom, int(hg38_coord))
        # None: unknown chromosome; []: position not mapped
        if not result:
            raise KeyError('no liftover result for %s' % hg38_key)
        coords_list = result[0]
        pyliftover_dict[hg38_key] = {
            'chrom': coords_list[0],
            'coord': str(coords_list[1])
        }
    return pyliftover_dict[hg38_key]
def create_lo(input_version, output_version):
    """Bundle a ``LiftOver`` converter with the two versions it was built for.

    Returns a dict with keys ``input_version``, ``output_version`` and
    ``lo``.
    """
    return dict(
        input_version=input_version,
        output_version=output_version,
        lo=LiftOver(input_version, output_version),
    )
def hgVersionJudge(self, nowVersion):
    """Return a converter from ``hg<nowVersion>`` to hg19, or 0 for hg19.

    ``nowVersion`` is compared after conversion to int, so ``19`` and
    ``'19'`` both yield 0.
    """
    if int(nowVersion) == 19:
        return 0
    source_build = 'hg' + str(nowVersion)
    return LiftOver(source_build, 'hg19')
def get_schic_contacts(filename):
    """Load cis chrX contacts, lift them mm10 -> mm9 and crop to the region.

    Rows of *filename* whose columns 0 and 2 are both ``'chrX'`` are kept;
    columns 1 and 3 are taken as the two contact positions. Both positions
    are lifted, contacts with an unliftable end are dropped, and the
    lifted contacts inside ``[coords_min, coords_max]`` are returned.

    :return: integer array of shape (k, 2) with the lifted contacts
    :raises ValueError: if a position lifts to more than one location
    """
    all_contacts = np.loadtxt(filename, dtype=str)
    # filter for cis chrX contacts
    is_cis_x = (all_contacts[:, 0] == 'chrX') & (all_contacts[:, 2] == 'chrX')
    contacts = all_contacts[is_cis_x][:, [1, 3]].astype(int)

    # lift over all contacts from mm10 to mm9
    lo = LiftOver('mm10', 'mm9')

    def do_lift(loc):
        lifted_loc = lo.convert_coordinate('chrX', int(loc))
        if not lifted_loc:
            print("Locus {} not in mm9 assembly".format(loc))
            return None
        if len(lifted_loc) > 1:
            raise ValueError("Non-unique liftover result")
        return lifted_loc[0][1]

    lifted_pairs = []
    for left, right in contacts:
        new_left, new_right = do_lift(left), do_lift(right)
        if new_left is not None and new_right is not None:
            lifted_pairs.append((new_left, new_right))
    # reshape keeps the array two-dimensional even when nothing was lifted
    lifted_contacts = np.array(lifted_pairs, dtype=int).reshape(-1, 2)

    # Keep only contacts in the genomic region of interest; the filter
    # applies to the lifted coordinates, which is what gets returned.
    in_region = ((lifted_contacts[:, 0] >= coords_min) &
                 (lifted_contacts[:, 1] <= coords_max))
    return lifted_contacts[in_region]
class Converter:
    """Convert 1-based hg19 positions to hg38 with pyliftover."""

    def __init__(self):
        self.lo = LiftOver('hg19', 'hg38')

    def hg38(self, ch, pos):
        """Lift one 1-based position to hg38.

        :param ch: chromosome; a bare number, 'X' or 'Y' (any case) gets
            the 'chr' prefix added, a name that already starts with 'chr'
            is used unchanged
        :param pos: 1-based position
        :return: None when the lookup fails or finds no mapping; the
            1-based lifted position for a unique hit; or a tuple
            ``(first_position, all_hits)`` when several hits exist
        """
        ch = str(ch)
        # Names that already carry the prefix must not be upper-cased:
        # chain files use lower-case 'chr'.
        if not ch.startswith('chr'):
            ch = ch.upper()
            if ch.isdigit() or ch == 'X' or ch == 'Y':
                ch = "chr{}".format(ch)
        try:
            # pyliftover is 0-based
            coord = self.lo.convert_coordinate(ch, pos - 1)
        except Exception:
            print("WARNING: HG38 conversion at {}:{}".format(ch, pos))
            coord = None
        # covers both None (unknown chromosome) and [] (no mapping)
        if not coord:
            return None
        r = coord[0][1] + 1
        if len(coord) == 1:
            return r
        return r, coord

    def close(self):
        """No-op; nothing is held open."""
        return
def __init__(self, chainfile):
    """
    This object will perform unique single positional liftovers - it will only lift over chromosome positions that
    map uniquely to the new genome and only if the strand hasn't changed.
    Note: You should run a VCF normalization sweep on all lifted-over CPRAs to check for variants that need to be
    re-normalized, and to remove variants where the REF no longer matches after a liftover.
    The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
    prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
    needs to check for this.
    It's organised as an object rather than a collection of functions so that the LiftOver chainfile
    only gets opened/passed once and not for every position to be lifted over.
    :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
    :return:
    """
    # The chain file is loaded once here and stored on the instance.
    self.liftover = LiftOver(chainfile)
async def live_annotate(input_data, annotators):
    """Map one variant and run the loaded annotator modules on it.

    :param input_data: dict describing the variant; its optional
        'assembly' key (default 'hg38') selects a liftover chain, in which
        case 'chrom', 'pos', 'ref' and 'alt' are overwritten with the
        values returned by ``liftover``
    :param annotators: collection of module names to run, or None to run
        every module in ``modules_to_run_ordered``
    :return: dict with one entry per module that was run (None when the
        module returned nothing or raised) plus the mapped data under
        'crx'
    """
    from cravat.constants import mapping_parser_name
    from cravat.constants import all_mappings_col_name
    from cravat.inout import AllMappingsParser
    global live_modules
    global live_mapper
    global module_confs
    global modules_to_run_ordered
    response = {}
    assembly = input_data.get('assembly', 'hg38')
    # Only assemblies with a registered chain are lifted; anything else is
    # passed to the mapper unchanged.
    if assembly in cravat.constants.liftover_chain_paths:
        lifter = LiftOver(cravat.constants.liftover_chain_paths[assembly])
        chrom, pos, ref, alt = liftover(input_data, lifter)
        input_data['chrom'] = chrom
        input_data['pos'] = pos
        input_data['ref'] = ref
        input_data['alt'] = alt
    crx_data = live_mapper.map(input_data)
    crx_data = live_mapper.live_report_substitute(crx_data)
    # Parser object is attached for the annotators and removed again below.
    crx_data[mapping_parser_name] = AllMappingsParser(
        crx_data[all_mappings_col_name])
    for module_name in modules_to_run_ordered:
        module = live_modules[module_name]
        if annotators is not None and module_name not in annotators:
            continue
        try:
            conf = module_confs[module_name]
            # Output columns flagged as 'table' are decoded from JSON text
            # further down.
            json_colnames = []
            for col in conf['output_columns']:
                if 'table' in col and col['table'] == True:
                    json_colnames.append(col['name'])
            if 'secondary_inputs' in conf:
                # Secondary inputs are read from results already stored in
                # `response`.
                sec_mods = conf['secondary_inputs']
                secondary_data = {}
                for sec_mod in sec_mods:
                    secondary_data[sec_mod] = [response[sec_mod]]
                annot_data = module.annotate(input_data=crx_data,
                                             secondary_data=secondary_data)
            else:
                annot_data = module.annotate(input_data=crx_data)
            annot_data = module.live_report_substitute(annot_data)
            # Empty results are normalised to None.
            if annot_data == '' or annot_data == {}:
                annot_data = None
            elif type(annot_data) is dict:
                annot_data = clean_annot_dict(annot_data)
            if annot_data is not None:
                for colname in json_colnames:
                    json_data = annot_data.get(colname, None)
                    if json_data is not None and type(json_data) == str:
                        json_data = json.loads(json_data)
                    annot_data[colname] = json_data
            response[module_name] = annot_data
        except Exception as e:
            # One failing module must not abort the others: print the
            # traceback and record None for it.
            import traceback
            traceback.print_exc()
            response[module_name] = None
    del crx_data[mapping_parser_name]
    set_crx_canonical(crx_data)
    response['crx'] = crx_data
    return response
class UniqueLiftover(object):
    """Lift single positions, accepting only unique, same-strand mappings."""

    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map uniquely to the new genome and only if the strand hasn't changed.
        Note: You should run a VCF normalization sweep on all lifted-over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF no longer matches after a liftover.
        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.
        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.
        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """
        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Given a chromosome and a position in 1-based co-ordinates, use pyliftover to lift the position over.
        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason whenever no liftover is returned
        :return: ((str) chromosome, (int) position) for a unique, forward-strand liftover, otherwise (None, None)
        """
        chromosome = str(chromosome)
        position = int(position)

        # Perform the liftover lookup, shift the position by 1 as pyliftover deals in 0-based co-ords
        new = self.liftover.convert_coordinate(chromosome, position - 1)

        if new is None:
            # pyliftover returns None when the chromosome doesn't exist in the chainfile
            failure = "Chromosome '{}' provided not in chain file".format(
                chromosome)
        elif not new:
            # An empty list means the position has no counterpart; without
            # this branch the message below was never assigned.
            failure = "{},{} has no liftover".format(chromosome, position)
        elif len(new) > 1:
            failure = "{},{} lifts over to multiple positions: {}".format(
                chromosome, position, new)
        elif new[0][2] != "+":
            failure = "{},{} has a flipped strand in liftover: {}".format(
                chromosome, position, new)
        else:
            # Shift the position forward by one to convert back to 1-based co-ords
            return str(new[0][0]), int(new[0][1]) + 1

        if verbose:
            logging.error(failure)
        return None, None
def liftover(pos, chro, from_assembly, to_assembly):
    """LiftOver a specific 1-based coordinate between assemblies.

    pyLiftover uses base 0, whereas the coordinate system used here is
    base 1: position 27107251 is 27107250 in pyLiftover. The position is
    therefore shifted down before the lookup and back up afterwards.

    :param pos: 1-based position (cast to int)
    :param chro: chromosome without the 'chr' prefix
    :param from_assembly: source assembly name
    :param to_assembly: target assembly name
    :return: *pos* unchanged when both assemblies are equal, otherwise the
        1-based position of the first hit
    :raises TypeError, IndexError: if the coordinate cannot be lifted
    """
    if from_assembly == to_assembly:
        return pos
    chro = 'chr' + str(chro)
    pos = int(pos)
    lo = LiftOver(from_assembly, to_assembly)
    out = lo.convert_coordinate(chro, pos - 1)
    return out[0][1] + 1
def liftover(self, chromosome, position, build='hg19'):
    """Lift a position from hg38 to *build* and return the first hit.

    Returns ``(new_chromosome, new_position)``; prints the mapping when
    ``self.debug`` is set.
    """
    # todo
    # The failure mode of this tool is unclear; a try/except will probably
    # be needed eventually.
    # Changing the chromosome and position also invalidates the key. That
    # could be fixed, but the ref and alt alleles are not on hand and
    # parsing them out of chromosomeHgvsName is undesirable.
    first_hit = LiftOver('hg38', build).convert_coordinate(
        chromosome, position)[0]
    new_chromosome, new_position = first_hit[0], first_hit[1]
    if self.debug:
        print("%s %s -> %s %s" % (chromosome, position,
                                  new_chromosome, new_position))
    return new_chromosome, new_position
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift a list of ``chrom:pos`` coordinates and print the results.

    :param coords: iterable of ``"chrom:pos"`` strings
    :param orig_assembly: source assembly name
    :param new_assembly: target assembly name
    :param chainfile: path of a local chain file; when given it is used
        instead of looking the chain up by assembly names
    :param outfh: handle passed on to ``print_results``

    Any failure is reported on stderr with the offending coordinate and
    then re-raised.
    """
    # Create a LiftOver object with desired mapping.
    if chainfile:
        lo = LiftOver(chainfile)
    else:
        lo = LiftOver(orig_assembly, new_assembly)

    results = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # convert_coordinate needs an int position, not a str
            returnval = lo.convert_coordinate(chrom, int(pos))[0]
            results.append((chrom, pos) + returnval)
        except Exception:
            # A coordinate with no counterpart yields None or an empty
            # list, so the indexing above fails; the caller decides what
            # to do with it.
            sys.stderr.write('Offending coord: %s\n' % coord)
            raise

    print_results(results, outfh)
class UniqueLiftover(object):
    """Lift single positions, accepting only unique, same-strand mappings."""

    def __init__(self, chainfile):
        """
        This object will perform unique single positional liftovers - it will only lift over chromosome positions that
        map uniquely to the new genome and only if the strand hasn't changed.
        Note: You should run a VCF normalization sweep on all lifted-over CPRAs to check for variants that need to be
        re-normalized, and to remove variants where the REF no longer matches after a liftover.
        The combination of these steps will ensure high quality liftovers. However, it should be noted that this won't
        prevent the situation where multiple positions in the old genome pile up uniquely in the new genome, so one
        needs to check for this.
        It's organised as an object rather than a collection of functions so that the LiftOver chainfile
        only gets opened/passed once and not for every position to be lifted over.
        :param chainfile: A string containing the path to the local UCSC .gzipped chainfile
        :return:
        """
        self.liftover = LiftOver(chainfile)

    def liftover_cpra(self, chromosome, position, verbose=False):
        """
        Given a chromosome and a position in 1-based co-ordinates, use pyliftover to lift the position over.
        :param chromosome: string with the chromosome as it's represented in the from_genome
        :param position: position on chromosome (will be cast to int)
        :param verbose: log the reason whenever no liftover is returned
        :return: ((str) chromosome, (int) position) for a unique, forward-strand liftover, otherwise (None, None)
        """
        chromosome = str(chromosome)
        position = int(position)

        # Perform the liftover lookup, shift the position by 1 as pyliftover deals in 0-based co-ords
        hits = self.liftover.convert_coordinate(chromosome, position - 1)

        reason = None
        if hits is None:
            # pyliftover returns None when the chromosome doesn't exist in the chainfile
            reason = "Chromosome '{}' provided not in chain file".format(chromosome)
        elif len(hits) == 0:
            # No counterpart in the new genome. This case used to leave the
            # log message unassigned, which crashed when verbose was set.
            reason = "{},{} has no liftover".format(chromosome, position)
        elif len(hits) != 1:
            reason = "{},{} lifts over to multiple positions: {}".format(chromosome, position, hits)
        elif hits[0][2] != "+":
            reason = "{},{} has a flipped strand in liftover: {}".format(chromosome, position, hits)

        if reason is None:
            # Shift the position forward by one to convert back to 1-based co-ords
            return str(hits[0][0]), int(hits[0][1]) + 1

        if verbose:
            logging.error(reason)
        return None, None
def main():
    """Fill in assemblies and top confidence for gold standards; write JSON.

    Loads the records matching ``args.input_pattern``, lifts each record's
    sentinel variant with the chain files given on the command line, adds
    ``highest_confidence`` to its ``gold_standard_info`` and dumps all
    records to ``args.output``.

    :return: 0
    """
    # Parse args
    args = parse_args()
    confidence_orders = ['High', 'Medium', 'Low']  # Used to sort "highest" confidence

    # Load gold-standards
    gold_standards = load_gold_standards(args.input_pattern)

    # Create liftOver instances from chain files
    if args.grch37_to_38:
        args.grch37_to_38 = LiftOver(args.grch37_to_38)
    if args.grch38_to_37:
        args.grch38_to_37 = LiftOver(args.grch38_to_37)

    # Iterate over and process records
    out_data = []
    for record in gold_standards:
        # Lift-over positions to all assemblies
        record['sentinel_variant'] = fill_in_assemblies(
            record['sentinel_variant'],
            args.grch37_to_38,
            args.grch38_to_37
        )

        # Extract highest confidence
        info = record['gold_standard_info']
        info['highest_confidence'] = sorted(
            [entry['confidence'] for entry in info['evidence']],
            key=confidence_orders.index
        )[0]

        out_data.append(record)

    # Write output. A bare file name has an empty directory part, and
    # os.makedirs('') raises, so only create a directory when there is one.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.output, 'w') as out_h:
        json.dump(out_data, out_h, ensure_ascii=False, indent=2)

    return 0
def get_liftover(frm=19, to=38):
    """Open the local ``hg<frm>ToHg<to>`` chain file as a ``LiftOver``.

    The file is looked up under ``processedDataStorage``. When it is
    missing, FileNotFoundError is raised with the UCSC download URL.

    Info: http://hgdownload.cse.ucsc.edu/downloads.html
    """
    from pyliftover import LiftOver

    liftoverfile = 'hg{}ToHg{}.over.chain.gz'.format(frm, to)
    chain_path = processedDataStorage + liftoverfile
    try:
        converter = LiftOver(chain_path)
    except FileNotFoundError:
        raise FileNotFoundError(
            'Source: http://hgdownload.cse.ucsc.edu/gbdb/hg{}/liftOver/{}'.
            format(frm, liftoverfile))
    else:
        return converter
def liftover_cho(df):
    """Replace the 'Genomic position' column with hg38 coordinates.

    Each row's 'Chromosome' and 1-based 'Genomic position' are lifted from
    hg18 to hg38. Rows without a mapping get the string ``'NA'``.

    :param df: DataFrame with 'Chromosome' and 'Genomic position' columns;
        modified in place
    :return: the same DataFrame
    """
    lo = LiftOver('hg18', 'hg38')

    def lift_coord(row):
        chrom = 'chr' + str(row['Chromosome'])
        pos = row['Genomic position'] - 1  # pyliftover is 0-based
        result = lo.convert_coordinate(chrom, pos)
        # result is None for a chromosome missing from the chain file and
        # an empty list for an unmapped position
        if not result:
            print(f"Didn't find hg38 coordinate for {row['Chromosome']}:{row['Genomic position']}")
            return 'NA'
        return result[0][1] + 1

    df['Genomic position'] = df.apply(lift_coord, axis=1)
    return df
class liftover:
    """Lift positions between two genome builds using a UCSC chain."""

    def __init__(self, build_from, build_to):
        """Resolve both build names and fetch the chain when they differ.

        A name is accepted as-is when it is one of the values of
        ``map_release``; otherwise it is looked up as a key.

        :raises Exception: if either build name is unknown
        """
        self.build_from = self._resolve_build(build_from, 'SOURCE')
        self.build_to = self._resolve_build(build_to, 'DESTINATION')

        # Download/Source the Chain from UCSC
        if self.build_from != self.build_to:
            self.GetChain()
        else:
            self.chain = None

    @staticmethod
    def _resolve_build(build, role):
        """Return the canonical name of *build* or raise for an unknown one."""
        if build in map_release.values():
            return build
        build_mapped = map_release.get(build)
        if build_mapped is None:
            # report the value that actually failed to resolve
            raise Exception(
                'Unknown {} genome build. The value was: {}'.format(
                    role, build))
        return build_mapped

    def GetChain(self):
        '''Downloads the chain from UCSC
        '''
        self.chain_name = 'UCSC: {} to {}'.format(self.build_from,
                                                  self.build_to)
        self.chain = LiftOver(self.build_from, self.build_to)

    def lift(self, chr, pos):
        """Lift one position.

        :param chr: chromosome without the 'chr' prefix
        :param pos: position (cast to int)
        :return: ``(chromosome, position, multiple)`` where *multiple*
            tells whether more than one hit existed (the first is used),
            or ``(None, None, None)`` when there is no mapping. When the
            source and destination builds are the same the input is
            returned unchanged.
        """
        if self.chain is None:
            # same build on both sides: nothing to convert
            return str(chr), int(pos), False

        # ToDo figure out whether this step should be adjusted for 0/1 indexing?
        lifted = self.chain.convert_coordinate('chr{}'.format(str(chr)),
                                               int(pos))
        if not lifted:
            return None, None, None
        # drop the 'chr' prefix from the target chromosome
        return lifted[0][0][3:], int(lifted[0][1]), len(lifted) > 1
class CravatAnnotator(BaseAnnotator):
    """Annotator that reports the hg19 location of each input variant."""

    def setup(self):
        """Load the hg38 -> hg19 chain file from this module's data folder."""
        chain_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'data', 'hg38ToHg19.over.chain')
        self.liftover = LiftOver(chain_path)

    def annotate(self, input_data, secondary_data=None):
        """Return the lifted 'chrom' and 1-based 'pos' of the variant.

        :param input_data: dict with 'chrom' and 1-based 'pos'
        :param secondary_data: unused
        :return: dict with 'chrom' and 'pos' of the first hit, or an empty
            dict when the position cannot be lifted
        """
        out = {}
        hg19_data = self.liftover.convert_coordinate(
            input_data['chrom'], int(input_data['pos']) - 1)
        # None (chromosome not in the chain file) and [] (no mapping) both
        # mean there is nothing to report
        if hg19_data:
            out['chrom'] = hg19_data[0][0]
            out['pos'] = hg19_data[0][1] + 1
        return out
def main(coords, orig_assembly, new_assembly, chainfile, outfh):
    """Lift ``chrom:pos`` coordinates between assemblies and print them.

    :param coords: iterable of ``"chrom:pos"`` strings
    :param orig_assembly: source assembly name
    :param new_assembly: target assembly name
    :param chainfile: optional path of a local chain file; takes
        precedence over the assembly names when given
    :param outfh: handle passed on to ``print_results``

    The first hit of every coordinate is collected as ``(chrom, pos) +
    hit``. On any failure the offending coordinate is written to stderr
    and the exception is re-raised.
    """
    # Create a LiftOver object with desired mapping.
    lo = (LiftOver(chainfile) if chainfile
          else LiftOver(orig_assembly, new_assembly))

    results = []
    for coord in coords:
        try:
            chrom, pos = coord.split(':')
            # pos must be an int for the lookup; the original string is
            # kept for the output row
            first_hit = lo.convert_coordinate(chrom, int(pos))[0]
            results.append((chrom, pos) + first_hit)
        except Exception:
            # A locus with no counterpart gives None or an empty list and
            # fails at the indexing above.
            sys.stderr.write('Offending coord: %s\n' % coord)
            raise

    print_results(results, outfh)
interval = intrxn[0].split(":")[1].split("-") elif RNAtoplot in intrxn[1] and partner in intrxn[0]: interval = intrxn[1].split(":")[1].split("-") if len(interval) == 2: for i in range(int(interval[0]), int(interval[1])): dist[i] += 1 print "RNA size:", len(dist) #Use the following part to liftover mouse coordinates to human liftfiles = {"mm28S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs28S.liftoverchain", \ "mm45S": "/Users/lu/Documents/chang/rrna/liftover/mmtohs45S.liftoverchain", \ "Malat1": "/Users/lu/Documents/chang/psoralen/examples/MALAT1/mmtohg_Malat1.liftoverchain"} if RNAtoplot in liftfiles: newdist = [0 for i in range(0, size)] lo = LiftOver(liftfiles[RNAtoplot]) for i in range(0, size): lifted = lo.convert_coordinate(RNAtoplot, i, '+') if lifted: newdist[lifted[0][1]] += dist[i] dist = newdist figure = plt.figure(figsize=(8,2)) axes = plt.Axes(figure, [.3,.3,.6,.6]) figure.add_axes(axes) plt.bar(range(0, size), dist, color='k') axes.spines['top'].set_visible(False) axes.spines['right'].set_visible(False) axes.yaxis.set_ticks_position('left') axes.xaxis.set_ticks_position('bottom')
__author__ = 'rajaram'

# Reference: https://pypi.python.org/pypi/pyliftover
# Liftover data: http://hgdownload.cse.ucsc.edu/gbdb/hg38/liftOver/
from pyliftover import LiftOver

# Uses a local chain file; LiftOver('hg38', 'hg19') is the by-name
# alternative.
lo = LiftOver('hg38ToHg19.over.chain.gz')

# Demo: lift 100 consecutive chr1 positions and print each result.
for x in range(0, 100):
    data = lo.convert_coordinate('chr1', 1000000 + x)
    print(data)
    # None or an empty list means the position has no mapping
    if not data:
        continue
    data2 = data.pop()
    print(data2[0])
def addTSSInfo(self, vcfInputFile):
    """Add a TSSOL INFO flag to every liftable variant and write output.vcf.

    Each record's start and end are lifted with the local
    ``hg38ToHg19.over.chain.gz`` chain and the FANTOM5 SPARQL endpoint is
    queried for TSS entries in the lifted range. Records that cannot be
    lifted are written without the TSSOL field. Totals are printed at the
    end.

    :param vcfInputFile: path of the VCF file to read
    """
    query = SPARQLQueries.sparqlQueries()
    totalVar = 0
    tssOLVar = 0
    lo = LiftOver('hg38ToHg19.over.chain.gz')

    # Both files are closed on exit so the output is fully flushed.
    with open(vcfInputFile, 'r') as vcf_in, \
            open('output.vcf', 'w') as vcf_out:
        vcf_reader = vcf.Reader(vcf_in)
        vcf_reader.infos['TSSOL'] = VcfInfo(
            'TSSOL', vcf_field_counts['A'], 'String',
            'Info indicates whether the variant overlapping with the'
            ' transcription start site(TSS)')
        vcf_writer = vcf.VCFWriter(vcf_out, vcf_reader)

        for record in vcf_reader:
            variantStart = record.start
            variantEnd = record.end
            variantChromosome = record.CHROM
            variantSubType = record.var_subtype
            isOverlapping = False

            # Adding chr prefix to the chromosome
            if "chr" not in variantChromosome:
                variantChromosome = "chr" + str(record.CHROM)

            # liftover of both ends to hg19
            startHits = lo.convert_coordinate(variantChromosome, variantStart)
            print(variantStart)
            print(variantEnd)
            endHits = None
            if startHits:
                endHits = lo.convert_coordinate(variantChromosome, variantEnd)

            # None and [] both mean the position has no mapping
            if startHits and endHits:
                startHit = startHits.pop()
                variantChromosomehg19 = startHit[0]
                variantStarthg19 = startHit[1]
                variantEndhg19 = endHits.pop()[1]

                # SPARQL query
                result = query.getTSS('http://ep.dbcls.jp/fantom5/sparql',
                                      variantStarthg19, variantEndhg19,
                                      variantChromosomehg19)
                for row in result:
                    values = sparql.unpack_row(row)
                    cageStart = values[1]
                    cageEnd = values[2]
                    if variantSubType == 'ins' and variantStart > cageStart:
                        isOverlapping = True
                        tssOLVar = tssOLVar + 1
                        break
                    elif variantSubType != 'ins' and cageStart > 0:
                        isOverlapping = True
                        tssOLVar = tssOLVar + 1
                        break

                totalVar = totalVar + 1
                record.add_info('TSSOL', [isOverlapping])
            else:
                # record.ID may be None, so convert before concatenating
                print("No liftover found for this pos = " + str(record.ID))

            vcf_writer.write_record(record)

    print("No of variants = " + str(totalVar))
    print("No of tss overlapping variants = " + str(tssOLVar))
def __init__(self, from_db, to_db):
    """Initialise this object through ``LiftOver.__init__``.

    :param from_db: forwarded as the ``from_db`` keyword argument.
    :param to_db: forwarded as the ``to_db`` keyword argument.
    """
    LiftOver.__init__(
        self,
        from_db=from_db,
        to_db=to_db,
    )
class SubmitHiCLiftOver:
    """Lift Hi-C TAD rows from hg19 to hg38 and submit the lifted files.

    ``runLiftover`` downloads the TAD files of the released Hi-C experiments,
    rewrites each row with lifted coordinates and stores summary tables;
    ``runSubmit`` builds the submission metadata for the lifted files and
    hands it to the module-level ``submitFile`` function.
    """

    # Directory receiving the lifted files and the summary tables.
    OUT_DIR = "/home/mjp/tadsLiftOverHg19ToHg38"
    # Search listing the experiments to process.
    HIC_URL = "https://www.encodeproject.org/search/?type=Experiment&assay_title=Hi-C&status=released"
    # A row is dropped when lifting changes an interval length by more than this.
    MAX_LEN_CHANGE = 5000
    # Separator between the middle coordinate and the right boundary name.
    BOUNDARY_SEP = '___'

    def __init__(self, args):
        self.args = args
        self.doLiftOver = LiftOver('hg19', 'hg38')
        self.lengths_orig = []      # [oldLen, newLen] of every lifted left interval
        self.lengths_filtered = []  # same, restricted to rows that were kept
        self.oldVsNew = []          # [input line, lifted line] of rows that were kept

    def splitStrCoordStr(self, raw):
        """Turn ``chrom:start-end`` into a tab-separated ``chrom start end`` string."""
        parts = raw.split(':')
        bounds = parts[1].split('-')
        return "\t".join([parts[0], bounds[0], bounds[1]])

    def splitStrCoord(self, raw):
        """Turn ``chrom:start-end`` into ``[chrom, int(start), int(end)]``."""
        parts = raw.split(':')
        bounds = parts[1].split('-')
        return [parts[0], int(bounds[0]), int(bounds[1])]

    def wrapLiftover(self, debug, chrom, start, end, errMsg):
        """Lift both ends of an interval.

        :param debug: when true, print the reason for every rejection.
        :param errMsg: prefix of those debug messages.
        :returns: ``[chrom, start, end, oldLen, newLen, absDiff]`` using the
            first match of each end, or ``None`` when an end cannot be
            lifted, the ends land on different chromosomes, or the original
            or lifted length is below 1.
        """
        lift_start = self.doLiftOver.convert_coordinate(chrom, start)
        if not lift_start:
            if debug:
                print(errMsg + " start", chrom, start)
            return None
        lift_start = lift_start[0]

        lift_end = self.doLiftOver.convert_coordinate(chrom, end)
        if not lift_end:
            if debug:
                print(errMsg + " end", chrom, end)
            return None
        lift_end = lift_end[0]

        if lift_start[0] != lift_end[0]:
            if debug:
                print(errMsg + " no longer same chrom", chrom, start, end,
                      lift_start[0], lift_end[0])
            return None

        oldLen = end - start
        chromLift = lift_start[0]
        startLift = lift_start[1]
        endLift = lift_end[1]
        newLen = endLift - startLift

        if oldLen < 1:
            if debug:
                print(errMsg + " oldLen: negative!", chrom, start, end)
            return None
        if newLen < 1:
            if debug:
                print(errMsg + " newLen: negative!", chromLift, startLift, endLift)
            return None

        absDiff = abs(newLen - oldLen)
        return [chromLift, startLift, endLift, oldLen, newLen, absDiff]

    def coordToStr(self, c):
        """Format ``[chrom, start, end, ...]`` as ``chrom:start-end``."""
        return c[0] + ':' + str(c[1]) + '-' + str(c[2])

    def parseLine(self, line):
        """Lift one TAD row; return the rewritten row or ``None`` if it is dropped.

        Example input row::

            chr10 3240001 4120000 boundary.3|hg19|chr10:3240001-3280000___boundary.4|hg19|chr10:4080001-4120000 1.06090369391

        i.e. ``[0chrom, 1start, 2end, 3mess, 4value]``.  The fourth field has
        either three ``|``-separated parts (one boundary) or carries a second
        boundary after ``___``.  A row is dropped when any of its intervals
        cannot be lifted or changes length by more than ``MAX_LEN_CHANGE``.
        """
        toks = line.split()
        leftCoord = [toks[0], int(toks[1]), int(toks[2])]

        mtoks = toks[3].split('|')
        hasRight = 3 != len(mtoks)
        midBoundaryLeft = mtoks[0]

        # Split on the full three-underscore separator.  Splitting on '__'
        # left a stray underscore on the boundary name, so the rewritten
        # field ended up with four underscores.
        midParts = mtoks[2].split(self.BOUNDARY_SEP)
        midCoord = self.splitStrCoord(midParts[0])
        if hasRight:
            midBoundaryRight = midParts[1]
            rightCoord = self.splitStrCoord(mtoks[-1])

        leftCoordLift = self.wrapLiftover(False, leftCoord[0], leftCoord[1],
                                          leftCoord[2], "left")
        if not leftCoordLift:
            return None
        # Recorded before the length filter so both tables can be compared.
        self.lengths_orig.append([leftCoordLift[3], leftCoordLift[4]])
        if leftCoordLift[5] > self.MAX_LEN_CHANGE:
            return None

        midCoordLift = self.wrapLiftover(False, midCoord[0], midCoord[1],
                                         midCoord[2], "mid")
        if not midCoordLift:
            return None
        if midCoordLift[5] > self.MAX_LEN_CHANGE:
            return None

        if hasRight:
            rightCoordLift = self.wrapLiftover(False, rightCoord[0], rightCoord[1],
                                               rightCoord[2], "right")
            if not rightCoordLift:
                return None
            if rightCoordLift[5] > self.MAX_LEN_CHANGE:
                return None

        self.lengths_filtered.append([leftCoordLift[3], leftCoordLift[4]])

        if hasRight:
            mid = [midBoundaryLeft, "hg38-liftOver",
                   self.coordToStr(midCoordLift) + self.BOUNDARY_SEP + midBoundaryRight,
                   "hg38-liftOver", self.coordToStr(rightCoordLift)]
        else:
            mid = [midBoundaryLeft, "hg38-liftOver", self.coordToStr(midCoordLift)]

        ret = "\t".join([str(x) for x in
                         leftCoordLift[:3] + ['|'.join(mid)] + [toks[4]]])
        self.oldVsNew.append([line, ret])
        return ret

    def tmpFile(self, accession, assembly, prefix):
        """Return the path of the lifted file for ``accession``."""
        return os.path.join(self.OUT_DIR,
                            assembly + "_liftOver_" + prefix + '_' + accession + ".bed.gz")

    def parseOutFile(self, accession, fnp):
        """Lift every row of the gzipped file ``fnp`` into the hg38 output file.

        Prints the number of rows kept and dropped.
        """
        good = 0
        bad = 0
        with gzip.open(fnp) as f:
            with gzip.open(self.tmpFile(accession, 'hg38', 'point'), 'wb') as outF:
                for line in f:
                    # gzip yields bytes on Python 3, while parseLine works on
                    # text; on Python 2 the line already is a str.
                    if not isinstance(line, str):
                        line = line.decode('utf-8')
                    newLine = self.parseLine(line)
                    if newLine:
                        payload = newLine + '\n'
                        # The output handle is binary.
                        if not isinstance(payload, bytes):
                            payload = payload.encode('utf-8')
                        outF.write(payload)
                        good += 1
                    else:
                        bad += 1
        print("lifted:", accession, good, bad)

    def _writeTable(self, name, rows):
        """Write ``rows`` as tab-separated lines to ``name`` inside ``OUT_DIR``."""
        fnp = os.path.join(self.OUT_DIR, name)
        with open(fnp, 'w') as f:
            for r in rows:
                f.write('\t'.join([str(x) for x in r]) + '\n')
        print("wrote", fnp)

    def runLiftover(self):
        """Download and lift every TAD file, then write the summary tables."""
        mc = MemCacheWrapper()
        qd = QueryDCC(cache=mc)

        for exp in qd.getExps(self.HIC_URL):
            for f in exp.getTADs():
                f.download()
                self.parseOutFile(f.fileID, f.fnp())

        self._writeTable("lengths_orig.tsv", self.lengths_orig)
        self._writeTable("lengths_filtered.tsv", self.lengths_filtered)

        fnp = os.path.join(self.OUT_DIR, "oldVsNew.tsv")
        with open(fnp, 'w') as f:
            for oldLine, newLine in self.oldVsNew:
                # The input line is written as stored by parseLine; only the
                # lifted line gets a newline appended here.
                f.write(oldLine)
                f.write(newLine + '\n')
        print("wrote", fnp)

    def fileJson(self, exp, f, fnp):
        """Build the submission metadata for the lifted file at ``fnp``."""
        return {
            "dataset": exp.encodeID,
            "file_format": "bed",
            "file_format_type": "bed3+",
            "file_size": os.path.getsize(fnp),
            "md5sum": Utils.md5(fnp),
            "output_type": f.output_type,
            "assembly": "GRCh38",
            "award": "/awards/U41HG007000/",
            "lab": "/labs/zhiping-weng/",
            "derived_from": [f.fileID],
            "submitted_file_name": fnp,
            "aliases": ["zhiping-weng:hic-tad-hg38-liftOver-" + f.fileID]
        }

    def submitFile(self, exp, f):
        """Print the metadata of one lifted file and submit it."""
        fileAccession = f.fileID
        fnp = self.tmpFile(fileAccession, 'hg38', 'point')
        j = self.fileJson(exp, f, fnp)
        print(j)
        # Resolves to the module-level submitFile function, not this method.
        submitFile(self.args, j)

    def runSubmit(self):
        """Authenticate, then submit the lifted file of every TAD file."""
        authenticateEncodeTxt(self.args)

        mc = MemCacheWrapper()
        qd = QueryDCC(cache=mc)

        for exp in qd.getExps(self.HIC_URL):
            for f in exp.getTADs():
                f.download()
                self.submitFile(exp, f)
There are some missing arguments modified. Usage: compare_mafs MAF_FILE_GDC MAF_FILE_TCGA MAF_FILE_1: Path for GDC maf file MAF_FILE_2: Path for TCGA maf file """ sys.exit() else: gdc_maf_project = sys.argv[1] tcga_maf_file = sys.argv[2] # Read files in GDC path gdc_maf_files = glob.glob('../kossproject/*_maf_files_tcga/TCGA.' + gdc_maf_project + '*.maf') nfiles_gdc = len(gdc_maf_files) # Read crossing reference lo = LiftOver('hg19', 'hg38') fastaRef = pybedtools.example_filename('/mnt/GDCpaper/Homo_sapiens.GRCh38.dna.primary_assembly.fa') # Variables for count FP, FN, TP, TN pair_list = {} TP=0 FP=0 total=0 noncross=0 diffref=0 # Reading each file separately gdc_var_files_list = [None] * nfiles_gdc gdc_pairs = [] file = 0 for maf_file in gdc_maf_files:
fam_handle = open(fam_fname)

# Dox individuals
dox_fname = args[1]
#dox_fname = "../data/samples.txt"
# The message is shown when the check FAILS, so it has to describe the
# missing file rather than state that the file exists.
assert os.path.exists(dox_fname), \
    "Input samples file does not exist: %s" % dox_fname
dox_handle = open(dox_fname)

# The second dot-separated field of the input file name is used as the contig.
contig = os.path.basename(in_fname).split(".")[1]

# Constant column values; "." is used for both qual and filter.
qual = "."
filter = "."
format = "GT"

# liftOver chain
lo = LiftOver("hg19", "hg38")

# Today's date as YYYYMMDD
today = str(datetime.date.today()).replace("-", "")

# Logging number of SNPs
log_variants = 0    # Number of variants
log_snps = 0        # Number of SNPs
log_success = 0     # Number of SNPs successfully converted to hg38
log_nonconvert = 0  # Number of SNPs with no coordinates in hg38
log_multi = 0       # Number of SNPs with multiple coordinates in hg38
log_diff_chr = 0    # Number of SNPs with hg38 coordinates on diff chromosome

# Parse individuals -----------------------------------------------------------

# All individuals in Plink .fam file
Usage: lift_over.py <from-build> <to-build> stdin line format: chrom bp_in_from_build stdout line format: bp_in_to_build, or '-' if not found Created on February 19, 2014 @author: Oren Livne <*****@*****.**> ============================================================ ''' import sys, traceback, util from pyliftover import LiftOver if __name__ == '__main__': try: src, target = sys.argv[1:3] if src == target: for _, bp in (line.strip().split(' ') for line in sys.stdin): print '%d %d' % (int(bp), int(bp)) else: lo = LiftOver(src, target) for chrom, bp in (line.strip().split(' ') for line in sys.stdin): out = lo.convert_coordinate('chr' + chrom, int(bp)) if not out: print '-' else: print '%d' % (out[0][1],) except: traceback.print_exc(file=sys.stdout) sys.exit(util.EXIT_FAILURE)
from pyliftover import LiftOver
import os
import xml.etree.ElementTree as ET
import shutil

hg19_test = 'seqont_c'
top_dir = 'C:\\Users\\Kyle\\cravat\\CRAVAT-testing\\test_cases\\seqont'
lo = LiftOver('hg19','hg18')

# The hg18 test name replaces the last underscore-separated part with "18".
name_parts = hg19_test.split('_')
hg18_test = '_'.join(name_parts[:-1]) + '_18'
hg19_dir = os.path.join(top_dir, hg19_test)
hg18_dir = os.path.join(top_dir, hg18_test)

# Create a fresh hg18 directory and copy the hg19 files into it under the new
# test name.  All later changes are made on those copies.
print('Making folder and files')
if os.path.exists(hg18_dir):
    cont = raw_input('hg18 dir exists, continue? <y/n>: ')
    if cont != 'y':
        exit()
    # Confirmed by the user: start over from an empty directory.
    shutil.rmtree(hg18_dir)
os.makedirs(hg18_dir)

for file_template in ('%s_desc.xml', '%s_input.txt', '%s_key.csv'):
    shutil.copy(os.path.join(hg19_dir, file_template % hg19_test),
                os.path.join(hg18_dir, file_template % hg18_test))

# Add a <hg18>on</hg18> tag to the desc.xml
print('Changing desc file')
from pymongo import MongoClient
from pyliftover import LiftOver

mongo_client = MongoClient()
db = mongo_client.fasttrack
lo = LiftOver("hg38ToHg19.over.chain.gz")

unmatched = 0  # records whose position has no hg19 mapping
matched = 0    # records that received hg19chr / hg19pos

# Add hg19 coordinates to every gwas document that carries a usable
# chr_id / chr_pos pair.
for r in db.gwas.find():
    chrid = r["chr_id"]
    chrpos = r["chr_pos"]
    if not (chrid and chrpos):
        continue

    try:
        _chrpos = int(chrpos)
    except (TypeError, ValueError):
        # Not a single integer position; such records are left untouched.
        continue

    # pyliftover works on 0-based coordinates, hence the shift on the way in.
    lifted = lo.convert_coordinate("chr%s" % chrid, _chrpos - 1)
    if lifted:
        new_chrid = lifted[0][0].split("chr")[1]
        # The result is 0-based as well; shift it back so that hg19pos uses
        # the same convention as chr_pos.
        new_chrpos = lifted[0][1] + 1
        matched += 1
        db.gwas.update_many(
            {"chr_id": chrid, "chr_pos": chrpos},
            {"$set": {"hg19chr": new_chrid, "hg19pos": new_chrpos}}
        )
    else:
        # print('NONE: %s %s' %(chrid, chrpos))
        unmatched += 1
def plot_manhattan(
        args, annotations, l_x, l_y, l_c, x_ticks, y_max, d_pos_init_chrom):
    """Draw the Manhattan plot and save it as ``<args.out>.manhattan.png``.

    :param args: options object; ``min_y``, ``EFO``, ``threshold_p``,
        ``title`` and ``out`` are read.
    :param annotations: dicts with the keys ``prob``, ``pos``, ``af``,
        ``rsID``, ``gene_names``, ``x`` and ``y``; those whose ``prob`` does
        not exceed ``args.threshold_p`` are printed to stdout and labelled.
    :param l_x: x values handed to ``plt.scatter``.
    :param l_y: y values handed to ``plt.scatter``.
    :param l_c: colours handed to ``plt.scatter``.
    :param x_ticks: pairs that are unzipped into the ``plt.xticks`` arguments.
    :param y_max: largest y value; the axis limit is ``int(y_max + 3)`` or
        ``args.min_y``, whichever is larger.
    :param d_pos_init_chrom: x offset per chromosome id.
    """

    y_max = max(int(y_max + 3), args.min_y)

    if args.EFO:
        ## Just make some assumptions about builds here for now.
        ## https://en.wikipedia.org/wiki/Reference_genome
        lo = LiftOver('hg38', 'hg19')

        ## First pass: find the most frequent value of column 7 (the trait).
        with open(args.EFO) as f:
            cnt = collections.Counter(line.split('\t')[7] for line in f)
        trait_most_common = cnt.most_common(1)[0][0]

        ## Second pass: one dashed vertical line per usable association.
        with open(args.EFO) as f:
            ## Skip header.
            next(f, None)
            for line in f:
                fields = line.split('\t')
                CHR_ID = fields[11]
                ## Skip if missing data.
                if CHR_ID == '':
                    continue
                try:
                    CHR_POS = int(fields[12])
                ## Continue if CHR_POS is not an integer.
                except ValueError:
                    continue
                y = min(y_max, float(fields[28]))

                try:
                    x_offset = d_pos_init_chrom[CHR_ID]
                except KeyError:
                    assert CHR_ID == 'X'
                    continue

                ## None means the chromosome is not in the chain and an
                ## empty list means the position has no mapping.  Both are
                ## reported and skipped; None used to raise a TypeError.
                lifted = lo.convert_coordinate(
                    'chr{}'.format(CHR_ID), CHR_POS)
                if not lifted:
                    print('IndexError', CHR_ID, CHR_POS, lifted,
                          file=sys.stderr)
                    continue
                x = x_offset + lifted[0][1]

                ## Colour most frequently occuring trait red and the less
                ## frequent ones orange, because these might be junk in the
                ## GWAS catalog.
                if fields[7] == trait_most_common:
                    colour = '#FF0000'
                else:
                    colour = '#FF8000'
                plt.vlines(x, 0, y, colors=colour,
                           linewidth=0.5, linestyle='--')

    plt.ylabel(r'-log$_{10}$($p$)')
    ## Significance threshold.
    plt.axhline(-math.log10(args.threshold_p), color='0.2', linewidth=0.5,
                linestyle='--')
    try:
        plt.ylim((0, y_max))  # todo: make argument
    except Exception:
        ## Best effort, as before: keep the default limits on failure.
        pass

    print('plt.scatter(manhattan)', file=sys.stderr)
    plt.scatter(l_x, l_y, c=l_c, s=3)

    plt.title(args.title, fontsize='small')

    for annotation in annotations:
        if annotation['prob'] > args.threshold_p:
            continue
        print('\t'.join(
            [str(annotation[k]) for k in sorted(annotation.keys())]))
        plt.annotate(
            '\n'.join((
                'p={:.1E}'.format(annotation['prob']),
                'pos={:,}'.format(annotation['pos']),
                'MAF={:.3f}'.format(min(annotation['af'], 1 - annotation['af'])),
                annotation['rsID'],
                ','.join(annotation['gene_names']),
                )),
            xy=(annotation['x'], annotation['y']),
            fontsize='xx-small',
            horizontalalignment='center',
            verticalalignment='bottom',
            rotation=30,
            )

    plt.xticks(
        *zip(*x_ticks),
        rotation=-75, size=6, fontsize=6)

    print('plt.savefig( {}.manhattan.png )'.format(args.out), file=sys.stderr)
    plt.savefig('{}.manhattan.png'.format(args.out), dpi=600)

    return