def run(args):
    """Lift over the (chromosome, start, end) columns of a text file.

    Reads ``args.input`` line by line and writes a gzip-compressed,
    tab-separated result to ``args.output``. The first line is treated as a
    header: its whitespace-separated fields are re-joined with tabs and
    copied through. For every other line, the start and end positions are
    lifted independently through ``_l`` using the chain in ``args.liftover``.
    Lines that raise during parsing, conversion or writing are logged at
    info level and skipped.
    """
    Utilities.ensure_requisite_folders(args.output)

    logging.info("starting lifting over.")
    liftover = pyliftover.LiftOver(args.liftover)
    with gzip.open(args.output, "w") as sink, open(args.input) as source:
        for index, raw in enumerate(source):
            if index == 0:
                header = "\t".join(raw.strip().split()) + "\n"
                sink.write(header.encode())
                continue

            # `record` is what gets reported on failure: the input line until
            # the output line has been built, the output line afterwards.
            record = raw
            try:
                fields = raw.strip().split()
                chromosome = fields[0]
                start = int(fields[1])
                end = int(fields[2])

                start_chromosome, lifted_start = _l(liftover, chromosome, start)
                end_chromosome, lifted_end = _l(liftover, chromosome, end)
                if start_chromosome != end_chromosome:
                    logging.warning(
                        "{}:{}:{} have different target chromosomes: {}/{}".format(
                            chromosome, start, end,
                            start_chromosome, end_chromosome))

                # The output always carries the chromosome of the lifted start.
                record = "{}\n".format("\t".join(
                    [start_chromosome, str(lifted_start), str(lifted_end)]))
                sink.write(record.encode())
            except Exception:
                logging.info("Error for: %s", record)

    logging.info("Finished lifting over.")
def UpdateWarburtonTable1(infile, ref_file, outfile):
    """Rewrite the hg18 interval in the last column of a TSV as hg38.

    The last column of every data row in ``infile`` is expected to look like
    ``chrN:1,234-5,678`` (thousands separators allowed). Both ends are lifted
    from hg18 to hg38 and the row is written, with the interval replaced, to
    ``<infile without extension>_hg38.tsv``. The header row is copied
    unchanged.

    Args:
        infile: Path of the tab-separated input table.
        ref_file: Unused; kept for interface compatibility.
        outfile: Path that is created/truncated but left empty (the sequence
            export that would have written to it is disabled).

    Raises:
        IndexError / TypeError: when pyliftover returns no conversion for one
            of the interval ends.
    """
    updated_file = '{0}_hg38.tsv'.format('.'.join(infile.split('.')[:-1]))

    # Context managers guarantee that all three handles are closed, including
    # the handle for `outfile` (the original called .close() on the path
    # string, which raised AttributeError at the very end of the run).
    with open(infile, 'r') as inhandle, \
            open(outfile, 'w'), \
            open(updated_file, 'w') as updated_handle:
        intable = csv.reader(inhandle, delimiter='\t')
        updated_table = csv.writer(updated_handle, delimiter='\t')

        header = next(intable)
        updated_table.writerow(header)

        lo = pyliftover.LiftOver('hg18', 'hg38')
        for row in intable:
            chrom, interval = row[-1].split(':')
            left, right = interval.split('-')
            # Drop thousands separators before converting to int.
            left = int(left.replace(',', ''))
            right = int(right.replace(',', ''))

            coord_left = lo.convert_coordinate(chrom, left)[0][1]
            # The reported chromosome is the one the right end maps to.
            chromosome, coord_right = lo.convert_coordinate(chrom, right)[0][:2]
            print('{0} {1} {2}'.format(chromosome, left, coord_left))
            print('{0} {1} {2}'.format(chromosome, right, coord_right))

            new_line = row[:-1] + [
                '{0}:{1}-{2}'.format(chromosome, coord_left, coord_right)
            ]
            updated_table.writerow(new_line)
def liftover(args, d):
    """Replace the chromosome/position columns of ``d`` with lifted values.

    Args:
        args: Object whose ``liftover`` attribute is passed to
            ``pyliftover.LiftOver``.
        d: pandas DataFrame with ``chromosome``, ``position`` and
            ``variant_id`` columns.

    Returns:
        A new DataFrame with the same rows; variants that cannot be lifted
        get the string ``"NA"`` in both ``chromosome`` and ``position``.
    """
    logging.info("Performing liftover")
    l = pyliftover.LiftOver(args.liftover)
    new_position = []
    new_chromosome = []
    for t in d.itertuples():
        # "NA" is important, instead of None or NaN, so that integer
        # positions are not converted to floats by pandas.
        _new_chromosome = "NA"
        _new_position = "NA"
        try:
            p = int(t.position)
            l_ = l.convert_coordinate(t.chromosome, p)
            if l_:
                if len(l_) > 1:
                    logging.warning(
                        "Liftover with more than one candidate: %s",
                        t.variant_id)
                # The first candidate is the one that is kept.
                _new_chromosome = l_[0][0]
                _new_position = int(l_[0][1])
        except Exception:
            # Still best-effort (the variant stays "NA"), but no longer
            # swallows KeyboardInterrupt/SystemExit, and the reason is
            # available at debug level instead of being lost.
            logging.debug("Liftover failed for %s", t, exc_info=True)

        new_chromosome.append(_new_chromosome)
        new_position.append(_new_position)

    d = d.assign(chromosome=new_chromosome)
    d = d.assign(position=new_position)

    logging.info("%d variants after liftover", d.shape[0])
    return d
def call_liftover(df):
    """Call pyliftover.LiftOver to update genomic coordinates.

    The source build is read from the first row of the ``build`` column and
    selects the chain (``HG19TO38`` or ``HG18TO38``). The original ``chrom``,
    ``pos`` and ``variant_id`` columns are backed up on the input frame as
    ``<column>_<build>`` before the lifted values are assigned.

    Args:
        df: pandas DataFrame with ``build``, ``chrom``, ``pos`` and
            ``variant_id`` columns.

    Returns:
        A new DataFrame with lifted ``chrom``/``pos`` and ``build`` set to
        ``'b38'``.

    Exits the process with status 1 when the build is not recognised.
    """
    logging.info(' updating genomic coordinates.')

    # Positional access: `df['build'][0]` is a label lookup and fails when
    # the index does not contain the label 0 (e.g. after filtering).
    build = df['build'].iloc[0]
    if build in ('hg37', 'hg19', 'b37'):
        chain = HG19TO38
    elif build in ('hg18', 'b18'):
        chain = HG18TO38
    else:
        logging.error(' genome build information is not available.')
        sys.exit(1)

    lifting = pyliftover.LiftOver(chain)
    new_chrom = []
    new_pos = []

    # Keep the pre-liftover values next to the lifted ones.
    df['chrom_' + build] = df['chrom']
    df['pos_' + build] = df['pos']
    df['variant_id_' + build] = df['variant_id']

    for t in df.itertuples():
        _lifted_chrom, _lifted_pos = convert_coords(lifting, t)
        new_chrom.append(_lifted_chrom)
        new_pos.append(_lifted_pos)

    df = df.assign(chrom=new_chrom)
    df = df.assign(pos=new_pos)

    # update build information in the dataframe
    df['build'] = 'b38'

    logging.info(' %s variants after liftover', df.shape[0])
    return df
def main():
    """Add a lifted-over position column to a table and optionally a Marey map.

    Command-line parameters come from ``parseArgs()``. The table at
    ``params.table`` is read as TSV, every row's ``params.bp`` position on
    chromosome ``"chr" + row[params.chrom]`` is lifted with the chain in
    ``params.liftover``, and the result is stored in column ``params.ocol``
    before the table is written to ``params.oname``. With ``params.marey``
    set, a Marey Map input is also written to ``<oname>_mmap.txt``.
    """
    params = parseArgs()

    if not params.liftover:
        params.display_help("Error: No liftover file provided")
        return

    lo = pyliftover.LiftOver(params.liftover)

    if not params.table:
        params.display_help("Error: No table provided")
        return

    tab = pd.read_csv(params.table, sep="\t")
    print("Read table:")
    print(tab)

    def lifted_position(record):
        # Only the first conversion candidate is used.
        contig = "chr" + record[params.chrom]
        hits = lo.convert_coordinate(contig, record[params.bp])
        best = hits[0]
        return int(best[1])

    tab[params.ocol] = tab.apply(lifted_position, axis=1)
    print("Writing the output table:")
    print(tab)
    tab.to_csv(params.oname, sep="\t", index=False)

    if not params.marey:
        return

    marey = make_marey(tab, params.chrom, params.ocol)
    print("Created the following Marey Map input:")
    print(marey)
    mout = params.oname + "_mmap.txt"
    marey.to_csv(mout, sep=" ", quoting=csv.QUOTE_NONNUMERIC, index=False)
def liftover(v, frm, to):
    """Lift a ``chrom-pos-ref-alt`` variant from build ``frm`` to build ``to``.

    ``v`` carries a 1-based position and a chromosome without the ``chr``
    prefix. pyliftover is 0-based, so the position is shifted down before the
    conversion and back up afterwards.

    pyliftover is slow, and the first use of a ``frm``/``to`` pair may take
    time to download the data from UCSC.

    Returns:
        A list of ``chrom-pos-ref-alt`` strings, one per conversion
        candidate; empty when there is no conversion.
    """
    import pyliftover

    converter = pyliftover.LiftOver(frm, to)
    chrom, pos, ref, alt = v.split('-')
    hits = converter.convert_coordinate('chr' + chrom, int(pos) - 1)

    lifted = []
    for hit in hits or ():
        target_chrom = hit[0][3:]  # drop the leading "chr"
        target_pos = str(hit[1] + 1)  # back to 1-based
        lifted.append('-'.join([target_chrom, target_pos, ref, alt]))
    return lifted
def dosage_generator(args, variant_mapping=None, weights=None):
    """Build the genotype dosage source selected by the command-line arguments.

    Exactly one of ``args.text_genotypes``, ``args.bgen_genotypes`` or
    ``args.vcf_genotypes`` picks the reader (checked in that order). Variants
    are restricted to a whitelist: the keys of ``variant_mapping`` when it is
    a non-empty plain ``dict``, otherwise ``weights.rsid``. When
    ``args.liftover`` is given, the text and VCF readers receive a callable
    that lifts ``(chr, pos)`` through ``Genomics.lift``.

    Raises:
        Exceptions.InvalidArguments: when no genotype input is given.
    """
    liftover_chain = None
    liftover_conversion = None
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)

        def liftover_conversion(chr, pos):
            return Genomics.lift(
                liftover_chain, chr, pos, args.zero_based_positions)

    if variant_mapping and type(variant_mapping) is dict:
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping.keys())
    else:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)

    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        source = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        # The BGEN reader is not given a liftover conversion.
        source = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        source = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    else:
        source = None

    if source is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")

    if args.force_mapped_metadata:
        source = Genotype.force_mapped_metadata(
            source, args.force_mapped_metadata)
    return source
def liftover(args, d):
    """Return ``d`` with ``chromosome`` and ``position`` lifted via ``_lift``.

    Every row's ``(chromosome, position)`` is passed to ``_lift`` together
    with a converter built from ``args.liftover``; the returned pairs replace
    the two columns. The number of rows is unchanged.
    """
    logging.info("Performing liftover")
    converter = pyliftover.LiftOver(args.liftover)

    lifted = [
        _lift(converter, row.chromosome, row.position)
        for row in d.itertuples()
    ]
    chromosomes = [chromosome for chromosome, _ in lifted]
    positions = [position for _, position in lifted]

    d = d.assign(chromosome=chromosomes)
    d = d.assign(position=positions)

    logging.info("%d variants after liftover", d.shape[0])
    return d
import pyliftover
import sys

if __name__ == '__main__':
    # Usage: <script> CHROM POSITION
    # CHROM is given without the "chr" prefix. Prints "<chrom>,<position>"
    # for the first conversion found in hg19ToHg38.over.chain.
    if len(sys.argv) < 3:
        sys.exit('usage: {0} CHROM POSITION'.format(sys.argv[0]))

    chrom = 'chr' + sys.argv[1]
    pos = int(sys.argv[2])

    lo = pyliftover.LiftOver('hg19ToHg38.over.chain')
    result = lo.convert_coordinate(chrom, pos)
    if not result:
        # None (unknown chromosome) or [] (no conversion): report it instead
        # of failing with an IndexError/TypeError traceback.
        sys.exit('no conversion found for {0}:{1}'.format(chrom, pos))

    # Read the fields of the first result tuple directly rather than parsing
    # its string representation.
    target_chrom, target_pos = result[0][0], result[0][1]
    print('{0},{1}'.format(target_chrom, target_pos))
def converttohg38(args):
    """Create an hg38 copy of a SQLite database with lifted positions.

    The schema of ``args.db`` is dumped with the ``sqlite3`` command-line
    tool and replayed into ``<db without extension>.hg38.sqlite`` (an
    existing file of that name is deleted first). Tables containing any of
    ``args.cols`` are converted: each of those columns is lifted from
    ``args.sourcegenome`` to hg38. Every other table is copied unchanged.

    The chromosome of a row is read from ``args.chromcol`` when given,
    otherwise the table name is used as the chromosome. A row whose position
    has no conversion keeps its original value for that column; when the
    conversion result is an empty list the row is also recorded in
    ``<new db>.noconversion``.

    Side effects: writes ``<db>.newdb.sql``, the new database and the
    ``.noconversion`` file; sets and sorts ``args.tables`` when it was None.
    """
    if args.sourcegenome not in ['hg18', 'hg19']:
        print('Source genome should be either hg18 or hg19.')
        exit()
    if not os.path.exists(args.db):
        print(args.db, 'does not exist.')
        exit()
    liftover = pyliftover.LiftOver(
        constants.get_liftover_chain_path_for_src_genome(args.sourcegenome))

    print('Extracting table schema from DB...')
    output = subprocess.check_output(['sqlite3', args.db, '.schema'])
    sqlpath = args.db + '.newdb.sql'
    with open(sqlpath, 'w') as schema_file:
        schema_file.write(output.decode())

    newdbpath = '.'.join(args.db.split('.')[:-1]) + '.hg38.sqlite'
    if os.path.exists(newdbpath):
        print('Deleting existing hg38 DB...')
        os.remove(newdbpath)
    print('Creating ' + newdbpath + '...')
    newdb = sqlite3.connect(newdbpath)
    newc = newdb.cursor()
    print('Creating same table(s) in ' + newdbpath + '...')
    subprocess.check_output(['sqlite3', newdbpath, '.read ' + sqlpath])

    db = sqlite3.connect(args.db)
    c = db.cursor()

    def insert_row(table, row):
        # Bound parameters instead of a hand-quoted value list: values that
        # contain quotes, NULLs and BLOBs are inserted correctly. The table
        # name cannot be bound, so it is still concatenated.
        placeholders = ','.join('?' * len(row))
        newc.execute(
            'insert into ' + table + ' values(' + placeholders + ')', row)

    if args.tables is None:
        print('tables not given. All tables will be tried.')
        output = subprocess.check_output(['sqlite3', args.db, '.table'])
        args.tables = output.decode().split()
        args.tables.sort()
        print('The following tables will be examined:',
              ', '.join(args.tables))

    # Decide, per table, whether positions must be lifted or the rows can be
    # copied as they are.
    tables_toconvert = []
    tables_tocopy = []
    for table in args.tables:
        c.execute('select * from ' + table + ' limit 1')
        cols = [v[0] for v in c.description]
        if args.chromcol is not None and args.chromcol not in cols:
            tables_tocopy.append(table)
            continue
        if any(col in cols for col in args.cols):
            tables_toconvert.append(table)
        else:
            tables_tocopy.append(table)
    print('Tables to convert:',
          ', '.join(tables_toconvert) if tables_toconvert else 'none')
    print('Tables to copy:',
          ', '.join(tables_tocopy) if tables_tocopy else 'none')

    count_interval = 10000
    with open(newdbpath + '.noconversion', 'w') as wf:
        for table in tables_toconvert:
            print('Converting ' + table + '...')
            c.execute('select * from ' + table)
            allcols = [v[0] for v in c.description]
            colnos = [allcols.index(col) for col in args.cols
                      if col in allcols]
            if args.chromcol is None:
                chromcolno = None
            else:
                chromcolno = allcols.index(args.chromcol)
            count = 0
            for row in c.fetchall():
                row = list(row)
                if chromcolno is not None:
                    chrom = row[chromcolno]
                else:
                    # No chromosome column: the table name is the chromosome.
                    chrom = table
                if not chrom.startswith('chr'):
                    chrom = 'chr' + chrom
                for colno in colnos:
                    pos = int(row[colno])
                    liftover_out = liftover.convert_coordinate(chrom, pos)
                    if liftover_out is None:
                        print('- no liftover mapping:',
                              chrom + ':' + str(pos))
                        continue
                    if not liftover_out:
                        wf.write(table + ':' +
                                 ','.join([str(v) for v in row]) + '\n')
                        continue
                    row[colno] = liftover_out[0][1]
                insert_row(table, row)
                count += 1
                if count % count_interval == 0:
                    print('  ' + str(count) + '...')
            print('  ' + table + ': done.', count, 'rows converted')

    for table in tables_tocopy:
        count = 0
        print('Copying ' + table + '...')
        c.execute('select * from ' + table)
        for row in c.fetchall():
            insert_row(table, list(row))
            count += 1
            if count % count_interval == 0:
                print('  ' + str(count) + '...')
        print('  ' + table + ': done.', count, 'rows converted')

    newdb.commit()
    newdb.close()
    db.close()
def do_liftover(chain_fn, description):
    """Write a lifted-over copy of ``nagalakshmi_annotations.txt``.

    The first two lines and the labels line of the input are copied
    unchanged to ``nagalakshmi_annotations_lifted_<description>.txt``. For
    every following gene line, the non-empty values of ``SGD_Start``,
    ``SGD_End``, ``5'-UTR_Start`` and ``3'-UTR_End`` are decremented by one,
    converted with the chain in ``chain_fn`` and stored back. Afterwards the
    larger of ``SGD_Start``/``SGD_End`` is incremented by one.

    Genes are dropped when they are ``YBR013C``, when they lie on
    ``chrMito``, or when any coordinate has no conversion.

    Raises:
        ValueError: with the gene name when ``SGD_Start == SGD_End`` after
            conversion.
    """
    original_fn = 'nagalakshmi_annotations.txt'
    lifted_fn = 'nagalakshmi_annotations_lifted_{0}.txt'.format(description)

    keys_to_convert = ['SGD_Start', 'SGD_End', '5\'-UTR_Start', '3\'-UTR_End']

    lo = pyliftover.LiftOver(chain_fn)

    # The input is opened once and both handles are closed on exit. The
    # original opened the input twice and never closed either handle.
    with open(original_fn) as original_fh, open(lifted_fn, 'w') as lifted_fh:
        for _ in range(2):
            lifted_fh.write(original_fh.readline())

        labels_line = original_fh.readline()
        lifted_fh.write(labels_line)
        labels = labels_line.strip().split()

        for line in original_fh:
            fields = line.strip('\n').split('\t')
            name = fields[0]

            if name == 'YBR013C':
                # This gets its 5' UTR deleted by liftover. Ignore it for now
                continue
            # NOTE: YJR122W has its coding sequence misannotated in
            # nagalakshmi; it is currently NOT skipped.

            # Pair every label after the first (the name) with its value.
            values = [maybe_int(field) for field in fields]
            gene = dict(zip(labels[1:], values[1:]))

            if gene['Chrom'] == 'chrMito':
                # Renamed in EF4, and not included in weinberg anyways.
                continue

            bad_lift = False
            for key in keys_to_convert:
                if gene[key] != '':
                    lift = lo.convert_coordinate(gene['Chrom'], gene[key] - 1)
                    if not lift:
                        # Covers both [] (no conversion) and None (chromosome
                        # not present in the chain file).
                        print('{0} empty list'.format(gene))
                        bad_lift = True
                        break
                    gene[key] = lift[0][1]

            if bad_lift:
                continue

            if gene['SGD_Start'] < gene['SGD_End']:
                # plus strand
                gene['SGD_End'] = gene['SGD_End'] + 1
            elif gene['SGD_Start'] > gene['SGD_End']:
                # minus strand
                gene['SGD_Start'] = gene['SGD_Start'] + 1
            else:
                raise ValueError(name)

            lifted_line = '\t'.join(
                [name] + [str(gene[key]) for key in labels[1:]]) + '\n'
            lifted_fh.write(lifted_line)
def _match_panel_alleles(ref_allele, alt_alleles_, panel_ref_allele,
                         panel_alt_allele, complement_translation):
    """Reconcile dbSNP alleles with the alleles of a panel variant.

    Returns ``(swap, strand_reversal, selected_ref_allele,
    selected_alt_allele)``. ``swap`` is 1 when the panel's ref/alt agree with
    dbSNP and -1 when they are exchanged; ``strand_reversal`` is 1 for a
    direct match and -1 when the match needs the complemented panel alleles.
    Both are None when nothing matches, in which case the dbSNP reference
    allele and the first alternative allele (None if there is none) are
    returned.
    """
    alt_alleles = set(alt_alleles_)
    # Complement only; the sequence is not reversed.
    complemented_ref = panel_ref_allele.translate(complement_translation)
    complemented_alt = panel_alt_allele.translate(complement_translation)

    swap, strand_reversal = None, None
    selected_ref_allele = ref_allele
    # Guard against a record whose observed alleles all equal the reference.
    selected_alt_allele = alt_alleles_[0] if alt_alleles_ else None

    if len(panel_ref_allele) == 1 and len(panel_alt_allele) == 1:
        # snp
        if panel_ref_allele == ref_allele and panel_alt_allele in alt_alleles:
            swap, strand_reversal = 1, 1
            selected_ref_allele = panel_ref_allele
            selected_alt_allele = panel_alt_allele
        elif panel_ref_allele in alt_alleles and panel_alt_allele == ref_allele:
            swap, strand_reversal = -1, 1
            selected_ref_allele = panel_alt_allele
            selected_alt_allele = panel_ref_allele
        elif complemented_ref == ref_allele and complemented_alt in alt_alleles:
            swap, strand_reversal = 1, -1
            selected_ref_allele = complemented_ref
            selected_alt_allele = complemented_alt
        elif complemented_ref in alt_alleles and complemented_alt == ref_allele:
            swap, strand_reversal = -1, -1
            selected_ref_allele = complemented_alt
            selected_alt_allele = complemented_ref
    elif (len(panel_ref_allele) > 1 and len(panel_alt_allele) == 1
          and ref_allele != "-"):
        # deletion: the panel alleles share their first base
        deleted = panel_ref_allele[1:]
        complemented_deleted = complemented_ref[1:]
        for allele_ in alt_alleles:
            if allele_ == deleted:
                swap, strand_reversal = 1, 1
                selected_ref_allele, selected_alt_allele = allele_, "-"
            if allele_ == complemented_deleted:
                swap, strand_reversal = 1, -1
                selected_ref_allele, selected_alt_allele = allele_, "-"
    elif (len(panel_ref_allele) == 1 and len(panel_alt_allele) > 1
          and ref_allele == "-"):
        # insertion: the panel alleles share their first base
        inserted = panel_alt_allele[1:]
        complemented_inserted = complemented_alt[1:]
        for allele_ in alt_alleles:
            if allele_ == inserted:
                swap, strand_reversal = 1, 1
                selected_ref_allele, selected_alt_allele = "-", allele_
            if allele_ == complemented_inserted:
                swap, strand_reversal = 1, -1
                selected_ref_allele, selected_alt_allele = "-", allele_

    return swap, strand_reversal, selected_ref_allele, selected_alt_allele


def run(args):
    """Match dbSNP records to the variants of a reference panel.

    Does nothing when ``args.output`` already exists. Otherwise the panel
    metadata (``args.snp_reference_metadata``) is indexed by
    ``chr<chromosome>_<position>``, and every line of the gzipped
    ``args.db_snp_file`` (after its first line) with at least two observed
    alleles is looked up by its position, lifted through ``args.liftover``
    when given. Records found in the panel are reconciled with the panel
    alleles: matched records are written to ``args.output``, unmatched ones
    to ``args.discard`` when that path is given. Both outputs are gzipped.

    Raises:
        RuntimeError: when two panel variants share a coordinate.
    """
    if os.path.exists(args.output):
        logging.info("Output already exists, nope.")
        return

    Utilities.ensure_requisite_folders(args.output)
    if args.discard:
        Utilities.ensure_requisite_folders(args.discard)

    if args.liftover:
        logging.info("Acquiring liftover")
        l = pyliftover.LiftOver(args.liftover)
    else:
        logging.info("Will not perform lift over")
        l = None

    logging.info("Loading snp reference metadata")
    snp_reference_metadata = pandas.read_table(args.snp_reference_metadata)
    reference = {}
    for t in snp_reference_metadata.itertuples():
        k = "chr{}_{}".format(t.chromosome, t.position)
        if k in reference:
            raise RuntimeError("coordinate is already present")
        reference[k] = (t.id, t.rsid)

    complement_translation = str.maketrans(
        {"C": "G", "G": "C", "T": "A", "A": "T"})

    header = l_(["rsid", "chromosome", "position", "a0", "a1", "strand",
                 "type", "panel_variant_id", "panel_variant_rsid",
                 "panel_variant_a0", "panel_variant_a1", "swap",
                 "strand_reversal"])

    logging.info("Processing db snp file")
    # The discard file is optional. The original wrote to and closed it
    # unconditionally, which raised NameError when no path was given.
    discard = gzip.open(args.discard, "w") if args.discard else None
    try:
        if discard is not None:
            discard.write(header)

        allele_re = re.compile(r"chr\d+_\d+_(.*)_(.*)_b38")

        with gzip.open(args.output, "w") as result:
            result.write(header)
            with gzip.open(args.db_snp_file) as db_snp:
                db_snp.readline()
                for line in db_snp:
                    comps = line.decode().strip().split("\t")

                    obs_alleles = comps[9].split("/")
                    if len(obs_alleles) < 2:
                        continue

                    chr = comps[1]
                    start_0 = comps[2]
                    if l:
                        _new_chromosome, _new_position = \
                            gwas_parsing._lift(l, chr, start_0)
                    else:
                        _new_chromosome, _new_position = chr, int(start_0)
                    if _new_chromosome == "NA" or _new_position == "NA":
                        continue

                    # The dbSNP start is shifted by one to build the panel key.
                    k = "{}_{}".format(_new_chromosome, _new_position + 1)
                    if k not in reference:
                        continue

                    rsid = comps[4]
                    strand = comps[6]
                    ref_allele = comps[7]
                    var_type = comps[11]
                    alt_alleles_ = [x for x in obs_alleles if x != ref_allele]

                    panel_variant_id, panel_variant_rsid = reference[k]
                    if not isinstance(panel_variant_rsid, str):
                        # e.g. NaN for a missing rsid in the metadata
                        panel_variant_rsid = "NA"
                    panel_alleles = allele_re.search(panel_variant_id)
                    panel_ref_allele = panel_alleles.group(1)
                    panel_alt_allele = panel_alleles.group(2)

                    (swap, strand_reversal,
                     selected_ref_allele, selected_alt_allele) = \
                        _match_panel_alleles(
                            ref_allele, alt_alleles_, panel_ref_allele,
                            panel_alt_allele, complement_translation)

                    ol = l_([rsid, chr, str(int(start_0) + 1),
                             selected_ref_allele, selected_alt_allele,
                             strand, var_type, panel_variant_id,
                             panel_variant_rsid, panel_ref_allele,
                             panel_alt_allele, swap, strand_reversal])
                    if (swap is not None and strand is not None
                            and selected_ref_allele is not None
                            and selected_alt_allele is not None):
                        result.write(ol)
                    elif discard is not None:
                        discard.write(ol)
    finally:
        if discard is not None:
            discard.close()

    logging.info("Done")
def Converter(genome_build_in='mm9', genome_build_out='mm10'):
    """Return a ``pyliftover.LiftOver`` from one genome build to another.

    Args:
        genome_build_in: Name of the source build (default ``'mm9'``).
        genome_build_out: Name of the target build (default ``'mm10'``).
    """
    converter = pyliftover.LiftOver(genome_build_in, genome_build_out)
    return converter
2. discard all positions outside of BRCA region Note: about 2000 files from 23andme, 200 files from ancestryDNA, 200 files from ftdna ftdna files are not processed because its human reference build version is not specified """ import glob import pyliftover import pdb import os SOURCE = ["23andme", "ancestry", "ftdna"] # preload all the pyliftover functions because the function needs to download from internet LIFT_MAP = { "37": pyliftover.LiftOver('hg19', 'hg38'), "36": pyliftover.LiftOver('hg18', 'hg38'), # pyliftover refuses to translate from hg17 to hg38 # therefore it's done in two steps hg17 -> hg19 -> hg38 "35": [pyliftover.LiftOver('hg17', 'hg19'), pyliftover.LiftOver('hg19', 'hg38')], "34": pyliftover.LiftOver('hg16', 'hg38') } BRCA_BOUNDARY = { "38": { "chr17": [43045629, 43125483], "chr13": [32315474, 32400266]
import pyliftover
import os

# Directory that contains this source file (symlinks resolved).
this_file_folder = os.path.dirname(os.path.realpath(__file__))

# Module-level converter, created at import time from the
# 'hg19ToHg38.over.chain.gz' file located next to this module.
lo = pyliftover.LiftOver(
    os.path.join(this_file_folder, 'hg19ToHg38.over.chain.gz'))
#!/usr/bin/env python
import pyliftover

# Two ways of building a converter. Only `pyliftover` itself is imported, so
# the class has to be referenced as `pyliftover.LiftOver`.

# 1. From a pair of genome build names.
lo = pyliftover.LiftOver('hg17', 'hg18')

# 2. From a chain file.
lo = pyliftover.LiftOver('hg17ToHg18.over.chain.gz')

# LiftOver() cannot be called without arguments: it needs at least a source
# build name or a chain file.

# FROM: https://github.com/konstantint/pyliftover/tree/master/pyliftover
# convert_coordinate(self, chromosome, position, strand='+'):
#     '''
#     Returns a *list* of possible conversions for a given chromosome position.
#     The list may be empty (no conversion), have a single element (unique conversion), or several elements (position mapped to several chains).
#     The list contains tuples (target_chromosome, target_position, target_strand, conversion_chain_score),
#     where conversion_chain_score is the "alignment score" field specified at the chain used to perform conversion. If there
#     are several possible conversions, they are sorted by decreasing conversion_chain_score.