def main(argv):
    """Join dbSNP rs ids onto existing variants from a 3-column join file.

    argv[0] (optional): path to the join file (format: chr:pos old_id rsid);
        defaults to exsnp/nors_j_dbsnp.txt.
    argv[1] (optional): integer "dup" flag passed through to
        VariantRS.log_dbsnpid().
    """
    fname = 'exsnp/nors_j_dbsnp.txt'
    dup = 0
    if len(argv) > 0:
        fname = argv[0]
    else:
        print(
            "no file provided, instead using default file %s to act as 3 column join file (chr:pos old_id rsid)"
            % (fname))
    if len(argv) > 1:
        dup = int(argv[1])
    joinfile = NormFile(fname)
    linecount = joinfile.row_count
    # Report progress roughly every 1% of lines. NOTE(review): for files
    # under 100 lines oneper is 0, so progress prints once at line 0 only.
    oneper = int(linecount / 100)
    add = oneper
    db = DBConnect("cc4")
    curs = db.getCursor()
    try:
        for num, cols in enumerate(joinfile.readls(" ")):
            try:
                var_rs = VariantRS(curs, cols[1], cols[2])
                var_rs.log_dbsnpid(dup)
                db.commit()
                if num == oneper:
                    print('%s percent done' % ((oneper / linecount) * 100))
                    oneper += add
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt is not
                # swallowed; report the offending line, then re-raise.
                print("Unexpected error with %s: %s" % (cols, sys.exc_info()[0]))
                raise
    finally:
        # Single close point replaces the original duplicated db.close()
        # on the error and success paths; also closes on interrupt.
        db.close()
def _makeF(self):
    """Execute self.qry (with self.vals) against self.db and dump every
    result row to self.bfile as tab-separated text, incrementing
    self.row_count per row.

    Errors are reported to stdout rather than raised (best-effort
    behavior preserved from the original).
    """
    # Bind names up front: the original `finally` raised
    # NameError/UnboundLocalError if open() or DBConnect() itself failed,
    # because f/curs/conn were never assigned.
    f = conn = curs = None
    try:
        f = open(self.bfile, "w")
        conn = DBConnect(self.db)
        curs = conn.getCursor()
        curs.execute(self.qry, self.vals)
        for row in curs:
            self.row_count += 1
            row = [str(i) for i in row]
            f.write("\t".join(row))
            f.write("\n")
    except Exception as e:
        print("error connecting/executing query/writing to file ",
              sys.exc_info()[0], e)
    finally:
        # Close whatever was actually opened, in the original order.
        if f is not None:
            f.close()
        if curs is not None:
            curs.close()
        if conn is not None:
            conn.close()
def main(argv):
    """Apply rs-id merges from an exsnp.py export file.

    argv[0]: path to a space-delimited file of (rsid, chr:coord,
    existing_alt_id) rows. Numeric chromosomes go through
    buildrsmerge(); X/Y/XY go through xyrsmerge(). Commits after each
    row; the connection is always closed, even on error.
    """
    if len(argv) < 1:
        print(
            "provide file exported from exsnp.py output, with possible build 37 merges and XY/X/Y possible rs id alternatives. Format: rsid chr:coord existing_alt_id. Sometimes called j_already.txt"
        )
        return
    # Guard clause above replaces the original else-nesting.
    infile = NormFile(argv[0])
    conn = DBConnect("cc3")
    curs = conn.getCursor()
    # try/finally replaces the original bare `except: conn.close(); raise`,
    # which duplicated the close call and swallowed no errors anyway.
    try:
        for rsid, refpos, altid in infile.readls(dlim=" "):
            if refpos.split(':')[0].isnumeric():
                buildrsmerge(curs, rsid, refpos, altid)
            else:
                xyrsmerge(curs, rsid, refpos, altid)
            conn.commit()
    finally:
        conn.close()
def main():
    """Walk every chromosome in the positions table of chip_comp.

    For each distinct chromosome (except the placeholder '0'), dump its
    (id, pos) rows to a temp file via QueryFile, run walk() over it, and
    remove the temp file. Progress goes to a timestamped log file and a
    walk_report.txt summary.
    """
    db = 'chip_comp'
    q = "SELECT DISTINCT chr FROM positions"
    conn = DBConnect(db)
    curs = conn.getCursor()
    try:
        curs.execute(q)
        chrs = curs.fetchall()
    except Exception as e:
        print("closing")
        conn.close()
        raise
    logfile = datetime.datetime.now().strftime("walk_%a_%d%b_%I%p.log")
    logging.basicConfig(filename=logfile, level=logging.INFO)
    start = time.time()
    count = 0
    # Drop the placeholder chromosome '0'.
    chrs = [st[0] for st in chrs if st[0] != '0']
    logging.info('chromwalk.py: found %s chromosomes to walk: %s', len(chrs),
                 ','.join(chrs))
    report = open('walk_report.txt', 'w')
    try:
        for chrm in chrs:  # [:1]: #remove index[] after testing
            report.write("Chromosome %s:\n" % (chrm))
            fname = "walk_d_" + chrm + ".txt"
            q = "SELECT id,pos FROM positions WHERE chr = %s ORDER BY build,pos ASC"  #LIMIT 220" # remove limit after testing
            #q = "SELECT id,pos FROM positions WHERE chr = %s AND id in ('exm27128','exm2250526','rs764252130','rs758750248','rs137990115') ORDER BY build,pos ASC" #LIMIT 220" # remove limit after testing
            vals = (chrm, )
            qf = QueryFile(fname, q, vals, db)
            count += qf.row_count
            logging.info('starting %s, file has %s lines', fname, qf.row_count)
            # NOTE(review): qfit is never used afterwards -- confirm that
            # itls() has no required side effect before deleting this line.
            qfit = itls(qf)
            walk(qf, curs, report)
            now = int(time.time() - start)
            logging.info('done %s, took %s seconds', fname, now)
            qf.remove()
    except Exception as e:
        statement = "error encountered during chromosome " + chrm + str(
            sys.exc_info()[0]) + str(e)
        logging.error(statement)
        raise
    finally:
        report.close()
        conn.close()
    now = int(time.time() - start)
    # Lazy %-style args (consistent with the other logging calls above);
    # the original eagerly formatted the message with `%`.
    logging.info('Finished all chromosomes after %s seconds (%s rows)', now,
                 count)
def main():
    """For build 37 in chip_comp, find (pos, chr) pairs claimed by more
    than one id, choose one entry per position and merge the duplicates
    into it. Progress is logged roughly every 5% of input rows.
    """
    db = 'chip_comp'
    build = '37'
    vals = (build, build)
    # for a build, find positions that have multiple entries. filter out positions used by ids that occur multiple times but with different positions.
    qsamepos = (
        "select pos,chr,count(id) from positions where build = %s group by pos,chr having count(id) > 1 and pos <> 0 and pos not in "
        "( "
        "select p1.pos from positions p1, positions p2 "
        "where "
        "p1.build = p2.build "
        "and "
        "p1.id = p2.id "
        "and "
        "p1.pos <> p2.pos "
        "and "
        "p1.pos <> 0 "
        "and "
        "p2.pos <> 0 "
        "and "
        "p1.build = %s "
        ") "
        "order by count(id) desc"  # limit 3" # REMOVE LIMIT
    )
    fname = 'samepos.txt'
    qf = QueryFile(fname, qsamepos, vals, db)
    rc = qf.row_count
    # Log a progress line roughly every 5% of rows.
    fvper = int(0.05 * rc)
    logline = fvper
    logfile = datetime.datetime.now().strftime("pmerge_%a_%d%b_%I%p.log")
    logging.basicConfig(filename=logfile, level=logging.INFO)
    logging.info(
        'run_posmerge.py: created %s with %s rows, using db %s, merging on build %s',
        fname, rc, db, build)
    start = time.time()
    count = 0
    conn = DBConnect(db)
    # dic=True presumably returns a dict-style cursor -- confirm in DBConnect.
    curs = conn.getCursor(dic=True)
    cursm = conn.getCursor()  # separate plain cursor used by the merge step
    for line in qf.read():
        try:
            posdups = getvars(line, curs, build)
            whichind = choose(posdups)  # index of the entry to keep
            whichone = posdups[whichind]
            # All other entries at this position become merge candidates.
            whichdups = [d for i, d in enumerate(posdups) if i != whichind]
            mergeids(whichone, whichdups, cursm, conn)
            count += 1
            if logline == count:
                now = int(time.time() - start)
                logging.info(
                    "approximately %.2f%% parsed after %s seconds, %s positions, line: %s"
                    % ((count / rc * 100), now, count, line))
                logline += fvper
        except Exception as e:
            # Roll back the partial merge, close and surface the error.
            conn.rollback()
            conn.close()
            statement = "error at merging step for line " + line + str(
                sys.exc_info()[0]) + str(e)
            logging.error(statement)
            raise
        #should prob add an else: commit
    now = int(time.time() - start)
    logging.info('Finished after %s seconds (%s rows)' % (now, rc))
    conn.close()
# debug: #readers = [AxiUKBBAffy2_1('/mnt/HPC/processed/mr875/tasks/dsp367/AxiUKBBAffy2_1_38_Eg.csv')] #readers = [InfCorEx24v1a1('/mnt/HPC/processed/Metadata/variant_annotation/CoreExomev1.0_annotation.csv'), # InfEx24v1a2('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumExome-24v1-0_A2.csv'), # InfCorEx24v1_1a1('/mnt/HPC/processed/Metadata/variant_annotation/CoreExomev1.1_annotation.csv'), # AxiUKBBAffy2_1('/mnt/HPC/processed/mr875/tasks/dsp367/Axiom_UKBBv2_1.na36.r1.a1.annot.csv'), # AxiUKBB_WCSG('/mnt/HPC/processed/Metadata/variant_annotation/Axiom_UKB_WCSG.na35.annot-2015.csv'), # InfImmun24v2('/mnt/HPC/processed/Metadata/variant_annotation/InfiniumImmunoArray_annotation.csv'), # InfImmun24v2grc38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumImmunoArray-24v2-0_A2.csv'), # InfCorEx24v1_1grc38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumCoreExome-24v1-1_A2.csv'), # InfOmniExpr('/mnt/HPC/processed/Metadata/variant_annotation/OmniExpress_annotation.csv'), # InfOmniExpr38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumOmniExpress-24v1-2_A2.csv'), # MSExome('/mnt/HPC/processed/Metadata/variant_annotation/MSExome_annotation.csv')] #readers = [Dil('/mnt/HPC/processed/Metadata/variant_annotation/DIL_annotation.csv')] readers = [UKBBv21_2021('ukbbv2_1_Annot_2021.csv')] ch = DBConnect("cc3") logfile = datetime.datetime.now().strftime("%a_%d%b_%I%p.log") logging.basicConfig(filename=logfile, level=logging.INFO) offsetclass = "" # "AxiUKBB_WCSG" #pick a source class from which you don't want to parse from the beginning offsetvariant = 249200 # variant 1 = the first line under the header. so if offsetvariant = 3 then the 3rd variant will be parsed for source in readers: if type(source).__name__ == offsetclass: readin(ch, source, offsetvariant) else: readin(ch, source) ch.close()
def main(argv):
    """Compare external flank sequences against db flanks for each variant.

    argv[0] (optional): path to an external flank file (<id>\t<flankseq>);
    argv[1] (optional): the literal string 'True' turns on editdb mode, in
    which matches/mismatches are written back to the cc4 database instead
    of only being reported to exfl_longer.txt / exfl_badmatch.

    NOTE(review): nesting below was reconstructed from whitespace-mangled
    source -- verify branch structure against the original file.
    """
    exfname = 'exflank/external_flanks.txt'  # add alternative path on command line
    db = 'cc4'
    editdb = False
    if len(argv) > 0:
        exfname = argv[0]
    try:
        listf = NormFile(exfname)
    except FileNotFoundError:
        print(
            "file %s does not exist. Provide an external flank file (<id>\\t<flankseq>)"
            % (exfname))
        raise
    if len(argv) > 1:
        if argv[1] == 'True':
            editdb = True
    print(editdb)
    conn = DBConnect(db)
    curs = conn.getCursor(dic=True)
    badmatchf = open("exfl_badmatch", "w")
    longerf = open("exfl_longer.txt", "w")
    if editdb:
        longerf.write('no entries because "editdb" switch is on\n')
    # Counters for the summary printed at the end.
    count_matchmult = 0
    count_allmatch = 0
    count_matchone = 0
    count_matchzero = 0
    for uid, nflnk in listf.readls(" "):
        try:
            allfl = get_flank(uid, curs)
        except:
            print("Unexpected error:", sys.exc_info()[0],
                  '\ninterrupted at uid ', uid)
            break
        # Compare the external flank both as given and reversed.
        match = compare_dbf(nflnk, allfl)
        revnflnk = rev(nflnk)
        match += compare_dbf(revnflnk, allfl)
        if match:
            matchfl = [allfl[ind] for ind in match
                       ]  # new list with matching dicts (so new indexes)
            localmatch = ResExf(matchfl).check_local(
                nflnk)  # additional check, potentially reducing matches
            matchfl = [matchfl[ind] for ind in localmatch]
            nomatchfl = findnomatch(matchfl, allfl)
            if len(localmatch) > 1:
                count_matchmult += 1
                # remove/keep only exist on this multi-match branch.
                remove, keep = multchoose(
                    matchfl
                )  #not used if not editdb but worth doing to catch problems
                if len(localmatch) == len(allfl):
                    count_allmatch += 1
                if not editdb:
                    # Record the longest 5' flank length for this uid.
                    fvplens = [
                        len(df['flank_seq'].split('[')[0]) for df in matchfl
                    ]
                    longerf.write('%s\t%s\n' % (uid, max(fvplens)))
                else:
                    fordel = [matchfl[ind] for ind in remove]
                    forkeep = [matchfl[ind] for ind in keep]  # expecting 1 entry
                    try:
                        ResExf.remove_red(curs, fordel)
                        ResExf.flag_chosen(curs, forkeep)
                        conn.commit()
                    except:
                        print(
                            "Unexpected error while editing db: removing dups and flagging chosen",
                            sys.exc_info()[0], '\ninterrupted at uid ', uid)
                        break
            else:
                count_matchone += 1
                if editdb:
                    # Skip variants whose flank was already flagged chosen.
                    if not alr_chose(matchfl):
                        try:
                            ResExf.flag_chosen(curs, matchfl)
                            conn.commit()
                        except:
                            print(
                                "Unexpected error while editing db (flagging chosen flank):",
                                sys.exc_info()[0], '\ninterrupted at uid ', uid)
                            break
            log_badmatch(badmatchf, uid, nflnk, nomatchfl)
        else:
            # No db flank matched the external sequence at all.
            count_matchzero += 1
            log_badmatch(badmatchf, uid, nflnk, allfl)
            if editdb:
                try:
                    ResExf.add_ext(curs, uid, nflnk)
                    conn.commit()
                except:
                    print(
                        "Unexpected error while editing db (adding ext flank seq):",
                        sys.exc_info()[0], '\ninterrupted at uid ', uid)
                    break
    print(
        '%s variants matched 1 of their flank sequences\n%s variants matched multiple of their external flanks (%s of these match ALL of their flanks(so no mismatches at all))\n%s variants do not have any db flanks matching the external flank sequence'
        % (count_matchone, count_matchmult, count_allmatch, count_matchzero))
    conn.close()
    longerf.close()
    badmatchf.close()