Beispiel #1
0
def main(argv):
    """Log a dbSNP rsid for each row of a 3-column join file into cc4.

    argv[0] (optional): join-file path (chr:pos old_id rsid); a default path
    is used when absent.  argv[1] (optional): integer dup flag passed to
    VariantRS.log_dbsnpid().  Commits after every row, prints progress
    roughly every 1%, and always closes the connection via finally (the
    original only closed it on the normal-exit and caught-error paths).
    """
    fname = 'exsnp/nors_j_dbsnp.txt'
    dup = 0
    if len(argv) > 0:
        fname = argv[0]
    else:
        print(
            "no file provided, instead using default file %s to act as 3 column join file (chr:pos old_id rsid)"
            % (fname))
    if len(argv) > 1:
        dup = int(argv[1])
    joinfile = NormFile(fname)
    linecount = joinfile.row_count
    # Progress step: rows per ~1%.  0 for files under 100 rows, which the
    # `step and` guard below uses to suppress a spurious "0.0 percent done".
    step = linecount // 100
    nextmark = step
    db = DBConnect("cc4")
    try:
        curs = db.getCursor()
        for num, cols in enumerate(joinfile.readls(" ")):
            try:
                var_rs = VariantRS(curs, cols[1], cols[2])
                var_rs.log_dbsnpid(dup)
                db.commit()
                if step and num == nextmark:
                    print('%s percent done' % ((nextmark / linecount) * 100))
                    nextmark += step
            except Exception:
                # Narrowed from a bare except; report the failing row, then
                # re-raise (the finally below still closes the connection).
                print("Unexpected error with %s: %s" % (cols, sys.exc_info()[0]))
                raise
    finally:
        db.close()
Beispiel #2
0
 def _makeF(self):
     """Run self.qry with self.vals against self.db and dump every result
     row, tab-separated, to self.bfile, incrementing self.row_count per row.

     Errors are reported best-effort rather than raised (matching the
     original contract).  Bug fix: resources are now closed only if they
     were actually created — the original `finally` referenced f/curs/conn
     unconditionally, so a failed open()/DBConnect() raised
     UnboundLocalError and masked the real error.
     """
     f = None
     conn = None
     curs = None
     try:
         f = open(self.bfile, "w")
         conn = DBConnect(self.db)
         curs = conn.getCursor()
         curs.execute(self.qry, self.vals)
         for row in curs:
             self.row_count += 1
             row = [str(i) for i in row]
             f.write("\t".join(row))
             f.write("\n")
     except Exception as e:
         # Best-effort: report and swallow, as the original did.
         print("error connecting/executing query/writing to file ",sys.exc_info()[0], e)
     finally:
         # Close only what was actually created.
         if curs is not None:
             curs.close()
         if conn is not None:
             conn.close()
         if f is not None:
             f.close()
Beispiel #3
0
def main(argv):
    """Merge rs ids from an exsnp.py export into the cc3 database.

    Each input line holds: rsid chr:coord existing_alt_id.  Lines with a
    numeric chromosome go through buildrsmerge(); anything else (X/Y/XY)
    goes through xyrsmerge().  Commits per line; on any failure the
    connection is closed and the exception propagates.
    """
    if not argv:
        print(
            "provide file exported from exsnp.py output, with possible build 37 merges and XY/X/Y possible rs id alternatives. Format: rsid chr:coord existing_alt_id. Sometimes called j_already.txt"
        )
        return
    infile = NormFile(argv[0])
    conn = DBConnect("cc3")
    curs = conn.getCursor()
    for rsid, refpos, altid in infile.readls(dlim=" "):
        try:
            chrom = refpos.split(':')[0]
            if chrom.isnumeric():
                buildrsmerge(curs, rsid, refpos, altid)
            else:
                xyrsmerge(curs, rsid, refpos, altid)
            conn.commit()
        except BaseException:
            # Close the connection before propagating any failure at all.
            conn.close()
            raise
    conn.close()
Beispiel #4
0
def main():
    """Walk every chromosome found in the positions table, logging timing
    per chromosome and writing results to walk_report.txt."""
    db = 'chip_comp'
    conn = DBConnect(db)
    curs = conn.getCursor()
    try:
        curs.execute("SELECT DISTINCT chr FROM positions")
        chrs = curs.fetchall()
    except Exception:
        print("closing")
        conn.close()
        raise
    logfile = datetime.datetime.now().strftime("walk_%a_%d%b_%I%p.log")
    logging.basicConfig(filename=logfile, level=logging.INFO)
    start = time.time()
    count = 0
    # Drop the placeholder '0' chromosome before walking.
    chrs = [row[0] for row in chrs if row[0] != '0']
    logging.info('chromwalk.py: found %s chromosomes to walk: %s', len(chrs),
                 ','.join(chrs))
    report = open('walk_report.txt', 'w')
    try:
        for chrom in chrs:
            report.write("Chromosome %s:\n" % (chrom))
            fname = "walk_d_" + chrom + ".txt"
            posquery = "SELECT id,pos FROM positions WHERE chr = %s ORDER BY build,pos ASC"
            qf = QueryFile(fname, posquery, (chrom, ), db)
            count += qf.row_count
            logging.info('starting %s, file has %s lines', fname, qf.row_count)
            qfit = itls(qf)  # kept for parity with original flow; result unused
            walk(qf, curs, report)
            elapsed = int(time.time() - start)
            logging.info('done %s, took %s seconds', fname, elapsed)
            qf.remove()
    except Exception as e:
        logging.error("error encountered during chromosome " + chrom +
                      str(sys.exc_info()[0]) + str(e))
        raise
    finally:
        report.close()
        conn.close()
    elapsed = int(time.time() - start)
    logging.info('Finished all chromosomes after %s seconds (%s rows)' %
                 (elapsed, count))
Beispiel #5
0
def main():
    """Merge variant ids that share a position on one build of chip_comp.

    Dumps duplicated positions to samepos.txt, then for each such position
    picks one variant to keep and merges the rest into it, logging progress
    roughly every 5% of rows.  Rolls back, closes, logs, and re-raises on
    any per-line failure.
    """
    db = 'chip_comp'
    build = '37'
    # for a build, find positions that have multiple entries. filter out positions used by ids that occur multiple times but with different positions.
    qsamepos = (
        "select pos,chr,count(id) from positions where build = %s group by pos,chr having count(id) > 1 and pos <> 0 and pos not in "
        "( "
        "select p1.pos from positions p1, positions p2  "
        "where "
        "p1.build = p2.build "
        "and "
        "p1.id = p2.id "
        "and "
        "p1.pos <> p2.pos "
        "and "
        "p1.pos <> 0 "
        "and "
        "p2.pos <> 0 "
        "and  "
        "p1.build = %s "
        ") "
        "order by count(id) desc"
    )
    fname = 'samepos.txt'
    qf = QueryFile(fname, qsamepos, (build, build), db)
    rc = qf.row_count
    step = int(0.05 * rc)  # log roughly every 5% of rows
    next_log = step
    logfile = datetime.datetime.now().strftime("pmerge_%a_%d%b_%I%p.log")
    logging.basicConfig(filename=logfile, level=logging.INFO)
    logging.info(
        'run_posmerge.py: created %s with %s rows, using db %s, merging on build %s',
        fname, rc, db, build)
    start = time.time()
    count = 0
    conn = DBConnect(db)
    curs = conn.getCursor(dic=True)
    cursm = conn.getCursor()
    for line in qf.read():
        try:
            posdups = getvars(line, curs, build)
            keep_ind = choose(posdups)
            keeper = posdups[keep_ind]
            # Everything except the chosen entry gets merged away.
            dups = posdups[:keep_ind] + posdups[keep_ind + 1:]
            mergeids(keeper, dups, cursm, conn)
            count += 1
            if next_log == count:
                elapsed = int(time.time() - start)
                logging.info(
                    "approximately %.2f%% parsed after %s seconds, %s positions, line: %s"
                    % ((count / rc * 100), elapsed, count, line))
                next_log += step
        except Exception as e:
            conn.rollback()
            conn.close()
            logging.error("error at merging step for line " + line +
                          str(sys.exc_info()[0]) + str(e))
            raise
        #should prob add an else: commit
    elapsed = int(time.time() - start)
    logging.info('Finished after %s seconds (%s rows)' % (elapsed, rc))
    conn.close()
Beispiel #6
0
# debug:
#readers = [AxiUKBBAffy2_1('/mnt/HPC/processed/mr875/tasks/dsp367/AxiUKBBAffy2_1_38_Eg.csv')]

#readers = [InfCorEx24v1a1('/mnt/HPC/processed/Metadata/variant_annotation/CoreExomev1.0_annotation.csv'),
#        InfEx24v1a2('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumExome-24v1-0_A2.csv'),
#        InfCorEx24v1_1a1('/mnt/HPC/processed/Metadata/variant_annotation/CoreExomev1.1_annotation.csv'),
#        AxiUKBBAffy2_1('/mnt/HPC/processed/mr875/tasks/dsp367/Axiom_UKBBv2_1.na36.r1.a1.annot.csv'),
#        AxiUKBB_WCSG('/mnt/HPC/processed/Metadata/variant_annotation/Axiom_UKB_WCSG.na35.annot-2015.csv'),
#        InfImmun24v2('/mnt/HPC/processed/Metadata/variant_annotation/InfiniumImmunoArray_annotation.csv'),
#        InfImmun24v2grc38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumImmunoArray-24v2-0_A2.csv'),
#        InfCorEx24v1_1grc38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumCoreExome-24v1-1_A2.csv'),
#        InfOmniExpr('/mnt/HPC/processed/Metadata/variant_annotation/OmniExpress_annotation.csv'),
#        InfOmniExpr38('/mnt/HPC/processed/Metadata/variant_annotation_grch38/InfiniumOmniExpress-24v1-2_A2.csv'),
#        MSExome('/mnt/HPC/processed/Metadata/variant_annotation/MSExome_annotation.csv')]

#readers = [Dil('/mnt/HPC/processed/Metadata/variant_annotation/DIL_annotation.csv')]

# Active annotation sources to load; each entry wraps one vendor CSV.
# NOTE(review): reader classes, DBConnect and readin are defined elsewhere.
readers = [UKBBv21_2021('ukbbv2_1_Annot_2021.csv')]

ch = DBConnect("cc3")  # db handle passed to every readin() call below
logfile = datetime.datetime.now().strftime("%a_%d%b_%I%p.log")  # e.g. Mon_01Jan_09AM.log
logging.basicConfig(filename=logfile, level=logging.INFO)
offsetclass = ""  # "AxiUKBB_WCSG"  #pick a source class from which you don't want to parse from the beginning
offsetvariant = 249200  # variant 1 = the first line under the header. so if offsetvariant = 3 then the 3rd variant will be parsed
# Parse each source: the named offset class resumes at offsetvariant,
# every other source is parsed from its beginning.
for source in readers:
    if type(source).__name__ == offsetclass:
        readin(ch, source, offsetvariant)
    else:
        readin(ch, source)
ch.close()
Beispiel #7
0
def main(argv):
    """Check externally supplied flank sequences against db-stored flanks.

    argv[0] (optional): path to the external flank file (the error message
    describes it as <id>\\t<flankseq>); defaults to
    'exflank/external_flanks.txt'.  argv[1] (optional): the literal string
    'True' enables db edits (removing duplicate flanks, flagging the chosen
    one, adding unmatched external flanks).

    Writes unmatched db flanks per id to 'exfl_badmatch', chosen flank
    lengths to 'exfl_longer.txt' (only when not editing), and prints
    summary counts at the end.
    """
    exfname = 'exflank/external_flanks.txt'  # add alternative path on command line
    db = 'cc4'
    editdb = False
    if len(argv) > 0:
        exfname = argv[0]
    try:
        listf = NormFile(exfname)
    except FileNotFoundError:
        print(
            "file %s does not exist. Provide an external flank file (<id>\\t<flankseq>)"
            % (exfname))
        raise
    if len(argv) > 1:
        # Only the exact string 'True' turns on db editing.
        if argv[1] == 'True':
            editdb = True
    print(editdb)
    conn = DBConnect(db)
    curs = conn.getCursor(dic=True)
    badmatchf = open("exfl_badmatch", "w")
    longerf = open("exfl_longer.txt", "w")
    if editdb:
        longerf.write('no entries because "editdb" switch is on\n')
    # Per-run tallies, reported in the summary print at the end.
    count_matchmult = 0
    count_allmatch = 0
    count_matchone = 0
    count_matchzero = 0
    for uid, nflnk in listf.readls(" "):
        try:
            allfl = get_flank(uid, curs)
        # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit.
        except:
            print("Unexpected error:",
                  sys.exc_info()[0], '\ninterrupted at uid ', uid)
            break
        # Compare the external flank and its reverse against all db flanks;
        # match accumulates the indexes of db flanks that agree.
        match = compare_dbf(nflnk, allfl)
        revnflnk = rev(nflnk)
        match += compare_dbf(revnflnk, allfl)
        if match:
            matchfl = [allfl[ind] for ind in match
                       ]  # new list with matching dicts (so new indexes)
            localmatch = ResExf(matchfl).check_local(
                nflnk)  # additional check, potentially reducing matches
            matchfl = [matchfl[ind] for ind in localmatch]
            nomatchfl = findnomatch(matchfl, allfl)
            if len(localmatch) > 1:
                # Multiple db flanks survived both checks: pick one to keep.
                count_matchmult += 1
                remove, keep = multchoose(
                    matchfl
                )  #not used if not editdb but worth doing to catch problems
                if len(localmatch) == len(allfl):
                    count_allmatch += 1
                if not editdb:
                    # Record the longest pre-'[' flank-sequence length seen.
                    fvplens = [
                        len(df['flank_seq'].split('[')[0]) for df in matchfl
                    ]
                    longerf.write('%s\t%s\n' % (uid, max(fvplens)))
                else:
                    fordel = [matchfl[ind] for ind in remove]
                    forkeep = [matchfl[ind]
                               for ind in keep]  # expecting 1 entry
                    try:
                        ResExf.remove_red(curs, fordel)
                        ResExf.flag_chosen(curs, forkeep)
                        conn.commit()
                    except:
                        print(
                            "Unexpected error while editing db: removing dups and flagging chosen",
                            sys.exc_info()[0], '\ninterrupted at uid ', uid)
                        break
            else:
                # Exactly one db flank matched.
                count_matchone += 1
                if editdb:
                    # Skip flanks already flagged as chosen in a prior run.
                    if not alr_chose(matchfl):
                        try:
                            ResExf.flag_chosen(curs, matchfl)
                            conn.commit()
                        except:
                            print(
                                "Unexpected error while editing db (flagging chosen flank):",
                                sys.exc_info()[0], '\ninterrupted at uid ',
                                uid)
                            break
            # Log the db flanks that did NOT match this external flank.
            log_badmatch(badmatchf, uid, nflnk, nomatchfl)
        else:
            # No db flank matched at all; optionally add the external one.
            count_matchzero += 1
            log_badmatch(badmatchf, uid, nflnk, allfl)
            if editdb:
                try:
                    ResExf.add_ext(curs, uid, nflnk)
                    conn.commit()
                except:
                    print(
                        "Unexpected error while editing db (adding ext flank seq):",
                        sys.exc_info()[0], '\ninterrupted at uid ', uid)
                    break
    print(
        '%s variants matched 1 of their flank sequences\n%s variants matched multiple of their external flanks (%s of these match ALL of their flanks(so no mismatches at all))\n%s variants do not have any db flanks matching the external flank sequence'
        % (count_matchone, count_matchmult, count_allmatch, count_matchzero))
    conn.close()
    longerf.close()
    badmatchf.close()