def processSAMfile( sam_fn, numthreads, numlines ):
    """
    Parse a SAM file with a pool of worker processes and merge the
    per-reference statistics returned for every chunk.

    ARGVS:
       sam_fn      STR   SAM file; it is split into chunks by chunkify()
       numthreads  INT   number of processes in the pool
       numlines    INT   not used by this function (kept for compatibility)
    RETURN:
       result        DICT  reference -> dict with the keys
                           "ML" (merged mapped-region bit mask),
                           "MB", "MR", "NM" (summed over chunks) and
                           "LL" (number of set bits in "ML")
       mapped_reads  INT   sum of "MR" over the references that are kept
    """
    result = gt._autoVivification()
    mapped_reads = 0

    #clean memory
    gc.collect()

    print_message( "Parsing SAM files with %s subprocesses..."%numthreads, argvs.silent, begin_t, logfile )

    pool = Pool(processes=numthreads)
    results = []

    try:
        jobs = [ pool.apply_async(worker, (sam_fn,chunkStart,chunkSize))
                 for chunkStart,chunkSize in chunkify(sam_fn) ]

        #wait for all jobs to finish
        tol_jobs = len(jobs)
        for cnt, job in enumerate(jobs, 1):
            results.append( job.get() )
            if argvs.debug: print_message( "[DEBUG] Progress: %s/%s (%.1f%%) chunks done."%(cnt, tol_jobs, cnt/tol_jobs*100), argvs.silent, begin_t, logfile )
    except BaseException:
        # a worker failed (or we got interrupted): do not leave the
        # remaining workers running in the background
        pool.terminate()
        raise
    else:
        #clean up
        pool.close()
    finally:
        # wait for the worker processes to exit so none is left behind
        pool.join()

    print_message( "Merging results...", argvs.silent, begin_t, logfile )
    for res in results:
        for k in res:
            if k in result:
                # the same reference was seen in another chunk: union the
                # region masks and add up the counters
                result[k]["ML"] = result[k]["ML"] | res[k]["ML"]
                result[k]["MB"] += res[k]["MB"]
                result[k]["MR"] += res[k]["MR"]
                result[k]["NM"] += res[k]["NM"]
            else:
                result[k] = dict(res[k])

    # convert mapped regions to linear length
    for k in list(result):
        if not result[k]["MR"]:
            # references without any mapped read are dropped
            del result[k]
            continue

        mapped_reads += result[k]["MR"]
        # the linear length is the number of set bits in the mask; this is
        # what summing the lengths of all runs of '1' amounts to
        result[k]["LL"] = bin(result[k]["ML"]).count("1")

    return result, mapped_reads
def processASR(c, tid2lineage, asr, rc, al, asr_dir, target_tid, tag, flag_red_str, flag_ind_iso=0):
    """
    Parse an assembly_summary_refseq.txt file, keep the assemblies that pass
    all filters, and download their sequence file when it is not found in
    the local directory.

    ARGVS:
       c            OBJ   sqlite3 connection obj
       tid2lineage  DICT  strain taxid -> lineage; updated in place
       asr          STR   asr file
       rc           LIST  refseq category
       al           LIST  assembly level
       asr_dir      STR   asr local directory
       target_tid   LIST  target taxid
       tag          STR   tag; when empty, the first letter of the
                          superkingdom name of each assembly is used
       flag_red_str BOOL  flag for allowing redundant strains
       flag_ind_iso BOOL  flag for including isolate in strain name
    RETURN:
       asr_info     DICT  assembly_accession -> info of the qualified assembly
       tid2lineage  DICT  same object as the argument
       cnt_q        INT   number of qualified assemblies
       cnt_tol      INT   number of assemblies (non-comment lines) read
    """

    # init vars
    asr_info = t._autoVivification()
    cnt_q = 0  #qualified assembly
    cnt_tol = 0  #total genomes

    # parsing assembly_summary_refseq.txt file
    with open(asr) as f:
        for line in f:
            if line.startswith('#'):
                continue

            cnt_tol += 1

            #split each line in assembly_summary_refseq.txt:
            #  0- 4  assembly_accession    bioproject    biosample    wgs_master    refseq_category
            #  5- 9  taxid    species_taxid    organism_name    infraspecific_name    isolate
            # 10-14  version_status    assembly_level    release_type    genome_rep    seq_rel_date
            # 15-19  asm_name    submitter    gbrs_paired_asm    paired_asm_comp    ftp_path
            # 20-21  excluded_from_refseq    relation_to_type_material
            tmp = line.strip('\n').split('\t')
            accession = tmp[0]
            ftp_path = tmp[19]
            strain_name = tmp[8].replace("strain=", "")
            isolate = tmp[9]

            sys.stderr.write("[INFO] Processing: %s..." % accession)

            # try to get taxonomy lineage
            try:
                lineage = t.taxid2lineageDICT(tmp[5], 1, 1)
            except Exception:
                # only real errors are treated as "taxid not found";
                # KeyboardInterrupt / SystemExit must still stop the script
                sys.stderr.write("skipped. Removed TaxID (%s) found for %s.\n" % (tmp[5], accession))
                continue

            # SKIPPING following records
            # 1) not specified refseq_category, example: reference
            if rc and not any(cate.lower() in tmp[4].lower() for cate in rc):
                sys.stderr.write("skipped. Not belongs to specific refseq_category %s.\n" % rc)
                continue

            # 2) not specified assembly_level, example: complete
            if al and not any(level.lower() in tmp[11].lower() for level in al):
                sys.stderr.write("skipped. Not belongs to specific assembly_level %s.\n" % al)
                continue

            # 3) assemblies that marked "excluded_from_refseq"
            # NOTE(review): any non-empty value counts as excluded; confirm
            # the input never uses a placeholder such as "na" in this column
            if tmp[20]:
                sys.stderr.write("skipped. Marked as excluded_from_refseq.\n")
                continue

            # 4) not belongs to specified tax id
            if tmp[5] and target_tid:
                flag_tid_in_lineage = any(
                    lineage[t_rank]['taxid'] == t_tid
                    for t_tid in target_tid
                    for t_rank in lineage
                )
                if not flag_tid_in_lineage:
                    sys.stderr.write("skipped. Not belongs to specified tid.\n")
                    continue

            # 5) sequence location is not available
            if ftp_path == "na":
                sys.stderr.write("skipped. No URL provided.\n")
                continue

            # remote locations may be given as ftp:// or as http(s):// URLs
            if ftp_path.startswith(('ftp', 'http')):
                # real filename of assembly
                filename = ftp_path.split('/')[-1] + "_genomic.fna.gz"
                # directory structure is retained to keep local filesystem healthy
                local_path = asr_dir + ftp_path.split('genomes')[1]
            else:
                filename = ftp_path.split('/')[-1]
                local_path = "/".join(ftp_path.split('/')[:-1])

            # 6) not an unique strain
            tid = tmp[5]
            rank = t.taxid2rank(tid)
            name = t.taxid2name(tid)

            if rank != "strain" and rank != "unknown":
                # generate custom strain if not exists and add strains to database
                (cus_str_taxid, cus_str_name, str_name, iso_name) = generateCustomStrain(
                    tid, name, '', c, strain_name, isolate, flag_ind_iso)
                #add custom strain to lineage
                lineage["strain"]['taxid'] = cus_str_taxid
                lineage["strain"]['name'] = cus_str_name

            flag_new_strain = checkNewTaxid(
                c, lineage["strain"]['taxid'], "", lineage["strain"]['name'],
                strain_name, isolate, accession, filename)

            if not flag_new_strain and not flag_red_str:
                sys.stderr.write("skipped. Not an unique strain.\n")
                continue

            sys.stderr.write("qualified.\n")

            # download sequence files if not available at local directory
            local_file = local_path + "/" + filename
            if not os.path.isfile(local_file):
                url = ftp_path + "/" + filename
                wget(url, local_path)
            else:
                sys.stderr.write("[INFO] Found local file: %s.\n" % local_file)

            # tag: the default is worked out per assembly; assigning it back
            # to `tag` would make the first assembly's superkingdom stick to
            # all the following ones
            cate_tag = tag if tag else lineage["superkingdom"]['name'][0]

            # use assembly_accession as key
            cnt_q += 1
            tid = lineage["strain"]['taxid']
            asr_info[accession]['taxid'] = tid
            asr_info[accession]['full_str_name'] = lineage["strain"]['name']
            asr_info[accession]['ftp_path'] = ftp_path
            asr_info[accession]['local_path'] = local_path
            asr_info[accession]['filename'] = filename
            # NOTE(review): any non-empty value is regarded as type material
            asr_info[accession]['type_material'] = bool(tmp[21])
            asr_info[accession]['cate_tag'] = cate_tag

            tid2lineage[tid] = lineage

    return asr_info, tid2lineage, cnt_q, cnt_tol
def wget(url, lpath="./"):
    """
    Download `url` into the directory `lpath` with the external wget program.

    The directory is created first (mkdir -p). Both commands are run from an
    argument list, without a shell, so that spaces or shell metacharacters
    in the URL or in the path are passed through literally.

    ARGVS:
       url    STR  location of the file to download
       lpath  STR  local directory the file is saved to
    RAISES:
       subprocess.CalledProcessError  a command returned a non-zero status
       FileNotFoundError              mkdir or wget is not installed
    """
    subprocess.run(["mkdir", "-p", "--", lpath], check=True)
    subprocess.run(["wget", "-P", lpath, "--", url], check=True)


if __name__ == '__main__':
    argvs = parse_params()
    # counters and buffers used by the script
    cnt = 0
    cnt_fa = 0  # accepted genomes
    cnt_fa_tol = 0  # total genomes
    cnt_seq = 0
    cnt_seq_tol = 0
    tid2lineage = t._autoVivification()
    output = sys.stdout
    output_gdbl_buffer = {}

    # loading taxonomy
    sys.stderr.write("Loading taxonomy...")
    t.loadTaxonomy(argvs.dbPath)
    sys.stderr.write("completed.\n")

    # init the taxonomy sqlite3 db file
    conn = sqlite3.connect(argvs.sqlitedb)
    # isolation_level None puts the sqlite3 connection in autocommit mode
    conn.isolation_level = None
    c = conn.cursor()
    initTaxaDB(c)

    # create output file
def outputResultsAsRanks( res_rollup, o, tg_rank, mode, mc, mr, ml, mh ):
    """
    Write the rolled-up results as a tab-delimited table, one row per taxon,
    grouped by major rank and sorted by abundance within each rank.

    ARGVS:
       res_rollup  DICT  taxid -> rolled-up statistics
       o           OBJ   writable file-like object
       tg_rank     STR   target rank; lower ranks are not reported in
                         "summary" mode and are annotated otherwise
       mode        STR   "summary", "full" (adds extra columns), or other
       mc          NUM   minimal linear coverage (strain rows only)
       mr          NUM   minimal number of reads
       ml          NUM   minimal linear length
       mh          NUM   minimal linear length per read (MLRL)

    Note: the module-level `db_stats` is read for the strain coverage check.
    """
    rank_order = {"superkingdom":1,"phylum":2,"class":3,"order":4,"family":5,"genus":6,"species":7,"strain":8}
    tids_by_rank = gt._autoVivification()
    is_full = mode == "full"
    is_summary = mode == "summary"

    # init total abundance
    totals = {
        "ROLLUP_DOC": 0,
        "LINEAR_DOC": 0,
        "READ_COUNT": 0,
        "TOTAL_BP_MAPPED": 0,
        "ABU": 0,
    }

    # totals are taken from the superkingdom rows; taxa are grouped by rank
    for tid in res_rollup:
        rank = gt.taxid2rank(tid)

        if rank == "superkingdom":
            totals["ROLLUP_DOC"] += res_rollup[tid]["RD"]
            totals["READ_COUNT"] += res_rollup[tid]["MR"]
            totals["TOTAL_BP_MAPPED"] += res_rollup[tid]["MB"]
            totals["ABU"] += res_rollup[tid]["ABU"]

        if rank in rank_order:
            if rank not in tids_by_rank:
                tids_by_rank[rank] = []
            tids_by_rank[rank].append(tid)

    # header: essential fields, followed by the extra fields in full mode
    columns = [
        "LEVEL", "NAME", "TAXID", "READ_COUNT", "TOTAL_BP_MAPPED",
        "TOTAL_BP_MISMATCH", "LINEAR_LENGTH", "LINEAR_DOC", "ROLLUP_DOC",
        "REL_ABUNDANCE",
    ]
    if is_full:
        columns += [
            "LINEAR_COV", "LINEAR_COV_MAPPED_SIG", "BEST_LINEAR_COV", "DOC",
            "BEST_DOC", "MAPPED_SIG_LENGTH", "TOL_SIG_LENGTH", "ABUNDANCE",
            "REL_ABU_ROLLUP_DOC", "REL_ABU_READ_COUNT",
            "REL_ABU_TOL_BP_MAPPED", "MLRL", "NOTE",
        ]
    o.write( "\t".join(columns) + "\n" )

    for rank in sorted( rank_order, key=rank_order.get ):
        below_target = rank_order[rank] > rank_order[tg_rank]
        if below_target and is_summary:
            break

        by_abundance = sorted( tids_by_rank[rank], key=lambda x: res_rollup[x]["ABU"], reverse=True )
        for tid in by_abundance:
            rec = res_rollup[tid]

            # collect the reasons why this row would be filtered out
            reasons = []
            if rank == "strain" and tid in db_stats and mc > rec["LL"]/db_stats[tid]:
                reasons.append( "Filtered out (minCov > %.2f); "%(rec["LL"]/db_stats[tid]) )
            if mr > int(rec["MR"]):
                reasons.append( "Filtered out (minReads > %s); "%rec["MR"] )
            if ml > int(rec["LL"]):
                reasons.append( "Filtered out (minLen > %s); "%rec["LL"] )
            if mh > (rec["LL"]/rec["MR"]):
                reasons.append( "Filtered out (minMLRL > %.2f); "%(rec["LL"]/rec["MR"]) )
            if below_target:
                reasons.append( "Not shown (%s-result biased); "%rank )
            note = "".join(reasons)

            # annotated rows are left out of the summary
            if note and is_summary:
                continue

            # additional fields for full mode
            extra = ""
            if is_full:
                rel_rollup_doc = rec["RD"]/totals["ROLLUP_DOC"] if totals["ROLLUP_DOC"] else 0
                rel_read_count = rec["MR"]/totals["READ_COUNT"] if totals["READ_COUNT"] else 0
                rel_bp_mapped = rec["MB"]/totals["TOTAL_BP_MAPPED"] if totals["TOTAL_BP_MAPPED"] else 0
                extra = "\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%s\t%s\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\t%s" % (
                    rec["LL"]/rec["TS"],   # LINEAR_COV
                    rec["LL"]/rec["SL"],   # LINEAR_COV_MAPPED_SIG
                    rec["bLC"],            # BEST_LINEAR_COV
                    rec["MB"]/rec["TS"],   # DOC
                    rec["bDOC"],           # BEST_DOC
                    rec["SL"],             # MAPPED_SIG_LENGTH
                    rec["TS"],             # TOL_SIG_LENGTH
                    rec["ABU"],            # ABUNDANCE
                    rel_rollup_doc,        # REL_ABU_ROLLUP_DOC
                    rel_read_count,        # REL_ABU_READ_COUNT
                    rel_bp_mapped,         # REL_ABU_TOL_BP_MAPPED
                    rec["LL"]/rec["MR"],   # MLRL
                    note,                  # NOTE
                )

            #relative abundance
            rel_abu = rec["ABU"]/totals["ABU"] if totals["ABU"] else 0
            o.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.4f\t%.4f\t%.4f%s\n" % (
                rank,
                gt.taxid2name(tid),
                tid,
                rec["MR"],
                rec["MB"],
                rec["NM"],
                rec["LL"],
                rec["MB"]/rec["LL"],
                rec["RD"],
                rel_abu,
                extra,
            ) )
def taxonomyRollUp( r, db_stats, relAbu, mc, mr, ml, mh ):
    """
    Take parsed SAM output and rollup to superkingdoms

    ARGVS:
       r         DICT  "acc|start|stop|strain_taxid" -> mapping statistics
       db_stats  DICT  strain taxid -> total signature length of the strain
       relAbu    STR   field used as abundance: "LINEAR_LENGTH",
                       "TOTAL_BP_MAPPED", "READ_COUNT", "LINEAR_DOC";
                       any other value selects the rollup DOC
       mc        NUM   minimal linear coverage of a strain
       mr        NUM   minimal number of reads of a strain
       ml        NUM   minimal linear length of a strain
       mh        NUM   minimal linear length per read of a strain
    RETURN:
       res_rollup  DICT  taxid -> statistics (strains and major ranks)
       res_tree    DICT  parent taxid -> {child taxid: 1}

    Keys of the statistics:
       MB: # of mapped bases          MR: # of mapped reads
       NM: # of mismatches            LL: linear length
       SL: length of this signature fragments (mapped)
       TS: length of total signature fragments for a strain (mapped + unmapped)
       RD: rollup depth of coverage
       bDOC: best Depth of Coverage of a strain
       bLC: best linear coverage of a strain
    """
    res_rollup = gt._autoVivification()
    res_tree = gt._autoVivification()
    major_ranks = {"superkingdom":1,"phylum":2,"class":3,"order":4,"family":5,"genus":6,"species":7}

    per_ref_keys = ("MB", "MR", "NM", "LL")
    additive_keys = ("MB", "MR", "NM", "LL", "SL", "TS", "RD")
    best_keys = ("bDOC", "bLC")

    # rollup to strain first
    for ref in r:
        _acc, start, stop, stid = ref.split('|')
        hit = r[ref]
        frag_len = int(stop) - int(start) + 1

        if stid in res_rollup:
            strain = res_rollup[stid]
            for key in per_ref_keys:
                strain[key] += hit[key]
            strain["SL"] += frag_len
        else:
            strain = res_rollup[stid]
            for key in per_ref_keys:
                strain[key] = hit[key]
            strain["SL"] = frag_len
            strain["TS"] = db_stats[stid]

    # get all strain tax id
    allStrTaxid = list(res_rollup)

    # Calculating DOC, LC and CC for strains
    # These calculations need to be done before rollup step because
    # it's possible that a strain's parent is a strain (no rank) as well
    for stid in allStrTaxid:
        strain = res_rollup[stid]
        total_sig = db_stats[stid]
        strain["bDOC"] = strain["MB"]/total_sig
        strain["bLC"] = strain["LL"]/total_sig
        strain["RD"] = strain["MB"]/total_sig

    # roll strain results to upper levels
    for stid in allStrTaxid:
        strain = res_rollup[stid]

        # apply cutoffs strain level and rollup to higher levels
        below_cutoff = (
            mc > strain["LL"]/db_stats[stid]
            or mr > strain["MR"]
            or ml > strain["LL"]
            or mh > strain["LL"]/strain["MR"]
        )
        if below_cutoff:
            continue

        for pid, tid in gt.taxid2fullLinkDict( stid ).items():
            res_tree[pid][tid] = 1

            # skip strain id, rollup only; and only to the major ranks
            if tid == stid or gt.taxid2rank(tid) not in major_ranks:
                continue

            if tid in res_rollup:
                node = res_rollup[tid]
                for key in additive_keys:
                    node[key] += strain[key]
                # keep the best value seen among the strains
                for key in best_keys:
                    if strain[key] > node[key]:
                        node[key] = strain[key]
            else:
                node = res_rollup[tid]
                for key in additive_keys + best_keys:
                    node[key] = strain[key]

    #add abundance to res_rollup
    direct_field = {"LINEAR_LENGTH": "LL", "TOTAL_BP_MAPPED": "MB", "READ_COUNT": "MR"}
    for tid in res_rollup:
        node = res_rollup[tid]
        if relAbu in direct_field:
            node["ABU"] = node[direct_field[relAbu]]
        elif relAbu == "LINEAR_DOC":
            node["ABU"] = node["MB"]/node["LL"]
        else:
            node["ABU"] = node["RD"]

    return res_rollup, res_tree
res_rollup[rank][tid]["ASGN"], tid, float(res_rollup[rank][tid]["ROLL"]) / float(tol_read_count)))

# Script entry: load the taxonomy, process the read classifications and
# write the report.
if __name__ == '__main__':
    argvs = parse_params(__version__)

    #load taxonomy
    sys.stderr.write("[INFO] Loading taxonomy...\n")
    t.loadTaxonomy(argvs.taxaPath)
    sys.stderr.write("[INFO] Done.\n")

    # major ranks mapped to their order, from superkingdom (1) to strain (8)
    major_ranks = {
        "superkingdom": 1,
        "phylum": 2,
        "class": 3,
        "order": 4,
        "family": 5,
        "genus": 6,
        "species": 7,
        "strain": 8
    }

    # NOTE(review): these are module-level names; presumably parsing() and
    # write_report() read/fill them as globals -- confirm against those functions
    res_rollup = t._autoVivification()
    tol_read_count = 0

    sys.stderr.write("[INFO] Start processing read classifications...\n")
    # parsing() returns the number of read classifications processed
    tol_read_count = parsing()
    sys.stderr.write("[INFO] Done. %s read classifications are processed.\n" % tol_read_count)

    write_report(argvs.output, tol_read_count)
    sys.stderr.write("[INFO] Done writing report.\n")