Example 1
def processSAMfile( sam_fn, numthreads, numlines ):
	result = gt._autoVivification()
	mapped_reads = 0

	#clean memory
	gc.collect()

	print_message( "Parsing SAM files with %s subprocesses..."%numthreads, argvs.silent, begin_t, logfile )
	pool = Pool(processes=numthreads)
	jobs = []
	results = []

	for chunkStart,chunkSize in chunkify(sam_fn):
		jobs.append( pool.apply_async(worker, (sam_fn,chunkStart,chunkSize)) )

	#wait for all jobs to finish
	tol_jobs = len(jobs)
	cnt=0
	for job in jobs:
		results.append( job.get() )
		cnt+=1
		if argvs.debug: print_message( "[DEBUG] Progress: %s/%s (%.1f%%) chunks done."%(cnt, tol_jobs, cnt/tol_jobs*100), argvs.silent, begin_t, logfile )

	#clean up: no more jobs will be submitted; wait for workers to exit
	pool.close()
	pool.join()

	print_message( "Merging results...", argvs.silent, begin_t, logfile )
	for res in results:
		for k in res:
			if k in result:
				result[k]["ML"] = result[k]["ML"] | res[k]["ML"]
				result[k]["MB"] += res[k]["MB"]
				result[k]["MR"] += res[k]["MR"]
				result[k]["NM"] += res[k]["NM"]
			else:
				result[k]={}
				result[k].update(res[k])

	# convert mapped regions to linear length
	for k in list(result):  # list() so entries can be deleted while iterating
		if not result[k]["MR"]:
			del result[k]
		else:
			result[k]["LL"] = 0
			mapped_reads += result[k]["MR"]

			mask = result[k]["ML"]
			p = recompile('1+')
			bitstr = bin(mask).replace('0b','')
			iterator = p.finditer(bitstr)
			for match in iterator:
				r = match.span()
				result[k]["LL"] += r[1]-r[0]

	return result, mapped_reads
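
processSAMfile relies on a chunkify helper that is not shown in this
example. A minimal sketch of what it is assumed to do: split the SAM file
into byte ranges that end on line boundaries, so each worker can parse its
chunk independently (the default chunk size here is illustrative):

import os

def chunkify(fname, size=1024*1024):
	"""Yield (start, length) byte ranges of fname, aligned to line ends."""
	file_end = os.path.getsize(fname)
	with open(fname, 'rb') as f:
		chunk_end = f.tell()
		while chunk_end < file_end:
			chunk_start = chunk_end
			f.seek(size, 1)                      # jump ahead ~size bytes
			f.readline()                         # advance to the next newline
			chunk_end = min(f.tell(), file_end)  # clamp the final chunk
			yield chunk_start, chunk_end - chunk_start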
Example 2
def processASR(c,
               tid2lineage,
               asr,
               rc,
               al,
               asr_dir,
               target_tid,
               tag,
               flag_red_str,
               flag_ind_iso=0):
    """
	ARGVS:
	   c             OBJ   sqlite3 connection obj
	   asr           STR   asr file
	   rc            LIST  refseq category
	   al            LIST  assembly level
	   asr_dir       STR   asr local directory
	   target_tid    LIST  target taxid
	   tag           STR   tag
	   flag_red_str  BOOL  flag for allowing redundant strains
	   flag_ind_iso  BOOL  flag for including isolate in strain name
	RETURN:
	   asr_info      DICT
	"""

    # init vars
    asr_info = t._autoVivification()
    lineage = t._autoVivification()
    cnt_q = 0  #qualified assembly
    cnt_tol = 0  #total genomes

    # parsing assembly_summary_refseq.txt file
    with open(asr) as f:
        for line in f:
            if line.startswith('#'):
                continue
            else:
                cnt_tol += 1

            line = line.strip('\n')

            #split each line in assembly_summary_refseq.txt:
            #  0- 4  assembly_accession    bioproject      biosample         wgs_master           refseq_category
            #  5- 9  taxid                 species_taxid   organism_name     infraspecific_name   isolate
            # 10-14  version_status        assembly_level  release_type      genome_rep           seq_rel_date
            # 15-19  asm_name              submitter       gbrs_paired_asm   paired_asm_comp      ftp_path
            # 20-21  excluded_from_refseq  relation_to_type_material
            tmp = line.split('\t')
            filename = ""
            local_path = ""

            sys.stderr.write("[INFO] Processing: %s..." % tmp[0])

            # try to get taxonomy lineage
            try:
                lineage = t.taxid2lineageDICT(tmp[5], 1, 1)
            except Exception:  # e.g., taxid removed from the taxonomy
                sys.stderr.write(
                    "skipped. Removed TaxID (%s) found for %s.\n" %
                    (tmp[5], tmp[0]))
                continue
            # SKIP the following records:
            # 1) not in a specified refseq_category (e.g., "reference")
            if rc:
                flag = 0
                for cate in rc:
                    if cate.lower() in tmp[4].lower():
                        flag = 1
                        break
                if not flag:
                    sys.stderr.write(
                        "skipped. Not belongs to specific refseq_category %s.\n"
                        % rc)
                    continue
            # 2) not at a specified assembly_level (e.g., "complete")
            if al:
                flag = 0
                for l in al:
                    if l.lower() in tmp[11].lower():
                        flag = 1
                        break
                if not flag:
                    sys.stderr.write(
                        "skipped. Not belongs to specific assembly_level %s.\n"
                        % al)
                    continue
            # 3) assemblies marked "excluded_from_refseq"
            if tmp[20]:
                sys.stderr.write("skipped. Marked as excluded_from_refseq.\n")
                continue
            # 4) not under any specified taxid
            if tmp[5] and target_tid:
                flag_tid_in_lineage = 0
                for t_tid in target_tid:
                    if not flag_tid_in_lineage:
                        for t_rank in lineage:
                            if lineage[t_rank]['taxid'] == t_tid:
                                flag_tid_in_lineage = 1
                                break
                if not flag_tid_in_lineage:
                    sys.stderr.write(
                        "skipped. Not belongs to specified tid.\n")
                    continue
            # 5) sequence location is not available
            if tmp[19] != "na":
                if tmp[19].startswith('ftp'):
                    # real filename of assembly
                    filename = tmp[19].split('/')[-1] + "_genomic.fna.gz"
                    # retain the remote directory structure to keep the local filesystem organized
                    local_path = asr_dir + tmp[19].split('genomes')[1]
                else:
                    filename = tmp[19].split('/')[-1]
                    local_path = "/".join(tmp[19].split('/')[:-1])
            else:
                sys.stderr.write("skipped. No URL provided.\n")
                continue
            # 6) not a unique strain
            tid = tmp[5]
            rank = t.taxid2rank(tid)
            name = t.taxid2name(tid)
            if rank != "strain" and rank != "unknown":
                # generate custom strain if not exists and add strains to database
                (cus_str_taxid, cus_str_name, str_name,
                 iso_name) = generateCustomStrain(
                     tid, name, '', c, tmp[8].replace("strain=", ""), tmp[9],
                     flag_ind_iso)
                #add custom strain to lineage
                lineage["strain"]['taxid'] = cus_str_taxid
                lineage["strain"]['name'] = cus_str_name

            flag_new_strain = checkNewTaxid(c, lineage["strain"]['taxid'], "",
                                            lineage["strain"]['name'],
                                            tmp[8].replace("strain=", ""),
                                            tmp[9], tmp[0], filename)
            if not flag_new_strain and not flag_red_str:
                sys.stderr.write("skipped. Not an unique strain.\n")
                continue

            sys.stderr.write("qualified.\n")

            # download sequence files if not available at local directory
            if not os.path.isfile(local_path + "/" + filename):
                url = tmp[19] + "/" + filename
                wget(url, local_path)
            else:
                sys.stderr.write("[INFO] Found local file: %s.\n" %
                                 (local_path + "/" + filename))

            # tag
            if not tag:
                tag = lineage["superkingdom"]['name'][0]  # first letter, e.g. "B" for Bacteria

            # use assembly_accession as key
            cnt_q += 1
            tid = lineage["strain"]['taxid']
            asr_info[tmp[0]]['taxid'] = tid
            asr_info[tmp[0]]['full_str_name'] = lineage["strain"]['name']
            asr_info[tmp[0]]['ftp_path'] = tmp[19]
            asr_info[tmp[0]]['local_path'] = local_path
            asr_info[tmp[0]]['filename'] = filename
            asr_info[tmp[0]]['type_material'] = bool(tmp[21])
            asr_info[tmp[0]]['cate_tag'] = tag
            tid2lineage[tid] = lineage
    return asr_info, tid2lineage, cnt_q, cnt_tol
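
Both t._autoVivification() and gt._autoVivification() used across these
examples are assumed to be the usual autovivifying dictionary, which
creates missing nested levels on first access. A minimal sketch:

class _autoVivification(dict):
    """A dict that grows missing nested levels on first access."""
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            value = self[key] = type(self)()
            return value

This is what allows assignments like asr_info[tmp[0]]['taxid'] = tid
without first creating asr_info[tmp[0]] as an empty dict.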
Example 3

def wget(url, lpath="./"):
    subprocess.run("mkdir -p %s" % lpath, shell=True, check=True)
    subprocess.run("wget '%s' -P %s" % (url, lpath), shell=True, check=True)

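
The shell-based wget() above works for well-formed FTP paths, but a URL or
local path containing shell metacharacters would break the quoting. A
hedged alternative sketch (not the original code) that passes an argument
list and creates the directory in-process, assuming GNU wget is on PATH:

def wget_noshell(url, lpath="./"):
    # requires: import os, subprocess
    os.makedirs(lpath, exist_ok=True)                       # replaces "mkdir -p"
    subprocess.run(["wget", url, "-P", lpath], check=True)  # no shell involved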

if __name__ == '__main__':
    argvs = parse_params()

    cnt = 0
    cnt_fa = 0  # accepted genomes
    cnt_fa_tol = 0  # total genomes
    cnt_seq = 0
    cnt_seq_tol = 0
    tid2lineage = t._autoVivification()
    output = sys.stdout
    output_gdbl_buffer = {}

    # loading taxonomy
    sys.stderr.write("Loading taxonomy...")
    t.loadTaxonomy(argvs.dbPath)
    sys.stderr.write("completed.\n")

    # init the taxonomy sqlite3 db file
    conn = sqlite3.connect(argvs.sqlitedb)
    conn.isolation_level = None
    c = conn.cursor()
    initTaxaDB(c)

    # create output file
Example 4
def outputResultsAsRanks( res_rollup, o, tg_rank, mode, mc, mr, ml, mh ):
	output = gt._autoVivification()
	major_ranks = {"superkingdom":1,"phylum":2,"class":3,"order":4,"family":5,"genus":6,"species":7,"strain":8}

	# init total abundance
	tol_abu = {
		"ROLLUP_DOC":      0,
		"LINEAR_DOC":      0,
		"READ_COUNT":      0,
		"TOTAL_BP_MAPPED": 0,
		"ABU":             0
	}

	# calculate total abundances and prepare dictionary using ranks as keys
	for tid in res_rollup:
		rank = gt.taxid2rank(tid)
		if rank == "superkingdom":
			tol_abu["ROLLUP_DOC"]      += res_rollup[tid]["RD"]
			tol_abu["READ_COUNT"]      += res_rollup[tid]["MR"]
			tol_abu["TOTAL_BP_MAPPED"] += res_rollup[tid]["MB"]
			tol_abu["ABU"]             += res_rollup[tid]["ABU"]

		if rank in major_ranks:
			if rank not in output:
				output[rank] = []
			output[rank].append(tid)

	# Fields for full mode
	add_field = "\t" + "\t".join([
			"LINEAR_COV",
			"LINEAR_COV_MAPPED_SIG",
			"BEST_LINEAR_COV",
			"DOC",
			"BEST_DOC",
			"MAPPED_SIG_LENGTH",
			"TOL_SIG_LENGTH",
			"ABUNDANCE",
			"REL_ABU_ROLLUP_DOC",
			"REL_ABU_READ_COUNT",
			"REL_ABU_TOL_BP_MAPPED",
			"MLRL",
			"NOTE" ]) if mode == "full" else ""

	# essential fields
	o.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s%s\n" % (
			"LEVEL",
			"NAME",
			"TAXID",
			"READ_COUNT",
			"TOTAL_BP_MAPPED",
			"TOTAL_BP_MISMATCH",
			"LINEAR_LENGTH",
			"LINEAR_DOC",
			"ROLLUP_DOC",
			"REL_ABUNDANCE", add_field ) )

	for rank in sorted( major_ranks, key=major_ranks.__getitem__ ):
		if major_ranks[rank] > major_ranks[tg_rank] and mode == "summary":
			break

		for tid in sorted( output[rank], key=lambda tid: res_rollup[tid]["ABU"], reverse=True):
			note = ""
			note += "Filtered out (minCov > %.2f); "%(res_rollup[tid]["LL"]/db_stats[tid]) if rank == "strain" and tid in db_stats and mc > res_rollup[tid]["LL"]/db_stats[tid] else ""
			note += "Filtered out (minReads > %s); "%res_rollup[tid]["MR"] if mr > int(res_rollup[tid]["MR"]) else ""
			note += "Filtered out (minLen > %s); "%res_rollup[tid]["LL"] if ml > int(res_rollup[tid]["LL"]) else ""
			note += "Filtered out (minMLRL > %.2f); "%(res_rollup[tid]["LL"]/res_rollup[tid]["MR"]) if mh > (res_rollup[tid]["LL"]/res_rollup[tid]["MR"]) else ""
			note += "Not shown (%s-result biased); "%rank if major_ranks[rank] > major_ranks[tg_rank] else ""

			# additional fields for full mode
			add_field = "\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%s\t%s\t%.2f\t%.4f\t%.4f\t%.4f\t%.4f\t%s" % (
				res_rollup[tid]["LL"]/res_rollup[tid]["TS"],                                              # LINEAR_COV
				res_rollup[tid]["LL"]/res_rollup[tid]["SL"],                                              # LINEAR_COV_MAPPED_SIG
				res_rollup[tid]["bLC"],                                                                   # BEST_LINEAR_COV
				res_rollup[tid]["MB"]/res_rollup[tid]["TS"],                                              # DOC
				res_rollup[tid]["bDOC"],                                                                  # BEST_DOC
				res_rollup[tid]["SL"],                                                                    # MAPPED_SIG_LENGTH
				res_rollup[tid]["TS"],                                                                    # TOL_SIG_LENGTH
				res_rollup[tid]["ABU"],                                                                   # ABUNDANCE
				res_rollup[tid]["RD"]/tol_abu["ROLLUP_DOC"] if tol_abu["ROLLUP_DOC"] else 0,              # REL_ABU_ROLLUP_DOC
				res_rollup[tid]["MR"]/tol_abu["READ_COUNT"] if tol_abu["READ_COUNT"] else 0,              # REL_ABU_READ_COUNT
				res_rollup[tid]["MB"]/tol_abu["TOTAL_BP_MAPPED"] if tol_abu["TOTAL_BP_MAPPED"] else 0,    # REL_ABU_TOL_BP_MAPPED
				res_rollup[tid]["LL"]/res_rollup[tid]["MR"],                                              # MLRL
				note,                                                                                     # NOTE
				#res_rollup[tid]["ML"]
			) if mode == "full" else ""

			if note and mode=="summary": continue

			#relative abundance
			o.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.4f\t%.4f\t%.4f%s\n" %
				(   rank,
					gt.taxid2name(tid),
					tid,
					res_rollup[tid]["MR"],
					res_rollup[tid]["MB"],
					res_rollup[tid]["NM"],
					res_rollup[tid]["LL"],
					res_rollup[tid]["MB"]/res_rollup[tid]["LL"],
					res_rollup[tid]["RD"],
					res_rollup[tid]["ABU"]/tol_abu["ABU"] if tol_abu["ABU"] else 0,
					add_field
				)
			)
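
To make the derived columns concrete: a taxon with MB = 50,000 mapped bases
over LL = 10,000 covered bases reports LINEAR_DOC = 50000/10000 = 5.0, and
if its ABU is 2.0 against a superkingdom total of tol_abu["ABU"] = 8.0, its
REL_ABUNDANCE is 2.0/8.0 = 0.25. (The figures are illustrative, not taken
from a real run.)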
Example 5
def taxonomyRollUp( r, db_stats, relAbu, mc, mr, ml, mh ):
	"""
	Take parsed SAM output and roll it up to the superkingdom level.
	"""
	res_rollup = gt._autoVivification()
	res_tree = gt._autoVivification()
	major_ranks = {"superkingdom":1,"phylum":2,"class":3,"order":4,"family":5,"genus":6,"species":7}

	# rollup to strain first
	for ref in r:
		(acc, start, stop, stid) = ref.split('|')
		if stid in res_rollup:
			# ML: mapped region
			# MB: # of mapped bases
			# MR: # of mapped reads
			# NM: # of mismatches
			# LL: linear length
			# SL: length of this signature fragments (mapped)
			# TS: length of total signature fragments for a strain (mapped + unmapped)
			#res_rollup[stid]["ML"] += ";%s:%s" %  ( ref, ",".join("..".join(map(str,l)) for l in r[ref]["ML"]) )
			res_rollup[stid]["MB"] += r[ref]["MB"]
			res_rollup[stid]["MR"] += r[ref]["MR"]
			res_rollup[stid]["NM"] += r[ref]["NM"]
			res_rollup[stid]["LL"] += r[ref]["LL"]
			res_rollup[stid]["SL"] += int(stop) - int(start) + 1
		else:
			#res_rollup[stid]["ML"] = "%s:%s" %  ( ref, ",".join("..".join(map(str,l)) for l in r[ref]["ML"]) )
			res_rollup[stid]["MB"] = r[ref]["MB"]
			res_rollup[stid]["MR"] = r[ref]["MR"]
			res_rollup[stid]["NM"] = r[ref]["NM"]
			res_rollup[stid]["LL"] = r[ref]["LL"]
			res_rollup[stid]["SL"] = int(stop) - int(start) + 1
			res_rollup[stid]["TS"] = db_stats[stid]

	# get all strain tax id
	allStrTaxid = list(res_rollup)

	# Calculate DOC (depth of coverage), LC (linear coverage) and RD (rollup
	# DOC) for strains. These must be computed before the rollup step because
	# a strain's parent can itself be a strain (no rank).
	for stid in allStrTaxid:
		res_rollup[stid]["bDOC"] = res_rollup[stid]["MB"]/db_stats[stid]
		res_rollup[stid]["bLC"]  = res_rollup[stid]["LL"]/db_stats[stid]
		res_rollup[stid]["RD"]   = res_rollup[stid]["MB"]/db_stats[stid]

	# roll strain results to upper levels
	for stid in allStrTaxid:
		# apply cutoffs at the strain level, then roll up to higher ranks
		if mc > res_rollup[stid]["LL"]/db_stats[stid] or \
			mr > res_rollup[stid]["MR"] or \
			ml > res_rollup[stid]["LL"] or \
			mh > res_rollup[stid]["LL"]/res_rollup[stid]["MR"]:
			continue

		tree = gt.taxid2fullLinkDict( stid )

		for pid, tid in tree.items():
			res_tree[pid][tid] = 1
			if tid == stid: # skip strain id, rollup only
				continue
			if not gt.taxid2rank(tid) in major_ranks:
				continue
			if tid in res_rollup:
				# bDOC: best Depth of Coverage of a strain
				# bLC:  best linear coverage of a strain
				#res_rollup[tid]["ML"]   += ";%s" % res_rollup[stid]["ML"]
				res_rollup[tid]["MB"]   += res_rollup[stid]["MB"]
				res_rollup[tid]["MR"]   += res_rollup[stid]["MR"]
				res_rollup[tid]["NM"]   += res_rollup[stid]["NM"]
				res_rollup[tid]["LL"]   += res_rollup[stid]["LL"]
				res_rollup[tid]["SL"]   += res_rollup[stid]["SL"]
				res_rollup[tid]["TS"]   += res_rollup[stid]["TS"]
				res_rollup[tid]["RD"]   += res_rollup[stid]["RD"]
				res_rollup[tid]["bDOC"]  = res_rollup[stid]["bDOC"] if res_rollup[stid]["bDOC"] > res_rollup[tid]["bDOC"] else res_rollup[tid]["bDOC"]
				res_rollup[tid]["bLC"]   = res_rollup[stid]["bLC"] if res_rollup[stid]["bLC"] > res_rollup[tid]["bLC"] else res_rollup[tid]["bLC"]
			else:
				#res_rollup[tid]["ML"]    = res_rollup[stid]["ML"]
				res_rollup[tid]["MB"]    = res_rollup[stid]["MB"]
				res_rollup[tid]["MR"]    = res_rollup[stid]["MR"]
				res_rollup[tid]["NM"]    = res_rollup[stid]["NM"]
				res_rollup[tid]["LL"]    = res_rollup[stid]["LL"]
				res_rollup[tid]["SL"]    = res_rollup[stid]["SL"]
				res_rollup[tid]["TS"]    = res_rollup[stid]["TS"]
				res_rollup[tid]["RD"]    = res_rollup[stid]["RD"]
				res_rollup[tid]["bDOC"]  = res_rollup[stid]["bDOC"]
				res_rollup[tid]["bLC"]   = res_rollup[stid]["bLC"]

	#add abundance to res_rollup
	for tid in res_rollup:
		if relAbu == "LINEAR_LENGTH":
			res_rollup[tid]["ABU"] = res_rollup[tid]["LL"]
		elif relAbu == "TOTAL_BP_MAPPED":
			res_rollup[tid]["ABU"] = res_rollup[tid]["MB"]
		elif relAbu == "READ_COUNT":
			res_rollup[tid]["ABU"] = res_rollup[tid]["MR"]
		elif relAbu == "LINEAR_DOC":
			res_rollup[tid]["ABU"] = res_rollup[tid]["MB"]/res_rollup[tid]["LL"]
		else:
			res_rollup[tid]["ABU"] = res_rollup[tid]["RD"]

	return res_rollup, res_tree
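
gt.taxid2fullLinkDict(stid) is assumed to return the parent-to-child links
on the path from the taxonomy root down to the query strain, which is what
lets the loop above visit every ancestor exactly once. An illustrative
return value (hypothetical custom strain taxid "562.1" under E. coli):

# keys are parent taxids, values are their child on the path to stid
# {'1': '131567', '131567': '2', ..., '561': '562', '562': '562.1'}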
Example 6
                 res_rollup[rank][tid]["ASGN"], tid,
                 float(res_rollup[rank][tid]["ROLL"]) / float(tol_read_count)))


if __name__ == '__main__':
    argvs = parse_params(__version__)
    #load taxonomy
    sys.stderr.write("[INFO] Loading taxonomy...\n")
    t.loadTaxonomy(argvs.taxaPath)
    sys.stderr.write("[INFO] Done.\n")

    major_ranks = {
        "superkingdom": 1,
        "phylum": 2,
        "class": 3,
        "order": 4,
        "family": 5,
        "genus": 6,
        "species": 7,
        "strain": 8
    }
    res_rollup = t._autoVivification()
    tol_read_count = 0

    sys.stderr.write("[INFO] Start processing read classifications...\n")
    tol_read_count = parsing()
    sys.stderr.write("[INFO] Done. %s read classifications are processed.\n" %
                     tol_read_count)
    write_report(argvs.output, tol_read_count)
    sys.stderr.write("[INFO] Done writing report.\n")