Example #1
def clean_single_read_data(fastq1_file, fastq1ok_file, fastq_bad_file, verbose=False, adapters_file=None, cutoff=None, polyG_cutoff=23):
	''' Remove reads containing N, # quality, polyG/polyC tracks and adapters.
	'''
	wh1 = open(fastq1ok_file, "w")
	bad = open(fastq_bad_file, "w")

	statistics = {
		"pe": 0,
		"se": 0,
		"N": 0,
		"zeroQ": 0,
		"polyC%s" % polyG_cutoff: 0,
		"polyG%s" % polyG_cutoff: 0,
		"adapters": 0,
	}

	if cutoff:
		cutoff = int(cutoff)
		cutoff_key = "length%s" % cutoff
		statistics[cutoff_key] = 0


	print "Load adapters file"
	adapters = []
	if adapters_file:
		with open(adapters_file) as fh:
			for line in fh:
				adap = line.strip().split()[0]
				rev_adap = get_revcomp(adap)
				if adap not in adapters:
					adapters.append(adap)
				if rev_adap not in adapters:
					adapters.append(rev_adap)
	else:
		print "Adapter file missing"
	print "Number of adapters:", len(adapters)

	for i, read1 in enumerate(fastq_reader(fastq1_file)):
		error1 = None
		if verbose:
			print i, round(100 * statistics["se"]/float(i+1), 2), "% of good", statistics, "\r",
		if cutoff:
			if read1.length < cutoff:
				error1 = cutoff_key
		if not error1:
			error1 = is_bad_read(read1, adapters, polyG_cutoff)
		if error1:
			bad.write(read1.fastq_with_error(error1))
			statistics[error1] += 1
		else:
			wh1.write(read1.fastq)
			statistics["se"] += 1
			
	wh1.close()
	bad.close()
	if i > 0:
		statistics["fraction"] = statistics["pe"]/float(i)
	print
	print statistics
	return statistics
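
The filtering itself is delegated to is_bad_read and get_revcomp, which are defined elsewhere in the module. A minimal sketch of what the check could look like, assuming the read object exposes .seq and .qual strings; the error keys mirror the statistics dict above, but the body is an illustration, not the original implementation:

def is_bad_read(read, adapters, polyG_cutoff):
	''' Return an error key for a read that should be discarded, or None (sketch). '''
	seq = read.seq.upper()
	if "N" in seq:
		return "N"
	if "#" in read.qual:
		# '#' encodes the near-zero quality value in Illumina 1.8+ FASTQ files.
		return "zeroQ"
	if "G" * polyG_cutoff in seq:
		return "polyG%s" % polyG_cutoff
	if "C" * polyG_cutoff in seq:
		return "polyC%s" % polyG_cutoff
	for adapter in adapters:
		if adapter.upper() in seq:
			return "adapters"
	return None
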
def kmer_to_repbase_with_mongo(kmer_file):

	# requires: from pymongo import MongoClient
	client = MongoClient('mongodb://localhost:27017/')
	db = client.Repbase

	index = db.MainIndex
	name_index = db.NameIndex

	name_hash = {}

	print "Iter over kmers"
	for d in sc_iter_simple_tab_file(kmer_file):
		(kmer, tf) = d
		kmer = kmer.lower()
		print kmer, tf
		data = index.find_one({'kmer':kmer})
		if not data:
			rkmer = get_revcomp(kmer)
			data = index.find_one({'kmer':rkmer})
		if data:
			matches = data["index"]
			for rid, tf in matches:
				if rid in name_hash:
					name = name_hash[rid]
				else:
					name = name_index.find_one({"kid":rid})
					name = name["name"]
					name_hash[rid] = name
				print "\t", name, tf
		else:
			print "\t???"
Example #3
def clean_se_run(settings):
	''' Filter single-end reads, using the k-mer library for settings["k"] as the adapter list.
	'''
	# 1. Reads filter illumina kmers
	# 2. Filter data
	# 3. Save
	print "Load library for key=", settings["k"]
	with open(settings["pickle_libraries_file"]) as fh:
		library = cPickle.load(fh)
	library = library[settings["k"]]
	kmers = set(library.keys())
	for kmer in library.keys():
		kmers.add(get_revcomp(kmer))
	with open(settings["dat_libraries_file"], "w") as fh:
		for kmer in kmers:
			fh.write("%s\t-\n" % kmer)

	prefix = settings["prefix"]
	verbose = settings["verbose"]
	adapters_file = settings["dat_libraries_file"]
	fastq1_file = "%s.fastq" % prefix
	fastq1ok_file = "%s.ok.fastq" % prefix
	fastq_bad_file  = "%s.bad.fastq" % prefix
	clean_single_read_data(fastq1_file, fastq1ok_file, fastq_bad_file, verbose=verbose, adapters_file=adapters_file,
			cutoff=settings["cutoff"], polyG_cutoff=settings["polyGcutoff"]
		)
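
A hypothetical settings dict for clean_se_run, restricted to the keys the function actually reads; the paths and numeric values are placeholders:

settings = {
	"k": 23,                                      # k-mer length key into the pickled library
	"pickle_libraries_file": "libraries.pickle",  # placeholder path
	"dat_libraries_file": "libraries.dat",        # placeholder path, (re)written by clean_se_run
	"prefix": "sample1",                          # reads are expected in sample1.fastq
	"verbose": True,
	"cutoff": 50,                                 # minimal read length
	"polyGcutoff": 23,                            # minimal polyG/polyC track length
}
clean_se_run(settings)
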
Example #4
def check_adapters(settings):
	''' Report dataset k-mers that hit the contamination library and write them to settings["output_file"].
	'''
	print "Load library for key=", settings["k"]
	with open(settings["pickle_libraries_file"]) as fh:
		library = cPickle.load(fh)
	library = library[settings["k"]]
	assert len(library.keys()[0]) == settings["k"]
	print "Library size:", len(library.keys())
	contaminated_kmers = {}
	print "Iter over kmers"
	for i, d in enumerate(sc_read_simple_tab_file(settings["fastq_file"])):
		(kmer, tf) = d
		tf = int(tf)
		kmer = kmer.lower()
		print i, kmer, tf, "\r",
		if settings["cutoff"] and tf < settings["cutoff"]:
			break
		rkmer = get_revcomp(kmer)
		if kmer in library or rkmer in library:
			print
			print kmer, tf, library[kmer]
			contaminated_kmers[kmer] = (tf, library[kmer])
	all_kmers = set(contaminated_kmers.keys())
	contaminated_kmers = contaminated_kmers.items()
	contaminated_kmers.sort(key=lambda x: x[1], reverse=True)
	print "Save data"
	with open(settings["output_file"], "w") as fh:
		for (k, v) in contaminated_kmers:
			rkey = get_revcomp(k)
			s = "%s\t%s\n" % (k, v)
			fh.write(s)
			if rkey not in all_kmers:
				s = "%s\t%s\n" % (rkey, v)
				fh.write(s)
	return contaminated_kmers
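
Both check_adapters and clean_se_run above expect pickle_libraries_file to contain a dict keyed by k-mer length, whose values map lower-case k-mers to an annotation (the annotation is only reported, never interpreted). A hedged sketch of how such a file could be produced; the path and the single entry are illustrative:

import cPickle

library = {
	23: {
		# 23 bp prefix of the standard Illumina TruSeq adapter, lower-cased.
		"agatcggaagagcacacgtctga": "TruSeq adapter",
	},
}
with open("libraries.pickle", "w") as fh:
	cPickle.dump(library, fh)
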
Example #5
def clean_pair_reads_data(fastq1_file, fastq2_file, fastq1ok_file, fastq2ok_file, fastq_se_file, fastq_bad_file, verbose=False, adapters_file=None, cutoff=None, polyG_cutoff=23):
	''' Remove reads containing N, # quality, polyG/polyC tracks and adapters.
	'''
	wh1 = open(fastq1ok_file, "w")
	wh2 = open(fastq2ok_file, "w")
	se = open(fastq_se_file, "w")
	bad = open(fastq_bad_file, "w")

	statistics = {
		"pe": 0,
		"se": 0,
		"N": 0,
		"zeroQ": 0,
		"polyC%s" % polyG_cutoff: 0,
		"polyG%s" % polyG_cutoff: 0,
		"adapters": 0,
	}
	if cutoff:
		cutoff = int(cutoff)
		cutoff_key = "length%s" % cutoff
		statistics[cutoff_key] = 0

	adapters = []
	if adapters_file:
		with open(adapters_file) as fh:
			for line in fh:
				adap = line.strip().split()[0]
				rev_adap = get_revcomp(adap)
				if adap not in adapters:
					adapters.append(adap)
				if rev_adap not in adapters:
					adapters.append(rev_adap)
	print "Number of adapters:", len(adapters)

	for i, (read1, read2) in enumerate(iter_pe_data(fastq1_file, fastq2_file)):
		error1 = None
		error2 = None
		if verbose:
			print i, round(100 * statistics["pe"]/float(i+1), 2), "% of good", statistics, "\r",
		if cutoff:
			if read1.length < cutoff:
				error1 = cutoff_key
			if read2.length < cutoff:
				error2 = cutoff_key
		if not (error1 or error2):
			error1 = is_bad_read(read1, adapters, polyG_cutoff)
			error2 = is_bad_read(read2, adapters, polyG_cutoff)
		if not error1 and not error2:
			wh1.write(read1.fastq)
			wh2.write(read2.fastq)
			statistics["pe"] += 1
			continue
		if error1:
			bad.write(read1.fastq_with_error(error1))
			statistics[error1] += 1
		else:
			se.write(read1.fastq)
			statistics["se"] += 1
		if error2:
			bad.write(read2.fastq_with_error(error2))
			statistics[error2] += 1
		else:
			se.write(read2.fastq)
			statistics["se"] += 1
			
	wh1.close()
	wh2.close()
	se.close()
	bad.close()
	if i > 0:
		statistics["fraction"] = statistics["pe"]/float(i)
	print
	print statistics
	return statistics
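
Both cleaning functions assume read objects with .length, .fastq and .fastq_with_error(), produced by fastq_reader and paired by iter_pe_data; those helpers are not part of these examples. A minimal, self-contained sketch under that assumption (the class and the pairing logic are illustrative, not the original code):

from itertools import izip

class FastqRead(object):
	def __init__(self, head, seq, qual):
		self.head, self.seq, self.qual = head, seq, qual
		self.length = len(seq)

	@property
	def fastq(self):
		return "%s\n%s\n+\n%s\n" % (self.head, self.seq, self.qual)

	def fastq_with_error(self, error):
		# Keep the record but tag the header with the reason it was discarded.
		return "%s error=%s\n%s\n+\n%s\n" % (self.head, error, self.seq, self.qual)

def fastq_reader(fastq_file):
	# Yield FastqRead objects from a plain 4-line-per-record FASTQ file.
	with open(fastq_file) as fh:
		while True:
			head = fh.readline().strip()
			if not head:
				break
			seq = fh.readline().strip()
			fh.readline()  # '+' separator line
			qual = fh.readline().strip()
			yield FastqRead(head, seq, qual)

def iter_pe_data(fastq1_file, fastq2_file):
	# Pair reads positionally from two synchronized FASTQ files.
	return izip(fastq_reader(fastq1_file), fastq_reader(fastq2_file))
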
def kmer_to_cegma_with_mongo(kmer_file, verbose=False):

	# requires: from pymongo import MongoClient (and collections.defaultdict below)
	client = MongoClient('mongodb://localhost:27017/')
	db = client.Repbase

	index = db.CegmaMainIndex
	name_index = db.CegmaNameIndex
	repbase_index = db.MainIndex

	name_hash = {}

	print "Iter over kmers"
	match = {
		"repbase": 0,
		"cegma": 0,
		"repbase_cegma": 0,
		"other": 0,
	}
	match_distr = {
		"repbase": defaultdict(int),
		"cegma": defaultdict(int),
		"repbase_cegma": defaultdict(int),
		"other": defaultdict(int),
	}
	for d in sc_iter_simple_tab_file(kmer_file):
		
		(kmer, tf) = d
		repbase_hit = False
		cegma_hit = False
		# print tf, kmer, "\r",
		tf = int(tf)
		print match, tf, "\r", 
		kmer = kmer.lower()
		# if verbose:
		# 	print tf, kmer, "\r",
		data = repbase_index.find_one({'kmer':kmer})
		if not data:
			rkmer = get_revcomp(kmer)
			data = repbase_index.find_one({'kmer':rkmer})
		if data:
			repbase_hit = True

		data = index.find_one({'kmer':kmer})
		if not data:
			rkmer = get_revcomp(kmer)
			data = index.find_one({'kmer':rkmer})
		if data:
			matches = data["index"]
			cegma_hit = True
		# 	print
		# 	print kmer, tf
		# 	for rid, tf in matches:
		# 		if rid in name_hash:
		# 			name = name_hash[rid]
		# 		else:
		# 			name = name_index.find_one({"kid":rid})
		# 			name = name["name"].strip()
		# 			name_hash[rid] = name
		# 		print "\t", name, tf
		# else:
		# 	# print "\t???"
		# 	pass
		if repbase_hit and cegma_hit:
			match["repbase_cegma"] += 1
			match_distr["repbase_cegma"][tf] += 1
			continue
		elif repbase_hit:
			match["repbase"] += 1
			match_distr["repbase"][tf] += 1
			continue
		elif cegma_hit:
			match["cegma"] += 1
			match_distr["cegma"][tf] += 1
			continue
		else:
			match["other"] += 1
	print
	print match_distr
	print match
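
match_distr maps each category to a tf -> count histogram (a defaultdict keyed by k-mer frequency). An optional post-processing sketch, a hypothetical helper that flattens those histograms into a tab-separated file; it assumes the caller has access to match_distr (for example, kmer_to_cegma_with_mongo could be changed to return it), and the output path is up to the caller:

def write_match_distr(match_distr, out_file):
	# Flatten the per-category tf histograms into a tab-separated table.
	with open(out_file, "w") as fh:
		fh.write("category\ttf\tcount\n")
		for category in sorted(match_distr):
			distr = match_distr[category]
			for tf in sorted(distr):
				fh.write("%s\t%d\t%d\n" % (category, tf, distr[tf]))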