def diffSeq(line, fnRef, fnSim):
    MaxPrintChars = 100
    fpRef = open(fnRef, 'r')
    fpSim = open(fnSim, 'r')
    words = line.strip().split()
    chrom = words[0]
    refPos0 = int(words[1])
    refPos1 = int(words[2])
    simPos0 = int(words[7])
    simPos1 = int(words[8])
    print line.strip()
    print '---------'
    if refPos1 - refPos0 + 1 < MaxPrintChars and simPos1 - simPos0 + 1 < MaxPrintChars:
        for name, seq, qual in readfq(fpRef):
            if name == chrom:
                print seq[refPos0 - 1:refPos1]
                break
        for name, seq, qual in readfq(fpSim):
            if name == chrom:
                print seq[simPos0 - 1:simPos1]
                break
    fpRef.close()
    fpSim.close()
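# Every script in this collection depends on a readfq module. For reference,
# a minimal sketch in the spirit of Heng Li's readfq generator -- an
# assumption, not necessarily the exact version these scripts ship with.
# It yields (name, seq, qual) for FASTA and FASTQ alike (qual is None for FASTA).
def readfq_sketch(fp):
    last = None  # buffer for a header line read while scanning the previous record
    while True:
        if not last:  # find the start of the next record
            for l in fp:
                if l[0] in '>@':
                    last = l[:-1]
                    break
        if not last:
            break
        name, _, _ = last[1:].partition(" ")
        seqs, last = [], None
        for l in fp:  # accumulate sequence lines
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+':  # FASTA record
            yield name, ''.join(seqs), None
            if not last:
                break
        else:  # FASTQ record: read quality until it matches the sequence length
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp:
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq):
                    last = None
                    yield name, seq, ''.join(seqs)
                    break
            if last:  # reached EOF before reading enough quality
                yield name, seq, None
                break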
def demultiplex_dual_barcodes():
    parser = argparse.ArgumentParser()
    parser.add_argument('--index1_file', required=True)
    parser.add_argument('--index1_max_distance', type=int, required=True)
    parser.add_argument('--index2_file', required=True)
    parser.add_argument('--index2_max_distance', type=int, required=True)
    parser.add_argument('--read1_fq', required=True)
    parser.add_argument('--read2_fq', required=True)
    parser.add_argument('--output_dir', required=True)
    args = parser.parse_args()

    # Load each index's barcodes and expand them to every sequence within the
    # allowed mismatch distance
    bc = dict()
    bce = dict()
    for i in [1, 2]:
        bc[i] = read_barcode_dict(args.__dict__['index%d_file' % i])
        bce[i] = expand_barcode_dict(bc[i], args.__dict__['index%d_max_distance' % i])

    # Lazily open one gzipped output file per (sample1, sample2) pair
    outfiles = dict()

    def output_file(filename):
        if filename not in outfiles:
            outfiles[filename] = gzip.open(filename, 'wt')
        return outfiles[filename]

    f1 = readfq(open(args.read1_fq))
    f2 = readfq(open(args.read2_fq))
    for name1, seq1, qual1 in f1:
        # Both index reads are embedded in the read name as '#INDEX1_INDEX2/'
        m = re.search('#([ACGTN]+)_([ACGTN]+)/', name1)
        sample_id = dict()
        for i in [1, 2]:
            index_seq = m.group(i)
            if index_seq in bce[i]:
                sample_id[i] = bc[i][bce[i][index_seq]]
            else:
                sample_id[i] = None
        if sample_id[1] is None or sample_id[2] is None:
            filename_suffix = 'Unassigned'
        else:
            filename_suffix = sample_id[1] + '_' + sample_id[2]
        base_output_filename = args.output_dir + '/' + filename_suffix
        f = output_file(base_output_filename + '_R1.fq.gz')
        f.write('\n'.join(['@' + name1, seq1, '+', qual1, '']))
        name2, seq2, qual2 = next(f2)
        f = output_file(base_output_filename + '_R2.fq.gz')
        f.write('\n'.join(['@' + name2, seq2, '+', qual2, '']))
    for f in outfiles.values():
        f.close()
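# read_barcode_dict and expand_barcode_dict are project-specific helpers. A
# hedged sketch of what the expansion step could look like (an assumption,
# not the actual implementation): map every sequence within `max_distance`
# substitutions of a known barcode back to that barcode, so demultiplexing
# becomes a single dict lookup.
import itertools

def expand_barcode_dict_sketch(bc, max_distance):
    # bc: {barcode_sequence: sample_id}; returns {variant_sequence: barcode_sequence}
    # Assumes max_distance <= barcode length; collisions between barcodes
    # within 2 * max_distance of each other are not resolved here.
    expanded = {}
    for barcode in bc:
        for positions in itertools.combinations(range(len(barcode)), max_distance):
            for subs in itertools.product('ACGTN', repeat=max_distance):
                variant = list(barcode)
                for pos, base in zip(positions, subs):
                    variant[pos] = base
                expanded[''.join(variant)] = barcode
    return expanded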
def main(): parser = argparse.ArgumentParser() parser.add_argument("input", help="path for the fasta file to build bwt", type=str) parser.add_argument("save", help="path to save bwt", type=str) try: args = parser.parse_args() except: parser.print_help() sys.exit(1) handle = open(args.input) seqs = [] rc_seqs = [] names = [] for name, seq, qual in readfq(handle): seqs.append(seq) rc_seqs.append(revcomp(seq)) names.append(name) print("build forward index") fm_index = FMindex(seqs, names) print("finish build") print("build reverse index") rc_index = FMindex(rc_seqs, names) print("finish build") index = [fm_index, rc_index] pickle.dump(index, open(args.save, "wb"))
def main(): parser = argparse.ArgumentParser() parser.add_argument("query", help="path for query sequence pasta file", type=str) parser.add_argument("target", help="path for target sequence pasta file", type=str) parser.add_argument("threshold", help="threshold to report overlap", type=int) parser.add_argument("-align", required=False, action='store_true') try: args = parser.parse_args() except: parser.print_help() sys.exit(1) # build index for all target sequence handle = open(args.target) target_dict = {} seqs = [] names = [] for name, seq, qual in readfq(handle): seqs.append(seq) names.append(name) target_dict[name] = seq print("build index") fm_index = FMindex(seqs, names) print("finish build") # test overlap handle = open(args.query) for name, seq, qual in readfq(handle): # print(name) outputs = fm_index.find_overlaps(seq, name, args.threshold) for output in outputs: if (output[0]!=output[3]): output.append("fw") print("\t".join(output)) if args.align: print_align(output, seq, target_dict[output[3]]) outputs = fm_index.find_overlaps(revcomp(seq), name, args.threshold) for output in outputs: if (output[0] != output[3]): output.append("rc") print("\t".join(output)) if args.align: print_align(output, seq, target_dict[output[3]])
def main(): parser = argparse.ArgumentParser() parser.add_argument("query", help="path for query sequence pasta file", type=str) parser.add_argument("index", help="path for bwt index", type=str) parser.add_argument("threshold", help="threshold to report overlap", type=int) try: args = parser.parse_args() except: parser.print_help() sys.exit(1) # build index for all target sequence fw_index, rc_index = pickle.load(open(args.index, "rb")) # test overlap handle = open(args.query) for name, seq, qual in readfq(handle): outputs = fw_index.find_overlaps(seq, name, args.threshold) for output in outputs: if (output[0]!=output[3]): output.append("fw") output.append("suffix") print("\t".join(output)) outputs = fw_index.find_overlaps(revcomp(seq), name, args.threshold) for output in outputs: if (output[0] != output[3]): output.append("rc") output.append("suffix") print("\t".join(output)) outputs = rc_index.find_overlaps(revcomp(seq), name, args.threshold) for output in outputs: if (output[0] != output[3]): output.append("fw") output.append("prefix") print("\t".join(output)) outputs = rc_index.find_overlaps(seq, name, args.threshold) for output in outputs: if (output[0] != output[3]): output.append("rc") output.append("prefix") print("\t".join(output))
        genes_not_trusted, genes_not_trusted_conditioned, trusted_SVnames,
        threshold_least_exons=nb_exons,
        threshold_most_introns=nb_introns,
        threshold_exon_cov=cov_exons,
        threshold_intron_cov=cov_introns,  # CIC cases are filtered here
        is_verbose=is_verbose, is_debug=is_debug)

# Collect the SV sequences named in PPG_100flk from the fasta directory
SVseqs = {}
fs = [dir_fasta + _ for _ in os.listdir(dir_fasta) if _.endswith('.fa')]
qnames = [_.split('@')[1] for _ in PPG_100flk.keys()]
for f in fs:
    with open(f) as fh:
        for qname, seq, qual in readfq(fh):
            if qname in qnames:
                SVseqs[qname] = seq
print('loaded {0} SV seqs from {1} files'.format(len(SVseqs), len(fs)))

# Collect the left/right flanking sequences; names carry an '_l'/'_r' suffix
SVflks = {}
fs = [dir_flk + _ for _ in os.listdir(dir_flk) if _.endswith('.fa')]
qnames = [_.split('@')[1] for _ in PPG_100flk.keys()]
for f in fs:
    with open(f) as fh:
        for qname, seq, qual in readfq(fh):
            if qname[:-2] in qnames:
                if qname[:-2] not in SVflks:
                    SVflks[qname[:-2]] = {'r': None, 'l': None}
                SVflks[qname[:-2]][qname[-1]] = seq
print('loaded {0} SV flankings from {1} files'.format(len(SVflks), len(fs)))
parser.add_argument('-r', '--ref', metavar='ref.fa', required=True, dest='ref', help='input reference (required)')
parser.add_argument('-o', '--outVCF', metavar='out.vcf', required=True, dest='outVCF', help='output vcf file (required)')
args = parser.parse_args()

# Collect all positions present in the VCF
selectedPos = collections.defaultdict(set)
nuclDict = collections.defaultdict(set)
if args.vcfFile:
    # Text mode so startswith('#') works under Python 3 as well
    vcf_reader = gzip.open(args.vcfFile, 'rt') if args.vcfFile.endswith('.gz') else open(args.vcfFile, 'r')
    for line in vcf_reader:
        if line.startswith('#'):
            continue
        fields = line.split('\t', 2)
        selectedPos[fields[0]].add(int(fields[1]))
        nuclDict[(fields[0], int(fields[1]))] = 'N'
    vcf_reader.close()

# Store the true reference nucleotide at every selected position
f_in = gzip.open(args.ref) if args.ref.endswith('.gz') else open(args.ref)
for seqName, seqNuc, seqQuals in readfq(f_in):
    for pos in selectedPos[seqName]:
        nuclDict[(seqName, pos)] = seqNuc[(pos - 1):pos]

# Replace the VCF reference allele
if args.vcfFile:
    vcf_reader = vcf.Reader(open(args.vcfFile), 'r', compressed=args.vcfFile.endswith('.gz'))
    vcf_writer = vcf.Writer(open(args.outVCF, 'w'), vcf_reader, lineterminator='\n')
    for record in vcf_reader:
        record.REF = nuclDict[(record.CHROM, record.POS)]
        vcf_writer.write_record(record)
    sys.exit(1)

fp_in_r1 = open(sys.argv[1], 'r')
fp_in_r2 = open(sys.argv[2], 'r')
fp_ann = open(sys.argv[3], 'r')
fp_out_r1 = open(sys.argv[4] + '1.fq', 'w')
fp_out_r2 = open(sys.argv[4] + '2.fq', 'w')
fp_ans = open(sys.argv[4] + '.ans', 'w')

# Read annotation and build the position maps
posRefStart, posSimStart, posRefEnd, posSimEnd = {}, {}, {}, {}
for line in fp_ann:
    a = read_ann.ann(line)
    createPosMap(a, posRefStart, posRefEnd, posSimStart, posSimEnd)
#print posRefEnd, posSimEnd

# Convert read 1: map simulated coordinates in the read name back to the reference
tot_num = 0
for name, seq, qual in readfq(fp_in_r1):
    words = name.split('_')
    chrom = words[0]
    simPos0 = int(words[1])
    simPos1 = int(words[2])
    refPos0 = simPos2Ref(chrom, simPos0, posRefStart, posRefEnd, posSimStart, posSimEnd, fp_ans)
    refPos1 = simPos2Ref(chrom, simPos1, posRefStart, posRefEnd, posSimStart, posSimEnd, fp_ans)
    words[1], words[2] = str(refPos0), str(refPos1)
    name = '_'.join(words)
import re
import sys
from readfq import readfq

complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

if len(sys.argv) < 3:
    sys.exit("error: too few arguments\nusage: find_motif.py infile.fa <motifseq>\n")

seqs = dict()
infile = open(sys.argv[1])
for name, seq, qual in readfq(infile):
    seqs[name] = seq

# Build a pattern matching the motif or its reverse complement, with N as a wildcard
motif_str = sys.argv[2].upper()
motif_str_rc = "".join([complement[b] for b in motif_str[::-1]])
pat = "(%s|%s)" % (motif_str.replace("N", "[ATCG]"), motif_str_rc.replace("N", "[ATCG]"))
motif = re.compile(pat, flags=re.IGNORECASE)

for name, seq in seqs.items():
    for match in motif.finditer(seq):
        if len(sys.argv) == 4:
            print "\t".join(map(str, (name, match.start(), match.end(), sys.argv[3])))
        else:
            print "\t".join(map(str, (name, match.start(), match.end())))
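# Example invocation (hypothetical file and motif; the optional third argument
# is echoed back as a label column):
#   python find_motif.py genome.fa CANNTG enhancer_box
# prints tab-separated: name, match start, match end[, label]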
def main(): parser = argparse.ArgumentParser() parser.add_argument("seed", help="path for query sequence pasta file", type=str) parser.add_argument("data", help="path for all reads", type=str) parser.add_argument("index", help="path for bwt index", type=str) parser.add_argument("threshold", help="threshold to report overlap", type=int) parser.add_argument("output", help="path for output fasta", type=str) try: args = parser.parse_args() except: parser.print_help() sys.exit(1) # build index for all target sequence seq_dict = SeqIO.index(args.data, 'fasta') sets = [] seeds = set() total = set() handle = open(args.seed) for name, seq, qual in readfq(handle): seeds.add(name) sets.append(seeds) total = total.union(seeds) fw_index, rc_index = pickle.load(open(args.index, "rb")) new_set = seeds while len(new_set) > 0: prev_set = new_set new_set = set() for name in prev_set: seq = str(seq_dict[name].seq) outputs = fw_index.find_overlaps(seq, name, args.threshold) for output in outputs: if output[3] not in total: new_set.add(output[3]) outputs = fw_index.find_overlaps(revcomp(seq), name, args.threshold) for output in outputs: if output[3] not in total: new_set.add(output[3]) outputs = rc_index.find_overlaps(revcomp(seq), name, args.threshold) for output in outputs: if output[3] not in total: new_set.add(output[3]) outputs = rc_index.find_overlaps(seq, name, args.threshold) for output in outputs: if output[3] not in total: new_set.add(output[3]) sets.append(new_set) total = total.union(new_set) print("Find one set of reads {}".format(len(new_set))) print('Total number of reads is {}'.format(len(total))) outputs = [] for seq_id in total: outputs.append(seq_dict[seq_id]) SeqIO.write(outputs, open(args.output, 'w'), 'fasta')
                    required=True, dest='ref', help='input reference (required)')
args = parser.parse_args()

# Get all the positions
bcf = cyvcf2.VCF(args.bcf)
selectedPos = collections.defaultdict(set)
for record in bcf:
    selectedPos[record.CHROM].add(record.POS)

# Store the nucleotides flanking each selected position
nuclPrevDict = collections.defaultdict(set)
nuclPostDict = collections.defaultdict(set)
f_in = gzip.open(args.ref) if args.ref.endswith('.gz') else open(args.ref)
for seqName, seqNuc, seqQuals in readfq(f_in):
    for pos in selectedPos[seqName]:
        nuclPrevDict[(seqName, pos)] = seqNuc[(pos - 2):(pos - 1)]
        nuclPostDict[(seqName, pos)] = seqNuc[pos:(pos + 1)]

# Build mutation dictionary: normalize every substitution context so the
# mutated base is a pyrimidine (C or T), reverse-complementing the context otherwise
mt = dict()
for i in ['A', 'C', 'G', 'T']:
    for j in ['A', 'C', 'G', 'T']:
        for k in ['A', 'C', 'G', 'T']:
            for l in ['A', 'C', 'G', 'T']:
                if j != k:
                    if (j == 'C') or (j == 'T'):
                        mt[(i, j, k, l)] = (i, j, k, l)
                    else:
                        mt[(i, j, k, l)] = (rev(l), rev(j), rev(k), rev(i))
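# For instance, assuming rev() returns the complement base and the tuple is
# (prev, ref, alt, next), a G>A change in an A_T context folds onto the
# pyrimidine strand as a C>T change:
#   mt[('A', 'G', 'A', 'T')] == ('A', 'C', 'T', 'T')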
status = 0  # unless new best ref matches last best ref
out_ls_fh.write('\t'.join([
    central_sample_id,
    best_qc[central_sample_id][2],
    best_qc[central_sample_id][1],
    str(status),
]) + '\n')

# Add this sequence's PAG to the best_published_names set
best_published_names.add(best_qc[central_sample_id][1])

# Iterate the matched FASTA and print out sequences whose name is in the best_published_names set
seen_best_published_names = set([])
with open(args.fasta) as latest_fasta_fh:
    for name, seq, qual in readfq(latest_fasta_fh):
        # Names are written upstream with a 'COGUK' prefix, so fix it up here
        curr_pag = name.split('|')[0].replace('COGUK', 'COG-UK')
        central_sample_id = curr_pag.split('/')[1]
        if curr_pag in best_published_names:
            # Remove deletion chars (https://github.com/COG-UK/dipi-group/issues/38)
            seq = seq.replace('-', '')
            sys.stdout.write('>%s\n%s\n' % (central_sample_id, seq))
            seen_best_published_names.add(curr_pag)

sys.stderr.write("[NOTE] %s best sequences written.\n" % len(seen_best_published_names))
sys.stderr.write("[NOTE] %s best sequences missing.\n" % (len(best_published_names) - len(seen_best_published_names)))
if len(seen_best_published_names) != len(best_published_names):
def main(debug=False): parser = argparse.ArgumentParser() parser.add_argument("path", help="path for sequence pasta file", type=str) parser.add_argument("k", help="size of kmer", type=int) parser.add_argument("perm", help="number of permutation", type=int) parser.add_argument("threshold", help="threshold to report similar", type=float) parser.add_argument("-cluster", help="generate clusters instead of find similar reads", required=False, action='store_true') parser.add_argument("-debug", required=False, action='store_true') try: args = parser.parse_args() except: parser.print_help() sys.exit(1) handle = open(args.path) lsh = BioLSH(threshold=args.threshold, num_perm=args.perm) minhash_dict = {} minhash_rc_dict = {} if args.debug: set_dict = {} with lsh.insertion_session() as session: for name, seq, qual in readfq(handle): m_hash = kmer_minhash(seq, args.k, args.perm, debug=args.debug) """ if name == 'r1': m_hash.set_hashvalues(np.array([0, 11, 22, 3, 5, 6, 9, 45, 98, 0, 1, 7])) elif name == 'r2': m_hash.set_hashvalues(np.array([11, 9, 3, 4, 98, 0, 1, 7, 23, 15, 0, 31])) """ session.insert(name, m_hash) minhash_dict[name] = m_hash minhash_rc_dict[name] = kmer_minhash(seq, args.k, args.perm, rc=True) if args.debug: set_dict[name] = kmer_set(seq, args.k) if args.cluster: clusters = lsh.cluster() for cluster in clusters: print("\t".join(cluster)) else: for id in minhash_dict.keys(): fw_result = lsh.query(minhash_dict[id]) rc_result = lsh.query(minhash_rc_dict[id]) result = list(set(fw_result).union(set(rc_result))) for similar_id in result: if id != similar_id: print("{}\t{}".format(id, similar_id)) if args.debug: for id in minhash_dict.keys(): for query_id in minhash_dict.keys(): print() print("query_id: {} target_id: {}".format(query_id, id)) result = minhash_dict[query_id].jaccard(minhash_dict[id]) print("MinHash Estimated:", result) result = float( len(set_dict[query_id].intersection( set_dict[id]))) / float( len(set_dict[query_id].union(set_dict[id]))) print("Real Similarity", result) result = minhash_dict[query_id].hamming(minhash_dict[id]) print("Hamming distance of signature vector:", result)
import sys
from readfq import readfq
import read_ann

usage = 'map <Ref> <sim_genome> <sim_ann>'

if __name__ == '__main__':
    if len(sys.argv) < 4:
        print >>sys.stderr, usage
        sys.exit(1)
    fp_ref = open(sys.argv[1], 'r')
    fp_sim_genome = open(sys.argv[2], 'r')
    fp_ann = open(sys.argv[3], 'r')

    last = None
    # Create the simulated-genome reader once; re-creating readfq(fp_sim_genome)
    # inside the loop would discard its buffered header line and skip records
    sim_reader = readfq(fp_sim_genome)
    for chrom_ref, seq_ref, qual_ref in readfq(fp_ref):
        chrom_sim, seq_sim, qual_sim = sim_reader.next()
        if chrom_ref != chrom_sim:
            print >>sys.stderr, '[Error]: Diff chromosome!'
            sys.exit(1)
        last = read_ann.ann(fp_ann.readline())
    fp_ref.close()
    fp_sim_genome.close()
    fp_ann.close()
def main(debug=False): parser = argparse.ArgumentParser() parser.add_argument("path", help="path for sequence pasta file", type=str) parser.add_argument("groundtruth", help="path for sequence pasta file", type=str) parser.add_argument("k", help="size of kmer", type=int) parser.add_argument("perm", help="number of permutation", type=int) try: args = parser.parse_args() except: parser.print_help() sys.exit(1) handle = open(args.path) truth = collections.defaultdict(dict) with open(args.groundtruth) as f1: for line in f1: sp = line.split() query_id = sp[0] target_id = sp[1] overlap_size = int(sp[2]) truth[query_id][target_id] = overlap_size minhash_dict = {} minhash_rc_dict = {} for name, seq, qual in readfq(handle): m_hash = kmer_minhash(seq, args.k, args.perm) minhash_dict[name] = m_hash minhash_rc_dict[name] = kmer_minhash(seq, args.k, args.perm, rc=True) identity_true = [] identity_false = [] identities = [] jaccard_true = [] jaccard_false = [] jaccards = [] for id in minhash_dict.keys(): for query_id in minhash_dict.keys(): if id != query_id: jaccard = minhash_dict[query_id].jaccard(minhash_dict[id]) identity = minhash_dict[query_id].identity(minhash_dict[id]) jaccards.append(jaccard) identities.append(identity) if id in truth[query_id]: jaccard_true.append(jaccard) identity_true.append(identity) else: jaccard_false.append(jaccard) identity_false.append(identity) cur_dir = os.path.dirname(args.path) cur_file = os.path.basename(args.path).split(".")[0] jaccard_file = os.path.join( cur_dir, cur_file + '_jaccard_k_{}_perm_{}.png'.format(args.k, args.perm)) identity_file = os.path.join( cur_dir, cur_file + '_identity_k_{}_perm_{}.png'.format(args.k, args.perm)) plt.figure() hist, bins = np.histogram(identities, bins=50) width = 0.7 * (bins[1] - bins[0]) center = (bins[:-1] + bins[1:]) / 2 hist1, bins = np.histogram(identity_true, bins=bins, normed=True) center = (bins[:-1] + bins[1:]) / 2 plt.bar(center, hist1, align='center', width=width, label="True", alpha=0.7) hist1, bins = np.histogram(identity_false, bins=bins, normed=True) center = (bins[:-1] + bins[1:]) / 2 plt.bar(center, hist1, align='center', width=width, label="False", alpha=0.7) plt.legend() plt.savefig(identity_file) plt.figure() hist, bins = np.histogram(jaccards, bins=50) width = 0.7 * (bins[1] - bins[0]) center = (bins[:-1] + bins[1:]) / 2 hist1, bins = np.histogram(jaccard_true, bins=bins, normed=True) center = (bins[:-1] + bins[1:]) / 2 plt.bar(center, hist1, align='center', width=width, label="True", alpha=0.7) hist1, bins = np.histogram(jaccard_false, bins=bins, normed=True) center = (bins[:-1] + bins[1:]) / 2 plt.bar(center, hist1, align='center', width=width, label="False", alpha=0.7) plt.legend() plt.savefig(jaccard_file)
sys.stderr.write("[NOTE] %d samples with metadata loaded\n" % len(parsed_metadata)) if len(seen_pags) != len(best_pags): missing = set(best_pags.values()) - seen_pags for pag in missing: sys.stderr.write( "[WARN] Best PAG found for %s but not matched to metadata\n" % pag) sys.exit(3) # Load the FASTA, lookup and emit the sample_date and genome sequence print(','.join([ "COG-ID", "Sample_date", "Adm1", "Pillar", "Published_date", "Sequence", ])) with open(args.fasta) as all_fh: for name, seq, qual in readfq(all_fh): central_sample_id = name print(','.join([ central_sample_id, parsed_metadata[central_sample_id]["collection_or_received_date"], parsed_metadata[central_sample_id]["adm1"], parsed_metadata[central_sample_id]["collection_pillar"], parsed_metadata[central_sample_id]["published_date"], seq, ]))
    print usage
    sys.exit(1)

fp_ref = open(sys.argv[1], 'r')
fp_var = open(sys.argv[2], 'r')
fp_sim_genome = open(sys.argv[3] + ".simGenome.fa", 'w')
fp_sim_ann = open(sys.argv[3] + ".simAnn", 'w')

sim_genome = ''
cur_var = None
line = fp_var.readline().strip()
if len(line) == 0:
    print >>sys.stderr, "File %s is empty" % (sys.argv[2])
    sys.exit(1)
cur_var = var(line)

for chrom, seq, qual in readfq(fp_ref):
    sim_genome_len = 0
    sim_genome = ''
    non_variant_region_start = 1
    non_variant_region_end = 1
    sim_start = -1
    sim_end = -1
    while cur_var.chr == chrom:
        non_variant_region_end = cur_var.pos_start
        if cur_var.type == 'SNV' or cur_var.type == 'SNP':
            sim_genome += seq[non_variant_region_start - 1:non_variant_region_end] + cur_var.seq
            sim_genome_len += non_variant_region_end - non_variant_region_start + 1 + 1
            #print >>sys.stderr, '>Ref_%d_%d\n%s'%(non_variant_region_start-1, non_variant_region_end+1, seq[non_variant_region_start-1:non_variant_region_end+1])
            #print >>sys.stderr, '@Sim SNP\n%s'%(seq[non_variant_region_start-1:non_variant_region_end]+cur_var.seq)
#!/usr/bin/env python
# nuccount.py -- tally nucleotides in a file
import sys
from collections import Counter
from readfq import readfq

IUPAC_BASES = "ACGTRYSWKMBDHVN-."

# initialize counter
counts = Counter()
for name, seq, qual in readfq(sys.stdin):
    # for each sequence entry, add all its bases to the counter
    counts.update(seq.upper())

# print the results
for base in IUPAC_BASES:
    print(base + "\t" + str(counts[base]))
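# Usage sketch (any FASTA/FASTQ on stdin; prints one tab-separated count per
# IUPAC base, including zero counts):
#   cat contigs.fa | python nuccount.py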
def main(): parser = argparse.ArgumentParser() parser.add_argument("--ref", required=True) parser.add_argument("--msa", required=True) parser.add_argument("-n", type=int, required=True, help="number of sequences to process, will be divided amongst threads") parser.add_argument("-t", "--threads", type=int, default=4) args = parser.parse_args() # Check files exist for fpt, fp in ("REF", args.ref), ("MSA", args.msa): if not os.path.isfile(fp): sys.stderr.write("[FAIL] Could not open %s %s.\n" % (fpt, fp)) sys.exit(1) else: sys.stderr.write("[NOTE] %s: %s\n" % (fpt, fp)) sys.stderr.write("[NOTE] NUM_SEQUENCES: %d\n" % args.n) sys.stderr.write("[NOTE] ASKLEPIAN_VARIANT_THREADS: %d\n" % args.threads) # Load the ref and assign it to ref_seq with open(args.ref) as canon_fh: for name, seq, qual in readfq(canon_fh): break if not name: sys.stderr.write("[FAIL] Could not read sequence from reference.\n") sys.exit(2) else: ref_seq = seq write_q = multiprocessing.Queue() processes = [] writer_process = multiprocessing.Process( target=write_worker, args=( write_q, args.threads, ), ) processes.append(writer_process) window_l = ceil(args.n / float(args.threads)) for window_i, window_pos in enumerate(range(0, args.n, window_l)): start = window_pos end = window_pos + window_l - 1 # remove 1 as we dont use gte in worker if window_i == (args.threads - 1): end = args.n # in case we've managed to screw the last window and its too short, just set it to N sys.stderr.write("[WORK] Worker %d (%d, %d)\n" % (window_i, start, end)) p = multiprocessing.Process( target=variant_worker, args=( write_q, ref_seq, args.msa, window_i, start, end, ), ) processes.append(p) # Engage for p in processes: p.start() # Block for p in processes: p.join() sys.stderr.write("[DONE] All workers exited, bye!\n")
    if not (isinstance(winSize, int) and isinstance(step, int)):
        raise Exception("**ERROR** type(winSize) and type(step) must be int.")
    if step > winSize:
        raise Exception("**ERROR** step must not be larger than winSize.")
    if winSize > len(sequence):
        raise Exception("**ERROR** winSize must not be larger than sequence length.")

    # Pre-compute number of chunks to emit (integer division keeps range() happy
    # under Python 3)
    numOfChunks = ((len(sequence) - winSize) // step) + 1

    # Do the work
    for i in range(0, numOfChunks * step, step):
        yield sequence[i:i + winSize]


with open(reference, 'rb') as fh:
    for _, seq, _ in readfq.readfq(fh):
        for i, kmer in enumerate(sliding_window(seq, kmer_size)):
            bloom.add(kmer, i)

i = 0
with open(reference, 'rb') as fh:
    for _, seq, _ in readfq.readfq(fh):
        for i, kmer in enumerate(sliding_window(seq, kmer_size)):
            if i % 5 == 0:
                bloom.delete(kmer, i)

bloom.flush()
del bloom
bloom = pydablooms.load_dabloom(capacity=capacity, error_rate=error_rate,
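# Quick check of the chunking above, assuming the (truncated) signature is
# sliding_window(sequence, winSize, step=1):
#   list(sliding_window("ACGTACGT", 4, 2)) -> ['ACGT', 'GTAC', 'ACGT']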
def variant_worker(write_q, ref_seq, msa, window_i, start_record_i, end_record_i):
    # Open the MSA, iterate over each sequence and walk the genome to find
    # disagreements with the loaded reference
    # NOTE This particular MSA does not handle insertions
    record_i = -1
    first = True
    with open(msa, 'r') as all_fh:
        for name, seq, qual in readfq(all_fh):
            record_i += 1
            if record_i < start_record_i:
                continue
            if record_i > end_record_i:
                break
            if first:
                sys.stderr.write("[STAT] Worker %d started on record %d\n" % (window_i, record_i))
                first = False

            central_sample_id = name
            query_on_ref_pos = 0
            current_deletion_len = 0
            curr_lines = []
            for qbase in seq:
                if qbase == '-':
                    # Extend the length of the current deletion
                    current_deletion_len += 1
                else:
                    if current_deletion_len > 0:
                        # We've come to the end of a deletion, output it
                        curr_lines.append(','.join([
                            central_sample_id,
                            str((query_on_ref_pos - current_deletion_len) + 1),
                            "",
                            "%dD" % current_deletion_len,
                            "1",
                        ]))
                        current_deletion_len = 0
                    # Now deletions are handled, check for single nucleotide variants
                    # NOTE This includes missing data such as N
                    # NOTE This algorithm does not consider INS against ref
                    if qbase != ref_seq[query_on_ref_pos]:
                        if current_deletion_len == 0:
                            # SNV detected and we aren't in an active DEL
                            curr_lines.append(','.join([
                                central_sample_id,
                                str(query_on_ref_pos + 1),
                                ref_seq[query_on_ref_pos],
                                qbase,
                                "0",
                            ]))
                # Advance pointer (this is overkill here but a useful starting point
                # for a future algo walking the ref for insertions)
                query_on_ref_pos += 1

            if current_deletion_len > 0:
                # Output the last deletion, if there is one
                # (this is almost always going to be garbage but we include it for completeness)
                curr_lines.append(','.join([
                    central_sample_id,
                    str((query_on_ref_pos - current_deletion_len) + 1),
                    "",
                    "%dD" % current_deletion_len,
                    "1",
                ]))

            # Push curr lines to writer
            write_q.put('\n'.join(curr_lines) + '\n')

    # Break out, send sentinel to queue
    sys.stderr.write("[DONE] Worker %d finished at next record %d\n" % (window_i, record_i))
    write_q.put(None)
fastq_file1 = open(read_files[0], 'r')
fastq_file2 = open(read_files[1], 'r')
pool = Pool(processes=int(args.threads))

read_comparisons = list()
amplicon_reads = dict()
amplicon_fastq_file_pairs = dict()
no_read_amplicons = list()
num_processed = 0
num_perfect = 0
num_mismatched = 0

sys.stdout.write("Sorting reads into appropriate amplicons\n")
for result in pool.imap(match_read_primers,
                        itertools.izip(readfq.readfq(fastq_file1),
                                       readfq.readfq(fastq_file2),
                                       itertools.repeat(primer_sets)),
                        100000):
    num_processed += 1
    if (num_processed % 10000) == 0:
        sys.stdout.write("Processed %s reads\n" % num_processed)
    # Pairs whose mates map to different amplicons go to a 'mismatched' bucket
    if result['ratio1_amplicon'] != result['ratio2_amplicon']:
        num_mismatched += 1
        read1_filename = "%s-mismatched_R1.fastq" % args.output
        read2_filename = "%s-mismatched_R2.fastq" % args.output
        write_fastq_pairs(read1_filename, read2_filename, result)
        continue
    read1_fastq_name = "%s-%s_R1.fastq" % (args.output, result['ratio1_amplicon'])
    read2_fastq_name = "%s-%s_R2.fastq" % (args.output, result['ratio2_amplicon'])
    write_fastq_pairs(read1_fastq_name, read2_fastq_name, result)