def extract(sam_file, output_file, no_output):
    """
    Split the records of a SAM file into unmapped, filtered and
    mapped groups, counting each category.

    Records whose FLAG satisfies isUnmapped or isFiltered are counted and
    written straight away.  Every other record is buffered for as long as
    consecutive records share the same QNAME; each finished buffer is handed
    to writeTo together with the unique and ambiguous writers and the counts
    dictionary.
    NOTE(review): writeTo is defined elsewhere; it is presumably what
    increments the "unique"/"ambig" counts and picks the writer -- confirm.

    @param sam_file: The SAM input handed to SeqIterator.
    @param output_file: Prefix of the output files.  ".unique.sam",
        ".ambig.sam", ".unmap.sam" and ".filt.sam" are appended to it.
        Not used when no_output is true.
    @param no_output: When true, no files are created and only the counts
        are computed.
    @return: The counts dictionary with the keys "unique", "unmap", "ambig"
        and "filt".
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    output_unique = output_ambig = output_unmap = output_filt = None
    # Raw file objects are tracked so that they can be closed (and therefore
    # flushed) deterministically, even when processing fails part-way.
    open_handles = []
    try:
        if not no_output:
            writers = []
            for suffix in (".unique.sam", ".ambig.sam",
                           ".unmap.sam", ".filt.sam"):
                handle = open(output_file + suffix, 'w')
                open_handles.append(handle)
                writers.append(
                    SeqIterator.SeqWriter(handle, file_type=Constants.SAM))
            output_unique, output_ambig, output_unmap, output_filt = writers
        last_record = []
        counts = {"unique": 0, "unmap": 0, "ambig": 0, "filt": 0}
        for record in sam_input:
            flag = record[Constants.SAM_KEY_FLAG]
            if isUnmapped(flag):
                # An unmapped record ends the group that was being collected.
                last_record = writeTo(last_record, output_unique,
                                      output_ambig, counts, no_output)
                counts["unmap"] += 1
                if not no_output:
                    output_unmap.write(record)
            elif isFiltered(flag):
                # A filtered record ends the current group as well.
                last_record = writeTo(last_record, output_unique,
                                      output_ambig, counts, no_output)
                counts["filt"] += 1
                if not no_output:
                    output_filt.write(record)
            else:
                qname = record[Constants.SAM_KEY_QNAME]
                if (last_record and
                        qname != last_record[0][Constants.SAM_KEY_QNAME]):
                    # A new read name starts: flush the previous group first.
                    last_record = writeTo(last_record, output_unique,
                                          output_ambig, counts, no_output)
                last_record.append(record)
        # Flush whatever group is still pending after the last record.
        writeTo(last_record, output_unique, output_ambig, counts, no_output)
        return counts
    finally:
        for handle in open_handles:
            handle.close()
def add_reads_to_queue_multi(q_read, input_file, num_processes):
    """
    Feed every read of input_file into the queue q_read for consumer
    processes, then signal the end of the input.

    The input is always parsed as 'fastq'.  After the last read, one None
    ('poison pill') is queued per consumer, num_processes in total, so that
    every consumer that takes one knows it can stop.
    """
    for fastq_record in SeqIterator.SeqIterator(input_file,
                                                file_type='fastq'):
        q_read.put(fastq_record)
    # One stop marker for each consumer process.
    for _ in range(num_processes):
        q_read.put(None)
def test_extractedSeq(self):
    # For every record of the SAM file at self.sam_location, fetch the
    # reference sequence for the record's RNAME, POS, read length and CIGAR
    # and check that it has the same length and content as the record's SEQ.
    for sam_record in SeqIterator.SeqIterator(self.sam_location,
                                              file_type=Constants.SAM):
        chromosome = sam_record[Constants.SAM_KEY_RNAME]
        start_position = int(sam_record[Constants.SAM_KEY_POS])
        read_sequence = sam_record[Constants.SAM_KEY_SEQ]
        read_length = len(read_sequence)
        cigar_tokens = BisPin_util.tokenizeCigar(
            sam_record[Constants.SAM_KEY_CIGAR])
        extracted = BisPin_postprocess.getReferenceSequence(
            self.ref_dictionary, chromosome, start_position, read_length,
            cigar_tokens)
        self.assertEqual(read_length, len(extracted))
        self.assertEqual(read_sequence, extracted)
def trim_reads(input_file, output_file, alpha, beta, gamma, m, t, offset=33,
               process_amount=1, balanced=False, algorithm=True,
               entropyOnly=False, file_type='fastq', verbose=False):
    """
    This function is the entry point into the read trimmer.
    It is the function that could be called by other programs to do
    read trimming.  This function creates processes if multiprocessing
    is used.

    @param input_file: The reads to trim.  Handed to SeqIterator when one
        process is used, otherwise to add_reads_to_queue_multi.
        NOTE(review): add_reads_to_queue_multi always parses the input as
        'fastq', whatever file_type is -- confirm this is intended.
    @param output_file: A path string or an already open writable file
        object.  A path is opened for writing and closed again before this
        function returns; a file object supplied by the caller is left open.
    @param alpha, beta, gamma, m, t, offset, balanced, algorithm,
        entropyOnly, verbose: Trimming settings forwarded unchanged to
        trim_reads_single or trim_reads_multi.
    @param process_amount: Number of trimming processes.  With 1 the work is
        done in the calling process; otherwise one reader process and
        process_amount worker processes are started and the calling process
        writes the results.
    @param file_type: Record format used for the writer (and for the reader
        when process_amount is 1).
    @return: The number of reads processed, or None when the reads file
        could not be opened in single-process mode.
    @raise ValueError: If process_amount is smaller than 1.
    """
    if process_amount < 1:
        # Without any worker no stop marker would ever arrive on the write
        # queue and the collection loop below would block forever.
        raise ValueError(
            "process_amount must be at least 1, got %s" % (process_amount,))
    opened_here = isinstance(output_file, str)
    if opened_here:
        output_file = open(output_file, 'w')
    try:
        trimmed_output = SeqIterator.SeqWriter(output_file,
                                               file_type=file_type)
        if process_amount == 1:
            try:
                reads = SeqIterator.SeqIterator(input_file,
                                                file_type=file_type)
            except IOError:
                sys.stderr.write(
                    "Something is wrong with the reads file. Please check it.\n")
                return None
            trim_reads_single(reads, trimmed_output, alpha, beta, gamma, m,
                              t, offset, balanced, algorithm, entropyOnly,
                              file_type, verbose)
            return reads.records_processed()
        # Need to add records to a queue for the other processes.
        q_read = multiprocessing.Queue()
        q_write = multiprocessing.Queue()
        proc_read = multiprocessing.Process(
            target=add_reads_to_queue_multi,
            args=(q_read, input_file, process_amount))
        proc_read.start()
        # Start the processes.  Each one pulls a record from the queue and
        # processes it.
        processes = []
        for _ in range(process_amount):
            proc = multiprocessing.Process(
                target=trim_reads_multi,
                args=(q_read, q_write, alpha, beta, gamma, m, t, offset,
                      balanced, algorithm, entropyOnly, file_type, verbose))
            processes.append(proc)
            proc.start()
        num_quit = 0
        read_count = 0
        # The calling process writes the reads to the output file.  A None
        # on the write queue is counted as one finished worker; the loop
        # ends once process_amount of them have been received.
        while num_quit < process_amount:
            trimmed_read = q_write.get()
            if trimmed_read is None:
                num_quit += 1
            else:
                read_count += 1
                trimmed_output.write(trimmed_read)
        proc_read.join()
        for proc in processes:
            proc.join()
        return read_count
    finally:
        # Only close what this function opened itself.
        if opened_here:
            output_file.close()
def convert_seqs(directory, filename, CTorGA, tmpDir=None,
                 file_type=Constants.FASTA, start=None, end=None,
                 gzip_switch=False, checkLength=True, outputfile=None,
                 reverse=False):
    """
    Does the C to T or the G to A conversion of DNA sequences.
    @param directory: A directory where the file to convert will be found.
    @param filename: The filename of the file to convert.
    @param CTorGA: A boolean value when True does a C to T conversion.
        Otherwise, a G to A conversion.  Upper and lower case bases are
        converted to the matching case.
    @param tmpDir: A directory to write the converted file to.  When None,
        directory is used.  Ignored when outputfile is given.
    @param file_type: The file type of the converted records.
        i.e., FASTA or FASTQ
    @param start: The zero-based record number to start with (inclusive).
        None means from the first record.
    @param end: The zero-based record number to end with (inclusive).
        None means up to the last record.
    @param gzip_switch: If True, the output is written in gzipped compressed
        format.  The input must be gzipped as well.
    @param checkLength: If True, records with an empty sequence are reported
        on standard error and not written.
    @param outputfile: The location of the outputfile to write to.  When
        None, the name is built from the output directory, filename,
        ".BisPin." and the conversion constant.
    @param reverse: Reverses the newly converted sequence when True.
    @return: A list consisting of the output file represented as a string.
    """
    my_directory = directory if tmpDir is None else tmpDir
    if outputfile is None:
        suffix = Constants.CONV_CT if CTorGA else Constants.CONV_GA
        outputfile = os.path.join(my_directory,
                                  filename + ".BisPin." + suffix)
    # Bases that are not listed are copied through unchanged.
    if CTorGA:
        substitution = {"C": "T", "c": "t"}
    else:
        substitution = {"G": "A", "g": "a"}
    fd_convert = getPossibleGZIPFile(outputfile, gzip_switch)
    try:
        unconverted_seqs = SeqIterator.SeqIterator(
            os.path.join(directory, filename), file_type=file_type,
            gzip_switch=gzip_switch)
        converted_seqs = SeqIterator.SeqWriter(fd_convert,
                                               file_type=file_type)
        for counter, rec in enumerate(unconverted_seqs):
            if end is not None and counter > end:
                # Every later record is past the requested range as well.
                break
            if start is not None and counter < start:
                continue
            my_seq = rec[1]
            if checkLength and len(my_seq) == 0:
                sys.stderr.write(
                    "A sequence was found of length 0:\t%s\n" % (str(rec)))
                continue
            new_seq = "".join(
                [substitution.get(base, base) for base in my_seq])
            if reverse:
                new_seq = new_seq[::-1]
            if file_type == Constants.FASTQ:
                converted = (rec[0], new_seq, rec[2], rec[3])
            else:
                converted = (rec[0], new_seq)
            converted_seqs.write(converted)
    finally:
        # Close the output even when reading or writing fails.
        fd_convert.close()
    return [outputfile]
def automate(input_file, output_file_prefix, process_amount, accuracy,
             simulate, read_amount):
    """
    Run the external mapping tools (walt, bismark, bwa-meth, BisPin) on a
    FASTQ file and on trimmed versions of it for a grid of trimming settings.

    Steps:
      1. When simulate is true, create a simulated FASTQ file with dwgsim
         and use it as the input from then on.
      2. Count the reads in the input.
      3. Run the four tools on the untrimmed input.
      4. For every beta and gamma in steps of 0.1 with beta + gamma <= 1,
         and alpha = 1 - beta - gamma, trim the input with
         InfoTrim.trim_reads (m = 25, t = 0) and run the four tools on the
         trimmed file.
    When accuracy is true, calculateSimulationAccuracy.py is run on every
    SAM file that was produced.

    All tool and genome locations are hard-coded; the scripts referenced
    without a directory must be found on the PATH.

    @param input_file: The FASTQ file to process, or, when simulate is true,
        the prefix for the simulated files.
    @param output_file_prefix: Prefix of the trimmed files, which are named
        "<prefix>.s-<beta>.r-<gamma>.m-<m>.fastq".
    @param process_amount: Number of threads/processes given to the tools
        and to the trimmer.
    @param accuracy: When true, accuracy reports are written.
    @param simulate: When true, reads are simulated first.
    @param read_amount: Number of reads requested from dwgsim (its -N
        option).  Only used when simulate is true.
    """
    threads = str(process_amount)
    if simulate:
        input_file = _simulate_reads(input_file, read_amount)
    total_reads = SeqIterator.SeqIterator(input_file,
                                          file_type='fastq').count()
    # Baseline: the untrimmed reads.
    _run_walt(input_file, threads)
    _run_bismark(input_file, threads)
    _run_bwameth(input_file)
    _run_bispin(input_file, threads)
    if accuracy:
        _score_accuracy(input_file, total_reads)
    m = 25
    t = 0
    for i in range(0, 11):
        # Only combinations with i + j <= 10 are used, so that alpha is
        # never negative.
        for j in range(0, 11 - i):
            beta = i / 10.0
            gamma = j / 10.0
            alpha = (10 - i - j) / 10.0
            output_file = "%s.s-%s.r-%s.m-%s.fastq" % (
                output_file_prefix, str(beta), str(gamma), str(m))
            InfoTrim.trim_reads(input_file, output_file, alpha, beta, gamma,
                                m, t, process_amount=int(threads))
            _run_bismark(output_file, threads)
            _run_bwameth(output_file)
            _run_walt(output_file, threads)
            _run_bispin(output_file, threads)
            if accuracy:
                _score_accuracy(output_file, total_reads)


def _call_with_stdout(command, stdout_path):
    """Run command with its standard output written to stdout_path; the file is closed afterwards."""
    with open(stdout_path, 'w') as sink:
        return subprocess.call(command, stdout=sink)


def _simulate_reads(prefix, read_amount):
    """Simulate reads with dwgsim from the GA and CT converted genomes, post-process both sets and return the path of the combined FASTQ file."""
    genome_template = ("/research/jsporter/Data/genome/GRCh38.p9/"
                       "methyl-convert/GRCh38.p9.methyl-convert.%s.fa")
    for conversion in ("GA", "CT"):
        subprocess.call([
            "/home/jsporter/DWGSIM/dwgsim", "-e", "0.012", "-E", "0.012",
            "-d", "250", "-s", "30", "-S", "0", "-N", str(read_amount),
            "-c", "0", "-1", "100", "-2", "100",
            genome_template % conversion, prefix + "." + conversion
        ])
    postprocess = "/home/jsporter/BS_Simulation/dwgsim_postprocess"
    _call_with_stdout(
        [postprocess, "-p", "2", "-o", "1",
         prefix + ".GA" + ".bwa.read2.fastq"],
        prefix + ".GA" + ".post")
    _call_with_stdout(
        [postprocess, "-p", "1", "-o", "0",
         prefix + ".CT" + ".bwa.read1.fastq"],
        prefix + ".CT" + ".post")
    combined = prefix + ".dwgsim.automate" + ".fastq"
    _call_with_stdout(
        ["cat", prefix + ".CT" + ".post", prefix + ".GA" + ".post"],
        combined)
    return combined


def _run_walt(fastq, threads):
    """Run walt on fastq, writing fastq + ".walt.sam"."""
    subprocess.call([
        "/home/jsporter/walt-1.0/bin/walt", "-t", threads, "-i",
        "/research/jsporter/Data/genome/GRCh38.p9/walt/GRCh38.p9.multiLine.fa.index.walt.dbindex",
        "-o", fastq + ".walt.sam", "-r", fastq
    ])


def _run_bismark(fastq, threads):
    """Run bismark with SAM output on fastq."""
    subprocess.call([
        "/home/jsporter/bismark_v0.16.3/bismark", "-p", threads, "--sam",
        "/research/jsporter/Data/genome/GRCh38.p9/bismark/", fastq
    ])


def _run_bwameth(fastq):
    """Run bwameth.py on fastq into fastq + ".BWAMeth.sam", then countBWA.py on that file."""
    sam_path = fastq + ".BWAMeth.sam"
    _call_with_stdout([
        "/home/jsporter/bwa-meth/bwameth.py", "--reference",
        "/research/jsporter/Data/genome/GRCh38.p9/bwa-meth/GRCh38.p9.fa",
        fastq
    ], sam_path)
    _call_with_stdout(["countBWA.py", sam_path], fastq + ".BWAMeth.count")


def _run_bispin(fastq, threads):
    """Run BisPin_align.py on fastq, writing fastq + ".BisPin.sam" and its standard output to fastq + ".BisPin.out"."""
    bispin_args = [
        "BisPin_align.py", "-W", "-n", threads, "-i", "1", "-I", "2",
        "/research/jsporter/Data/genome/GRCh38.p9/BisFAST/GRCh38.p9.fa",
        fastq + ".BisPin.sam", fastq
    ]
    _call_with_stdout(bispin_args, fastq + ".BisPin.out")


def _score_accuracy(fastq, total_reads):
    """Run calculateSimulationAccuracy.py on the four SAM files produced for fastq, one report file each."""
    sam_and_report = (
        (fastq + ".BisPin.sam", fastq + ".BisPin.acc"),
        (fastq.replace(".fastq", "") + "_bismark_bt2.sam",
         fastq + ".bismark.acc"),
        (fastq + ".BWAMeth.sam", fastq + ".BWAMeth.acc"),
        (fastq + ".walt.sam", fastq + ".walt.acc"),
    )
    for sam_path, report_path in sam_and_report:
        _call_with_stdout(
            ["calculateSimulationAccuracy.py", "-d", sam_path,
             str(total_reads)],
            report_path)