Example #1
0
def extract(sam_file, output_file, no_output):
    """
    Split the records of a SAM file into unique, ambiguous, unmapped, and
    filtered categories and count how many fall into each one.

    Consecutive mapped records that share a query name are collected into a
    group.  Each finished group is handed to writeTo() (defined elsewhere),
    which receives the unique and ambiguous writers together with the counts
    dictionary; its return value becomes the new, current group.
    NOTE(review): writeTo presumably writes the group to one of the two
    writers, updates counts, and returns an empty list -- confirm against its
    definition.

    @param sam_file: The SAM input handed to SeqIterator.
    @param output_file: The prefix of the output files.  The suffixes
    ".unique.sam", ".ambig.sam", ".unmap.sam", and ".filt.sam" are appended.
    @param no_output: When True, no output files are created and only the
    counts are computed.
    @return: A dictionary with the keys "unique", "unmap", "ambig", and "filt".
    """
    sam_input = SeqIterator.SeqIterator(sam_file, file_type=Constants.SAM)
    categories = ("unique", "ambig", "unmap", "filt")
    writers = dict.fromkeys(categories)
    # The raw file handles are kept so that they can be closed (and therefore
    # flushed) when the function finishes, even if an error occurs.
    handles = []
    counts = {"unique": 0, "unmap": 0, "ambig": 0, "filt": 0}
    try:
        if not no_output:
            for category in categories:
                handle = open(output_file + "." + category + ".sam", 'w')
                handles.append(handle)
                writers[category] = SeqIterator.SeqWriter(
                    handle, file_type=Constants.SAM)
        last_record = []
        for record in sam_input:
            flag = record[Constants.SAM_KEY_FLAG]
            if isUnmapped(flag):
                # An unmapped record ends the current group.
                last_record = writeTo(last_record, writers["unique"],
                                      writers["ambig"], counts, no_output)
                counts["unmap"] += 1
                if not no_output:
                    writers["unmap"].write(record)
            elif isFiltered(flag):
                # A filtered record ends the current group as well.
                last_record = writeTo(last_record, writers["unique"],
                                      writers["ambig"], counts, no_output)
                counts["filt"] += 1
                if not no_output:
                    writers["filt"].write(record)
            else:
                # A change in query name starts a new group.
                if last_record and record[Constants.SAM_KEY_QNAME] != \
                        last_record[0][Constants.SAM_KEY_QNAME]:
                    last_record = writeTo(last_record, writers["unique"],
                                          writers["ambig"], counts, no_output)
                last_record.append(record)
        # Flush the group that was still open when the input ended.
        writeTo(last_record, writers["unique"], writers["ambig"], counts,
                no_output)
    finally:
        for handle in handles:
            handle.close()
    return counts
Example #2
0
def add_reads_to_queue_multi(q_read, input_file, num_processes):
    """
    Producer side of the multiprocess trimmer.
    Every FASTQ record read from input_file is put on the queue q_read.  After
    the last record, one None value (a 'poison pill') is put on the queue for
    each of the num_processes consumers so that each of them can tell that no
    more reads will arrive.
    """
    for fastq_record in SeqIterator.SeqIterator(input_file, file_type='fastq'):
        q_read.put(fastq_record)
    # One stop signal per consumer process.
    for _ in range(num_processes):
        q_read.put(None)
Example #3
0
 def test_extractedSeq(self):
     """
     For every record in the SAM file at self.sam_location, the reference
     sequence returned by getReferenceSequence must have the same length and
     the same content as the sequence stored in the record.
     """
     for sam_record in SeqIterator.SeqIterator(self.sam_location,
                                               file_type=Constants.SAM):
         ref_name = sam_record[Constants.SAM_KEY_RNAME]
         ref_pos = int(sam_record[Constants.SAM_KEY_POS])
         read_seq = sam_record[Constants.SAM_KEY_SEQ]
         read_len = len(read_seq)
         cigar_tokens = BisPin_util.tokenizeCigar(
             sam_record[Constants.SAM_KEY_CIGAR])
         extracted = BisPin_postprocess.getReferenceSequence(
             self.ref_dictionary, ref_name, ref_pos, read_len, cigar_tokens)
         # Check the length first, then the content.
         self.assertEqual(read_len, len(extracted))
         self.assertEqual(read_seq, extracted)
Example #4
0
def trim_reads(input_file,
               output_file,
               alpha,
               beta,
               gamma,
               m,
               t,
               offset=33,
               process_amount=1,
               balanced=False,
               algorithm=True,
               entropyOnly=False,
               file_type='fastq',
               verbose=False):
    """
    This function is the entry point into the read trimmer.  It is the function that could be called
    by other programs to do read trimming.
    This function creates processes if multiprocessing is used.
    @param input_file: The reads to trim, handed to SeqIterator.
    @param output_file: Either a path (str), which is opened for writing and closed
    again by this function, or an already open file object, which is left open
    for the caller to close.
    @param alpha, beta, gamma, m, t: Trimming parameters that are passed unchanged
    to trim_reads_single or trim_reads_multi.
    @param offset: Passed unchanged to the trimming functions.
    NOTE(review): presumably the quality score offset -- confirm.
    @param process_amount: The number of worker processes.  A value of 1 or less
    trims in the calling process.
    @param balanced, algorithm, entropyOnly, verbose: Passed unchanged to the
    trimming functions.
    @param file_type: The file type used for reading and writing records.
    @return: The number of reads processed, or None when the reads file could not
    be opened in single process mode.
    """
    # Only a file that is opened here is closed here.
    opened_here = isinstance(output_file, str)
    if opened_here:
        output_file = open(output_file, 'w')
    try:
        trimmed_output = SeqIterator.SeqWriter(output_file,
                                               file_type=file_type)
        # A process_amount below 1 would start no workers and send no poison
        # pills, so the writer loop below would wait forever.  Such values are
        # handled in the calling process instead.
        if process_amount <= 1:
            try:
                reads = SeqIterator.SeqIterator(input_file,
                                                file_type=file_type)
            except IOError:
                sys.stderr.write(
                    "Something is wrong with the reads file.  Please check it.\n")
                return
            trim_reads_single(reads, trimmed_output, alpha, beta, gamma, m, t,
                              offset, balanced, algorithm, entropyOnly,
                              file_type, verbose)
            read_count = reads.records_processed()
        else:
            # Need to add records to a queue for the other processes.
            processes = []
            q_read = multiprocessing.Queue()
            q_write = multiprocessing.Queue()
            proc_read = multiprocessing.Process(
                target=add_reads_to_queue_multi,
                args=(q_read, input_file, process_amount))
            proc_read.start()
            # Start the processes.  Each one pulls a record from the queue and processes it.
            for _ in range(process_amount):
                proc = multiprocessing.Process(
                    target=trim_reads_multi,
                    args=(q_read, q_write, alpha, beta, gamma, m, t, offset,
                          balanced, algorithm, entropyOnly, file_type,
                          verbose))
                processes.append(proc)
                proc.start()
            num_quit = 0
            read_count = 0
            # The calling process writes the reads to the output file.
            # Each worker signals that it is done by putting None on q_write.
            while num_quit < process_amount:
                trimmed_read = q_write.get()
                if trimmed_read is None:
                    num_quit += 1
                else:
                    read_count += 1
                    trimmed_output.write(trimmed_read)
            proc_read.join()
            for proc in processes:
                proc.join()
        return read_count
    finally:
        if opened_here:
            output_file.close()
Example #5
0
def convert_seqs(directory,
                 filename,
                 CTorGA,
                 tmpDir=None,
                 file_type=Constants.FASTA,
                 start=None,
                 end=None,
                 gzip_switch=False,
                 checkLength=True,
                 outputfile=None,
                 reverse=False):
    """
    Does the C to T or the G to A conversion of DNA sequences.
    @param directory: A directory where the file to convert will be found.
    @param filename: The filename of the file to convert.
    @param CTorGA: A boolean value when True does a C to T conversion.  Otherwise, a G to A conversion.
    @param tmpDir: A directory to write the converted file to.  When None, the
    converted file is written to directory.  Ignored when outputfile is given.
    @param file_type: The file type of the converted records.  i.e., FASTA or FASTQ
    @param start: The record number in the sequence of records to start with.
    Records are numbered from 0 and the bound is inclusive.
    @param end: The record number in the sequence of records to end with.
    The bound is inclusive.
    @param gzip_switch: If True, the output is written in gzipped compressed format.
    The input must be gzipped as well.
    @param checkLength: If True, records with a sequence of length 0 are reported
    on stderr and are not written to the output.
    @param outputfile: The location of the outputfile to write to.
    @param reverse: Reverses the newly converted sequence when True.
    @return: A list consisting of the output file represented as a string.
    """
    my_directory = directory if tmpDir is None else tmpDir
    if outputfile is None:
        str1 = Constants.CONV_CT if CTorGA else Constants.CONV_GA
        outputfile = os.path.join(my_directory, filename + ".BisPin." + str1)
    # Upper and lower case substitutions for the requested conversion.
    # Every other character is copied unchanged.
    if CTorGA:
        substitutions = {"C": "T", "c": "t"}
    else:
        substitutions = {"G": "A", "g": "a"}
    fd_convert = getPossibleGZIPFile(outputfile, gzip_switch)
    try:
        unconverted_seqs = SeqIterator.SeqIterator(
            os.path.join(directory, filename),
            file_type=file_type,
            gzip_switch=gzip_switch)
        converted_seqs = SeqIterator.SeqWriter(fd_convert,
                                               file_type=file_type)
        for counter, rec in enumerate(unconverted_seqs):
            if start is not None and counter < start:
                continue
            if end is not None and counter > end:
                # The record numbers only increase, so nothing after this
                # point can be inside the requested range.
                break
            my_seq = rec[1]
            if checkLength and len(my_seq) == 0:
                sys.stderr.write("A sequence was found of length 0:\t%s\n" %
                                 (str(rec)))
                continue
            new_seq = "".join(
                substitutions.get(base, base) for base in my_seq)
            if reverse:
                new_seq = new_seq[::-1]
            if file_type == Constants.FASTQ:
                my_seq = (rec[0], new_seq, rec[2], rec[3])
            else:
                my_seq = (rec[0], new_seq)
            converted_seqs.write(my_seq)
    finally:
        # Close the output even if reading or writing a record fails.
        fd_convert.close()
    return [outputfile]
def _call_to_file(args, stdout_path):
    """Run args with subprocess.call, writing its standard output to stdout_path."""
    # The with statement closes the redirect target as soon as the call ends.
    with open(stdout_path, 'w') as stdout_handle:
        return subprocess.call(args, stdout=stdout_handle)


def _simulate_reads(input_file, read_amount):
    """Run dwgsim and dwgsim_postprocess, and return the path of the combined FASTQ file."""
    for suffix in (".GA", ".CT"):
        subprocess.call([
            "/home/jsporter/DWGSIM/dwgsim", "-e", "0.012", "-E", "0.012", "-d",
            "250", "-s", "30", "-S", "0", "-N",
            str(read_amount), "-c", "0", "-1", "100", "-2", "100",
            "/research/jsporter/Data/genome/GRCh38.p9/methyl-convert/GRCh38.p9.methyl-convert"
            + suffix + ".fa",
            input_file + suffix
        ])
    _call_to_file([
        "/home/jsporter/BS_Simulation/dwgsim_postprocess", "-p", "2", "-o",
        "1", input_file + ".GA" + ".bwa.read2.fastq"
    ], input_file + ".GA" + ".post")
    _call_to_file([
        "/home/jsporter/BS_Simulation/dwgsim_postprocess", "-p", "1", "-o",
        "0", input_file + ".CT" + ".bwa.read1.fastq"
    ], input_file + ".CT" + ".post")
    combined = input_file + ".dwgsim.automate" + ".fastq"
    _call_to_file(
        ["cat", input_file + ".CT" + ".post", input_file + ".GA" + ".post"],
        combined)
    return combined


def _run_walt(fastq, threads):
    """Call the walt binary on fastq with the output file fastq + '.walt.sam'."""
    subprocess.call([
        "/home/jsporter/walt-1.0/bin/walt", "-t", threads, "-i",
        "/research/jsporter/Data/genome/GRCh38.p9/walt/GRCh38.p9.multiLine.fa.index.walt.dbindex",
        "-o", fastq + ".walt.sam", "-r", fastq
    ])


def _run_bismark(fastq, threads):
    """Call the bismark program on fastq with the --sam option."""
    subprocess.call([
        "/home/jsporter/bismark_v0.16.3/bismark", "-p", threads,
        "--sam", "/research/jsporter/Data/genome/GRCh38.p9/bismark/",
        fastq
    ])


def _run_bwameth(fastq):
    """Call bwameth.py on fastq and then countBWA.py on the SAM file it wrote."""
    _call_to_file([
        "/home/jsporter/bwa-meth/bwameth.py", "--reference",
        "/research/jsporter/Data/genome/GRCh38.p9/bwa-meth/GRCh38.p9.fa",
        fastq
    ], fastq + ".BWAMeth.sam")
    _call_to_file(["countBWA.py", fastq + ".BWAMeth.sam"],
                  fastq + ".BWAMeth.count")


def _run_bispin(fastq, threads):
    """Call BisPin_align.py on fastq, writing fastq + '.BisPin.sam' and '.BisPin.out'."""
    _call_to_file([
        "BisPin_align.py", "-W", "-n", threads, "-i", "1", "-I", "2",
        "/research/jsporter/Data/genome/GRCh38.p9/BisFAST/GRCh38.p9.fa",
        fastq + ".BisPin.sam", fastq
    ], fastq + ".BisPin.out")


def _score_alignments(fastq, total_reads):
    """Call calculateSimulationAccuracy.py on each of the four SAM files made for fastq."""
    sam_and_report = [
        (fastq + ".BisPin.sam", fastq + ".BisPin.acc"),
        (fastq.replace(".fastq", "") + "_bismark_bt2.sam",
         fastq + ".bismark.acc"),
        (fastq + ".BWAMeth.sam", fastq + ".BWAMeth.acc"),
        (fastq + ".walt.sam", fastq + ".walt.acc"),
    ]
    for sam_path, report_path in sam_and_report:
        _call_to_file([
            "calculateSimulationAccuracy.py", "-d", sam_path,
            str(total_reads)
        ], report_path)


def automate(input_file, output_file_prefix, process_amount, accuracy,
             simulate, read_amount):
    """
    Runs walt, bismark, bwameth, and BisPin on the untrimmed reads and then on
    the reads trimmed by InfoTrim.trim_reads for a grid of parameter values.
    The return codes of the external programs are not checked.
    @param input_file: The FASTQ file of reads.  When simulate is True, it is
    used as the prefix of the simulated files, and the combined simulated FASTQ
    file is used as the reads.
    @param output_file_prefix: The prefix of the trimmed FASTQ files.
    @param process_amount: The number of processes given to the programs.
    @param accuracy: When True, calculateSimulationAccuracy.py is called on
    every SAM file with the number of untrimmed reads.
    @param simulate: When True, reads are generated with dwgsim first.
    @param read_amount: The value given to the dwgsim -N option.
    """
    threads = str(process_amount)
    if simulate:
        input_file = _simulate_reads(input_file, read_amount)
    total_reads = SeqIterator.SeqIterator(input_file,
                                          file_type='fastq').count()
    # Baseline: the reads without trimming.
    _run_walt(input_file, threads)
    _run_bismark(input_file, threads)
    _run_bwameth(input_file)
    _run_bispin(input_file, threads)
    if accuracy:
        _score_alignments(input_file, total_reads)
    # The grid covers every beta and gamma in steps of 0.1 with
    # beta + gamma <= 1.  alpha is what remains, so the three sum to 1.
    m = 25
    t = 0
    for i in range(0, 11):
        for j in range(0, 11 - i):
            beta = i / 10.0
            gamma = j / 10.0
            alpha = (10 - i - j) / 10.0
            output_file = "%s.s-%s.r-%s.m-%s.fastq" % (
                output_file_prefix, str(beta), str(gamma), str(m))
            InfoTrim.trim_reads(input_file,
                                output_file,
                                alpha,
                                beta,
                                gamma,
                                m,
                                t,
                                process_amount=int(threads))
            _run_bismark(output_file, threads)
            _run_bwameth(output_file)
            _run_walt(output_file, threads)
            _run_bispin(output_file, threads)
            if accuracy:
                _score_alignments(output_file, total_reads)