Beispiel #1
0
 def test_extractedSeq(self):
     """
     For every record in the SAM file at self.sam_location, check that the
     sequence returned by BisPin_postprocess.getReferenceSequence has the
     same length and the same content as the record's SEQ field.
     """
     records = SeqIterator.SeqIterator(self.sam_location,
                                       file_type=Constants.SAM)
     for sam_record in records:
         # Pull the fields needed to look the read up in the reference.
         ref_name = sam_record[Constants.SAM_KEY_RNAME]
         start = int(sam_record[Constants.SAM_KEY_POS])
         read_seq = sam_record[Constants.SAM_KEY_SEQ]
         read_len = len(read_seq)
         cigar_tokens = BisPin_util.tokenizeCigar(
             sam_record[Constants.SAM_KEY_CIGAR])
         extracted = BisPin_postprocess.getReferenceSequence(
             self.ref_dictionary, ref_name, start, read_len, cigar_tokens)
         # Compare the length first, then the sequence itself.
         self.assertEqual(read_len, len(extracted))
         self.assertEqual(read_seq, extracted)
Beispiel #2
0
 def test_getReadStrFastaStr(self):
     """
     getReadStrFastaStr(CONV_CT_GA_CT) must return a pair whose first item
     equals CONV_CT_GA and whose second item equals CONV_CT.
     """
     pair = BisPin_util.getReadStrFastaStr(Constants.CONV_CT_GA_CT)
     read_part, fasta_part = pair
     self.assertEqual(Constants.CONV_CT_GA, read_part)
     self.assertEqual(Constants.CONV_CT, fasta_part)
 def test_tokenizeCigar01(self):
     """A CIGAR string with M, I and D operations is split into (count, op) tuples."""
     expected = [
         (2, 'M'),
         (1, 'I'),
         (27, 'M'),
         (1, 'D'),
         (11, 'M'),
     ]
     actual = BisPin_util.tokenizeCigar('2M1I27M1D11M')
     self.assertEqual(actual, expected)
 def test_tokenizeMDtag04(self):
     """An MD tag with a multi-base '^ACT' token is tokenized as one string element."""
     expected = [5, 'T', 1, 'T', 0, 'T', 21, 'G', 1, '^ACT', 2]
     self.assertEqual(BisPin_util.tokenizeMDtag('5T1T0T21G1^ACT2'),
                      expected)
 def test_tokenizeMDtag03(self):
     """An MD tag with a three-digit count and a trailing base is tokenized correctly."""
     expected = [5, 'T', 100, 'T', 0, 'T']
     actual = BisPin_util.tokenizeMDtag('5T100T0T')
     self.assertEqual(actual, expected)
 def test_tokenizeMDtag02(self):
     """An MD tag that is a single number yields a one-element integer list."""
     actual = BisPin_util.tokenizeMDtag('51')
     self.assertEqual(actual, [51])
 def test_tokenizeMDtag01(self):
     """An MD tag with a single-base '^A' token is tokenized correctly."""
     expected = [5, 'T', 1, 'T', 0, 'T', 21, 'G', 1, '^A', 2]
     actual = BisPin_util.tokenizeMDtag('5T1T0T21G1^A2')
     self.assertEqual(actual, expected)
 def test_tokenizeCigar02(self):
     """A CIGAR string with a single operation yields a one-element list."""
     actual = BisPin_util.tokenizeCigar('99M')
     self.assertEqual(actual, [(99, 'M')])
Beispiel #9
0
def main():
    """
    The entry point into the program.

    Parses the command line, locates the BFAST executable, and creates BFAST
    indices for the reference genome FASTA file given as the single
    positional argument.  Without --regular, two indices are built (reported
    as "C to T" and "G to A"), either in parallel or, with --sequential, one
    after the other.  With --regular a single index of the unconverted genome
    is built.  Progress and the elapsed time are written to standard error.

    Invalid arguments are reported through OptionParser.error(), which prints
    the message and exits the program.

    Relies on names defined outside this function: logstr, fasta2brg,
    create_index and create_regular_index.
    """
    #Create the command line interface.
    usage = "usage: %prog [options] <reference_genome_file> "
    version = "%prog " + Constants.version
    description = "Create indices for the reference genome.  The file naming convention for the converted genome is expected to be in BisPin_covert format."
    epilog = Constants.creation_string
    p = optparse.OptionParser(
        usage=usage,
        version=version,
        description=description,
        epilog=epilog,
        formatter=IndentedHelpFormatterWithNL.IndentedHelpFormatterWithNL())
    p.add_option(
        '--path',
        '-P',
        help=
        'The path to the BFAST executable file.  If this option is not used, then the PATH variable will be searched for the executable.',
        default=None)
    p.add_option(
        '--tmpDir',
        '-T',
        help=
        'Specifies the directory to store temporary files.  [default: the directory where the outputfile is located.]',
        default=None)
    p.add_option(
        '--regular',
        '-r',
        help=
        'Create an index for the regular unconverted reference genome. This is for processing hairpin data for the recovery step.  [default: %default]',
        action='store_true',
        default=False)
    #p.add_option('--aligner', '-a', help='The code string for the aligner to use.  [default: %default]\nOptions:\n' + Constants.BFAST +'\n' + Constants.BWA, default=Constants.BFAST)
    p.add_option(
        '--sequential',
        '-s',
        help=
        'Create the indices for the converted reference genome in sequence instead of in parallel. [default: %default]',
        action='store_true',
        default=False)
    bfast_group = optparse.OptionGroup(p, "BFAST index options.")
    bfast_group.add_option(
        '--mask',
        '-m',
        help=
        'The mask or spaced seed to use. The mask is a set of zero and ones (must start and end with a one). [default: %default]',
        default='11111111111111111111')
    bfast_group.add_option(
        '--hashWidth',
        '-w',
        help=
        'The length of the hashed string (key size for the hash index). This must be less than or equal to the number of ones in the mask. [default: %default]',
        default=14)
    bfast_group.add_option(
        '--indexNumber',
        '-i',
        help=
        'Specifies this is the ith index you are creating. This is useful when multiple indexes from the same reference are to be created (in the same space). [default: %default]',
        default='1')
    #p.add_option('--num_threads', '-n', help='The number of threads for each BFAST process to use. [default: %default]', default='1')
    #p.add_option('--depth' , '-d', help= 'The depth of splitting (d).  The index will be split into 4^d parts. [default: %default]', default='0')
    p.add_option_group(bfast_group)
    #     bwa_group = optparse.OptionGroup(p, "BWA index options.")
    #     p.add_option('--block_size', '-b', help='block size for the bwtsw algorithm [default: %default]', default='10000000')
    #     p.add_option_group(bwa_group)
    #Extract arguments
    options, args = p.parse_args()
    now = datetime.datetime.now()
    if len(args) == 0:
        # Show the full help before p.error() below terminates the program.
        p.print_help()
    if len(args) != 1:
        p.error("There must be one argument.")
    fastafile = args[0]
    #path_to_aligner = args[0]
    # The aligner is hard-coded to BFAST while the --aligner option is disabled.
    aligner_type = Constants.BFAST  #options.aligner.upper()
    if aligner_type != Constants.BFAST and aligner_type != Constants.BWA:
        p.error(
            "The aligner is not recognized.  Please choose the aligner from the given options."
        )
    # Resolve the executable: search for "bfast" unless a path was supplied.
    path_to_aligner = BisPin_util.which(
        "bfast" if options.path is None else options.path)
    if path_to_aligner is None:
        p.error(
            "The BFAST executable could not be found.  Please check the path.")
    path_to_aligner = str(path_to_aligner)
    try:
        hashWidth = int(options.hashWidth)
    except ValueError:
        p.error("The hash width argument must be a positive integer.")
    if hashWidth <= 0:
        # int() accepts zero and negative numbers; reject them explicitly so
        # the behavior matches the error message.
        p.error("The hash width argument must be a positive integer.")
    regular = options.regular
    if not os.path.exists(fastafile):
        p.error("The file at %s could not be located." % fastafile)
    if not os.path.exists(path_to_aligner):
        p.error("The aligner could not be found at %s" % path_to_aligner)
    mask = options.mask
    # The index helpers receive these settings as strings.
    hashWidth = str(hashWidth)
    indexNumber = str(options.indexNumber)
    sequential = options.sequential
    tmpDir = options.tmpDir
    num_threads = '1'  #options.num_threads  # Multithreading for constructing indices does not seem to do anything.
    depth = '0'  #options.depth #Unsupported right now.
    filename = os.path.basename(fastafile)
    directory = os.path.dirname(fastafile)
    if not tmpDir:
        # os.path.dirname() returns "" for a bare filename.  Appending "/" to
        # an empty string would point the temporary directory at the
        # filesystem root, so fall back to the current directory instead.
        tmpDir = directory or "."
    tmpDir = str(tmpDir)
    if not tmpDir.endswith("/"):
        tmpDir += "/"
    #Create indices
    sys.stderr.write("%sCreating indices for %s\n" % (logstr, fastafile))
    sys.stderr.flush()
    if aligner_type == Constants.BFAST:  #Create BFAST indices
        sys.stderr.write(
            logstr +
            "Converting the FASTA format into the BFAST binary version (BRG)...\n"
        )
        sys.stderr.flush()
        # The newline goes after the aligner name so each message ends its
        # own line.
        finish_string = "%sFinished creating %s index with mask %s and hash width %s for %s with " + aligner_type + "\n"
        if not regular:
            # Build both BRG files in parallel.  The trailing boolean
            # presumably selects the C-to-T (True) or G-to-A (False)
            # conversion -- confirm against fasta2brg.
            p_CT_bin = multiprocessing.Process(target=fasta2brg,
                                               args=(path_to_aligner,
                                                     directory, filename,
                                                     True))
            p_CT_bin.start()
            p_GA_bin = multiprocessing.Process(target=fasta2brg,
                                               args=(path_to_aligner,
                                                     directory, filename,
                                                     False))
            p_GA_bin.start()
            p_CT_bin.join()
            p_GA_bin.join()
            sys.stderr.write(logstr +
                             "Finished creating the BFAST BRG files!\n")
            p_CT = multiprocessing.Process(
                target=create_index,
                args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber,
                      depth, num_threads, tmpDir, True))
            p_CT.start()
            if sequential:
                # Sequential mode: finish the first index before the second
                # one is started.
                p_CT.join()
                sys.stderr.write(
                    finish_string %
                    (logstr, "C to T", mask, hashWidth, fastafile))
            p_GA = multiprocessing.Process(
                target=create_index,
                args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber,
                      depth, num_threads, tmpDir, False))
            p_GA.start()
            if not sequential:
                # Parallel mode: both processes are running; wait for the
                # first one here and for the second one below.
                p_CT.join()
                sys.stderr.write(
                    finish_string %
                    (logstr, "C to T", mask, hashWidth, fastafile))
            p_GA.join()
            sys.stderr.write(finish_string %
                             (logstr, "G to A", mask, hashWidth, fastafile))
        else:
            p_orig_bin = multiprocessing.Process(target=fasta2brg,
                                                 args=(path_to_aligner,
                                                       directory, filename,
                                                       True, True))
            p_orig_bin.start()
            p_orig_bin.join()
            sys.stderr.write(logstr +
                             "Finished creating the BFAST BRG files!\n")
            p_orig = multiprocessing.Process(
                target=create_regular_index,
                args=(fastafile, path_to_aligner, mask, hashWidth, indexNumber,
                      depth, num_threads, tmpDir))
            p_orig.start()
            p_orig.join()
            sys.stderr.write(finish_string %
                             (logstr, "original", mask, hashWidth, fastafile))
    elif aligner_type == Constants.BWA:  #Create BWA indices
        # NOTE: placeholder branch.  It cannot be reached while aligner_type
        # is hard-coded to BFAST above, and the processes have no target yet.
        finish_string = "%sFinished creating the %s index with " + aligner_type + "\n"
        if not regular:
            p_CT = multiprocessing.Process()
            p_CT.start()
            if sequential:
                p_CT.join()
                sys.stderr.write(finish_string % (logstr, "C to T"))
            p_GA = multiprocessing.Process()
            p_GA.start()
            if not sequential:
                p_CT.join()
                sys.stderr.write(finish_string % (logstr, "C to T"))
            p_GA.join()
            sys.stderr.write(finish_string % (logstr, "G to A"))
        else:
            p_orig = multiprocessing.Process()
            p_orig.start()
            p_orig.join()
            sys.stderr.write(finish_string % (logstr, "original"))
    sys.stderr.flush()
    later = datetime.datetime.now()
    sys.stderr.write("%sElapsed time -- %s\n" % (logstr, str(later - now)))