Example #1
0
    def testDiagnoseMulti(self):
        """Exercise summary_stats_from_parse on lists of two input files.

        Every parse uses careful_parse=False and offers DNA, RNA and PROTEIN
        as the candidate datatypes.  The first four fields of the returned
        summary are compared against known values for the test data files.
        """

        def check(paths, datatype, per_file, total, flag):
            # Parse `paths` together and compare summary fields 0..3.
            # A fresh datatype list is built for every call.
            s = summary_stats_from_parse(paths, ["DNA", "RNA", "PROTEIN"],
                                         careful_parse=False)
            self.assertEqual(s[0], datatype)
            self.assertEqual(s[1], per_file)
            self.assertEqual(s[2], total)
            self.assertEqual(s[3], flag)

        multi_dir = data_source_path('testmulti/caenophidia')
        fp = os.path.join(multi_dir, 'caenophidia_mos.fasta')
        fp2 = os.path.join(multi_dir, 'caenophidia_mos2.fasta')
        fp3 = data_source_path('smallrna.fasta')
        fp4 = data_source_path('small.fasta')
        fp5 = data_source_path('smallunaligned.fasta')

        # two taxa names were changed and 5 were deleted, so the union is 116
        check([fp, fp2], "PROTEIN", [(114, 189), (109, 202)], 116, False)

        # The same file given twice: per-file entries repeat, the total
        # stays at 32.
        check([fp3, fp3], "RNA", [(32, 1650), (32, 1650)], 32, True)
        check([fp4, fp4], "DNA", [(32, 1650), (32, 1650)], 32, True)

        # NOTE(review): field 3 flips to False once smallunaligned.fasta is
        # part of the input -- presumably an "all aligned" flag; confirm
        # against summary_stats_from_parse.
        check([fp4, fp5], "DNA", [(32, 1650), (32, 1650)], 32, False)

        # Mixing the protein file with the RNA file must be rejected.
        self.assertRaises(Exception,
                          summary_stats_from_parse, [fp, fp3],
                          ["DNA", "RNA", "PROTEIN"],
                          careful_parse=False)
        # assumes _LOG is a standard logging.Logger (warning() is the
        # documented spelling of the deprecated warn()) -- TODO confirm
        _LOG.warning(
            "WARNING: summary_stats_from_parse will read multi with dna and protein as entirely protein. MIXED data type support is needed!"
        )
Example #2
0
 def testDiagnoseProt(self):
     """Check summary_stats_from_parse on a single protein FASTA file.

     The same summary values are expected with careful_parse off and on,
     and an Exception is expected in both modes when PROTEIN is not among
     the allowed datatypes.
     """
     fp = data_source_path('caenophidia_mos.fasta')
     # Parenthesised form prints the same text on Python 2 and is valid
     # syntax on Python 3.
     print(fp)
     for careful in (False, True):
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "PROTEIN")
         self.assertEqual(s[1], [(114, 189)])
         self.assertEqual(s[2], 114)
         self.assertEqual(s[3], False)
     for careful in (False, True):
         self.assertRaises(Exception, summary_stats_from_parse, [fp],
                           ["DNA", "RNA"], careful_parse=careful)
Example #3
0
 def testDiagnoseRNA(self):
     """Check summary_stats_from_parse on a single RNA FASTA file.

     The same summary values are expected with careful_parse off and on.
     Rejection of the file when RNA is not an allowed datatype is only
     asserted for careful_parse=False (see the logged warning).
     """
     fp = data_source_path('smallrna.fasta')
     # Parenthesised form prints the same text on Python 2 and is valid
     # syntax on Python 3.
     print(fp)
     for careful in (False, True):
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "RNA")
         self.assertEqual(s[1], [(32, 1650)])
         self.assertEqual(s[2], 32)
         self.assertEqual(s[3], True)
     self.assertRaises(Exception, summary_stats_from_parse, [fp],
                       ["DNA", "PROTEIN"], careful_parse=False)
     # assumes _LOG is a standard logging.Logger (warning() is the
     # documented spelling of the deprecated warn()) -- TODO confirm
     _LOG.warning("WARNING: summary_stats_from_parse does not distinguish between RNA and DNA in 'careful' mode")
Example #4
0
 def testDiagnoseDNA(self):
     """Check summary_stats_from_parse on a single DNA FASTA file.

     The same summary values are expected with careful_parse off and on,
     and an Exception is expected in both modes when RNA is the only
     allowed datatype.
     """
     fp = data_source_path('small.fasta')
     # Parenthesised form prints the same text on Python 2 and is valid
     # syntax on Python 3.
     print(fp)
     for careful in (False, True):
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "DNA")
         self.assertEqual(s[1], [(32, 1650)])
         self.assertEqual(s[2], 32)
         self.assertEqual(s[3], True)
     for careful in (False, True):
         self.assertRaises(Exception, summary_stats_from_parse, [fp],
                           ["RNA"], careful_parse=careful)
Example #5
0
 def testDiagnoseProt(self):
     """Check summary_stats_from_parse on a single protein FASTA file.

     Both parse modes (careful_parse False and True) must report the same
     summary, and both must raise when only DNA and RNA are allowed.
     """
     fp = data_source_path('caenophidia_mos.fasta')
     # print(...) with one argument behaves identically on Python 2 and
     # also compiles on Python 3, unlike the bare print statement.
     print(fp)
     parse_modes = (False, True)
     for careful in parse_modes:
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "PROTEIN")
         self.assertEqual(s[1], [(114, 189)])
         self.assertEqual(s[2], 114)
         self.assertEqual(s[3], False)
     for careful in parse_modes:
         self.assertRaises(Exception,
                           summary_stats_from_parse, [fp], ["DNA", "RNA"],
                           careful_parse=careful)
Example #6
0
 def testDiagnoseRNA(self):
     """Check summary_stats_from_parse on a single RNA FASTA file.

     Both parse modes (careful_parse False and True) must report the same
     summary.  The "RNA not allowed" failure is asserted only for
     careful_parse=False; the logged warning records why.
     """
     fp = data_source_path('smallrna.fasta')
     # print(...) with one argument behaves identically on Python 2 and
     # also compiles on Python 3, unlike the bare print statement.
     print(fp)
     for careful in (False, True):
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "RNA")
         self.assertEqual(s[1], [(32, 1650)])
         self.assertEqual(s[2], 32)
         self.assertEqual(s[3], True)
     self.assertRaises(Exception,
                       summary_stats_from_parse, [fp], ["DNA", "PROTEIN"],
                       careful_parse=False)
     # assumes _LOG is a standard logging.Logger (warning() is the
     # documented spelling of the deprecated warn()) -- TODO confirm
     _LOG.warning(
         "WARNING: summary_stats_from_parse does not distinguish between RNA and DNA in 'careful' mode"
     )
Example #7
0
 def testDiagnoseDNA(self):
     """Check summary_stats_from_parse on a single DNA FASTA file.

     Both parse modes (careful_parse False and True) must report the same
     summary, and both must raise when RNA is the only allowed datatype.
     """
     fp = data_source_path('small.fasta')
     # print(...) with one argument behaves identically on Python 2 and
     # also compiles on Python 3, unlike the bare print statement.
     print(fp)
     parse_modes = (False, True)
     for careful in parse_modes:
         s = summary_stats_from_parse([fp], ["DNA", "RNA", "PROTEIN"],
                                      careful_parse=careful)
         self.assertEqual(s[0], "DNA")
         self.assertEqual(s[1], [(32, 1650)])
         self.assertEqual(s[2], 32)
         self.assertEqual(s[3], True)
     for careful in parse_modes:
         self.assertRaises(Exception,
                           summary_stats_from_parse, [fp], ["RNA"],
                           careful_parse=careful)
Example #8
0
    def testDiagnoseMulti(self):
        """Exercise summary_stats_from_parse on lists of two input files.

        All parses run with careful_parse=False and the candidate datatypes
        DNA, RNA and PROTEIN; fields 0..3 of the returned summary are
        compared against known values for the test data files.
        """

        def expect_summary(paths, datatype, per_file, total, flag):
            # Parse `paths` together and compare summary fields 0..3.
            # The datatype list is rebuilt on every call.
            s = summary_stats_from_parse(paths, ["DNA", "RNA", "PROTEIN"], careful_parse=False)
            self.assertEqual(s[0], datatype)
            self.assertEqual(s[1], per_file)
            self.assertEqual(s[2], total)
            self.assertEqual(s[3], flag)

        multi_dir = data_source_path('testmulti/caenophidia')
        fp = os.path.join(multi_dir, 'caenophidia_mos.fasta')
        fp2 = os.path.join(multi_dir, 'caenophidia_mos2.fasta')
        fp3 = data_source_path('smallrna.fasta')
        fp4 = data_source_path('small.fasta')
        fp5 = data_source_path('smallunaligned.fasta')

        # two taxa names were changed and 5 were deleted, so the union is 116
        expect_summary([fp, fp2], "PROTEIN", [(114, 189), (109, 202)], 116, False)

        # Passing one file twice repeats the per-file entry but not the total.
        expect_summary([fp3, fp3], "RNA", [(32, 1650), (32, 1650)], 32, True)
        expect_summary([fp4, fp4], "DNA", [(32, 1650), (32, 1650)], 32, True)

        # NOTE(review): field 3 becomes False when smallunaligned.fasta is
        # included -- presumably an "all aligned" flag; confirm against
        # summary_stats_from_parse.
        expect_summary([fp4, fp5], "DNA", [(32, 1650), (32, 1650)], 32, False)

        # Mixing the protein file with the RNA file must be rejected.
        self.assertRaises(Exception, summary_stats_from_parse, [fp, fp3], ["DNA", "RNA", "PROTEIN"], careful_parse=False)
        # assumes _LOG is a standard logging.Logger (warning() is the
        # documented spelling of the deprecated warn()) -- TODO confirm
        _LOG.warning("WARNING: summary_stats_from_parse will read multi with dna and protein as entirely protein. MIXED data type support is needed!")
Example #9
0
def sate_main(argv=sys.argv):
    '''Parse command-line style arguments, build the configuration and run SATe.

    Returns (True, dir, temp_fs) on successful execution or raises an exception.

    `dir` is either None or the undeleted directory of temporary files, and
    `temp_fs` is the TempFS object used to create `dir` (if `dir` is not
    None).  When the `exportconfig` option is set, the configuration is
    written to that path and (True, None, None) is returned without running
    the analysis.

    Note that if `argv` compares equal to sys.argv then the first element
    will be skipped, but if it does not then the first element will be
    interpreted as an argument (and will NOT be skipped).

    Positional arguments are paths of configuration (settings) files.  An
    Exception is raised when such a file does not exist or cannot be read as
    a configuration file.  A missing input, or `raxml_search_after` combined
    with a tree estimator other than FastTree, ends the run via sys.exit().
    '''

    start_time = time.time()
    usage = """usage: %prog [options] <settings_file1> <settings_file2> ..."""
    parser = optparse.OptionParser(usage=usage,
                                   description=PROGRAM_LONG_DESCRIPTION,
                                   formatter=IndentedHelpFormatterWithNL(),
                                   version="%s v%s" %
                                   (PROGRAM_NAME, PROGRAM_VERSION))

    user_config = get_configuration()
    command_line_group = user_config.get('commandline')
    command_line_group.add_to_optparser(parser)
    sate_group = user_config.get('sate')
    sate_group.add_to_optparser(parser)

    group = optparse.OptionGroup(parser, "SATe tools extra options")
    group.add_option('--tree-estimator-model',
                     type='string',
                     dest='tree_estimator_model',
                     help='Do not use this option.')
    parser.add_option_group(group)

    if argv == sys.argv:
        (options, args) = parser.parse_args(argv[1:])
    else:
        (options, args) = parser.parse_args(argv)

    # The model override is only applied when no settings files were given.
    if options.tree_estimator_model and options.tree_estimator and not args:
        estimator = options.tree_estimator.lower()
        if estimator == 'raxml':
            user_config.raxml.model = options.tree_estimator_model
        elif estimator == 'fasttree':
            user_config.fasttree.model = options.tree_estimator_model

    config_filenames = list(args)
    for fn in config_filenames:
        # Strip one pair of enclosing double quotes.  The length check keeps
        # an empty argument from raising IndexError on fn[0].
        if len(fn) >= 2 and fn[0] == '"' and fn[-1] == '"':
            fn = fn[1:-1]
        if not os.path.exists(fn):
            raise Exception(
                'The configuration (settings) file "%s" does not exist' % fn)
        try:
            user_config.read_config_filepath(fn)
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must not be
            # reported as a malformed configuration file.
            raise Exception(
                'The file "%s" does not appear to be a valid configuration file format. It lacks section headers.'
                % fn)
    # Command-line options are applied after the settings files are read.
    user_config.set_values_from_dict(options.__dict__)
    command_line_group.job = coerce_string_to_nice_outfilename(
        command_line_group.job, 'Job', 'satejob')

    if user_config.commandline.auto or user_config.commandline.untrusted:
        if user_config.commandline.input is None:
            sys.exit("ERROR: Input file(s) not specified.")
        from sate.usersettingclasses import get_list_of_seq_filepaths_from_dir
        from sate.alignment import summary_stats_from_parse
        try:
            if user_config.commandline.multilocus:
                fn_list = get_list_of_seq_filepaths_from_dir(
                    user_config.commandline.input)
            else:
                fn_list = [user_config.commandline.input]
            datatype_list = [user_config.commandline.datatype.upper()]
            careful_parse = user_config.commandline.untrusted
            summary_stats = summary_stats_from_parse(
                fn_list, datatype_list, careful_parse=careful_parse)
        except Exception:
            # Report the failure, then let the original exception propagate.
            if user_config.commandline.auto:
                MESSENGER.send_error(
                    "Error reading input while setting options for the --auto mode\n"
                )
            else:
                MESSENGER.send_error("Error reading input\n")
            raise
        if user_config.commandline.auto:
            # Clear the flag and replace it with the concrete option values
            # derived from the parsed input.
            user_config.commandline.auto = False
            auto_opts = get_auto_defaults_from_summary_stats(
                summary_stats[0], summary_stats[1], summary_stats[2])
            user_config.get('sate').set_values_from_dict(auto_opts['sate'])
            user_config.get('commandline').set_values_from_dict(
                auto_opts['commandline'])
            user_config.get('fasttree').set_values_from_dict(
                auto_opts['fasttree'])

    if user_config.commandline.raxml_search_after:
        if user_config.sate.tree_estimator.upper() != 'FASTTREE':
            sys.exit(
                "ERROR: the 'raxml_search_after' option is only supported when the tree_estimator is FastTree"
            )

    exportconfig = command_line_group.exportconfig
    if exportconfig:
        # Reset the option before saving the configuration.
        command_line_group.exportconfig = None
        user_config.save_to_filepath(exportconfig)

        ### TODO: wrap up in messaging system
        sys.stdout.write(
            'Configuration written to "%s". Exiting successfully.\n' %
            exportconfig)

        return True, None, None

    if user_config.commandline.input is None:
        sys.exit("ERROR: Input file(s) not specified.")

    # note: need to read sequence files first to allow SateProducts to
    # correctly self-configure
    user_config.read_seq_filepaths(
        src=user_config.commandline.input,
        multilocus=user_config.commandline.multilocus)
    sate_products = filemgr.SateProducts(user_config)

    export_config_as_temp = True
    if export_config_as_temp:
        name_cfg = sate_products.get_abs_path_for_tag('sate_config.txt')
        command_line_group.exportconfig = None
        user_config.save_to_filepath(name_cfg)
        MESSENGER.send_info('Configuration written to "%s".\n' % name_cfg)

    MESSENGER.run_log_streams.append(sate_products.run_log_stream)
    MESSENGER.err_log_streams.append(sate_products.err_log_stream)
    temp_dir, temp_fs = run_sate_from_config(user_config, sate_products)
    time_spent = time.time() - start_time
    MESSENGER.send_info("Total time spent: %ss" % time_spent)
    return True, temp_dir, temp_fs
Example #10
0
def sate_main(argv=sys.argv):
    '''Parse command-line style arguments, build the configuration and run SATe.

    Returns (True, dir, temp_fs) on successful execution or raises an exception.

    `dir` is either None or the undeleted directory of temporary files, and
    `temp_fs` is the TempFS object used to create `dir` (if `dir` is not
    None).  When the `exportconfig` option is set, the configuration is
    written to that path and (True, None, None) is returned without running
    the analysis.

    Note that if `argv` compares equal to sys.argv then the first element
    will be skipped, but if it does not then the first element will be
    interpreted as an argument (and will NOT be skipped).

    Positional arguments are paths of configuration (settings) files.  An
    Exception is raised when such a file does not exist or cannot be read as
    a configuration file.  A missing input, or `raxml_search_after` combined
    with a tree estimator other than FastTree, ends the run via sys.exit().
    '''

    start_time = time.time()
    usage = """usage: %prog [options] <settings_file1> <settings_file2> ..."""
    parser = optparse.OptionParser(usage=usage,
                                    description=PROGRAM_LONG_DESCRIPTION,
                                    formatter=IndentedHelpFormatterWithNL(),
                                    version="%s v%s" % (PROGRAM_NAME, PROGRAM_VERSION))

    user_config = get_configuration()
    command_line_group = user_config.get('commandline')
    command_line_group.add_to_optparser(parser)
    sate_group = user_config.get('sate')
    sate_group.add_to_optparser(parser)

    group = optparse.OptionGroup(parser, "SATe tools extra options")
    group.add_option('--tree-estimator-model', type='string',
            dest='tree_estimator_model',
            help='Do not use this option.')
    parser.add_option_group(group)

    if argv == sys.argv:
        (options, args) = parser.parse_args(argv[1:])
    else:
        (options, args) = parser.parse_args(argv)

    # The model override is only applied when no settings files were given.
    if options.tree_estimator_model and options.tree_estimator and not args:
        estimator = options.tree_estimator.lower()
        if estimator == 'raxml':
            user_config.raxml.model = options.tree_estimator_model
        elif estimator == 'fasttree':
            user_config.fasttree.model = options.tree_estimator_model

    config_filenames = list(args)
    for fn in config_filenames:
        # Strip one pair of enclosing double quotes.  The length check keeps
        # an empty argument from raising IndexError on fn[0].
        if len(fn) >= 2 and fn[0] == '"' and fn[-1] == '"':
            fn = fn[1:-1]
        if not os.path.exists(fn):
            raise Exception('The configuration (settings) file "%s" does not exist' % fn)
        try:
            user_config.read_config_filepath(fn)
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must not be
            # reported as a malformed configuration file.
            raise Exception('The file "%s" does not appear to be a valid configuration file format. It lacks section headers.' % fn)
    # Command-line options are applied after the settings files are read.
    user_config.set_values_from_dict(options.__dict__)
    command_line_group.job = coerce_string_to_nice_outfilename(command_line_group.job, 'Job', 'satejob')

    if user_config.commandline.auto or user_config.commandline.untrusted:
        if user_config.commandline.input is None:
            sys.exit("ERROR: Input file(s) not specified.")
        from sate.usersettingclasses import get_list_of_seq_filepaths_from_dir
        from sate.alignment import summary_stats_from_parse
        try:
            if user_config.commandline.multilocus:
                fn_list = get_list_of_seq_filepaths_from_dir(user_config.commandline.input)
            else:
                fn_list = [user_config.commandline.input]
            datatype_list = [user_config.commandline.datatype.upper()]
            careful_parse = user_config.commandline.untrusted
            summary_stats = summary_stats_from_parse(fn_list, datatype_list, careful_parse=careful_parse)
        except Exception:
            # Report the failure, then let the original exception propagate.
            if user_config.commandline.auto:
                MESSENGER.send_error("Error reading input while setting options for the --auto mode\n")
            else:
                MESSENGER.send_error("Error reading input\n")
            raise
        if user_config.commandline.auto:
            # Clear the flag and replace it with the concrete option values
            # derived from the parsed input.
            user_config.commandline.auto = False
            auto_opts = get_auto_defaults_from_summary_stats(summary_stats[0], summary_stats[1], summary_stats[2])
            user_config.get('sate').set_values_from_dict(auto_opts['sate'])
            user_config.get('commandline').set_values_from_dict(auto_opts['commandline'])
            user_config.get('fasttree').set_values_from_dict(auto_opts['fasttree'])

    if user_config.commandline.raxml_search_after:
        if user_config.sate.tree_estimator.upper() != 'FASTTREE':
            sys.exit("ERROR: the 'raxml_search_after' option is only supported when the tree_estimator is FastTree")

    exportconfig = command_line_group.exportconfig
    if exportconfig:
        # Reset the option before saving the configuration.
        command_line_group.exportconfig = None
        user_config.save_to_filepath(exportconfig)

        ### TODO: wrap up in messaging system
        sys.stdout.write('Configuration written to "%s". Exiting successfully.\n' % exportconfig)

        return True, None, None

    if user_config.commandline.input is None:
        sys.exit("ERROR: Input file(s) not specified.")

    # note: need to read sequence files first to allow SateProducts to
    # correctly self-configure
    user_config.read_seq_filepaths(src=user_config.commandline.input,
            multilocus=user_config.commandline.multilocus)
    sate_products = filemgr.SateProducts(user_config)

    export_config_as_temp = True
    if export_config_as_temp:
        name_cfg = sate_products.get_abs_path_for_tag('sate_config.txt')
        command_line_group.exportconfig = None
        user_config.save_to_filepath(name_cfg)
        MESSENGER.send_info('Configuration written to "%s".\n' % name_cfg)

    MESSENGER.run_log_streams.append(sate_products.run_log_stream)
    MESSENGER.err_log_streams.append(sate_products.err_log_stream)
    temp_dir, temp_fs = run_sate_from_config(user_config, sate_products)
    time_spent = time.time() - start_time
    MESSENGER.send_info("Total time spent: %ss" % time_spent)
    return True, temp_dir, temp_fs