コード例 #1
0
ファイル: splitter.py プロジェクト: gelomerase/anvio
    def sanity_check(self):
        """Validate the inputs of a split run and prepare the summarizer.

        The output directory is resolved through
        ``filesnpaths.check_output_directory`` with ``ok_if_exists=False``,
        both database paths are required, the profile database must be
        neither blank nor of a type other than 'profile', and finally a
        ``ProfileSummarizer`` is initialized to learn the available bins.

        On success ``self.bin_names_of_interest`` holds either every bin id
        known to the summarizer (sorted) or just ``self.bin_name``.

        Raises:
            ConfigError: when a database path is missing, the profile
                database is blank or not a profile database, or the
                requested bin is not among the summarizer's bin ids.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory,
                                                                   ok_if_exists=False)

        # both databases are mandatory for this workflow
        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        database = dbops.ProfileDatabase(self.profile_db_path)
        meta = database.meta
        if meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")
        if meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        database.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        # default: work on every bin the summarizer knows about
        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if not self.bin_name:
            return

        # a single bin was requested: it has to exist in the collection
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")
        self.bin_names_of_interest = [self.bin_name]
コード例 #2
0
    def get_summary_object_for_profile_db(self,
                                          profile_db_path,
                                          init_gene_coverages=True):
        """Build an initialized ProfileSummarizer for the genomes sharing a profile db.

        Args:
            profile_db_path: key into
                ``self.unique_profile_db_path_to_internal_genome_name``. The
                mapped value is treated as a sequence of genome names that
                index ``self.descriptions.genomes``.
            init_gene_coverages: forwarded to the summarizer arguments as
                ``init_gene_coverages``.

        Returns:
            The ``summarizer.ProfileSummarizer`` instance after ``init()`` and
            ``init_collection_profile(collection_name)`` have been called.

        Raises:
            KeyError: if ``profile_db_path`` is not a known key, or a genome
                entry lacks one of the fields read here.
            IndexError: if no genome names are registered for the path.
        """
        # Resolve everything from the caller's key exactly once. Previously
        # the parameter was overwritten with the stored 'profile_db_path' and
        # then reused as the lookup key, which only worked as long as the
        # stored value happened to be identical to the key.
        genome_names = self.unique_profile_db_path_to_internal_genome_name[profile_db_path]
        genomes = self.descriptions.genomes

        # the first genome registered for this profile db supplies the
        # collection name and the database paths
        reference_genome = genomes[genome_names[0]]
        collection_name = reference_genome['collection_id']

        bin_names_list = [genomes[genome_name]['bin_id'] for genome_name in genome_names]

        ARGS = summarizer.ArgsTemplateForSummarizerClass()
        ARGS.profile_db = reference_genome['profile_db_path']
        ARGS.contigs_db = reference_genome['contigs_db_path']
        ARGS.skip_init_functions = True
        ARGS.init_gene_coverages = init_gene_coverages
        ARGS.collection_name = collection_name
        ARGS.bin_names_list = bin_names_list
        ARGS.output_dir = None

        summary = summarizer.ProfileSummarizer(ARGS)
        summary.init()
        summary.init_collection_profile(collection_name)

        return summary
コード例 #3
0
    def sanity_check(self):
        """Validate the inputs of a split run and prepare the summarizer.

        The output directory is resolved through
        ``filesnpaths.check_output_directory`` with ``ok_if_exists=False``,
        both database paths are required, and the profile database must be a
        non-blank, merged database of type 'profile'. A ``ProfileSummarizer``
        is then initialized and ``self.bin_names_of_interest`` is set to all
        of its bin ids (sorted) or to just ``self.bin_name`` when one is given.

        Raises:
            ConfigError: when a database path is missing, the profile
                database is not a merged profile, or the requested bin is not
                among the summarizer's bin ids.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        try:
            meta = profile_db.meta
            # same short-circuit order as before: type, then blank, then merged
            is_merged_profile = (meta['db_type'] == 'profile'
                                 and not meta['blank'] and meta['merged'])
        finally:
            # the handle is only needed for the meta lookup; release it on
            # both the success and the failure path
            profile_db.disconnect()

        if not is_merged_profile:
            raise ConfigError(
                "You can only split merged profiles :/ We hope this is not a moment of a terrible disappointment.\
                               If it is, you should consider writing to us.")

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError(
                    "The bin name you wish to split from this profile database is not in the collection. Busted!"
                )
            self.bin_names_of_interest = [self.bin_name]
コード例 #4
0
def gen_summary(args, d, request, response, collection_name):
    """Run a summary for `collection_name` and return a JSON string.

    The returned JSON carries either an 'error' message (read-only instance,
    manual mode, unknown mode, or any exception raised while summarizing) or
    a 'path' pointing at the summary's index.html.

    Args:
        args: object whose `read_only` flag blocks summary generation.
        d: object providing `manual_mode`, `mode`, `taxonomic_level` and the
            database paths for the active mode ('pan' or 'full').
        request: not used by this function.
        response: passed to `set_default_headers`.
        collection_name: name of the collection to summarize.
    """
    set_default_headers(response)

    if args.read_only:
        return json.dumps({'error': "Sorry! This is a read-only instance."})

    if d.manual_mode:
        message = "Creating summaries is only possible with proper anvi'o runs at the moment :/"
        return json.dumps({'error': message})

    run.info_single('A summary of collection "%s" has been requested.' % collection_name)

    # start from an empty args template and fill in what both modes share
    summarizer_args = summarizer.ArgsTemplateForSummarizerClass()
    summarizer_args.collection_name = collection_name
    summarizer_args.taxonomic_level = d.taxonomic_level

    # the summary lands next to the database it is generated from
    summary_dir_name = 'SUMMARY_%s' % collection_name

    if d.mode == 'pan':
        summarizer_args.pan_db = d.pan_db_path
        summarizer_args.genomes_storage = d.genomes_storage_path
        parent_dir = os.path.dirname(summarizer_args.pan_db)
        summarizer_args.output_dir = os.path.join(parent_dir, summary_dir_name)
    elif d.mode == 'full':
        summarizer_args.profile_db = d.profile_db_path
        summarizer_args.contigs_db = d.contigs_db_path
        parent_dir = os.path.dirname(summarizer_args.profile_db)
        summarizer_args.output_dir = os.path.join(parent_dir, summary_dir_name)
    else:
        message = 'We do not know anything about this mode: "%s"' % d.mode
        return json.dumps({'error': message})

    # any failure while building or running the summary is reported back to
    # the caller as JSON rather than propagated
    try:
        if d.mode == 'pan':
            summary = summarizer.PanSummarizer(summarizer_args, r=run, p=progress)
        else:
            summary = summarizer.ProfileSummarizer(summarizer_args, r=run, p=progress)
        summary.process()
    except Exception as e:
        message = 'Something failed in the "%s" summary mode. This is what we know: %s' % (d.mode, e)
        return json.dumps({'error': message})

    run.info_single('HTML output for summary is ready: %s' % summary.index_html)

    return json.dumps({'path': "summary/%s/index.html" % (collection_name)})
コード例 #5
0
    def __init__(self, args, run=run, progress=progress):
        """Read the arguments, run the sanity check and load coverage data.

        Args:
            args: namespace-like object; options are looked up in
                ``args.__dict__`` and default to None when absent. When a
                profile database path is given, ``args.init_gene_coverages``
                is set to True on it.
            run: stored as ``self.run``.
            progress: stored as ``self.progress``.

        Data comes from a text file (``get_data_from_txt_file``) when no
        profile database path is given, from a ``ProfileSummarizer`` when a
        collection name is given, and from a ``ProfileSuperclass`` otherwise.
        """
        self.run = run
        self.progress = progress

        # options that were not supplied resolve to None
        option = args.__dict__.get

        # input / output locations
        self.gene_coverages_data_file_path = option('data_file')
        self.gene_detection_data_file_path = option('gene_detection_data_file')
        self.profile_db_path = option('profile_db')
        self.output_file_prefix = option('output_file_prefix')

        # numeric parameters
        self.alpha = option('alpha')
        self.beta = option('beta')
        self.gamma = option('gamma')
        self.eta = option('eta')
        self.zeta = option('zeta')

        self.additional_layers_to_append = option('additional_layers_to_append')
        self.samples_information_to_append = option('samples_information_to_append')
        self.number_of_positive_samples = None

        # collection / bin selection
        self.collection_name = option('collection_name')
        self.bin_id = option('bin_id')
        self.bin_ids_file_path = option('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = option(
            'store_gene_detections_and_gene_coverages_tables')

        # containers that start out empty
        self.gene_coverages = {}
        self.gene_detection = {}
        self.samples = {}
        self.positive_samples = {}
        self.negative_samples = {}
        self.gene_class_information = {}
        self.samples_information = {}
        self.profile_db = {}

        self.sanity_check()

        if self.profile_db_path is None:
            self.get_data_from_txt_file()
            return

        # load sample list and gene_coverage_dict from the merged profile db
        args.init_gene_coverages = True

        if self.collection_name:
            self.summary = summarizer.ProfileSummarizer(args)
            self.summary.init()
            return

        self.profile_db = ProfileSuperclass(args)
        self.profile_db.init_gene_coverages_and_detection_dicts()
        self.gene_coverages = self.profile_db.gene_coverages_dict
        self.gene_detection = self.profile_db.gene_detection_dict

        # the keys of the first gene's entry are taken as the sample names
        first_gene_entry = next(iter(self.gene_coverages.values()))
        self.samples = set(first_gene_entry.keys())
コード例 #6
0
ファイル: splitter.py プロジェクト: YuJimlong/anvio
    def sanity_check(self):
        """Validate the inputs of a split run and prepare the summarizer.

        The output directory is resolved through
        ``filesnpaths.check_output_directory`` with ``ok_if_exists=True``,
        both database paths are required, and the profile database must be
        neither blank nor of a type other than 'profile'. A
        ``ProfileSummarizer`` is then initialized and
        ``self.bin_names_of_interest`` is set to all of its bin ids (sorted)
        or to just ``self.bin_name`` when one is given.

        Raises:
            ConfigError: when a database path is missing, the profile
                database is blank or not a profile database, or the
                requested bin is not among the summarizer's bin ids.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory,
                                                                   ok_if_exists=True)

        # both databases are mandatory for this workflow
        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        database = dbops.ProfileDatabase(self.profile_db_path)
        meta = database.meta
        if meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")
        if meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        database.disconnect()

        # Unless this flag is False the summarizer class attempts to remove
        # the main output directory upon initialization. Keeping the directory
        # lets multiple anvi-split runs work on bins of the same collection
        # in parallel.
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        # default: work on every bin the summarizer knows about
        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if not self.bin_name:
            return

        # a single bin was requested: it has to exist in the collection
        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")
        self.bin_names_of_interest = [self.bin_name]
コード例 #7
0
    def __init__(self, args, run=run, progress=progress):
        """Read the arguments, load the sample filters and initialize coverage data.

        Args:
            args: namespace-like object; options are looked up in
                ``args.__dict__`` and default to None when absent. When a
                profile database path is given, ``args.init_gene_coverages``
                is set to True on it.
            run: stored as ``self.run`` and used to report the sample filters.
            progress: stored as ``self.progress``.

        Raises:
            ConfigError: if an exclude/include samples file is given but
                yields no sample names.
        """
        self.run = run
        self.progress = progress

        # options that were not supplied resolve to None
        A = args.__dict__.get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.exclude_samples = A('exclude_samples')
        self.include_samples = A('include_samples')
        self.profile_db = {}
        self.coverage_values_per_nt = {}
        # NOTE(review): `pd.DataFrame.empty` is the class-level property
        # object, not an empty DataFrame; it appears to serve as a "not yet
        # initialized" placeholder here. Confirm before replacing it.
        self.gene_coverages = pd.DataFrame.empty
        self.gene_detections = pd.DataFrame.empty
        self.samples = {}
        self.sample_detection_information_was_initiated = False
        self.positive_samples = []
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame.empty
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame.empty
        self.samples_detection_information = pd.DataFrame.empty
        self.gene_presence_absence_in_samples = pd.DataFrame.empty
        self.gene_coverages_filtered = pd.DataFrame.empty
        self.additional_description = ''
        self.total_length = None
        self.samples_coverage_stats_dicts_was_initiated = False
        self.samples_coverage_stats_dicts = pd.DataFrame.empty
        self.non_outlier_indices = {}

        def read_sample_names(path):
            """Return the first tab-separated column of every non-blank line."""
            # check that there is a file like this
            filesnpaths.is_file_exists(path)
            # Plain text mode already translates all newline styles; the
            # former 'rU' mode is rejected by Python 3.11+. The context
            # manager also makes sure the handle is closed.
            with open(path) as sample_file:
                names = {line.split('\t')[0].strip() for line in sample_file}
            # blank lines must not count as a sample name, otherwise the
            # "empty list" checks below could never trigger
            names.discard('')
            return names

        if self.exclude_samples:
            self.samples_to_exclude = read_sample_names(self.exclude_samples)

            if not self.samples_to_exclude:
                raise ConfigError(
                    "You asked to exclude samples, but provided an empty list."
                )

            run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        if self.include_samples:
            self.samples_to_include = read_sample_names(self.include_samples)

            if not self.samples_to_include:
                raise ConfigError(
                    "You provided an empty list of samples to include.")

            run.info(
                'Including Samples',
                'The following samples will be included: %s' %
                self.samples_to_include,
            )
        else:
            self.samples_to_include = set()

        # run sanity check on all input arguments
        self.sanity_check()

        if self.profile_db_path is None:
            # TODO: this will probably be removed because we don't save the coverage information in nucleotide level.
            pass
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
                self.init_samples(self.summary.p_meta['samples'])
            else:
                self.profile_db = ProfileSuperclass(args)
                self.init_samples(self.profile_db.p_meta['samples'])
                self.profile_db.init_split_coverage_values_per_nt_dict()
                self.profile_db.init_gene_level_coverage_stats_dicts()
                self.coverage_values_per_nt = get_coverage_values_per_nucleotide(
                    self.profile_db.split_coverage_values_per_nt_dict,
                    self.samples)

                # comply with the new design and get gene_coverages and gene_detection dicts from
                # gene_level_coverage_stats_dict.
                gene_coverages, gene_detection = self.get_gene_coverages_and_gene_detection_dicts()

                self.init_coverage_and_detection_dataframes(
                    gene_coverages, gene_detection)

                # getting the total length of all contigs
                self.total_length = self.profile_db.p_meta['total_length']
コード例 #8
0
    def __init__(self, args, run=run, progress=progress):
        """Read the arguments, load the exclusion list and initialize coverage data.

        Args:
            args: namespace-like object; options are looked up in
                ``args.__dict__`` and default to None when absent. When a
                profile database path is given, ``args.init_gene_coverages``
                is set to True on it.
            run: stored as ``self.run`` and used to report excluded samples.
            progress: stored as ``self.progress``.

        Data comes from a text file (``get_data_from_txt_file``) when no
        profile database path is given, from a ``ProfileSummarizer`` when a
        collection name is given, and from a ``ProfileSuperclass`` otherwise.
        In the last case the excluded samples are dropped from the coverage
        and detection tables.
        """
        self.run = run
        self.progress = progress

        # options that were not supplied resolve to None
        A = args.__dict__.get
        self.gene_coverages_data_file_path = A('data_file')
        self.gene_detections_data_file_path = A('gene_detection_data_file')
        self.profile_db_path = A('profile_db')
        self.output_file_prefix = A('output_file_prefix')
        self.alpha = A('alpha')
        self.beta = A('beta')
        self.gamma = A('gamma')
        self.eta = A('eta')
        self.zeta = A('zeta')
        self.additional_layers_to_append = A('additional_layers_to_append')
        self.samples_information_to_append = A('samples_information_to_append')
        self.collection_name = A('collection_name')
        self.bin_id = A('bin_id')
        self.bin_ids_file_path = A('bin_ids_file')
        self.store_gene_detections_and_gene_coverages_tables = A(
            'store_gene_detections_and_gene_coverages_tables')
        self.exclude_samples = A('exclude_samples')
        # NOTE(review): `pd.DataFrame.empty` is the class-level property
        # object, not an empty DataFrame; it appears to serve as a "not yet
        # initialized" placeholder here. Confirm before replacing it.
        self.gene_coverages = pd.DataFrame.empty
        self.gene_detections = pd.DataFrame.empty
        self.samples = {}
        self.positive_samples = pd.DataFrame.empty
        self.number_of_positive_samples = None
        self.negative_samples = pd.DataFrame.empty
        self.number_of_negative_samples = None
        self.gene_class_information = pd.DataFrame.empty
        self.samples_information = pd.DataFrame.empty
        self.profile_db = {}
        self.gene_presence_absence_in_samples = pd.DataFrame.empty
        self.gene_coverages_filtered = pd.DataFrame.empty

        if self.exclude_samples:
            # check that there is a file like this
            filesnpaths.is_file_exists(self.exclude_samples)
            # Plain text mode already translates all newline styles; the
            # former 'rU' mode is rejected by Python 3.11+. The context
            # manager also makes sure the handle is closed.
            with open(self.exclude_samples) as sample_file:
                self.samples_to_exclude = {
                    line.split('\t')[0].strip() for line in sample_file
                }
            # a blank line would yield an empty name, which is not a column
            # of the tables the excluded samples are dropped from below
            self.samples_to_exclude.discard('')
            run.info(
                'Excluding Samples',
                'The following samples will be excluded: %s' %
                self.samples_to_exclude,
            )
        else:
            self.samples_to_exclude = set()

        self.sanity_check()
        if self.profile_db_path is None:
            self.get_data_from_txt_file()
        else:
            # load sample list and gene_coverage_dict from the merged profile db
            args.init_gene_coverages = True
            if self.collection_name:
                self.summary = summarizer.ProfileSummarizer(args)
                self.summary.init()
            else:
                self.profile_db = ProfileSuperclass(args)
                self.profile_db.init_gene_coverages_and_detection_dicts()
                self.gene_coverages = pd.DataFrame.from_dict(
                    self.profile_db.gene_coverages_dict,
                    orient='index',
                    dtype=float)
                self.gene_coverages.drop(self.samples_to_exclude,
                                         axis=1,
                                         inplace=True)
                # number of rows in the coverage table
                self.Ng = len(self.gene_coverages.index)
                self.gene_detections = pd.DataFrame.from_dict(
                    self.profile_db.gene_detection_dict,
                    orient='index',
                    dtype=float)
                self.gene_detections.drop(self.samples_to_exclude,
                                          axis=1,
                                          inplace=True)
                self.samples = set(self.gene_coverages.columns)