コード例 #1
0
ファイル: filesnpaths.py プロジェクト: rajaldebnath/anvio
def is_proper_samples_information_file(file_path):
    """Validate an anvi'o samples information file and return its sample names.

    The file is first handed to `is_file_tab_delimited`. Then the header row is
    checked: its first cell must read 'samples', and the remaining column names
    must be unique. Finally, the first cell of every following row (the sample
    name) must be unique.

    Parameters
    ----------
    file_path : str
        Path to the samples information file.

    Returns
    -------
    list of str
        Sample names, in the order they appear in the file.

    Raises
    ------
    SamplesError
        If the header or the sample names violate the rules above.
    """
    is_file_tab_delimited(file_path)

    # read everything we need inside a context manager so the handle is
    # released even when one of the checks below raises. plain 'r' already
    # gives universal newlines in Python 3 (the 'U' flag is gone in 3.11+).
    with open(file_path, 'r') as f:
        columns = f.readline().strip('\n').split('\t')
        sample_names = [l.strip('\n').split('\t')[0] for l in f]

    # quick checks for the header
    if columns[0] != 'samples':
        raise SamplesError(
            "The first column of the first row of an anvi'o samples information file\
                              must say 'samples'.")

    if len(columns[1:]) != len(set(columns[1:])):
        raise SamplesError(
            "Every column name in the anvi'o samples information file must be unique (obviously)."
        )

    # quick checks for the samples described
    if len(sample_names) != len(set(sample_names)):
        raise SamplesError(
            "Every sample name in the anvi'o samples information file must be unique :/"
        )

    return sample_names
コード例 #2
0
    def sanity_check(self):
        """Cross-check sample names and make sure a default layer order exists.

        Two things happen here:

        1. If sample names were collected from both the samples information
           file and the samples order file, they must describe the same set of
           samples (compared after sorting).
        2. If `self.samples_information_default_layer_order` is not set yet, it
           is recovered from the first entry of `self.samples_order_dict`: the
           comma-separated 'basic' order if there is one, otherwise the leaf
           order of the 'newick' tree.

        Raises
        ------
        SamplesError
            If the two sets of sample names differ, or if there is no order
            to derive a default layer order from.
        """
        info_names = self.sample_names_in_samples_information_file
        order_names = self.sample_names_in_samples_order_file

        if info_names and order_names and sorted(info_names) != sorted(order_names):
            # report names missing from EITHER side; a one-sided difference is
            # empty when only the order file has extra names.
            raise SamplesError('OK. Samples described in the information file and order file are not identical :/ '
                                'Here are the %d sample names in the information file: "%s", versus the %d sample '
                                'names in the orders file: "%s". And here is the difference: "%s".'\
                                                        % (len(info_names),
                                                           info_names,
                                                           len(order_names),
                                                           order_names,
                                                           sorted(set(info_names) ^ set(order_names))))

        if self.samples_information_default_layer_order:
            return

        # we still don't have a default order. we will try to recover from that here
        # by looking into what we have in the samples order information
        if not self.samples_order_dict:
            raise SamplesError(
                "Something is missing. Anvi'o is having hard time coming up with a default samples "
                "order for the samples database.")

        first_order = next(iter(self.samples_order_dict.values()))

        if first_order['basic']:
            # a basic order is enough; no need to touch the newick parser at all
            self.samples_information_default_layer_order = first_order['basic'].split(',')
        else:
            self.samples_information_default_layer_order = utils.get_names_order_from_newick_tree(
                first_order['newick'] if first_order['newick'] else None)
コード例 #3
0
ファイル: samplesops.py プロジェクト: ppflrs/anvio
    def process_single_order_data(self, single_order_path, single_order_name):
        """Just inject a single order into the `self.samples_order_dict`.

        The file at `single_order_path` must contain exactly one line. That
        line is treated as a newick tree if `filesnpaths.is_proper_newick`
        accepts it, and as a comma-separated ('basic') order otherwise. The
        order is stored under `single_order_name`, validated through
        `filesnpaths.is_proper_samples_order_file`, and its name is added to
        `self.available_orders`.

        Parameters
        ----------
        single_order_path : str or None
            Path to the single-order file. Nothing happens when it is falsy.
        single_order_name : str
            Name under which the order is registered.

        Raises
        ------
        SamplesError
            If a path is given without a name, or the file does not contain
            exactly one line.
        """

        if not single_order_path:
            return

        if not single_order_name:
            raise SamplesError("You provided a file for a single order, but not a name for it. This is a no no :/")

        filesnpaths.is_file_plain_text(single_order_path)

        # context manager so the input handle does not linger around. plain 'r'
        # already gives universal newlines in Python 3 ('U' is gone in 3.11+).
        with open(single_order_path, 'r') as single_order_file:
            single_order_file_content = [l.strip('\n') for l in single_order_file]

        if len(single_order_file_content) != 1:
            raise SamplesError("The single order file should contain a single line of information. It can't have nothing,\
                                it can't have too much. Just a single newick tree, or a comma-separated list of sample\
                                names.")

        _order = single_order_file_content[0]

        # if you are reading this line, please brace yourself to possibly one of the silliest
        # bunch of lines in the anvi'o codebase. the reason we are doing this this way is quite
        # a long story, and deserves a FIXME, but in order to utilize the excellent function
        # in the filesnpaths module to check the contents of the samples order dict rigorously,
        # we need to have this information in a file. a better way could have been implementing
        # a filesnpaths.is_proper_samples_order_content function next to the currently available
        # filesnpaths.is_proper_samples_order_file (the latter would call the former with a dict
        # and it would be much more flexible), but we can't import utils from within filesnpaths.
        # without utils we don't have a get_TAB_delimited_file_as_dictionary function, and we are
        # definitely not going to implement it in two places :(
        temp_samples_order_file_path = filesnpaths.get_temp_file_path()

        if filesnpaths.is_proper_newick(_order, dont_raise=True):
            order_row = [single_order_name, '', _order]
            self.samples_order_dict[single_order_name] = {'newick': _order, 'basic': None}
        else:
            order_row = [single_order_name, _order, '']
            self.samples_order_dict[single_order_name] = {'basic': _order, 'newick': None}

        try:
            with open(temp_samples_order_file_path, 'w') as temp_samples_order_file:
                temp_samples_order_file.write('\t'.join(['attributes', 'basic', 'newick']) + '\n')
                temp_samples_order_file.write('\t'.join(order_row) + '\n')

            sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(temp_samples_order_file_path)
        finally:
            # never leave the temporary file behind, even when validation fails
            if os.path.exists(temp_samples_order_file_path):
                os.remove(temp_samples_order_file_path)

        if not self.sample_names_in_samples_information_file:
            self.sample_names_in_samples_order_file = sample_names_in_samples_order_file

        self.available_orders.add(single_order_name)

        self.run.info('Samples order', "A single order for '%s' is also loaded" % single_order_name, quiet=self.quiet)
コード例 #4
0
ファイル: samplesops.py プロジェクト: ppflrs/anvio
    def update_samples_order_dict(self):
        """Some attributes in the samples information dict may also be used as orders.

        For every key in `self.aliases_to_attributes_dict`, the samples in
        `self.samples_information_dict` are sorted by the value they hold for
        that key. The resulting order and its reverse are stored in
        `self.samples_order_dict` as comma-separated 'basic' orders under
        '>> <attribute>' and '>> <attribute> (reverse)'. A key is skipped when
        the value of the first sample contains ';' (bar chart data).

        Raises
        ------
        SamplesError
            If the values of an attribute cannot be compared with each other
            (mixed data types in a column).
        """

        def F(v):
            """Coerce a raw value into something sortable: '' for None, 0.0 for
            other falsy values, a float when possible, the value itself otherwise."""
            if v is None:
                return ''

            if not v:
                return 0.0

            try:
                return float(v)
            except (TypeError, ValueError, OverflowError):
                # not a number; keep the original value for sorting
                return v

        for alias in self.aliases_to_attributes_dict:
            sample_attribute_tuples = [(F(self.samples_information_dict[sample][alias]), sample, alias)
                                       for sample in self.samples_information_dict]

            # nothing to order when no samples are described:
            if not sample_attribute_tuples:
                continue

            # skip bar charts:
            if ';' in str(sample_attribute_tuples[0][0]):
                continue

            attribute = self.aliases_to_attributes_dict[alias]
            if attribute not in self.samples_order_dict:
                try:
                    self.samples_order_dict['>> ' + attribute] = {'newick': '', 'basic': ','.join([t[1] for t in sorted(sample_attribute_tuples)])}
                    self.samples_order_dict['>> ' + attribute + ' (reverse)'] = {'newick': '', 'basic': ','.join([t[1] for t in sorted(sample_attribute_tuples, reverse=True)])}
                except TypeError:
                    raise SamplesError("OK. Anvi'o has good and bad news. The bad news is that your samples information\
                                        is kaput, because one of the columns in it has mixed data types (not everything has the\
                                        same type). The good news is that we know what column is that: it is the column '%s'.\
                                        Please take a look." % attribute)
コード例 #5
0
ファイル: filesnpaths.py プロジェクト: ppflrs/anvio
def is_proper_samples_information_file(file_path):
    """Validate an anvi'o samples information file and return its sample names.

    The file is first handed to `is_file_tab_delimited`. Then the header row is
    checked: the part of each column name before the first '!' must pass
    `is_bad_column_name`, and column names after the first one must be unique.
    Finally, the first cell of every following row (the sample name) must be
    unique.

    Parameters
    ----------
    file_path : str
        Path to the samples information file.

    Returns
    -------
    list of str
        Sample names, in the order they appear in the file.

    Raises
    ------
    SamplesError
        If column names are rejected or duplicated, or sample names are
        duplicated.
    """
    is_file_tab_delimited(file_path)

    # read everything we need inside a context manager so the handle is
    # released even when one of the checks below raises. plain 'r' already
    # gives universal newlines in Python 3 (the 'U' flag is gone in 3.11+).
    with open(file_path, 'r') as f:
        columns = f.readline().strip('\n').split('\t')
        sample_names = [l.strip('\n').split('\t')[0] for l in f]

    # quick checks for the header. only the text before the first '!' is
    # tested against `is_bad_column_name`.
    bad_column_names = [
        col for col in columns
        if is_bad_column_name(col.split('!')[0])
    ]
    if bad_column_names:
        raise SamplesError(
            "Well, anvi'o does not like some of the column names in your samples information file. The\
                            best practice is to limit the characters that make up the column name to ASCII letters,\
                            digits, and the underscore character ('_'). No spaces, or funky characters unless they are\
                            necessary for various data types. Here are the perpetrators: '%s'."
            % ', '.join(bad_column_names))

    if len(columns[1:]) != len(set(columns[1:])):
        raise SamplesError(
            "Every column name in the anvi'o samples information file must be unique (obviously)."
        )

    # quick checks for the samples described
    if len(sample_names) != len(set(sample_names)):
        raise SamplesError(
            "Every sample name in the anvi'o samples information file must be unique :/"
        )

    return sample_names
コード例 #6
0
ファイル: samplesops.py プロジェクト: ppflrs/anvio
    def populate_from_input_files(self, samples_information_path=None, samples_order_path=None, single_order_path=None, single_order_name=None):
        """Run every input file through its processing step, then settle on the sample names.

        At least one of `samples_information_path`, `samples_order_path` and
        `single_order_path` must be given; otherwise a SamplesError is raised.
        The processing steps run in a fixed sequence, followed by
        `self.sanity_check()`. `self.sample_names` is then set to the names from
        the samples information file, or to the names from the samples order
        file when the former is empty.
        """
        if not any((samples_information_path, samples_order_path, single_order_path)):
            raise SamplesError("At least one of the input files must be declared to create or to update an\
                                anvi'o samples information database :/ But maybe not. Maybe anvi'o should be\
                                able to create an empty samples information database, too. Do you need this?\
                                Write to us!")

        # each step is handed its path as-is, including when it is None
        self.process_samples_information_file(samples_information_path)
        self.process_samples_order_file(samples_order_path)
        self.process_single_order_data(single_order_path, single_order_name)
        self.update_samples_order_dict()

        self.sanity_check()

        # names from the information file win; fall back to the order file
        names = self.sample_names_in_samples_information_file
        if not names:
            names = self.sample_names_in_samples_order_file
        self.sample_names = names
コード例 #7
0
ファイル: filesnpaths.py プロジェクト: ppflrs/anvio
def is_proper_samples_order_file(file_path):
    """Validate an anvi'o samples order file and return the samples it describes.

    The file is first handed to `is_file_tab_delimited`. Its header must be
    exactly 'attributes', 'basic', 'newick'. Every following row must have
    three TAB-separated cells, with exactly one of 'basic' (a comma-separated
    list of sample names) or 'newick' (a tree accepted by `is_proper_newick`)
    filled in. All orders must describe the same set of sample names.

    Parameters
    ----------
    file_path : str
        Path to the samples order file.

    Returns
    -------
    list of str
        The sorted sample names shared by every order in the file.

    Raises
    ------
    SamplesError
        If the header, any row, or the agreement between orders is off.
    """
    is_file_tab_delimited(file_path)

    # read everything inside a context manager so the handle is always closed.
    # plain 'r' already gives universal newlines in Python 3 ('U' is gone in 3.11+).
    with open(file_path, 'r') as f:
        columns = f.readline().strip().split('\t')
        rows = [l.strip('\n').split('\t') for l in f]

    if len(columns) != 3:
        raise SamplesError(
            "The number of columns in an anvi'o samples order file must be three.\
                             Yours has %d. Please see the documentation if you are lost."
            % len(columns))

    if columns[0] != 'attributes':
        raise SamplesError(
            "The first column of the first row of an anvi'o samples order file \
                              must say 'attributes'. All these rules... Anvi'o promises that they \
                              are for your own good.")

    if columns[1] != 'basic':
        raise SamplesError(
            "The second column of the first row of an anvi'o samples order file \
                              must read 'basic'.")

    if columns[2] != 'newick':
        raise SamplesError(
            "The third column of the first row of an anvi'o samples order file \
                              must read 'newick'.")

    num_samples_described_in_basic_organizations = []
    num_samples_described_in_newick_organizations = []
    sample_names_described_by_each_organization = []

    for row in rows:
        if len(row) != 3:
            raise SamplesError(
                "Each line in the samples order file must contain three columns separated\
                                 from each other by TAB characters. You have at least one with %d columns\
                                 :/" % len(row))

        attribute, basic, newick = row

        if basic and newick:
            raise SamplesError(
                'For the attribute %s, there is both basic and newick form of organization\
                                in the samples order file. For a given attribute, you can define only one\
                                of them, and the other must be blank.' %
                attribute)
        if not basic and not newick:
            raise SamplesError(
                'For the attribute %s, there is no organization defined (neither newick, nor\
                                 basic). Is this a test or something? :/' %
                attribute)

        if newick:
            try:
                tree = is_proper_newick(newick)
            except Exception:
                # whatever the parser complains about, report it as a samples
                # error (but let KeyboardInterrupt / SystemExit through)
                raise SamplesError(
                    'The newick entry for the attribute %s deos not seem to be a properly\
                                     formatted newick :/' % attribute)
            samples = [n.name for n in tree.get_leaves()]
            num_samples_described_in_newick_organizations.append(len(samples))
            sample_names_described_by_each_organization.append(samples)

        if basic:
            if not basic.count(','):
                raise SamplesError(
                    'The basic samples organization for attribute %s does not seem to be a\
                                     comma-separated list.' % attribute)
            samples = [s.strip() for s in basic.split(',')]
            num_samples_described_in_basic_organizations.append(len(samples))
            sample_names_described_by_each_organization.append(samples)

    if num_samples_described_in_basic_organizations and len(
            set(num_samples_described_in_basic_organizations)) != 1:
        raise SamplesError(
            'The number of samples described by each comma-separated basic organization line\
                             must be equal. But that does not seem to be the case with your input :/'
        )

    if num_samples_described_in_newick_organizations and len(
            set(num_samples_described_in_newick_organizations)) != 1:
        raise SamplesError(
            'The number of samples described by each newick-formatted organization \
                             must be equal. But that does not seem to be the case with your input :/'
        )

    # collapse every order to its sorted sample names; a proper file yields
    # exactly one distinct collection (a file with no orders yields none and
    # is rejected by the same check).
    unique_list_of_samples = [
        list(x) for x in set(
            tuple(sorted(x))
            for x in sample_names_described_by_each_organization)
    ]
    if len(unique_list_of_samples) != 1:
        raise SamplesError(
            "At least one organization in the samples order file differs from the others. Each\
                             order should contain the same sample names. Sorry about the cryptic error message,\
                             but your file is not properly formatted :/")

    return unique_list_of_samples[0]