コード例 #1
0
ファイル: select.py プロジェクト: CroP-BioDiv/zcitools
def select_data(step_data,
                result,
                steps,
                select,
                where_part,
                group_by_part,
                having_part,
                order_by_part,
                output_filename=None):
    """Run a select over the tables of ``steps`` and deliver the rows.

    Args:
        step_data: Passed to ``TableStep`` when the result is stored as a step.
        result: Output selector. Only its first letter is inspected
            (case-insensitive): ``'e'`` writes an Excel file, ``'s'`` creates
            a table step, anything else (including an empty string) prints
            the result.
        steps: Steps handed to ``StepDatabase``; ``steps[0].project`` is used
            as the project of a newly created step.
        select, where_part, group_by_part, having_part, order_by_part:
            Forwarded unchanged to ``db.select_all_tables``.
        output_filename: Target of the Excel export; defaults to
            ``'out.xlsx'`` when not given.

    Returns:
        The saved ``TableStep`` when a step was requested, otherwise ``None``.
    """
    # Fetch data
    with StepDatabase(steps) as db:
        column_data_types, rows = db.select_all_tables(select, where_part,
                                                       group_by_part,
                                                       having_part,
                                                       order_by_part)

    # Slice instead of indexing so that an empty selector falls through to
    # printing rather than raising IndexError.
    result_kind = result.lower()[:1]

    if result_kind == 'e':  # Excel
        table_data_2_excel(output_filename or 'out.xlsx', column_data_types,
                           rows)

    elif result_kind == 's':  # Step
        step = TableStep(steps[0].project, step_data, remove_data=True)
        step.set_table_data(rows, column_data_types)
        step.save()
        return step

    else:  # Print
        print('Columns:')
        print_table(None, column_data_types)

        print('\nData:')
        print_table([c for c, _ in column_data_types], rows, show_limit=7)
        print(f'\nNumber of rows: {len(rows)}')
コード例 #2
0
    def show_data(self, params=None):
        """Print a summary of the step's sequence records.

        ``params`` is an optional list of strings. Entries that are members of
        ``self._sequences`` restrict the output to those sequences. The first
        remaining entry, lower-cased, is the command; without one the command
        is ``by_type``.

        Commands:
            by_type: one row per sequence with its length, total number of
                features and, per feature type other than ``source``, a
                ``count/unique-count`` cell (plain count for
                ``repeat_region``).
            genes, repeated_genes, shared_genes: gene feature reports.
            cds, repeated_cds, shared_cds: CDS feature reports.
            ir: locations of ``repeat_region`` features per sequence.

        Any other command prints an error message.
        """
        # Split params into the sequence filter and the remaining arguments
        selected = None
        if params:
            selected = self._sequences & set(params)
            params = [p for p in params if p not in selected]

        if params:
            command, params = params[0].lower(), params[1:]
        else:
            command = 'by_type'  # Default print

        if command == 'by_type':
            type_names = set()
            summary = {}  # seq_ident -> dict(length=int, features=int, <type>=str)
            for seq_ident, seq_record in self._iterate_records(filter_seqs=selected):
                descs_by_type = defaultdict(list)
                for feature in seq_record.features:
                    if feature.type == 'source':
                        continue
                    descs_by_type[feature.type].append(feature_qualifiers_to_desc(feature))
                type_names.update(descs_by_type)

                row = dict(length=len(seq_record.seq), features=len(seq_record.features))
                for f_type, descs in descs_by_type.items():
                    if f_type == 'repeat_region':
                        row[f_type] = str(len(descs))
                    else:
                        row[f_type] = f"{len(descs)}/{len(set(descs))}"
                summary[seq_ident] = row

            type_names = sorted(type_names)
            header = ['seq_ident', 'Length', 'Features'] + type_names
            rows = [[seq_ident, row['length'], row['features']] + [row.get(t, '') for t in type_names]
                    for seq_ident, row in sorted(summary.items())]
            rows.sort()
            print_table(header, rows)
            return

        # Genes and CDSs: command -> (name of the reporting method, feature type)
        feature_commands = {
            'genes': ('_all_features', 'gene'),
            'repeated_genes': ('_repeated_features', 'gene'),
            'shared_genes': ('_shared_features', 'gene'),
            'cds': ('_all_features', 'CDS'),
            'repeated_cds': ('_repeated_features', 'CDS'),
            'shared_cds': ('_shared_features', 'CDS'),
        }
        if command in feature_commands:
            reporter_name, feature_type = feature_commands[command]
            reporter = getattr(self, reporter_name)
            reporter(self._get_feature_desc(feature_type, filter_seqs=selected))
            return

        # IR
        if command == 'ir':
            # seq_ident -> list of feature locations
            locations_by_seq = {
                seq_ident: [feature_location_desc(feature.location)
                            for feature in seq_record.features
                            if feature.type == 'repeat_region']
                for seq_ident, seq_record in self._iterate_records(filter_seqs=selected)}

            for seq_ident, locations in sorted(locations_by_seq.items()):
                joined = ', '.join(map(str, sorted(locations)))
                print(f"{seq_ident} ({len(locations)}): {joined}")
            return

        print(f'Wrong show command ({command})!')
コード例 #3
0
def check_annotations(files_or_dirs, filter_type, output_filename):
    """Check the feature annotations of sequence files and report per-file counts.

    For every file returned by ``files_from_args(files_or_dirs, '.gb')`` the
    sequence is read and its features (optionally restricted by type) are
    counted: features without a location, features without a name, and
    unique versus duplicated names as split by
    ``split_features_in_uniq_dupl``. A table and short statistics are
    printed; the full result is also written with ``rows_2_excel`` when
    ``output_filename`` is given.

    Args:
        files_or_dirs: Passed to ``files_from_args`` to locate input files.
        filter_type: Optional iterable of feature type names; matching is
            case-insensitive. A falsy value keeps all features.
        output_filename: Optional file to store the result in.
    """
    from .helpers import read_sequence, feature_qualifiers_to_desc
    from common_utils.file_utils import files_from_args
    from common_utils.show import print_table
    from common_utils.value_data_types import rows_2_excel

    # The type filter does not depend on the file, so normalise it once
    # instead of on every loop iteration.
    wanted_types = {t.lower() for t in filter_type} if filter_type else None

    checks = []
    for f_name in files_from_args(files_or_dirs, '.gb'):
        print(f'Processing file: {f_name}')
        seq_rec = read_sequence(f_name)

        # Extract features
        if wanted_types is not None:
            features = [f for f in seq_rec.features if f.type.lower() in wanted_types]
        else:
            features = seq_rec.features

        # Features without a location are counted, then left out of the
        # name and duplicate checks.
        without_location = [f for f in features if not f.location]
        if without_location:  # Fix if needed
            features = [f for f in features if f.location]
        without_name = [f for f in features if not feature_qualifiers_to_desc(f, do_assert=True)]
        uniq, duplicated = split_features_in_uniq_dupl(len(seq_rec), features)

        checks.append(dict(
            filename=f_name,
            num_without_location=len(without_location),
            without_location=', '.join(sorted(feature_qualifiers_to_desc(f) for f in without_location)),
            num_without_name=len(without_name),
            num_unique_names=len(uniq),
            num_unique_features=sum(len(_l) for _l in uniq.values()),
            num_duplicated_names=len(duplicated),
            num_duplicated_features=sum(len(_l) for _l in duplicated.values()),
        ))

    # Print and save
    print_table(['File', 'No loc', 'No name', 'Uniq names', 'Uniq fs', 'Dup. names', 'Dup. fs'],
                [[c['filename'], c['num_without_location'], c['num_without_name'],
                 c['num_unique_names'], c['num_unique_features'],
                 c['num_duplicated_names'], c['num_duplicated_features']] for c in checks])

    print(f"""
Statistics, num sequences with features that:
 - do not have location  : {sum(1 for c in checks if c['num_without_location'])}
 - do not have name      : {sum(1 for c in checks if c['num_without_name'])}
 - have duplicated names : {sum(1 for c in checks if c['num_duplicated_names'])}
""")
    if output_filename:
        # ToDo: in more formats?
        rows_2_excel(output_filename,
                     ['Filename', 'Num without location', 'Without location', 'Num without name',
                      'Num uniq names', 'Num uniq features',
                      'Num duplicated names', 'Num duplicated features'],
                     [[c['filename'], c['num_without_location'], c['without_location'], c['num_without_name'],
                      c['num_unique_names'], c['num_unique_features'],
                      c['num_duplicated_names'], c['num_duplicated_features']] for c in checks])
コード例 #4
0
ファイル: steps.py プロジェクト: CroP-BioDiv/zcitools
    def show_data(self, params=None):
        """Print the table's columns and, unless suppressed, a preview of its rows.

        Args:
            params: Optional collection of strings. When it contains
                ``'columns'`` only the column listing is printed; otherwise
                (including when it is omitted) the rows are printed as well,
                with ``show_limit=7`` passed to ``print_table``.

        Prints a notice and returns early when the step is not completed.
        """
        if not self.is_completed():
            print('Table step is not completed!')
            return

        print('Columns:')
        print_table(None, self._columns)

        # params defaults to None, which does not support the membership
        # test, so treat a missing/empty params as "show everything".
        if not params or 'columns' not in params:
            print('\nData:')
            print_table([c for c, _ in self._columns],
                        self.get_rows(),
                        show_limit=7)
コード例 #5
0
 def show_data(self, params=None):
     """Print a two-column table: image identifier and its sorted, comma-separated files.

     ``params`` is accepted but not used.
     """
     header = ['image_ident', 'Files']
     rows = []
     for ident, files in sorted(self._images.items()):
         rows.append([ident, ', '.join(sorted(files))])
     print_table(header, rows)