def select_data(step_data, result, steps, select, where_part, group_by_part, having_part, order_by_part,
                output_filename=None):
    """Run a select query over the given steps and deliver the fetched rows.

    The query parts (``select``, ``where_part``, ``group_by_part``,
    ``having_part``, ``order_by_part``) are passed unchanged to
    ``StepDatabase.select_all_tables``, which returns the column
    descriptions and the rows.

    The first letter of ``result`` (case-insensitive) selects the output:

    * ``e`` - write the data to an Excel file; ``output_filename`` is used
      when given, otherwise ``out.xlsx``.
    * ``s`` - store the data in a new ``TableStep`` (created with
      ``steps[0].project`` and ``step_data``), save it and return it.
    * anything else, including an empty ``result`` - print the columns,
      the data (with ``show_limit=7``) and the number of rows.

    Returns:
        The saved ``TableStep`` for the step output, otherwise ``None``.
    """
    # Fetch data
    with StepDatabase(steps) as db:
        column_data_types, rows = db.select_all_tables(
            select, where_part, group_by_part, having_part, order_by_part)

    # Only the first letter matters. Slicing (instead of indexing) keeps an
    # empty or missing result from raising; it then falls through to printing.
    output_kind = (result or '').lower()[:1]

    if output_kind == 'e':
        # Excel
        table_data_2_excel(output_filename or 'out.xlsx', column_data_types, rows)
        return None

    if output_kind == 's':
        # Step
        step = TableStep(steps[0].project, step_data, remove_data=True)
        step.set_table_data(rows, column_data_types)
        step.save()
        return step

    # Print
    print('Columns:')
    print_table(None, column_data_types)
    print('\nData:')
    # column_data_types is iterated as pairs; the first item is used as the header
    print_table([c for c, _ in column_data_types], rows, show_limit=7)
    print(f'\nNumber of rows: {len(rows)}')
    return None
def show_data(self, params=None):
    """Print a report about the features of the stored sequences.

    Items of ``params`` that are members of ``self._sequences`` are taken
    out and used as a sequence filter. The first remaining item is the show
    command (case-insensitive); without one, ``by_type`` is used.

    Commands:
        by_type         - table with length, number of features and a
                          per-type count for every sequence
        genes / cds     - all gene / CDS features
        repeated_genes / repeated_cds - repeated gene / CDS features
        shared_genes / shared_cds     - shared gene / CDS features
        ir              - locations of ``repeat_region`` features

    Any other command prints an error message.
    """
    # Known sequence identifiers among the params act as a filter
    filter_seqs = None
    if params:
        filter_seqs = self._sequences & set(params)
        params = [p for p in params if p not in filter_seqs]  # Remove processed params

    if params:
        cmd = params[0].lower()
        params = params[1:]  # Arguments after the command; not used below
    else:
        cmd = 'by_type'  # Default print

    # command -> (name of the method that prints, feature type to collect)
    feature_commands = {
        # Genes
        'genes': ('_all_features', 'gene'),
        'repeated_genes': ('_repeated_features', 'gene'),
        'shared_genes': ('_shared_features', 'gene'),
        # CDSs
        'cds': ('_all_features', 'CDS'),
        'repeated_cds': ('_repeated_features', 'CDS'),
        'shared_cds': ('_shared_features', 'CDS'),
    }

    if cmd == 'by_type':
        found_types = set()
        summaries = {}  # seq_ident -> dict(length=int, features=int, <type>=str)
        for seq_ident, seq_record in self._iterate_records(filter_seqs=filter_seqs):
            descs_by_type = defaultdict(list)
            for feature in seq_record.features:
                if feature.type == 'source':
                    continue
                descs_by_type[feature.type].append(feature_qualifiers_to_desc(feature))
            found_types.update(descs_by_type)

            summary = {'length': len(seq_record.seq), 'features': len(seq_record.features)}
            for f_type, descs in descs_by_type.items():
                if f_type == 'repeat_region':
                    summary[f_type] = str(len(descs))
                else:
                    # number of features / number of distinct descriptions
                    summary[f_type] = f"{len(descs)}/{len(set(descs))}"
            summaries[seq_ident] = summary

        type_columns = sorted(found_types)
        rows = []
        for seq_ident, summary in sorted(summaries.items()):
            type_cells = [summary.get(t, '') for t in type_columns]
            rows.append([seq_ident, summary['length'], summary['features']] + type_cells)
        rows.sort()
        print_table(['seq_ident', 'Length', 'Features'] + type_columns, rows)

    elif cmd in feature_commands:
        method_name, feature_type = feature_commands[cmd]
        show_features = getattr(self, method_name)
        show_features(self._get_feature_desc(feature_type, filter_seqs=filter_seqs))

    # IR
    elif cmd == 'ir':
        # seq_ident -> list of feature locations
        locations_by_seq = {}
        for seq_ident, seq_record in self._iterate_records(filter_seqs=filter_seqs):
            locations_by_seq[seq_ident] = [
                feature_location_desc(feature.location)
                for feature in seq_record.features
                if feature.type == 'repeat_region'
            ]
        for seq_ident, locations in sorted(locations_by_seq.items()):
            joined = ', '.join(map(str, sorted(locations)))
            print(f"{seq_ident} ({len(locations)}): {joined}")

    else:
        print(f'Wrong show command ({cmd})!')
def check_annotations(files_or_dirs, filter_type, output_filename):
    """Check feature annotations of sequence files and report the findings.

    For every file returned by ``files_from_args(files_or_dirs, '.gb')`` the
    sequence is read and its features are checked for: missing location,
    missing name, and unique versus duplicated names (as split by
    ``split_features_in_uniq_dupl``).

    A per-file table and overall statistics are printed. When
    ``output_filename`` is set, the per-file results are also written with
    ``rows_2_excel``.

    Args:
        files_or_dirs: Passed to ``files_from_args`` to locate the files.
        filter_type: Optional iterable of feature types; when non-empty only
            features of these types (case-insensitive) are checked.
        output_filename: Optional name of the Excel file to write.
    """
    from .helpers import read_sequence, feature_qualifiers_to_desc
    from common_utils.file_utils import files_from_args
    from common_utils.show import print_table
    from common_utils.value_data_types import rows_2_excel

    # Normalise the wanted feature types once, not again for every file
    wanted_types = {t.lower() for t in filter_type} if filter_type else None

    checks = []
    for f_name in files_from_args(files_or_dirs, '.gb'):
        print(f'Processing file: {f_name}')
        seq_rec = read_sequence(f_name)

        # Extract features
        if wanted_types is not None:
            features = [f for f in seq_rec.features if f.type.lower() in wanted_types]
        else:
            features = seq_rec.features

        # Features without a location are reported and left out of the other checks
        without_location = [f for f in features if not f.location]
        if without_location:  # Fix if needed
            features = [f for f in features if f.location]

        # NOTE(review): do_assert=True may make the helper raise for an unnamed
        # feature instead of returning a falsy value - confirm against helpers.
        without_name = [f for f in features if not feature_qualifiers_to_desc(f, do_assert=True)]
        uniq, duplicated = split_features_in_uniq_dupl(len(seq_rec), features)

        checks.append(dict(
            filename=f_name,
            num_without_location=len(without_location),
            without_location=', '.join(sorted(feature_qualifiers_to_desc(f) for f in without_location)),
            num_without_name=len(without_name),
            num_unique_names=len(uniq),
            num_unique_features=sum(len(_l) for _l in uniq.values()),
            num_duplicated_names=len(duplicated),
            num_duplicated_features=sum(len(_l) for _l in duplicated.values()),
        ))

    def _num_files_with(key):
        # Number of checked files whose count stored under `key` is non-zero
        return sum(1 for c in checks if c[key])

    # Print and save
    print_table(
        ['File', 'No loc', 'No name', 'Uniq names', 'Uniq fs', 'Dup. names', 'Dup. fs'],
        [[c['filename'], c['num_without_location'], c['num_without_name'],
          c['num_unique_names'], c['num_unique_features'],
          c['num_duplicated_names'], c['num_duplicated_features']]
         for c in checks])
    print(f"""
Statistics, num sequences with features that:
 - do not have location : {_num_files_with('num_without_location')}
 - do not have name : {_num_files_with('num_without_name')}
 - have duplicated names : {_num_files_with('num_duplicated_names')}
""")

    if output_filename:
        # ToDo: in more formats?
        rows_2_excel(
            output_filename,
            ['Filename', 'Num without location', 'Without location', 'Num without name',
             'Num uniq names', 'Num uniq features', 'Num duplicated names', 'Num duplicated features'],
            [[c['filename'], c['num_without_location'], c['without_location'], c['num_without_name'],
              c['num_unique_names'], c['num_unique_features'],
              c['num_duplicated_names'], c['num_duplicated_features']]
             for c in checks])
def show_data(self, params=None):
    """Print the table's columns and, unless asked for columns only, its data.

    Nothing but a notice is printed when the step is not completed.

    Args:
        params: Optional collection of show arguments. When it contains
            ``'columns'`` only the column description is printed. ``None``
            or an empty collection prints the data as well.
    """
    if not self.is_completed():
        print('Table step is not completed!')
        return

    print('Columns:')
    print_table(None, self._columns)

    # params defaults to None, and a membership test on None raises TypeError,
    # so check it is set before looking for the 'columns' switch.
    if params and 'columns' in params:
        return

    print('\nData:')
    # self._columns is iterated as pairs; the first item is used as the header
    print_table([c for c, _ in self._columns], self.get_rows(), show_limit=7)
def show_data(self, params=None):
    """Print a table with one row per image: its identifier and its files.

    Rows follow the sorted items of ``self._images``; the files of each
    image are sorted and joined with ``', '``. ``params`` is accepted but
    not used.
    """
    rows = []
    for image_ident, files in sorted(self._images.items()):
        file_list = ', '.join(sorted(files))
        rows.append([image_ident, file_list])
    print_table(['image_ident', 'Files'], rows)