def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None):
    """Returns an info dict for a given contigs db.

    Parameters
    ==========
    contigs_db_path
        Path of the contigs database. It is handed to `ContigsSuperclass` (as
        `args.contigs_db`) and to `completeness.Completeness`.
    run, progress
        Reporting objects forwarded to `ContigsSuperclass`. NOTE: `verbose` is set
        to False on the very objects that are passed in, not on copies.
    include_AA_counts
        When true, 'AA_counts' and 'total_AAs' are added to the result from
        `c.get_AA_counts_dict()`.
    split_names
        Optional iterable of split names. When it yields a non-empty set, the
        statistics are computed only for those splits; otherwise they are computed
        for everything in the database.

    Returns
    =======
    dict
        'path', every key found in `c.a_meta`, and the computed keys 'gc_content',
        'num_genes', 'gene_caller_ids', 'avg_gene_length' and 'num_genes_per_kb'.
        'total_length' and 'num_splits' are computed here only when `split_names`
        is used. 'percent_complete' and 'percent_redundancy' are present only when
        the completeness results contain a 'Campbell_et_al' entry.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # silence the reporting objects before the superclass starts using them
    run.verbose = False
    progress.verbose = False

    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

    # re-test after the conversion: the branch is chosen based on the resulting set
    if split_names:
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])

        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = set([e['gene_callers_id']
                                            for e in c.genes_in_splits.values()
                                            if e['split'] in split_names])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])

        # partial gene calls are NOT filtered out in this branch
        info_dict['avg_gene_length'] = numpy.mean([(c.genes_in_contigs_dict[gene_caller_id]['stop'] -
                                                    c.genes_in_contigs_dict[gene_caller_id]['start'])
                                                   for gene_caller_id in info_dict['gene_caller_ids']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])

        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())

        # only gene calls that are not flagged as partial contribute to the average here
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start'])
                                                   for gene in c.genes_in_contigs_dict.values()
                                                   if not gene['partial']])

        # NOTE(review): 'total_length' is not assigned in this branch; it is presumably
        # one of the keys copied from c.a_meta above -- confirm.
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    # `in` instead of dict.has_key(): has_key() does not exist in Python 3
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None):
    """Returns an info dict for a given contigs db.

    `contigs_db_path` is given to `ContigsSuperclass` (through a minimal args object
    exposing `contigs_db`) and to `completeness.Completeness`. `run` and `progress`
    are forwarded to `ContigsSuperclass` after their `verbose` attribute has been set
    to False; this modifies the objects passed in.

    If `split_names` yields a non-empty set, sequence and gene statistics are limited
    to those splits and 'total_length' / 'num_splits' are computed from them.
    Otherwise all contigs and all gene calls are used.

    The returned dict holds 'path', every key of `c.a_meta`, 'gc_content',
    'num_genes', 'gene_caller_ids', 'avg_gene_length' and 'num_genes_per_kb'.
    'percent_complete' / 'percent_redundancy' are added only if the completeness
    results have a 'Campbell_et_al' entry, and 'AA_counts' / 'total_AAs' only if
    `include_AA_counts` is true.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # make both reporters quiet; this is visible to the caller's objects as well
    run.verbose = False
    progress.verbose = False

    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    if split_names:
        split_names = set(split_names)

    # the set built above decides which of the two strategies is used
    if split_names:
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        genes = c.genes_in_contigs_dict

        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = set([e['gene_callers_id']
                                            for e in c.genes_in_splits.values()
                                            if e['split'] in split_names])
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])

        # every gene call found in the splits is averaged, partial or not
        info_dict['avg_gene_length'] = numpy.mean([(genes[gene_caller_id]['stop'] - genes[gene_caller_id]['start'])
                                                   for gene_caller_id in info_dict['gene_caller_ids']])
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])

        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())

        # gene calls flagged as partial are left out of the average in this branch
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start'])
                                                   for gene in c.genes_in_contigs_dict.values()
                                                   if not gene['partial']])

        # NOTE(review): this relies on 'total_length' already being in info_dict,
        # presumably copied from c.a_meta -- verify.
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    # membership test with `in` works on Python 2 and 3, whereas has_key() is Python 2 only
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db.

    Parameters
    ==========
    contigs_db_path
        Path of the contigs database. It is handed to `ContigsSuperclass` (as
        `args.contigs_db`) and to `completeness.Completeness`.
    run, progress
        Reporting objects forwarded to `ContigsSuperclass`. NOTE: `verbose` is set
        to False on the very objects that are passed in.
    include_AA_counts
        When true, 'AA_counts' and 'total_AAs' are added from `c.get_AA_counts_dict()`.
    split_names
        Optional iterable of split names that restricts the sequence and the gene
        calls taken into account. When empty or None, everything is used.
    exclude_partial_gene_calls
        When true, gene calls whose 'partial' flag is set are moved to
        'excluded_gene_ids' instead of 'gene_caller_ids'.

    Returns
    =======
    dict
        'path', every key found in `c.a_meta`, and 'gc_content', 'total_length',
        'gene_caller_ids', 'excluded_gene_ids', 'num_genes', 'gene_lengths',
        'avg_gene_length' and 'num_genes_per_kb'. 'percent_complete' and
        'percent_redundancy' are present only when the completeness results
        contain a 'Campbell_et_al' entry.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # silence the reporting objects before the superclass starts using them
    run.verbose = False
    progress.verbose = False

    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set([e['gene_callers_id']
                                         for e in c.genes_in_splits.values()
                                         if e['split'] in split_names])
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    # These must be stored BEFORE 'num_genes_per_kb' is computed below. Otherwise the
    # density would be computed against whatever 'total_length' was copied from
    # c.a_meta (or raise a KeyError if there is none) instead of the length of `seq`.
    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set([])
    excluded_gene_ids = set([])
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = dict([(gene_caller_id,
                                       (c.genes_in_contigs_dict[gene_caller_id]['stop'] -
                                        c.genes_in_contigs_dict[gene_caller_id]['start']))
                                      for gene_caller_id in gene_caller_ids])

    # list(): on Python 3 dict.values() is a view, which numpy.mean cannot average
    info_dict['avg_gene_length'] = numpy.mean(list(info_dict['gene_lengths'].values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db.

    Parameters
    ==========
    contigs_db_path
        Path of the contigs database. It is handed to `ContigsSuperclass` (as
        `args.contigs_db`) and to `completeness.Completeness`.
    run, progress
        Reporting objects forwarded to `ContigsSuperclass`. NOTE: `verbose` is set
        to False on the very objects that are passed in.
    include_AA_counts
        When true, 'AA_counts' and 'total_AAs' are added from `c.get_AA_counts_dict()`.
    split_names
        Optional iterable of split names that restricts the sequence and the gene
        calls taken into account. When empty or None, everything is used.
    exclude_partial_gene_calls
        When true, gene calls whose 'partial' flag is set are moved to
        'excluded_gene_ids' instead of 'gene_caller_ids'.

    Returns
    =======
    dict
        'path', every key found in `c.a_meta`, and 'gc_content', 'total_length',
        'gene_caller_ids', 'excluded_gene_ids', 'num_genes', 'gene_lengths',
        'avg_gene_length', 'num_genes_per_kb', 'percent_complete',
        'percent_redundancy', 'scg_domain' and 'scg_domain_confidence'.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # silence the reporting objects before the superclass starts using them
    run.verbose = False
    progress.verbose = False

    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set([e['gene_callers_id']
                                         for e in c.genes_in_splits.values()
                                         if e['split'] in split_names])
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    # stored up front because 'num_genes_per_kb' below divides by 'total_length'
    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set([])
    excluded_gene_ids = set([])
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = dict([(gene_caller_id,
                                       (c.genes_in_contigs_dict[gene_caller_id]['stop'] -
                                        c.genes_in_contigs_dict[gene_caller_id]['start']))
                                      for gene_caller_id in gene_caller_ids])

    # list(): on Python 3 dict.values() is a view, which numpy.mean cannot average
    info_dict['avg_gene_length'] = numpy.mean(list(info_dict['gene_lengths'].values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates; without split names, every split
    # listed in c.splits_basic_info is used
    splits_of_interest = split_names if split_names else set(c.splits_basic_info.keys())

    # five values are unpacked from the call; `results_dict` is not used further
    p_completion, p_redundancy, domain, domain_confidence, results_dict = \
        completeness.Completeness(contigs_db_path).get_info_for_splits(splits_of_interest)

    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict