Ejemplo n.º 1
0
def get_contigs_db_info_dict(contigs_db_path,
                             run=run,
                             progress=progress,
                             include_AA_counts=False,
                             split_names=None):
    """Return a dict of summary information for a given contigs db.

    The dict starts with ``'path'`` plus every key/value found in the
    ``a_meta`` attribute of the ``ContigsSuperclass`` instance, and is then
    extended with ``gc_content``, ``gene_caller_ids``, ``num_genes``,
    ``avg_gene_length`` and ``num_genes_per_kb``.

    Args:
        contigs_db_path: Path handed to ``ContigsSuperclass`` (as
            ``args.contigs_db``) and to ``completeness.Completeness``.
        run: Reporting object forwarded to ``ContigsSuperclass``; its
            ``verbose`` attribute is set to ``False`` as a side effect.
        progress: Progress object forwarded to ``ContigsSuperclass``; its
            ``verbose`` attribute is set to ``False`` as a side effect.
        include_AA_counts: When true, ``AA_counts`` and ``total_AAs`` are
            added from ``c.get_AA_counts_dict``.
        split_names: Optional iterable of split names. When non-empty, the
            statistics are computed from these splits only and
            ``total_length`` / ``num_splits`` are set from them.

    Returns:
        The populated info dict. ``percent_complete`` and
        ``percent_redundancy`` are present only when the completeness
        results contain a ``'Campbell_et_al'`` entry.
    """
    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    # Silence reporting; note that this mutates the objects passed in.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(Args(), r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Normalize to a set first, so that an iterable which turns out to be
    # empty falls through to the whole-database branch below.
    if split_names:
        split_names = set(split_names)

    if split_names:
        c.init_split_sequences()
        seq = ''.join(c.split_sequences[split_name]
                      for split_name in split_names)
        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = {
            e['gene_callers_id'] for e in c.genes_in_splits.values()
            if e['split'] in split_names
        }
        info_dict['num_genes'] = len(info_dict['gene_caller_ids'])
        genes = c.genes_in_contigs_dict
        info_dict['avg_gene_length'] = numpy.mean([
            genes[gene_caller_id]['stop'] - genes[gene_caller_id]['start']
            for gene_caller_id in info_dict['gene_caller_ids']
        ])
        info_dict['num_genes_per_kb'] = (
            info_dict['num_genes'] * 1000.0 / info_dict['total_length'])
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join(e['sequence'] for e in c.contig_sequences.values())
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        # Only gene calls whose 'partial' entry is falsy contribute to the
        # average here, while 'num_genes' above counts every gene call.
        info_dict['avg_gene_length'] = numpy.mean([
            gene['stop'] - gene['start']
            for gene in c.genes_in_contigs_dict.values()
            if not gene['partial']
        ])
        # NOTE(review): 'total_length' is not set in this branch, so it is
        # presumably supplied by c.a_meta -- confirm against the db schema.
        info_dict['num_genes_per_kb'] = (
            info_dict['num_genes'] * 1000.0 / info_dict['total_length'])

    # get completeness / contamination estimates
    splits_of_interest = (split_names if split_names
                          else set(c.splits_basic_info.keys()))
    comp = completeness.Completeness(contigs_db_path).get_info_for_splits(
        splits_of_interest)

    # `in` instead of dict.has_key(), which no longer exists in Python 3.
    if 'Campbell_et_al' in comp:
        campbell = comp['Campbell_et_al']
        info_dict['percent_complete'] = campbell['percent_complete']
        info_dict['percent_redundancy'] = campbell['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
Ejemplo n.º 2
0
def get_contigs_db_info_dict(contigs_db_path, run = run, progress = progress, include_AA_counts = False, split_names = None):
    """Returns an info dict for a given contigs db.

    The result holds 'path', every entry of the `a_meta` attribute of the
    `ContigsSuperclass` instance created for `contigs_db_path`, and the
    computed keys 'gc_content', 'gene_caller_ids', 'num_genes',
    'avg_gene_length' and 'num_genes_per_kb'.

    Parameters:
        contigs_db_path: passed to `ContigsSuperclass` (as `args.contigs_db`)
            and to `completeness.Completeness`.
        run, progress: forwarded to `ContigsSuperclass`. Their `verbose`
            attribute is set to False here, which affects the caller's
            objects too.
        include_AA_counts: if true, 'AA_counts' and 'total_AAs' are copied
            from the dict returned by `c.get_AA_counts_dict`.
        split_names: optional iterable of split names. If non-empty, the
            numbers are computed for those splits only and 'total_length'
            and 'num_splits' are set accordingly.

    'percent_complete' and 'percent_redundancy' are only added when the
    completeness results have a 'Campbell_et_al' entry.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # Side effect: the run/progress objects handed in are silenced.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r = run, p = progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Convert before branching: an iterable that yields nothing becomes an
    # empty set and is then handled like "no split names given".
    if split_names:
        split_names = set(split_names)

    if split_names:
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        gene_caller_ids = set(e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names)
        gene_lengths = []
        for gene_caller_id in gene_caller_ids:
            gene_call = c.genes_in_contigs_dict[gene_caller_id]
            gene_lengths.append(gene_call['stop'] - gene_call['start'])

        info_dict['total_length'] = len(seq)
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['gene_caller_ids'] = gene_caller_ids
        info_dict['num_genes'] = len(gene_caller_ids)
        info_dict['avg_gene_length'] = numpy.mean(gene_lengths)
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']
        info_dict['num_splits'] = len(split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        info_dict['gc_content'] = sequence.Composition(seq).GC_content
        info_dict['num_genes'] = len(c.genes_in_contigs_dict)
        info_dict['gene_caller_ids'] = set(c.genes_in_contigs_dict.keys())
        # The average skips gene calls flagged 'partial'; 'num_genes' does not.
        info_dict['avg_gene_length'] = numpy.mean([(gene['stop'] - gene['start']) for gene in c.genes_in_contigs_dict.values() if not gene['partial']])
        # NOTE(review): 'total_length' is never assigned in this branch; it
        # presumably arrives via c.a_meta -- confirm.
        info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        target_splits = split_names
    else:
        target_splits = set(c.splits_basic_info.keys())

    comp = completeness.Completeness(contigs_db_path).get_info_for_splits(target_splits)

    # dict.has_key() is gone in Python 3; the `in` operator works everywhere.
    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names = split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
Ejemplo n.º 3
0
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db.

    The dict contains 'path', every entry of the `a_meta` attribute of the
    `ContigsSuperclass` instance, and the computed keys 'total_length',
    'gc_content', 'gene_caller_ids', 'excluded_gene_ids', 'num_genes',
    'gene_lengths', 'avg_gene_length' and 'num_genes_per_kb'.

    Parameters:
        contigs_db_path: passed to `ContigsSuperclass` (as `args.contigs_db`)
            and to `completeness.Completeness`.
        run, progress: forwarded to `ContigsSuperclass`; their `verbose`
            attribute is set to False as a side effect.
        include_AA_counts: if true, 'AA_counts' and 'total_AAs' are copied
            from the dict returned by `c.get_AA_counts_dict`.
        split_names: optional iterable of split names; if non-empty, the
            sequence and candidate genes are limited to those splits.
        exclude_partial_gene_calls: if true, gene calls whose 'partial'
            entry is truthy are collected under 'excluded_gene_ids'
            instead of 'gene_caller_ids'.

    'percent_complete' and 'percent_redundancy' are only added when the
    completeness results have a 'Campbell_et_al' entry.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # Side effect: the run/progress objects handed in are silenced.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set(e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    # Sequence-level numbers must be in place before anything is derived from
    # them: 'num_genes_per_kb' below divides by 'total_length', which would
    # otherwise be missing or stale (not the length of `seq`).
    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set()
    excluded_gene_ids = set()
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    gene_lengths = {}
    for gene_caller_id in gene_caller_ids:
        gene_call = c.genes_in_contigs_dict[gene_caller_id]
        gene_lengths[gene_caller_id] = gene_call['stop'] - gene_call['start']

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = gene_lengths
    # list(): numpy.mean() cannot average a Python 3 dict view directly.
    info_dict['avg_gene_length'] = numpy.mean(list(gene_lengths.values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    if split_names:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(split_names)
    else:
        comp = completeness.Completeness(contigs_db_path).get_info_for_splits(set(c.splits_basic_info.keys()))

    if 'Campbell_et_al' in comp:
        info_dict['percent_complete'] = comp['Campbell_et_al']['percent_complete']
        info_dict['percent_redundancy'] = comp['Campbell_et_al']['percent_redundancy']

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict
Ejemplo n.º 4
0
def get_contigs_db_info_dict(contigs_db_path, run=run, progress=progress, include_AA_counts=False, split_names=None, exclude_partial_gene_calls=True):
    """Returns an info dict for a given contigs db.

    The dict contains 'path', every entry of the `a_meta` attribute of the
    `ContigsSuperclass` instance, and the computed keys 'gc_content',
    'total_length', 'gene_caller_ids', 'excluded_gene_ids', 'num_genes',
    'gene_lengths', 'avg_gene_length', 'num_genes_per_kb',
    'percent_complete', 'percent_redundancy', 'scg_domain' and
    'scg_domain_confidence'.

    Parameters:
        contigs_db_path: passed to `ContigsSuperclass` (as `args.contigs_db`)
            and to `completeness.Completeness`.
        run, progress: forwarded to `ContigsSuperclass`; their `verbose`
            attribute is set to False as a side effect.
        include_AA_counts: if true, 'AA_counts' and 'total_AAs' are copied
            from the dict returned by `c.get_AA_counts_dict`.
        split_names: optional iterable of split names; if non-empty, the
            sequence and candidate genes are limited to those splits.
        exclude_partial_gene_calls: if true, gene calls whose 'partial'
            entry is truthy are collected under 'excluded_gene_ids'
            instead of 'gene_caller_ids'.
    """

    class Args:
        def __init__(self):
            self.contigs_db = contigs_db_path

    args = Args()

    # Side effect: the run/progress objects handed in are silenced.
    run.verbose = False
    progress.verbose = False
    c = ContigsSuperclass(args, r=run, p=progress)

    info_dict = {'path': contigs_db_path}

    for key in c.a_meta:
        info_dict[key] = c.a_meta[key]

    # Two different strategies here depending on whether we work with a given set of split ids or
    # everything in the contigs database.
    if split_names:
        split_names = set(split_names)
        c.init_split_sequences()
        seq = ''.join([c.split_sequences[split_name] for split_name in split_names])
        candidate_gene_caller_ids = set(e['gene_callers_id'] for e in c.genes_in_splits.values() if e['split'] in split_names)
    else:
        c.init_contig_sequences()
        seq = ''.join([e['sequence'] for e in c.contig_sequences.values()])
        candidate_gene_caller_ids = c.genes_in_contigs_dict.keys()

    # 'total_length' has to be set before 'num_genes_per_kb' is derived below.
    info_dict['gc_content'] = sequence.Composition(seq).GC_content
    info_dict['total_length'] = len(seq)

    gene_caller_ids = set()
    excluded_gene_ids = set()
    for gene_caller_id in candidate_gene_caller_ids:
        if c.genes_in_contigs_dict[gene_caller_id]['partial'] and exclude_partial_gene_calls:
            excluded_gene_ids.add(gene_caller_id)
        else:
            gene_caller_ids.add(gene_caller_id)

    # Length of every retained gene call, keyed by its gene caller id.
    gene_lengths = {}
    for gene_caller_id in gene_caller_ids:
        gene_call = c.genes_in_contigs_dict[gene_caller_id]
        gene_lengths[gene_caller_id] = gene_call['stop'] - gene_call['start']

    info_dict['gene_caller_ids'] = gene_caller_ids
    info_dict['excluded_gene_ids'] = excluded_gene_ids
    info_dict['num_genes'] = len(gene_caller_ids)
    info_dict['gene_lengths'] = gene_lengths
    # list(): numpy.mean() cannot average a Python 3 dict view directly.
    info_dict['avg_gene_length'] = numpy.mean(list(gene_lengths.values()))
    info_dict['num_genes_per_kb'] = info_dict['num_genes'] * 1000.0 / info_dict['total_length']

    # get completeness / contamination estimates
    splits_of_interest = split_names if split_names else set(c.splits_basic_info.keys())
    # The fifth returned value is not used by this function.
    p_completion, p_redundancy, domain, domain_confidence, _results_dict = completeness.Completeness(contigs_db_path).get_info_for_splits(splits_of_interest)

    info_dict['percent_complete'] = p_completion
    info_dict['percent_redundancy'] = p_redundancy
    info_dict['scg_domain'] = domain
    info_dict['scg_domain_confidence'] = domain_confidence

    # lets get all amino acids used in all complete gene calls:
    if include_AA_counts:
        if split_names:
            AA_counts_dict = c.get_AA_counts_dict(split_names=split_names)
        else:
            AA_counts_dict = c.get_AA_counts_dict()

        info_dict['AA_counts'] = AA_counts_dict['AA_counts']
        info_dict['total_AAs'] = AA_counts_dict['total_AAs']

    return info_dict