Example #1
0
def filter_seqs(input_paths=INPUT_PATHS,
                input_format=SEQFILE_FORMAT,
                output_path=OUTPUT_PATH,
                output_format=SEQFILE_FORMAT,
                min_length=MIN_LENGTH,
                max_length=MAX_LENGTH,
                counted_char=COUNTED_CHAR,
                min_count=MIN_COUNT,
                max_count=MAX_COUNT,
                min_prop=MIN_PROP,
                max_prop=MAX_PROP):
    """Copy the records that pass the length and character filters.

    Every file of ``input_paths`` is parsed with ``Bio.SeqIO.parse`` and the
    records satisfying all criteria are written, in input order, to
    ``output_path``.

    Args:
        input_paths: Iterable of paths of the sequence files to read.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        output_path: Path of the file receiving the kept records.
        output_format: Format name handed to ``write_records``.
        min_length: Records shorter than this are dropped.
        max_length: Records longer than this are dropped; ``None`` disables
            the upper bound.
        counted_char: Substring counted in each sequence; ``None`` disables
            the four count/proportion criteria below.
        min_count: Minimum number of occurrences of ``counted_char``.
        max_count: Maximum number of occurrences; ``None`` disables it.
        min_prop: Minimum percentage (``100.0 * count / length``).
        max_prop: Maximum percentage (same scale as ``min_prop``).
    """
    with open_file(output_path, 'w') as output_file:
        for input_path in input_paths:
            with open_file(input_path) as input_file:
                for record in Bio.SeqIO.parse(input_file, input_format):
                    length = len(record)
                    if length < min_length:
                        continue
                    if max_length is not None and length > max_length:
                        continue
                    if counted_char is not None:
                        count = record.seq.count(counted_char)
                        # A zero-length record has no occurrences; report a
                        # proportion of 0 instead of dividing by zero.
                        prop = 100.0 * count / length if length else 0.0
                        if count < min_count:
                            continue
                        if max_count is not None and count > max_count:
                            continue
                        if prop < min_prop:
                            continue
                        if prop > max_prop:
                            continue
                    write_records(record, output_file, output_format)
Example #2
0
def _parallelize_unindexed_sep_part(target_function, input_paths_part,
                                    output_paths_part, input_format,
                                    output_format, queue, **kwargs):
    """Apply ``target_function`` to each input/output pair of this part.

    Input and output paths are paired positionally with ``zip``. Each input
    is parsed with ``Bio.SeqIO.parse`` and handed to ``target_function``
    together with the opened output file, ``output_format`` and ``kwargs``.
    The value it returns is put on ``queue`` (one item per pair).
    """
    path_pairs = zip(input_paths_part, output_paths_part)
    for source_path, destination_path in path_pairs:
        with open_file(source_path) as source:
            with open_file(destination_path, 'w') as destination:
                parsed = Bio.SeqIO.parse(source, input_format)
                queue.put(target_function(parsed, destination,
                                          output_format, **kwargs))
Example #3
0
def count_seqs(input_paths,
               input_format=SEQFILE_FORMAT,
               count_seqs=COUNT_SEQS,
               count_bases=COUNT_BASES,
               ignore_case=IGNORE_CASE):
    """Count records and/or individual symbols for each input file.

    Args:
        input_paths: Iterable of paths of the sequence files to read.
        input_format: Format name handed to ``Bio.SeqIO.parse``.
        count_seqs: When true, count the records of each file.
        count_bases: When true, count every symbol of every sequence.
        ignore_case: When true, sequences are upper-cased before their
            symbols are counted.

    Returns:
        A ``(seq_counts, base_counts)`` tuple. ``seq_counts`` maps each
        input path to its number of records (0 when ``count_seqs`` is
        false). ``base_counts`` maps each symbol to a dict of
        ``{input_path: occurrences}`` and stays empty when ``count_bases``
        is false.
    """
    seq_counts = {}
    base_counts = {}
    for input_path in input_paths:
        nb_records = 0
        with open_file(input_path) as handle:
            for record in Bio.SeqIO.parse(handle, input_format):
                if count_seqs:
                    nb_records += 1
                if not count_bases:
                    continue
                symbols = record.seq.upper() if ignore_case else record.seq
                for symbol in symbols:
                    # Create the per-symbol table on first sight, then bump
                    # the counter belonging to the current file.
                    per_path = base_counts.setdefault(symbol, {})
                    per_path[input_path] = per_path.get(input_path, 0) + 1
        seq_counts[input_path] = nb_records
    return seq_counts, base_counts
Example #4
0
def export_tree(tree_path, terms, types, levels, obsolete, alt, main_ids,
                children, parents, ancestors):
    """Write the GO tree as a tab-separated table to ``tree_path``.

    A header line is written first, then one row per key of ``terms`` in
    sorted order. A term missing from ``levels`` gets the level ``'NA'``;
    a term missing from ``main_ids`` gets an empty main ID. For each of
    ``children``, ``parents`` and ``ancestors`` the related IDs are sorted
    and joined with ``', '``; a term missing from a mapping gets an empty
    cell. ``types``, ``obsolete`` and ``alt`` must contain every term.
    """
    def lookup(mapping, key, fallback):
        # Only a missing key triggers the fallback.
        try:
            return mapping[key]
        except KeyError:
            return fallback

    column_names = [
        "#GO_ID", "term", "type", "level", "is_obsolete", "is_alternative",
        "main_id", "children", "parents", "ancestors"
    ]

    with open_file(tree_path, 'w') as tree_file:
        writer = csv.writer(tree_file, dialect='excel-tab')
        writer.writerow(column_names)

        for go_id in sorted(terms):
            level = lookup(levels, go_id, 'NA')
            main_id = lookup(main_ids, go_id, '')
            cells = [
                go_id,
                terms[go_id],
                types[go_id],
                level,
                str(obsolete[go_id]),
                str(alt[go_id]),
                main_id,
            ]
            for relation in (children, parents, ancestors):
                try:
                    cell = ', '.join(sorted(relation[go_id]))
                except KeyError:
                    cell = ''
                cells.append(cell)
            writer.writerow(cells)
Example #5
0
def export_table_with_seqids_expanded(results, seqids, info, info_header,
                                      output_path, export_level_one_seqids):
    """Write the results table with one output line per sequence ID.

    For every result row whose GO ID (first cell) has sequence IDs, the
    first line carries the full row plus the first sequence ID and each
    following line carries blank result cells plus the next sequence ID.
    When ``info`` is given, the extra columns found for a sequence ID are
    appended to its line.

    Args:
        results: Mapping ``level -> key -> row``; levels and keys are
            written in sorted order.
        seqids: Mapping ``GO ID -> iterable of sequence IDs``. It is not
            modified.
        info: Optional mapping ``sequence ID -> list of extra cells``, or
            ``None``.
        info_header: Column names appended to the header when ``info`` is
            not ``None``.
        output_path: Path of the tab-separated file to write.
        export_level_one_seqids: When false, rows of level below 2 are
            written without sequence IDs.
    """
    with open_file(output_path, 'w') as output_file:
        table = csv.writer(output_file, dialect='excel-tab')
        header = [
            'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
            'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.',
            'Sequence ID'
        ]
        if info is not None:
            header += info_header
        table.writerow(header)
        for level in sorted(results):
            for key in sorted(results[level]):
                row = results[level][key]
                go_id = row[0]
                if level < 2 and not export_level_one_seqids:
                    table.writerow(row)
                    continue
                # sorted() works on any iterable (lists, sets, ...) and
                # leaves the caller's collection untouched.
                lcl_seqids = sorted(seqids.get(go_id) or ())
                if not lcl_seqids:
                    # No sequence ID at all: still emit the result row so
                    # the GO term does not vanish from the table.
                    table.writerow(row)
                    continue
                for i, seqid in enumerate(lcl_seqids):
                    # Only the first line of a group repeats the result.
                    final_row = list(row) if i == 0 else [''] * len(row)
                    final_row.append(seqid)
                    if info is not None:
                        gene_info = info.get(seqid)
                        if gene_info is not None:
                            final_row += gene_info
                    table.writerow(final_row)
Example #6
0
def export_table(output_path, seqids, terms, bp, mf, cc, ec, add_ancestors,
                 bp_anc, mf_anc, cc_anc):
    """Write one tab-separated annotation line per sequence ID.

    When ``seqids`` is ``None`` the sorted union of the keys of ``bp``,
    ``mf`` and ``cc`` is used. Each line holds the sequence ID, the three
    cells produced by ``format_terms`` for ``bp``, ``mf`` and ``cc``, the
    three cells for the ``*_anc`` mappings when ``add_ancestors`` is true,
    and finally the cell produced by ``format_ec_codes``.
    """
    if seqids is None:
        seqids = sorted(set(bp) | set(mf) | set(cc))

    column_names = [
        'SeqID', 'Biological process', 'Molecular function',
        'Cellular component'
    ]
    annotation_sources = [bp, mf, cc]
    if add_ancestors:
        column_names.extend([
            'Biological process (with ancestors)',
            'Molecular function (with ancestors)',
            'Cellular component (with ancestors)'
        ])
        annotation_sources.extend([bp_anc, mf_anc, cc_anc])
    column_names.append('Enzyme codes')

    with open_file(output_path, 'w') as output_file:
        writer = csv.writer(output_file, dialect='excel-tab')
        writer.writerow(column_names)
        for seqid in seqids:
            cells = [seqid]
            for annotations in annotation_sources:
                cells.append(format_terms(seqid, annotations, terms))
            cells.append(format_ec_codes(seqid, ec))
            writer.writerow(cells)
Example #7
0
def generate_random_sequences_part(
        output_path, output_format, seq_type, min_length, max_length,
        nb_sequences, prefix, start_index, index_width):
    """Write ``nb_sequences`` random records of type ``seq_type``.

    Args:
        output_path: Path of the sequence file to write.
        output_format: Format name handed to ``write_records``.
        seq_type: One of ``'dna'``, ``'rna'``, ``'aa'``, ``'prot'``,
            ``'prot_stop'`` or ``'cds'``.
        min_length: Lower bound handed to ``define_length``.
        max_length: Upper bound handed to ``define_length``.
        nb_sequences: Number of records to generate.
        prefix: Identifier prefix handed to ``make_seqid``.
        start_index: Offset added to the record number in the identifier.
        index_width: Width argument handed to ``make_seqid``.

    Raises:
        ValueError: If ``seq_type`` is not one of the supported values
            (raised when the first record is generated).
    """
    with open_file(output_path, 'w') as output_file:
        for i in range(nb_sequences):
            seqid = make_seqid(prefix, i + start_index, index_width)
            length = define_length(min_length, max_length)
            if seq_type == 'dna':
                seq = generate_random_seq(length, DNA_NTS)
                alphabet = DNA_ALPHA()
            elif seq_type == 'rna':
                seq = generate_random_seq(length, RNA_NTS)
                alphabet = RNA_ALPHA()
            elif seq_type == 'aa':
                seq = generate_random_seq(length, AMINOACIDS)
                alphabet = PROT_ALPHA()
            elif seq_type == 'prot':
                # One unit fewer is requested; presumably the start residue
                # fills the remaining position -- confirm in
                # generate_random_seq.
                seq = generate_random_seq(length-1, AMINOACIDS, start=START_AA)
                alphabet = PROT_ALPHA()
            elif seq_type == 'prot_stop':
                seq = generate_random_seq(length-1, AMINOACIDS,
                                          start=START_AA, end=STOP_AA)
                alphabet = PROT_ALPHA()
            elif seq_type == 'cds':
                # The length is converted to a number of codons, minus two;
                # presumably room for the start and stop codons -- confirm
                # in generate_random_seq.
                seq = generate_random_seq(length//3-2, CODONS,
                                          start=START_CODONS, end=STOP_CODONS)
                alphabet = DNA_ALPHA()
            else:
                # Without this branch ``seq`` and ``alphabet`` would be
                # unbound below and fail with an opaque UnboundLocalError.
                raise ValueError(
                    'Unknown sequence type: {!r}'.format(seq_type))

            record = Bio.SeqRecord.SeqRecord(
                id=seqid,
                seq=Bio.Seq.Seq(seq, alphabet),
                description=DESCRIPTION)
            write_records(record, output_file, output_format)
Example #8
0
def merge_outputs(tmp_output_paths, output_path):
    """Concatenate the temporary files into ``output_path``, then delete them.

    The temporary files are read line by line with the builtin ``open`` and
    appended in the given order. They are removed only after the merged
    file has been completely written and closed.
    """
    with open_file(output_path, 'w') as merged:
        emit = merged.write
        for part_path in tmp_output_paths:
            with open(part_path) as part:
                for chunk in part:
                    emit(chunk)
    for part_path in tmp_output_paths:
        os.remove(part_path)
Example #9
0
def import_seqids(seqids_path):
    """Read sequence identifiers, one per line, from ``seqids_path``.

    Surrounding whitespace is stripped from every line. Blank lines (for
    instance a trailing empty line) are skipped so that they do not turn
    into empty identifiers.

    Returns:
        The list of identifiers in file order, or ``None`` when
        ``seqids_path`` is ``None``.
    """
    if seqids_path is None:
        return None
    with open_file(seqids_path) as seqids_file:
        stripped_lines = (line.strip() for line in seqids_file)
        return [seqid for seqid in stripped_lines if seqid]
Example #10
0
def import_seqids(seqids_path):
    """Return the set of first-column values of a tab-separated file.

    Empty rows are ignored; only the first cell of each remaining row is
    kept.
    """
    with open_file(seqids_path) as seqids_file:
        rows = csv.reader(seqids_file, dialect='excel-tab')
        return {cells[0] for cells in rows if cells}
Example #11
0
def import_annotations(annot_paths,
                       types,
                       levels,
                       obsolete,
                       main_ids,
                       add_ancestors,
                       ancestors,
                       min_level=MIN_LEVEL,
                       max_level=MAX_LEVEL):
    """Load per-sequence GO and EC annotations from tab-separated files.

    The first column of each row is the sequence ID and the second the
    annotation ID. ``EC:`` annotations are collected in ``ec``. ``GO:``
    annotations are dropped when unknown to ``types`` (reported on stderr),
    obsolete, or outside the ``min_level``/``max_level`` range; the others
    are replaced by their entry of ``main_ids`` when there is one and
    stored by namespace through ``add_annotation``.

    Args:
        annot_paths: Iterable of annotation file paths.
        types: Mapping ``GO ID -> namespace``.
        levels: Mapping ``GO ID -> level``.
        obsolete: Mapping ``GO ID -> bool``.
        main_ids: Mapping ``GO ID -> main GO ID`` for the IDs that have one.
        add_ancestors: When true, the three ``*_anc`` dicts are created and
            handed to ``add_annotation``; otherwise they are ``None``.
        ancestors: Ancestor mapping handed to ``add_annotation``.
        min_level: Lowest level kept.
        max_level: Highest level kept; ``None`` disables the upper bound.

    Returns:
        The tuple ``(bp, mf, cc, ec, bp_anc, mf_anc, cc_anc)``.
    """
    bp, mf, cc, ec = {}, {}, {}, {}
    bp_anc, mf_anc, cc_anc = None, None, None
    if add_ancestors:
        bp_anc, mf_anc, cc_anc = {}, {}, {}

    for annot_path in annot_paths:
        with open_file(annot_path) as annot_file:
            for row in csv.reader(annot_file, dialect='excel-tab'):
                # csv.reader yields [] for blank lines; such rows and rows
                # without an annotation column cannot be interpreted.
                if len(row) < 2:
                    if row:
                        print_stderr(
                            'ERROR: Malformed annotation line: {}.'.format(
                                '\t'.join(row)))
                    continue

                seqid = row[0]
                go_id = row[1]

                if not go_id.startswith(('GO:', 'EC:')):
                    print_stderr(
                        'ERROR: Unknown annotation type: {}.'.format(go_id))
                    continue

                if go_id.startswith('EC:'):
                    updict_add_to_set(ec, seqid, go_id)
                    continue

                try:
                    go_type = types[go_id]
                except KeyError:
                    print_stderr(
                        'ERROR: Annotation not found: {}.'.format(go_id))
                    continue

                if obsolete[go_id]:
                    continue
                # NOTE(review): these comparisons assume a numeric level; a
                # None level would raise TypeError on Python 3 -- confirm
                # that ``levels`` never holds None for non-obsolete terms.
                if levels[go_id] < min_level:
                    continue
                if max_level is not None and levels[go_id] > max_level:
                    continue
                # The filters above use the ID as annotated; only afterwards
                # is it replaced by its main ID, when it has one.
                try:
                    go_id = main_ids[go_id]
                except KeyError:
                    pass

                if go_type == 'biological_process':
                    add_annotation(bp, seqid, go_id, bp_anc, ancestors)
                elif go_type == 'molecular_function':
                    add_annotation(mf, seqid, go_id, mf_anc, ancestors)
                elif go_type == 'cellular_component':
                    add_annotation(cc, seqid, go_id, cc_anc, ancestors)

    return bp, mf, cc, ec, bp_anc, mf_anc, cc_anc
Example #12
0
def export_table(results, output_path):
    """Write the result rows as a tab-separated table to ``output_path``.

    ``results`` maps ``level -> key -> row``. After the header line, rows
    are written level by level in sorted order, and by sorted key within
    each level.
    """
    column_names = [
        'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
        'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.'
    ]
    with open_file(output_path, 'w') as output_file:
        writer = csv.writer(output_file, dialect='excel-tab')
        writer.writerow(column_names)
        for level in sorted(results):
            rows_by_key = results[level]
            writer.writerows(rows_by_key[key] for key in sorted(rows_by_key))
Example #13
0
def import_values(values_path, integers=False):
    """Read one value per line from ``values_path``.

    Lines are stripped of surrounding whitespace and empty ones are
    skipped. With ``integers`` set, every value is converted with ``int``
    (an invalid value raises ``ValueError``); otherwise values stay
    strings.

    Returns:
        The list of values in file order.
    """
    values = []
    with open_file(values_path) as input_file:
        stripped = (raw.strip() for raw in input_file)
        for text in stripped:
            if text:
                values.append(int(text) if integers else text)
    return values
Example #14
0
def extract_records_complex(seqids, descriptions, names, mol_types, taxids,
                            comparison_func, inverse=INVERSE,
                            input_paths=INPUT_PATHS,
                            input_format=SEQFILE_FORMAT,
                            output_path=OUTPUT_PATH,
                            output_format=SEQFILE_FORMAT):
    """Copy the records selected by ``good_record`` to ``output_path``.

    Every record parsed from ``input_paths`` is tested with ``good_record``
    using the given criteria and ``comparison_func``. Records for which the
    test is true are written; with ``inverse`` set, the records for which
    it is false are written instead.
    """
    with open_file(output_path, 'w') as output_file:
        for input_path in input_paths:
            with open_file(input_path) as input_file:
                for record in Bio.SeqIO.parse(input_file, input_format):
                    matches = good_record(
                        record, seqids, descriptions, names,
                        mol_types, taxids, comparison_func)
                    # ``inverse`` flips the selection.
                    wanted = (not matches) if inverse else matches
                    if wanted:
                        write_records(record, output_file, output_format)
Example #15
0
def import_info(info_path):
    """Load extra per-sequence columns from a tab-separated file.

    The first line is the header; the first column of every following row
    is the sequence ID and the remaining cells are its extra columns.
    Blank rows are skipped.

    Returns:
        ``(info, info_header)`` where ``info`` maps each sequence ID to the
        list of its extra cells and ``info_header`` is the header without
        its first column. ``(None, None)`` when ``info_path`` is ``None``;
        ``({}, [])`` when the file is empty.
    """
    if info_path is None:
        return None, None
    info = {}
    with open_file(info_path) as info_file:
        info_table = csv.reader(info_file, dialect='excel-tab')
        # A default avoids leaking StopIteration on an empty file.
        header = next(info_table, None)
        if header is None:
            return info, []
        info_header = header[1:]
        for row in info_table:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            info[row[0]] = row[1:]
    return info, info_header
Example #16
0
def import_obo(obo_path):
    """Parse the term stanzas of an OBO file.

    Lines are processed one by one, each attached to the most recent
    ``id:`` line, until a ``[Typedef]`` line is met.

    Returns:
        The tuple ``(terms, types, alt, alt_ids, obsolete, parents,
        children)``: ``terms`` and ``types`` map an ID to its ``name:`` and
        ``namespace:`` values; ``obsolete`` maps an ID to a bool; ``alt``
        maps an ID to ``True`` when it was seen as an ``alt_id:`` value
        (``False`` is set for every ``id:`` value); ``alt_ids`` maps an ID
        to the list of its alternative IDs; ``parents`` and ``children``
        map an ID to lists of related IDs.
    """
    def before_comment(text):
        # Keep what precedes the first '!' and trim the whitespace.
        return text.partition('!')[0].strip()

    terms, types, obsolete = {}, {}, {}
    alt, alt_ids = {}, {}
    parents, children = {}, {}

    with open_file(obo_path, 'r') as obo_file:
        for raw_line in obo_file:
            line = raw_line.strip()

            if line == '[Typedef]':
                break

            if line.startswith('id: '):
                go_id = line[4:]
                obsolete[go_id] = False
                alt[go_id] = False
                alt_ids[go_id] = []
                continue

            if line.startswith('name: '):
                terms[go_id] = line[6:]
                continue

            if line.startswith('namespace: '):
                types[go_id] = line[11:]
                continue

            if line == 'is_obsolete: true':
                obsolete[go_id] = True
                continue

            if line.startswith('alt_id: '):
                alias = before_comment(line[8:])
                alt[alias] = True
                updict_append_to_list(alt_ids, go_id, alias)
                continue

            # Relationship lines. The slice keeps the last two characters
            # of the matched prefix as the start of the related ID
            # (presumably the prefixes end with 'GO' -- confirm against
            # PARENT_PREFIXES / CHILD_PREFIXES).
            for prefix in PARENT_PREFIXES:
                if line.startswith(prefix):
                    parent = before_comment(line[len(prefix) - 2:])
                    updict_append_to_list(parents, go_id, parent)
                    updict_append_to_list(children, parent, go_id)

            for prefix in CHILD_PREFIXES:
                if line.startswith(prefix):
                    child = before_comment(line[len(prefix) - 2:])
                    updict_append_to_list(parents, child, go_id)
                    updict_append_to_list(children, go_id, child)

    return terms, types, alt, alt_ids, obsolete, parents, children
Example #17
0
def _parallelize_indexed_sep_part(target_function, input_paths_part,
                                  index_paths_part, output_paths_part,
                                  input_format, output_format, queue,
                                  **kwargs):
    """Apply ``target_function`` to each indexed input of this part.

    Input, index and output paths are grouped positionally with ``zip``.
    An input whose index path is ``None`` is indexed with
    ``Bio.SeqIO.index``; otherwise ``Bio.SeqIO.index_db`` is used with the
    given index path. The indexed records, the opened output file,
    ``output_format`` and ``kwargs`` are handed to ``target_function`` and
    its return value is put on ``queue``.
    """
    grouped_paths = zip(input_paths_part, index_paths_part,
                        output_paths_part)
    for input_path, index_path, output_path in grouped_paths:
        with open_file(output_path, 'w') as output_file:
            # NOTE(review): the index object is never explicitly closed
            # here -- confirm whether that matters for long path lists.
            if index_path is not None:
                indexed = Bio.SeqIO.index_db(index_path, input_path,
                                             input_format)
            else:
                indexed = Bio.SeqIO.index(input_path, input_format)
            queue.put(target_function(indexed, output_file, output_format,
                                      **kwargs))
Example #18
0
def import_tree(tree_path):
    """Load a GO tree table from the tab-separated file ``tree_path``.

    Columns are read by position: ID, term, type, level, obsolete flag,
    alternative flag, main ID; the last column holds the ancestors joined
    with ``', '``. Blank rows and rows whose first cell starts with ``#``
    (the header line) are skipped.

    Returns:
        The tuple ``(terms, types, levels, obsolete, main_ids, ancestors)``
        of dicts keyed by ID. A level that is not an integer is stored as
        ``None``. ``main_ids`` only holds the IDs whose alternative flag is
        ``'True'``. ``ancestors`` maps every ID to a set, empty when the
        ancestors column is empty.
    """
    terms, types, levels = {}, {}, {}
    obsolete, main_ids, ancestors = {}, {}, {}
    with open_file(tree_path) as tree_file:
        for row in csv.reader(tree_file, dialect='excel-tab'):
            # csv.reader yields [] for blank lines; the '#'-prefixed header
            # is not a term and must not be imported as one.
            if not row or row[0].startswith('#'):
                continue
            go_id = row[0]
            terms[go_id] = row[1]
            types[go_id] = row[2]
            try:
                levels[go_id] = int(row[3])
            except ValueError:
                levels[go_id] = None
            obsolete[go_id] = row[4] == 'True'
            if row[5] == 'True':
                main_ids[go_id] = row[6]
            # ''.split(', ') returns [''], which would register an empty
            # string as an ancestor; an empty cell means "no ancestors".
            ancestors_cell = row[-1]
            if ancestors_cell:
                ancestors[go_id] = set(ancestors_cell.split(', '))
            else:
                ancestors[go_id] = set()
    return terms, types, levels, obsolete, main_ids, ancestors
Example #19
0
def export_table_with_seqids(results, seqids, output_path,
                             export_level_one_seqids):
    """Write the results table with a joined sequence-ID column.

    Args:
        results: Mapping ``level -> key -> row``; levels and keys are
            written in sorted order. The first cell of a row is its GO ID.
        seqids: Mapping ``GO ID -> iterable of sequence IDs``.
        output_path: Path of the tab-separated file to write.
        export_level_one_seqids: When false, rows of level 1 or below are
            written without the sequence-ID cell.

    For the other rows the sequence IDs are sorted and joined with ``', '``
    in one extra cell. A GO ID missing from ``seqids`` gets an empty cell
    instead of raising ``KeyError``.
    """
    with open_file(output_path, 'w') as output_file:
        table = csv.writer(output_file, dialect='excel-tab')
        header = [
            'GO ID', 'Term', 'Level', 'Ref. count', 'Ref. perc.',
            'Sample count', 'Sample perc.', 'FC', 'p-value', 'Reg.',
            'Sequence IDs'
        ]
        table.writerow(header)
        for level in sorted(results):
            for key in sorted(results[level]):
                # Copy so the caller's row is not extended in place.
                row = list(results[level][key])
                go_id = row[0]
                if level > 1 or export_level_one_seqids:
                    row.append(', '.join(sorted(seqids.get(go_id, ()))))
                table.writerow(row)
Example #20
0
def extract_records_by_seqid(seqids, input_paths=INPUT_PATHS,
                             input_format=SEQFILE_FORMAT,
                             index_path=INDEX_PATH, output_path=OUTPUT_PATH,
                             output_format=SEQFILE_FORMAT):
    """Write the records named in ``seqids`` to ``output_path``.

    Records are looked up, in the order of ``seqids``, through the indexed
    records built by ``make_indexed_records_list``. Identifiers for which
    the lookup raises ``KeyError`` are collected and reported in a single
    message through ``print_stderr`` once the output file is closed.
    """
    indexed_sources = make_indexed_records_list(
        input_paths, index_path, input_format)
    missing = []
    with open_file(output_path, 'w') as output_file:
        for seqid in seqids:
            try:
                found = get_indexed_record(seqid, indexed_sources)
            except KeyError:
                missing.append(seqid)
                continue
            write_records(found, output_file, output_format)
    if missing:
        report = '{:d} identifier(s) not found: {}'.format(
            len(missing), ', '.join(missing))
        print_stderr(report)
Example #21
0
def export_tex(results,
               output_path,
               name,
               go_type_long,
               header=TEX_HEADER,
               footer=TEX_FOOTER):
    """Write the results as a TeX table to ``output_path``.

    Underscores in ``name`` and ``go_type_long`` are escaped as ``\\_``.
    The preamble interleaves the four ``header`` fragments with the escaped
    type (once) and the escaped name (twice). Each level of ``results``, in
    sorted order, starts with a ``\\midrule`` line followed by one line per
    sorted key produced by ``format_tex_row``. ``footer`` is written last.
    """
    escaped_name = name.replace('_', '\\_')
    escaped_type = go_type_long.replace('_', '\\_')
    with open_file(output_path, 'w') as output_file:
        emit = output_file.write
        emit(''.join((
            header[0], escaped_type, header[1], escaped_name, header[2],
            escaped_name, header[3]
        )))
        for level in sorted(results):
            emit('\\midrule\n')
            rows_by_key = results[level]
            for key in sorted(rows_by_key):
                emit(format_tex_row(rows_by_key[key]) + '\n')
        emit(footer)