def main():
    '''
    Command-line entry point.  Reads the FASTA file given by -i/--input, groups the
    sequences by the component portion of their ID (the first 'comp<digits>' that is
    followed by an underscore) and writes the longest sequence of each component to
    -o/--output, or to STDOUT if no output path is given.

    When two sequences of a component have the same length, the one encountered first
    is kept.

    Raises Exception if any sequence ID does not contain 'comp<digits>_'.
    '''
    parser = argparse.ArgumentParser( description='Filters trinity output for longest subcomponents based on naming convention')

    ## input file to be read and (optional) output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-o', '--output', type=str, required=False, help='Output file to be created.  Default = STDOUT' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        ## each entry is read below via its 's' (sequence) and 'h' (header text) keys
        seqs = utils.fasta_dict_from_file(args.input)

        ## component ID -> the longest sequence entry seen so far for that component
        components = dict()

        for seq_id in seqs:
            ## raw string so that \d reaches the regex engine instead of being
            ##  treated as an (invalid) string escape
            m = re.search(r"(comp\d+)_", seq_id)
            if m:
                component_id = m.group(1)

                ## strictly greater-than, so the first sequence seen wins a length tie
                if component_id not in components or len(seqs[seq_id]['s']) > len(components[component_id]['s']):
                    components[component_id] = seqs[seq_id]
                    ## remember which full ID this sequence came from for the output header
                    components[component_id]['longest_id'] = seq_id
            else:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

        for c_id in components:
            ## presumably inserts a line break every 60 residues -- see utils.wrapped_fasta
            seq_wrapped = utils.wrapped_fasta(components[c_id]['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(components[c_id]['longest_id'], components[c_id]['h'], seq_wrapped))
    finally:
        ## close only a handle opened here; STDOUT must stay open for the caller
        if fout is not sys.stdout:
            fout.close()
Example #2
0
def main():
    '''
    Command-line entry point.  Reads the FASTA file given by -i/--input, groups the
    sequences by the component portion of their ID (the first 'comp<digits>' that is
    followed by an underscore) and writes the longest sequence of each component to
    -o/--output, or to STDOUT if no output path is given.

    When two sequences of a component have the same length, the one encountered first
    is kept.

    Raises Exception if any sequence ID does not contain 'comp<digits>_'.
    '''
    parser = argparse.ArgumentParser(
        description=
        'Filters trinity output for longest subcomponents based on naming convention'
    )

    ## input file to be read and (optional) output file to be written
    parser.add_argument('-i',
                        '--input',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        required=False,
                        help='Output file to be created.  Default = STDOUT')
    args = parser.parse_args()

    ## raw string so that \d reaches the regex engine instead of being treated
    ##  as an (invalid) string escape; compiled once, outside the loop
    component_pattern = re.compile(r"(comp\d+)_")

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        ## each entry is read below via its 's' (sequence) and 'h' (header text) keys
        seqs = utils.fasta_dict_from_file(args.input)

        ## component ID -> the longest sequence entry seen so far for that component
        components = dict()

        for seq_id in seqs:
            m = component_pattern.search(seq_id)
            if m:
                component_id = m.group(1)

                ## strictly greater-than, so the first sequence seen wins a length tie
                if component_id not in components or len(
                        seqs[seq_id]['s']) > len(
                            components[component_id]['s']):
                    components[component_id] = seqs[seq_id]
                    ## remember which full ID this sequence came from for the output header
                    components[component_id]['longest_id'] = seq_id
            else:
                raise Exception(
                    "ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}"
                    .format(seq_id))

        for c_id in components:
            ## presumably inserts a line break every 60 residues -- see utils.wrapped_fasta
            seq_wrapped = utils.wrapped_fasta(components[c_id]['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(
                components[c_id]['longest_id'], components[c_id]['h'],
                seq_wrapped))
    finally:
        ## close only a handle opened here; STDOUT must stay open for the caller
        if fout is not sys.stdout:
            fout.close()
def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Reads a FASTA file of (presumably) polypeptide sequences and creates a dict of
    things.Polypeptide objects, keyed by sequence ID.  Each one gets an
    annotation.FunctionalAnnotation attached whose product name is DEFAULT_PRODUCT_NAME.

    log_fh:     writable handle; one INFO line is written to it per sequence
    fasta_file: path passed to utils.fasta_dict_from_file()

    Returns the dict of Polypeptide objects (empty if the file yields no sequences).
    '''
    seqs = utils.fasta_dict_from_file(fasta_file)

    polypeptides = dict()

    for seq_id in seqs:
        residues = seqs[seq_id]['s']
        polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        ## The local must NOT be called 'annotation': assigning to that name makes it
        ##  local to the whole function, so annotation.FunctionalAnnotation would raise
        ##  UnboundLocalError instead of reaching the annotation module.
        functional_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))
        polypeptide.annotation = functional_annotation

        polypeptides[seq_id] = polypeptide

    return polypeptides
def initialize_polypeptides(log_fh, fasta_file):
    '''
    Reads a FASTA file of (presumably) polypeptide sequences and creates a dict of
    things.Polypeptide objects, keyed by sequence ID.  Each one gets an
    annotation.FunctionalAnnotation attached whose product name is DEFAULT_PRODUCT_NAME.

    log_fh:     writable handle; one INFO line is written to it per sequence
    fasta_file: path passed to utils.fasta_dict_from_file()

    Returns the dict of Polypeptide objects (empty if the file yields no sequences).
    '''
    seqs = utils.fasta_dict_from_file(fasta_file)

    polypeptides = dict()

    for seq_id in seqs:
        residues = seqs[seq_id]['s']
        polypeptide = things.Polypeptide(id=seq_id,
                                         length=len(residues),
                                         residues=residues)

        ## The local must NOT be called 'annotation': assigning to that name makes it
        ##  local to the whole function, so annotation.FunctionalAnnotation would raise
        ##  UnboundLocalError instead of reaching the annotation module.
        functional_annotation = annotation.FunctionalAnnotation(
            product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(
            seq_id, DEFAULT_PRODUCT_NAME))
        polypeptide.annotation = functional_annotation

        polypeptides[seq_id] = polypeptide

    return polypeptides