def main():
    '''
    Command-line entry point: reads a FASTA file of Trinity transcripts and writes,
    for each component (the 'compN' prefix of the sequence ID), only the longest
    sequence found for that component.

    Output goes to the file named by -o/--output, or to STDOUT if it is omitted.

    Raises:
        Exception: if a sequence ID does not contain a 'compN_' component prefix.
    '''
    parser = argparse.ArgumentParser(
        description='Filters trinity output for longest subcomponents based on naming convention')

    ## input file to be read, and (optionally) the output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created. Default = STDOUT')
    args = parser.parse_args()

    ## raw string so that \d reaches the regex engine instead of being treated as a
    ## (deprecated) string escape; compiled once rather than looked up per sequence
    component_pattern = re.compile(r"(comp\d+)_")

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        ## component ID -> record of the longest sequence seen so far for it
        components = dict()

        for seq_id in seqs:
            m = component_pattern.search(seq_id)

            if not m:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

            component_id = m.group(1)
            record = seqs[seq_id]

            ## strict '>' means that on a tie the first sequence encountered is kept
            if component_id not in components or len(record['s']) > len(components[component_id]['s']):
                ## remember which full sequence ID this record came from
                record['longest_id'] = seq_id
                components[component_id] = record

        for record in components.values():
            seq_wrapped = utils.wrapped_fasta(record['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(record['longest_id'], record['h'], seq_wrapped))
    finally:
        ## close the file we opened, but never close the interpreter's STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    '''
    Command-line entry point: reads a FASTA file of Trinity transcripts and writes,
    for each component (the 'compN' prefix of the sequence ID), only the longest
    sequence found for that component.

    Output goes to the file named by -o/--output, or to STDOUT if it is omitted.

    Raises:
        Exception: if a sequence ID does not contain a 'compN_' component prefix.
    '''
    parser = argparse.ArgumentParser(
        description='Filters trinity output for longest subcomponents based on naming convention')

    ## input file to be read, and (optionally) the output file to be written
    parser.add_argument('-i', '--input', type=str, required=True, help='Path to an input FASTA file')
    parser.add_argument('-o', '--output', type=str, required=False,
                        help='Output file to be created. Default = STDOUT')
    args = parser.parse_args()

    ## raw string so that \d reaches the regex engine instead of being treated as a
    ## (deprecated) string escape; compiled once rather than looked up per sequence
    component_pattern = re.compile(r"(comp\d+)_")

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output is not None:
        fout = open(args.output, 'wt')

    try:
        seqs = utils.fasta_dict_from_file(args.input)

        ## component ID -> record of the longest sequence seen so far for it
        components = dict()

        for seq_id in seqs:
            m = component_pattern.search(seq_id)

            if not m:
                raise Exception("ERROR: This ID wasn't in the expected format of compN_cN_seqN: {0}".format(seq_id))

            component_id = m.group(1)
            record = seqs[seq_id]

            ## strict '>' means that on a tie the first sequence encountered is kept
            if component_id not in components or len(record['s']) > len(components[component_id]['s']):
                ## remember which full sequence ID this record came from
                record['longest_id'] = seq_id
                components[component_id] = record

        for record in components.values():
            seq_wrapped = utils.wrapped_fasta(record['s'], every=60)
            fout.write(">{0} {1}\n{2}\n".format(record['longest_id'], record['h'], seq_wrapped))
    finally:
        ## close the file we opened, but never close the interpreter's STDOUT
        if fout is not sys.stdout:
            fout.close()
def initialize_polypeptides( log_fh, fasta_file ):
    '''
    Reads a FASTA file of (presumably) polypeptide sequences and creates a dict of Polypeptide
    objects, keyed by ID, with annotation.FunctionalAnnotation objects attached.

    Each polypeptide starts out with DEFAULT_PRODUCT_NAME as its product name, and one
    INFO line per sequence is written to log_fh recording that.

    Args:
        log_fh: writable file-like object that receives the INFO log lines.
        fasta_file: path to the FASTA file, passed to utils.fasta_dict_from_file().

    Returns:
        dict mapping each sequence ID to its things.Polypeptide object.
    '''
    seqs = utils.fasta_dict_from_file(fasta_file)

    polypeptides = dict()

    for seq_id in seqs:
        residues = seqs[seq_id]['s']
        polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        # The local must NOT be called 'annotation': assigning to that name anywhere in this
        # function would make it function-local, so the 'annotation' module lookup on the
        # right-hand side would raise UnboundLocalError on the first sequence.
        functional_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))
        polypeptide.annotation = functional_annotation

        polypeptides[seq_id] = polypeptide

    return polypeptides
def initialize_polypeptides(log_fh, fasta_file):
    '''
    Reads a FASTA file of (presumably) polypeptide sequences and creates a dict of Polypeptide
    objects, keyed by ID, with annotation.FunctionalAnnotation objects attached.

    Each polypeptide starts out with DEFAULT_PRODUCT_NAME as its product name, and one
    INFO line per sequence is written to log_fh recording that.

    Args:
        log_fh: writable file-like object that receives the INFO log lines.
        fasta_file: path to the FASTA file, passed to utils.fasta_dict_from_file().

    Returns:
        dict mapping each sequence ID to its things.Polypeptide object.
    '''
    seqs = utils.fasta_dict_from_file(fasta_file)

    polypeptides = dict()

    for seq_id in seqs:
        residues = seqs[seq_id]['s']
        polypeptide = things.Polypeptide(id=seq_id, length=len(residues), residues=residues)

        # The local must NOT be called 'annotation': assigning to that name anywhere in this
        # function would make it function-local, so the 'annotation' module lookup on the
        # right-hand side would raise UnboundLocalError on the first sequence.
        functional_annotation = annotation.FunctionalAnnotation(product_name=DEFAULT_PRODUCT_NAME)
        log_fh.write("INFO: {0}: Set initial product name to '{1}'\n".format(seq_id, DEFAULT_PRODUCT_NAME))
        polypeptide.annotation = functional_annotation

        polypeptides[seq_id] = polypeptide

    return polypeptides