description = """ This will search NCBI for domains in a protein fasta file. """ import sys import time import requests from pythomics.templates import CustomParser parser = CustomParser(description=description) parser.add_fasta() parser.add_out() parser.add_argument( "--db", help="The database to search", default="cdd", choices=["cdd", "pfam", "smart", "tigrfam", "cog", "kog"] ) def main(): args = parser.parse_args() files = {"queries": args.fasta} nbci_url = "http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?" response = requests.post( "{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true".format( nbci=nbci_url, database=args.db ), files=files, ) if response.status_code != 200: sys.stderr.write("Error interfacing with NCBI: {}".format(response.text))
description = """ This script will accept a given nucleotide fasta file and output found ORFs. ORFs are annotated by which stop codon they are a part of. As in, ORF 3 is annotated as the 3rd sequence if the translated sequence is divided by stop codons. This is prevent ambiguity with differing minimum lengths of ORFs. """ from pythomics.templates import CustomParser import sys, argparse import pythomics.parsers.fasta as fasta parser = CustomParser(description = description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_fasta() parser.add_out() parser.add_argument('--min', help="Minimum ORF length in amino acids.", type=int, default=50) parser.add_argument('--both-strands', help="Search both strands for ORFs.", action='store_true') parser.add_argument('--no-met-start', help="Output ORFs starting with amino acids other than MET", action='store_true') parser.add_argument('--from-met', help="Truncate leading amino acids up to MET", action='store_true') parser.add_argument('--from-met-keep', help="Truncate leading amino acids up to MET, but keep the untruncated version as well.", action='store_true') def main(): args = parser.parse_args() file_name = args.fasta orf_min = args.min fasta_file = fasta.FastaIterator(file_name) negative_strand = args.both_strands no_met = args.no_met_start from_met = args.from_met from_met_keep = args.from_met_keep
default=['Heavy/Light']) mods = parser.add_argument_group('Modification File') mods.add_argument( '--mods', help= "The modifications file (the file with sites, peptides). For multiple files, separate by spaces (must be in same order as inference).", nargs='+', type=argparse.FileType('r'), required=True) mods.add_argument('--site-protein', help="The mod file protein column name", type=str, default='Protein') parser.add_argument('--no-log2', help='Do not log2 normalize quantification values.', action='store_true') parser.add_argument( '--no-median', help= 'Do not normalize quantification values by the median of the experiment.', action='store_true') parser.add_argument( '--wp', help= "The whole proteome inference file, if it exists. For multiple replicates, separate by spaces.", nargs='+', type=argparse.FileType('r')) parser.add_argument('--non-mod-norm', help='Normalize the data by the non-modified peptides.', action='store_true')
import sys, copy, re import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta import pythomics.proteomics.config as config from pythomics.templates import CustomParser parser = CustomParser(description=description) parser.add_fasta() parser.add_out() parser.add_enzyme( help= "Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes. " "The order of enzymes will be the order of digestion if digesting in series." ) parser.add_argument( '--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true') def main(): args = parser.parse_args() digest_min = args.min digest_max = args.max enzymes = args.enzyme peptides_found = {} retained = {} total = 0 proteinMap = {} coverageMap = {} aas = config.RESIDUE_MASSES.keys() aas.sort()
description = """ This will search NCBI for domains in a protein fasta file. """ import sys import time import requests from pythomics.templates import CustomParser parser = CustomParser(description=description) parser.add_fasta() parser.add_out() parser.add_argument('--db', help='The database to search', default='cdd', choices=['cdd', 'pfam', 'smart', 'tigrfam', 'cog', 'kog']) def main(): args = parser.parse_args() files = {'queries': args.fasta} nbci_url = 'http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?' response = requests.post( '{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true' .format(nbci=nbci_url, database=args.db), files=files) if response.status_code != 200: sys.stderr.write('Error interfacing with NCBI: {}'.format( response.text)) return 1
description = """ This script will digest a given fasta file with the specified enzymes. Both protein and nucleotide fasta files are valid inputs, and when digesting fasta files, it is possible to create 6 frame as well as 3 frame translations. """ import argparse, sys, itertools from pythomics.templates import CustomParser import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta parser = CustomParser(description = description) parser.add_fasta() parser.add_argument('-t', '--type', help="The type of fasta file (default protein).", choices=['prot','nt'], type=str, default='prot') parser.add_argument('--frame', help="If using a nucleotide file, translate in how many frames?", choices=[1,3,6], type=int) parser.add_argument('--genome', help="Are we translating a genome? This will keep chromosome positions in the header.", action='store_true', default=False) parser.add_out() parser.add_enzyme() parser.add_argument('--unique', help="Only return unique peptides per cleavage", action='store_true', default=False) def main(): args = parser.parse_args() file_name = args.fasta enzyme_choice = args.enzyme digest_type = args.type digest_frame = args.frame digest_negative = False if digest_frame == 6: digest_negative = True
# Bind the name `re` to the re2 bindings when they are importable and to the
# standard-library module otherwise.
try:
    import re2 as re
except ImportError:
    import re
from multiprocessing import Pool, Value
from collections import Counter
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

# Command-line interface. Options are registered in exactly this order, which
# is also the order argparse lists them in within each group.
parser = CustomParser(description=description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_out(
    help="The name of the file you wish to create with results appended.")

# Optional side outputs. The default is the *path* os.devnull, not an open
# file: argparse passes string defaults through `type`, so leaving the option
# off still produces a writable file object.
for _flag, _text in (
        ('--peptide-out', "The file to write digested products to."),
        ('--protein-out', "The file to write grouped products to."),
):
    parser.add_argument(
        _flag,
        nargs='?',
        help=_text,
        type=argparse.FileType('w'),
        default=os.devnull)
del _flag, _text

parser.add_argument(
    '--strict',
    action='store_true',
    help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).')
parser.add_delimited_file(col_default='Peptide', cols=['--peptide-col'])
parser.add_argument(
    '-r', '--regex',
    type=str,
    help="A perl regular expression determining which parts of the header to capture.")
parser.add_argument(
    '--inferred-name',
    type=str,
    default='Proteins',
    help="The name you want to assign for protein inference (in case you are regexing for gene names or something).")

# Two plain opt-out switches; both are False unless given on the command line.
for _flag, _text in (
        ('--no-inference', "Do not append proteins inferred from sequences."),
        ('--no-equality', "Do not consider Leucine and Isoleucine equal for peptide mapping."),
):
    parser.add_argument(_flag, action='store_true', help=_text)
del _flag, _text

# iBAQ options live in their own help section. The add_column_function call
# is handed `group=ibaq_group`, so it sits between the group's own arguments
# -- presumably registering `--ibaq-function` inside that group; confirm
# against CustomParser.
ibaq_group = parser.add_argument_group('iBAQ related options')
ibaq_group.add_argument(
    '--ibaq',
    action='store_true',
    help="Provide to append iBAQ values as well (requires protein inference).")
ibaq_group.add_argument(
    '--precursors',
    type=str,
    help="The column with precursor area (defaults to header lines containing 'Precursor').")
parser.add_column_function(
    '',
    parent=False,
    group=ibaq_group,
    col_argument='--ibaq-function',
    col_help="The function to apply to groups of iBAQ values (for multiple peptide matches).")
ibaq_group.add_argument(
    '--non-redundant',
    action='store_true',
    help="Use only non-redundant theoretical tryptic peptides for the iBAQ denominator.")
parser.add_enzyme(help="The enzyme used to digest the sample.")
ibaq_group.add_argument(
    '--normalize',
    action='store_true',
    help="Normalize iBAQ to total intensity of column (useful for comparing multiple samples).")

protein_group = parser.add_argument_group('Protein Grouping Options')
and summarize how much of the proteome is covered, what residues are missed, and what isoforms can be uniquely identified. """ import sys, copy, re import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta import pythomics.proteomics.config as config from pythomics.templates import CustomParser parser = CustomParser(description = description) parser.add_fasta() parser.add_out() parser.add_enzyme(help="Enzyme to use. Pass a command separated list (no spaces); " "the order of enzymes will be the order of digestion if digesting in series.") parser.add_argument('--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true', default=False) parser.add_argument('--series', help="Should cleavages be done in series? (default)", action='store_true', default=True) def main(): args = parser.parse_args() digest_min = args.min digest_max = args.max enzymes = args.enzyme.split(',') peptides_found = {} retained = {} total = 0 proteinMap = {} coverageMap = {} aas = config.RESIDUE_MASSES.keys() aas.sort()
and summarize how much of the proteome is covered, what residues are missed, and what isoforms can be uniquely identified. """ import sys, copy, re import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta import pythomics.proteomics.config as config from pythomics.templates import CustomParser parser = CustomParser(description = description) parser.add_fasta() parser.add_out() parser.add_enzyme(help="Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes. " "The order of enzymes will be the order of digestion if digesting in series.") parser.add_argument('--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true') def main(): args = parser.parse_args() digest_min = args.min digest_max = args.max enzymes = args.enzyme peptides_found = {} retained = {} total = 0 proteinMap = {} coverageMap = {} aas = config.RESIDUE_MASSES.keys() aas.sort() tlen = 0
description = """ This script will annotate a tab delimited text file with peptides with corresponding proteins present in an annotation file, and can also use this annotation to include iBAQ measures. """ import argparse, sys, re, csv, copy, decimal from pythomics.templates import CustomParser import pythomics.proteomics.config as config import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta parser = CustomParser(description = description) parser.add_fasta(help="The fasta file to match peptides against.") parser.add_argument('--peptide_out', nargs='?', help="The file to write digested products to.", type=argparse.FileType('w'), default=sys.stdout) parser.add_argument('--protein_out', nargs='?', help="The file to write grouped products to.", type=argparse.FileType('w'), default=sys.stdout) parser.add_delimited_file() parser.add_argument('-r', '--regex', help="A perl regular expression determining which parts of the header to capture.", type=str) parser.add_argument('--no-inference', help="Do not append proteins inferred from sequences.", action='store_false', default=False) group = parser.add_argument_group('iBAQ related options') group.add_argument('--ibaq', help="Provide to append iBAQ values as well (requires protein inference).", action='store_true', default=False) group.add_argument('--precursors', help="The column with precursor area (defaults to header lines containing 'Precursor').", type=int, default=None) parser.add_enzyme() group.add_argument('--no-normalize', help="Don't normalize iBAQ to total intensity", action='store_false', default=True) group.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true', default=False) protein_group = parser.add_argument_group('Protein Grouping Options') protein_group.add_argument('--unique-only', help="Only group proteins with unique peptides", action='store_true', default=False) protein_group.add_argument('--position', 
help="Write the position of the peptide matches.", action='store_true', default=False) def main():
description = """ This script will accept a given nucleotide fasta file and output found ORFs. ORFs are annotated by which stop codon they are a part of. As in, ORF 3 is annotated as the 3rd sequence if the translated sequence is divided by stop codons. This is prevent ambiguity with differing minimum lengths of ORFs. """ from pythomics.templates import CustomParser import sys, argparse import pythomics.parsers.fasta as fasta parser = CustomParser(description = description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_fasta() parser.add_out() parser.add_argument('--min', help="Minimum ORF length in amino acids.", type=int, default=50) parser.add_argument('--both-strands', help="Search both strands for ORFs.", action='store_true', default=False) def main(): args = parser.parse_args() file_name = args.fasta orf_min = args.min fasta_file = fasta.FastaIterator(file_name) negative_strand = args.both_strands with args.out as o: for header, sequence in fasta_file: for i in xrange(3): strand='+' translation = fasta._translate(sequence[i:]) translation = translation.split('*') for protein_index,protein_sequence in enumerate(translation):
__author__ = 'chris' description = """ This script will lookup features from one delimited file in another delimited file, and perform various operations on the found entries in the alternative file """ import sys, csv from pythomics.templates import CustomParser from pythomics.utils import ColumnFunctions parser = CustomParser(description = description) parser.add_delimited_file(files=['-a'], delimiter=['--adelim'], cols=['--acol'], header=['--aheader'], help="This is the file to lookup values from.") parser.add_delimited_file(files=['-b'], delimiter=['--bdelim'], cols=['--bcol'], header=['--bheader'], help="This is the file to lookup values in.") parser.add_argument('--blookup', help='The column to take entries from in file b.', type=str, default=1) parser.add_argument('--strict', help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true') parser.add_out() parser.add_argument('--function', help='The function to apply to found entries.', choices=['concat', 'mean', 'sum', 'median', 'var', 'std', 'count'], type=str, default='concat') parser.add_argument('--colname', help='The column name to give the new appended value. Defaults to function chosen', type=str, default='') parser.add_argument('--aregex', help='An optional regex pattern for matching columns in file a.', type=str, default='') parser.add_argument('--bregex', help='An optional regex pattern for matching columns in file b.', type=str, default='') def main(): args = parser.parse_args() a_colname, b_colname, bl_colname = False, False, False try: a_column = int(args.acol) a_column = a_column-1 if a_column > 0 else a_column except ValueError: a_colname = True
found ORFs. ORFs are annotated by which stop codon they are a part of. As in, ORF 3 is annotated as the 3rd sequence if the translated sequence is divided by stop codons. This is prevent ambiguity with differing minimum lengths of ORFs. """ from pythomics.templates import CustomParser import sys, argparse import pythomics.parsers.fasta as fasta parser = CustomParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_fasta() parser.add_out() parser.add_argument('--min', help="Minimum ORF length in amino acids.", type=int, default=50) parser.add_argument('--both-strands', help="Search both strands for ORFs.", action='store_true') parser.add_argument( '--no-met-start', help="Output ORFs starting with amino acids other than MET", action='store_true') parser.add_argument('--from-met', help="Truncate leading amino acids up to MET", action='store_true') parser.add_argument( '--from-met-keep', help= "Truncate leading amino acids up to MET, but keep the untruncated version as well.",
This script will digest a given fasta file with the specified enzymes. Both protein and nucleotide fasta files are valid inputs, and when digesting fasta files, it is possible to create 6 frame as well as 3 frame translations. """ import argparse, sys, itertools from pythomics.templates import CustomParser import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta parser = CustomParser(description=description) parser.add_fasta() parser.add_argument('-t', '--type', help="The type of fasta file (default protein).", choices=['prot', 'nt'], type=str, default='prot') parser.add_argument( '--frame', help="If using a nucleotide file, translate in how many frames?", choices=[1, 3, 6], type=int) parser.add_argument( '--genome', help= "Are we translating a genome? This will keep chromosome positions in the header.", action='store_true') parser.add_out() parser.add_enzyme() parser.add_argument('--unique',
Also, features can be can be grouped into longer sequences with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged into LNGERPEPTIDE). """ import argparse, sys, re, csv, copy, decimal from pythomics.templates import CustomParser import pythomics.proteomics.config as config import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta from pythomics.utils import ColumnFunctions parser = CustomParser(description = description) parser.add_delimited_file(cols=['--group-on']) parser.add_out() parser.add_argument('--substring', help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)', action='store_true') parser.add_column_function('--summary-col', col_help="The function to apply to grouped entries in modification columns.") parser.add_argument('--summary-col-delimiter', help="If the summary column has a delimiter, such as a ; for multiple proteins.") parser.add_argument('--strict', help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true') parser.add_argument('--merge', help='Merge together identical entries.', action='store_true') # parser.add_argument('--merge-columns', help="If set, columns of merged peptides will be combined.", action='store_true') # parser.add_argument('--merge-delimiter', help='The delimiter for column merges.', type=str, default=';') parser.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true') def main(): args = parser.parse_args() peptide_colname = False try: peptide_column = int(args.group_on) peptide_column = peptide_column-1 if peptide_column > 0 else peptide_column except ValueError:
This script will take a delimited file and collapse features together, such as scan numbers. It can also be used to group peptides into longer sequences with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged into LNGERPEPTIDE). """ import argparse, sys, re, csv, copy, decimal from pythomics.templates import CustomParser import pythomics.proteomics.config as config import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta parser = CustomParser(description = description) parser.add_delimited_file() parser.add_out() parser.add_argument('--substring', help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)', action='store_true', default=False) parser.add_argument('--merge-columns', help="If set, columns of merged peptides will be combined.", action='store_true', default=False) parser.add_argument('--merge-delimiter', help='The delimiter for column merges.', type=str, default=';') parser.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true', default=False) def main(): args = parser.parse_args() peptide_column = args.col-1 tsv_file = args.tsv header_lines = args.header delimiter = args.delimiter peptide_join = args.substring col_delimiter = args.merge_delimiter merge_columns = args.merge_columns case_sens = args.case_sensitive peptide_history = {}
from pythomics.templates import CustomParser from pythomics.utils import ColumnFunctions parser = CustomParser(description=description) parser.add_delimited_file(files=['-a'], delimiter=['--adelim'], cols=['--acol'], header=['--aheader'], help="This is the file to lookup values from.") parser.add_delimited_file(files=['-b'], delimiter=['--bdelim'], cols=['--bcol'], header=['--bheader'], help="This is the file to lookup values in.") parser.add_argument('--blookup', help='The column to take entries from in file b.', type=str, default=1) parser.add_argument( '--strict', help= 'For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true') parser.add_out() parser.add_argument( '--function', help='The function to apply to found entries.', choices=['concat', 'mean', 'sum', 'median', 'var', 'std', 'count'], type=str, default='concat') parser.add_argument( '--colname',
import re from multiprocessing import Pool, Value from collections import Counter from pythomics.templates import CustomParser import pythomics.proteomics.config as config import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta from pythomics.utils import ColumnFunctions parser = CustomParser(description=description) parser.add_fasta(help="The fasta file to match peptides against.") parser.add_out( help="The name of the file you wish to create with results appended.") parser.add_argument('--peptide-out', nargs='?', help="The file to write digested products to.", type=argparse.FileType('w'), default=os.devnull) parser.add_argument('--protein-out', nargs='?', help="The file to write grouped products to.", type=argparse.FileType('w'), default=os.devnull) parser.add_argument( '--strict', help= 'For numeric operations, fail if types are incorrect (converting NA to a float for instance).', action='store_true') parser.add_delimited_file(cols=['--peptide-col'], col_default='Peptide') parser.add_argument( '-r',
into LNGERPEPTIDE). """ import argparse, sys, re, csv, copy, decimal from pythomics.templates import CustomParser import pythomics.proteomics.config as config import pythomics.proteomics.digest as digest import pythomics.parsers.fasta as fasta from pythomics.utils import ColumnFunctions parser = CustomParser(description=description) parser.add_delimited_file(cols=['--group-on']) parser.add_out() parser.add_argument( '--substring', help= 'If set, merge features by partial matches (such as collapsing peptides into larger peptides)', action='store_true') parser.add_column_function( '--summary-col', col_help="The function to apply to grouped entries in modification columns." ) parser.add_argument( '--summary-col-delimiter', help= "If the summary column has a delimiter, such as a ; for multiple proteins." ) parser.add_argument( '--strict', help= 'For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
import argparse from pythomics.templates import CustomParser parser = CustomParser(description=description) group = parser.add_argument_group('Protein Inference File') group.add_argument('--inference', help="The protein inference file (your peptide file with gene/protein annotations). For multiple files, separate by spaces (must be in same order as mods).", nargs='+', type=argparse.FileType('r'), required=True) group.add_argument('--gene', help="The Gene column name", type=str, default='Gene') group.add_argument('--protein', help="The Protein column name", type=str, default='Protein') group.add_argument('--peptide', help="The Peptide column name", type=str, default='Peptide') group.add_argument('--quant', help="The name of quantification columns (such as Heavy/Light). Separate multiple columns by spaces", nargs='+', default=['Heavy/Light']) mods = parser.add_argument_group('Modification File') mods.add_argument('--mods', help="The modifications file (the file with sites, peptides). For multiple files, separate by spaces (must be in same order as inference).", nargs='+', type=argparse.FileType('r'), required=True) mods.add_argument('--site-protein', help="The mod file protein column name", type=str, default='Protein') parser.add_argument('--no-log2', help='Do not log2 normalize quantification values.', action='store_true') parser.add_argument('--no-median', help='Do not normalize quantification values by the median of the experiment.', action='store_true') parser.add_argument('--wp', help="The whole proteome inference file, if it exists. 
For multiple replicates, separate by spaces.", nargs='+', type=argparse.FileType('r')) parser.add_argument('--non-mod-norm', help='Normalize the data by the non-modified peptides.', action='store_true') parser.add_argument('--site-file', help='The output path for the file with sumamries at the site level.', default=sys.stdout, type=argparse.FileType('wb')) parser.add_argument('--peptide-file', help='The output path for the file with sumamries at the site and peptide level.', default=sys.stdout, type=argparse.FileType('wb')) def main(): args = parser.parse_args() inference_files = args.inference mod_files = args.mods wp_files = args.wp if args.wp else [] quant_cols = args.quant gene_col = args.gene
This script will trim N's from the ends of a fasta/fastq file so it can be aligned by tophat (which pukes if there are >5 N's. We remove them from the read ends only) """ import sys, re, os, gzip from itertools import izip from multiprocessing import Pool from pythomics.templates import CustomParser parser = CustomParser(description=description) parser.add_fasta() parser.add_read_pair() parser.add_out() parser.add_argument( '--min-len', help="The minimum read length reads must be after trimming.", type=int, default=25) parser.add_argument('--prefix', help="If using paired reads, this is the filename prefix.", type=str) parser.add_argument('--quality', help='If provided, remove qualities below a given score.', type=int, default=0) parser.add_argument('--chunk', help='How many reads to submit to each core.', type=int, default=1000) parser.add_argument('--no-gzip', help='To disable compression with gzip.',
description = """ This script will trim N's from the ends of a fasta/fastq file so it can be aligned by tophat (which pukes if there are >5 N's. We remove them from the read ends only) """

import sys
import re
import os
import gzip
try:
    from itertools import izip
except ImportError:
    # Python 3 has no itertools.izip; the builtin zip is already lazy there.
    izip = zip
from multiprocessing import Pool
from pythomics.templates import CustomParser

# Command-line interface for the read-trimming script.
parser = CustomParser(description = description)
parser.add_fasta()
parser.add_read_pair()
parser.add_out()
parser.add_argument('--min-len', help="The minimum read length reads must be after trimming.", type=int, default=25)
parser.add_argument('--prefix', help="If using paired reads, this is the filename prefix.", type=str)
parser.add_argument('--quality', help='If provided, remove qualities below a given score.', type=int, default=0)
parser.add_argument('--chunk', help='How many reads to submit to each core.', type=int, default=1000)
# store_false: args.no_gzip is True by default and becomes False when the flag
# is passed, so the attribute effectively means "use gzip".
parser.add_argument('--no-gzip', help='To disable compression with gzip.', action='store_false')
# parser.add_argument('--5partial-match', help='This will trim partial matches at the 3\' end of the sequence if there is a match of at least x nucleotides.', type=int, default=0)
# parser.add_argument('--seed-length', help='The seed length for a match.', type=int, default=0)
# parser.add_argument('--mismatches', help='The number of possible mismatches in a sequence.', type=int, default=3)

# Patterns matching a run of N at the very start / very end of a read.
start_trim = re.compile(r'^N+')
end_trim = re.compile(r'N+$')

# NOTE(review): `global` at module scope has no effect -- these names are
# already module-level. Presumably they are assigned from functions further
# down the file; confirm before removing.
global quality_min
global quality_offset
global paired
global read_min
quality_min = 0