Ejemplo n.º 1
0
#!/usr/bin/env python

description = """
This script will digest a given protein fasta file with the specified enzymes
and summarize how much of the proteome is covered, what residues are missed,
and what isoforms can be uniquely identified.
"""

import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

# Command-line interface. add_fasta/add_out/add_enzyme are helper methods of
# the CustomParser imported from pythomics.templates.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_enzyme(help=("Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes.  "
                        "The order of enzymes will be the order of digestion if digesting in series."))
# store_true: args.parallel is False unless --parallel is given.
parser.add_argument("--parallel",
                    action="store_true",
                    help="Should cleavages be done in parallel (default is serial digestion)?")


def main():
    args = parser.parse_args()
    digest_min = args.min
Ejemplo n.º 2
0
#!/usr/bin/env python
from __future__ import division, absolute_import

__author__ = 'chris'

description = """
This will search NCBI for domains in a protein fasta file.
"""

import sys
import time
import requests

from pythomics.templates import CustomParser

# Command-line interface; --db only accepts the listed names and falls back
# to "cdd" when omitted.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_argument(
    "--db",
    choices=["cdd", "pfam", "smart", "tigrfam", "cog", "kog"],
    default="cdd",
    help="The database to search",
)


def main():
    args = parser.parse_args()
    files = {'queries': args.fasta}
    nbci_url = 'http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?'
    response = requests.post(
        '{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true'
        .format(nbci=nbci_url, database=args.db),
Ejemplo n.º 3
0
"""

import argparse, sys, csv, copy, decimal, itertools, os, operator
try:
    import re2 as re
except ImportError:
    import re
from multiprocessing import Pool, Value
from collections import Counter
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_out(help="The name of the file you wish to create with results appended.")
# Both optional output files default to os.devnull.
parser.add_argument(
    '--peptide-out',
    nargs='?',
    type=argparse.FileType('w'),
    default=os.devnull,
    help="The file to write digested products to.",
)
parser.add_argument(
    '--protein-out',
    nargs='?',
    type=argparse.FileType('w'),
    default=os.devnull,
    help="The file to write grouped products to.",
)
parser.add_argument(
    '--strict',
    action='store_true',
    help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
)
parser.add_delimited_file(cols=['--peptide-col'], col_default='Peptide')
parser.add_argument(
    '-r', '--regex',
    type=str,
    help="A perl regular expression determining which parts of the header to capture.",
)
parser.add_argument(
    '--inferred-name',
    type=str,
    default='Proteins',
    help="The name you want to assign for protein inference (in case you are regexing for gene names or something).",
)
parser.add_argument(
    '--no-inference',
    action='store_true',
    help="Do not append proteins inferred from sequences.",
)
parser.add_argument(
    '--no-equality',
    action='store_true',
    help="Do not consider Leucine and Isoleucine equal for peptide mapping.",
)

# iBAQ options get their own heading in the help output.
ibaq_group = parser.add_argument_group('iBAQ related options')
ibaq_group.add_argument(
    '--ibaq',
    action='store_true',
    help="Provide to append iBAQ values as well (requires protein inference).",
)
ibaq_group.add_argument(
    '--precursors',
    type=str,
    help="The column with precursor area (defaults to header lines containing 'Precursor').",
)
parser.add_column_function(
    '',
    col_argument='--ibaq-function',
    group=ibaq_group,
    col_help="The function to apply to groups of iBAQ values (for multiple peptide matches).",
    parent=False,
)
ibaq_group.add_argument(
    '--non-redundant',
    action='store_true',
    help="Use only non-redundant theoretical tryptic peptides for the iBAQ denominator.",
)
Ejemplo n.º 4
0
#!/usr/bin/env python

__author__ = 'Chris Mitchell'

import sys
from pythomics.templates import CustomParser
import pythomics.parsers.fasta as fasta
import pythomics.genomics.parsers as gp

description = """
This script will incorporate the variants in a given VCF file into a specified
fasta file.
"""

# Command-line interface. add_fasta/add_out/add_vcf are helper methods of the
# CustomParser imported from pythomics.templates; main() reads the parsed
# values from the namespace returned by parser.parse_args().
parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to incorporate changes into.")
parser.add_out(help="The file to write resulting fasta file to.")
parser.add_vcf()

def main():
    args = parser.parse_args()
    file_name = args.file
    vcf = args.vcf
    snps = not args.no_snps
    dels = args.dels
    ins = args.ins
    homs = not args.no_homozygous
    hets = args.heterozygous
    individual = args.individual-1
    fasta_file = fasta.FastaIterator(file_name)
    vcf_file = gp.VCFIterator( vcf )
Ejemplo n.º 5
0
creating summary statistics for them.

For instance, gene ids can be selected and their FPKM/iBAQ values combined.
Also, features can be grouped into longer sequences
with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

# Command-line interface.
parser = CustomParser(description=description)
parser.add_delimited_file(cols=['--group-on'])
parser.add_out()
parser.add_argument(
    '--substring',
    action='store_true',
    help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)',
)
parser.add_column_function(
    '--summary-col',
    col_help="The function to apply to grouped entries in modification columns.",
)
parser.add_argument(
    '--summary-col-delimiter',
    help="If the summary column has a delimiter, such as a ; for multiple proteins.",
)
parser.add_argument(
    '--strict',
    action='store_true',
    help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
)
parser.add_argument(
    '--merge',
    action='store_true',
    help='Merge together identical entries.',
)
# Currently disabled options:
# parser.add_argument('--merge-columns', help="If set, columns of merged peptides will be combined.", action='store_true')
# parser.add_argument('--merge-delimiter', help='The delimiter for column merges.', type=str, default=';')
parser.add_argument(
    '--case-sensitive',
    action='store_true',
    help="Treat peptides as case-sensitive (ie separate modified peptides)",
)

def main():
    args = parser.parse_args()
    peptide_colname = False
    try:
Ejemplo n.º 6
0
#!/usr/bin/env python

description = """
This script will digest a given fasta file with the specified enzymes. 
Both protein and nucleotide fasta files are valid inputs, and when
digesting fasta files, it is possible to create 6 frame as well as 
3 frame translations.
"""

import argparse, sys, itertools
from pythomics.templates import CustomParser
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_argument(
    '-t', '--type',
    type=str,
    choices=['prot', 'nt'],
    default='prot',
    help="The type of fasta file (default protein).",
)
# No default is given for --frame, so args.frame is None unless it is supplied.
parser.add_argument(
    '--frame',
    type=int,
    choices=[1, 3, 6],
    help="If using a nucleotide file, translate in how many frames?",
)
parser.add_argument(
    '--genome',
    action='store_true',
    default=False,
    help="Are we translating a genome? This will keep chromosome positions in the header.",
)
parser.add_out()
parser.add_enzyme()
parser.add_argument(
    '--unique',
    action='store_true',
    default=False,
    help="Only return unique peptides per cleavage",
)

def main():
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
Ejemplo n.º 7
0
#!/usr/bin/env python

__author__ = 'Chris Mitchell'

import sys
from pythomics.templates import CustomParser
import pythomics.parsers.fasta as fasta
import pythomics.genomics.parsers as gp

description = """
This script will incorporate the variants in a given VCF file into a specified
fasta file.
"""

# Command-line interface. add_fasta/add_out/add_vcf are helper methods of the
# CustomParser imported from pythomics.templates; main() reads the parsed
# values from the namespace returned by parser.parse_args().
parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to incorporate changes into.")
parser.add_out(help="The file to write resulting fasta file to.")
parser.add_vcf()

def main():
    args = parser.parse_args()
    file_name = args.fasta
    vcf = args.vcf
    snps = args.no_snps
    dels = args.dels
    ins = args.ins
    homs = args.no_homozygous
    hets = args.heterozygous
    individual = args.individual-1
    fasta_file = fasta.FastaIterator(file_name)
    vcf_file = gp.VCFIterator( vcf )
Ejemplo n.º 8
0
#!/usr/bin/env python
from __future__ import division, absolute_import

__author__ = "chris"

description = """
This will search NCBI for domains in a protein fasta file.
"""

import sys
import time
import requests

from pythomics.templates import CustomParser

# Command-line interface; --db only accepts the listed names and falls back
# to "cdd" when omitted.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_argument('--db',
                    choices=['cdd', 'pfam', 'smart', 'tigrfam', 'cog', 'kog'],
                    default='cdd',
                    help='The database to search')


def main():
    args = parser.parse_args()
    files = {"queries": args.fasta}
    nbci_url = "http://www.ncbi.nlm.nih.gov/Structure/bwrpsb/bwrpsb.cgi?"
    response = requests.post(
        "{nbci}tdata=hits&dmode=std&db={database}&compbasedadj=0&filter=true&evalue=0.0001&cddefl=true".format(
            nbci=nbci_url, database=args.db
        ),
Ejemplo n.º 9
0
__author__ = 'chris'

description = """
This script will annotate a tab delimited text file with peptides with
corresponding proteins present in an annotation file, and can also
use this annotation to include iBAQ measures.
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

# Command-line interface.
parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to match peptides against.")
# Both optional output files default to standard output.
parser.add_argument('--peptide_out', nargs='?', help="The file to write digested products to.", type=argparse.FileType('w'), default=sys.stdout)
parser.add_argument('--protein_out', nargs='?', help="The file to write grouped products to.", type=argparse.FileType('w'), default=sys.stdout)
parser.add_delimited_file()
parser.add_argument('-r', '--regex', help="A perl regular expression determining which parts of the header to capture.", type=str)
# NOTE(review): action='store_false' combined with default=False means
# args.no_inference is False whether or not the flag is passed, so the flag
# currently has no effect. Confirm the intended action/default against main().
parser.add_argument('--no-inference', help="Do not append proteins inferred from sequences.", action='store_false', default=False)
group = parser.add_argument_group('iBAQ related options')
group.add_argument('--ibaq', help="Provide to append iBAQ values as well (requires protein inference).", action='store_true', default=False)
group.add_argument('--precursors', help="The column with precursor area (defaults to header lines containing 'Precursor').", type=int, default=None)
parser.add_enzyme()
# store_false: args.no_normalize is True by default and becomes False when
# --no-normalize is passed.
group.add_argument('--no-normalize', help="Don't normalize iBAQ to total intensity", action='store_false', default=True)
group.add_argument('--case-sensitive', help="Treat peptides as case-sensitive (ie separate modified peptides)", action='store_true', default=False)
protein_group = parser.add_argument_group('Protein Grouping Options')
protein_group.add_argument('--unique-only', help="Only group proteins with unique peptides", action='store_true', default=False)
protein_group.add_argument('--position', help="Write the position of the peptide matches.", action='store_true', default=False)
Ejemplo n.º 10
0
#!/usr/bin/env python

# Program description handed to CustomParser (shown in the --help output).
description = """
This script will accept a given nucleotide fasta file and output
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

# Command-line interface. ArgumentDefaultsHelpFormatter adds each option's
# default value to its help text.
parser = CustomParser(
    description=description,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_fasta()
parser.add_out()
parser.add_argument(
    '--min',
    type=int,
    default=50,
    help="Minimum ORF length in amino acids.",
)
parser.add_argument(
    '--both-strands',
    action='store_true',
    default=False,
    help="Search both strands for ORFs.",
)

def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    with args.out as o:
        for header, sequence in fasta_file:
            for i in xrange(3):
                strand='+'
                translation = fasta._translate(sequence[i:])
Ejemplo n.º 11
0
#!/usr/bin/env python

__author__ = 'chris'

description = """
This script will lookup features from one delimited file in another delimited file, and
perform various operations on the found entries in the alternative file
"""

import sys, csv
from pythomics.templates import CustomParser
from pythomics.utils import ColumnFunctions

# Command-line interface: two delimited inputs (-a and -b), each with its own
# delimiter/column/header options.
parser = CustomParser(description=description)
parser.add_delimited_file(
    files=['-a'],
    delimiter=['--adelim'],
    cols=['--acol'],
    header=['--aheader'],
    help="This is the file to lookup values from.",
)
parser.add_delimited_file(
    files=['-b'],
    delimiter=['--bdelim'],
    cols=['--bcol'],
    header=['--bheader'],
    help="This is the file to lookup values in.",
)
# The default is the integer 1 while a user-supplied value arrives as a str.
parser.add_argument(
    '--blookup',
    type=str,
    default=1,
    help='The column to take entries from in file b.',
)
parser.add_argument(
    '--strict',
    action='store_true',
    help='For numeric operations, fail if types are incorrect (converting NA to a float for instance).',
)
parser.add_out()
parser.add_argument(
    '--function',
    type=str,
    choices=['concat', 'mean', 'sum', 'median', 'var', 'std', 'count'],
    default='concat',
    help='The function to apply to found entries.',
)
parser.add_argument(
    '--colname',
    type=str,
    default='',
    help='The column name to give the new appended value. Defaults to function chosen',
)
parser.add_argument(
    '--aregex',
    type=str,
    default='',
    help='An optional regex pattern for matching columns in file a.',
)
parser.add_argument(
    '--bregex',
    type=str,
    default='',
    help='An optional regex pattern for matching columns in file b.',
)

def main():
    args = parser.parse_args()
    a_colname, b_colname, bl_colname = False, False, False
    try:
        a_column = int(args.acol)
        a_column = a_column-1 if a_column > 0 else a_column
    except ValueError:
Ejemplo n.º 12
0
#!/usr/bin/env python

# Program description handed to CustomParser (shown in the --help output).
description = """
This script will accept a given nucleotide fasta file and output
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

# Command-line interface. ArgumentDefaultsHelpFormatter adds each option's
# default value to its help text.
parser = CustomParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_fasta()
parser.add_out()
parser.add_argument("--min", type=int, default=50, help="Minimum ORF length in amino acids.")
parser.add_argument("--both-strands", action="store_true", help="Search both strands for ORFs.")
parser.add_argument("--no-met-start", action="store_true", help="Output ORFs starting with amino acids other than MET")
parser.add_argument('--from-met',
                    help="Truncate leading amino acids up to MET",
Ejemplo n.º 13
0
#!/usr/bin/env python

description = """
This script will digest a given fasta file with the specified enzymes. 
Both protein and nucleotide fasta files are valid inputs, and when
digesting fasta files, it is possible to create 6 frame as well as 
3 frame translations.
"""

import argparse, sys, itertools
from pythomics.templates import CustomParser
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_argument("-t", "--type", type=str, choices=["prot", "nt"], default="prot", help="The type of fasta file (default protein).")
# No default is given for --frame, so args.frame is None unless it is supplied.
parser.add_argument("--frame", type=int, choices=[1, 3, 6], help="If using a nucleotide file, translate in how many frames?")
parser.add_argument(
    '--genome',
    help=
    "Are we translating a genome? This will keep chromosome positions in the header.",
Ejemplo n.º 14
0
#!/usr/bin/env python

# Program description handed to CustomParser (shown in the --help output).
description = """
This script will accept a given nucleotide fasta file and output
found ORFs. ORFs are annotated by which stop codon they are a part
of. As in, ORF 3 is annotated as the 3rd sequence if the translated
sequence is divided by stop codons. This is to prevent ambiguity with
differing minimum lengths of ORFs.
"""

from pythomics.templates import CustomParser
import sys, argparse
import pythomics.parsers.fasta as fasta

# Command-line interface. ArgumentDefaultsHelpFormatter adds each option's
# default value to its help text.
parser = CustomParser(
    description=description,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
parser.add_fasta()
parser.add_out()
parser.add_argument(
    '--min',
    type=int,
    default=50,
    help="Minimum ORF length in amino acids.",
)
# The remaining switches are all store_true flags (False unless given).
parser.add_argument(
    '--both-strands',
    action='store_true',
    help="Search both strands for ORFs.",
)
parser.add_argument(
    '--no-met-start',
    action='store_true',
    help="Output ORFs starting with amino acids other than MET",
)
parser.add_argument(
    '--from-met',
    action='store_true',
    help="Truncate leading amino acids up to MET",
)
parser.add_argument(
    '--from-met-keep',
    action='store_true',
    help="Truncate leading amino acids up to MET, but keep the untruncated version as well.",
)


def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
Ejemplo n.º 15
0
#!/usr/bin/env python

description = """
This script will digest a given protein fasta file with the specified enzymes
and summarize how much of the proteome is covered, what residues are missed,
and what isoforms can be uniquely identified.
"""

import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

# Command-line interface.
parser = CustomParser(description = description)
parser.add_fasta()
parser.add_out()
# main() splits args.enzyme on ',' so the help must ask for a comma separated
# list (the previous wording said "command separated").
parser.add_enzyme(help="Enzyme to use. Pass a comma separated list (no spaces); "
                    "the order of enzymes will be the order of digestion if digesting in series.")
parser.add_argument('--parallel', help="Should cleavages be done in parallel (default is serial digestion)?", action='store_true', default=False)
# NOTE(review): store_true with default=True makes args.series True whether or
# not --series is passed; confirm whether the flag is still needed.
parser.add_argument('--series', help="Should cleavages be done in series? (default)", action='store_true', default=True)


def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme.split(',')
    peptides_found = {}
    retained = {}
    total = 0
Ejemplo n.º 16
0
from __future__ import division, absolute_import

__author__ = 'chris'

description = """

"""

import sys
import os
import operator
import argparse

from pythomics.templates import CustomParser

# Command-line interface; the inference-file options share one help heading.
parser = CustomParser(description=description)
group = parser.add_argument_group("Protein Inference File")
# --inference is mandatory and accepts one or more readable files.
group.add_argument("--inference", nargs="+", type=argparse.FileType("r"), required=True, help="The protein inference file (your peptide file with gene/protein annotations). For multiple files, separate by spaces (must be in same order as mods).")
group.add_argument("--gene", type=str, default="Gene", help="The Gene column name")
group.add_argument('--protein',
                   help="The Protein column name",
                   type=str,
Ejemplo n.º 17
0
#!/usr/bin/env python

description = """
This produces a bam file corresponding to junctional regions in a given gtf file
"""

import sys
import pysam
from pythomics.genomics.parsers import GFFReader
from pythomics.templates import CustomParser

# Command-line interface. add_bam/add_bam_out/add_gff are helper methods of
# the CustomParser imported from pythomics.templates; main() reads args.bam,
# args.out_bam, args.gff, args.group_on, args.feature and args.cufflinks from
# the parsed namespace.
parser = CustomParser(description=description)
parser.add_bam()
parser.add_bam_out()
parser.add_gff()


def main():
    args = parser.parse_args()
    samfile = pysam.Samfile(args.bam, 'rb')
    junctionreads = pysam.Samfile(args.out_bam, 'wb', template=samfile)
    id_tag = args.group_on
    chosen_feature = args.feature
    if args.cufflinks:
        gff = GFFReader(args.gff, preset='cufflinks')
    else:
        gff = GFFReader(args.gff, tag_map={'ID': id_tag, 'Parent': 'Parent'})
    written = set([])
    for feature_name, feature in gff.get_features():
        try:
            children = feature.children
Ejemplo n.º 18
0
#!/usr/bin/env python

description = """
This produces a bam file corresponding to junctional regions in a given gtf file
"""

import sys
import pysam
from pythomics.genomics.parsers import GFFReader
from pythomics.templates import CustomParser

# Command-line interface. add_bam/add_bam_out/add_gff are helper methods of
# the CustomParser imported from pythomics.templates; main() reads args.bam,
# args.out_bam, args.gff, args.group_on, args.feature and args.cufflinks from
# the parsed namespace.
parser = CustomParser(description = description)
parser.add_bam()
parser.add_bam_out()
parser.add_gff()

def main():
    args = parser.parse_args()
    samfile = pysam.Samfile(args.bam, 'rb')
    junctionreads = pysam.Samfile(args.out_bam, 'wb', template=samfile)
    id_tag = args.group_on
    chosen_feature = args.feature
    if args.cufflinks:
        gff = GFFReader(args.gff, preset='cufflinks')
    else:
        gff = GFFReader(args.gff, tag_map={'ID': id_tag, 'Parent': 'Parent'})
    written = set([])
    for feature_name, feature in gff.get_features():
        try:
            children = feature.children
        except AttributeError:
Ejemplo n.º 19
0
#!/usr/bin/env python

description = """
This script will digest a given protein fasta file with the specified enzymes
and summarize how much of the proteome is covered, what residues are missed,
and what isoforms can be uniquely identified.
"""

import sys, copy, re
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
import pythomics.proteomics.config as config
from pythomics.templates import CustomParser

# Command-line interface. add_fasta/add_out/add_enzyme are helper methods of
# the CustomParser imported from pythomics.templates.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_out()
parser.add_enzyme(
    help=(
        "Enzyme to use. Pass a list like \"trypsin lysc\" to use multiple enzymes.  "
        "The order of enzymes will be the order of digestion if digesting in series."
    )
)
# store_true: args.parallel is False unless --parallel is given.
parser.add_argument(
    '--parallel',
    action='store_true',
    help="Should cleavages be done in parallel (default is serial digestion)?",
)


def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
Ejemplo n.º 20
0
#!/usr/bin/env python

__author__ = 'chris'

description = """
This script will lookup features from one delimited file in another delimited file, and
perform various operations on the found entries in the alternative file
"""

import sys, csv
from pythomics.templates import CustomParser
from pythomics.utils import ColumnFunctions

# Command-line interface: two delimited inputs (-a and -b), each with its own
# delimiter/column/header options.
parser = CustomParser(description=description)
parser.add_delimited_file(files=["-a"], delimiter=["--adelim"], cols=["--acol"], header=["--aheader"], help="This is the file to lookup values from.")
parser.add_delimited_file(files=["-b"], delimiter=["--bdelim"], cols=["--bcol"], header=["--bheader"], help="This is the file to lookup values in.")
# The default is the integer 1 while a user-supplied value arrives as a str.
parser.add_argument("--blookup", type=str, default=1, help="The column to take entries from in file b.")
parser.add_argument(
    '--strict',
    help=
Ejemplo n.º 21
0
__author__ = 'chris'

description = """
This script will take a delimited file and collapse features together, such
as scan numbers. It can also be used to group peptides into longer sequences
with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta

# Command-line interface.
parser = CustomParser(description=description)
parser.add_delimited_file()
parser.add_out()
parser.add_argument(
    '--substring',
    action='store_true',
    default=False,
    help='If set, merge features by partial matches (such as collapsing peptides into larger peptides)',
)
parser.add_argument(
    '--merge-columns',
    action='store_true',
    default=False,
    help="If set, columns of merged peptides will be combined.",
)
parser.add_argument(
    '--merge-delimiter',
    type=str,
    default=';',
    help='The delimiter for column merges.',
)
parser.add_argument(
    '--case-sensitive',
    action='store_true',
    default=False,
    help="Treat peptides as case-sensitive (ie separate modified peptides)",
)

def main():
    args = parser.parse_args()
    peptide_column = args.col-1
    tsv_file = args.tsv
    header_lines = args.header
    delimiter = args.delimiter
    peptide_join = args.substring
    col_delimiter = args.merge_delimiter
Ejemplo n.º 22
0
creating summary statistics for them.

For instance, gene ids can be selected and their FPKM/iBAQ values combined.
Also, features can be grouped into longer sequences
with the --substring flag (ex: peptides LNGERPEPTIDE and ERPEPT will be merged
into LNGERPEPTIDE).
"""

import argparse, sys, re, csv, copy, decimal
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

# Command-line interface.
parser = CustomParser(description=description)
parser.add_delimited_file(cols=["--group-on"])
parser.add_out()
parser.add_argument("--substring", action="store_true", help="If set, merge features by partial matches (such as collapsing peptides into larger peptides)")
parser.add_column_function("--summary-col", col_help="The function to apply to grouped entries in modification columns.")
parser.add_argument(
    '--summary-col-delimiter',
    help=
    "If the summary column has a delimiter, such as a ; for multiple proteins."
Ejemplo n.º 23
0
"""

import argparse, sys, csv, copy, decimal, itertools, os, operator
try:
    import re2 as re
except ImportError:
    import re
from multiprocessing import Pool, Value
from collections import Counter
from pythomics.templates import CustomParser
import pythomics.proteomics.config as config
import pythomics.proteomics.digest as digest
import pythomics.parsers.fasta as fasta
from pythomics.utils import ColumnFunctions

# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta(help="The fasta file to match peptides against.")
parser.add_out(help="The name of the file you wish to create with results appended.")
# Both optional output files default to os.devnull.
parser.add_argument("--peptide-out", nargs="?", type=argparse.FileType("w"), default=os.devnull, help="The file to write digested products to.")
parser.add_argument("--protein-out", nargs="?", type=argparse.FileType("w"), default=os.devnull, help="The file to write grouped products to.")
parser.add_argument(
    '--strict',
Ejemplo n.º 24
0
#!/usr/bin/env python

description = """
This script will trim N's from the ends of a fasta/fastq file so it can be
aligned by tophat (which pukes if there are >5 N's. We remove them from the read
ends only)
"""

import sys, re, os, gzip
from itertools import izip
from multiprocessing import Pool
from pythomics.templates import CustomParser
# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_read_pair()
parser.add_out()
parser.add_argument("--min-len", type=int, default=25, help="The minimum read length reads must be after trimming.")
# No default is given for --prefix, so args.prefix is None unless it is supplied.
parser.add_argument("--prefix", type=str, help="If using paired reads, this is the filename prefix.")
parser.add_argument("--quality", type=int, default=0, help="If provided, remove qualities below a given score.")
parser.add_argument('--chunk',
                    help='How many reads to submit to each core.',
                    type=int,
Ejemplo n.º 25
0
from __future__ import division, absolute_import

__author__ = 'chris'

description = """

"""

import sys
import os
import operator
import argparse

from pythomics.templates import CustomParser

# Command-line interface.
parser = CustomParser(description=description)

# Options describing the (mandatory) protein inference file(s).
group = parser.add_argument_group('Protein Inference File')
group.add_argument(
    '--inference',
    nargs='+',
    type=argparse.FileType('r'),
    required=True,
    help="The protein inference file (your peptide file with gene/protein annotations). For multiple files, separate by spaces (must be in same order as mods).",
)
group.add_argument(
    '--gene',
    type=str,
    default='Gene',
    help="The Gene column name",
)
group.add_argument(
    '--protein',
    type=str,
    default='Protein',
    help="The Protein column name",
)
group.add_argument(
    '--peptide',
    type=str,
    default='Peptide',
    help="The Peptide column name",
)
group.add_argument(
    '--quant',
    nargs='+',
    default=['Heavy/Light'],
    help="The name of quantification columns (such as Heavy/Light). Separate multiple columns by spaces",
)

# Options describing the (mandatory) modification file(s).
mods = parser.add_argument_group('Modification File')
mods.add_argument(
    '--mods',
    nargs='+',
    type=argparse.FileType('r'),
    required=True,
    help="The modifications file (the file with sites, peptides). For multiple files, separate by spaces (must be in same order as inference).",
)
mods.add_argument(
    '--site-protein',
    type=str,
    default='Protein',
    help="The mod file protein column name",
)

# Ungrouped normalization switches and the optional whole-proteome input.
parser.add_argument(
    '--no-log2',
    action='store_true',
    help='Do not log2 normalize quantification values.',
)
parser.add_argument(
    '--no-median',
    action='store_true',
    help='Do not normalize quantification values by the median of the experiment.',
)
parser.add_argument(
    '--wp',
    nargs='+',
    type=argparse.FileType('r'),
    help="The whole proteome inference file, if it exists. For multiple replicates, separate by spaces.",
)
parser.add_argument(
    '--non-mod-norm',
    action='store_true',
    help='Normalize the data by the non-modified peptides.',
)
Ejemplo n.º 26
0
__author__ = 'Chris Mitchell'

from pythomics.templates import CustomParser
import sys
import argparse
import operator
import pythomics.parsers.fasta as fasta
import pythomics.genomics.parsers as gp

# Program description handed to CustomParser (shown in the --help output).
description = """
This script will incorporate a given GFF file into a specified
fasta file. It can also incorporate variants given in a VCF file
while generating this fasta file.
"""

# Command-line interface, organised into GFF, VCF and splice-junction groups.
parser = CustomParser(description = description)
parser.add_fasta(help="The fasta file to reference.")
parser.add_out(help="The file to write resulting fasta file to.")
# --gff is the only required option in this group.
gff_group = parser.add_argument_group('GFF file related options')
gff_group.add_argument('--gff', help="The GFF file to use.", type=argparse.FileType('r'), required=True)
gff_group.add_argument('--group-on', help="The key to group entries together by (such as transcript_id)", type=str, default='ID')
gff_group.add_argument('--feature', help="The feature to use for fetching coordinates (such as CDS, does not apply with cufflinks flag).", type=str, default='')
gff_group.add_argument('--cufflinks', help="If the gff file is in the standard cufflinks output", action='store_true', default=False)
vcf_group = parser.add_argument_group('VCF file related options')
# NOTE(review): add_vcf is called on the argument group here, whereas the
# other add_* helpers in this block are called on `parser` itself. Confirm
# that the object returned by add_argument_group provides add_vcf.
vcf_group.add_vcf()
vcf_group.add_argument('--variants-only', help="Only output transcripts with variants.", action='store_true', default=False)
splice_group = parser.add_argument_group('Splice Junction Options (if a variant falls over a exon-exon junction. Default is to ignore.)')
splice_group.add_argument('--splice-partial', help="Partially splice variants (only include exonic portions of variant)", action='store_true', default=False)


def main():
Ejemplo n.º 27
0
#!/usr/bin/env python

description = """
This script will trim N's from the ends of a fasta/fastq file so it can be
aligned by tophat (which pukes if there are >5 N's. We remove them from the read
ends only)
"""

import sys, re, os, gzip
from itertools import izip
from multiprocessing import Pool
from pythomics.templates import CustomParser
# Command-line interface.
parser = CustomParser(description=description)
parser.add_fasta()
parser.add_read_pair()
parser.add_out()
parser.add_argument(
    '--min-len',
    type=int,
    default=25,
    help="The minimum read length reads must be after trimming.",
)
parser.add_argument(
    '--prefix',
    type=str,
    help="If using paired reads, this is the filename prefix.",
)
parser.add_argument(
    '--quality',
    type=int,
    default=0,
    help='If provided, remove qualities below a given score.',
)
parser.add_argument(
    '--chunk',
    type=int,
    default=1000,
    help='How many reads to submit to each core.',
)
# store_false: args.no_gzip is True by default and becomes False when
# --no-gzip is passed.
parser.add_argument(
    '--no-gzip',
    action='store_false',
    help='To disable compression with gzip.',
)
# Currently disabled options:
# parser.add_argument('--5partial-match', help='This will trim partial matches at the 3\' end of the sequence if there is a match of at least x nucleotides.', type=int, default=0)
# parser.add_argument('--seed-length', help='The seed length for a match.', type=int, default=0)
# parser.add_argument('--mismatches', help='The number of possible mismatches in a sequence.', type=int, default=3)

# Patterns matching a run of N characters at the start / at the end of a string.
start_trim = re.compile(r'^N+')
end_trim = re.compile(r'N+$')
# Module-level global declarations; they do not bind any value here.
global quality_min, quality_offset, paired, read_min