Esempio n. 1
0
 def test_parser_ncbi(self):
     # NCBI-style header of the shape "<id> <description> [<taxon>]".
     # The taxon had been mangled to 'H**o sapiens' (apparently by a text
     # filter); it is restored to the real species name in both the input
     # and the expectation so the fixture reflects a genuine header.
     header = ('>NP_001351877.1 acylglycerol kinase, mitochondrial '
               'isoform 2 [Homo sapiens]')
     expected = {
         'id': 'NP_001351877.1',
         'description': 'acylglycerol kinase, mitochondrial isoform 2',
         'taxon': 'Homo sapiens',
     }
     self.assertEqual(fasta.parse(header), expected)
Esempio n. 2
0
 def test_parser_spd_mult_ids(self):
     # Pipe-delimited header whose first field carries two space-separated
     # accessions; the expectation keeps both of them together under 'id'.
     description = ('Alpha-1-acid glycoprotein 1 precursor (AGP 1) '
                    '(Orosomucoid-1) (OMD 1)')
     raw_header = '>P02763 Q8TC16|A1AG1_HUMAN| ' + description
     expected = dict(
         id='P02763 Q8TC16',
         gene='A1AG1_HUMAN',
         gene_id='A1AG1',
         taxon='HUMAN',
         description=description,
     )
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 3
0
 def test_parser_unimes(self):
     # Header made of an id, a free-text name and KEY=value fields.
     # Note that 'SV' is expected as the integer 1, not the string '1'.
     protein_name = 'Putative uncharacterized protein GOS_3018412 (Fragment)'
     raw_header = ' '.join([
         'MES00000000005',
         protein_name,
         'OS=marine metagenome',
         'Pep=JCVI_PEP_1096688850003',
         'SV=1',
     ])
     expected = {
         'id': 'MES00000000005',
         'name': protein_name,
         'OS': 'marine metagenome',
         'Pep': 'JCVI_PEP_1096688850003',
         'SV': 1,
     }
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 4
0
 def test_parser_spd(self):
     # Pipe-delimited header "<id>|<gene>| <description>"; the expectation
     # additionally splits the gene field into 'gene_id' and 'taxon'.
     description = ('14-3-3 protein sigma (Stratifin) '
                    '(Epithelial cell marker protein 1).')
     raw_header = '>P31947|1433S_HUMAN| ' + description
     expected = dict(
         id='P31947',
         gene='1433S_HUMAN',
         gene_id='1433S',
         taxon='HUMAN',
         description=description,
     )
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 5
0
def describe(args):
    """Log a summary of the FASTA database given by ``args.file``.

    Reports the number of entries, the common header prefix(es) of the
    sorted headers, and which of ``fasta.std_parsers`` can parse one
    header. Everything is written to ``logger``; nothing is returned.
    """
    logger.debug('describe called with: %s', args)
    try:
        headers = [description for description, _ in fasta.read(args.file)]
    except Exception as err:
        # Any failure while reading is reported as an invalid file.
        logger.info('Not a valid FASTA file.')
        logger.debug('Exception: %s', err)
        return

    count = len(headers)
    logger.info('Found %s FASTA entries.', count)
    if not count:
        return

    logger.debug('First entry: %s', headers[0])

    if count > 2:
        headers.sort()
        half = count // 2
        # NOTE(review): the header at index `half` is in neither slice --
        # confirm that skipping it is intentional.
        lower_prefix = os.path.commonprefix(headers[:half])
        upper_prefix = os.path.commonprefix(headers[half + 1:])
        if lower_prefix == upper_prefix:
            logger.info('Common prefix: %s', lower_prefix)
        else:
            logger.info('Common prefixes: %s, %s', lower_prefix, upper_prefix)

    # NOTE(review): after the in-place sort above (count > 2) this is the
    # smallest header in sort order, which may differ from the
    # 'First entry' logged earlier.
    probe = headers[0]
    matching = []
    for flavor in fasta.std_parsers:
        try:
            fasta.parse(probe, flavor=flavor)
        except Exception as err:
            logger.debug('Header: %s; parsing exception: %s', probe, err)
            continue
        matching.append(flavor)

    if not matching:
        logger.info('Unknown header format.')
    elif len(matching) == 1:
        logger.info('Suggested header format: %s', matching[0])
    else:
        logger.info('Possible header formats: %s', ', '.join(matching))
Esempio n. 6
0
 def test_parser_uniref(self):
     # Header with a cluster name followed by n=/Tax=/RepID= fields.
     # The expectation also lists values that appear as pieces of those
     # fields: 'type' and 'accession' (from the id), 'gene_id' and 'taxon'
     # (from RepID). 'n' is expected as an integer.
     raw_header = ' '.join([
         '>UniRef100_A5DI11',
         'Elongation factor 2',
         'n=1',
         'Tax=Pichia guilliermondii',
         'RepID=EF2_PICGU',
     ])
     expected = {
         'id': 'UniRef100_A5DI11',
         'type': 'UniRef100',
         'accession': 'A5DI11',
         'cluster': 'Elongation factor 2',
         'n': 1,
         'Tax': 'Pichia guilliermondii',
         'RepID': 'EF2_PICGU',
         'gene_id': 'EF2',
         'taxon': 'PICGU',
     }
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 7
0
 def test_parser_uniptokb_isoform(self):
     # "db|id|entry name KEY=value ..." header whose id carries an isoform
     # suffix ('-2'); the expectation keeps that suffix in 'id'.
     protein_name = 'Isoform Short of 14-3-3 protein beta/alpha'
     raw_header = ('sp|Q4R572-2|1433B_MACFA ' + protein_name +
                   ' OS=Macaca fascicularis GN=YWHAB')
     expected = dict(
         db='sp',
         id='Q4R572-2',
         entry='1433B_MACFA',
         gene_id='1433B',
         taxon='MACFA',
         name=protein_name,
         OS='Macaca fascicularis',
         GN='YWHAB',
     )
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 8
0
def digest(sequences,
           name_col,
           min_length=6,
           missed_cleavage=2,
           protease="trypsin"):
    """
    Digest a collection of FASTA entries into peptides.

    Each entry's sequence is cleaved with ``parser.cleave`` and every
    resulting peptide is recorded together with the ``"id"`` field parsed
    from that entry's header by ``fasta.parse``.

    Parameters:
    --------------------------
    sequences: iterable of (description, sequence) pairs,
                e.g. a pyteomics FASTA reader.
    name_col: str,
                value written to every row of the "DB" column.
    min_length: int,
                minimal peptide length, forwarded to ``parser.cleave``
                as ``min_length``.
    missed_cleavage: int,
                forwarded to ``parser.cleave`` as ``missed_cleavages``.
    protease: str,
                key into ``parser.expasy_rules`` selecting the cleavage
                rule.

    Returns:
    ---------------
    df: pandas.DataFrame,
         one row per peptide with the columns "Peptidesequence",
         "Protein" and "DB".
    """
    # The rule depends only on `protease`, so look it up once; an unknown
    # protease name therefore fails before any entry is consumed.
    cleavage_rule = parser.expasy_rules[protease]

    # Parallel lists: peptides[i] was produced from protein proteins[i].
    peptides = []
    proteins = []

    for description, sequence in sequences:
        protein_id = fasta.parse(description)["id"]
        new_peptides = parser.cleave(sequence,
                                     cleavage_rule,
                                     missed_cleavages=missed_cleavage,
                                     min_length=min_length)

        peptides.extend(new_peptides)
        proteins.extend([protein_id] * len(new_peptides))

    # Build the frame column-wise instead of row-wise plus transpose.
    peptide_df = pd.DataFrame({
        "Peptidesequence": peptides,
        "Protein": proteins,
    })
    peptide_df["DB"] = name_col
    return peptide_df
Esempio n. 9
0
 def test_parser_uniprotkb(self):
     # Full "db|id|entry name OS=... GN=... PE=... SV=..." header.
     # The organism value contains spaces, slashes and parentheses, and
     # 'PE' / 'SV' are expected as integers.
     organism = ('Ralstonia eutropha '
                 '(strain ATCC 17699 / H16 / DSM 428 / Stanier 337)')
     raw_header = ('sp|P27748|ACOX_RALEH Acetoin catabolism protein X OS=' +
                   organism + ' GN=acoX PE=4 SV=2')
     expected = dict(
         db='sp',
         id='P27748',
         entry='ACOX_RALEH',
         gene_id='ACOX',
         taxon='RALEH',
         name='Acetoin catabolism protein X',
         OS=organism,
         GN='acoX',
         PE=4,
         SV=2,
     )
     self.assertEqual(fasta.parse(raw_header), expected)
Esempio n. 10
0
 def test_parser_uniparc(self):
     # Minimal header: an identifier plus a single key=value field.
     expected = dict(id='UPI0000000005', status='active')
     actual = fasta.parse('>UPI0000000005 status=active')
     self.assertEqual(actual, expected)