def test_parser_ncbi(self):
    # A header shaped like '>ID description [taxon]' must come back from
    # fasta.parse as separate 'id', 'description' and 'taxon' fields, with
    # the square brackets stripped from the taxon.
    raw = ('>NP_001351877.1 '
           'acylglycerol kinase, mitochondrial isoform 2 '
           '[H**o sapiens]')
    expected = dict(
        id='NP_001351877.1',
        description='acylglycerol kinase, mitochondrial isoform 2',
        taxon='H**o sapiens',
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def test_parser_spd_mult_ids(self):
    # Two space-separated accessions precede the first '|'; the expected
    # result keeps both of them, space included, in the single 'id' field
    # and splits the 'A1AG1_HUMAN' token into 'gene_id' and 'taxon'.
    raw = ('>P02763 Q8TC16|A1AG1_HUMAN| '
           'Alpha-1-acid glycoprotein 1 precursor (AGP 1) '
           '(Orosomucoid-1) (OMD 1)')
    expected = dict(
        id='P02763 Q8TC16',
        gene='A1AG1_HUMAN',
        gene_id='A1AG1',
        taxon='HUMAN',
        description=('Alpha-1-acid glycoprotein 1 precursor (AGP 1) '
                     '(Orosomucoid-1) (OMD 1)'),
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def test_parser_unimes(self):
    # The header has no leading '>' and carries KEY=value pairs after the
    # free-text name; 'SV' is expected back as the integer 1, the other
    # values as strings.
    raw = ('MES00000000005 '
           'Putative uncharacterized protein GOS_3018412 (Fragment) '
           'OS=marine metagenome Pep=JCVI_PEP_1096688850003 SV=1')
    expected = dict(
        id='MES00000000005',
        name='Putative uncharacterized protein GOS_3018412 (Fragment)',
        OS='marine metagenome',
        Pep='JCVI_PEP_1096688850003',
        SV=1,
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def test_parser_spd(self):
    # Header layout '>ID|GENE_TAXON| description': the middle token is
    # expected both verbatim ('gene') and split at the underscore into
    # 'gene_id' and 'taxon'; the trailing period stays in the description.
    raw = ('>P31947|1433S_HUMAN| '
           '14-3-3 protein sigma (Stratifin) '
           '(Epithelial cell marker protein 1).')
    expected = dict(
        id='P31947',
        gene='1433S_HUMAN',
        gene_id='1433S',
        taxon='HUMAN',
        description=('14-3-3 protein sigma (Stratifin) '
                     '(Epithelial cell marker protein 1).'),
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def describe(args):
    """Log a short summary of the FASTA database at ``args.file``.

    The headers of all entries yielded by ``fasta.read(args.file)`` are
    collected.  The function then logs, through the module ``logger``:

    * the number of entries (INFO) and the first header (DEBUG);
    * when there are more than two entries, the common prefix of the
      lower and of the upper part of the sorted header list -- one
      message if both prefixes agree, another if they differ;
    * which flavors in ``fasta.std_parsers`` parse the first header
      without raising.

    Any exception raised while reading is logged and ends the function;
    nothing is re-raised for that step.

    Parameters
    ----------
    args : object
        Must expose a ``file`` attribute that is passed to ``fasta.read``
        (presumably an argparse namespace -- confirm against the caller).

    Returns
    -------
    None
    """
    logger.debug('describe called with: %s', args)

    try:
        headers = [header for header, _ in fasta.read(args.file)]
    except Exception as err:
        logger.info('Not a valid FASTA file.')
        logger.debug('Exception: %s', err)
        return

    total = len(headers)
    logger.info('Found %s FASTA entries.', total)
    if not total:
        return

    logger.debug('First entry: %s', headers[0])

    if total > 2:
        # Sorting is done in place, so headers[0] below refers to the
        # lexicographically smallest header from here on.
        headers.sort()
        middle = total // 2
        lower_prefix = os.path.commonprefix(headers[:middle])
        # NOTE(review): the header at index `middle` belongs to neither
        # slice -- confirm that skipping it is intended.
        upper_prefix = os.path.commonprefix(headers[middle + 1:])
        if lower_prefix == upper_prefix:
            logger.info('Common prefix: %s', lower_prefix)
        else:
            logger.info('Common prefixes: %s, %s', lower_prefix, upper_prefix)

    # Try every known flavor on a single header and keep those that do not
    # raise.
    probe = headers[0]
    matching = []
    for flavor in fasta.std_parsers:
        try:
            fasta.parse(probe, flavor=flavor)
        except Exception as err:
            logger.debug('Header: %s; parsing exception: %s', probe, err)
            continue
        matching.append(flavor)

    if not matching:
        logger.info('Unknown header format.')
    elif len(matching) == 1:
        logger.info('Suggested header format: %s', matching[0])
    else:
        logger.info('Possible header formats: %s', ', '.join(matching))
def test_parser_uniref(self):
    # Besides the raw KEY=value fields ('n' as the integer 1), the expected
    # result contains derived fields: the id split into 'type' and
    # 'accession', and 'RepID' split into 'gene_id' and 'taxon'.
    raw = ('>UniRef100_A5DI11 Elongation factor 2 '
           'n=1 Tax=Pichia guilliermondii RepID=EF2_PICGU')
    expected = dict(
        id='UniRef100_A5DI11',
        type='UniRef100',
        accession='A5DI11',
        cluster='Elongation factor 2',
        n=1,
        Tax='Pichia guilliermondii',
        RepID='EF2_PICGU',
        gene_id='EF2',
        taxon='PICGU',
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def test_parser_uniptokb_isoform(self):
    # The accession carries an isoform suffix ('-2') that must survive in
    # 'id'; the header has no leading '>' and only the OS and GN fields.
    raw = ('sp|Q4R572-2|1433B_MACFA '
           'Isoform Short of 14-3-3 protein beta/alpha '
           'OS=Macaca fascicularis GN=YWHAB')
    expected = dict(
        db='sp',
        id='Q4R572-2',
        entry='1433B_MACFA',
        gene_id='1433B',
        taxon='MACFA',
        name='Isoform Short of 14-3-3 protein beta/alpha',
        OS='Macaca fascicularis',
        GN='YWHAB',
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def digest(sequences, name_col, min_length=6, missed_cleavage=2,
           protease="trypsin"):
    """Digest a collection of FASTA entries into a table of peptides.

    Each entry's sequence is cleaved with ``parser.cleave`` using the rule
    stored under ``protease`` in ``parser.expasy_rules``.  Every resulting
    peptide is recorded next to the ``"id"`` field that ``fasta.parse``
    extracts from the entry's description.

    Parameters
    ----------
    sequences : iterable of (str, str)
        ``(description, sequence)`` pairs, e.g. a pyteomics FASTA file
        reader.
    name_col : object
        Value written to the ``"DB"`` column of every row.
    min_length : int, optional
        Forwarded to ``parser.cleave`` as ``min_length``.
    missed_cleavage : int, optional
        Forwarded to ``parser.cleave`` as ``missed_cleavages``.
    protease : str, optional
        Key into ``parser.expasy_rules`` selecting the cleavage rule.  It
        is looked up once per entry, so an unknown key raises ``KeyError``
        only if ``sequences`` is non-empty.

    Returns
    -------
    pandas.DataFrame
        One row per peptide with the columns ``"Peptidesequence"``,
        ``"Protein"`` and ``"DB"``.  An empty ``sequences`` yields a frame
        with these columns and no rows.
    """
    # Two parallel lists: peptides[i] was produced from protein proteins[i].
    peptides = []
    proteins = []
    for description, sequence in sequences:
        protein_id = fasta.parse(description)["id"]
        cleaved = parser.cleave(
            sequence,
            parser.expasy_rules[protease],
            missed_cleavages=missed_cleavage,
            min_length=min_length,
        )
        peptides.extend(cleaved)
        proteins.extend([protein_id] * len(cleaved))

    # Build the named columns directly instead of creating a 2 x N frame
    # and transposing/relabelling it afterwards.
    peptide_df = pd.DataFrame(
        {"Peptidesequence": peptides, "Protein": proteins}
    )
    peptide_df["DB"] = name_col
    return peptide_df
def test_parser_uniprotkb(self):
    # Full 'db|id|entry name KEY=value ...' header.  The OS value contains
    # spaces, slashes and parentheses and must be returned intact; 'PE' and
    # 'SV' are expected as integers; 'entry' is additionally split into
    # 'gene_id' and 'taxon'.
    raw = ('sp|P27748|ACOX_RALEH '
           'Acetoin catabolism protein X '
           'OS=Ralstonia eutropha '
           '(strain ATCC 17699 / H16 / DSM 428 / Stanier 337) '
           'GN=acoX PE=4 SV=2')
    expected = dict(
        db='sp',
        id='P27748',
        entry='ACOX_RALEH',
        gene_id='ACOX',
        taxon='RALEH',
        name='Acetoin catabolism protein X',
        OS=('Ralstonia eutropha '
            '(strain ATCC 17699 / H16 / DSM 428 / Stanier 337)'),
        GN='acoX',
        PE=4,
        SV=2,
    )
    actual = fasta.parse(raw)
    self.assertEqual(actual, expected)
def test_parser_uniparc(self):
    # Minimal header: an identifier followed by a single 'status=value'
    # pair, expected back as exactly two string fields.
    expected = dict(id='UPI0000000005', status='active')
    actual = fasta.parse('>UPI0000000005 status=active')
    self.assertEqual(actual, expected)