def read_embl(path_to_embls: list, num_of_entries: int, exclude_csv: str, queue):
    """ Reads entries from a list of existing embl files """
    if exclude_csv is None:
        # If no exclude csv is provided, we execute the reading without an if check (performance)
        for input_f in path_to_embls:
            # For each entry: try to read it and
            # add it to the queue
            try:
                entries = SwissProt.parse(input_f)
                for entry in entries:
                    queue.put(entry)
            except Exception as e:
                print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
    else:
        # If an exclude csv is provided, then a simple if check is added (reduced performance)
        with open(exclude_csv) as in_f:
            # Read the contents of the csv
            csv_reader = csv.reader(in_f)
            exclude_set = set(x[0] for x in csv_reader)
        for input_f in path_to_embls:
            # For each entry: try to read it and
            # add it to the queue
            try:
                entries = SwissProt.parse(input_f)
                for entry in entries:
                    if entry.accessions[0] in exclude_set:
                        # This effectively skips an entry at the cost of checking whether to skip in EACH entry!
                        continue
                    queue.put(entry)
            except Exception as e:
                print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
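# A minimal usage sketch for read_embl above; the input file names are
# hypothetical. The parsed SwissProt records land on a multiprocessing.Queue,
# which also works for a single-process smoke test like this one.
from multiprocessing import Queue

queue = Queue()
read_embl(["sprot_chunk_1.dat", "sprot_chunk_2.dat"],  # hypothetical files
          num_of_entries=2, exclude_csv=None, queue=queue)
first_entry = queue.get()
print(first_entry.entry_name)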
def sync_query_list_with_response(response_fn, query_list):
    db_data = {}
    with open(response_fn, 'r') as fh:
        for record in SwissProt.parse(fh):
            acc = record.accessions[0]
            # Select only EMBL and RefSeq crossrefs
            refseq_refs, embl_refs = [], []
            for db_ref in record.cross_references:
                if db_ref[0] == 'RefSeq':
                    refseq_refs.append(db_ref[1:])
                elif db_ref[0] == 'EMBL':
                    embl_refs.append(db_ref[1:])
            db_data[acc] = {'RefSeq': refseq_refs, 'EMBL': embl_refs}
    # This is to handle isoforms
    # E.g. P03692 and P03692-1 can both be included in the query list
    # P03705-2 can be in the query list but not P03705
    for prot in query_list:  # For each of the original queries
        if (prot not in db_data) and ('-' in prot):  # If the query is not returned
            base_name = prot.split('-')[0]  # Search for the first part
            if base_name in db_data:  # If it is present in the db_data
                if base_name not in query_list:  # If it's not in the original query list
                    db_data[prot] = db_data[base_name]  # Fill in the information for the corresponding protein
                    db_data.pop(base_name)  # AND remove the original entry
                elif base_name in query_list:  # If the first part is in the original query
                    db_data[prot] = db_data[base_name]  # Fill in the information WITHOUT removing the original entry
        elif prot not in db_data:
            print("I don't know what to do with this id: {}".format(prot))
    return db_data
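# A small illustration of the isoform handling in sync_query_list_with_response;
# the response file name is a placeholder, and the accessions are the ones used
# in the comments above: P03692-1 resolves through its base entry P03692 when
# UniProt only returns the latter.
queries = ["P03692-1", "P03705-2"]
db_data = sync_query_list_with_response("uniprot_response.dat", queries)
for prot in queries:
    print(prot, db_data.get(prot, {}).get("RefSeq"))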
def write_swissprot_annotations(outf, indentation_level, uniprot, uniprot_f):
    uniprot_dat_indices = UniProtDatIndex.objects.filter(uniprot=uniprot)
    for uniprot_dat_index in uniprot_dat_indices:
        if uniprot_dat_index.uniprot_accession == uniprot.accession:
            break
    uniprot_f.seek(uniprot_dat_index.file_char)
    record = next(SwissProt.parse(uniprot_f))
    indented_write(outf, indentation_level + 1, "Length: %d\n" % record.sequence_length)
    if len(record.gene_name) > 0:
        for name_spec in record.gene_name.replace('\n', ' ').split('; '):
            name_type, names = name_spec.split('=')
            indented_write(outf, indentation_level + 1, "%s: %s\n" % (name_type, names))
    for keyword in record.keywords:
        indented_write(outf, indentation_level + 1, 'Keyword: %s\n' % keyword)
    for comment in record.comments:
        if comment[0:5] == '-----':
            continue
        components = comment.replace(':\n', ': ').split(': ')
        comment_type = components[0]
        comment_lines = ': '.join(components[1:]).split('\n')
        indented_write(outf, indentation_level + 1, "%s:\n" % comment_type)
        for line in comment_lines:
            indented_write(outf, indentation_level + 2, "%s\n" % line)
    for cross_reference in record.cross_references:
        indented_write(
            outf, indentation_level + 1,
            "%s: %s\n" % (cross_reference[0], '; '.join(cross_reference[1:])))
def go_in_papers(sp_path):
    # Returns: papers: key: pubmed_id; value: list of go_rec records
    # go_rec record is a dictionary. Keys (values): 'sp_id' (swissprot id);
    # 'go_id': (GO ID); 'go_ec': (GO Evidence Code).
    # To be used with SP data, not GOA
    papers = {}
    go_ids = {}
    sp_recs = {}
    papers_prots = {}
    sph = open(sp_path)
    for sp_rec in SP.parse(sph):
        cur_go_recs = get_go_evidence_codes(sp_rec)
        # print(cur_go_recs)
        if not cur_go_recs:
            continue
        cur_papers = get_papers(sp_rec)
        for paper in cur_papers:
            if paper not in papers_prots:
                papers_prots[paper] = {sp_rec.entry_name: 1}
            else:
                papers_prots[paper][sp_rec.entry_name] = \
                    papers_prots[paper].get(sp_rec.entry_name, 0) + 1
            for cur_go_rec in cur_go_recs:
                d1 = dict(sp_id=sp_rec.entry_name,
                          go_id=cur_go_rec[0],
                          go_ec=cur_go_rec[1])
                papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
def pull_uniprot(repull=False):
    xmlname = os.path.join(os.path.dirname(__file__), 'uniprot_sprot_human.dat')
    if repull:
        xmldata = pull_and_decompress(
            'ftp.uniprot.org',
            '/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/',
            'uniprot_sprot_human.dat.gz')
        with open(xmlname, 'w') as xmlfile:
            xmlfile.write(xmldata)
    seq_to_idlist = defaultdict(set)
    # I only want the PRO sequences. One day, I could get the -1 -2 sequences as well if
    # there were a reason.
    with open(xmlname, 'r') as unif:
        for record in SwissProt.parse(unif):
            uniprotid = f'UniProtKB:{record.accessions[0]}'
            #xrefs = [f"{x[0]}:{x[1]}" for x in record.cross_references if x[0].lower() in ['mint', 'string', 'nextprot']]
            #xrefs.append(f'PR:{record.accessions[0]}')
            #xrefs.append(uniprotid)
            feats = [f for f in record.features
                     if f[4].startswith('PRO_') and isinstance(f[1], int) and isinstance(f[2], int)]
            fseq = [(record.sequence[f[1] - 1:f[2]], f[4]) for f in feats]
            #seq_to_idlist[record.sequence].update(xrefs)
            for fs, fn in fseq:
                seq_to_idlist[fs].add(f'{uniprotid}#{fn}')
    return seq_to_idlist
def file_parse():
    file = gzip.open("uniprot.gz", "rt")
    # Declaration of lists which check for repetitions
    non_rep_id = []
    non_rep_org = []
    non_rep_tax = []
    swiss_records = SwissProt.parse(file)
    for swiss_record in swiss_records:
        # NCBI ID
        id = swiss_record.taxonomy_id
        if id not in non_rep_id:
            non_rep_id.append(id)
        # ORGANISM NAME
        organism = swiss_record.organism.strip('.')
        if organism not in non_rep_org:
            non_rep_org.append(organism)
        # TAXONOMY
        taxonomy = swiss_record.organism_classification
        if taxonomy not in non_rep_tax:
            non_rep_tax.append(taxonomy)
    # ZIP lists to column/tab separated output
    for i in zip(non_rep_id, non_rep_org, non_rep_tax):
        print("\t".join(map(str, i)))
def parse_uniprot(input_file):
    dic_pfam = {}
    dic_dom = {}
    dic_king = {}
    # probably faster/easier to use the XML parser directly
    #print(input_file)
    handle = open(input_file)
    for record in SwissProt.parse(handle):
        #print(record)
        #print(record.entry_name)
        #print(record.cross_references)
        entry = record.entry_name
        id = entry
        dic_pfam[id] = ''
        dic_dom[id] = 0
        dic_king[id] = 'Unique'
        for db in record.cross_references:
            if db[0] == "Pfam":
                dic_pfam[id] = dic_pfam[id] + db[1] + ";"
                dic_dom[id] += 1
                if db[1] in shared_domains:
                    dic_king[id] = "Shared"
        if dic_dom[id] == 0:
            dic_king[id] = 'None'
    return dic_pfam, dic_dom, dic_king
def test_compute_features_return_empty_list_if_features_are_empty(self):
    for record in SwissProt.parse(self.records):
        protein = parse_record_into_protein(record)
        break
    protein.go_mf = None
    protein.go_bp = None
    protein.go_cc = None
    protein.interpro = None
    protein.pfam = None
    protein.keywords = None
    protein.save(self.session, commit=True)
    protein = Protein.query.get(protein.id)  # Refresh
    features = compute_interaction_features(protein, protein)
    expected = dict(go_mf=[], go_bp=[], go_cc=[], ulca_go_mf=[],
                    ulca_go_bp=[], ulca_go_cc=[], interpro=[],
                    pfam=[], keywords=[])
    self.assertEqual(expected, features)
def test_can_parse_record_into_protein_objects(self):
    for record in SwissProt.parse(self.records):
        obj = parse_record_into_protein(record)
        break
    self.assertEqual(obj.uniprot_id, "P31946")
    self.assertEqual(obj.gene_id, "YWHAB")
    self.assertEqual(obj.reviewed, True)
def features(files):
    ft = ['ZN_FING', 'REGION', 'METAL', 'SITE', 'SIGNAL', 'REPEAT',
          'NP_REGION', 'BINDING', 'MOTIF', 'MOD_RES', 'LIPID', 'DOMAIN',
          'DNA_BIND', 'DISULFID', 'CROSSLNK', 'CARBOHYD', 'CA_BIND', 'ACT_SITE']
    for record in SwissProt.parse(open(files)):
        for l in record.features:
            if l[0] in ft:
                print(l[0] + ',' + str(l[1]) + '-' + str(l[2]) + ',' + l[3])
def _parse_features(self):
    print('uniprot flat files, to get features...')
    with open(path + files[16], 'wt') as outf:
        for j in [11, 12, 13, 14]:
            print(files[j] + '...')
            with open(path + files[j], 'rt') as handle:
                for record in SwissProt.parse(handle):
                    if record.taxonomy_id[0] in ['9606', '10090', '10116']:
                        accs = record.accessions
                        acc = accs.pop(0)
                        feats = record.features
                        for f in feats:
                            f = list(f)
                            f.insert(3, '')
                            if re.search(r'^[^\.]+\.\s*$', f[4]):
                                m = re.match(r'^(.+)\.\s*$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = ''
                            elif re.search(r'.+\.\s+\{', f[4]):
                                m = re.match(r'^(.+)\.\s*\{(.+)\}\.$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = m.group(2)
                            elif re.search(r'.+\.\s+\/', f[4]):
                                m = re.match(r'^(.+)\.\s*\/(.+)\.$', f[4])
                                if m:
                                    f[3] = m.group(1)
                                    f[4] = m.group(2)
                            else:
                                f[4] = re.sub(r'[\{\}\.\/]', '', f[4])
                            #print(f)
                            outf.write(acc + "\t" + '\t'.join(map(str, f)) + '\n')
def test_compute_features_returns_None_if_target_is_None(self):
    for record in SwissProt.parse(self.records):
        protein = parse_record_into_protein(record)
        break
    protein.save(self.session, commit=True)
    protein = Protein.query.get(protein.id)  # Refresh
    self.assertIsNone(compute_interaction_features(protein, None))
def get_genes(self, gene_name=""):
    if gene_name != "":
        print("Finding \"{}\" gene in Uniprot database...".format(gene_name))
        upper_name = gene_name.upper()  # Rho --> RHO
        output_handle = open(self.fasta_file, "w")
        for record in SwissProt.parse(self.fd):
            match = record.gene_name[5:5 + len(upper_name) + 1].upper()
            # Name=Rhodop; --> RHOD (length of the queried name (rho) + 1)
            # For matching the two possibilities
            # 1) Name=Rho;
            # 2) Name=rho {ECO.....}
            # So, it will compare the queried gene name and match one e.g.
            # in 1st case "RHO " == "RHO;" or "RHO;" == "RHO;"
            # in 2nd case "RHO " == "RHO " or "RHO;" == "RHO "
            # We do not consider gene names that differ from "Name=...;" in the swissprot file
            if (upper_name + " ") == match or (upper_name + ";") == match:
                print("Add protein to fasta file: " + record.entry_name + ", ...." + record.gene_name)
                output = ">" + record.entry_name + "\n" + record.sequence + "\n"
                #print(output)
                output_handle.write(output)
        output_handle.close()
def load_uniprot(self):
    self.uniprot = None
    if not self.exists('uniprot.txt'):
        return
    with self.open('uniprot.txt') as fp:
        self.uniprot = []
        for record in SwissProt.parse(fp):
            self.uniprot.append(record)
def test_parses_function_as_None_for_entry_with_no_comment(self):
    for record in SwissProt.parse(self.records):
        r = record
        break
    r.comments = [x for x in r.comments if "FUNCTION: " not in x]
    result = function(r)
    expected = None
    self.assertEqual(result, expected)
def generate_uniprot_record(self):
    for file_handle, file_number in self._uniprot_file_handle():
        data_source = self._file_number_to_source(file_number)
        for record in SwissProt.parse(file_handle):
            if self._check_id_to_use(record.accessions[0]):
                current_record_dict = self._parse_record(record, data_source)
                yield current_record_dict
def get_ancestors_list():
    i = 0
    handle = open("uniprot_sprot.dat")
    for record in SwissProt.parse(handle):
        descriptions.append(record.sequence)
        print(descriptions)
        i += 1
        if i == 1:
            break
def _parse_flat_files(self):
    print('uniprot flat files...')
    with open(path + files[15], 'wt') as outf:
        for j in [11, 12, 13, 14]:
            print(files[j] + '...')
            with open(path + files[j], 'rt') as handle:
                for record in SwissProt.parse(handle):
                    if record.taxonomy_id[0] in ['9606', '10090', '10116']:
                        accs = record.accessions
                        acc = accs.pop(0)
                        rev = record.data_class
                        gname = re.sub(r'.*Name=([^;{]+)[{;].*', r'\1', record.gene_name).strip()
                        uid = record.entry_name
                        taxid = record.taxonomy_id[0]
                        seq = record.sequence
                        sinfo = str(record.seqinfo[0])
                        srcdb = 'sp'
                        if re.search(r'trembl', files[j]):
                            srcdb = 'tr'
                        rname = ''
                        fname = ''
                        sname = ''
                        flags = ''
                        if 'RecName' in record.description:
                            rname = re.sub(r'.*RecName: *Full=([^;{]+)[{;].*', r'\1',
                                           record.description, flags=re.IGNORECASE).strip()
                        elif 'SubName' in record.description:
                            rname = re.sub(r'.*SubName: *Full=([^;{]+) *[;{].*', r'\1',
                                           record.description, flags=re.IGNORECASE).strip()
                        if 'AltName' in record.description:
                            if re.search(r'AltName:[^:]*Full=', record.description, re.IGNORECASE):
                                fname = re.sub(r'.*AltName:[^:]*Full=([^;{]+)[{;].*', r'\1',
                                               record.description, flags=re.IGNORECASE).strip()
                            if re.search(r'AltName:[^:]*Short=', record.description, re.IGNORECASE):
                                sname = re.sub(r'.*AltName:[^:]*Short=([^;{]+)[{;].*', r'\1',
                                               record.description, flags=re.IGNORECASE).strip()
                        if 'Flags:' in record.description:
                            flags = re.sub(r'.*Flags: *([^;]+);.*', r'\1',
                                           record.description, flags=re.IGNORECASE).strip()
                        refs = list()
                        eids = list()
                        mgis = list()
                        hgnc = list()
                        dids = list()
                        dnms = list()
                        ddbs = list()
                        for i in range(0, len(record.cross_references)):
                            if record.cross_references[i][0] == 'GeneID':
                                eids.append(record.cross_references[i][1])
                            if record.cross_references[i][0] == 'RefSeq':
                                refs.append(re.sub(r'\.\d+$', r'', record.cross_references[i][1]))
                            if record.cross_references[i][0] == 'MGI':
                                mgis.append(record.cross_references[i][1])
                            if record.cross_references[i][0] == 'HGNC':
                                hgnc.append(record.cross_references[i][1])
                            if record.cross_references[i][0] in xdoms:
                                dids.append(record.cross_references[i][1])
                                ddbs.append(record.cross_references[i][0])
                                dnms.append(record.cross_references[i][2])
                        outf.write('\t'.join([acc, uid, srcdb, taxid, rev, gname, rname,
                                              fname, sname, flags, '|'.join(accs),
                                              '|'.join(eids), '|'.join(refs),
                                              '|'.join(hgnc), '|'.join(mgis),
                                              '|'.join(ddbs), '|'.join(dids),
                                              '|'.join(dnms), sinfo, seq]) + '\n')
def test_parses_interpro_correctly(self):
    for record in SwissProt.parse(self.records):
        result = interpro_terms(record)
        break
    expected = [
        'IPR000308',
        'IPR023409',
        'IPR036815',
        'IPR023410',
    ]
    self.assertEqual(result, expected)
def give_me_the_record(primary_id, swissprot_file):
    """
    Return a single record given with the primary id

    :param primary_id: A primary id
    :param swissprot_file: A swissprot file
    :return: A record with accession == primary id
    """
    with open(swissprot_file, 'r') as fh:
        for record in SwissProt.parse(fh):
            if primary_id in record.accessions:
                return record
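# A minimal sketch of using give_me_the_record, assuming a local SwissProt
# flat file; the path is a placeholder and P31946 is the accession used in
# the tests elsewhere in this collection.
record = give_me_the_record("P31946", "uniprot_sprot.dat")
if record is not None:
    print(record.entry_name, record.sequence_length)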
def obtain_taxons(self, protein_dict, fh_sprot):
    found = False
    for rec in sp.parse(fh_sprot):
        for ac in range(len(rec.accessions)):
            if rec.accessions[ac] in protein_dict.keys():
                # assign rec.taxonomy_id list to the protein
                protein_dict[rec.accessions[ac]] = rec.taxonomy_id
                found = True
                break
        #if found:
        #    break
    return protein_dict
def __init__(self, sprot_cache='', trembl_cache='', organism='homo sapien'):
    self.records = {}
    self.organism = organism.strip().lower()
    if sprot_cache:
        # Load the swissprot records if file can be found
        try:
            with open(sprot_cache) as fp:
                for record in SwissProt.parse(fp):
                    for accession in record.accessions:
                        self.records[accession] = record
        except IOError as e:
            print(e)
            print("SwissProt cache not loaded")
def test_parses_keywords_correctly(self):
    for record in SwissProt.parse(self.records):
        result = keywords(record)
        break
    expected = [
        '3D-structure', 'Acetylation', 'Alternative initiation',
        'Complete proteome', 'Cytoplasm', 'Direct protein sequencing',
        'Host-virus interaction', 'Isopeptide bond', 'Nitration',
        'Phosphoprotein', 'Polymorphism', 'Reference proteome',
        'Ubl conjugation'
    ]
    self.assertEqual(result, expected)
def species_filter(sp_handle, taxon_id):
    target_id = int(taxon_id + "0000001")
    outhandle = open("sp_species.%s.tfa" % taxon_id, "w")
    for inrec in sp.parse(sp_handle):
        if taxon_id in inrec.taxonomy_id:
            outseq = SeqRecord(Seq(inrec.sequence),
                               id="T" + str(target_id),
                               description="%s" % (inrec.entry_name))
            outseq_list = [outseq]
            SeqIO.write(outseq_list, outhandle, "fasta")
            target_id += 1
    outhandle.close()
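# A usage sketch for species_filter, assuming a SwissProt flat file on disk
# (placeholder path). The taxon id is passed as a string because
# Record.taxonomy_id holds strings; 559292 (yeast) is the example taxon id
# used elsewhere in this collection.
with open("uniprot_sprot.dat") as sp_handle:
    species_filter(sp_handle, "559292")  # writes sp_species.559292.tfa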
def make_sp_sql_table(sp_handle, out_handle, taxa_ids=[]):
    """
    Make a table from swissprot that includes the following fields:
    SP_ID, GO, Ontology, evidence code
    """
    for sp_rec in sp.parse(sp_handle):
        go_list = parse_go(sp_rec)
        if not go_list:
            continue
        if taxa_ids and (not (sp_rec.taxonomy_id[0] in taxa_ids)):
            continue
        for go_entry in go_list:
            out_handle.write("%s\t%s\t%s\t%s\n" %
                             (sp_rec.entry_name, go_entry['go_id'],
                              go_entry['ontology'], go_entry['evidence']))
def n_go_ec(sp_handle, taxon_id, allowed_ec=exp_ec):
    # For each GO namespace, number of proteins annotated with allowed
    # terms. Uses experimental evidence codes, by default.
    ncount = {}
    ncount['BPO'] = 0
    ncount['MFO'] = 0
    ncount['CCO'] = 0
    for inrec in sp.parse(sp_handle):
        if taxon_id in inrec.taxonomy_id:
            in_allowed = go_ec_filter(inrec, allowed_ec=allowed_ec)
            for onto in in_allowed:
                if in_allowed[onto]:
                    ncount[onto] += 1
    return ncount
def test_compute_features_maps_to_alts_to_stable_ontology_terms(self):
    for record in SwissProt.parse(self.records):
        protein = parse_record_into_protein(record)
        break
    protein.go_mf = 'GO:0000975'
    protein.go_bp = None
    protein.go_cc = None
    protein.interpro = None
    protein.pfam = None
    protein.keywords = None
    features = compute_interaction_features(protein, protein)
    self.assertEqual(features['go_mf'], ['GO:0044212', 'GO:0044212'])
    self.assertTrue('GO:0000975' not in features['ulca_go_mf'])
def __build_NEXP_accession_singleSpecies(fh_sprot, taxon_id, ontType,
                                         EXP_default=set([])):
    '''
    This method builds a list of accessions of the proteins whose
    annotations have non-EXP evidence but no EXP evidence codes in a
    specific UniProtKB/SwissProt file (file pointer fh_sprot) for some
    ontology type (ontType). The method returns the list.
    '''
    # nexp_accessions: Initialize a list to store the accessions of the
    # proteins that meet the criteria: (1) the protein's annotation
    # is supported by some Non-EXP evidence code in the specific ontology
    # ontType, but (2) the annotation is NOT supported by any EXP
    # evidence code.
    nexp_accessions = []
    print('    Building the accession list with the proteins ' +
          'that have only non-EXP evidence codes at time t1 ...')
    for rec in sp.parse(fh_sprot):
        # Selects records that are related to a specific
        # taxonomy id taxon_id:
        if taxon_id in rec.taxonomy_id:
            # ont_specific_code_exist: this variable is initialized to False
            # at the beginning of each iteration. If an evidence code (either
            # EXP or Non-EXP) for the current record is found, this variable
            # will be set to True
            ont_specific_code_exist = False
            # exp_code: this variable is initialized to False at the beginning
            # of each iteration. If an EXP evidence for the current record is
            # found, this variable will be set to True.
            exp_code = False
            # Going over the list of DB reference entries:
            for crossRef in rec.cross_references:
                # Consider the cross_reference entries
                # that relate to GO DB:
                if crossRef[0] == 'GO':
                    goList = [crossRef[1],
                              (crossRef[3].split(':'))[0],
                              crossRef[2][0]]
                    if not ont_specific_code_exist and goList[2] == ontType:
                        ont_specific_code_exist = True
                    if goList[2] == ontType and \
                       (crossRef[3].split(':'))[0] in EXP_default:
                        exp_code = True
                        break
            # If the protein's annotation is supported by some Non-EXP evidence
            # code but is not supported by any EXP evidence code, append the
            # protein's accessions list to the nexp_accessions list:
            if ont_specific_code_exist and not exp_code:
                nexp_accessions.append(rec.accessions)
    return nexp_accessions
def obtain_goterms(self, goterm_dict, fh_sprot):
    found = False
    for rec in sp.parse(fh_sprot):
        for ac in range(len(rec.accessions)):
            if rec.accessions[ac] in goterm_dict.keys():
                for crossRef in rec.cross_references:
                    if crossRef[0] == 'GO':
                        goDef = (crossRef[1], (crossRef[3].split(':'))[0],
                                 crossRef[2][0])
                        goterm_dict[rec.accessions[ac]].add(goDef)
                found = True
                break
        #if found:
        #    break
    return goterm_dict
def UNIPROT_GENE_PLUS(UNIPROT):
    # LIST - The difference between this and UNIPROT_GENE is that
    # UNIPROT_GENE_PLUS returns synonym genes as well, if any, and the
    # gene name in the first entry
    import io
    from urllib.request import urlopen
    from Bio import SwissProt
    url = io.TextIOWrapper(
        urlopen("http://www.uniprot.org/uniprot/%s.txt" % UNIPROT),
        encoding="utf-8")
    GENES = []
    for record in SwissProt.parse(url):
        if len(record.gene_name.split(";")) > 2:
            GENES.append(record.gene_name.split(";")[0].split("=")[1])
            SYN = record.gene_name.split(";")[1].split("=")[1].split(",")
            for syno in SYN:
                GENES.append("".join(syno.split()))
        else:
            GENES.append(record.gene_name.split(";")[0].split("=")[1])
    return GENES
def count_GOterms_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    '''
    This method extracts the distinct GO terms for each gene that have
    validation with any of the experimental evidence codes. A set is
    created for these GO terms for each gene, and the sets are then
    placed in a dictionary for each ontological category. At the end,
    these THREE dictionaries are returned.
    '''
    mfo_terms = OrderedDict()
    bpo_terms = OrderedDict()
    cco_terms = OrderedDict()
    count = 0
    for rec in sp.parse(fh_sprot):
        # SELECT records that are related to a specific
        # taxon_id such as 559292 for yeast:
        if taxon_id in rec.taxonomy_id:
            protName = rec.accessions[0]
            # Initialize sets for adding GO terms:
            terms_mfo = set()
            terms_bpo = set()
            terms_cco = set()
            # Go over the list of DB cross references:
            for crossRef in rec.cross_references:
                # Consider the cross_reference entries that
                # relate to GO DB:
                if crossRef[0] == 'GO':
                    goList = [crossRef[1],
                              (crossRef[3].split(':'))[0],
                              crossRef[2][0]]
                    if (crossRef[3].split(':'))[0] in EXP_default:
                        # print(goList)
                        if goList[-1].upper() == 'F':
                            terms_mfo.add(goList[0])
                        elif goList[-1].upper() == 'P':
                            terms_bpo.add(goList[0])
                        elif goList[-1].upper() == 'C':
                            terms_cco.add(goList[0])
            # Record the GO term sets for this gene in the MFO, BPO,
            # and CCO dictionaries:
            mfo_terms[protName] = terms_mfo
            bpo_terms[protName] = terms_bpo
            cco_terms[protName] = terms_cco
            count += 1
            if count > 20:
                break
            #break
    return (mfo_terms, bpo_terms, cco_terms)
def count_genes_with_EXP_old(fh_sprot, taxon_id, EXP_default=set([])):
    # The exp_bpo_ct variable counts the total number of genes in
    # the sprot file related to the taxonomy id taxon_id whose
    # annotations have EXP evidence in the BPO ontological category:
    exp_bpo_ct = 0
    # The exp_cco_ct variable counts the total number of genes in
    # the sprot file related to the taxonomy id taxon_id whose
    # annotations have EXP evidence in the CCO ontological category:
    exp_cco_ct = 0
    # The exp_mfo_ct variable counts the total number of genes in
    # the sprot file related to the taxonomy id taxon_id whose
    # annotations have EXP evidence in the MFO ontological category:
    exp_mfo_ct = 0
    for rec in sp.parse(fh_sprot):
        # SELECT records that are related to a specific
        # taxon_id such as 559292 for yeast:
        if taxon_id in rec.taxonomy_id:
            bpo_exp_flag = cco_exp_flag = mfo_exp_flag = False
            # Go over the list of GO information:
            for crossRef in rec.cross_references:
                # Consider the cross_reference entries that
                # relate to GO DB:
                if crossRef[0] == 'GO':
                    goList = [crossRef[1],
                              (crossRef[3].split(':'))[0],
                              crossRef[2][0]]
                    if (crossRef[3].split(':'))[0] in EXP_default:
                        if goList[-1].upper() == 'P':
                            bpo_exp_flag = True
                        elif goList[-1].upper() == 'C':
                            cco_exp_flag = True
                        elif goList[-1].upper() == 'F':
                            mfo_exp_flag = True
                    if (bpo_exp_flag and cco_exp_flag and mfo_exp_flag):
                        break
            # Increase gene counts in BPO, CCO, and MFO categories
            # depending on the corresponding flag values:
            if bpo_exp_flag:
                exp_bpo_ct += 1
            if cco_exp_flag:
                exp_cco_ct += 1
            if mfo_exp_flag:
                exp_mfo_ct += 1
    return (exp_bpo_ct, exp_cco_ct, exp_mfo_ct)
def main():
    """Make a jazz noise here"""
    args = get_args()
    in_fh = args.FILE
    keywords = args.keyword
    skips = args.skip
    out_fh = args.output

    if not os.path.isfile(in_fh):
        die('"{}" is not a file'.format(in_fh))

    keylist = [keywords]
    outfile = open(out_fh, "w")
    fandle = open(in_fh)
    print('Processing "{}"'.format(in_fh))
    # SeqIO's "swiss" format wraps SwissProt.parse and exposes the keywords
    # and taxonomy through record.annotations
    records = SeqIO.parse(fandle, 'swiss')

    i = 0
    j = 0
    for record in records:
        docset = record.annotations['keywords']
        docsetlower = set(k.lower() for k in docset)
        userset = set(keylist)
        i += 1
        organismset = record.annotations['taxonomy']
        organismsetlower = set(o.lower() for o in organismset)
        skipsetlower = set(s.lower() for s in skips)
        if docsetlower.intersection(userset) and len(
                organismsetlower.intersection(skipsetlower)) == 0:
            j += 1
            SeqIO.write(record, outfile, 'fasta')

    print('Done, skipped {} and took {}. See output in "{}".'.format(
        (i - j), j, out_fh))
def check_sprot_format(fh_sprot):
    """
    This method checks whether the format of the file (with file handle
    fh_sprot) is in UniProtKB/Swissprot format.
    If the file is in UniProtKB/Swissprot format, it returns True.
    Otherwise, it returns False.
    """
    iter_handle = sp.parse(fh_sprot)  # sp.parse method returns a generator
    try:
        for rec in iter_handle:
            break
    except:
        return False
    else:
        return True
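# A quick sanity check with check_sprot_format before running a longer
# pipeline; the file name is a placeholder. Note that the check consumes the
# first record, so the handle is rewound before any real parse.
with open("maybe_sprot.dat") as fh:
    if check_sprot_format(fh):
        fh.seek(0)  # rewind so the real parse starts from the top
        print("Looks like UniProtKB/SwissProt format")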
def test_parses_gomf_correctly(self):
    for record in SwissProt.parse(self.records):
        result = go_terms(record, ont="mf")
        break
    expected = [
        'GO:0045296', 'GO:0019899', 'GO:0042826', 'GO:0042802',
        'GO:0051219', 'GO:0050815', 'GO:0008022', 'GO:0032403',
        'GO:0019904', 'GO:0003714',
    ]
    self.assertEqual(result, expected)
def count_genes_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    gene_count = {}
    gene_count['MFO'] = 0
    gene_count['BPO'] = 0
    gene_count['CCO'] = 0
    for rec in sp.parse(fh_sprot):
        # SELECT records that are related to a specific
        # taxon_id such as 559292 for yeast:
        if taxon_id in rec.taxonomy_id:
            # Three flags to check whether an Exp evidence is found
            # in any of BPO, CCO, and MFO ontological categories:
            exp_flag = {}
            exp_flag['MFO'] = False
            exp_flag['BPO'] = False
            exp_flag['CCO'] = False
            # Go over the list of DB cross references:
            for crossRef in rec.cross_references:
                # Consider the cross_reference entries that
                # relate to GO DB:
                if crossRef[0] == 'GO':
                    goList = [crossRef[1],
                              (crossRef[3].split(':'))[0],
                              crossRef[2][0]]
                    if (crossRef[3].split(':'))[0] in EXP_default:
                        if goList[-1].upper() == 'F':
                            exp_flag['MFO'] = True
                        elif goList[-1].upper() == 'P':
                            exp_flag['BPO'] = True
                        elif goList[-1].upper() == 'C':
                            exp_flag['CCO'] = True
                    # Whenever an exp evidence for all three ontological
                    # categories is found, break out of the loop:
                    if (exp_flag['MFO'] and exp_flag['BPO'] and exp_flag['CCO']):
                        break
            # Increase gene counts in BPO, CCO, and MFO categories
            # depending on the corresponding flag values:
            if exp_flag['MFO']:
                gene_count['MFO'] += 1
            if exp_flag['BPO']:
                gene_count['BPO'] += 1
            if exp_flag['CCO']:
                gene_count['CCO'] += 1
    return gene_count
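# A usage sketch for count_genes_with_EXP; the evidence codes below are the
# standard GO experimental codes, supplied here as an assumed default since
# the snippet leaves EXP_default empty, and the file path is a placeholder.
EXP_CODES = set(['EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP'])
with open("uniprot_sprot.dat") as fh_sprot:
    counts = count_genes_with_EXP(fh_sprot, "559292", EXP_default=EXP_CODES)
print(counts['MFO'], counts['BPO'], counts['CCO'])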
def read_sprot_dat(sprot_dat_file, seq_dict):
    num_record = 0
    # Use Bio.SwissProt to parse the uniprot_sprot.dat file
    for record in SwissProt.parse(open(sprot_dat_file)):
        for seqID in record.accessions:
            if seqID in seq_dict:
                num_record += 1
                if num_record % 10000 == 0:
                    sys.stderr.write("{} records read so far\n".format(num_record))
                # GO terms, e.g. ['GO:0031012', 'GO:0005576', 'GO:0004222', 'GO:0008270']
                go_terms = [i[1][3:] for i in record.cross_references if i[0] == 'GO']
                organism = record.organism  # organism name
                # taxonomic classification, e.g. ['Viruses', 'dsDNA viruses, no RNA stage', 'Iridoviridae', 'Chloriridovirus']
                lineage = record.organism_classification
                tax_id = record.taxonomy_id[0]  # taxonomy id, e.g. '345201'
                # GN line: includes gene names, ordered locus names, and ORF names
                gene_name, OLN, ORF = parse_GN(record.gene_name)
                # DE line with descriptive information: RecName, AltName (Full=, Short=, EC=, ...)
                full_name, EC = parse_DE(record.description)
                # map primary ID to annotation dictionary
                seq_dict[seqID] = {'organism': organism, 'EC': EC,
                                   'gene_name': gene_name, 'OLN': OLN, 'ORF': ORF,
                                   'GO': go_terms, 'KW': record.keywords,
                                   'full_name': full_name, 'tax_id': tax_id,
                                   'lineage': lineage}
            else:
                continue
    sys.stderr.write("\nnumber of sequences is {}\n".format(len(seq_dict)))
    return seq_dict
def process(files):
    # the file is a text file of a swissprot protein
    parsed = sp.parse(files)
    record = next(parsed)
    sequence = record.sequence
    length = len(sequence)
    a = record.features
    finished = False
    output = []
    for item in a:
        if finished:
            break
        else:
            if (not (item[0] == "CHAIN") and not (item[0] == "DOMAIN")):
                # this requires more refinement to allow for a range of domains
                output.append([item[0], item[1], item[2]])
            if item[2] == length:
                finished = True
    return output
def test_parses_gocc_correctly(self):
    for record in SwissProt.parse(self.records):
        result = go_terms(record, ont="cc")
        break
    expected = [
        'GO:0005737', 'GO:0030659', 'GO:0005829', 'GO:0070062',
        'GO:0005925', 'GO:0042470', 'GO:0016020', 'GO:0005739',
        'GO:0005634', 'GO:0048471', 'GO:0043234', 'GO:0017053',
    ]
    self.assertEqual(result, expected)
def setUp(self):
    self.records = open(
        os.path.normpath(
            "{}/test_data/test_sprot_records.dat".format(base_path)), 'rt')
    self.session, self.engine = create_session(db_path)
    delete_database(self.session)

    self.proteins = []
    for record in SwissProt.parse(self.records):
        protein = parse_record_into_protein(record)
        protein.save(self.session, commit=True)
        self.proteins.append(protein)

    self.labels = ['Activation', 'Inhibition', 'Acetylation']
    self.interactions = []
    for protein_a, protein_b in product(self.proteins, self.proteins):
        class_kwargs = compute_interaction_features(protein_a, protein_b)
        label = '{},{}'.format(self.labels[protein_a.id - 1],
                               self.labels[protein_b.id - 1])
        try:
            interaction = create_interaction(
                protein_a, protein_b, labels=label, session=self.session,
                verbose=False, save=True, commit=True, **class_kwargs)
            self.interactions.append(interaction)
        except ObjectAlreadyExists:
            continue

    self.X, self.y, _ = load_dataset(
        self.interactions, self.labels, selection=DEFAULT_SELECTION)
    base = Pipeline(steps=[
        ('vectorizer', CountVectorizer(lowercase=False, stop_words=[':', 'GO'])),
        ('estimator', LogisticRegression(random_state=0)),
    ])
    self.clf = MixedBinaryRelevanceClassifier(
        [clone(base) for _ in range(len(self.labels))])
    self.clf.fit(self.X, self.y)
def from_file(cls, path):
    db_records = []
    with open(path) as f:
        db = SwissProt.parse(f)
        for record in db:
            db_records.append(
                cls(
                    primary_accession=record.accessions[0],
                    primary_tax_id=int(record.taxonomy_id[0]),
                    gene_name=record.gene_name,
                    organism=record.organism,
                    description=record.description,
                    sequence=record.sequence,
                    comments=record.comments,
                    cross_references=record.cross_references,
                    accessions=record.accessions,
                    annotation_update=record.annotation_update,
                    created=record.created,
                    data_class=record.data_class,
                    entry_name=record.entry_name,
                    features=record.features,
                    host_organism=record.host_organism,
                    host_taxonomy_id=record.host_taxonomy_id,
                    keywords=record.keywords,
                    molecule_type=record.molecule_type,
                    organelle=record.organelle,
                    organism_classification=record.organism_classification,
                    protein_existence=record.protein_existence,
                    references=[x.__dict__ for x in record.references],
                    seqinfo=record.seqinfo,
                    sequence_length=record.sequence_length,
                    sequence_update=record.sequence_update,
                    taxonomy_id=record.taxonomy_id,
                ))
    with database:
        database.create_tables([cls])
        with database.atomic():
            cls.bulk_create(db_records, batch_size=250)
def UNIPROT_CHAIN_LIMITS(UNIPROT_ID):
    # Given a uniprot id, it returns the limits of the mature protein numbering
    import io
    from urllib.request import urlopen
    from Bio import SwissProt
    PAGE = io.TextIOWrapper(
        urlopen("http://www.uniprot.org/uniprot/%s.txt" % UNIPROT_ID),
        encoding="utf-8")
    PARSED_PAGE = SwissProt.parse(PAGE)
    for record in PARSED_PAGE:
        CHAIN_VALUES = []
        for feature in record.features:
            if feature[0] == "CHAIN":
                CHAIN_VALUES = CHAIN_VALUES + [str(feature[1]), str(feature[2])]
        if any(not X.isdigit() for X in CHAIN_VALUES) or not CHAIN_VALUES:
            CHAIN_START = 1
            CHAIN_END = record.sequence_length
        else:
            CHAIN_START = min(int(X) for X in CHAIN_VALUES)
            CHAIN_END = max(int(X) for X in CHAIN_VALUES)
        return [CHAIN_START, CHAIN_END]
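# A minimal call sketch for UNIPROT_CHAIN_LIMITS; P01308 (human insulin) is
# used here as an illustrative accession with well-known mature chains, and
# requires network access to uniprot.org.
start, end = UNIPROT_CHAIN_LIMITS("P01308")
print(start, end)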
def SwissIterator(handle):
    """Break up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format. See also the SeqIO support for "uniprot-xml" format.

    Rather than calling it directly, you are expected to use this
    parser via Bio.SeqIO.parse(..., format="swiss") instead.
    """
    swiss_records = SwissProt.parse(handle)
    for swiss_record in swiss_records:
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(seq,
                                     id=swiss_record.accessions[0],
                                     name=swiss_record.entry_name,
                                     description=swiss_record.description,
                                     features=[_make_seqfeature(*f)
                                               for f in swiss_record.features],
                                     )
        record.description = swiss_record.description
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if dbxref not in record.dbxrefs:
                record.dbxrefs.append(dbxref)
        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        if swiss_record.protein_existence:
            annotations['protein_existence'] = swiss_record.protein_existence
        if swiss_record.created:
            annotations['date'] = swiss_record.created[0]
            annotations['sequence_version'] = swiss_record.created[1]
        if swiss_record.sequence_update:
            annotations[
                'date_last_sequence_update'] = swiss_record.sequence_update[0]
            annotations['sequence_version'] = swiss_record.sequence_update[1]
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
            annotations['entry_version'] = swiss_record.annotation_update[1]
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = swiss_record.host_organism
        if swiss_record.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)
        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                feature.comment = " ".join("%s=%s;" % k_v
                                           for k_v in reference.comments)
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key == 'DOI':
                        pass
                    elif key == 'AGRICOLA':
                        pass
                    else:
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)
        if swiss_record.keywords:
            record.annotations['keywords'] = swiss_record.keywords
        yield record
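# As the docstring above says, SwissIterator is normally reached through
# Bio.SeqIO rather than called directly; a minimal sketch with a placeholder
# path:
from Bio import SeqIO

for seq_record in SeqIO.parse("uniprot_sprot.dat", "swiss"):
    print(seq_record.id, seq_record.annotations["organism"])
    break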
from Bio import SwissProt

with open('../../samples/spfile.txt') as fh:
    records = SwissProt.parse(fh)
    for record in records:
        print('Entry name: %s' % record.entry_name)
        print('Accession(s): %s' % ','.join(record.accessions))
        print('Keywords: %s' % ','.join(record.keywords))
        print('Sequence: %s' % record.sequence)
#Input="HumanReview.txt" # Global File Input
#Testing
#Input = "GLP.txt"
"""
AATarget1 = "TargetSequencePosition1.txt"
AATarget2 = "TargetSequencePosition2.txt"
"""
# Zero Variables
TargetSeq = []
GlobalStat = 0

from Bio import SwissProt

handle = open(Input, "r")
records = list(SwissProt.parse(handle))
print("Done Big List Parsing")
"""
HumanOut="HumanAccessions.txt" # Human Protein Accession File
# (This needs to be fasta format for later Analysis)
HumanMAPInput=HumanOut
HumanMAPOut="HumanMAPList.txt"
AlignInput="HumanMAPList.txt" # This is a File Name (used in SeqIO.parse)
MAPUpdateOutPut="HumanMAPFiltered.txt"
CytoMitoInput=MAPUpdateOutPut
CytoOutput="CytoList.txt"
# SwissProt / Uniprot flat file parsing
from Bio import SwissProt

for record in SwissProt.parse(open('/path/to/your/uniprot_sprot.dat')):
    pid = record.accessions[0]
    seq = record.sequence
    for feature in record.features:
        print(feature)
from Bio import SwissProt
import pickle

HANDLE = open("DATABASES/uniprot_sprot.dat")
DICT = {}
for record in SwissProt.parse(HANDLE):
    CHAIN_VALUES = []
    AC = []
    for ac in record.accessions:
        AC.append(ac)
    for feature in record.features:
        if feature[0] == "CHAIN":
            CHAIN_VALUES = CHAIN_VALUES + [str(feature[1]), str(feature[2])]
    print(CHAIN_VALUES)
    if any(not X.isdigit() for X in CHAIN_VALUES) or not CHAIN_VALUES:
        CHAIN_START = 1
        CHAIN_END = record.sequence_length
    else:
        CHAIN_START = min(int(X) for X in CHAIN_VALUES)
        CHAIN_END = max(int(X) for X in CHAIN_VALUES)
    print(AC[0])
    print(CHAIN_START, CHAIN_END)
    DICT[AC[0]] = [int(CHAIN_START), int(CHAIN_END)]

PICKLE_OUT = open("DATABASES/OBJECTS/DICT_UNIPROT_CHAIN_LIMITS.pi", "wb")
pickle.dump(DICT, PICKLE_OUT)
def uniprot2metaphors(outdir, hash2protid, accessions, verbose=True):
    """Parser for dat formatted dump of uniprot database.
    It's quick, much faster than xml parsing! But from time to time
    it may encounter a wrongly formatted DAT file.
    """
    # create working dir
    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    # parse dump in dat format
    files = {}  # will store opened files
    taxid2stats = {}  # { taxid: [matches, total] }
    for r in SwissProt.parse(sys.stdin):
        # skip entry if taxid not of interest
        taxid = int(r.taxonomy_id[0])
        if taxid not in hash2protid:
            continue
        # update stats
        if not taxid in taxid2stats:
            taxid2stats[taxid] = [0, 0]
        taxid2stats[taxid][1] += 1
        # check if md5 match
        md5 = hashlib.md5(r.sequence.encode()).hexdigest()
        if md5 not in hash2protid[taxid]:
            # skip if no match
            continue
        # save uniprot accession
        for protid in hash2protid[taxid][md5]:
            files = save_entry(outdir, files, protid, "accessions", r.accessions[0])
            # provide only accessions if requested so
            if accessions:
                continue
            # add xreference information for each external db
            for ex_db_data in r.cross_references:
                extdb, extid = ex_db_data[:2]
                files = save_entry(outdir, files, protid, extdb, extid)
            # save gene name
            if r.gene_name.startswith('Name='):
                extid = r.gene_name[5:].split(';')[0]
                files = save_entry(outdir, files, protid, "GeneName", extid)
            # save description
            if r.description:
                files = save_entry(outdir, files, protid, "Description", r.description)
        # update stats
        taxid2stats[taxid][0] += 1

    # close opened files
    for f in files:
        files[f].close()

    # print stats
    print("#taxid\tmapped\ttotal\t%")
    for taxid in sorted(taxid2stats.keys()):
        mapped, total = taxid2stats[taxid]
        print("%s\t%s\t%s\t%.2f" % (taxid, mapped, total, mapped * 100.0 / total))
    print("%s\tDone." % datetime.now())
from Bio import SwissProt

with open('../../samples/spfile.txt') as fh:
    record = next(SwissProt.parse(fh))
    for att in dir(record):
        if not att.startswith('__'):
            print(att, getattr(record, att))
def appendSprot2goa(fh_sprot, goa_file_name, taxon_id, fh_merged_go):
    """
    This method reads each record from the UniProtKB/SwissProt file and
    checks whether it's for taxon_id. If it is, this method then checks
    whether the GO term exists in the UniProt-GOA file passed as the file
    name goa_file_name. If it is a new GO term, this method invokes the
    swissProt2GOA method for each such GO term to construct a UniProt-GOA
    record, which it appends at the end of the merged UniProt-GOA file
    passed as file handle fh_merged_go.
    """
    # Creates an iterator object for t1 file:
    iter_handle, GAFFIELDS = create_iterator(goa_file_name)

    # Construct a dictionary goa_dict with the proteins and
    # the corresponding GO terms in t1 file:
    goa_dict = {}
    for ingen in iter_handle:
        if ingen['DB_Object_ID'] in goa_dict.keys():
            goa_dict[ingen['DB_Object_ID']].append(
                [ingen['GO_ID'], ingen['Evidence'], ingen['Aspect']])
        else:
            goa_dict[ingen['DB_Object_ID']] = [
                [ingen['GO_ID'], ingen['Evidence'], ingen['Aspect']]]

    # EXTRACTS the NEW GO terms in t2 file that are NOT found in t1 file:
    goCount = 0
    for rec in sp.parse(fh_sprot):
        # SELECTS records that are related to a specific taxon_id
        # such as 559292 for yeast:
        if taxon_id in rec.taxonomy_id:
            # Going over each of the entries of the accessions list:
            for ac in range(len(rec.accessions)):
                # knownProt is an indicator to detect whether the
                # current sprot protein is already in GOA file:
                knownProt = ""
                if rec.accessions[ac] in goa_dict.keys():
                    # If the current sprot protein is already in the GOA
                    # file, the sprot protein is assigned to knownProt:
                    knownProt = rec.accessions[ac]
                    break
            # Going over the list of GO information:
            for crossRef in rec.cross_references:
                # Consider the cross_reference entries that relate to GO DB:
                if crossRef[0] == 'GO':
                    # goList is a list of GO ID, Aspect, and Evidence:
                    goList = [crossRef[1], (crossRef[3].split(':'))[0],
                              crossRef[2][0]]
                    # Checking whether a new GO annotation is found:
                    if (not knownProt) or (knownProt and
                                           goList not in goa_dict[knownProt]):
                        # A new GO annotation is found in two situations:
                        # 1. if knownProt is empty (not knownProt) or
                        # 2. if knownProt is not empty but the GO annotation
                        #    is not found in the GOA file
                        # Convert the sprot record to a GOA record:
                        goaRec = swissProt2GOA(rec, crossRef, GAFFIELDS)
                        # Write the converted GOA record to the output file:
                        GOAParser.writerec(goaRec, fh_merged_go, GAFFIELDS)
                        # if goCount in range(1, 20) or goCount in range(6400, 6420):
                        #     print('goCount: ' + str(goCount) + '\n')
                        #     goaRec = swissProt2GOA(rec, crossRef, GAFFIELDS)
                        goCount += 1
    return goCount
def parse_uniprot_data(data):
    return dict((s.entry_name, s) for s in SwissProt.parse(data))
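# parse_uniprot_data keys the records by entry name; a minimal usage sketch
# with a placeholder file. 1433B_HUMAN is the entry name behind the P31946
# accession used in the tests elsewhere in this collection.
with open("uniprot_sprot.dat") as data:
    by_name = parse_uniprot_data(data)
print(by_name["1433B_HUMAN"].accessions[0])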
def SwissIterator(handle):
    """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     * Swiss-Prot aka SwissProt
     * TrEMBL
     * UniProtKB aka UniProt Knowledgebase

    It does NOT read their new XML file format.
    http://www.expasy.org/sprot/

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format.
    """
    swiss_records = SwissProt.parse(handle)
    for swiss_record in swiss_records:
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(seq,
                                     id=swiss_record.accessions[0],
                                     name=swiss_record.entry_name,
                                     description=swiss_record.description,
                                     )
        record.description = swiss_record.description
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if not dbxref in record.dbxrefs:
                record.dbxrefs.append(dbxref)
        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        annotations['date'] = swiss_record.created[0]
        annotations['date_last_sequence_update'] = swiss_record.sequence_update[0]
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = [word.rstrip(".")
                                            for word
                                            in swiss_record.host_organism]
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)
        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                feature.comment = " ".join(["%s=%s;" % (key, value)
                                            for key, value
                                            in reference.comments])
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key == 'DOI':
                        pass
                    elif key == 'AGRICOLA':
                        pass
                    else:
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)
        if swiss_record.keywords:
            record.annotations['keywords'] = swiss_record.keywords
        yield record
table_def_2 = ', '.join(table_def_items)  # table definition
cur.execute("CREATE TABLE IF NOT EXISTS " + tabla_sequence +
            " (" + table_def_2 + ") ENGINE=InnoDB;")
con.commit()

# Loop variables
i = 0
j = 0
ptm = ''
out = []
listap = []
listaq = []
listar = []
olista = []
interes = []

with open(uniprot_file) as uniprot:  # this opens the file and closes it at the end
    for record in SwissProt.parse(uniprot):  # parsing the uniprot records
        i += 1
        if i % 1000 == 0:
            print(i)
            con.commit()
        data = empty_data.copy()  # instead of emptying the dict, assign the default without binding it to the empty one
        # Here the general data for the PTMs of a protein/uniprot entry are loaded (entry instances)
        # they have to be loaded in the order of the columns in the ptmdb and of the insert
        # print(record.accessions[0])
        data['AC'] = record.accessions[0]  # only the primary one, not sure about the rest
        data['SQ'] = record.sequence
        data['LENGTH'] = record.sequence_length  # TODO: is there a problem here? entries with more than 999 residues don't get in
        data['ORG'] = record.organism  # the organism
        data['OC'] = record.organism_classification[0]  # the organism's domain
        data['OX'] = record.taxonomy_id[0]  # the organism's taxonomy id
# save the CREATE in output
table_def_items = []  # list for key/value concatenations
for cat, value in categories.items():  # key and value concatenations
    table_def_items.append(cat + ' ' + value)  # stored in the list
table_def_2 = ', '.join(table_def_items)  # table definition
#cur.execute("CREATE TABLE IF NOT EXISTS sprot2 (" + table_def_2 + ") ENGINE=InnoDB")
#con.commit()

ptm = ''
out = []
listap = []
listaq = []
listar = []

with open(sprot_file) as sprot:  # this opens the file and closes it at the end
    for record in SwissProt.parse(sprot):
        i += 1
        data = empty_data
        contenido_aa = count_amino_acids_ext(record.sequence)
        listaq = []
        for q in contenido_aa.values():
            listaq.append(str(q))  # and put them in a list
        sql_insert_values_q = ', '.join(listaq)
        cur.execute("INSERT INTO count_aa VALUES ('" + record.accessions[0] +
                    "', '" + record.organism_classification[0] + "', " +
                    str(record.sequence_length) + ", " + sql_insert_values_q + ")")
        con.commit()
# test cases have only one record.
# With the SequenceParser
test_handle = open(datafile)
records = list(SeqIO.parse(test_handle, "swiss"))
test_handle.close()
assert len(records) == 1
assert isinstance(records[0], SeqRecord)
# Check matches what we got earlier without the iterator:
assert str(records[0].seq) == str(seq_record.seq)
assert records[0].description == seq_record.description
assert records[0].name == seq_record.name
assert records[0].id == seq_record.id

# With the RecordParser
test_handle = open(datafile)
records = list(SwissProt.parse(test_handle))
test_handle.close()
assert len(records) == 1
assert isinstance(records[0], SProt.Record)
# Check matches what we got earlier without the iterator:
assert records[0].sequence == record.sequence
assert records[0].description == record.description
assert records[0].entry_name == record.entry_name
assert records[0].accessions == record.accessions