Example #1
0
def read_embl(path_to_embls: list, num_of_entries: int, exclude_csv: str, queue):
    """Parse SwissProt/EMBL inputs and put every kept entry on ``queue``.

    Args:
        path_to_embls: Inputs handed one by one to ``SwissProt.parse``.
        num_of_entries: Not used by this function; kept so existing callers
            keep working.
        exclude_csv: Path to a CSV file whose first column holds the primary
            accessions to skip, or ``None`` to enqueue every entry.
        queue: Object with a ``put`` method that receives each kept entry.

    An input that fails while parsing is reported on stdout and skipped;
    entries already queued from it stay queued.
    """
    exclude_set = None
    if exclude_csv is not None:
        # Read the exclusion list up front so the CSV is closed before parsing.
        with open(exclude_csv) as in_f:
            # csv.reader yields [] for blank lines; skip them instead of
            # failing on row[0].
            exclude_set = {row[0] for row in csv.reader(in_f) if row}

    for input_f in path_to_embls:
        try:
            entries = SwissProt.parse(input_f)
            if exclude_set is None:
                # No exclusion list: avoid a membership test per entry.
                for entry in entries:
                    queue.put(entry)
            else:
                for entry in entries:
                    if entry.accessions[0] not in exclude_set:
                        queue.put(entry)
        except Exception as e:
            # Deliberate best effort: one bad input must not stop the others.
            print("File '{}' could not be parsed and was excluded. Reason: {}".format(input_f, e))
Example #2
0
def sync_query_list_with_response(response_fn, query_list):
    """Collect RefSeq/EMBL cross-references per accession and map isoform queries.

    Args:
        response_fn: Path of a SwissProt flat file to parse.
        query_list: The identifiers that were originally queried.

    Returns:
        dict mapping an identifier to ``{'RefSeq': [...], 'EMBL': [...]}``,
        where each list item is the cross-reference with its database name
        stripped (``db_ref[1:]``).

    Isoform handling: a queried id containing ``-`` that is missing from the
    response (e.g. ``P03705-2``) receives the data of its base accession
    (``P03705``).  The base entry is removed from the result when the base
    itself was not queried.  Ids without ``-`` that are missing from the
    response are reported on stdout.
    """
    db_data = {}
    with open(response_fn, 'r') as fh:
        for record in SwissProt.parse(fh):
            # Select only EMBL and RefSeq crossrefs
            refseq_refs, embl_refs = [], []
            for db_ref in record.cross_references:
                if db_ref[0] == 'RefSeq':
                    refseq_refs.append(db_ref[1:])
                elif db_ref[0] == 'EMBL':
                    embl_refs.append(db_ref[1:])
            db_data[record.accessions[0]] = {'RefSeq': refseq_refs,
                                             'EMBL': embl_refs}

    queried = set(query_list)
    # Base accessions that were not queried are removed only after the loop,
    # so several isoforms of one base (P03705-2 and P03705-3) all get its
    # data; removing the base immediately would starve the later isoforms.
    unqueried_bases = set()
    for prot in query_list:
        if prot in db_data:
            continue
        if '-' not in prot:
            print("I don't know what to do with this id: {}".format(prot))
            continue
        base_name = prot.split('-')[0]
        if base_name in db_data:
            # The isoform shares the base entry's dict object.
            db_data[prot] = db_data[base_name]
            if base_name not in queried:
                unqueried_bases.add(base_name)

    for base_name in unqueried_bases:
        db_data.pop(base_name)

    return db_data
def write_swissprot_annotations(outf, indentation_level, uniprot, uniprot_f):
    """Write length, gene names, keywords, comments and cross-references of one entry.

    Args:
        outf: Output target passed through to ``indented_write``.
        indentation_level: Base indentation; fields are written one level
            deeper and comment bodies two levels deeper.
        uniprot: Object with an ``accession`` attribute, used to pick the
            matching ``UniProtDatIndex`` row.
        uniprot_f: Seekable SwissProt flat-file handle; it is positioned at
            the index row's ``file_char`` and the next record is parsed.
    """
    uniprot_dat_indices = UniProtDatIndex.objects.filter(uniprot=uniprot)
    # If no index row matches the accession the loop falls through and the
    # last row of the query is used; an empty query leaves the name unbound.
    for uniprot_dat_index in uniprot_dat_indices:
        if uniprot_dat_index.uniprot_accession == uniprot.accession:
            break
    uniprot_f.seek(uniprot_dat_index.file_char)
    # Built-in next() works on Python 2.6+ and 3, unlike the .next() method.
    record = next(SwissProt.parse(uniprot_f))
    indented_write(outf, indentation_level + 1,
                   "Length: %d\n" % record.sequence_length)
    if record.gene_name:
        for name_spec in record.gene_name.replace('\n', ' ').split('; '):
            # Split on the first '=' only so a value containing '=' still
            # unpacks into exactly two parts.
            name_type, names = name_spec.split('=', 1)
            indented_write(outf, indentation_level + 1,
                           "%s: %s\n" % (name_type, names))
    for keyword in record.keywords:
        indented_write(outf, indentation_level + 1, 'Keyword: %s\n' % keyword)
    for comment in record.comments:
        # Comments starting with a dashed rule are skipped.
        if comment[0:5] == '-----':
            continue
        components = comment.replace(':\n', ': ').split(': ')
        comment_type = components[0]
        comment_lines = ': '.join(components[1:]).split('\n')
        indented_write(outf, indentation_level + 1, "%s:\n" % comment_type)
        for line in comment_lines:
            indented_write(outf, indentation_level + 2, "%s\n" % line)
    for cross_reference in record.cross_references:
        indented_write(
            outf, indentation_level + 1,
            "%s: %s\n" % (cross_reference[0], '; '.join(cross_reference[1:])))
Example #4
0
def go_in_papers(sp_path):
    """Group GO annotations and protein counts by paper for a SwissProt file.

    To be used with SP data, not GOA.

    Args:
        sp_path: Path of the SwissProt flat file to parse.

    Returns:
        A tuple ``(papers, papers_prots)``:

        * ``papers`` maps each paper returned by ``get_papers`` (PubMed ids,
          according to the original comment) to a list of dicts with keys
          ``'sp_id'`` (the record's entry name), ``'go_id'`` and ``'go_ec'``
          (items 0 and 1 of each ``get_go_evidence_codes`` result).
        * ``papers_prots`` maps each paper to ``{entry_name: count}``.

    Records for which ``get_go_evidence_codes`` returns nothing are skipped.
    """
    papers = {}
    papers_prots = {}
    # The context manager closes the file even if parsing raises.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = \
                    prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
Example #5
0
def pull_uniprot(repull=False):
    """Map PRO_ feature sequences of the human SwissProt file to their ids.

    Args:
        repull: When true, fetch ``uniprot_sprot_human.dat.gz`` through
            ``pull_and_decompress`` and overwrite the local ``.dat`` file
            that sits next to this module before parsing it.

    Returns:
        ``defaultdict(set)`` mapping a sub-sequence (the slice
        ``sequence[start - 1:end]`` of a feature) to the set of
        ``'UniProtKB:<first accession>#<feature id>'`` strings.  Only
        features whose item 4 starts with ``PRO_`` and whose items 1 and 2
        are both ints are used.
    """
    dat_path = os.path.join(os.path.dirname(__file__),
                            'uniprot_sprot_human.dat')
    if repull:
        payload = pull_and_decompress(
            'ftp.uniprot.org',
            '/pub/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/',
            'uniprot_sprot_human.dat.gz')
        with open(dat_path, 'w') as dat_file:
            dat_file.write(payload)

    seq_to_idlist = defaultdict(set)
    # Only the PRO sequences are collected; the -1/-2 isoform sequences are
    # not handled here.
    with open(dat_path, 'r') as unif:
        for record in SwissProt.parse(unif):
            uniprotid = f'UniProtKB:{record.accessions[0]}'
            for feat in record.features:
                if not feat[4].startswith('PRO_'):
                    continue
                start, end = feat[1], feat[2]
                if not (isinstance(start, int) and isinstance(end, int)):
                    continue
                chain_seq = record.sequence[start - 1:end]
                seq_to_idlist[chain_seq].add(f'{uniprotid}#{feat[4]}')
    return seq_to_idlist
Example #6
0
def file_parse():
    """Print each distinct (taxonomy id, organism, lineage) row of "uniprot.gz".

    Every record of the gzip-compressed SwissProt file contributes the triple
    ``(taxonomy_id, organism without leading/trailing '.', organism_classification)``.
    Each distinct triple is printed once, in order of first appearance, as
    the string form of the tuple.

    The three values are de-duplicated together as one row.  De-duplicating
    each column on its own and zipping the results afterwards pairs values
    from different records (and drops rows) as soon as two organisms share a
    value in any one column.
    """
    seen = set()
    rows = []
    # NOTE(review): gzip.open defaults to binary mode; confirm whether the
    # SwissProt parser in use needs a text handle (mode "rt").
    with gzip.open("uniprot.gz") as handle:
        for swiss_record in SwissProt.parse(handle):
            # NCBI ID
            tax_id = swiss_record.taxonomy_id
            # ORGANISM NAME
            organism = swiss_record.organism.strip('.')
            # TAXONOMY
            taxonomy = swiss_record.organism_classification

            # tuple() makes the key hashable when the fields are lists.
            key = (tuple(tax_id), organism, tuple(taxonomy))
            if key not in seen:
                seen.add(key)
                rows.append((tax_id, organism, taxonomy))

    for row in rows:
        print(format(row))
def parse_uniprot(input_file):
    """Summarise the Pfam cross-references of every record in a SwissProt file.

    Args:
        input_file: Path of the SwissProt flat file.

    Returns:
        A tuple ``(dic_pfam, dic_dom, dic_king)`` of dicts keyed by the
        record's entry name:

        * ``dic_pfam``: the Pfam ids (item 1 of each ``Pfam``
          cross-reference), each followed by ``;``, concatenated.
        * ``dic_dom``: the number of Pfam cross-references.
        * ``dic_king``: ``'None'`` when there is no Pfam cross-reference,
          ``"Shared"`` when at least one Pfam id is in ``shared_domains``,
          otherwise ``'Unique'``.
    """
    dic_pfam = {}
    dic_dom = {}
    dic_king = {}
    # The context manager closes the file even if parsing raises.
    with open(input_file) as handle:
        for record in SwissProt.parse(handle):
            entry = record.entry_name
            pfam_ids = [db[1] for db in record.cross_references
                        if db[0] == "Pfam"]
            dic_pfam[entry] = "".join(pfam_id + ";" for pfam_id in pfam_ids)
            dic_dom[entry] = len(pfam_ids)
            if not pfam_ids:
                dic_king[entry] = 'None'
            elif any(pfam_id in shared_domains for pfam_id in pfam_ids):
                dic_king[entry] = "Shared"
            else:
                dic_king[entry] = 'Unique'
    return dic_pfam, dic_dom, dic_king
Example #8
0
    def test_compute_features_return_empty_list_if_features_are_empty(self):
        """Every feature list is empty when all annotation fields are None."""
        # Only the first fixture record is needed.
        for first_record in SwissProt.parse(self.records):
            protein = parse_record_into_protein(first_record)
            break

        for attr in ('go_mf', 'go_bp', 'go_cc',
                     'interpro', 'pfam', 'keywords'):
            setattr(protein, attr, None)

        protein.save(self.session, commit=True)
        protein = Protein.query.get(protein.id)  # Refresh

        features = compute_interaction_features(protein, protein)
        feature_names = ('go_mf', 'go_bp', 'go_cc',
                         'ulca_go_mf', 'ulca_go_bp', 'ulca_go_cc',
                         'interpro', 'pfam', 'keywords')
        expected = {name: [] for name in feature_names}
        self.assertEqual(expected, features)
Example #9
0
 def test_can_parse_record_into_protein_objects(self):
     """The first fixture record parses to P31946 / YWHAB, flagged reviewed."""
     # Only the first fixture record is needed.
     for first_record in SwissProt.parse(self.records):
         obj = parse_record_into_protein(first_record)
         break
     checks = (
         ("uniprot_id", "P31946"),
         ("gene_id", "YWHAB"),
         ("reviewed", True),
     )
     for attr, wanted in checks:
         self.assertEqual(getattr(obj, attr), wanted)
Example #10
0
def features(files):
    """Print selected feature-table entries of every record in a SwissProt file.

    Args:
        files: Path of the SwissProt flat file to read.

    For each feature whose key (item 0) is one of the listed feature types,
    one line ``<key>,<item 1>-<item 2>,<item 3>`` is printed.
    """
    ft = ['ZN_FING', 'REGION', 'METAL', 'SITE', 'SIGNAL', 'REPEAT',
          'NP_REGION', 'BINDING', 'MOTIF', 'MOD_RES', 'LIPID', 'DOMAIN',
          'DNA_BIND', 'DISULFID', 'CROSSLNK', 'CARBOHYD', 'CA_BIND',
          'ACT_SITE']
    # The context manager closes the file even if parsing raises.
    with open(files) as handle:
        for record in SwissProt.parse(handle):
            for feat in record.features:
                if feat[0] in ft:
                    # Single-argument print() behaves the same on Python 2 and 3.
                    print(feat[0] + ',' + str(feat[1]) + '-' + str(feat[2]) + ',' + feat[3])
Example #11
0
    def _parse_features( self ):
        """Dump the feature tables of selected records to a tab-separated file.

        Reads ``path + files[j]`` for j in 11..14 with ``SwissProt.parse`` and
        writes to ``path + files[16]``.  Only records whose first taxonomy id
        is 9606, 10090 or 10116 are used.  Each output line is the record's
        first accession followed by the feature's fields, with one extra
        column inserted at index 3.  The feature text (index 4 after the
        insert) is split into a leading description (index 3) and a trailing
        ``{...}`` or ``/...`` part (index 4) when it matches one of the
        recognised layouts; otherwise ``{``, ``}``, ``.`` and ``/`` are
        stripped from it.
        """
        print( 'uniprot flat files, to get features...' )
        with open( path + files[16], 'wt' ) as outf:

            for file_idx in [11,12,13,14]:
                print( files[file_idx] + '...' )
                with open(path + files[file_idx], 'rt') as handle:
                    for record in SwissProt.parse(handle):
                        if record.taxonomy_id[0] not in ['9606', '10090', '10116']:
                            continue
                        # pop(0) takes the first accession off the record's list.
                        acc = record.accessions.pop(0)
                        for feature in record.features:
                            cols = list(feature)
                            cols.insert(3, '')
                            text = cols[4]
                            if re.search(r'^[^\.]+\.\s*$', text):
                                # "<description>." with nothing after it
                                parts = re.match(r'^(.+)\.\s*$', text)
                                if parts:
                                    cols[3:5] = [parts.group(1), '']
                            elif re.search(r'.+\.\s+\{', text):
                                # "<description>. {<evidence>}."
                                parts = re.match(r'^(.+)\.\s*\{(.+)\}\.$', text)
                                if parts:
                                    cols[3:5] = [parts.group(1), parts.group(2)]
                            elif re.search(r'.+\.\s+\/', text):
                                # "<description>. /<qualifier>."
                                parts = re.match(r'^(.+)\.\s*\/(.+)\.$', text)
                                if parts:
                                    cols[3:5] = [parts.group(1), parts.group(2)]
                            else:
                                cols[4] = re.sub(r'[\{\}\.\/]', '', text)
                            line = acc + "\t" + '\t'.join(map(str, cols)) + '\n'
                            outf.write(line)
Example #12
0
 def test_compute_features_returns_None_if_target_is_None(self):
     """compute_interaction_features gives None when the second protein is None."""
     # Only the first fixture record is needed.
     for first_record in SwissProt.parse(self.records):
         protein = parse_record_into_protein(first_record)
         break
     protein.save(self.session, commit=True)
     refreshed = Protein.query.get(protein.id)  # Refresh
     outcome = compute_interaction_features(refreshed, None)
     self.assertIsNone(outcome)
Example #13
0
    def get_genes(self, gene_name=""):
        """Write every record whose gene name matches ``gene_name`` to ``self.fasta_file``.

        Args:
            gene_name: Gene name to look for, compared case-insensitively.
                Nothing is done (and no file is created) when it is empty.

        Records are read from ``self.fd`` with ``SwissProt.parse``.  Each
        match is written as ``>entry_name`` followed by the sequence.
        """
        if gene_name == "":
            return

        print("Finding \"{}\" gene in Uniprot database...".format(gene_name))
        upper_name = gene_name.upper()  # Rho --> RHO

        # The gene name field is expected to start with "Name=".  The slice
        # taken after those 5 characters is one character longer than the
        # query, so it ends with the delimiter that follows the name:
        #   1) Name=Rho;            -> "RHO;"
        #   2) Name=rho {ECO.....}  -> "RHO "
        # A longer name such as "Name=Rhodop;" gives "RHOD" and is rejected.
        # Gene names not written as "Name=...;" are not considered.
        accepted = (upper_name + " ", upper_name + ";")

        # The context manager closes the output even if parsing raises.
        with open(self.fasta_file, "w") as output_handle:
            for record in SwissProt.parse(self.fd):
                match = record.gene_name[5:5 + len(upper_name) + 1].upper()
                if match in accepted:
                    print("Add protein to fasta file: " + record.entry_name + ", ...." + record.gene_name)
                    # NOTE(review): if record.sequence is a plain str,
                    # .format("fasta") returns it unchanged — confirm intent.
                    output = ">" + record.entry_name + "\n" + record.sequence.format("fasta") + "\n"
                    output_handle.write(output)
Example #14
0
def go_in_papers(sp_path):
    """Group GO annotations and protein counts by paper for a SwissProt file.

    To be used with SP data, not GOA.

    Args:
        sp_path: Path of the SwissProt flat file to parse.

    Returns:
        A tuple ``(papers, papers_prots)``:

        * ``papers`` maps each paper returned by ``get_papers`` (PubMed ids,
          according to the original comment) to a list of dicts with keys
          ``'sp_id'`` (the record's entry name), ``'go_id'`` and ``'go_ec'``
          (items 0 and 1 of each ``get_go_evidence_codes`` result).
        * ``papers_prots`` maps each paper to ``{entry_name: count}``.

    Records for which ``get_go_evidence_codes`` returns nothing are skipped.
    """
    papers = {}
    papers_prots = {}
    # The context manager closes the file even if parsing raises.
    with open(sp_path) as sph:
        for sp_rec in SP.parse(sph):
            cur_go_recs = get_go_evidence_codes(sp_rec)
            if not cur_go_recs:
                continue
            for paper in get_papers(sp_rec):
                prot_counts = papers_prots.setdefault(paper, {})
                prot_counts[sp_rec.entry_name] = \
                    prot_counts.get(sp_rec.entry_name, 0) + 1
                for cur_go_rec in cur_go_recs:
                    d1 = dict(sp_id=sp_rec.entry_name,
                              go_id=cur_go_rec[0],
                              go_ec=cur_go_rec[1])
                    papers.setdefault(paper, []).append(d1)
    return papers, papers_prots
Example #15
0
 def load_uniprot(self):
     """Load all records of 'uniprot.txt' into ``self.uniprot``.

     ``self.uniprot`` is set to ``None`` when ``self.exists('uniprot.txt')``
     is false, otherwise to the list of records parsed from the handle
     returned by ``self.open('uniprot.txt')``.
     """
     self.uniprot = None
     if self.exists('uniprot.txt'):
         with self.open('uniprot.txt') as fp:
             self.uniprot = []
             self.uniprot.extend(SwissProt.parse(fp))
Example #16
0
 def test_parses_function_as_None_for_entry_with_no_comment(self):
     """function() gives None once every 'FUNCTION: ' comment is removed."""
     # Only the first fixture record is needed.
     for first_record in SwissProt.parse(self.records):
         r = first_record
         break
     kept_comments = []
     for text in r.comments:
         if "FUNCTION: " not in text:
             kept_comments.append(text)
     r.comments = kept_comments
     self.assertEqual(function(r), None)
Example #17
0
 def generate_uniprot_record(self):
     """Yield one parsed dict per accepted record across all UniProt files.

     Iterates the ``(file_handle, file_number)`` pairs produced by
     ``self._uniprot_file_handle()``.  A record is accepted when
     ``self._check_id_to_use`` is true for its first accession; the value
     yielded is ``self._parse_record(record, data_source)``, where
     ``data_source`` comes from ``self._file_number_to_source(file_number)``.
     """
     for file_handle, file_number in self._uniprot_file_handle():
         data_source = self._file_number_to_source(file_number)
         for record in SwissProt.parse(file_handle):
             if not self._check_id_to_use(record.accessions[0]):
                 continue
             yield self._parse_record(record, data_source)
def get_ancestors_list():
    """Append the first record's sequence from "uniprot_sprot.dat" to ``descriptions``.

    ``descriptions`` is a name defined outside this function.  After the
    append its whole content is printed.  Only the first record of the file
    is read; nothing is returned.
    """
    # The context manager closes the file once the first record is handled.
    with open("uniprot_sprot.dat") as handle:
        for record in SwissProt.parse(handle):
            descriptions.append(record.sequence)
            print(descriptions)
            # Stop after the first record.
            break
Example #19
0
    def _parse_flat_files( self ):
        """Flatten selected UniProt records into one tab-separated line each.

        Reads ``path + files[j]`` for j in 11..14 with ``SwissProt.parse`` and
        writes to ``path + files[15]``.  Only records whose first taxonomy id
        is 9606, 10090 or 10116 are used.

        Columns, in order: first accession, entry name, source db (``'tr'``
        when the file name contains ``trembl``, else ``'sp'``), taxonomy id,
        data class, gene name, RecName/SubName full name, AltName full name,
        AltName short name, flags, remaining accessions, GeneID ids, RefSeq
        ids (version suffix removed), HGNC ids, MGI ids, then database name /
        id / third field of every cross-reference whose database is in
        ``xdoms``, the first ``seqinfo`` item and the sequence.  Multi-valued
        columns are joined with ``|``.
        """
        print( 'uniprot flat files...' )
        with open( path + files[15], 'wt' ) as outf:

            for j in [11,12,13,14]:
                print( files[j] + '...' )
                srcdb = 'tr' if re.search(r'trembl', files[j]) else 'sp'
                with open(path + files[j], 'rt') as handle:
                    for record in SwissProt.parse(handle):
                        if record.taxonomy_id[0] not in ['9606', '10090', '10116']:
                            continue
                        accs  = record.accessions
                        # pop(0) takes the first accession off the list; the
                        # remaining ones are written as secondary accessions.
                        acc   = accs.pop(0)
                        rev   = record.data_class
                        gname = re.sub(r'.*Name=([^;{]+)[{;].*', r'\1', record.gene_name).strip()
                        uid   = record.entry_name
                        taxid = record.taxonomy_id[0]
                        seq   = record.sequence
                        sinfo = str(record.seqinfo[0])
                        desc  = record.description
                        rname = ''
                        fname = ''
                        sname = ''
                        flags = ''
                        # re.IGNORECASE must be passed as flags=...: the
                        # fourth positional argument of re.sub is ``count``.
                        if 'RecName' in desc:
                            rname = re.sub(r'.*RecName: *Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        elif 'SubName' in desc:
                            rname = re.sub(r'.*SubName: *Full=([^;{]+) *[;{].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        if 'AltName' in desc:
                            if re.search(r'AltName:[^:]*Full=', desc, re.IGNORECASE):
                                fname = re.sub(r'.*AltName:[^:]*Full=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                            if re.search(r'AltName:[^:]*Short=', desc, re.IGNORECASE):
                                sname = re.sub(r'.*AltName:[^:]*Short=([^;{]+)[{;].*', r'\1', desc, flags=re.IGNORECASE).strip()
                        if 'Flags:' in desc:
                            flags = re.sub(r'.*Flags: *([^;]+);.*', r'\1', desc, flags=re.IGNORECASE).strip()
                        refs  = list()
                        eids  = list()
                        mgis  = list()
                        hgnc  = list()
                        dids  = list()
                        dnms  = list()
                        ddbs  = list()
                        for xref in record.cross_references:
                            db_name = xref[0]
                            # Independent tests: a database listed in xdoms
                            # may also be one of the named ones.
                            if db_name == 'GeneID':
                                eids.append(xref[1])
                            if db_name == 'RefSeq':
                                refs.append(re.sub(r'\.\d+$', r'', xref[1]))
                            if db_name == 'MGI':
                                mgis.append(xref[1])
                            if db_name == 'HGNC':
                                hgnc.append(xref[1])
                            if db_name in xdoms:
                                dids.append(xref[1])
                                ddbs.append(db_name)
                                dnms.append(xref[2])
                        outf.write( '\t'.join([ acc, uid, srcdb, taxid, rev, gname, rname, fname, sname, flags, '|'.join(accs), '|'.join(eids), '|'.join(refs), '|'.join(hgnc), '|'.join(mgis), '|'.join(ddbs), '|'.join(dids), '|'.join(dnms), sinfo, seq ]) + '\n' )
Example #20
0
 def test_parses_interpro_correctly(self):
     """interpro_terms returns the four expected ids for the first record."""
     expected = ['IPR000308', 'IPR023409', 'IPR036815', 'IPR023410']
     # Only the first fixture record is needed.
     for first_record in SwissProt.parse(self.records):
         result = interpro_terms(first_record)
         break
     self.assertEqual(result, expected)
Example #21
0
def give_me_the_record(primary_id, swissprot_file):
    """
    Return the first record that lists the given id among its accessions
    :param primary_id: Accession to look for
    :param swissprot_file: Path of a SwissProt flat file
    :return: The first record whose ``accessions`` contain ``primary_id``,
             or None when no record matches
    """
    with open(swissprot_file, 'r') as fh:
        matching = (rec for rec in SwissProt.parse(fh)
                    if primary_id in rec.accessions)
        return next(matching, None)
Example #22
0
 def obtain_taxons(self, protein_dict, fh_sprot):
     """Fill ``protein_dict`` with the taxonomy ids of the proteins it lists.

     Args:
         protein_dict: dict keyed by accession; it is updated in place.
         fh_sprot: SwissProt flat-file handle passed to ``sp.parse``.

     Returns:
         The same ``protein_dict``.  For every record, the first of its
         accessions that is a key of the dict gets the record's
         ``taxonomy_id`` list as its value.
     """
     for rec in sp.parse(fh_sprot):
         for accession in rec.accessions:
             # Direct dict membership; .keys() builds a list on Python 2.
             if accession in protein_dict:
                 # assign rec.taxonomy_id list to the protein
                 protein_dict[accession] = rec.taxonomy_id
                 break
     return protein_dict
Example #23
0
 def __init__(self, sprot_cache='', trembl_cache='', organism='h**o sapien'):
     """Set up the record cache, optionally pre-loading a SwissProt file.

     Args:
         sprot_cache: Path of a SwissProt flat file to load; nothing is
             loaded when it is empty.
         trembl_cache: Not used by this method.
         organism: Organism name; stored stripped and lower-cased in
             ``self.organism``.

     ``self.records`` maps every accession of every loaded record to that
     record.  An ``IOError`` while loading is reported on stdout and leaves
     whatever was loaded so far in place.
     """
     self.records = {}
     self.organism = organism.strip().lower()
     if sprot_cache:
         # Load the swissprot records if file can be found
         try:
             with open(sprot_cache) as fp:
                 for record in SwissProt.parse(fp):
                     for accession in record.accessions:
                         self.records[accession] = record
         # "except X as e" is valid on Python 2.6+ and 3; "except X, e" is not.
         except IOError as e:
             print(e)
             print("SwissProt cache not loaded")
Example #24
0
 def test_parses_keywords_correctly(self):
     """keywords() returns the expected keyword list for the first record."""
     expected = [
         '3D-structure',
         'Acetylation',
         'Alternative initiation',
         'Complete proteome',
         'Cytoplasm',
         'Direct protein sequencing',
         'Host-virus interaction',
         'Isopeptide bond',
         'Nitration',
         'Phosphoprotein',
         'Polymorphism',
         'Reference proteome',
         'Ubl conjugation',
     ]
     # Only the first fixture record is needed.
     for first_record in SwissProt.parse(self.records):
         result = keywords(first_record)
         break
     self.assertEqual(result, expected)
Example #25
0
 def obtain_taxons(self, protein_dict, fh_sprot):
     """Fill ``protein_dict`` with the taxonomy ids of the proteins it lists.

     Args:
         protein_dict: dict keyed by accession; it is updated in place.
         fh_sprot: SwissProt flat-file handle passed to ``sp.parse``.

     Returns:
         The same ``protein_dict``.  For every record, the first of its
         accessions that is a key of the dict gets the record's
         ``taxonomy_id`` list as its value.
     """
     for rec in sp.parse(fh_sprot):
         for accession in rec.accessions:
             # Direct dict membership; .keys() builds a list on Python 2.
             if accession in protein_dict:
                 # assign rec.taxonomy_id list to the protein
                 protein_dict[accession] = rec.taxonomy_id
                 break
     return protein_dict
def species_filter(sp_handle, taxon_id):
    """Write the sequences of one taxon to ``sp_species.<taxon_id>.tfa`` in FASTA format.

    Args:
        sp_handle: SwissProt flat-file handle passed to ``sp.parse``.
        taxon_id: Taxonomy id as a string; a record is kept when this value
            is in its ``taxonomy_id``.

    Each kept record is written with an id of ``T`` followed by a running
    number that starts at ``int(taxon_id + "0000001")``, and with the
    record's entry name as description.
    """
    target_id = int(taxon_id+"0000001")
    # The context manager closes the output even if parsing or writing raises.
    with open("sp_species.%s.tfa" % taxon_id, "w") as outhandle:
        for inrec in sp.parse(sp_handle):
            if taxon_id in inrec.taxonomy_id:
                outseq = SeqRecord(Seq(inrec.sequence),
                                   id="T"+str(target_id),
                                   description="%s" % (inrec.entry_name))
                SeqIO.write([outseq], outhandle, "fasta")
                target_id += 1
Example #27
0
def make_sp_sql_table(sp_handle, out_handle, taxa_ids=()):
    """
    Make a table from swissprot that includes the following fields:
    SP_ID, GO, Ontology, evidence code

    sp_handle: SwissProt flat-file handle passed to ``sp.parse``.
    out_handle: writable handle; one tab-separated line is written per GO
        entry returned by ``parse_go`` for a record.
    taxa_ids: when non-empty, only records whose first taxonomy id is in
        this collection are written.  Records for which ``parse_go``
        returns nothing are always skipped.
    """
    # The default is an immutable empty tuple rather than a shared list.
    for sp_rec in sp.parse(sp_handle):
        go_list = parse_go(sp_rec)
        if not go_list:
            continue
        if taxa_ids and sp_rec.taxonomy_id[0] not in taxa_ids:
            continue
        for go_entry in go_list:
            out_handle.write("%s\t%s\t%s\t%s\n" %
                             (sp_rec.entry_name, go_entry['go_id'],
                              go_entry['ontology'], go_entry['evidence']))
def n_go_ec(sp_handle, taxon_id, allowed_ec=exp_ec):
    """Count, per GO namespace, the proteins of a taxon annotated with allowed codes.

    Args:
        sp_handle: SwissProt flat-file handle passed to ``sp.parse``.
        taxon_id: A record is counted only when this value is in its
            ``taxonomy_id``.
        allowed_ec: Evidence codes handed to ``go_ec_filter``; defaults to
            the experimental codes ``exp_ec``.

    Returns:
        dict with keys ``'BPO'``, ``'MFO'`` and ``'CCO'``; each value is the
        number of records for which ``go_ec_filter`` reported a truthy value
        under that key.
    """
    ncount = {'BPO': 0, 'MFO': 0, 'CCO': 0}
    for inrec in sp.parse(sp_handle):
        if taxon_id in inrec.taxonomy_id:
            # Forward the caller's codes instead of always using exp_ec.
            in_allowed = go_ec_filter(inrec, allowed_ec=allowed_ec)
            for onto in in_allowed:
                if in_allowed[onto]:
                    ncount[onto] += 1
    return ncount
Example #29
0
    def test_compute_features_maps_to_alts_to_stable_ontology_terms(self):
        """GO:0000975 is reported as GO:0044212 and is absent from the ULCA terms."""
        # Only the first fixture record is needed.
        for first_record in SwissProt.parse(self.records):
            protein = parse_record_into_protein(first_record)
            break

        protein.go_mf = 'GO:0000975'
        for attr in ('go_bp', 'go_cc', 'interpro', 'pfam', 'keywords'):
            setattr(protein, attr, None)

        features = compute_interaction_features(protein, protein)
        mapped_terms = ['GO:0044212', 'GO:0044212']
        self.assertEqual(features['go_mf'], mapped_terms)
        self.assertTrue('GO:0000975' not in features['ulca_go_mf'])
def __build_NEXP_accession_singleSpecies(fh_sprot, taxon_id, ontType, EXP_default=set([])):
    '''
    Collect the accession lists of proteins of one taxon that are annotated
    in the ontology ontType without any EXP evidence code.

    fh_sprot: UniProtKB/SwissProt file pointer passed to sp.parse.
    taxon_id: a record is considered only when this value is in its
        taxonomy_id.
    ontType: ontology letter compared with the first character of field 2
        of each GO cross-reference.
    EXP_default: evidence codes that count as EXP; the code of a
        cross-reference is the part of field 3 before the first ':'.

    Returns a list holding, for every qualifying record, its accessions
    list.
    '''
    print('      Building the accession list with the proteins ' +
          'that have only non-EXP evidence codes at time t1 ...')
    nexp_accessions = []
    for rec in sp.parse(fh_sprot):
        if taxon_id not in rec.taxonomy_id:
            continue
        # Whether any GO annotation in ontType was seen, and whether one of
        # them carries an EXP evidence code.
        annotated_in_ont = False
        has_exp_evidence = False
        for crossRef in rec.cross_references:
            if crossRef[0] != 'GO':
                continue
            evidence_code = crossRef[3].split(':')[0]
            ont_letter = crossRef[2][0]
            if ont_letter != ontType:
                continue
            annotated_in_ont = True
            if evidence_code in EXP_default:
                # One EXP code is enough to disqualify the record.
                has_exp_evidence = True
                break
        if annotated_in_ont and not has_exp_evidence:
            nexp_accessions.append(rec.accessions)
    return nexp_accessions
Example #31
0
 def obtain_goterms(self, goterm_dict, fh_sprot):
     """Add the GO annotations of the listed proteins to ``goterm_dict``.

     Args:
         goterm_dict: dict keyed by accession whose values support
             ``add`` (e.g. sets); it is updated in place.
         fh_sprot: SwissProt flat-file handle passed to ``sp.parse``.

     Returns:
         The same ``goterm_dict``.  For every record, the first of its
         accessions that is a key of the dict receives one tuple per GO
         cross-reference: (field 1, the part of field 3 before the first
         ':', the first character of field 2).
     """
     for rec in sp.parse(fh_sprot):
         for accession in rec.accessions:
             # Direct dict membership; .keys() builds a list on Python 2.
             if accession in goterm_dict:
                 for crossRef in rec.cross_references:
                     if crossRef[0] == 'GO':
                         goDef = (crossRef[1], crossRef[3].split(':')[0],
                                  crossRef[2][0])
                         goterm_dict[accession].add(goDef)
                 # Only the first matching accession of a record is filled.
                 break
     return goterm_dict
def UNIPROT_GENE_PLUS(UNIPROT):
    """Return the gene name, plus synonyms if any, of an entry fetched from uniprot.org.

    Unlike UNIPROT_GENE, this also returns synonym genes when present; the
    gene name comes first.

    Args:
        UNIPROT: UniProt identifier inserted into the download URL.

    Returns:
        list of str.  For each record the value of the first ``key=value``
        field of ``gene_name`` is added.  When ``gene_name`` splits into
        more than two ``;``-separated fields, the comma-separated values of
        the second field are added too, with all whitespace removed.
        Records without a ``key=value`` gene name are skipped.
    """
    import urllib2
    from contextlib import closing
    from Bio import SwissProt

    GENES = []
    # closing() releases the connection even if parsing raises.
    with closing(urllib2.urlopen("http://www.uniprot.org/uniprot/%s.txt" % UNIPROT)) as url:
        for record in SwissProt.parse(url):
            fields = record.gene_name.split(";")
            if "=" not in fields[0]:
                # No gene name on this record; split("=")[1] would fail.
                continue
            GENES.append(fields[0].split("=")[1])
            if len(fields) > 2:
                for syno in fields[1].split("=")[1].split(","):
                    GENES.append("".join(syno.split()))
    return GENES
Example #33
0
 def obtain_goterms(self, goterm_dict, fh_sprot):
     """Add the GO annotations of the listed proteins to ``goterm_dict``.

     Args:
         goterm_dict: dict keyed by accession whose values support
             ``add`` (e.g. sets); it is updated in place.
         fh_sprot: SwissProt flat-file handle passed to ``sp.parse``.

     Returns:
         The same ``goterm_dict``.  For every record, the first of its
         accessions that is a key of the dict receives one tuple per GO
         cross-reference: (field 1, the part of field 3 before the first
         ':', the first character of field 2).
     """
     for rec in sp.parse(fh_sprot):
         for accession in rec.accessions:
             # Direct dict membership; .keys() builds a list on Python 2.
             if accession in goterm_dict:
                 for crossRef in rec.cross_references:
                     if crossRef[0] == 'GO':
                         goDef = (crossRef[1], crossRef[3].split(':')[0],
                                  crossRef[2][0])
                         goterm_dict[accession].add(goDef)
                 # Only the first matching accession of a record is filled.
                 break
     return goterm_dict
def count_GOterms_with_EXP(fh_sprot, taxon_id, EXP_default=set([]), max_proteins=21):
    '''
    Extract, for each protein of a taxon, the distinct GO terms that are
    backed by any of the experimental evidence codes, split by ontology.

    fh_sprot: SwissProt file pointer passed to sp.parse.
    taxon_id: a record is used only when this value is in its taxonomy_id
        (such as 559292 for yeast).
    EXP_default: evidence codes that count as experimental; the code of a
        cross-reference is the part of field 3 before the first ':'.
    max_proteins: stop after this many matching records.  The default of 21
        keeps the cutoff this function has always applied; pass None to
        process the whole file.

    Returns a tuple (mfo_terms, bpo_terms, cco_terms) of OrderedDicts that
    map the first accession of each processed record to the set of GO ids
    (field 1 of the cross-reference) whose ontology letter (first character
    of field 2, case-insensitive) is F, P or C respectively.
    '''
    mfo_terms = OrderedDict()
    bpo_terms = OrderedDict()
    cco_terms = OrderedDict()
    count = 0
    for rec in sp.parse(fh_sprot):
        # SELECT records that are related to a specific taxon_id:
        if taxon_id not in rec.taxonomy_id:
            continue
        protName = rec.accessions[0]
        # One set of GO ids per ontology letter.
        terms = {'F': set(), 'P': set(), 'C': set()}
        for crossRef in rec.cross_references:
            # Consider only the cross-references that relate to the GO DB:
            if crossRef[0] != 'GO':
                continue
            if crossRef[3].split(':')[0] in EXP_default:
                ont_letter = crossRef[2][0].upper()
                if ont_letter in terms:
                    terms[ont_letter].add(crossRef[1])
        mfo_terms[protName] = terms['F']
        bpo_terms[protName] = terms['P']
        cco_terms[protName] = terms['C']
        count += 1
        if max_proteins is not None and count >= max_proteins:
            break
    return (mfo_terms, bpo_terms, cco_terms)
def count_genes_with_EXP_old(fh_sprot, taxon_id, EXP_default=set([])):
    """
    Tally, per ontology, the genes of one taxon that carry a GO
    annotation backed by one of the given evidence codes.

    Every record parsed from fh_sprot whose taxonomy_id contains
    taxon_id is inspected. A record adds one to an ontology's tally
    when at least one of its GO cross references in that ontology has
    an evidence code that is a member of EXP_default.

    Returns the tuple (BPO count, CCO count, MFO count).
    """
    # Position of each ontology in the result, keyed by the first
    # letter of the GO aspect field (P -> BPO, C -> CCO, F -> MFO):
    slot_of = {'P': 0, 'C': 1, 'F': 2}
    totals = [0, 0, 0]

    for record in sp.parse(fh_sprot):
        # Only records of the requested taxon (such as 559292 for
        # yeast) are of interest:
        if taxon_id not in record.taxonomy_id:
            continue

        # One "evidence seen" marker per ontology for this record:
        seen = [False, False, False]
        for xref in record.cross_references:
            if xref[0] == 'GO':
                evidence = xref[3].split(':')[0]
                aspect = xref[2][0]
                if evidence in EXP_default:
                    slot = slot_of.get(aspect.upper())
                    if slot is not None:
                        seen[slot] = True
            # Nothing more to learn once all three ontologies are hit:
            if all(seen):
                break

        # Credit the record to every ontology that was marked:
        for slot, found in enumerate(seen):
            if found:
                totals[slot] += 1

    return tuple(totals)
Example #36
0
def main():
    """Write the Swiss-Prot records that match a keyword to a FASTA file.

    The input path, keyword, taxa to skip and output path come from
    get_args().  A record is written when one of its keywords equals the
    requested keyword (compared lower-cased) and none of its taxonomy
    entries (compared lower-cased) appears in the skip list.  A summary
    with the number of skipped and taken records is printed at the end.
    """
    args = get_args()
    in_fh = args.FILE
    keywords = args.keyword
    skips = args.skip
    out_fh = args.output

    if not os.path.isfile(in_fh):
        die('"{}" is not a file'.format(in_fh))

    # Record keywords are lower-cased before the comparison, so the
    # requested keyword has to be lower-cased as well or a keyword typed
    # with capitals could never match.
    wanted = {keywords.lower() if isinstance(keywords, str) else keywords}

    # The skip list does not depend on the record: build it once.
    skip_taxa = {taxon.lower() for taxon in (skips or [])}

    total = 0
    taken = 0
    # Both files are closed when the block exits, even on error.
    with open(out_fh, "w") as outfile, open(in_fh) as fandle:
        print('Processing "{}"'.format(in_fh))
        # SeqIO (not SwissProt.parse) is required here because
        # SeqIO.write needs SeqRecord objects.  SeqRecords carry the
        # Swiss-Prot keywords and lineage in their annotations dict under
        # 'keywords' and 'taxonomy', not as attributes.
        for record in SeqIO.parse(fandle, 'swiss'):
            total += 1
            annotations = record.annotations
            record_keywords = {
                word.lower() for word in annotations.get('keywords', [])}
            record_taxa = {
                taxon.lower() for taxon in annotations.get('taxonomy', [])}

            if record_keywords & wanted and not record_taxa & skip_taxa:
                taken += 1
                SeqIO.write(record, outfile, 'fasta')

    print('Done, skipped {} and took {}. See output in "{}".'.format(
        (total - taken), taken, out_fh))
Example #37
0
def check_sprot_format(fh_sprot):
    """
    This method checks whether the file (with file handle fh_sprot)
    can be parsed as UniProtKB/Swissprot format.

    Only the first record is pulled from the parser.
    If that succeeds (or the parser ends without yielding any record
    and without raising),
        it returns True
    If the parser raises an error,
        it returns False.
    """
    iter_handle = sp.parse(fh_sprot) # sp.parse method returns a generator
    try:
        # Pulling a single record is enough to exercise the parser:
        for _ in iter_handle:
            break
    except Exception:
        # Deliberately broad: any parser failure means "wrong format".
        # Unlike a bare except, this still lets KeyboardInterrupt and
        # SystemExit propagate to the caller.
        return False
    return True
Example #38
0
def check_sprot_format(fh_sprot):
    """
    This method checks whether the file (with file handle fh_sprot)
    can be parsed as UniProtKB/Swissprot format.

    Only the first record is requested from the parser.
    If no error is raised while doing so (this includes a parser that
    yields no record at all),
        it returns True
    Otherwise,
        it returns False.
    """
    iter_handle = sp.parse(fh_sprot)  # sp.parse method returns a generator
    try:
        # One record is sufficient to find out whether parsing works:
        for _ in iter_handle:
            break
    except Exception:
        # Any parsing error is reported as "not Swissprot format".
        # A bare except would also trap KeyboardInterrupt and
        # SystemExit, which must reach the caller instead.
        return False
    return True
Example #39
0
 def test_parses_gomf_correctly(self):
     """go_terms(..., ont="mf") on the first parsed record returns the
     expected GO ids in the expected order."""
     wanted_ids = [
         'GO:0045296',
         'GO:0019899',
         'GO:0042826',
         'GO:0042802',
         'GO:0051219',
         'GO:0050815',
         'GO:0008022',
         'GO:0032403',
         'GO:0019904',
         'GO:0003714',
     ]
     # Only the first record of the fixture is examined.
     for first_record in SwissProt.parse(self.records):
         found_ids = go_terms(first_record, ont="mf")
         break
     self.assertEqual(found_ids, wanted_ids)
def count_genes_with_EXP(fh_sprot, taxon_id, EXP_default=set([])):
    """
    Count, for each ontology, the genes of one taxon that have a GO
    annotation supported by one of the given evidence codes.

    Each record parsed from fh_sprot whose taxonomy_id contains
    taxon_id is examined. The record is counted once for an ontology
    when at least one of its GO cross references in that ontology has
    an evidence code that is a member of EXP_default.

    Returns a dictionary with the keys 'MFO', 'BPO' and 'CCO' mapping
    to the respective gene counts.
    """
    # First letter of the GO aspect field -> ontology key:
    category_of = {'F': 'MFO', 'P': 'BPO', 'C': 'CCO'}
    gene_count = {'MFO': 0, 'BPO': 0, 'CCO': 0}

    for record in sp.parse(fh_sprot):
        # Skip records of other taxa (taxon_id is e.g. 559292 for
        # yeast):
        if taxon_id not in record.taxonomy_id:
            continue

        # Per-record markers: was matching evidence found in the
        # ontology?
        exp_flag = {'MFO': False, 'BPO': False, 'CCO': False}
        for xref in record.cross_references:
            # Only cross references into the GO database matter:
            if xref[0] == 'GO':
                evidence_code = xref[3].split(':')[0]
                aspect_letter = xref[2][0]
                if evidence_code in EXP_default:
                    category = category_of.get(aspect_letter.upper())
                    if category is not None:
                        exp_flag[category] = True
            # Stop scanning as soon as all three ontologies are marked:
            if all(exp_flag.values()):
                break

        # Credit the record to each ontology that was marked:
        for category, found in exp_flag.items():
            if found:
                gene_count[category] += 1
    return gene_count
Example #41
0
def read_sprot_dat(sprot_dat_file, seq_dict):
    """Fill seq_dict with annotation read from a uniprot_sprot.dat file.

    For every accession of every parsed record that is already a key of
    seq_dict, the value is replaced by a dictionary with the keys
    'organism', 'EC', 'gene_name', 'OLN', 'ORF', 'GO', 'KW', 'full_name',
    'tax_id' and 'lineage'.  Keys of seq_dict that never show up in the
    file keep their original value.

    A progress line is written to stderr after every 10000 matches.
    The same (mutated) seq_dict object is returned.
    """
    num_record = 0
    # The with-block guarantees that the file is closed once parsing is
    # over; passing open(...) straight to the parser left it open.
    with open(sprot_dat_file) as handle:
        # Use Bio.SwissProt to parse the uniprot_sprot.dat file
        for record in SwissProt.parse(handle):
            for seqID in record.accessions:
                if seqID not in seq_dict:
                    continue
                num_record += 1
                if num_record % 10000 == 0:
                    sys.stderr.write(
                        "{} records read so far\n".format(num_record))
                # GO ids with their first three characters (the "GO:"
                # prefix) cut off, e.g. 'GO:0031012' -> '0031012'
                go_terms = [ref[1][3:] for ref in record.cross_references
                            if ref[0] == 'GO']
                # GN line: gene names, ordered locus names and ORF names
                gene_name, OLN, ORF = parse_GN(record.gene_name)
                # DE line: descriptive information (RecName, AltName, EC)
                full_name, EC = parse_DE(record.description)
                # Map the accession to its annotation dictionary
                seq_dict[seqID] = {
                    'organism': record.organism,  # organism name
                    'EC': EC,
                    'gene_name': gene_name,
                    'OLN': OLN,
                    'ORF': ORF,
                    'GO': go_terms,
                    'KW': record.keywords,
                    'full_name': full_name,
                    'tax_id': record.taxonomy_id[0],  # e.g. '345201'
                    # taxonomic classification, e.g. ['Viruses', ...]
                    'lineage': record.organism_classification,
                }
    sys.stderr.write("\nnumber of sequences is {}\n".format(len(seq_dict)))
    return seq_dict
Example #42
0
File: app.py Project: Sharabesh/MDA
def process(files):
    """Collect the non-CHAIN, non-DOMAIN features of the first record.

    files is handed to sp.parse (a text file of a swissprot protein) and
    only the first parsed record is used.  Each kept feature is stored
    as [item[0], item[1], item[2]].  Collection stops right after a kept
    feature whose third field equals the sequence length.
    """
    first_record = next(sp.parse(files))
    seq_length = len(first_record.sequence)

    collected = []
    for feature in first_record.features:
        kind = feature[0]
        # this requires more refinement to allow for a range of domains
        if kind == "CHAIN" or kind == "DOMAIN":
            continue
        collected.append([kind, feature[1], feature[2]])
        if feature[2] == seq_length:
            # A kept feature ends at the last residue: we are done.
            break
    return collected
Example #43
0
 def test_parses_gocc_correctly(self):
     """go_terms(..., ont="cc") on the first parsed record returns the
     expected GO ids in the expected order."""
     wanted_ids = [
         'GO:0005737',
         'GO:0030659',
         'GO:0005829',
         'GO:0070062',
         'GO:0005925',
         'GO:0042470',
         'GO:0016020',
         'GO:0005739',
         'GO:0005634',
         'GO:0048471',
         'GO:0043234',
         'GO:0017053',
     ]
     # Only the first record of the fixture is examined.
     for first_record in SwissProt.parse(self.records):
         found_ids = go_terms(first_record, ont="cc")
         break
     self.assertEqual(found_ids, wanted_ids)
Example #44
0
    def setUp(self):
        """Build the fixture: proteins, labelled interactions and a fitted
        multi-label classifier.

        Steps, in order: open the test Swiss-Prot file, obtain a session and
        reset the database, save one protein per parsed record, create an
        interaction for every ordered pair of proteins, load the dataset
        from those interactions and fit the classifier on it.
        """
        # Text-mode handle on the test records; it is kept on self and is
        # not closed here.
        # NOTE(review): presumably closed in tearDown - confirm.
        self.records = open(
            os.path.normpath(
                "{}/test_data/test_sprot_records.dat".format(base_path)), 'rt')
        self.session, self.engine = create_session(db_path)
        # NOTE(review): presumably empties the database so that every test
        # starts from a clean state - confirm against delete_database.
        delete_database(self.session)

        # Persist one protein per Swiss-Prot record of the fixture file.
        self.proteins = []
        for record in SwissProt.parse(self.records):
            protein = parse_record_into_protein(record)
            protein.save(self.session, commit=True)
            self.proteins.append(protein)

        self.labels = ['Activation', 'Inhibition', 'Acetylation']
        self.interactions = []
        # product(...) of the list with itself yields every ordered pair,
        # including each protein paired with itself.
        for protein_a, protein_b in product(self.proteins, self.proteins):
            class_kwargs = compute_interaction_features(protein_a, protein_b)
            # The label string is "<label of a>,<label of b>", each label
            # picked by protein id.
            # NOTE(review): assumes the saved proteins have ids 1..3 so that
            # id - 1 is a valid index into self.labels - confirm.
            label = '{},{}'.format(self.labels[protein_a.id - 1],
                                   self.labels[protein_b.id - 1])
            try:
                interaction = create_interaction(protein_a,
                                                 protein_b,
                                                 labels=label,
                                                 session=self.session,
                                                 verbose=False,
                                                 save=True,
                                                 commit=True,
                                                 **class_kwargs)
                self.interactions.append(interaction)
            except ObjectAlreadyExists:
                # A pair that is reported as already existing is skipped.
                continue

        self.X, self.y, _ = load_dataset(self.interactions,
                                         self.labels,
                                         selection=DEFAULT_SELECTION)
        # Template pipeline: case-preserving count vectorizer followed by a
        # logistic regression with a fixed random state.
        base = Pipeline(
            steps=[('vectorizer',
                    CountVectorizer(lowercase=False, stop_words=[':', 'GO'])
                    ), ('estimator', LogisticRegression(random_state=0))])
        # One independent clone of the template per label.
        self.clf = MixedBinaryRelevanceClassifier(
            [clone(base) for _ in range(len(self.labels))])
        self.clf.fit(self.X, self.y)
Example #45
0
    def from_file(cls, path):
        """Load every Swiss-Prot record of the file at path into the table
        of cls.

        All records are first turned into cls instances in memory; then the
        table for cls is created through database.create_tables and the
        instances are inserted in batches of 250 inside one
        database.atomic() block.
        """
        with open(path) as handle:
            # One model instance per parsed record; the first accession and
            # the first taxonomy id are additionally stored as the primary
            # ones.
            db_records = [
                cls(
                    primary_accession=entry.accessions[0],
                    primary_tax_id=int(entry.taxonomy_id[0]),
                    gene_name=entry.gene_name,
                    organism=entry.organism,
                    description=entry.description,
                    sequence=entry.sequence,
                    comments=entry.comments,
                    cross_references=entry.cross_references,
                    accessions=entry.accessions,
                    annotation_update=entry.annotation_update,
                    created=entry.created,
                    data_class=entry.data_class,
                    entry_name=entry.entry_name,
                    features=entry.features,
                    host_organism=entry.host_organism,
                    host_taxonomy_id=entry.host_taxonomy_id,
                    keywords=entry.keywords,
                    molecule_type=entry.molecule_type,
                    organelle=entry.organelle,
                    organism_classification=entry.organism_classification,
                    protein_existence=entry.protein_existence,
                    # Reference objects are stored as their attribute dicts
                    references=[ref.__dict__ for ref in entry.references],
                    seqinfo=entry.seqinfo,
                    sequence_length=entry.sequence_length,
                    sequence_update=entry.sequence_update,
                    taxonomy_id=entry.taxonomy_id,
                )
                for entry in SwissProt.parse(handle)
            ]

        with database:
            database.create_tables([cls])

        with database.atomic():
            cls.bulk_create(db_records, batch_size=250)
def UNIPROT_CHAIN_LIMITS(UNIPROT_ID):
    """Given a uniprot, return the limits of the mature protein numbering.

    The flat-file entry of UNIPROT_ID is downloaded from uniprot.org and
    the positions of its CHAIN features are collected.  The result is
    [smallest position, largest position] over all CHAIN features.  When
    the record has no CHAIN feature, or any CHAIN position is not a plain
    number, the result is [1, record.sequence_length] instead.  If the
    page holds several records, the limits of the last one are returned.

    Raises:
        ValueError: if no record could be parsed from the downloaded page.
    """
    import urllib2
    from Bio import SwissProt

    PAGE=urllib2.urlopen("http://www.uniprot.org/uniprot/%s.txt"%UNIPROT_ID)

    LIMITS=None
    try:
        # The parser reads lazily, so the page must stay open while the
        # records are consumed; the finally clause closes it afterwards.
        for record in SwissProt.parse(PAGE):
            CHAIN_VALUES=[]
            for feature in record.features:
                if feature[0]=="CHAIN":
                    CHAIN_VALUES+=[str(feature[1]), str(feature[2])]

            if CHAIN_VALUES and all(X.isdigit() for X in CHAIN_VALUES):
                POSITIONS=[int(X) for X in CHAIN_VALUES]
                LIMITS=[min(POSITIONS), max(POSITIONS)]
            else:
                # No CHAIN feature or a non-numeric position:
                # fall back to the whole sequence.
                LIMITS=[1, record.sequence_length]
    finally:
        PAGE.close()

    if LIMITS is None:
        # Without this check the function failed with an unbound-variable
        # error that said nothing about the actual problem.
        raise ValueError("No Swiss-Prot record found for %s"%UNIPROT_ID)

    return LIMITS
Example #47
0
def SwissIterator(handle):
    """Yield one SeqRecord per entry of a Swiss-Prot/UniProt flat file.

    Each section running from an ID line to the closing // is turned
    into a single SeqRecord, complete with annotation and features.

    The flat file "swiss" format handled here is the one used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    The name "swiss" was chosen for consistency with BioPerl and
    EMBOSS. See also the SeqIO support for "uniprot-xml" format.

    You are expected to reach this parser through
    Bio.SeqIO.parse(..., format="swiss") rather than by calling it
    directly.
    """
    for entry in SwissProt.parse(handle):
        # Build the SeqRecord from the SwissProt record
        record = SeqRecord.SeqRecord(
            Seq.Seq(entry.sequence, Alphabet.generic_protein),
            id=entry.accessions[0],
            name=entry.entry_name,
            description=entry.description,
            features=[_make_seqfeature(*feat) for feat in entry.features],
        )
        record.description = entry.description

        # Database cross references become unique "database:accession"
        # strings; entries too short to hold both parts are ignored.
        for xref in entry.cross_references:
            if len(xref) >= 2:
                database, accession = xref[:2]
                dbxref = "%s:%s" % (database, accession)
                if dbxref not in record.dbxrefs:
                    record.dbxrefs.append(dbxref)

        notes = record.annotations
        notes['accessions'] = entry.accessions
        if entry.protein_existence:
            notes['protein_existence'] = entry.protein_existence

        # Dates and versions; a sequence update overrides the sequence
        # version taken from the creation line.
        if entry.created:
            notes['date'] = entry.created[0]
            notes['sequence_version'] = entry.created[1]
        if entry.sequence_update:
            notes['date_last_sequence_update'] = entry.sequence_update[0]
            notes['sequence_version'] = entry.sequence_update[1]
        if entry.annotation_update:
            notes['date_last_annotation_update'] = entry.annotation_update[0]
            notes['entry_version'] = entry.annotation_update[1]

        if entry.gene_name:
            notes['gene_name'] = entry.gene_name

        # Organism information
        notes['organism'] = entry.organism.rstrip(".")
        notes['taxonomy'] = entry.organism_classification
        notes['ncbi_taxid'] = entry.taxonomy_id
        if entry.host_organism:
            notes['organism_host'] = entry.host_organism
        if entry.host_taxonomy_id:
            notes['host_ncbi_taxid'] = entry.host_taxonomy_id

        if entry.comments:
            notes['comment'] = "\n".join(entry.comments)

        if entry.references:
            citations = []
            for source in entry.references:
                citation = SeqFeature.Reference()
                citation.comment = " ".join(
                    ["%s=%s;" % pair for pair in source.comments])
                for key, value in source.references:
                    if key == 'PubMed':
                        citation.pubmed_id = value
                    elif key == 'MEDLINE':
                        citation.medline_id = value
                    elif key not in ('DOI', 'AGRICOLA'):
                        # DOI and AGRICOLA are accepted but not stored;
                        # anything else is an error.
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                citation.authors = source.authors
                citation.title = source.title
                citation.journal = source.location
                citations.append(citation)
            notes['references'] = citations

        if entry.keywords:
            notes['keywords'] = entry.keywords
        yield record
Example #48
0
from Bio import SwissProt
# Print a short summary of every record found in the sample file.
with open('../../samples/spfile.txt') as sample_file:
    for entry in SwissProt.parse(sample_file):
        accession_text = ','.join(entry.accessions)
        keyword_text = ','.join(entry.keywords)
        print('Entry name: %s' % entry.entry_name)
        print('Accession(s): %s' % accession_text)
        print('Keywords: %s' % keyword_text)
        print('Sequence: %s' % entry.sequence)
Example #49
0
#Input="HumanReview.txt"					# Globle File Input
#Testing 
#Input = "GLP.txt"

"""
AATarget1 = "TargetSequencePosition1.txt"
AATarget2 = "TargetSequencePosition2.txt"
"""

# Zero Variables
TargetSeq = []
GlobalStat = 0

from Bio import SwissProt
# Load every record of the input file into memory at once.
# NOTE: Input has to name the Swiss-Prot file to read; it is not assigned
# anywhere in the visible part of this script.
# The with-block closes the file as soon as the list has been built.
with open(Input, "r") as handle:
    records = list(SwissProt.parse(handle))
# Single-argument call form: prints the same text under Python 2 and 3.
print("Done Big List Parsing")



"""
HumanOut="HumanAccessions.txt"				# Human Protein Accession File 
							# (This need to be fasta format for later Analysis) 
HumanMAPInput=HumanOut
HumanMAPOut="HumanMAPList.txt"
	
AlignInput="HumanMAPList.txt"				# This is a File Name (used in SeqIO.parse)
MAPUpdateOutPut="HumanMAPFiltered.txt"

CytoMitoInput=MAPUpdateOutPut
CytoOutput="CytoList.txt"
# SwissProt / Uniprot flat file parsing 
from Bio import SwissProt
# The with-block closes the flat file once all records have been read.
with open('/path/to/your/uniprot_sprot.dat') as handle:
  for record in SwissProt.parse(handle):
    pid=record.accessions[0]  # primary accession
    seq=record.sequence
    for feature in record.features:
      # Call form of print: same output under Python 2 and Python 3.
      print(feature)
from Bio import SwissProt
import pickle

# Build {primary accession: [chain start, chain end]} for every record of
# the Swiss-Prot dump and pickle the dictionary.
DICT={}
# The with-block closes the dump once all records have been read.
with open("DATABASES/uniprot_sprot.dat") as HANDLE:
    for record in SwissProt.parse(HANDLE):
        CHAIN_VALUES=[]
        AC=list(record.accessions)
        for feature in record.features:
            if feature[0]=="CHAIN":
                CHAIN_VALUES+=[str(feature[1]), str(feature[2])]
        print(CHAIN_VALUES)

        if CHAIN_VALUES and all(X.isdigit() for X in CHAIN_VALUES):
            CHAIN_START=min(int(X) for X in CHAIN_VALUES)
            CHAIN_END=max(int(X) for X in CHAIN_VALUES)
        else:
            # No CHAIN feature or a non-numeric position:
            # fall back to the whole sequence.
            CHAIN_START=1
            CHAIN_END=record.sequence_length
        print(AC[0])
        # Formatted as one string so the output is "start end" under
        # both Python 2 and Python 3.
        print("%s %s" % (CHAIN_START, CHAIN_END))
        DICT[AC[0]]=[int(CHAIN_START), int(CHAIN_END)]

# Pickle data is binary: the file must be opened with "wb". The with-block
# also flushes and closes it, which the bare open() never did.
with open("DATABASES/OBJECTS/DICT_UNIPROT_CHAIN_LIMITS.pi", "wb") as PICKLE_OUT:
    pickle.dump(DICT,PICKLE_OUT)
Example #52
0
def uniprot2metaphors(outdir, hash2protid, accessions, verbose=True):
    """Parser for dat formatted dump of uniprot database.
    It's quick, much faster than xml parsing!
    But from time to time may encounter wrongly formatted DAT file.

    The dump is read from sys.stdin.  Entries are kept when their first
    taxonomy id is a key of hash2protid and the md5 of their sequence is
    a key of hash2protid[taxid]; for every protid stored under that md5
    the accession (and, unless accessions is true, the cross references,
    gene name and description) is handed to save_entry.  A per-taxid
    table of mapped/total counts is printed at the end.
    """
    # create working dir
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    #parse dump in dat format
    files = {} # will store opened files
    taxid2stats = {} # { taxid: [matches,total] }
    try:
        for r in SwissProt.parse(sys.stdin):
            #skip entry if taxid not of interest
            taxid = int(r.taxonomy_id[0])
            if taxid not in hash2protid:
                continue

            #update stats
            if taxid not in taxid2stats:
                taxid2stats[taxid] = [0, 0]
            taxid2stats[taxid][1] += 1

            #check if md5 match
            # md5 needs bytes: encode explicitly so that this also works
            # when the sequence is a text (unicode) string.
            md5 = hashlib.md5(r.sequence.encode('ascii')).hexdigest()
            if md5 not in hash2protid[taxid]:
                #skip if no match
                continue

            #save uniprot accession
            for protid in hash2protid[taxid][md5]:
                files = save_entry(outdir, files, protid, "accessions",
                                   r.accessions[0])

                #provide only accessions if requested so
                if accessions:
                    continue

                #add xreference information for each external db
                for ex_db_data in r.cross_references:
                    extdb, extid = ex_db_data[:2]
                    files = save_entry(outdir, files, protid, extdb, extid)

                #save gene name
                if r.gene_name.startswith('Name='):
                    extid = r.gene_name[5:].split(';')[0]
                    files = save_entry(outdir, files, protid, "GeneName",
                                       extid)

                #save description
                if r.description:
                    files = save_entry(outdir, files, protid, "Description",
                                       r.description)

                #update stats
                # NOTE(review): this counter sits inside the protid loop
                # and after the "accessions" shortcut, so it counts
                # protids rather than entries and stays 0 when accessions
                # is true - confirm that this is intended.
                taxid2stats[taxid][0] += 1
    finally:
        #close opened files, also when parsing fails half way
        for f in files:
            files[f].close()

    #print stats
    # Single-argument call form of print: identical output under
    # Python 2 and Python 3.
    print("#taxid\tmapped\ttotal\t%")
    for taxid in sorted(taxid2stats.keys()):
        mapped,total = taxid2stats[taxid]
        print("%s\t%s\t%s\t%.2f" % (taxid, mapped, total, mapped*100.0/total))
    print("%s\tDone." % datetime.now())
Example #53
0
from Bio import SwissProt
# Show every attribute of the first record whose name does not start
# with a double underscore.
with open('../../samples/spfile.txt') as sample_file:
    first_entry = next(SwissProt.parse(sample_file))
    visible_names = [name for name in dir(first_entry)
                     if not name.startswith('__')]
    for name in visible_names:
        print(name, getattr(first_entry, name))
Example #54
0
def appendSprot2goa(fh_sprot, goa_file_name, taxon_id, fh_merged_go):
    """
     This method reads each record from the UniProtKB/SwissProt file
     and checks whether it is for taxon_id. If it is, this method
     then checks whether each of its GO annotations exists in the
     UniProt-GOA file passed as the file name goa_file_name. For every
     annotation that is new, this method invokes swissProt2GOA to
     construct a UniProt-GOA record, which it appends to the merged
     UniProt-GOA file passed as file handle fh_merged_go.

     Returns the number of records appended.
    """
    # Creates an iterator object for the GOA file:
    iter_handle, GAFFIELDS = create_iterator(goa_file_name)

    # Construct a dictionary goa_dict that maps each protein of the GOA
    # file to the list of its [GO id, evidence, aspect] annotations:
    goa_dict = {}
    for ingen in iter_handle:
        goa_dict.setdefault(ingen['DB_Object_ID'], []).append(
            [ingen['GO_ID'], ingen['Evidence'], ingen['Aspect']])

    # EXTRACTS the GO annotations of the sprot file that are NOT found
    # in the GOA file:
    goCount = 0
    for rec in sp.parse(fh_sprot):
        # SELECTS records that are related to a specific taxon_id
        # such as 559292 for yeast:
        if taxon_id not in rec.taxonomy_id:
            continue

        # knownProt is the first accession of the record that is already
        # in the GOA file, or "" when there is none. It is reset for
        # every record BEFORE the accession loop: set only inside the
        # loop, a record without accessions would silently reuse the
        # value left over from the previous record.
        knownProt = ""
        for accession in rec.accessions:
            # Direct dictionary membership test instead of a scan
            # through goa_dict.keys():
            if accession in goa_dict:
                knownProt = accession
                break

        # Going over the list of GO information:
        for crossRef in rec.cross_references:
            # Consider the cross_reference entries that relate to GO DB:
            if crossRef[0] != 'GO':
                continue
            # goList is a list of GO ID, Evidence, and Aspect, in the
            # same layout as the entries stored in goa_dict:
            goList = [crossRef[1], (crossRef[3].split(':'))[0],
                      crossRef[2][0]]
            # A new GO annotation is found in two situations:
            # 1. if knownProt is empty (protein unknown to the GOA file)
            # 2. if knownProt is not empty but the GO annotation
            #    is not found in the GOA file
            if (not knownProt) or goList not in goa_dict[knownProt]:
                # Convert the sprot record to a GOA record:
                goaRec = swissProt2GOA(rec, crossRef, GAFFIELDS)
                # Write the converted GOA record to the output file:
                GOAParser.writerec(goaRec, fh_merged_go, GAFFIELDS)
                goCount += 1
    return goCount
Example #55
0
def parse_uniprot_data(data):
    """Index the Swiss-Prot records parsed from data by their entry name.

    When several records share an entry name, the one parsed last is
    the one kept.
    """
    return {entry.entry_name: entry for entry in SwissProt.parse(data)}
Example #56
0
def SwissIterator(handle):
    """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     * Swiss-Prot aka SwissProt
     * TrEMBL
     * UniProtKB aka UniProt Knowledgebase

    It does NOT read their new XML file format.
    http://www.expasy.org/sprot/

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format.

    Raises ValueError when a reference carries a cross-reference key
    other than PubMed, MEDLINE, DOI or AGRICOLA.
    """
    swiss_records = SwissProt.parse(handle)
    for swiss_record in swiss_records:
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(seq,
                                     id=swiss_record.accessions[0],
                                     name=swiss_record.entry_name,
                                     description=swiss_record.description,
                                     )
        record.description = swiss_record.description
        # Store each cross reference once as "database:accession";
        # entries too short to hold both parts are skipped.
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if dbxref not in record.dbxrefs:
                record.dbxrefs.append(dbxref)
        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        # The creation and sequence-update dates are guarded exactly like
        # the annotation-update date below, so that a record lacking one
        # of them is still converted instead of failing on the [0] lookup.
        if swiss_record.created:
            annotations['date'] = swiss_record.created[0]
        if swiss_record.sequence_update:
            annotations['date_last_sequence_update'] = \
                swiss_record.sequence_update[0]
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = \
                swiss_record.annotation_update[0]
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = [
                word.rstrip(".") for word in swiss_record.host_organism]
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)
        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                feature.comment = " ".join(
                    ["%s=%s;" % (key, value)
                     for key, value in reference.comments])
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key in ('DOI', 'AGRICOLA'):
                        # Accepted, but not stored on the reference.
                        pass
                    else:
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)
        if swiss_record.keywords:
            record.annotations['keywords'] = swiss_record.keywords
        yield record
table_def_2 = ', '.join(table_def_items)  # table definition (column list)
# Create the target table if it does not exist yet.
cur.execute("CREATE TABLE IF NOT EXISTS " + tabla_sequence + " (" + table_def_2 + ") ENGINE=InnoDB;")
con.commit()

# Loop variables
i = 0
j = 0
ptm = ''
out = []
listap = []
listaq = []
listar = []
olista = []
interes = []
with open(uniprot_file) as uniprot:  # opens the file and closes it at the end
    for record in SwissProt.parse(uniprot):  # parsing the uniprot records
        i += 1
        # Every 1000 records: print the running count and commit.
        if i % 1000 == 0:
            print(i)
            con.commit()
        data = empty_data.copy()  # instead of emptying the dictionary, start from a copy of the default so the default itself is not modified
        # Here the general data for the PTMs of one protein/uniprot entry is loaded (entry instances);
        # it has to be loaded in the order of the columns in the ptmdb and of the insert
        # print(record.accessions[0])
        data['AC'] = record.accessions[0]  # only the primary one; not sure about the rest.
        data['SQ'] = record.sequence
        data['LENGTH'] = record.sequence_length  # todo is there a problem here? the ones with more than 999 residues do not fit
        data['ORG'] = record.organism  # the organism
        data['OC'] = record.organism_classification[0]  # the domain of the organism
        data['OX'] = record.taxonomy_id[0]  # taxonomic id of the organism
Example #58
0
#  save the CREATE to output

table_def_items = []  # list for the key/value concatenations
for cat, value in categories.items():  # key/value concatenations
    table_def_items.append(cat + ' ' + value)  # stored in the list
table_def_2 = ', '.join(table_def_items)  # table definition (column list)
#cur.execute("CREATE TABLE IF NOT EXISTS sprot2 (" + table_def_2 + ") ENGINE=InnoDB")
#con.commit()

ptm = ''
out = []
listap = []
listaq = []
listar = []
with open(sprot_file) as sprot:  # opens the file and closes it at the end
    for record in SwissProt.parse(sprot):
        i += 1
        # NOTE(review): this binds the very same object as empty_data, not
        # a copy; any later write to data would also change empty_data.
        data = empty_data
        contenido_aa = count_amino_acids_ext(record.sequence)
        listaq = []
        # NOTE(review): itervalues() exists on Python 2 dictionaries only.
        for q in contenido_aa.itervalues():
            listaq.append(str(q))  # and put them in a list
        sql_insert_values_q = ', '.join(listaq)
        # One row per record: accession, first lineage entry, sequence
        # length, then the values collected above.
        # NOTE(review): the statement is assembled by string concatenation,
        # so a quote character inside either text field would break it;
        # consider the driver's parameter binding - confirm which DB driver
        # cur belongs to before changing this.
        cur.execute("INSERT INTO count_aa VALUES ('"
              + record.accessions[0] + "', '"
              + record.organism_classification[0] + "', "
              + str(record.sequence_length)
              + ", " + sql_insert_values_q + ")")
        # Commit after every single insert.
        con.commit()

Example #59
0
    #test cases have only one record.

    # With the SequenceParser
    test_handle = open(datafile)
    records = list(SeqIO.parse(test_handle, "swiss"))
    test_handle.close()

    assert len(records) == 1
    assert isinstance(records[0], SeqRecord)

    #Check matches what we got earlier without the iterator:
    assert records[0].seq.tostring() == seq_record.seq.tostring()
    assert records[0].description == seq_record.description
    assert records[0].name == seq_record.name
    assert records[0].id == seq_record.id
    
    # With the RecordParser
    test_handle = open(datafile)
    records = list(SwissProt.parse(test_handle))
    test_handle.close()

    assert len(records) == 1
    assert isinstance(records[0], SProt.Record)
    
    #Check matches what we got earlier without the iterator:
    assert records[0].sequence == record.sequence
    assert records[0].description == record.description
    assert records[0].entry_name == record.entry_name
    assert records[0].accessions == record.accessions