Ejemplo n.º 1
0
def download_aa_dist_per_gene(UPID_list_fname, cutoff):
    """Fetch Swiss-Prot entries and count the amino acids of each one.

    The UniProt ID of every row of ``UPID_list_fname`` is taken from the
    fixed-width slice ``row[48:54]``.

    Args:
        UPID_list_fname: path of the text file listing the entries.
        cutoff: if greater than 0, only the first ``cutoff`` IDs are
            fetched; otherwise every ID is fetched.

    Returns:
        A dict mapping each UniProt ID to a numpy array holding the count
        of every letter of ``AA_LETTERS``, in ``AA_LETTERS`` order.
    """
    # Read the IDs inside a context manager so the list file is always
    # closed. Rows too short to hold an ID (e.g. blank lines) are skipped
    # instead of producing an empty query.
    with open(UPID_list_fname, 'r') as id_file:
        UPID_list = [row[48:54].strip() for row in id_file]
    UPID_list = [UPID for UPID in UPID_list if UPID]

    if cutoff > 0:
        UPID_list = UPID_list[:cutoff]

    # a dictionary containing the aa_dist for each uniprot ID
    UPID_to_aa_dist = {}

    for i, UPID in enumerate(UPID_list):
        # A single pre-formatted argument prints the same text under
        # Python 2 and Python 3.
        print("%d \t %s" % (i, UPID))

        # Fetch the entry and release the connection even if parsing fails.
        handle = ExPASy.get_sprot_raw(UPID)
        try:
            seq_record = SeqIO.read(handle, "swiss")
        finally:
            handle.close()

        # Count only the letters listed in AA_LETTERS: a Swiss-Prot sequence
        # may contain other letters such as 'X', which are ignored.
        aa_dist = dict.fromkeys(AA_LETTERS, 0)
        for aa in seq_record:
            if aa in aa_dist:
                aa_dist[aa] += 1

        UPID_to_aa_dist[UPID] = np.array([aa_dist[aa] for aa in AA_LETTERS])
    return UPID_to_aa_dist
Ejemplo n.º 2
0
def get_SwissProt(dict,accession):
    """Fetch the Swiss-Prot record of ``accession`` and store it in ``dict``.

    On success the parsed record is stored under ``dict[accession]``.
    If the download raises ``urllib2.HTTPError`` a message is printed and
    ``dict`` is left unchanged. Nothing is returned.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    except urllib2.HTTPError:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(accession + ": protein not found on UniProt . ")
Ejemplo n.º 3
0
def main():
    """Write the GO biological-process terms of one UniProt entry.

    The UniProt ID is read from ``problem_datasets/rosalind_dbpr.txt``; the
    terms are written to ``output/rosalind_dbpr_out.txt`` (one per line) and
    a summary is printed.
    """
    # Read the UniProt ID from a txt file.
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    # Retrieve the data from UniProt and release the connection afterwards.
    raw_data = ExPASy.get_sprot_raw(uni_id)
    try:
        record = SwissProt.read(raw_data)
    finally:
        raw_data.close()

    # Keep only Gene Ontology cross-references whose term carries the 'P:'
    # tag. Checking the database name first avoids picking up an unrelated
    # cross-reference whose third field happens to start with 'P:'.
    go = [
        ref[2][2:] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]

    # Output answer.
    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # Optional: Print answer and gene ID/name
    name = record.gene_name.split(' ')[0][5:]
    print('Gene:\n',
          name,
          ' (UniProt ID = ',
          uni_id,
          ')\n\nBiological Processes:\n',
          '\n'.join(go),
          sep='')
Ejemplo n.º 4
0
def download_sequences(accessions):
    """Return a dict mapping every accession to its Swiss-Prot sequence.

    Each accession is fetched with ``ExPASy.get_sprot_raw`` and parsed with
    ``SwissProt.read``; the ``sequence`` attribute of the record is kept.
    """
    return {
        acc: SwissProt.read(ExPASy.get_sprot_raw(acc)).sequence
        for acc in accessions
    }
def download_from_swissprot(id_file,
                            output_file,
                            rettype="swiss",
                            save_format="swiss"):
    """Download the Swiss-Prot entries listed in a file and save them.

    :param id_file: path of a text file with one query ID per line
    :type id_file: basestring
    :param output_file: path of the file the records are written to
    :type output_file: basestring
    :param rettype: SeqIO format name used to parse each downloaded entry
    :param save_format: SeqIO format name used to write each record
    """
    processed = 0
    # Both files are closed on exit, even when a parse error escapes.
    with open(output_file, 'w') as output_file_handle, \
            open(id_file, 'r') as f:
        for line in f:
            query_id = line.strip()
            if not query_id:
                continue  # blank line: nothing to query

            try:
                handle = ExPASy.get_sprot_raw(query_id)
            except urllib2.HTTPError:
                log.warning('{0} query failed'.format(query_id))
                # There is no handle for this ID; skip it instead of
                # falling through to an unbound or already-closed handle.
                continue

            try:
                seq_record = SeqIO.read(handle, rettype)
            finally:
                handle.close()
            SeqIO.write(seq_record, output_file_handle, save_format)
            processed += 1
            log.info('#{1} Processed {0}'.format(seq_record.id, processed))

    log.info('Total {0} queries done!'.format(processed))
def download_aa_dist_per_gene(UPID_list_fname, cutoff):
    """Fetch Swiss-Prot entries and count the amino acids of each one.

    The UniProt ID of every row of ``UPID_list_fname`` is taken from the
    fixed-width slice ``row[48:54]``.

    Args:
        UPID_list_fname: path of the text file listing the entries.
        cutoff: if greater than 0, only the first ``cutoff`` IDs are
            fetched; otherwise every ID is fetched.

    Returns:
        A dict mapping each UniProt ID to a numpy array holding the count
        of every letter of ``AA_LETTERS``, in ``AA_LETTERS`` order.
    """
    # Read the IDs inside a context manager so the list file is always
    # closed. Rows too short to hold an ID (e.g. blank lines) are skipped
    # instead of producing an empty query.
    with open(UPID_list_fname, 'r') as id_file:
        UPID_list = [row[48:54].strip() for row in id_file]
    UPID_list = [UPID for UPID in UPID_list if UPID]

    if cutoff > 0:
        UPID_list = UPID_list[:cutoff]

    # a dictionary containing the aa_dist for each uniprot ID
    UPID_to_aa_dist = {}

    for i, UPID in enumerate(UPID_list):
        # A single pre-formatted argument prints the same text under
        # Python 2 and Python 3.
        print("%d \t %s" % (i, UPID))

        # Fetch the entry and release the connection even if parsing fails.
        handle = ExPASy.get_sprot_raw(UPID)
        try:
            seq_record = SeqIO.read(handle, "swiss")
        finally:
            handle.close()

        # Count only the letters listed in AA_LETTERS: a Swiss-Prot sequence
        # may contain other letters such as 'X', which are ignored.
        aa_dist = dict.fromkeys(AA_LETTERS, 0)
        for aa in seq_record:
            if aa in aa_dist:
                aa_dist[aa] += 1

        UPID_to_aa_dist[UPID] = np.array([aa_dist[aa] for aa in AA_LETTERS])
    return UPID_to_aa_dist
Ejemplo n.º 7
0
def access_sequence(accession):
    """Return the sequence of the Swiss-Prot entry ``accession``.

    If the downloaded data cannot be parsed into a record (``ValueError``),
    a warning is printed and ``None`` is returned.
    """
    handle = ExPASy.get_sprot_raw(accession)
    try:
        record = SwissProt.read(handle)
    except ValueError:
        # ``ValueException`` is not a Python exception; ValueError is the
        # built-in raised for unparsable input. Return early because there
        # is no record to take a sequence from.
        print("WARNING: Accession %s not found" % accession)
        return None
    return record.sequence
Ejemplo n.º 8
0
def main(input_string):
    """Print the GO biological-process terms of a Swiss-Prot entry.

    ``input_string`` is the identifier passed to ``ExPASy.get_sprot_raw``.
    """
    entry = SwissProt.read(ExPASy.get_sprot_raw(input_string))
    for xref in entry.cross_references:
        # Skip anything that is not a Gene Ontology reference referring to
        # a biological process (term tagged with 'P:').
        if xref[0] != 'GO' or not xref[2].startswith('P:'):
            continue
        print(xref[2][2:])
Ejemplo n.º 9
0
	def find_COG2(self):
		"""Find records from uniprotIDs without use of keggIDs.

		Fetches the Swiss-Prot record of ``self.uprotID``, extracts its gene
		name and returns the body of the ``rest.genome.jp/oc`` response for
		that name.
		"""
		handle = ExPASy.get_sprot_raw(self.uprotID)
		record = SwissProt.read(handle)
		gene_name = record.gene_name
		# str.strip() takes a *set of characters*: strip("Name=;") also
		# removed leading/trailing N, a, m, e letters of the gene name
		# itself (e.g. "Name=abc;" became "bc"). Remove the literal prefix
		# and the trailing ';' instead.
		if gene_name.startswith("Name="):
			gene_name = gene_name[len("Name="):]
		query = gene_name.rstrip(";")
		url_open = urllib.urlopen("http://rest.genome.jp/oc/?"+query)
		return url_open.read()
Ejemplo n.º 10
0
def main(filename):
    """Print the GO biological-process terms of the entry named in a file.

    The whole content of ``filename`` (stripped) is used as the identifier
    passed to ``ExPASy.get_sprot_raw``.
    """
    with open(filename) as fin:
        my_seq = fin.read().strip()
    handle = ExPASy.get_sprot_raw(my_seq)
    record = SwissProt.read(handle)
    terms = [
        f[2].split(':')[1] for f in record.cross_references
        if f[0] == 'GO' and f[2][0] == 'P'
    ]
    for s in terms:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(s)
Ejemplo n.º 11
0
def parseBlast():
    """Collect keywords and hit accessions for every BLAST query.

    Parses ``./output/blastOut.xml``. For each query only the first
    alignment is examined; each of its HSPs with an e-value below the
    threshold contributes the hit's title words and the keywords of the
    hit's Swiss-Prot record.

    Returns:
        A tuple ``(blastHits, accessions)``: ``blastHits`` maps a query ID
        to a ``'; '``-joined keyword string, ``accessions`` maps a query ID
        to the list of hit accessions.
    """
    E_VALUE_THRESH = 1
    blastHits = {}
    accessions = {}
    # The records are consumed inside the loop, so the file stays open for
    # the whole loop and is closed when the block is left.
    with open("./output/blastOut.xml") as result_handle:
        # Loop through each protein query results
        for blast_record in NCBIXML.parse(result_handle):
            keyword_list = []  # stores running keyword list
            # parse for the query protein ID
            queryID = blast_record.query.split()[0].split(':')[1]
            # Loop through the hits associated with particular sequence
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    # Hit must have e-value < threshold to be considered
                    if hsp.expect < E_VALUE_THRESH:
                        title = alignment.title  # title of hit
                        # specific keywords in title
                        raw_protein_title = title.split('OS')[0]
                        protein_title = " ".join(
                            raw_protein_title.split()[2:])
                        keyword_list.append(protein_title)
                        # parse for the accession number
                        accession = title.split()[1].split('|')[1]
                        accessions.setdefault(queryID, []).append(accession)
                        handle = ExPASy.get_sprot_raw(accession)
                        record = SwissProt.read(handle)
                        keyword_list += record.keywords
                        blastHits[queryID] = '; '.join(keyword_list)
                break  # only take top hit for now
    return (blastHits, accessions)
Ejemplo n.º 12
0
def sequence_file(*args):
    '''Save the sequence of the entered protein code as a FASTA file.

    The record is retrieved from Swiss-Prot, a FASTA header is built from
    its entry name, description and organism, and the user is asked where
    to save the file. If the record cannot be retrieved an error window is
    shown and nothing is saved.'''

    a = code.get()
    try:
        from Bio import ExPASy
        from Bio import SwissProt
        with ExPASy.get_sprot_raw(a) as handle:
            record = SwissProt.read(handle)
    except Exception:
        if a == "":
            open_window("No Code", "Please Insert an Uniprot Code", "#FFC3C3",
                        '200x30')
        else:
            open_window("No Valid Code", "Please Insert a valid Uniprot Code",
                        "#FFC3C3", '200x30')
        # There is no record to save; stop here instead of falling through
        # to code that reads the unbound ``record``.
        return

    descrip = record.description.split(";")[0]
    # Keep the text after "Full=" when present; otherwise keep the field
    # unchanged (find() + 5 would silently drop its first characters).
    _, marker, full_name = descrip.partition("Full=")
    if marker:
        descrip = full_name
    # Use the code the record was fetched with, not a second read of the
    # entry widget.
    fasta_header = (">sp|" + a + "|" + record.entry_name + " " + descrip +
                    " OS=" + record.organism)

    filename = filedialog.asksaveasfilename(defaultextension='.fasta',
                                            filetypes=[("fasta", "*.fasta")])
    if not filename:
        return  # dialog cancelled: no file chosen
    with open(filename, "w") as TextFile:
        TextFile.write(fasta_header + '\n')
        TextFile.write(record.sequence)
Ejemplo n.º 13
0
def gen_uniprot_features_for_pdb(infile):
    """Print UniProt features for every PDB domain listed in ``infile``.

    Each line of ``infile`` holds three tab-separated fields: a PDB domain,
    a count (unused here) and a '|'-separated list of UniProt IDs. For each
    UniProt record that cross-references the PDB domain, one tab-separated
    line is printed: domain, UniProt ID, GO ids, InterPro id,
    EvolutionaryTrace id, organism and subcellular-location comment.
    """
    with open(infile, 'r') as rows:
        for line in rows:
            (pdb_dom, count, uniprot_ids) = line.replace('\n', '').split('\t')
            for uniprot_id in uniprot_ids.split('|'):
                record = SwissProt.read(ExPASy.get_sprot_raw(uniprot_id))
                keep = False
                go = []
                interpro = ''
                evo_trace = ''
                for xref in record.cross_references:
                    if xref[0] == 'GO':
                        go.append(xref[1])
                    if xref[0] == 'InterPro':
                        interpro = xref[1]
                    if xref[0] == 'EvolutionaryTrace':
                        evo_trace = xref[1]
                    if xref[0] == 'PDB' and xref[1].lower() == pdb_dom.lower():
                        keep = True
                # Only report records that reference this PDB domain.
                if not keep:
                    continue
                organism = record.organism
                loc = ''
                for comment in record.comments:
                    if comment.startswith('SUBCELLULAR LOCATION'):
                        loc = comment
                # Single pre-formatted argument: same output on Python 2/3.
                print('%s\t%s\t%s\t%s\t%s\t%s\t%s' % (
                    pdb_dom, uniprot_id, '|'.join(go), interpro, evo_trace,
                    organism, loc))
Ejemplo n.º 14
0
def get_protein_EC(gene, retry=0):
    """Query Uniprot for a gene entry and extract its EC number, if any.

    A failed query is retried after a 5 second pause, up to try number 10,
    because a gene that exists in Uniprot may be missed due to connection
    problems.

    > Input
    gene : str => the gene code to be queried
    retry : int => number of tries already made. Max 10.

    > Output
    - the "EC=..." match for GENE, if its description contains one.
    - None, if GENE was queried successfully but has no EC.
    - the exception object, if an exception occurred on the last try
      (or a KeyboardInterrupt was received while querying).
    """
    ec_pattern = re.compile(r"EC=\d+\.\d+\.\d+\.\d+")
    attempt = retry
    while True:
        try:
            with ExPASy.get_sprot_raw(gene) as handle:
                entry = SeqIO.read(handle, "swiss")
                found = ec_pattern.search(entry.description)
                if found is not None:
                    return found.group(0)
        except Exception as err:
            if attempt >= 10:
                return err
            time.sleep(5)  # cool down time 5s
            print("\nGENE NOT FOUND. RETRYING (%d)" % attempt)
            attempt += 1
            continue
        except KeyboardInterrupt as interrupt:
            print("\nKeyBoard Interrupt Signal received. Aborting")
            return interrupt
        # Queried successfully, but the description holds no EC number.
        return None
Ejemplo n.º 15
0
def get_records(ids):
    """Return the Swiss-Prot sequences of ``ids``, in the same order."""
    return [
        SwissProt.read(ExPASy.get_sprot_raw(entry_id)).sequence
        for entry_id in ids
    ]
Ejemplo n.º 16
0
def fetch_swp_expasy(uniprot_acc):
    """
    Fetch selected attributes of a SwissProt record.

    http://biopython.org/DIST/docs/api/Bio.SwissProt.Record-class.html

    Parameters
    ----------
    uniprot_acc : str
        SwissProt accession or identifier.

    Returns
    -------
    tuple
        ``(attrib_names, swp_info_list)``: two parallel lists holding the
        attribute labels and the corresponding values of the record.
    """

    # Parse the record for the requested SwissProt identifier.
    entry = SwissProt.read(ExPASy.get_sprot_raw(uniprot_acc))

    # Label/value pairs, kept side by side so they cannot drift apart.
    # The creation date is reported twice: raw and passed through dating().
    created = entry.created[0]
    fields = [
        ('accessions', entry.accessions),
        ('data created', created),
        ('date created (ISO)', dating(entry.created[0])),
        ('organism', entry.organism),
        ('gene names', entry.gene_name),
        ('description', entry.description),
        ('comments', entry.comments),
        ('keywords', entry.keywords),
    ]
    attrib_names = [label for label, _ in fields]
    swp_info_list = [value for _, value in fields]
    return (attrib_names, swp_info_list)
Ejemplo n.º 17
0
 def Uniprot_records(self):
     """Download the UniProt XML file of this object's UniProt ID.

     The URL of the raw text entry is rewritten from 'txt' to 'xml' and
     the response is saved as ``Uniprot<ID>.xml`` in the working directory.
     """
     # Open the text entry only to learn the URL it was served from.
     txt_handle = ExPASy.get_sprot_raw(self.__uniprot_id)
     xml_url = txt_handle.url.replace('txt','xml')
     xml_response = requests.get(xml_url)
     out_name = 'Uniprot' + self.__uniprot_id + '.xml'
     # 'wb': the response content is written in binary mode.
     with open(out_name,'wb') as xml_file:
         xml_file.write(xml_response.content)
Ejemplo n.º 18
0
def get_SwissProt(dict, accession):
    """Fetch the Swiss-Prot record of ``accession`` and store it in ``dict``.

    On success the parsed record is stored under ``dict[accession]``.
    If the download raises ``urllib2.HTTPError`` a message is printed and
    ``dict`` is left unchanged. Nothing is returned.
    """
    try:
        handle = ExPASy.get_sprot_raw(accession)
        record = SwissProt.read(handle)
        dict[accession] = record
    except urllib2.HTTPError:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(accession + ": protein not found on UniProt . ")
Ejemplo n.º 19
0
def fetch_genbank(sid):
    """Save the Swiss-Prot entry ``sid`` as ``<sid>.genbank``.

    Prints the sequence length on success; on any exception prints
    ``<sid> not found`` instead. Nothing is returned.
    """
    try:
        entry = SeqIO.read(ExPASy.get_sprot_raw(sid), 'swiss')
        SeqIO.write(entry, sid + '.genbank', 'genbank')
        print(sid, 'sequence length', len(entry))
    except Exception:
        print(sid, 'not found')
Ejemplo n.º 20
0
def get_pro_from_SwissProt(id):
    '''Return the protein sequence of ``id`` from Swiss-Prot as a string.

    Returns None when ``ExPASy.get_sprot_raw`` yields a falsy handle.
    Modules used: Bio.ExPASy, SeqIO.read'''
    handle = ExPASy.get_sprot_raw(id)
    if not handle:
        return None
    return str(SeqIO.read(handle, 'swiss').seq)
Ejemplo n.º 21
0
def main(argv):
    """Print the GO biological-process terms of the entry read from stdin.

    ``argv`` is accepted but not used.
    """
    # input() reads the identifier from stdin
    accession = input().strip()
    record = SwissProt.read(ExPASy.get_sprot_raw(accession))

    # Keep the GO cross-references whose term is tagged with 'P:' and print
    # the term without the tag.
    for xref in record.cross_references:
        if xref[0] == 'GO' and xref[2].startswith('P:'):
            print(xref[2][2:])
Ejemplo n.º 22
0
def getgo(id):
    """Print, one per line, the GO terms of ``id`` whose tag starts with 'P'."""
    record = SwissProt.read(ExPASy.get_sprot_raw(id))
    terms = []
    for xref in record.cross_references:
        if xref[0] == "GO" and xref[2].startswith("P"):
            # The term is the text between the first and second ':'.
            terms.append(xref[2].split(":")[1])
    print("\n".join(terms))
Ejemplo n.º 23
0
def dbpr(UniProt_ID):
    """Return the 'P:' terms with IEA evidence found in a raw entry.

    The raw text of the entry is scanned for ``P:<term>; IEA:`` and the
    surrounding markers are removed from each match.
    """
    handle = ExPASy.get_sprot_raw(UniProt_ID)
    raw_text = str(handle.read())
    processes = []
    for hit in re.findall(r"P:.*; IEA:", raw_text):
        processes.append(hit.replace("P:", "").replace("; IEA:", ""))
    handle.close()
    return processes
Ejemplo n.º 24
0
def swissprot_search():
    """Print the Swiss-Prot record of every accession in the accession file.

    The first line of ``output/seq_accession.txt`` is read and discarded;
    every following non-blank line is treated as one accession.
    """
    with open('output/seq_accession.txt') as f:
        f.readline()  # first line is not an accession
        for line in f:
            # Lines keep their trailing newline when iterating a file;
            # strip it so the query is the bare accession.
            accession = line.strip()
            if not accession:
                continue
            handle = ExPASy.get_sprot_raw(accession)
            record = SwissProt.read(handle)
            print(record)
Ejemplo n.º 25
0
def main(id):
    """Print the GO biological-process terms of the Swiss-Prot entry ``id``."""
    handle = ExPASy.get_sprot_raw(id)
    record = SwissProt.read(handle)
    for cr in record.cross_references:
        if cr[0] == "GO":
            # Split on the first ':' only, so a term that itself contains
            # a colon is printed in full.
            bits = cr[2].split(":", 1)
            if bits[0] == "P":
                # Parenthesised single-argument print works on Python 2/3.
                print(bits[1])
Ejemplo n.º 26
0
 def get(self,id):
     """Fetch a Swiss-Prot entry from ExPASy and return the parsed record.

     ``id`` is passed unchanged to ``ExPASy.get_sprot_raw``; the data it
     returns is parsed with ``SwissProt.read``.
     """
     return SwissProt.read(ExPASy.get_sprot_raw(id))
Ejemplo n.º 27
0
def write_to_file(identifier):
    """Save the Swiss-Prot entry ``identifier`` as FASTA in ``<identifier>.txt``."""
    handle = ExPASy.get_sprot_raw(identifier)
    try:
        record = SeqIO.read(handle, 'swiss')
    finally:
        # Release the connection even when the entry cannot be parsed.
        handle.close()

    with open('%s.txt' % identifier, 'w') as out_file:
        SeqIO.write(record, out_file, 'fasta')
Ejemplo n.º 28
0
    def MouseHomolog(self, dfs):
        """Flag which peptides of each accession occur in its mouse entry.

        ``dfs`` is indexed in parallel with ``self.accs``. For every
        accession whose entry and ``<NAME>_MOUSE`` entry can both be
        fetched, a 'Mouse' column ('True'/'False' strings) is added to the
        matching data frame. The flagged frames are concatenated and written
        to ``MouseHomologPeptides.xlsx`` in ``self.out_folder``.
        """

        print('\nFinding mouse homologs')
        new_dfs = []

        # enumerate() keeps the data-frame index in step with the accession,
        # including for accessions that are skipped.
        for ind, acc in enumerate(self.accs):

            # ``except Exception`` (not a bare except) so that Ctrl-C and
            # SystemExit are not reported as a missing entry.
            try:
                handle = ExPASy.get_sprot_raw(acc)
                record = SwissProt.read(handle)
                name = record.entry_name
            except Exception:
                print('\nNo entry for', acc, ',continuing')
                continue

            try:
                mname = name.split('_')[0] + '_MOUSE'
                mhandle = ExPASy.get_sprot_raw(mname)
                mrecord = SwissProt.read(mhandle)
                mseq = mrecord.sequence
                print(f'\nFound mouse homolog for {name}: {mname}')
            except Exception:
                print(f'\nNo mouse gene entry for {acc}-{name}, continuing')
                continue

            df = dfs[ind]
            mcol = []

            # Walk the Sequence column row by row, in order.
            for pepseq in df.Sequence:
                print(pepseq)
                mcol.append('True' if str(pepseq) in mseq else 'False')

            df['Mouse'] = mcol
            new_dfs.append(df)

        df_final = pd.concat(new_dfs, sort=True)
        df_final.to_excel(self.out_folder + '/' + 'MouseHomologPeptides.xlsx',
                          index=True)
Ejemplo n.º 29
0
def main():
    """Write the GO biological-process terms of an entry to ``dbpr.out``.

    The identifier is the first line of the file ``dbpr``.
    """
    with open("dbpr") as f:
        handle = ExPASy.get_sprot_raw(f.readline().strip())
        record = SwissProt.read(handle)
        go_terms = [x[2] for x in record.cross_references if x[0] == 'GO']
        processes = [x[2:] for x in go_terms if x[0] == 'P']
        # NOTE: this rebinds sys.stdout for the rest of the process; the
        # print below (and any later print) goes to dbpr.out.
        sys.stdout = open("dbpr.out","w")
        # Parenthesised single-argument print works on Python 2 and 3.
        print("\n".join(processes))
Ejemplo n.º 30
0
def protfunction(query_proteins):
    """Return ``(protein, text)`` pairs for the given names or ids.

    For each protein the text is the first comment of its Swiss-Prot record
    with the first 10 characters removed.
    """
    results = []
    for name in query_proteins:
        with ExPASy.get_sprot_raw(name) as handle:
            entry = SwissProt.read(handle)
        results.append((name, entry.comments[0][10:]))
    return results
Ejemplo n.º 31
0
def main(argv):
    """Print the GO biological-process terms of the entry named in a file.

    The identifier is obtained with ``files.read_line(argv[0])``.
    """
    line = files.read_line(argv[0])
    handle = ExPASy.get_sprot_raw(line)
    record = SwissProt.read(handle)

    # Test the 'P:' tag at the start of the term: a substring test also
    # matches terms of other aspects that merely contain "P:" (for example
    # one starting with "F:ATP:"). Slicing off the tag keeps terms that
    # contain a colon intact.
    go = [
        x[2][2:] for x in record.cross_references
        if x[0] == 'GO' and x[2].startswith('P:')
    ]

    # Parenthesised single-argument print works on Python 2 and 3.
    print('\n'.join(go))
Ejemplo n.º 32
0
def print_bio_process(file):
    """Print the GO 'P' terms of the entry whose ID is read from ``file``.

    ``file`` is an open file object; its whole content, right-stripped, is
    used as the identifier.
    """
    accession = file.read().rstrip()
    entry = SwissProt.read(ExPASy.get_sprot_raw(accession))
    processes = []
    for xref in entry.cross_references:
        if xref[0] == 'GO' and xref[2].startswith('P'):
            processes.append(xref[2][2:])
    print('\n'.join(processes))
Ejemplo n.º 33
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     accession = "O23729"
     # Fetch and parse the entry, then release the connection.
     stream = ExPASy.get_sprot_raw(accession)
     entry = SeqIO.read(stream, "swiss")
     stream.close()
     # The parsed record must match the known id, length and checksum.
     self.assertEqual(entry.id, accession)
     self.assertEqual(len(entry), 394)
     self.assertEqual(seguid(entry.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
Ejemplo n.º 34
0
def dbpr():
    """Print the GO biological-process terms of the entry in rosalind_dbpr.txt."""
    # Read the identifier inside a context manager so the file is closed.
    with open("rosalind_dbpr.txt") as id_file:
        uniprot_id = id_file.read().strip()
    handle = ExPASy.get_sprot_raw(uniprot_id)
    record = SwissProt.read(handle)

    # print the list of biological functions
    for ref in record.cross_references:
        if ref[0] == 'GO' and ref[2].startswith('P:'):
            # Parenthesised single-argument print works on Python 2 and 3.
            print(ref[2][2:])
Ejemplo n.º 35
0
	def __init__(self, seq_id=None, seq_type=None):
		"""Fetch and parse the sequence record for ``seq_id``.

		``seq_type`` selects the source: 'uniprot' reads a Swiss-Prot entry
		through ExPASy, 'genbank' reads a protein GenBank entry through
		Entrez. The parsed record is stored in ``self.seq_record``.

		Raises ValueError for any other ``seq_type``.
		"""
		# Compare strings with ==; ``is`` tests object identity and only
		# works by accident when both strings happen to be interned.
		if seq_type == 'uniprot':
			handle = ExPASy.get_sprot_raw(seq_id)
			record_format = "swiss"
		elif seq_type == 'genbank':
			handle = Entrez.efetch(db='protein', rettype='genbank', id=seq_id)
			record_format = "genbank"
		else:
			# Previously this fell through to handle.close() on an unbound
			# name; report the actual problem instead.
			raise ValueError(
				"seq_type must be 'uniprot' or 'genbank', got %r" % (seq_type,))
		try:
			self.seq_record = SeqIO.read(handle, record_format)
		finally:
			handle.close()
Ejemplo n.º 36
0
def DBPR(id):
    """Return the GO terms of ``id`` whose tag (text before ':') is 'P'.

    Each returned term is the text between the first and second ':' of the
    cross-reference's third field.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(id))
    go_fields = (
        item[2].split(':') for item in record.cross_references
        if item[0] == 'GO'
    )
    return [parts[1] for parts in go_fields if parts[0] == 'P']
Ejemplo n.º 37
0
def find_function(prot):
    """Return the GO 'P' terms of ``prot``, printing each matching reference."""
    record = SwissProt.read(ExPASy.get_sprot_raw(prot))

    functions = []
    for xref in record.cross_references:
        # Only GO references whose term starts with 'P' are of interest.
        if xref[0] != 'GO' or xref[2][0] != 'P':
            continue
        print(xref)
        functions.append(xref[2][2:])
    return functions
Ejemplo n.º 38
0
 def acession(self):
     """Fetch the Swiss-Prot record of every id in ``self.ids``.

     The result is stored in ``self.rec`` and returned; ids equal to 'ND'
     are kept as the string 'ND' instead of being fetched.
     """
     fetched = []
     self.rec = fetched
     for ide in self.ids:
         if ide == 'ND':
             fetched.append('ND')
             continue
         fetched.append(SwissProt.read(ExPASy.get_sprot_raw(ide)))
     return self.rec
Ejemplo n.º 39
0
def BiologicalProcesses(UniProtID):
    """Return the 'P:' GO terms of ``UniProtID`` joined by newlines.

    Every field of every cross-reference containing "GO" is checked; for a
    field starting with "P:" the text after its last ':' is kept.
    """
    record = SwissProt.read(ExPASy.get_sprot_raw(UniProtID))

    found = [
        field[field.rfind(':') + 1:]
        for xref in record.cross_references if "GO" in xref
        for field in xref if re.match("P:.*", field)
    ]
    return "\n".join(found)
Ejemplo n.º 40
0
def get_keywords(lookup):
    """Fetch and parse the Swiss-Prot entry ``lookup``, exiting on failure.

    The process exits with status 1 if the download fails or the entry
    cannot be parsed. Nothing is returned by the code shown here.
    """
    try:
        handle = ExPASy.get_sprot_raw(lookup)
    except Exception:
        # ``except Exception`` instead of a bare except, so Ctrl-C and
        # SystemExit are not reported as an ExPASy error.
        print("Error in ExPASy")
        sys.exit(1)
    try:
        record = SwissProt.read(handle)
    except ValueError as error:  # 'as' form works on Python 2.6+ and 3
        print(error)
        sys.exit(1)
Ejemplo n.º 41
0
def get_prot(id):
    """Return a '|'-separated summary of the Swiss-Prot entry ``id``.

    The summary holds the id, the sequence, its length, the taxonomy and
    the organism taken from the record's annotations.
    """
    with ExPASy.get_sprot_raw(id) as handle:
        entry = SeqIO.read(handle, 'swiss')
    length = len(entry.seq)
    sequence = entry.seq
    lineage = entry.annotations["taxonomy"]
    organism = entry.annotations["organism"]
    # Built left to right in two steps; the operand order is unchanged.
    head = 'ID:' + id + '|' + 'SEQUENCE:' + sequence
    summary = (head + '|' + 'SEQUENCE LENGTH:' + str(length) + 'bp' + '|' +
               'TAXONOMY:' + str(lineage) + '|' + 'ORGANISM:' + organism)
    return summary
Ejemplo n.º 42
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     accession = "O23729"
     # Wrap the stream so its first line can be inspected without being
     # consumed: an HTML page here means a proxy error page was returned.
     stream = UndoHandle(ExPASy.get_sprot_raw(accession))
     first_line = _as_string(stream.peekline())
     if first_line.startswith("<!DOCTYPE HTML"):
         raise IOError
     entry = SeqIO.read(stream, "swiss")
     stream.close()
     # The parsed record must match the known id, length and checksum.
     self.assertEqual(entry.id, accession)
     self.assertEqual(len(entry), 394)
     self.assertEqual(seguid(entry.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
Ejemplo n.º 43
0
def main(protein_id):
    """Return the GO biological-process terms of ``protein_id``.

    Every cross-reference of the record is printed while scanning. The
    terms (text between the first and second ':' of GO fields tagged 'P')
    are returned as one newline-separated string.
    """
    handle = ExPASy.get_sprot_raw(protein_id)
    record = SwissProt.read(handle)

    terms = []
    for r in record.cross_references:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(r)
        if r[0] == "GO":
            parts = r[2].split(":")
            if parts[0] == 'P':
                terms.append(parts[1])

    # join + strip yields the same text as appending "term\n" repeatedly
    # and stripping the result.
    return "\n".join(terms).strip()
Ejemplo n.º 44
0
def get_seq(source, fmt):
    """Return a ``SeqIO.parse`` iterator over the records of ``source``.

    Args:
        source: a file path for 'fasta' and 'genbank', or the identifier
            passed to ``ExPASy.get_sprot_raw`` for 'swiss'.
        fmt: one of 'fasta', 'genbank' or 'swiss'.

    Raises:
        TypeError: if ``fmt`` is not one of the supported formats.
    """
    if fmt in ('fasta', 'genbank'):
        # The genbank branch used the misspelled names ``hanlde`` and
        # ``sourc`` and therefore always failed; both file formats are
        # opened the same way.
        handle = open(source)
    elif fmt == 'swiss':
        handle = ExPASy.get_sprot_raw(source)
    else:
        raise TypeError('Need to choose correct file format')

    # The handle is deliberately not closed here: the returned iterator is
    # presumably still reading from it when the caller consumes it.
    return SeqIO.parse(handle, fmt)
Ejemplo n.º 45
0
    def download_entry(self, accession):
        """Fetch, check and cache the Swiss-Prot record of ``accession``.

        The record is stored in ``self.records[accession]`` and returned
        when ``self.organism`` occurs in the record's lower-cased organism.

        Raises:
            KeyError: if the entry cannot be fetched or parsed, or if its
                organism does not contain ``self.organism``.
        """
        try:
            handle = ExPASy.get_sprot_raw(accession)
            record = SwissProt.read(handle)
        except Exception:
            # ``except Exception`` instead of a bare except, so Ctrl-C and
            # SystemExit are not turned into a KeyError.
            raise KeyError('{}'.format(accession))

        record_org = record.organism.strip().lower()
        if self.organism not in record_org:
            message = '{} ortholog of {} not found.'.format(
                self.organism, accession)
            print(message)
            raise KeyError(message)

        self.records[accession] = record
        return record
Ejemplo n.º 46
0
def main():
    """Print the GO biological-process terms of the entry given as input.

    The identifier is obtained with
    ``get_uniprot_id_from_file(arguments['<input>'])``.
    """
    # Grab our input id value
    uniprot_id = get_uniprot_id_from_file(arguments['<input>'])
    # Get a handle on the data for the uniprot id and parse it; the handle
    # is closed even if parsing fails.
    handle = ExPASy.get_sprot_raw(uniprot_id)
    try:
        record = SwissProt.read(handle)
    finally:
        handle.close()
    # Process out the stuff of interest, GO values in this case
    for ref in record.cross_references:
        if ref[0] != 'GO':
            continue
        # partition() splits on the first ':' only, so a term containing
        # another colon no longer breaks the two-name unpacking.
        pre, _, val = ref[2].partition(':')
        if pre == 'P':
            print(val)
Ejemplo n.º 47
0
def main(fichier):
	"""Print the GO biological-process terms of the entry named in a file.

	The first line of ``fichier`` (stripped) is used as the identifier
	passed to ``ExPASy.get_sprot_raw``.
	"""
	from Bio import ExPASy
	from Bio import SwissProt
	# Read the identifier inside a context manager so the file is closed.
	with open(fichier, 'r') as f:
		fline = f.readline().strip()
	handle = ExPASy.get_sprot_raw(fline)
	record = SwissProt.read(handle)
	# Slice off exactly the "P:" tag. lstrip('P:') removes every leading
	# 'P' or ':' character, so "P:Protein folding" lost its first letter.
	go = [
		i[2][2:] for i in record.cross_references
		if i[0] == 'GO' and i[2].startswith('P:')
	]
	# Parenthesised single-argument print works on Python 2 and 3.
	print('\n'.join(go))
Ejemplo n.º 48
0
 def test_get_sprot_raw(self):
     """Bio.ExPASy.get_sprot_raw("O23729")"""
     accession = "O23729"
     try:
         # Wrap the stream so its first line can be inspected without
         # being consumed: an HTML page means a proxy error page.
         stream = UndoHandle(ExPASy.get_sprot_raw(accession))
         first_line = _as_string(stream.peekline())
         if first_line.startswith("<!DOCTYPE HTML"):
             raise IOError
         entry = SeqIO.read(stream, "swiss")
         stream.close()
     except IOError:
         raise MissingExternalDependencyError(
               "internet (or maybe just ExPASy) not available")
     # The parsed record must match the known id, length and checksum.
     self.assertEqual(entry.id, accession)
     self.assertEqual(len(entry), 394)
     self.assertEqual(seguid(entry.seq), "5Y08l+HJRDIlhLKzFEfkcKd1dkM")
Ejemplo n.º 49
0
    def __getitem__(self, id):
        """__getitem__(self, id) -> object

        Return a SwissProt entry.  id is either the id or accession
        for the entry.  Raises a KeyError if fetching it raises IOError.

        The entry is passed through ``self.parser`` when one is set;
        otherwise the raw data read from the handle is returned.
        """
        from Bio import ExPASy
        # Wait until enough time has passed since the last query.
        self.limiter.wait()

        try:
            handle = ExPASy.get_sprot_raw(id)
        except IOError:
            raise KeyError(id)

        if self.parser is None:
            return handle.read()
        return self.parser.parse(handle)
Ejemplo n.º 50
0
def main():
    """Write the GO biological-process terms of one UniProt entry.

    The UniProt ID is read from ``problem_datasets/rosalind_dbpr.txt``; the
    terms are written to ``output/rosalind_dbpr_out.txt`` (one per line) and
    a summary is printed.
    """
    # Read the UniProt ID from a txt file.
    with open('problem_datasets/rosalind_dbpr.txt', 'r') as infile:
        uni_id = infile.read().strip()

    # Retrieve the data from UniProt and release the connection afterwards.
    raw_data = ExPASy.get_sprot_raw(uni_id)
    try:
        record = SwissProt.read(raw_data)
    finally:
        raw_data.close()

    # Keep only Gene Ontology cross-references whose term carries the 'P:'
    # tag. Checking the database name first avoids picking up an unrelated
    # cross-reference whose third field happens to start with 'P:'.
    go = [
        ref[2][2:] for ref in record.cross_references
        if ref[0] == 'GO' and ref[2].startswith('P:')
    ]

    # Output answer.
    with open('output/rosalind_dbpr_out.txt', 'w') as outfile:
        outfile.write('\n'.join(go))

    # Optional: Print answer and gene ID/name
    name = record.gene_name.split(' ')[0][5:]
    print('Gene:\n', name, ' (UniProt ID = ', uni_id,
          ')\n\nBiological Processes:\n', '\n'.join(go), sep='')
Ejemplo n.º 51
0
def snp_uniprot(uniprotname, selection='(all)', label=1, name='', quiet=0):
    '''
DESCRIPTION

    Selects all UniProt annotated nsSNPs (natural variants) in given
    structure. Does a sequence alignment of UniProt sequence and PDB
    sequence.

USAGE

    snp_uniprot uniprotname [, selection [, label [, name [, quiet]]]]

ARGUMENTS

    uniprotname = string: UniProt reference (like HBB_HUMAN or P68871)

    selection = string: atom selection

    label = 0 or 1: Label CA atoms of nsSNPs with mutation {default: 1}

    name = string: name of new selection {default: nsSNPs}

EXAMPLE

    fetch 3HBT
    snp_uniprot ACTG_HUMAN, chain A

SEE ALSO

    snp_ncbi
    '''
    from Bio import ExPASy
    from Bio import SwissProt
    # Fetch and parse the UniProt entry, then hand the record and the
    # unchanged user options to the shared implementation.
    swissprot_record = SwissProt.read(ExPASy.get_sprot_raw(uniprotname))
    snp_common(swissprot_record, selection, label, name, quiet)
def download_from_swissprot(id_file, output_file, rettype="swiss", save_format="swiss"):
	"""Download the Swiss-Prot entries listed in a file and save them.

	:param id_file: path of a text file with one query ID per line
	:type id_file: basestring
	:param output_file: path of the file the records are written to
	:type output_file: basestring
	:param rettype: SeqIO format name used to parse each downloaded entry
	:param save_format: SeqIO format name used to write each record
	"""
	processed = 0
	# Both files are closed on exit, even when a parse error escapes.
	with open(output_file, 'w') as output_file_handle, \
			open(id_file, 'r') as f:
		for line in f:
			query_id = line.strip()
			if not query_id:
				continue  # blank line: nothing to query

			try:
				handle = ExPASy.get_sprot_raw(query_id)
			except urllib2.HTTPError:
				log.warning('{0} query failed'.format(query_id))
				# There is no handle for this ID; skip it instead of
				# falling through to an unbound or already-closed handle.
				continue

			try:
				seq_record = SeqIO.read(handle, rettype)
			finally:
				handle.close()
			SeqIO.write(seq_record, output_file_handle, save_format)
			processed += 1
			log.info('#{1} Processed {0}'.format(seq_record.id, processed))

	log.info('Total {0} queries done!'.format(processed))
Ejemplo n.º 53
0
#!/usr/bin/python


from Bio import ExPASy

# Fetch the raw Swiss-Prot text of several entries and concatenate it.
ids = ['O23729', 'O23730', 'O23731']

raw_entries = []
for id in ids:
    results = ExPASy.get_sprot_raw(id)
    raw_entries.append(results.read())
# The entries are joined in the order of ``ids`` with nothing in between.
all_results = ''.join(raw_entries)

Ejemplo n.º 54
0
from Bio import ExPASy
from Bio import SwissProt

# Print the GO biological-process terms of one Swiss-Prot entry.
id = "Q5SLP9"
handle = ExPASy.get_sprot_raw(id)
record = SwissProt.read(handle)

for x in record.cross_references:
    # Restrict the test to GO cross-references so an unrelated reference
    # whose third field starts with 'P:' is not reported.
    if x[0] == 'GO' and x[2][0:2] == 'P:':
        # Parenthesised single-argument print works on Python 2 and 3.
        print(x[2][2:])
Ejemplo n.º 55
0
def protein_record(protein):
    """Return the SwissProt record of the protein with id ``protein``.

    The entry is fetched with ``ExPASy.get_sprot_raw`` and parsed as a
    single record with ``SwissProt.read``.
    """
    raw_entry = ExPASy.get_sprot_raw(protein)
    parsed = SwissProt.read(raw_entry)
    return parsed
Ejemplo n.º 56
0
'''
Created on Mar 5, 2013

@author: Mike
'''

from Bio import ExPASy
from Bio import SwissProt

if __name__ == '__main__':
    # Print the GO biological-process terms of one Swiss-Prot entry.
    protein = 'Q9JT70'
    handle = ExPASy.get_sprot_raw(protein)
    record = SwissProt.read(handle)

    refs = [r for r in record.cross_references if "GO" in r]
    # Test the tag at the start of the term: a substring test ("P:" in ...)
    # also matches terms of other aspects that merely contain "P:" (for
    # example one starting with "F:ATP:").
    refs = [r[2] for r in refs if r[2].startswith("P:")]

    for r in refs:
        # Parenthesised single-argument print works on Python 2 and 3.
        print(r[2:])
from Bio import ExPASy, SwissProt
from Bio.SwissProt import KeyWList

#test ID: Q5SLP9
#get GO-Biological process info
#DNA recombination
#DNA repair
#DNA replication

# Fetch one entry and print its GO biological-process terms.
handle = ExPASy.get_sprot_raw('Q9HAV7')
record = SwissProt.read(handle)

for i in record.cross_references:
    if i[0] == 'GO' and i[2].startswith('P:'):
        # Slice off the leading tag only; replace('P:', '') would also
        # delete any later "P:" inside the term itself.
        # Parenthesised single-argument print works on Python 2 and 3.
        print(i[2][2:])
Ejemplo n.º 58
0
def checksum_summary(record) :
    """Return a one-line summary of ``record.seq``.

    The summary holds a preview of the sequence (the whole sequence when it
    is shorter than 25 letters, otherwise the first 19 letters, "..." and
    the last 3), its SEGUID checksum and its length.
    """
    # str() is the supported way to get the sequence text; Seq.tostring()
    # is not available in current Biopython releases.
    sequence = str(record.seq)
    if len(sequence) < 25:
        short = sequence
    else:
        short = sequence[:19] + "..." + sequence[-3:]
    return "%s [%s] len %i" \
           % (short, seguid(record.seq), len(sequence))

#####################################################################

# Online check: each identifier must yield exactly one record whose id
# matches. Single-argument parenthesised prints work on Python 2 and 3.
print("Checking Bio.ExPASy.get_sprot_raw()")
id_list = ["O23729"]
for identifier in id_list:
    print("- Fetching %s" % identifier)
    handle = ExPASy.get_sprot_raw(identifier)
    records = list(SeqIO.parse(handle, "swiss"))
    assert len(records) == 1
    record = records[0]
    print("  Got " + checksum_summary(record))
    assert record.id == identifier
# Drop the names used by this check so they do not leak into what follows.
del id_list, handle, identifier, records, record

#####################################################################

print "Checking Bio.Entrez.efetch()"
for database, format, entry in [("genome","fasta","X52960"),
                                ("genome","gb","X52960"),
                                ("nucleotide", "fasta", "6273291"),
                                ("nucleotide", "gb", "6273291"),
                                ("protein", "fasta", "16130152"),
Ejemplo n.º 59
0
def getgo(id):
	"""Print, one per line, the GO terms of ``id`` whose tag starts with 'P'.

	Each term is the text between the first and second ':' of the
	cross-reference's third field.
	"""
	handle = ExPASy.get_sprot_raw(id)
	record = SwissProt.read(handle)
	go = [r[2].split(":")[1] for r in record.cross_references if r[0] == "GO" and r[2].startswith('P')]
	# Parenthesised single-argument print works on Python 2 and 3.
	print("\n".join(go))
Ejemplo n.º 60
0
#!/usr/bin/python

from Bio import ExPASy
from Bio.WWW import *                                                     
from Bio.SwissProt import SProt

# Fetch the raw Swiss-Prot entry CERU_HUMAN and print its keywords.
# NOTE(review): Python 2 syntax (print statement, .next()) and the
# Bio.SwissProt.SProt iterator/parser API are used as-is — confirm the
# Biopython version this targets before porting.
expasy = ExPASy.get_sprot_raw('CERU_HUMAN')
# Iterate over the downloaded data, parsing each entry into a record.
sp = SProt.Iterator(expasy, SProt.RecordParser())
# Only the first record is taken.
record = sp.next()
print record.keywords