def doc_parser(accession):
    """Return the PRODOC text of the entry with the given accession.

    Scans ``prosite_files/prosite.doc`` record by record and returns the
    ``text`` of the first record whose ``accession`` equals *accession*.
    Returns ``None`` when no record matches.
    """
    with open("prosite_files/prosite.doc") as prodoc_file:
        matching_texts = (
            entry.text
            for entry in Prodoc.parse(prodoc_file)
            if accession == entry.accession
        )
        # The generator is lazy, so parsing stops at the first match.
        return next(matching_texts, None)
def DocParser(accession_list):
    """Collect the PRODOC text of every detected domain.

    Args:
        accession_list: accessions of the domains that were found; each
            record's ``accession`` is tested with ``in`` against it.

    Returns:
        List with the ``text`` of every record of ``prosite.txt`` whose
        accession is in *accession_list*, in file order. If parsing fails
        part-way, ``False`` is printed and the texts gathered so far are
        returned.
    """
    record_text_list = []
    # The context manager closes the file even when parsing fails.
    with open("prosite.txt") as handle:
        try:
            for record in Prodoc.parse(handle):
                if record.accession in accession_list:
                    record_text_list.append(record.text)
        except Exception:
            # Deliberate best-effort: report the failure and keep the
            # partial result. ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit propagate.
            print(False)
    return record_text_list
def doc_parser(prosite_doc, key, ResultDict, Results_Dir):
    """Write the PRODOC text of every domain found for *key* to a report.

    Creates ``Results_Dir + 'Extended_Domain_Info_<key>.txt'`` containing a
    heading followed, for each PROSITE accession in ``ResultDict[key]``, by
    the centred accession and the text of every PRODOC record that lists
    that accession as the first field of one of its ``prosite_refs``.

    Args:
        prosite_doc: path of the PROSITE documentation file.
        key: identifier used in the report name, in its heading and as the
            lookup key into *ResultDict*.
        ResultDict: mapping from *key* to an iterable of PROSITE accessions
            (strings).
        Results_Dir: prefix of the report path; it is concatenated as-is,
            so it must already end with a path separator.

    Returns:
        None.
    """
    with open(Results_Dir + 'Extended_Domain_Info_%s.txt' % (key), 'w') as Ext_Info:
        Ext_Info.write('\n' + (' Extended information about the domains in %s ' % (key)).center(78, '#') + '\n\n')

        prosite_ids = list(ResultDict[key])
        wanted = set(prosite_ids)

        # Parse the documentation once (instead of once per accession) and
        # keep only the texts that belong to a requested accession.
        texts_by_ref = {}
        if wanted:
            with open(prosite_doc) as handle:
                for record in Prodoc.parse(handle):
                    for ref in record.prosite_refs:
                        # ``ref`` is indexed at 0 for the PROSITE accession;
                        # skip empty references instead of raising.
                        if ref and ref[0] in wanted:
                            texts_by_ref.setdefault(ref[0], []).append(record.text)

        # Output order is unchanged: accessions in the order given, and for
        # each one the matching records in file order.
        for Prosite_Id in prosite_ids:
            for text in texts_by_ref.get(Prosite_Id, ()):
                Ext_Info.write(Prosite_Id.center(80))
                Ext_Info.write('\n\n')
                Ext_Info.write(text)
    return
def domaininfo(keydomains):
    """Extend every PROSITE match in *keydomains* with database details.

    For each record of ``../prosite.dat`` whose pattern equals the last
    element of a match, the record's accession, name and description are
    appended to that match, followed by the text of every ``../prosite.doc``
    entry whose ``prosite_refs`` (as a string) contains ``str(match[5])``.

    Args:
        keydomains: list of lists of matches; every match is a list whose
            last element is the PROSITE pattern that matched.
            NOTE(review): index 5 is presumably the accession appended
            here, i.e. matches arrive with five fields — confirm against
            the caller.

    Returns:
        The same *keydomains* object, modified in place.
    """
    # (str(prosite_refs), text) pairs, loaded on the first match only so
    # prosite.doc is parsed at most once instead of once per match.
    doc_entries = None

    with open("../prosite.dat", "r") as dat_handle:
        for record in Prosite.parse(dat_handle):
            for matches in keydomains:
                for match in matches:
                    if record.pattern != match[-1]:
                        continue
                    match.append(record.accession)
                    match.append(record.name)
                    match.append(record.description)

                    if doc_entries is None:
                        with open("../prosite.doc") as doc_handle:
                            doc_entries = [
                                (str(info.prosite_refs), info.text)
                                for info in Prodoc.parse(doc_handle)
                            ]

                    wanted = str(match[5])
                    for refs, text in doc_entries:
                        # Substring test on the stringified references,
                        # exactly as before.
                        if wanted in refs:
                            match.append(text)
    return keydomains
def findDomains(multifasta, output=''):
    """Search every sequence line of a multi-FASTA file for PROSITE patterns.

    Writes ``results/prosite/dominios_<output>.txt`` with, for each protein,
    the patterns of ``prosite.dat`` found in its sequence lines, followed by
    the ``prosite.doc`` text of every domain that was found.

    Args:
        multifasta: path of the FASTA file with all proteins to scan.
        output: name of the query; used to build the result file name.

    Returns:
        None.
    """
    # makedirs also creates a missing 'results' parent and tolerates an
    # existing directory.
    os.makedirs('results/prosite', exist_ok=True)
    output_file = str('results/prosite/dominios_' + output + '.txt')  # one file per query

    # Translate and compile every PROSITE pattern once instead of once per
    # sequence line. ``repl`` is the pattern translator defined elsewhere
    # in this module.
    patterns = []
    with open('prosite.dat', 'r') as handle:
        for record in Prosite.parse(handle):
            patron = repl(record.pattern)
            if len(patron) != 0:
                summary = ('Patron: ' + record.pattern + '\nName: ' + record.name
                           + '\nAccession: ' + record.accession
                           + '\nDescription: ' + record.description + '\n\n')
                patterns.append((re.compile(patron), summary, record.accession))

    found = set()  # accessions of every domain found in the multi-FASTA
    with open(multifasta, 'r') as fasta, open(output_file, 'w') as result:
        for line in fasta:
            if line.startswith('>'):
                result.write('*************************************************************************************************************'+'\n')
                result.write(line.replace('>', '') + '\n')  # title: protein name
            else:
                for regex, summary, accession in patterns:
                    if regex.search(line):
                        result.write(summary)
                        # Kept so the domain can be looked up in prosite.doc.
                        found.add(accession)

        result.write('\n\n\n\nINFORMACION DE LOS DOMINIOS\n\n')
        with open('prosite.doc', 'r') as handle:
            for record in Prodoc.parse(handle):
                # Only the first PROSITE reference of each entry is checked.
                if len(record.prosite_refs) != 0 and record.prosite_refs[0][0] in found:
                    result.write(record.text + '\n\nAccession prodoc: ' + record.accession + '\nAccession prosite: ' + record.prosite_refs[0][0] + '\n')
                    result.write('**************************************************************************************\n\n\n')
def test_prodoc_raw(self):
    """Fetch PDOC00001 in raw format from ExPASy and check its accession."""
    # ``with`` closes the network handle even if parsing raises.
    with ExPASy.get_prosite_raw("PDOC00001") as handle:
        record = Prodoc.read(handle)
    self.assertEqual(record.accession, "PDOC00001")
def parse_doc(prositedoc, id):
    """Return the text of the PRODOC entry whose accession equals *id*.

    The file at *prositedoc* is read with the cp1252 encoding. ``None`` is
    returned when no record has that accession.
    """
    with open(prositedoc, 'r', encoding='cp1252') as doc_file:
        for entry in Prodoc.parse(doc_file):
            if entry.accession != id:
                continue
            # First match wins; the rest of the file is not parsed.
            return entry.text
    return None
def test_prodoc_raw(self):
    """Fetch PDOC00001 in raw format from ExPASy and check its accession."""
    # ``with`` closes the network handle even if parsing raises.
    with ExPASy.get_prosite_raw('PDOC00001') as handle:
        record = Prodoc.read(handle)
    self.assertEqual(record.accession, 'PDOC00001')
def test_prodoc_raw(self):
    """Check that a raw PRODOC download parses to the requested entry."""
    accession = "PDOC00001"
    with ExPASy.get_prosite_raw(accession) as raw_handle:
        parsed = Prodoc.read(raw_handle)
    self.assertEqual(parsed.accession, accession)
# Script that parses the PROSITE database files prosite.doc and prosite.dat
# with the Biopython module.
from Bio.ExPASy import Prosite, Prodoc

# The .dat file can be parsed like this:
# handle = open("prosite.dat","r")
# records = Prosite.parse(handle)
# for record in records:
#     print("name:"+record.name)
#     print("accession:"+record.accession)
#     print("description:"+record.description)
#     print("pattern:"+record.pattern)

# Parse the .doc file, printing the accession and text of every record
# followed by its running record number.
with open("prosite.doc") as handle:
    # ``count`` was never defined (NameError on the first record);
    # enumerate supplies the 1-based record counter.
    for count, record in enumerate(Prodoc.parse(handle), start=1):
        print(record.accession)
        # print(record.prosite_refs)
        print(record.text)
        print(count)
        # print(record.references)
from Bio import ExPASy
from Bio.ExPASy import Prodoc

# The most basic way to fetch an entry (only ExPASy is needed). Every
# handle is opened in a ``with`` block so the connection is always closed.
with ExPASy.get_prosite_raw("PS00001") as handle:
    text = handle.read()
print(text)

# Disabled example: ``from Bio import Prosite`` fails with
# "ImportError: cannot import name 'Prosite' from 'Bio'".
#     from Bio import Prosite
#     handle = ExPASy.get_prosite_raw("PS51036")
#     record = Prosite.read(handle)
#     print(record)

with ExPASy.get_prosite_raw("PDOC00001") as handle:
    record = Prodoc.read(handle)
print(record)

# A PROSITE entry can be downloaded in HTML format.
with ExPASy.get_prosite_entry("PS51036") as handle:
    html = handle.read()
with open("myprositerecord.html", "w") as out_handle:
    out_handle.write(html)

# This one downloads the PRODOC entry.
with ExPASy.get_prodoc_entry("PDOC51036") as handle:
    html = handle.read()
with open("myprositerecord2.html", "w") as out_handle:
    out_handle.write(html)
def prodoc(lista_hits, prosite_doc):
    """Interactively print the prosite.doc entry of a domain chosen by the user.

    The user is repeatedly shown *lista_hits* and asked for an accession.
    When the answer is in *lista_hits*, every record of *prosite_doc* whose
    accession equals the answer is printed; the user is then asked whether
    to look up another domain. Any other answer prints an error message and
    asks again.

    Args:
        lista_hits: collection with the accessions the user may consult.
        prosite_doc: path of the PROSITE documentation file.

    Returns:
        None. Nothing is written to disk; all information is printed.
    """
    print()
    print()
    while True:
        print(lista_hits)
        dominio = input("De la tabla anterior imprimida por pantalla, escriba el accession del dominio que quiera consultar: ")

        if dominio not in lista_hits:
            print()
            print("--------------------------------------------------------------------------------------")
            print("Respuesta no válida, por favor, introduzca algun accession de los mostrados en la tabla anterior, es decir, alguno de estos:")
            print(lista_hits)
            print("--------------------------------------------------------------------------------------")
            continue

        # Parse the .doc file looking for the selected hit; ``with`` closes
        # the file even if parsing raises.
        with open(prosite_doc, "r") as doc:
            for record in Prodoc.parse(doc):
                if record.accession != dominio:
                    continue
                print("-----------------------------------------------------------------------------")
                print()
                # The banner used to open with four quotes, which printed a
                # stray '"' before the box.
                print(
                    "\n"
                    "###########################################################\n"
                    "#### Información extra de la base de datos prosite.doc ####\n"
                    "###########################################################\n"
                )
                print()
                print("DOMINIO: " + record.accession)
                print()
                print(record.text)
                print("################################################################")

        # Ask whether more accessions should be consulted.
        mas_doc = input("¿ Quiere consultar informacion sobre algún dominio más ? (Si/No): ")
        if mas_doc in ["NO", "No", "no", "n", "N"]:
            print(
                "\n"
                "----------------------------------------------------------------------------------------------------\n"
                "##############################################################\n"
                "#### Muchas gracias por haber confiado en nuestro software. ##\n"
                "##############################################################\n"
            )
            break
def patfinder(fasta, output):
    """Search PROSITE patterns on every sequence of a FASTA file.

    For each sequence, every non-empty pattern of ``prosite.dat`` is
    converted to a regular expression and searched; the hits are printed
    and also written, with their positions, to *output*. After each
    sequence with hits the user may ask for the ``prosite.doc`` text of the
    domains found. The function is interactive: it clears the screen and
    waits for keyboard input between sequences.

    Args:
        fasta: name of the FASTA file with the sequences to search (str).
        output: name of the file where the results are stored (str).

    Returns:
        None.
    """
    for x in range(0, 4):
        print(">> Starting Prosite pattern search " + "." * x, end="\r")
        sleep(0.2)
    print()

    # Count number of proteins to analyze.
    with open(fasta, "r") as file:
        nseqs = sum(1 for line in file if line[0] == ">")
    print("\n> %d proteins to analyze." % nseqs)
    print(
        "\nThe results will be shown sequence by sequence. Each query sequence (name ended by _QUERYSEQ) will be followed by its blastp hits."
    )
    print(
        "On the results folder, you'll find also a txt file with all the results, including also the exact sequence of the domains on the proteins and their position."
    )
    print(
        "You'll be given also the option to see further information of the found domains of your choice."
    )
    input("\nPRESS ENTER TO CONTINUE\n")

    # PROSITE syntax -> regular expression syntax. Every source is a single
    # character and no replacement produces a character that a later rule
    # would rewrite, so one translate() pass equals the chained replaces.
    to_regex = str.maketrans({
        ".": "", "-": "", "<": "^", ">": "$", "x": ".", "X": ".",
        "{": "[^", "}": "]", "(": "{", ")": "}",
    })

    # Parse prosite.dat and compile its patterns once, not once per
    # sequence. Some patterns are empty and are skipped.
    with open("prosite.dat", "r") as handle:
        patterns = [
            (record, re.compile(record.pattern.translate(to_regex)))
            for record in Prosite.parse(handle)
            if record.pattern != ""
        ]

    # ``with`` guarantees the results file is flushed and closed even if
    # the search is interrupted.
    with open(output, "w") as out_file, open(fasta, "r") as seqs_handle:
        for j, seq_record in enumerate(SeqIO.parse(seqs_handle, "fasta"), start=1):
            call('clear')
            seq, seq_id = str(seq_record.seq), seq_record.id
            print("\n---------------------------------------")
            print(">> (%d of %d) Prosite domains on sequence %s" % (j, nseqs, seq_id))
            print("-----------------------------------------")
            out_file.write("\n\n>> Prosite domains on sequence %s" % seq_id)

            results = []  # accessions of the domains found on this sequence
            for record, regex in patterns:
                # Search domains.
                hits = [(m.start(), m.group()) for m in regex.finditer(seq)]
                if not hits:
                    continue

                # Show found domains.
                print("\n> Found %d hits for domain %s." % (len(hits), record.name))
                out_file.write(
                    "\n> Found %d hits for domain %s:\n" % (len(hits), record.name))
                out_file.write("Pos\tHit sequence\n")
                out_file.write("---\t------------\n")
                for pos, domain in hits:
                    out_file.write("%s\t%s\n" % (pos, domain))
                results.append(record.accession)
                print("Domain accesion id: %s" % record.accession)
                print("Description: %s" % record.description)
                print("Pattern: %s" % record.pattern)
                out_file.write("Domain accesion id: %s\n" % record.accession)
                out_file.write("Description: %s\n" % record.description)
                out_file.write("Pattern: %s\n" % record.pattern)

            total = len(results)
            if total == 0:
                print("No domains found for this protein.")
                out_file.write("No domains found for this protein.")
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")
                continue

            print("\nTotal: %d different domains found.\n" % total)
            out_file.write(
                "\n\nTotal: %d different domains found.\n\n" % total)
            print(
                "\n---------------------------------------------------------"
            )
            print(
                "If you want further information of these domains, press Y."
            )
            print("Press ENTER or any other key to continue.")
            selection = input("> ")
            if selection.upper() == "Y":
                # Parse prosite.doc for further information.
                with open("prosite.doc", "r") as doc_handle:
                    for doc_record in Prodoc.parse(doc_handle):
                        refs = str(doc_record.prosite_refs)
                        for accession in results:
                            if accession in refs:
                                print("> %s domain." % accession)
                                print(doc_record.text)
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")
def escribe_archivo(multifasta, lista, dicc, nombres, accesion, descripcion):
    """Write "Archivo_prosite" with the PROSITE data of the filtered sequences.

    Every line of *multifasta* whose 0-based line number is a key of *dicc*
    is copied to the output, followed by name, accession, pattern,
    description and any matching ``prosite.doc`` text for each non-empty
    pattern referenced by the values of *dicc*.

    Args:
        multifasta: path of the filtered multi-FASTA file.
        lista: list with the PROSITE patterns (may contain empty strings).
        dicc: dictionary whose keys are compared with line numbers and
            whose values are iterables of indices into *lista*.
        nombres: list with the PROSITE names, parallel to *lista*.
        accesion: list with the PROSITE accessions, parallel to *lista*.
        descripcion: list with the descriptions, parallel to *lista*.

    Returns:
        None.
    """
    # Text of each prosite.doc entry -> its PROSITE cross-references.
    referencias = dict()
    with open("prosite.doc", encoding="utf8", errors="ignore") as handle:
        for record in Prodoc.parse(handle):
            referencias[record.text] = record.prosite_refs

    # Both files are closed by the context manager, also on errors.
    with open(multifasta, "r") as lee, \
            open("Archivo_prosite", mode="w+") as escribir:
        for numero_linea, linea in enumerate(lee):
            if numero_linea not in dicc:
                continue
            escribir.write(linea)
            # NOTE(review): as before, ALL values of dicc are written after
            # each matching line, not only dicc[numero_linea] — confirm
            # this is intended.
            for valor in dicc.values():
                for posicion in valor:
                    # Positions outside lista, and the empty patterns some
                    # PROSITE entries have, produce no output.
                    if posicion not in range(len(lista)):
                        continue
                    if lista[posicion] == "":
                        continue
                    escribir.write("El nombre es: " + nombres[posicion] + "\n")
                    escribir.write("El accession es: " + accesion[posicion] + "\n")
                    escribir.write("El patron es: " + lista[posicion] + "\n")
                    escribir.write("La descripción es: " + descripcion[posicion] + "\n")
                    for texto, refs in referencias.items():
                        for ref in refs:
                            for campo in ref:
                                if campo == accesion[posicion]:
                                    escribir.write(
                                        "Información adicional: \n" + texto)
                    # Separator between results.
                    escribir.write(
                        "---------------------------------------------------------------------------------------------------------------------------------\n"
                    )
    return