def dat_parser(sequence, fields=("name", "accession", "description", "pattern")):
    """
    Find the PROSITE patterns from prosite.dat that occur in a sequence.

    Every record of "prosite_files/prosite.dat" has its pattern translated
    into an ``re``-compatible regular expression, which is then searched
    for in ``sequence``.

    :param sequence: protein sequence to scan.
    :param fields: column names of a hit. The value is not read by this
        function; it is kept only for interface compatibility (now an
        immutable tuple instead of a shared mutable list).
    :return: list of ``[name, pdoc, description, regex]`` lists, one per
        matching record, in file order.
    """
    # PROSITE syntax -> regex syntax. One translate() pass gives the same
    # result as replacing the keys one after another, because none of the
    # replacement texts contains a character that is itself a key.
    translation = str.maketrans({
        '-': '',
        '{': '[^',  # {X} = [^X]
        '}': ']',
        '(': '{',   # (from, to) = {from, to}
        ')': '}',
        'X': '.',   # x, X = any (.)
        'x': '.',
        '<': '^',   # < = N-terminal
        '>': '$',   # > = C-terminal
    })
    hits = []
    with open("prosite_files/prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            # Drop the period that terminates a PROSITE pattern, then
            # translate what is left.
            pattern = record.pattern.strip('.').translate(translation)
            if pattern != "" and re.search(pattern, sequence):
                # NOTE(review): the second column is record.pdoc although
                # `fields` calls it "accession" -- confirm which is meant.
                hits.append(
                    [record.name, record.pdoc, record.description, pattern])
    return hits
def parseo_dat(input_file):
    """
    Dump a PROSITE .dat file as a tab-separated table.

    One line per record (name, accession, description, pattern) is written
    to a file called "db" inside "DATA/data_base_prosite/". The directory
    is created, including missing parents, when it does not exist.

    :param input_file: path of the PROSITE .dat file.
    :return: path of the written file.
    :raises SystemExit: when ``input_file`` is not an existing file.
    """
    if not os.path.isfile(input_file):
        print('ERROR:No existe el archivo indicado')
        sys.exit()
    path = "DATA/data_base_prosite/"
    os.makedirs(path, exist_ok=True)
    # The doubled slash is harmless and keeps the returned string identical
    # to what callers have always received.
    out = path + "/db"
    with open(input_file, "r") as handle, open(out, "w") as output:
        for record in Prosite.parse(handle):
            output.write("\t".join((record.name, record.accession,
                                    record.description, record.pattern))
                         + "\n")
    return out
def prositeToJSON(prositeDb, fp=sys.stdout):
    """
    This is a parser for the prosite database, to turn the relevant entries
    of the prosite database into JSON. Currently it only makes two entries,
    the accession and the pattern. The pattern is converted into a regex.

    One compact JSON object is printed per line, and only for records whose
    converted pattern is non-empty. The "PS" prefix is dropped from the
    accession.

    The prosite database is available at:
    ftp://ftp.expasy.org/databases/prosite/prosite.dat
    An explanation about the fields and structure of the database is
    available at: http://prosite.expasy.org/prosuser.html

    @param prositeDb: The C{str} filename of the prosite database.
    @param fp: A file pointer.
    @raises AssertionError: if any accession string in the database does
        not start with "PS" or if the database contains a duplicate
        accession string.
    """
    seen = set()
    with open(prositeDb) as handle:
        for record in Prosite.parse(handle):
            accession = record.accession
            # Raised explicitly (not via `assert`) so the documented checks
            # still run under `python -O`.
            if accession in seen:
                raise AssertionError(
                    'Duplicate accession %r in %s' % (accession, prositeDb))
            if not accession.startswith('PS'):
                raise AssertionError(
                    'Accession %r does not start with "PS"' % (accession,))
            seen.add(accession)
            # [:-1] drops the last character of the pattern (its
            # terminating period) before conversion.
            pattern = patternToRegex(record.pattern[:-1])
            if pattern:
                print(dumps(
                    {
                        'accession': accession[2:],
                        'pattern': pattern,
                    },
                    separators=(',', ':')), file=fp)
def create_dic_dominios():
    """
    Build a dictionary of the PROSITE patterns found in prosite.dat.

    Each pattern is translated so that the ``re`` module can use it and is
    mapped to the accession of its record. Records with an empty pattern
    are skipped. When two records translate to the same regex, the later
    one wins.

    :return: dict mapping regex string -> accession.
    """
    # Applied in this order: "{" must become "[^" before "(" becomes "{".
    replacements = (
        ('{', '[^'),
        ('}', ']'),
        ('(', '{'),
        (')', '}'),
        ('-', ''),
        ('x', '.'),
        ('>', '$'),
        ('<', '^'),
    )
    dic_dominios = {}
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            # Remove the period that terminates a PROSITE pattern: left in
            # place it becomes a regex "." that requires one extra residue
            # after the real pattern.
            pattern = record.pattern.rstrip('.')
            if pattern == "":
                continue
            for old, new in replacements:
                pattern = pattern.replace(old, new)
            dic_dominios[pattern] = record.accession
    return dic_dominios
def prositeGetPloop():
    """Print the pattern of every prosite.dat record with accession PS00017."""
    # The file used to be opened twice and never closed; one managed handle
    # is enough.
    with open("prosite.dat") as handle:
        for record in Prosite.parse(handle):
            if record.accession == "PS00017":
                print(record.pattern)
def c_patterns():
    """
    Create a dictionary with the pattern, name and description of each
    record of prosite.dat, with the PROSITE pattern converted to a regex.

    Records with an empty pattern are skipped.

    :return: dict mapping accession -> [regex, name, description].
    """
    # Full PROSITE -> regex translation. Exclusions ({X} -> [^X]) and the
    # terminal anchors (< and >) must be translated as well, otherwise the
    # resulting expression does not mean what the PROSITE pattern means.
    translation = str.maketrans({
        "-": "",
        "x": ".",
        "{": "[^",
        "}": "]",
        "(": "{",
        ")": "}",
        "<": "^",
        ">": "$",
    })
    pattern_dict = {}
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if record.pattern != "":
                pattern_dict[record.accession] = [
                    # rstrip: drop the period that terminates the pattern.
                    record.pattern.rstrip(".").translate(translation),
                    record.name,
                    record.description,
                ]
    return pattern_dict
def search_pattern(dic, input, output):
    """
    Search every pattern of ``dic`` in the proteins of a FASTA file.

    For each protein a header is written to ``output``; for each pattern
    that matches, the name, accession, description and PROSITE pattern of
    the domain are written, followed by every match and its position.

    :param dic: dict mapping regex string -> PROSITE accession.
    :param input: path of the FASTA file with the protein sequences.
    :param output: path of the text report to write.
    :return: ``output``.
    """
    # accession -> (name, description, pattern). Filled from prosite.dat on
    # the first hit only, instead of re-parsing the file for every hit.
    domain_info = None
    with open(input, "r") as input_handle, \
            open(output, "w") as output_handle:
        for seq_record in SeqIO.parse(input_handle, "fasta"):
            prot_id = seq_record.id
            prot = str(seq_record.seq)
            print("\n>" + prot_id + "\n", file=output_handle)
            for pattern, accession in dic.items():
                matches = list(re.finditer(pattern, prot))
                if not matches:
                    continue
                if domain_info is None:
                    with open("prosite.dat", "r") as handle:
                        domain_info = {
                            entry.accession: (entry.name,
                                              entry.description,
                                              entry.pattern)
                            for entry in Prosite.parse(handle)
                        }
                # An accession missing from prosite.dat is reported with
                # empty fields rather than with the previous hit's data.
                name, description, pattern_2 = domain_info.get(
                    accession, ("", "", ""))
                print("\tDominio: " + name + " | " + accession,
                      file=output_handle)
                print("\tDescripción: " + description, file=output_handle)
                print("\tPatrón: " + pattern_2, file=output_handle)
                for m in matches:
                    print("\t- " + m.group() + " - Posición: "
                          + str(m.start()) + " - " + str(m.end()),
                          file=output_handle)
                print("", file=output_handle)
    return output
def RecupererPattern(ident):
    """
    Fetch a PROSITE entry through ExPASy and return its pattern.

    :param ident: PROSITE identifier passed to ``ExPASy.get_prosite_raw``.
    :return: the pattern when the record type is 'PATTERN'; ``None``
        otherwise (a message is printed when the type is 'MATRIX').
    """
    handle = ExPASy.get_prosite_raw(ident)
    try:
        record = Prosite.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
    if record.type == 'PATTERN':
        return record.pattern
    if record.type == 'MATRIX':
        print("L'identifiant que vous avez entré correspont à un profile !")
    return None
def RecupererPattern(ident):
    """
    Fetch a PROSITE entry through ExPASy and print its pattern.

    For a record of type 'PATTERN' the PROSITE pattern and its translation
    by ``traduireSequence`` are printed; for type 'MATRIX' a message is
    printed instead. Nothing is returned.

    :param ident: PROSITE identifier passed to ``ExPASy.get_prosite_raw``.
    """
    handle = ExPASy.get_prosite_raw(ident)
    try:
        record = Prosite.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()
    if record.type == 'PATTERN':
        print("La forme régulière est: ", record.pattern)
        print("La traduction en Python est: ",
              traduireSequence(record.pattern))
    elif record.type == 'MATRIX':
        print("L'identifiant que vous avez entré correspont à un profile !")
def dat_parser(prosite_dat):
    """
    Map the translated pattern of every PROSITE record to its accession.

    :param prosite_dat: path of the PROSITE .dat file.
    :return: dict mapping ``pattern_translation(record.pattern)`` ->
        accession. When two records translate to the same key, the later
        record wins.
    """
    print('\n' + ('Parsing prosite file...').center(80))
    # The handle used to be left open; the with-block closes it.
    with open(prosite_dat, "r") as handle:
        return {
            pattern_translation(record.pattern): record.accession
            for record in Prosite.parse(handle)
        }
def donneesProt():
    """
    Save the accession, name, pattern and pdoc of every record of
    prosite.dat to "prosite_entries.dat", comma-separated, one record per
    line.
    """
    # Both files are closed by the with-block. Previously only the output
    # file was closed: `records.close()` closed the record iterator, not
    # the input file.
    with open("prosite.dat") as handle, \
            open("prosite_entries.dat", "w") as save_file:
        for record in Prosite.parse(handle):
            save_file.write(record.accession + ',' + record.name + ','
                            + record.pattern + ',' + record.pdoc + '\r\n')
def prosite_to_pandas(input, temp_out):
    """
    Load the accession, type, name and description of every PROSITE record
    into a pandas DataFrame.

    The records are first written as a tab-separated table to ``temp_out``
    and then read back with ``pandas.read_csv``.

    :param input: path of the PROSITE .dat file.
    :param temp_out: path of the intermediate table. It is opened in append
        mode, so an existing file keeps its previous content.
        NOTE(review): a second run on the same file therefore repeats the
        header row inside the data -- confirm appending is intended.
    :return: DataFrame with columns ACC, TYPE, NAME and DESCRIPTION.
    """
    # Open the table once and close it before pandas reads it, instead of
    # reopening it for every row and never closing it explicitly.
    with open(temp_out, "a") as table:
        print('ACC\tTYPE\tNAME\tDESCRIPTION', file=table)
        with open(input) as handle:
            for prosite_record in Prosite.parse(handle):
                print(prosite_record.accession + '\t'
                      + prosite_record.type + '\t'
                      + prosite_record.name + '\t'
                      + prosite_record.description, file=table)
    return pd.read_csv(temp_out, sep='\t')
def finder(folder_name):
    """
    Compare the domains of the PROSITE database with the proteins of every
    alignment, each alignment being processed independently.

    For each FASTA file in "./Results/<folder_name>/Domains/tmp" a table
    "<file without '_tmp'>_domains" is appended to in
    "./Results/<folder_name>/Domains/": a header line, then one line per
    (protein, domain) hit followed by the spans of all matches. Each
    temporary file is deleted once processed and the tmp directory is
    removed at the end.

    :param folder_name: name of the run folder below "./Results/".
    """
    domains_dir = "./Results/" + folder_name + "/Domains/"
    tmp_dir = domains_dir + "tmp"
    file_names = os.listdir(tmp_dir)

    # PROSITE -> regex. The terminating period is removed; a single
    # translate() pass is equivalent to the chained replaces used before.
    translation = str.maketrans({
        ".": "", "x": ".", "{": "[^", "}": "]", "(": "{", ")": "}",
        "<": "^", ">": "$", "-": "",
    })
    # Parse and compile the database once instead of once per protein
    # (which also leaked one file handle per protein).
    domains = []
    if file_names:
        with open("./Data/Domain_DB/prosite.dat", "r") as handle:
            for domain in Prosite.parse(handle):
                pattern_re = domain.pattern.translate(translation)
                if pattern_re != "":
                    domains.append((domain, re.compile(pattern_re)))

    for file_name in file_names:
        results_path = (domains_dir + file_name.replace("_tmp", "")
                        + "_domains")
        tmp_path = tmp_dir + "/" + file_name
        # Results header, then the hits of every protein of this file.
        with open(results_path, 'a') as f:
            f.write("Domain Name\tDomain Accession\tDomain Description\t"
                    "Domain Pattern\t Protein ID\tPosition\n")
            with open(tmp_path, 'r') as input_handle:
                for protein in SeqIO.parse(input_handle, "fasta"):
                    seq_re = str(protein.seq)
                    for domain, regex in domains:
                        spans = [m.span() for m in regex.finditer(seq_re)]
                        if not spans:
                            continue
                        f.write("\n" + domain.name + "\t"
                                + domain.accession + "\t"
                                + domain.description + "\t"
                                + domain.pattern + "\t"
                                + protein.id + "\t")
                        f.write("".join(str(span) for span in spans))
        os.remove(tmp_path)
    os.rmdir(tmp_dir)
    return
def get_documentation(accession):
    """
    Download the documentation of a PROSITE entry.

    The entry is fetched with ``ExPASy.get_prosite_raw``, its ``pdoc``
    accession is used to request the documentation entry, and the response
    is written to "my_prodoc_record.html". Nothing is returned.

    :param accession: PROSITE accession (signature_ac), e.g. "PS00001".
    """
    handle_1 = ExPASy.get_prosite_raw(accession)
    try:
        records = Prosite.read(handle_1)
    finally:
        handle_1.close()
    handle_2 = ExPASy.get_prodoc_entry(records.pdoc)
    try:
        html = handle_2.read()
    finally:
        handle_2.close()
    with open("my_prodoc_record.html", "w") as out_handle:
        out_handle.write(html)
def parse_dat(prositedat, output_file):
    """
    Write the name, accession, description and pattern of every PROSITE
    record to a tab-separated file with a header line.

    The output file is created and its header written before the database
    is opened, so a failed run leaves a header-only file behind.

    :param prositedat: path of the PROSITE .dat file.
    :param output_file: path of the table to write.
    :return: the (already closed) output file object, or ``None`` when
        anything fails, in which case a message is printed.
    """
    try:
        with open(output_file, 'w') as dominios:
            dominios.write('name\taccession\tdescription\tpattern\n')
            with open(prositedat, 'r') as handle:
                for record in Prosite.parse(handle):
                    dominios.write(record.name + '\t' + record.accession
                                   + '\t' + record.description + '\t'
                                   + record.pattern + '\n')
        return dominios
    except Exception:
        # Still best-effort, but KeyboardInterrupt and SystemExit are no
        # longer swallowed as they were by the bare `except:`.
        print('No se ha podido leer el archivo: ' + prositedat
              + '. Abortando módulo...')
        return None
def Prosite_Domain(self):
    """
    Scan the stored input sequence with ScanProsite and print the name of
    every PROSITE signature that was hit.

    Any failure of the scan or of the follow-up lookups is reported with a
    single message; nothing is returned.
    """
    from Bio import ExPASy
    from Bio.ExPASy import Prosite, ScanProsite
    try:
        handle = ScanProsite.scan(seq=self.__seq_input)
        try:
            result = ScanProsite.read(handle)
        finally:
            handle.close()
        if len(result) != 0:
            for hit in result:
                prosite_acession = hit['signature_ac']
                r = ExPASy.get_prosite_raw(prosite_acession)
                try:
                    html = Prosite.read(r)
                finally:
                    r.close()
                print('Foi encontrado um dominio %s.' % (html.name))
        else:
            print('Não foram encontradas correspondências.')
    except Exception:
        # NOTE(review): every error (including network failures) is
        # reported as an invalid protein sequence -- confirm which
        # exception the scan raises for bad input and narrow this.
        print('A sequência fornecida não é uma sequência proteica.')
def Prosite_parser():
    """
    Search the PROSITE patterns in every BLAST hit sequence.

    Every file of "Results/Blast_Hits/" is read as alternating header and
    sequence lines. For each sequence a file
    "Results/Protein_Domains/<header without its first character>_domains"
    is written with the name, accession, description and pattern of every
    PROSITE record whose pattern matches the sequence.
    """
    # PROSITE syntax -> regex syntax (single pass; equivalent to replacing
    # the keys in this order one after another).
    translation = str.maketrans({
        "-": "",
        "{": "[^",
        "}": "]",
        "(": "{",
        ")": "}",
        "X": ".",
        "x": ".",
        "<": "^",
        ">": "$",
    })
    # Materialise the records: the parser yields each record only once, so
    # iterating it directly left every sequence after the first one
    # without any domain.
    domains = []
    with open("Prosite_DB/prosite.dat", "r") as prosite_handle:
        for record in Prosite.parse(prosite_handle):
            # [:-1] drops the period that terminates the pattern.
            pattern = record.pattern[:-1].translate(translation)
            # Records without a pattern would match every sequence as an
            # empty regex; skip them.
            if pattern:
                domains.append((record, re.compile(pattern)))

    os.makedirs("Results/Protein_Domains", exist_ok=True)
    for file_name in os.listdir("Results/Blast_Hits/"):
        with open('Results/Blast_Hits/' + file_name, 'r') as file_handle:
            cds_seqs = file_handle.read().splitlines()
        # Pair header lines with sequence lines; a trailing header without
        # a sequence is ignored instead of raising IndexError.
        for header, sequence in zip(cds_seqs[0::2], cds_seqs[1::2]):
            print("Parsing domains of " + header[1:])
            result_name = "Results/Protein_Domains/" + header[1:] + "_domains"
            with open(result_name, "w") as protein:
                for record, regex in domains:
                    if regex.search(sequence):
                        protein.write(
                            "Name: %s\nAccesion: %s\nDescription: %s\nPattern: %s\n\n"
                            % (record.name, record.accession,
                               record.description, record.pattern))
            print("Domains parsed\n")
    return
def domainsearch(filename):
    """
    Search the PROSITE patterns in the proteins of a FASTA file.

    The patterns of "../prosite.dat" are translated to regular expressions
    and searched in every sequence.

    :param filename: FASTA file containing the protein sequences.
    :return: one list per protein (in file order); each contains one
        ``[protein id, matched text, start, end, PROSITE pattern]`` list
        per match.
    """
    with open(filename, "r") as handle:
        fasta = list(SeqIO.parse(handle, "fasta"))

    # PROSITE -> regex. The terminating period is removed; one translate()
    # pass is equivalent to the sequential replaces used before.
    translation = str.maketrans({
        ".": "", "x": ".", "X": ".", "-": "", "{": "[^", "}": "]",
        "<": "^", ">": "$", "(": "{", ")": "}",
    })
    # (compiled regex, original PROSITE pattern) for every non-empty pattern
    patterns = []
    with open("../prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if record.pattern:
                patterns.append(
                    (re.compile(record.pattern.translate(translation)),
                     record.pattern))

    results = []
    for protein in fasta:
        sequence = str(protein.seq)
        results.append([
            [protein.id, m.group(), m.start(), m.end(), propattern]
            for regex, propattern in patterns
            for m in regex.finditer(sequence)
        ])
    return results
def output_results(prosite_dat, ResultDict, Results_Dir):
    """
    Write the domains found for every protein to 'prosite_result.txt'.

    :param prosite_dat: path of the PROSITE .dat file.
    :param ResultDict: dict mapping protein id -> iterable of PROSITE
        accessions found in that protein.
    :param Results_Dir: prefix of the output path; the file name is
        appended to it as-is, without adding a separator.
    """
    # Index the database once (accession -> records in file order) instead
    # of reopening and re-parsing it for every domain of every protein.
    by_accession = {}
    with open(prosite_dat, "r") as handle:
        for record in Prosite.parse(handle):
            by_accession.setdefault(record.accession, []).append(record)

    # The with-block guarantees the report is flushed and closed.
    with open(Results_Dir + 'prosite_result.txt', "w") as output:
        for protein in ResultDict:
            output.write('Protein ' + protein
                         + ' has the following domains:\n\n')
            for dominio in ResultDict[protein]:
                for record in by_accession.get(dominio, ()):
                    output.write("\tDomain name: " + record.name + '\n')
                    output.write("\tDomain accession: "
                                 + record.accession + '\n')
                    output.write("\tDomain description: "
                                 + record.description + '\n')
                    output.write("\tPattern found: "
                                 + record.pattern + '\n\n')
    return
def Parsear_prosite(folder_result):
    """
    Parse prosite.dat and write the name, accession, description and
    pattern of every record to "prosite_parser.tsv" (tab-separated, with a
    header line).

    :param folder_result: not used by this function.
    """
    with open("prosite.dat", "r") as data_base, \
            open("prosite_parser.tsv", "w") as result_file:
        result_file.write("Name" + "\t" + "Accesion" + "\t" + "Desctiption"
                          + "\t" + "Pattern" + "\n")
        for record in Prosite.parse(data_base):
            result_file.write(record.name + "\t" + record.accession + "\t"
                              + record.description + "\t" + record.pattern
                              + "\n")
    # No explicit close() calls: the with-block closes both files.
def Parsing_prosite(handle="prosite.dat"):
    """
    Write the name, accession, description and pattern of every PROSITE
    record to "prosite_parsed.tsv" (tab-separated, with a header line).

    :param handle: path of the PROSITE .dat file, or an already open
        readable file object. The argument used to be ignored in favour of
        a hard-coded "prosite.dat"; it is now honoured. A file object
        passed by the caller is read but not closed here.
    """
    def write_table(source):
        # Dump every record read from `source` to the output table.
        with open("prosite_parsed.tsv", "w") as parsed_out:
            parsed_out.write("NAME\tACCESSION\tDESCRIPTION\tPATTERN\n")
            for record in Prosite.parse(source):
                parsed_out.write("%s\t%s\t%s\t%s\n" % (
                    record.name, record.accession,
                    record.description, record.pattern))

    if hasattr(handle, "read"):
        write_table(handle)
    else:
        with open(handle, "r") as source:
            write_table(source)
def parsear():
    """
    Parse prosite.dat and write the name, accession, description and
    pattern of every record, tab-separated, one record per line.
    """
    inpfile = 'prosite.dat'
    path1 = "results/prosite"
    # Create the directory (and missing parents) only when nothing exists
    # at that path; replaces a bare `except:` around os.stat().
    if not os.path.exists(path1):
        os.makedirs(path1)
    # NOTE(review): without a separator this writes to
    # "results/prositedatabase", next to the directory created above rather
    # than inside it. Kept unchanged because other code may read that path
    # -- confirm the intended location.
    with open(path1 + "database", "w") as out, \
            open(inpfile, "r") as handle:
        for record in Prosite.parse(handle):
            out.write("\t".join((record.name, record.accession,
                                 record.description, record.pattern))
                      + "\n")
def encuentra(filtrado):
    """
    Collect the pattern, name, accession and description of every record
    of prosite.dat and hand them to ``busca``.

    The patterns are translated so that they can be searched with ``re``.

    :param filtrado: passed through unchanged as the first argument of
        ``busca``.
    :return: whatever ``busca`` returns.
    """
    # One-pass PROSITE -> regex translation. The previous letter-by-letter
    # loop applied the replacements in the order the symbols appeared in
    # each pattern, so a "(n)" repetition located before a "{X}" exclusion
    # was first turned into "{n}" and then corrupted into "[^n]".
    translation = str.maketrans({
        "x": ".",
        "X": ".",
        "{": "[^",
        "}": "]",
        "(": "{",
        ")": "}",
        "-": "",
        # Terminal anchors, translated as the other parsers in this file
        # do; left as "<" / ">" they can never match a residue.
        "<": "^",
        ">": "$",
    })
    patrones = []          # regex of each record
    nombres_prosite = []   # domain names
    descripcion = []       # descriptions
    accesion = []          # accessions
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            # [:-1] drops the period that terminates the pattern.
            patrones.append(record.pattern[:-1].translate(translation))
            nombres_prosite.append(record.name)
            accesion.append(record.accession)
            descripcion.append(record.description)
    return busca(filtrado, patrones, nombres_prosite, accesion, descripcion)
def domaininfo(keydomains):
    """
    Extend every PROSITE match with information about its domain.

    Each match whose last element equals the pattern of a record of
    "../prosite.dat" gets that record's accession, name and description
    appended, followed by the text of every "../prosite.doc" entry whose
    references mention the accession.

    :param keydomains: list (one item per protein) of lists of matches;
        the last element of a match is its PROSITE pattern and, once
        extended, index 5 holds the accession.
    :return: the same ``keydomains`` object, modified in place.
    """
    # (str(prosite_refs), text) of every documentation entry; read on the
    # first match only, instead of re-parsing prosite.doc for each match.
    doc_entries = None
    with open("../prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            for protein_matches in keydomains:
                for match in protein_matches:
                    # Once extended, the last element is no longer the
                    # pattern, so a match is extended at most once.
                    if record.pattern != match[-1]:
                        continue
                    match.append(record.accession)
                    match.append(record.name)
                    match.append(record.description)
                    if doc_entries is None:
                        with open("../prosite.doc") as doc_handle:
                            doc_entries = [
                                (str(info.prosite_refs), info.text)
                                for info in Prodoc.parse(doc_handle)
                            ]
                    accession = str(match[5])
                    for refs, text in doc_entries:
                        if accession in refs:
                            match.append(text)
    return keydomains
def findDomains(multifasta, output = ''):
    """
    Search PROSITE domains in the proteins of a multifasta file and write
    a report to 'results/prosite/dominios_<output>.txt'.

    The report lists, per protein, the domains found, followed by the
    prosite.doc documentation of every domain found in the whole file.

    :param multifasta: file with all the proteins to search domains in.
    :param output: name of the query, used in the report file name.
    """
    with open(multifasta, 'r') as file:
        if not os.path.exists('results/prosite'):
            os.makedirs('results/prosite')
        # one report per query
        output_file = str('results/prosite/dominios_' + output + '.txt')

        # Translate and compile the database once instead of re-parsing
        # prosite.dat for every sequence line. Assumes `repl` is a pure
        # translation of the pattern -- TODO confirm.
        domains = []
        with open('prosite.dat', 'r') as handle:
            for record in Prosite.parse(handle):
                patron = repl(record.pattern)
                if len(patron) != 0:
                    domains.append((record, re.compile(patron)))

        # accessions of every domain found; only used for membership tests
        accession = set()
        with open(output_file, 'w') as result:
            for line in file:
                if line.startswith('>'):
                    result.write('*************************************************************************************************************'+'\n')
                    # title: name of the protein
                    result.write(line.replace('>', '') + '\n')
                    continue
                # NOTE(review): each line is searched on its own, so a
                # domain spanning two lines of a wrapped sequence is
                # missed -- confirm the input has one line per sequence.
                for record, regex in domains:
                    if regex.search(line):
                        result.write('Patron: ' + record.pattern
                                     + '\nName: ' + record.name
                                     + '\nAccession: ' + record.accession
                                     + '\nDescription: '
                                     + record.description + '\n\n')
                        accession.add(record.accession)

            result.write('\n\n\n\nINFORMACION DE LOS DOMINIOS\n\n')
            with open('prosite.doc', 'r') as handle:
                for record in Prodoc.parse(handle):
                    # keep the entries whose first PROSITE reference is one
                    # of the accessions found above
                    if (len(record.prosite_refs) != 0
                            and record.prosite_refs[0][0] in accession):
                        result.write(record.text
                                     + '\n\nAccession prodoc: '
                                     + record.accession
                                     + '\nAccession prosite: '
                                     + record.prosite_refs[0][0] + '\n')
                        result.write('**************************************************************************************\n\n\n')
def dataparse(Contador):
    """
    Search PROSITE domains in every BLAST hit sequence.

    Reads "blast_hits<Contador>.fasta" and writes, for each sequence, its
    name followed by the name, accession, description and translated
    pattern of every domain found, to "domain_prosite<Contador>".

    :param Contador: value inserted in the input and output file names.
    """
    print("\nEsta parte puede tardar un tiempo, por favor espere")
    with open("blast_hits{}.fasta".format(Contador), "r") as input_handle, \
            open("domain_prosite{}".format(Contador), "w") as output_handle:
        # Parse and translate the database once instead of reopening it
        # (and leaking the handle) for every sequence. Assumes
        # `correct_pattern` is a pure translation -- TODO confirm.
        domains = []
        with open("prosite.dat", "r") as handle:
            for hey in Prosite.parse(handle):
                Final = correct_pattern(str(hey.pattern))
                if Final != "":
                    domains.append((hey, Final, re.compile(Final)))

        for record in SeqIO.parse(input_handle, "fasta"):
            sequence = str(record.seq)
            output_handle.write("\n" + record.name + "\n" + "\n")
            for hey, Final, regex in domains:
                if regex.search(sequence):
                    output_handle.write("name:" + hey.name + "\n")
                    output_handle.write("accession:" + hey.accession + "\n")
                    output_handle.write("description:" + hey.description
                                        + "\n")
                    output_handle.write("pattern:" + Final + "\n" + "\n")
def test_prosite_raw(self):
    """Fetch PS00001 in raw format and check its accession and name."""
    # Context manager: the handle is closed even when parsing raises.
    with ExPASy.get_prosite_raw("PS00001") as handle:
        record = Prosite.read(handle)
    self.assertEqual(record.accession, "PS00001")
    self.assertEqual(record.name, "ASN_GLYCOSYLATION")
def test_prosite_raw(self):
    """Fetch PS00001 in raw format and check its accession and name."""
    # Context manager: the handle is closed even when parsing raises.
    with ExPASy.get_prosite_raw('PS00001') as handle:
        record = Prosite.read(handle)
    self.assertEqual(record.accession, 'PS00001')
    self.assertEqual(record.name, 'ASN_GLYCOSYLATION')
def test_prosite_raw(self):
    """Download the raw PS00001 entry and verify its accession and name."""
    with ExPASy.get_prosite_raw("PS00001") as stream:
        entry = Prosite.read(stream)
    expected_fields = (
        ("accession", "PS00001"),
        ("name", "ASN_GLYCOSYLATION"),
    )
    for attribute, expected in expected_fields:
        self.assertEqual(getattr(entry, attribute), expected)
def patfinder(fasta, output):
    """
    Search the domains of the PROSITE database in the given sequences and
    show some information about them.

    The search is interactive: results are printed sequence by sequence,
    the user is prompted between sequences, and can ask for the
    prosite.doc documentation of the domains found. The hits, with their
    matched text and start position, are also written to ``output``.

    :param fasta: name of the FASTA file with the sequences to search
        domains on (str).
    :param output: name of the file where the results will be stored (str).
    """
    for x in range(0, 4):
        print(">> Starting Prosite pattern search " + "." * x, end="\r")
        sleep(0.2)
    print()

    # Count number of proteins to analyze.
    with open(fasta, "r") as file:
        nseqs = sum(1 for line in file if line[0] == ">")
    print("\n> %d proteins to analyze." % nseqs)
    print(
        "\nThe results will be shown sequence by sequence. Each query sequence (name ended by _QUERYSEQ) will be followed by its blastp hits."
    )
    print(
        "On the results folder, you'll find also a txt file with all the results, including also the exact sequence of the domains on the proteins and their position."
    )
    print(
        "You'll be given also the option to see further information of the found domains of your choice."
    )
    input("\nPRESS ENTER TO CONTINUE\n")

    # PROSITE -> regex. One translate() pass is equivalent to replacing
    # these symbols one after another in this order.
    translation = str.maketrans({
        ".": "", "-": "", "<": "^", ">": "$", "x": ".", "X": ".",
        "{": "[^", "}": "]", "(": "{", ")": "}",
    })
    # Parse prosite.dat once (some patterns are empty and are skipped)
    # instead of re-parsing and re-translating it for every sequence.
    with open("prosite.dat", "r") as handle:
        patterns = [
            (record, re.compile(record.pattern.translate(translation)))
            for record in Prosite.parse(handle)
            if record.pattern != ""
        ]

    # The with-block closes the results file even if the search is aborted.
    with open(output, "w") as out_file, open(fasta, "r") as seqs_handle:
        for j, seq_record in enumerate(SeqIO.parse(seqs_handle, "fasta"), 1):
            call('clear')
            seq, seq_id = str(seq_record.seq), seq_record.id
            print("\n---------------------------------------")
            print(">> (%d of %d) Prosite domains on sequence %s" %
                  (j, nseqs, seq_id))
            print("-----------------------------------------")
            out_file.write("\n\n>> Prosite domains on sequence %s" % seq_id)

            # Accessions of the domains found on this sequence.
            results = []
            for record, regex in patterns:
                # Search domains.
                hits = [(m.start(), m.group()) for m in regex.finditer(seq)]
                if not hits:
                    continue
                # Show found domains.
                print("\n> Found %d hits for domain %s." %
                      (len(hits), record.name))
                out_file.write("\n> Found %d hits for domain %s:\n" %
                               (len(hits), record.name))
                out_file.write("Pos\tHit sequence\n")
                out_file.write("---\t------------\n")
                for pos, domain in hits:
                    out_file.write("%s\t%s\n" % (pos, domain))
                results.append(record.accession)
                print("Domain accesion id: %s" % record.accession)
                print("Description: %s" % record.description)
                print("Pattern: %s" % record.pattern)
                out_file.write("Domain accesion id: %s\n" % record.accession)
                out_file.write("Description: %s\n" % record.description)
                out_file.write("Pattern: %s\n" % record.pattern)

            total = len(results)
            if total == 0:
                print("No domains found for this protein.")
                out_file.write("No domains found for this protein.")
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")
                continue

            print("\nTotal: %d different domains found.\n" % total)
            out_file.write(
                "\n\nTotal: %d different domains found.\n\n" % total)
            print(
                "\n---------------------------------------------------------"
            )
            print(
                "If you want further information of these domains, press Y."
            )
            print("Press ENTER or any other key to continue.")
            selection = input("> ")
            if selection.upper() == "Y":
                # Parse prosite.doc for further information.
                with open("prosite.doc", "r") as doc_handle:
                    for doc_record in Prodoc.parse(doc_handle):
                        for x in results:
                            if x in str(doc_record.prosite_refs):
                                print("> %s domain." % x)
                                print(doc_record.text)
                                print("\n----------------------------")
                                input("PRESS ENTER TO CONTINUE\n")
from Bio.ExPASy import Prosite
from urllib.request import urlopen

# The with-block closes the database file once the walkthrough is done.
with open("/home/koreanraichu/prosite.dat") as handle:
    records = Prosite.parse(handle)

    # Fetch a single record.
    record = next(records)
    print(record.accession)

    # The same with a for loop.
    for i in range(0, 5):
        record = next(records)
        print(record.accession)

    # And with a while loop.
    i = 0
    while i < 5:
        record = next(records)
        print(record.accession)
        i = i + 1

    # The three snippets above share one iterator: run together they print
    # record 1, then records 2-6, then records 7-11. Run them one at a time
    # to see each start from the first record.
    n = 0
    for record in records:
        n += 1
    # Number of records left after the ones consumed above.
    print(n)