def dat_parser(sequence, fields=("name", "accession", "description", "pattern")):
    """Find the PROSITE patterns from prosite.dat that occur in ``sequence``.

    Args:
        sequence: Protein sequence (str) to scan.
        fields: Accepted for backward compatibility; the value is currently
            not used by the function. The default is an immutable tuple so it
            cannot be mutated across calls.

    Returns:
        A list with one ``[name, pdoc, description, regex]`` entry per
        matching record. Note that the second element is the record's
        ``pdoc`` accession and the last one is the *converted* regular
        expression, not the original PROSITE pattern.
    """
    # Insertion order matters: braces are rewritten before parentheses so the
    # `{m,n}` quantifiers produced from `(m,n)` are not turned into `[^...]`.
    pattern_replacements = {
        '-': '',
        '{': '[^',  # {X} = [^X]
        '}': ']',
        '(': '{',  # (from, to) = {from, to}
        ')': '}',
        'X': '.',  # x, X = any (.)
        'x': '.',
        '<': '^',  # < = N-terminal
        '>': '$',  # > = C-terminal
    }
    hits = []
    with open("prosite_files/prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            # Drop the period that terminates every PROSITE pattern.
            pattern = record.pattern.strip('.')
            # Transform the PROSITE pattern into a regular expression
            # readable by the re module.
            for pat, repl in pattern_replacements.items():
                pattern = pattern.replace(pat, repl)
            # Records without a pattern would otherwise match everything.
            if pattern != "" and re.search(pattern, sequence):
                hits.append(
                    [record.name, record.pdoc, record.description, pattern])
    return hits
def c_patterns():
    """Build a dictionary of PROSITE records keyed by accession.

    Reads ``prosite.dat`` from the current directory and, for every record
    that has a pattern, stores ``[regex, name, description]`` where ``regex``
    is the PROSITE pattern converted to ``re`` syntax.

    Returns:
        dict mapping accession (str) to ``[regex, name, description]``.
    """
    # Braces must be converted before parentheses, otherwise the `{m,n}`
    # quantifiers created from `(m,n)` would be rewritten as `[^m,n]`.
    replacements = (
        ("-", ""),
        ("{", "[^"),   # {X}  -> [^X]  (any residue except X)
        ("}", "]"),
        ("(", "{"),    # x(2) -> .{2}
        (")", "}"),
        ("x", "."),    # x    -> any residue
        ("<", "^"),    # <    -> N-terminal anchor
        (">", "$"),    # >    -> C-terminal anchor
    )
    pattern_dict = dict()
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if record.pattern == "":
                continue
            # The terminating period is PROSITE punctuation; left in place it
            # would act as a regex wildcard requiring one extra residue.
            regex = record.pattern.rstrip(".")
            for old, new in replacements:
                regex = regex.replace(old, new)
            pattern_dict[record.accession] = [
                regex, record.name, record.description
            ]
    return pattern_dict
def prositeToJSON(prositeDb, fp=sys.stdout):
    """
    This is a parser for the prosite database, to turn the relevant entries
    of the prosite database into JSON. Currently it only makes two entries,
    the accession and the pattern. The pattern is converted into a regex.

    One compact JSON object is printed per line for every record whose
    converted pattern is non-empty. The accession is written without its
    leading "PS".

    The prosite database is available at:
    ftp://ftp.expasy.org/databases/prosite/prosite.dat
    An explanation about the fields and structure of the database is
    available at: http://prosite.expasy.org/prosuser.html

    @param prositeDb: The C{str} filename of the prosite database.
    @param fp: A file pointer.
    @raises AssertionError: if any accession string in the database does
        not start with "PS" or if the database contains a duplicate
        accession string.
    """
    seen = set()
    # Use a context manager so the database file is closed even when an
    # assertion fails part-way through.
    with open(prositeDb) as handle:
        for record in Prosite.parse(handle):
            accession = record.accession
            # The assertions are part of the documented contract above.
            assert accession not in seen
            assert accession.startswith('PS')
            seen.add(accession)
            # Drop the last character (the terminating period) first.
            pattern = patternToRegex(record.pattern[:-1])
            if pattern:
                print(dumps(
                    {
                        'accession': accession[2:],
                        'pattern': pattern,
                    },
                    separators=(',', ':')), file=fp)
def create_dic_dominios():
    """Build a dictionary of PROSITE patterns usable by the re module.

    Reads ``prosite.dat`` from the current directory, converts every
    non-empty pattern to ``re`` syntax and maps it to its accession.

    Returns:
        dict mapping converted regular expression (str) to accession (str).
        Records that share the same pattern overwrite each other; the last
        one in the file wins.
    """
    # Braces are handled before parentheses so the quantifiers generated
    # from `(n)` are not rewritten as negated character classes.
    replacements = (
        ('{', '[^'),
        ('}', ']'),
        ('(', '{'),
        (')', '}'),
        ('-', ''),
        ('x', '.'),
        ('>', '$'),
        ('<', '^'),
    )
    dic_dominios = {}
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            pattern = record.pattern
            if pattern == "":
                continue
            # Remove the period that terminates a PROSITE pattern. Kept in
            # the regex it means "any character", which demands an extra
            # residue after the motif and makes `...$.` impossible to match.
            pattern = pattern.rstrip('.')
            for old, new in replacements:
                pattern = pattern.replace(old, new)
            dic_dominios[pattern] = record.accession
    return dic_dominios
def parseo_dat(input_file):
    """Parse a PROSITE .dat file into a tab-separated table on disk.

    For every record the name, accession, description and pattern are
    written, one record per line, to a file inside
    ``DATA/data_base_prosite/``.

    Args:
        input_file: Path of the PROSITE .dat file.

    Returns:
        The path (str) of the file that was written.

    Raises:
        SystemExit: if ``input_file`` does not exist (after printing an
            error message).
    """
    if not os.path.isfile(input_file):
        print('ERROR:No existe el archivo indicado')
        sys.exit()

    path = "DATA/data_base_prosite/"
    # makedirs also creates the parent "DATA" directory when it is missing,
    # which a plain mkdir would fail on.
    os.makedirs(path, exist_ok=True)

    # The returned path string is kept exactly as before (double slash
    # included) in case callers compare or store it.
    out = path + "/db"
    with open(input_file, "r") as handle, open(out, "w") as output:
        for record in Prosite.parse(handle):
            output.write(record.name + "\t")
            output.write(record.accession + "\t")
            output.write(record.description + "\t")
            output.write(record.pattern + "\n")
    return out
def prositeGetPloop():
    """Print the pattern of the PROSITE record with accession PS00017.

    Reads ``prosite.dat`` from the current directory. Nothing is printed if
    the accession is not present.
    """
    # The file is opened once and closed deterministically; previously it
    # was opened twice and neither handle was closed.
    with open("prosite.dat") as handle:
        for record in Prosite.parse(handle):
            if record.accession == "PS00017":
                print(record.pattern)
def search_pattern(dic, input, output):
    """Search every pattern of ``dic`` in the proteins of a FASTA file.

    For each protein a header line is written, followed by one section per
    matching pattern with the domain name, accession, description, original
    PROSITE pattern and every match with its start/end position.

    Args:
        dic: dict mapping regular expression (str) to PROSITE accession.
        input: Path of the FASTA file with the protein sequences.
        output: Path of the text report to write.

    Returns:
        ``output``, unchanged.

    Raises:
        KeyError: if an accession from ``dic`` is absent from prosite.dat.
    """
    # accession -> (name, description, pattern). Filled on the first hit so
    # prosite.dat is parsed at most once instead of once per hit.
    domain_info = None

    with open(input, "r") as input_handle, \
            open(output, "w") as output_handle:
        for protein in SeqIO.parse(input_handle, "fasta"):
            prot_id = protein.id
            prot = str(protein.seq)
            print("\n>" + prot_id + "\n", file=output_handle)

            for pattern, accession in dic.items():
                if not re.search(pattern, prot):
                    continue

                if domain_info is None:
                    with open("prosite.dat", "r") as handle:
                        domain_info = {
                            entry.accession: (entry.name,
                                              entry.description,
                                              entry.pattern)
                            for entry in Prosite.parse(handle)
                        }
                name, description, pattern_2 = domain_info[accession]

                print("\tDominio: " + name + " | " + accession,
                      file=output_handle)
                print("\tDescripción: " + description, file=output_handle)
                print("\tPatrón: " + pattern_2, file=output_handle)
                for m in re.finditer(pattern, prot):
                    print("\t- " + m.group() + " - Posición: "
                          + str(m.start()) + " - " + str(m.end()),
                          file=output_handle)
                print("", file=output_handle)
    return output
def dat_parser(prosite_dat):
    """Map translated PROSITE patterns to their accession.

    Args:
        prosite_dat: Path of the PROSITE .dat file.

    Returns:
        dict mapping the value returned by ``pattern_translation`` for each
        record's pattern to that record's accession. Records producing the
        same translated pattern overwrite each other (last one wins).
    """
    print('\n' + ('Parsing prosite file...').center(80))
    PatternDict = {}
    # Context manager so the database file is always closed.
    with open(prosite_dat, "r") as handle:
        for record in Prosite.parse(handle):
            patron = pattern_translation(record.pattern)
            PatternDict[patron] = record.accession
    return PatternDict
def donneesProt():
    """Save accession, name, pattern and pdoc of every PROSITE record.

    Reads ``prosite.dat`` and writes ``prosite_entries.dat`` in the current
    directory, one comma-separated record per line.
    """
    # Both files are managed by the with-statement; previously the input
    # handle was never closed (only the record generator was).
    with open("prosite.dat") as handle, \
            open("prosite_entries.dat", "w") as save_file:
        for record in Prosite.parse(handle):
            save_file.write(record.accession + ',')
            save_file.write(record.name + ',')
            save_file.write(record.pattern + ',')
            save_file.write(record.pdoc + '\r\n')
def prosite_to_pandas(input, temp_out):
    """Dump a PROSITE database to a TSV file and load it with pandas.

    Columns: ACC[accession] TYPE[type] NAME[name] DESCRIPTION[description].

    Args:
        input: Path of the PROSITE .dat file.
        temp_out: Path of the intermediate TSV file. It is opened in append
            mode, so content already present in that file is kept and will
            be part of the returned table.

    Returns:
        pandas.DataFrame read back from ``temp_out``.
    """
    # A single handle is used for all rows and closed (hence flushed) before
    # pandas reads the file; previously a new unclosed handle was opened for
    # every row.
    with open(temp_out, "a") as table:
        print('ACC\tTYPE\tNAME\tDESCRIPTION', file=table)
        with open(input) as handle:
            for prosite_record in Prosite.parse(handle):
                ACC = prosite_record.accession
                TYPE = prosite_record.type
                NAME = prosite_record.name
                DESCRIPTION = prosite_record.description
                print(ACC + '\t' + TYPE + '\t' + NAME + '\t' + DESCRIPTION,
                      file=table)
    pandatab = pd.read_csv(temp_out, sep='\t')
    return pandatab
def finder(folder_name):
    """Compare the PROSITE domains with the proteins of each alignment.

    Every file in ``./Results/<folder_name>/Domains/tmp`` is processed
    independently: a ``<file>_domains`` table is appended to in the
    ``Domains`` folder, the temporary FASTA is deleted afterwards, and the
    ``tmp`` folder is removed once all files are done.

    Args:
        folder_name: Name of the results sub-folder to work in.
    """
    domains_dir = "./Results/" + folder_name + "/Domains/"
    tmp_dir = "./Results/" + folder_name + "/Domains/tmp"
    tmp_files = os.listdir(tmp_dir)

    # Parse the database and build the regular expressions once instead of
    # once per protein. Skipped when there is nothing to process.
    domain_db = []
    if tmp_files:
        with open("./Data/Domain_DB/prosite.dat", "r") as handle:
            for domain in Prosite.parse(handle):
                pattern_re = (domain.pattern.replace(".", "")
                              .replace("x", ".")
                              .replace("{", "[^").replace("}", "]")
                              .replace("(", "{").replace(")", "}")
                              .replace("<", "^").replace(">", "$")
                              .replace("-", ""))
                # Records without a pattern would match every sequence.
                if pattern_re != "":
                    domain_db.append((domain, re.compile(pattern_re)))

    for file in tmp_files:
        out_path = domains_dir + file.replace("_tmp", "") + "_domains"
        tmp_path = tmp_dir + "/" + file
        with open(out_path, 'a') as f:
            # Header of the results file.
            f.write("Domain Name\tDomain Accession\tDomain Description\t"
                    "Domain Pattern\t Protein ID\tPosition\n")
            with open(tmp_path, 'r') as input_handle:
                # Iterate over each protein.
                for protein in SeqIO.parse(input_handle, "fasta"):
                    seq_re = str(protein.seq)
                    # Iterate over each domain.
                    for domain, regex in domain_db:
                        if not regex.search(seq_re):
                            continue
                        f.write("\n" + domain.name + "\t"
                                + domain.accession + "\t"
                                + domain.description + "\t"
                                + domain.pattern + "\t"
                                + protein.id + "\t")
                        for m in regex.finditer(seq_re):
                            f.write(str(m.span()))
        os.remove(tmp_path)
    os.rmdir(tmp_dir)
    return
def parse_dat(prositedat, output_file):
    """Write name, accession, description and pattern of each record as TSV.

    Args:
        prositedat: Path of the PROSITE .dat file.
        output_file: Path of the TSV file to create.

    Returns:
        The (already closed) output file object on success, or ``None``
        after printing an error message when anything goes wrong.
    """
    try:
        with open(output_file, 'w') as dominios:
            dominios.write('name\taccession\tdescription\tpattern\n')
            # The input is managed too, so it is closed when parsing fails.
            with open(prositedat, 'r') as handle:
                for record in Prosite.parse(handle):
                    dominios.write(record.name + '\t' + record.accession
                                   + '\t' + record.description + '\t'
                                   + record.pattern + '\n')
        return (dominios)
    # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
    # as the bare except did.
    except Exception:
        print('No se ha podido leer el archivo: ' + prositedat +
              '. Abortando módulo...')
def Prosite_parser():
    """Parse protein domains of the BLAST hits using the PROSITE database.

    Every file in ``Results/Blast_Hits/`` is read as alternating
    header/sequence lines. For each sequence a
    ``Results/Protein_Domains/<header>_domains`` file is written listing
    the name, accession, description and pattern of every matching domain.
    """
    substitutions = {
        "-": "",
        "{": "[^",
        "}": "]",
        "(": "{",
        ")": "}",
        "X": ".",
        "x": ".",
        "<": "^",
        ">": "$"
    }

    # The parser returns a one-shot iterator: looping over it directly for
    # every sequence left all sequences after the first without domains.
    # Materialise the records (and their regexes) once instead.
    domain_patterns = []
    with open("Prosite_DB/prosite.dat", "r") as prosite_handle:
        for record in Prosite.parse(prosite_handle):
            pattern = record.pattern[:-1]  # drop the terminating period
            if not pattern:
                # An empty regex matches every sequence; skip such records.
                continue
            for key, value in substitutions.items():
                pattern = pattern.replace(key, value)
            domain_patterns.append((record, re.compile(pattern)))

    os.makedirs("Results/Protein_Domains", exist_ok=True)

    for file in os.listdir("Results/Blast_Hits/"):
        with open('Results/Blast_Hits/' + file, 'r') as file_handle:
            cds_seqs = file_handle.read().splitlines()

        # Lines come in (header, sequence) pairs; a dangling final header
        # without a sequence line is ignored.
        for i in range(0, len(cds_seqs) - 1, 2):
            print("Parsing domains of " + cds_seqs[i][1:])
            result_name = ("Results/Protein_Domains/" + cds_seqs[i][1:]
                           + "_domains")
            sequence = cds_seqs[i + 1]
            with open(result_name, "w") as protein:
                for record, regex in domain_patterns:
                    if regex.search(sequence):
                        protein.write(
                            "Name: %s\nAccesion: %s\nDescription: %s\nPattern: %s\n\n"
                            % (record.name, record.accession,
                               record.description, record.pattern))
            print("Domains parsed\n")
    return
def domainsearch(filename):
    """Search the PROSITE patterns in the proteins of a FASTA file.

    Args:
        filename: FASTA file containing the protein sequences.

    Returns:
        A list with one entry per protein (in file order). Each entry is a
        list of matches ``[protein_id, matched_text, start, end,
        prosite_pattern]``; it is empty when the protein has no match.
    """
    with open(filename, "r") as handle:
        fasta = list(SeqIO.parse(handle, "fasta"))

    # PROSITE token -> re token. The terminating "." is removed first so it
    # is not confused with the "." produced for x/X.
    initialre = [".", "x", "X", "-", "{", "}", "<", ">", "(", ")"]
    finalre = ["", ".", ".", "", "[^", "]", "^", "$", "{", "}"]

    # (compiled regex, original PROSITE pattern) for every record that has
    # a pattern.
    patterns = []
    with open("../prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if not record.pattern:
                continue
            pattern = record.pattern
            for old, new in zip(initialre, finalre):
                pattern = pattern.replace(old, new)
            patterns.append((re.compile(pattern), record.pattern))

    results = []
    for protein in fasta:
        sequence = str(protein.seq)
        protresults = [
            [protein.id, m.group(), m.start(), m.end(), propattern]
            for regex, propattern in patterns
            for m in regex.finditer(sequence)
        ]
        results.append(protresults)
    return results
def output_results(prosite_dat, ResultDict, Results_Dir):
    """Write a report with the PROSITE domains found for each protein.

    Args:
        prosite_dat: Path of the PROSITE .dat file.
        ResultDict: dict mapping protein id (str) to an iterable of PROSITE
            accessions found in that protein.
        Results_Dir: Prefix (normally a directory ending in a separator)
            to which ``prosite_result.txt`` is appended to form the path.
    """
    # accession -> list of records, built on first use so the database is
    # parsed at most once instead of once per reported domain.
    by_accession = None

    # The report is now closed (and therefore flushed) on exit.
    with open(Results_Dir + 'prosite_result.txt', "w") as output:
        for protein in ResultDict:
            output.write('Protein ' + protein +
                         ' has the following domains:\n\n')
            for dominio in ResultDict[protein]:
                if by_accession is None:
                    by_accession = {}
                    with open(prosite_dat, "r") as handle:
                        for record in Prosite.parse(handle):
                            by_accession.setdefault(
                                record.accession, []).append(record)
                for record in by_accession.get(dominio, ()):
                    output.write("\tDomain name: " + record.name + '\n')
                    output.write("\tDomain accession: " +
                                 record.accession + '\n')
                    output.write("\tDomain description: " +
                                 record.description + '\n')
                    output.write("\tPattern found: " + record.pattern +
                                 '\n\n')
    return
def Parsear_prosite(folder_result):
    """Export name, accession, description and pattern of every record.

    Reads ``prosite.dat`` and writes ``prosite_parser.tsv`` (both in the
    current directory) as a tab-separated table with a header row.

    Args:
        folder_result: Not used by this function.
    """
    column_titles = ("Name", "Accesion", "Desctiption", "Pattern")
    with open("prosite.dat", "r") as data_base, \
            open("prosite_parser.tsv", "w") as result_file:
        result_file.write("\t".join(column_titles) + "\n")
        for entry in Prosite.parse(data_base):
            row = (entry.name, entry.accession, entry.description,
                   entry.pattern)
            result_file.write("\t".join(row) + "\n")
def Parsing_prosite(handle="prosite.dat"):
    """Export a PROSITE database to ``prosite_parsed.tsv``.

    Args:
        handle: Path of the PROSITE .dat file to read. Previously this
            argument was ignored and "prosite.dat" was always opened; the
            default keeps that behaviour for callers passing nothing.
    """
    with open(handle, "r") as dat_file, \
            open("prosite_parsed.tsv", "w") as parsed_out:
        parsed_out.write("NAME\tACCESSION\tDESCRIPTION\tPATTERN\n")
        # Write one row per record.
        for record in Prosite.parse(dat_file):
            parsed_out.write("%s\t%s\t%s\t%s\n" % (
                record.name, record.accession,
                record.description, record.pattern)
            )
def parsear():
    """Parse prosite.dat and extract name, accession, description, pattern.

    The four fields of every record are written tab-separated, one record
    per line.
    """
    inpfile = 'prosite.dat'
    path1 = "results/prosite"
    # Creates the parent "results" folder too, and replaces the former
    # os.stat probe wrapped in a bare except.
    os.makedirs(path1, exist_ok=True)

    # NOTE(review): there is no separator between path1 and "database", so
    # the file is written to "results/prositedatabase", not inside the
    # directory created above. Left unchanged because other code may read
    # that exact path - confirm and fix both sides together.
    with open(path1 + "database", "w") as out, \
            open(inpfile, "r") as handle:
        for record in Prosite.parse(handle):
            out.write(record.name + "\t")
            out.write(record.accession + "\t")
            out.write(record.description + "\t")
            out.write(record.pattern + "\n")
def encuentra(filtrado):
    """Collect the PROSITE fields needed to build the prosite report.

    Parses ``prosite.dat`` into four parallel lists (patterns converted to
    ``re`` syntax, names, accessions, descriptions) and hands them to
    ``busca``.

    Args:
        filtrado: Passed through unchanged as the first argument of
            ``busca``.

    Returns:
        Whatever ``busca(filtrado, patrones, nombres_prosite, accesion,
        descripcion)`` returns.
    """
    patrones = []          # patterns to search for
    nombres_prosite = []   # domain names in prosite
    descripcion = []       # descriptions
    accesion = []          # accessions

    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            patrones.append(record.pattern)
            nombres_prosite.append(record.name)
            accesion.append(record.accession)
            descripcion.append(record.description)

    # Fixed replacement order. Braces MUST be rewritten before parentheses:
    # the old per-character loop applied the rules in the order the symbols
    # appeared in each pattern, so "x(2)-{P}" became ".[^2][^P]".
    reemplazos = (
        ("x", "."), ("X", "."),
        ("{", "[^"), ("}", "]"),
        ("(", "{"), (")", "}"),
        ("-", ""),
        ("<", "^"),   # N-terminal anchor
        (">", "$"),   # C-terminal anchor
    )
    for numero, patron in enumerate(patrones):
        valor = patron[:-1]  # drop the terminating period
        for viejo, nuevo in reemplazos:
            valor = valor.replace(viejo, nuevo)
        patrones[numero] = valor

    return busca(filtrado, patrones, nombres_prosite, accesion, descripcion)
def domaininfo(keydomains):
    """Extend the PROSITE matches with information about their domain.

    Args:
        keydomains: A list (one item per protein) of lists of matches. The
            last element of each match must be the PROSITE pattern, and
            index 5 is read back as the accession after it is appended
            here - i.e. matches are presumably 5 elements long on entry
            (verify against the caller).

    Returns:
        ``keydomains`` itself, modified in place: the accession, name and
        description of the first record with the same pattern are appended
        to each match, followed by the text of every prosite.doc entry
        whose references contain that accession.
    """
    # [(str(prosite_refs), text)] for every prosite.doc entry; loaded on
    # the first match so the file is parsed at most once.
    doc_entries = None

    with open("../prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            for protein_matches in keydomains:
                for match in protein_matches:
                    # Once a match has been extended its last element is no
                    # longer the pattern, so it is not extended twice.
                    if record.pattern != match[-1]:
                        continue
                    match.append(record.accession)
                    match.append(record.name)
                    match.append(record.description)

                    if doc_entries is None:
                        with open("../prosite.doc") as doc_handle:
                            doc_entries = [
                                (str(info.prosite_refs), info.text)
                                for info in Prodoc.parse(doc_handle)
                            ]
                    accession = str(match[5])
                    for refs, text in doc_entries:
                        if accession in refs:
                            match.append(text)
    return keydomains
def findDomains(multifasta, output=''):
    """Search PROSITE domains in every protein of a multifasta file.

    A report ``results/prosite/dominios_<output>.txt`` is written with the
    domains found on each sequence line, followed by the prosite.doc text
    of every domain found.

    Args:
        multifasta: File with all the proteins to search domains in.
        output: Name of the query, used in the report file name.
    """
    os.makedirs('results/prosite', exist_ok=True)
    # One file per query.
    output_file = str('results/prosite/dominios_' + output + '.txt')

    # Accessions of every domain found in the multifasta (with repeats).
    accession_bruto = []
    # (record, translated pattern) pairs; loaded on the first sequence line
    # so prosite.dat is parsed once instead of once per line.
    domain_patterns = None

    with open(multifasta, 'r') as file, open(output_file, 'w') as result:
        for line in file:
            if line.startswith('>'):
                result.write('*************************************************************************************************************'+'\n')
                # Title: name of the protein.
                result.write(line.replace('>', '') + '\n')
                continue

            if domain_patterns is None:
                domain_patterns = []
                with open('prosite.dat', 'r') as handle:
                    for record in Prosite.parse(handle):
                        patron = repl(record.pattern)  # translate pattern
                        if len(patron) != 0:
                            domain_patterns.append((record, patron))

            for record, patron in domain_patterns:
                if re.search(patron, line):
                    result.write('Patron: ' + record.pattern
                                 + '\nName: ' + record.name
                                 + '\nAccession: ' + record.accession
                                 + '\nDescription: ' + record.description
                                 + '\n\n')
                    # Needed later to look the domain up in prodoc.
                    accession_bruto.append(record.accession)

        # Same accessions without repeats.
        accession = set(accession_bruto)

        result.write('\n\n\n\nINFORMACION DE LOS DOMINIOS\n\n')
        with open('prosite.doc', 'r') as handle:
            for record in Prodoc.parse(handle):
                # Keep the entry if its first prosite accession was found.
                if (len(record.prosite_refs) != 0
                        and record.prosite_refs[0][0] in accession):
                    result.write(record.text + '\n\nAccession prodoc: '
                                 + record.accession
                                 + '\nAccession prosite: '
                                 + record.prosite_refs[0][0] + '\n')
                    result.write('**************************************************************************************\n\n\n')
def dataparse(Contador):
    """Search PROSITE domains in every hit sequence.

    Reads ``blast_hits<Contador>.fasta`` and writes
    ``domain_prosite<Contador>`` with the name, accession, description and
    (converted) pattern of each domain recognised in each sequence.

    Args:
        Contador: Value formatted into both file names.
    """
    print("\nEsta parte puede tardar un tiempo, por favor espere")

    # (record, converted pattern) pairs; loaded on the first sequence so
    # prosite.dat is parsed once and its handle is closed.
    domain_patterns = None

    with open("blast_hits{}.fasta".format(Contador), "r") as input_handle, \
            open("domain_prosite{}".format(Contador), "w") as output_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            sequence = str(record.seq)
            output_handle.write("\n" + record.name + "\n" + "\n")

            if domain_patterns is None:
                domain_patterns = []
                with open("prosite.dat", "r") as handle:
                    for hey in Prosite.parse(handle):
                        Final = correct_pattern(str(hey.pattern))
                        # Empty patterns are never reported.
                        if Final != "":
                            domain_patterns.append((hey, Final))

            for hey, Final in domain_patterns:
                if re.search(Final, sequence):
                    output_handle.write("name:" + hey.name + "\n")
                    output_handle.write("accession:" + hey.accession + "\n")
                    output_handle.write("description:" + hey.description
                                        + "\n")
                    output_handle.write("pattern:" + Final + "\n" + "\n")
def busqueda_dom(sec_prot, id_prot, prosite_dat, name_query):
    """Report which PROSITE domains occur in a protein sequence.

    Args:
        sec_prot: Protein sequence (str) to analyse.
        id_prot: Identifier (str) of the protein.
        prosite_dat: Path of the prosite.dat file.
        name_query: Name of the query being analysed; prefix of the output
            file name.

    Side effects:
        - Appends a header and one row per hit to
          ``<name_query>_dominios_proteicos.tsv`` with the columns
          PROTEIN_ID, DOMAIN_NAME, DOMAIN_ACCESSION, PDOC_ACCESSION,
          DOMAIN_DESCRIPTION, PROSITE pattern, re pattern.
        - Prints the same table (with a leading hit counter) to stdout.
        - Appends the pdoc accession of every hit, once, to ``lista_hits``,
          a list this function expects to find in the enclosing module.
    """

    def patron_RE(pPRO):
        """Convert a PROSITE-format pattern into ``re`` module syntax.

        Args:
            pPRO: Pattern (str) in PROSITE format.

        Returns:
            The equivalent regular expression as a str.
        """
        pRE = pPRO
        # ">]" must be handled before the plain ">" rule; in the opposite
        # order every ">" is already "$" and the ">]" rule never fires.
        PROSITE = ["-", ".", "x", "{", "}", "(", ")", "<", ">]", ">"]
        RE = ["", "", ".", "[^", "]", "{", "}", "^", "]?$", "$"]
        for old, new in zip(PROSITE, RE):
            pRE = pRE.replace(old, new)
        return pRE

    # Open the .dat file and the output table (append mode).
    with open(prosite_dat, "r", encoding="utf8") as dat, \
            open(name_query + "_dominios_proteicos.tsv", "a",
                 encoding="utf8") as output:
        # Print the header of the output table on screen.
        print()
        print(
            "_\tPROTEÍNA_ID\tNOMBRE_DOMINIO\tACCESSION_DOMINIO\tPDOC_ACCESSION\tDESCRIPCION_DOMINIO\tPATRON_Prosite\tPATRON_Re"
        )
        print()
        # Write the table header to the output file.
        output.write(
            "PROTEÍNA_ID\tNOMBRE_DOMINIO\tACCESSION_DOMINIO\tPDOC_ACCESSION\tDESCRIPCION_DOMINIO\tPATRON_Prosite\tPATRON_Re\n"
        )

        # Row counter for the table printed on screen.
        contador_hits = 0

        # Walk the .dat file looking at every existing pattern.
        for dom in Prosite.parse(dat):
            # Only domains that have a pattern in prosite.dat.
            if len(dom.pattern) == 0:
                continue
            pRE = patron_RE(dom.pattern)
            if not re.search(pRE, sec_prot):
                continue

            name_dom = dom.name
            accession_dom = dom.accession
            accession_pdoc = dom.pdoc
            descrip_dom = dom.description
            ProPattern = dom.pattern
            contador_hits += 1

            # Values to look up later in the .doc file. The membership test
            # uses the value that is actually stored (the pdoc accession);
            # testing the domain accession never matched and produced
            # duplicates.
            if accession_pdoc not in lista_hits:
                lista_hits.append(accession_pdoc)

            # Print the table row on screen.
            print(
                str(contador_hits) + "\t" + id_prot + "\t" + name_dom + "\t" +
                accession_dom + "\t" + accession_pdoc + "\t" + descrip_dom +
                "\t" + ProPattern + "\t" + pRE)
            print()
            output.write(id_prot + "\t" + name_dom + "\t" + accession_dom +
                         "\t" + accession_pdoc + "\t" + descrip_dom + "\t" +
                         ProPattern + "\t" + pRE + "\n")
def domain_parser():
    """Search the PROSITE domains of each hit in ``blast_hits.fasta``.

    Writes ``domains_hits.txt`` with, for every hit, its id, its sequence
    and a table of the domains found, followed by the texts returned by
    ``DocParser`` for all detected domains.

    Returns:
        A tuple ``(domains_dict, max_seq_len, accession_list)``:
        a dictionary with one entry per match
        (``[hit id, hit length, domain name, start, end]``, keyed by a
        counter starting at 1) used for plotting; the length of the longest
        hit; and the list of pdoc accessions of the detected domains.
    """

    def _to_regex(prosite_pattern):
        """Translate a PROSITE pattern into an upper-case regex string."""
        AAS = "[ARNDCQEGHILKMFPSTWYV]"
        pattern = prosite_pattern.upper()
        # {} -> [^]
        pattern = pattern.replace("{", "[^")
        pattern = pattern.replace("}", "]")
        # - -> ""   and   . -> ""
        pattern = pattern.replace("-", "")
        pattern = pattern.replace(".", "")
        # x -> any amino acid (already upper-cased to X above)
        pattern = pattern.replace("X", AAS)
        # () -> {}
        pattern = pattern.replace("(", "{")
        pattern = pattern.replace(")", "}")
        # >] -> ]?$  (must precede the plain ">" rule)
        pattern = pattern.replace(">]", "]?$")
        # < -> ^   and   > -> $
        pattern = pattern.replace("<", "^")
        pattern = pattern.replace(">", "$")
        return pattern

    input_handle = "blast_hits.fasta"
    # File with: (1) the domains of each hit and (2) domains information
    domains = "domains_hits.txt"

    accession_list = []   # prosite.doc accessions of the domains found
    domains_dict = dict()  # every match, for plotting
    count = 1
    max_seq_len = 0       # longest sequence, for the x axis of the plot
    # (record, compiled regex) pairs; loaded on the first hit so
    # prosite.dat is parsed once and its handle is closed.
    domain_patterns = None

    # The report is closed (and flushed) when the block exits.
    with open(domains, "w") as output_handle:
        output_handle.write(
            "#This file contains the domains of each hit.\n#At the bottom, you will find detail information of all the domains detected.\n"
        )
        output_handle.write(
            "#We strongly recommend to open this file with Visual Studio Code.\n#Because when the names of the domains are too large, in regular editors the table looks awful.\n"
        )
        output_handle.write(
            "#Here it is only showed how many times a pattern is present.\n#In the figure of the domains you will find the position of each domain.\n\n"
        )

        for seq_record in SeqIO.parse(input_handle, "fasta"):
            output_handle.write(str(seq_record.id) + "\n")
            output_handle.write(str(seq_record.seq) + "\n")
            if len(seq_record.seq) > max_seq_len:
                max_seq_len = len(seq_record.seq)

            # One table per hit with the domains found.
            x = PrettyTable()
            x.field_names = [
                "name", "accession", "description", "pattern", "repetitions"
            ]

            if domain_patterns is None:
                domain_patterns = []
                with open("prosite.dat", "r") as handle:
                    for record in Prosite.parse(handle):
                        pattern = _to_regex(record.pattern)
                        if pattern != "":
                            domain_patterns.append(
                                (record, re.compile(pattern)))

            sequence = str(seq_record.seq).upper()
            for record, regex in domain_patterns:
                matches = list(regex.finditer(sequence))
                if not matches:
                    continue
                # Remember the pdoc accession once.
                if record.pdoc not in accession_list:
                    accession_list.append(record.pdoc)
                # Save all matches to plot them later.
                for match in matches:
                    domains_dict[count] = [
                        seq_record.id,
                        len(seq_record.seq), record.name,
                        match.start(),
                        match.end()
                    ]
                    count = count + 1
                x.add_row([
                    record.name, record.accession, record.description,
                    record.pattern, len(matches)
                ])
            output_handle.write(str(x) + "\n")

        # After the tables, information of all the domains found.
        output_handle.write("\n")
        for text in DocParser(accession_list):
            output_handle.write(text)

    return (domains_dict, max_seq_len, accession_list)
from Bio.ExPASy import Prosite
from urllib.request import urlopen

# Demo script: different ways of pulling records out of the Prosite parser.
# The parser is a one-shot iterator, so each section below continues where
# the previous one stopped. The file is closed when the with-block ends.
with open("/home/koreanraichu/prosite.dat") as handle:
    records = Prosite.parse(handle)

    # Fetch a single record.
    record = next(records)
    print(record.accession)

    # The same with a for loop.
    for i in range(0, 5):
        record = next(records)
        print(record.accession)

    # ...and with a while loop.
    i = 0
    while i < 5:
        record = next(records)
        print(record.accession)
        i = i + 1

    # When the three sections run together they fetch the 1st record, then
    # the 2nd-6th, then the 7th-11th, so try them one at a time.
    n = 0
    for record in records:
        n += 1
    # Number of records that were still left in the iterator.
    print(n)
def make_domain(result_path):
    '''
    Create a file with the domains found in the multifasta of filtered
    proteins.

    Input:
        - result_path: folder that contains the input multifasta
          (MultifaFiltered.fasta) and where the result (Domains.txt) is
          written.
    Output: a file listing the domains found, with a header before each
            protein and the fields "domain name, accession, description,
            matched sequence". Only the first match of each domain is
            reported.
    Returns None.
    '''
    # For the later domain plot, temporary files holding the start, the end
    # and the total length are written to a temporary folder, one file per
    # protein. mkdir fails if the folder already exists (nothing else in
    # this function clears leftovers from an earlier run).
    os.mkdir('Temporal')

    # (domain, adapted pattern) pairs. prosite.dat is parsed once, on the
    # first protein, and kept in a list: the parser is a one-shot iterator,
    # which is why the file used to be reopened for every protein.
    domain_patterns = None

    with open(result_path+'/Domains.txt', 'w') as output, \
            open(result_path+'/MultifaFiltered.fasta', 'r') as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            # A protein that matches no pattern gets its temporary file
            # removed; assume no match until one is found.
            has_matched = False
            sequence = str(record.seq)

            # Header of the domains file.
            output.write('>'+record.id+"\n-------------\n")
            output.write('Nombre dominio\tAccesión\tDescripción\tSecuencia\n')

            with open('Temporal/'+record.id, 'w') as temporal_output:
                if domain_patterns is None:
                    with open('prosite.dat', 'r') as prosite_file:
                        domain_patterns = [
                            (domain, adapt_pattern(domain.pattern))
                            for domain in Prosite.parse(prosite_file)
                            if domain.pattern
                        ]
                for domain, adapted_pattern in domain_patterns:
                    match = re.search(adapted_pattern, sequence)
                    if not match:
                        continue
                    # Write the hit to the result and to the temporary file.
                    output.write(domain.name+'\t'+domain.accession
                                 + '\t'+domain.description+'\t'
                                 + match.group()+'\n')
                    temporal_output.write(domain.name+'\t'
                                          + str(match.start())+'\t'
                                          + str(match.end())
                                          + '\t'+str(len(record.seq))+'\n')
                    has_matched = True
                # Extra newline to separate from the next protein.
                output.write('\n')

            # The temporary file is closed now; delete it if it is empty.
            if not has_matched:
                os.remove('Temporal/'+record.id)
    return None
def patfinder(fasta, output):
    """Interactively search PROSITE domains on the sequences of a FASTA file.

    For every sequence the screen is cleared, the domains found are shown
    and written to ``output`` (including the matched text and its
    position), and the user may ask for the prosite.doc text of those
    domains.

    Args:
        fasta: Name of the FASTA file with the sequences to search
            domains on (str).
        output: Name of the file where the results will be stored (str).
    """
    for x in range(0, 4):
        print(">> Starting Prosite pattern search " + "." * x, end="\r")
        sleep(0.2)
    print()

    # Count number of proteins to analyze.
    with open(fasta, "r") as file:
        nseqs = sum(1 for line in file if line[0] == ">")
    print("\n> %d proteins to analyze." % nseqs)
    print(
        "\nThe results will be shown sequence by sequence. Each query sequence (name ended by _QUERYSEQ) will be followed by its blastp hits."
    )
    print(
        "On the results folder, you'll find also a txt file with all the results, including also the exact sequence of the domains on the proteins and their position."
    )
    print(
        "You'll be given also the option to see further information of the found domains of your choice."
    )
    input("\nPRESS ENTER TO CONTINUE\n")

    # PROSITE token -> re token (the terminating "." is removed first).
    initial = [".", "-", "<", ">", "x", "X", "{", "}", "(", ")"]
    final = ["", "", "^", "$", ".", ".", "[^", "]", "{", "}"]

    # (record, compiled regex) pairs; loaded on the first sequence so
    # prosite.dat is parsed and converted once, not once per sequence.
    domain_patterns = None

    # The results file is closed even if the search is interrupted.
    with open(output, "w") as out_file, open(fasta, "r") as seqs_handle:
        for j, seq_record in enumerate(SeqIO.parse(seqs_handle, "fasta"),
                                       start=1):
            call('clear')
            seq, seq_id = str(seq_record.seq), seq_record.id
            print("\n---------------------------------------")
            print(">> (%d of %d) Prosite domains on sequence %s" %
                  (j, nseqs, seq_id))
            print("-----------------------------------------")
            out_file.write("\n\n>> Prosite domains on sequence %s" % seq_id)

            if domain_patterns is None:
                domain_patterns = []
                with open("prosite.dat", "r") as handle:
                    for record in Prosite.parse(handle):
                        # Some patterns are empty; skip those.
                        if record.pattern == "":
                            continue
                        pattern = record.pattern
                        for old, new in zip(initial, final):
                            pattern = pattern.replace(old, new)
                        domain_patterns.append((record, re.compile(pattern)))

            total = 0
            results = []
            for record, regex in domain_patterns:
                # Search domains.
                hits = [(m.start(), m.group()) for m in regex.finditer(seq)]
                if not hits:
                    continue
                # Show found domains.
                total += 1
                print("\n> Found %d hits for domain %s." %
                      (len(hits), record.name))
                out_file.write("\n> Found %d hits for domain %s:\n" %
                               (len(hits), record.name))
                out_file.write("Pos\tHit sequence\n")
                out_file.write("---\t------------\n")
                for pos, domain in hits:
                    out_file.write("%s\t%s\n" % (pos, domain))
                results.append(record.accession)
                print("Domain accesion id: %s" % record.accession)
                print("Description: %s" % record.description)
                print("Pattern: %s" % record.pattern)
                out_file.write("Domain accesion id: %s\n" % record.accession)
                out_file.write("Description: %s\n" % record.description)
                out_file.write("Pattern: %s\n" % record.pattern)

            if total == 0:
                print("No domains found for this protein.")
                out_file.write("No domains found for this protein.")
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")
                continue

            print("\nTotal: %d different domains found.\n" % total)
            out_file.write("\n\nTotal: %d different domains found.\n\n" %
                           total)
            print(
                "\n---------------------------------------------------------"
            )
            print(
                "If you want further information of these domains, press Y."
            )
            print("Press ENTER or any other key to continue.")
            selection = input("> ")
            if selection.upper() == "Y":
                # Parse prosite.doc for further information.
                with open("prosite.doc", "r") as doc_handle:
                    for doc_record in Prodoc.parse(doc_handle):
                        for x in results:
                            if x in str(doc_record.prosite_refs):
                                print("> %s domain." % x)
                                print(doc_record.text)
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")