Esempio n. 1
0
def dat_parser(sequence,
               fields=["name", "accession", "description", "pattern"]):
    """Find the PROSITE patterns of prosite.dat that occur in ``sequence``.

    Every record of ``prosite_files/prosite.dat`` is read, its pattern is
    translated from PROSITE syntax to ``re`` syntax and searched for in
    ``sequence``.

    Args:
        sequence: Protein sequence (str) to scan.
        fields: Not used by the function; kept so callers that pass it keep
            working. The default list is never mutated.

    Returns:
        A list with one ``[name, pdoc, description, regex]`` entry per
        matching record, in file order. ``regex`` is the translated pattern,
        not the original PROSITE string.
    """
    # Applied in this order: "{"/"}" must become a negated class before
    # "("/")" are turned into regex repetition braces.
    pattern_replacements = (
        ('-', ''),
        ('{', '[^'),  # {X} = [^X]
        ('}', ']'),
        ('(', '{'),  # (from, to) = {from, to}
        (')', '}'),
        ('X', '.'),  # x, X = any (.)
        ('x', '.'),
        ('<', '^'),  # < = N-terminal
        ('>', '$'),  # > = C-terminal
    )
    hits = []
    with open("prosite_files/prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            pattern = record.pattern.strip('.')
            for old, new in pattern_replacements:
                pattern = pattern.replace(old, new)
            # PROSITE allows ">" inside square brackets ("[G>]" means "G or
            # the C-terminus"). After the replacements above that reads
            # "[G$]", where "$" is a literal character; rewrite it as an
            # alternation between the residue class and the end anchor.
            pattern = re.sub(r'\[([^\[\]]+)\$\]', r'(?:[\1]|$)', pattern)
            if pattern != "" and re.search(pattern, sequence):
                hits.append(
                    [record.name, record.pdoc, record.description, pattern])
    return hits
Esempio n. 2
0
def c_patterns():
    """Build a lookup of PROSITE patterns translated to regular expressions.

    Reads ``prosite.dat`` from the current directory and keeps every record
    whose pattern is not empty.

    Returns:
        dict mapping each accession to ``[regex, name, description]``, where
        ``regex`` is the record's pattern rewritten for the ``re`` module.
    """
    # Order matters: "{...}" (exclusion) has to become "[^...]" before
    # "(...)" (repetition) is rewritten to "{...}".
    translation = (
        ("-", ""),
        ("{", "[^"),
        ("}", "]"),
        ("(", "{"),
        (")", "}"),
        ("x", "."),
        ("<", "^"),
        (">", "$"),
    )

    pattern_dict = dict()
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if record.pattern == "":
                continue
            # Drop the terminating period first; left in place it would be
            # read by ``re`` as "any character".
            regex = record.pattern.rstrip(".")
            for old, new in translation:
                regex = regex.replace(old, new)
            pattern_dict[record.accession] = [
                regex, record.name, record.description
            ]

    return pattern_dict
Esempio n. 3
0
def prositeToJSON(prositeDb, fp=sys.stdout):
    """
    Convert the pattern entries of a prosite database file into JSON lines.

    One compact JSON object is printed per record that yields a non-empty
    regex. It has two keys: C{accession} (the accession without its leading
    two characters) and C{pattern} (the pattern converted by
    C{patternToRegex}).

    The prosite database is available at:
    ftp://ftp.expasy.org/databases/prosite/prosite.dat
    An explanation about the fields and structure of the database is available
    at: http://prosite.expasy.org/prosuser.html

    @param prositeDb: The C{str} filename of the prosite database.
    @param fp: A file pointer the JSON lines are printed to.
    @raises AssertionError: if any accession string in the database does not
        start with "PS" or if the database contains a duplicate accession
        string.
    """
    seen = set()
    # The context manager closes the database file even when an assertion
    # fails part-way through.
    with open(prositeDb) as database:
        for record in Prosite.parse(database):
            accession = record.accession
            assert accession not in seen
            assert accession.startswith('PS')
            seen.add(accession)
            # The last character of the pattern is dropped before conversion.
            pattern = patternToRegex(record.pattern[:-1])
            if pattern:
                print(dumps(
                    {
                        'accession': accession[2:],
                        'pattern': pattern,
                    }, separators=(',', ':')), file=fp)
Esempio n. 4
0
def create_dic_dominios():
    """Build a dictionary of PROSITE patterns usable by the ``re`` module.

    Reads ``prosite.dat`` from the current directory, skips records without
    a pattern and translates the remaining patterns to regular expressions.

    Returns:
        dict mapping each translated pattern to the accession of its record.
        When two records translate to the same regex, the later one wins.
    """
    # Order matters: exclusions "{...}" must be rewritten before the
    # repetition parentheses are turned into braces.
    translation = (
        ('{', '[^'),
        ('}', ']'),
        ('(', '{'),
        (')', '}'),
        ('-', ''),
        ('x', '.'),
        ('>', '$'),
        ('<', '^'),
    )

    dic_dominios = {}

    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            if record.pattern == "":
                continue

            # Remove the terminating period; otherwise ``re`` treats it as a
            # wildcard and every match needs one extra trailing residue.
            pattern = record.pattern.rstrip('.')
            for old, new in translation:
                pattern = pattern.replace(old, new)

            dic_dominios[pattern] = record.accession

    return dic_dominios
Esempio n. 5
0
def parseo_dat(input_file):
    """Dump name, accession, description and pattern of every PROSITE record.

    The records of ``input_file`` are written, tab-separated and one per
    line, to ``DATA/data_base_prosite//db``. The directory is created when
    missing.

    Args:
        input_file: Path of the PROSITE ``.dat`` file to parse.

    Returns:
        The path of the written file (str).

    Raises:
        SystemExit: if ``input_file`` does not exist (after printing an
            error message).
    """
    if not os.path.isfile(input_file):
        print('ERROR:No existe el archivo indicado')
        sys.exit()

    path = "DATA/data_base_prosite/"
    # makedirs also creates the parent "DATA" directory, which a plain
    # os.mkdir would fail on when it is missing.
    os.makedirs(path, exist_ok=True)

    out = path + "/db"
    with open(input_file, "r") as handle, open(out, "w") as output:
        for record in Prosite.parse(handle):
            output.write(record.name + "\t")
            output.write(record.accession + "\t")
            output.write(record.description + "\t")
            output.write(record.pattern + "\n")

    return out
Esempio n. 6
0
def prositeGetPloop():
    """Print the pattern of every record of prosite.dat with accession PS00017.

    The file is read from the current directory. Nothing is returned.
    """
    # Opened once (the original opened it twice) and closed on exit.
    with open("prosite.dat") as handle:
        for record in Prosite.parse(handle):
            if record.accession == "PS00017":
                print(record.pattern)
Esempio n. 7
0
def search_pattern(dic, input, output):
    """Search every pattern of ``dic`` in the proteins of a FASTA file.

    For each protein a header line is written to ``output``. For each
    pattern found in it, the name, accession, description and original
    pattern of the matching ``prosite.dat`` record(s) are written, followed
    by one line per match with the matched text and its start/end positions.

    Args:
        dic: dict mapping a regular expression to the accession of the
            PROSITE record it was derived from.
        input: Path of the FASTA file with the protein sequences.
        output: Path of the text report to write.

    Returns:
        ``output``, unchanged.
    """
    # accession -> records of prosite.dat. Filled on the first hit so the
    # database is parsed at most once, not once per matching pattern.
    entries_by_accession = None

    with open(input, "r") as input_handle:
        with open(output, "w") as output_handle:
            for record in SeqIO.parse(input_handle, "fasta"):
                prot_id = record.id
                prot = str(record.seq)

                print("\n>" + prot_id + "\n", file=output_handle)

                for pattern, accession in dic.items():
                    if not re.search(pattern, prot):
                        continue

                    if entries_by_accession is None:
                        entries_by_accession = {}
                        with open("prosite.dat", "r") as handle:
                            for entry in Prosite.parse(handle):
                                entries_by_accession.setdefault(
                                    entry.accession, []).append(entry)

                    for entry in entries_by_accession.get(accession, ()):
                        print("\tDominio: " + entry.name + " | " + accession,
                              file=output_handle)
                        print("\tDescripción: " + entry.description,
                              file=output_handle)
                        print("\tPatrón: " + entry.pattern,
                              file=output_handle)

                        for m in re.finditer(pattern, prot):
                            print("\t- " + m.group() + " - Posición: " +
                                  str(m.start()) + " - " + str(m.end()),
                                  file=output_handle)

                        print("", file=output_handle)

    return output
Esempio n. 8
0
def dat_parser(prosite_dat):
    """Map the translated pattern of every PROSITE record to its accession.

    Args:
        prosite_dat: Path of the PROSITE ``.dat`` file.

    Returns:
        dict whose keys are the results of ``pattern_translation`` applied
        to each record's pattern and whose values are the record
        accessions. Records that translate to the same key overwrite each
        other; the last one wins.
    """
    print('\n' + ('Parsing prosite file...').center(80))
    PatternDict = {}
    # The context manager closes the file, which the original never did.
    with open(prosite_dat, "r") as handle:
        for record in Prosite.parse(handle):
            patron = pattern_translation(record.pattern)
            PatternDict[patron] = record.accession

    return PatternDict
Esempio n. 9
0
def donneesProt():
    """Save accession, name, pattern and pdoc of every PROSITE record.

    Reads ``prosite.dat`` from the current directory and writes
    ``prosite_entries.dat``: one comma-separated line per record, each
    terminated by a carriage return and a newline.
    """
    # Both files are closed on exit, including when parsing fails; the
    # original never closed the input handle.
    with open("prosite.dat") as handle, \
            open("prosite_entries.dat", "w") as save_file:  # backup copy
        for record in Prosite.parse(handle):
            save_file.write(record.accession + ',')
            save_file.write(record.name + ',')
            save_file.write(record.pattern + ',')
            save_file.write(record.pdoc + '\r\n')
Esempio n. 10
0
def prosite_to_pandas(input,temp_out):
    """Load accession, type, name and description of PROSITE records.

    The four fields are first appended to ``temp_out`` as a tab-separated
    table (columns ACC, TYPE, NAME, DESCRIPTION), and that file is then
    read back with pandas.

    Args:
        input: Path of the PROSITE ``.dat`` file.
        temp_out: Path of the intermediate table. It is opened in append
            mode, so rows (and the header) from earlier calls with the same
            path are kept.

    Returns:
        The ``pandas.DataFrame`` read from ``temp_out``.
    """
    # One handle for the whole table, closed (and flushed) before pandas
    # reads the file. The original opened a new, never-closed file object
    # for every printed line.
    with open(temp_out, "a") as table:
        print('ACC\tTYPE\tNAME\tDESCRIPTION', file=table)
        with open(input) as handle:
            for prosite_record in Prosite.parse(handle):
                print(prosite_record.accession + '\t' +
                      prosite_record.type + '\t' +
                      prosite_record.name + '\t' +
                      prosite_record.description, file=table)
    pandatab = pd.read_csv(temp_out, sep='\t')
    return pandatab
def finder(folder_name):
    """Compare the PROSITE domains with the proteins of each alignment.

    Every file found in ``./Results/<folder_name>/Domains/tmp`` is read as
    FASTA. For each one a report named after the file (``_tmp`` removed,
    ``_domains`` appended) is appended to in
    ``./Results/<folder_name>/Domains/``: a header line, then one line per
    protein/pattern hit followed by the spans of all matches. Processed
    temporary files are deleted, and the ``tmp`` directory is removed at
    the end.

    Args:
        folder_name: Name of the results sub-folder to process.

    Returns:
        None.
    """
    domains_dir = "./Results/" + folder_name + "/Domains/"
    tmp_dir = domains_dir + "tmp"

    for file in os.listdir(tmp_dir):
        result_path = domains_dir + file.replace("_tmp", "") + "_domains"
        fasta_path = tmp_dir + "/" + file

        with open(result_path, 'a') as f:
            # Header of the results file
            f.write("Domain Name\tDomain Accession\tDomain Description\t"
                    "Domain Pattern\t Protein ID\tPosition\n")

            with open(fasta_path, 'r') as input_handle:
                # Iterate over each protein
                for protein in SeqIO.parse(input_handle, "fasta"):
                    seq_re = str(protein.seq)

                    # Closed after every protein; the original leaked one
                    # handle per protein.
                    with open("./Data/Domain_DB/prosite.dat", "r") as handle:
                        # Iterate over each domain
                        for domain in Prosite.parse(handle):
                            pattern_re = (domain.pattern
                                          .replace(".", "")
                                          .replace("x", ".")
                                          .replace("{", "[^")
                                          .replace("}", "]")
                                          .replace("(", "{")
                                          .replace(")", "}")
                                          .replace("<", "^")
                                          .replace(">", "$")
                                          .replace("-", ""))

                            # Search the pattern in the sequence
                            if pattern_re == "":
                                continue
                            if not re.search(pattern_re, seq_re):
                                continue

                            f.write("\n" + domain.name + "\t"
                                    + domain.accession + "\t"
                                    + domain.description + "\t"
                                    + domain.pattern + "\t"
                                    + protein.id + "\t")
                            for m in re.finditer(pattern_re, seq_re):
                                f.write(str(m.span()))

        os.remove(fasta_path)

    # No trailing f.close(): the files are closed by their context managers,
    # and "f" does not exist when the tmp directory is empty.
    os.rmdir(tmp_dir)

    return
def parse_dat(prositedat, output_file):
    """Write name, accession, description and pattern of PROSITE records.

    ``output_file`` is overwritten with a tab-separated table: a header
    line followed by one line per record of ``prositedat``.

    Args:
        prositedat: Path of the PROSITE ``.dat`` file to read.
        output_file: Path of the table to write.

    Returns:
        The (already closed) file object of ``output_file`` on success.
        ``None`` when anything fails; a message is printed in that case.
    """
    try:
        with open(output_file, 'w') as dominios:
            dominios.write('name\taccession\tdescription\tpattern\n')
            # Context manager so the input is closed even if parsing fails.
            with open(prositedat, 'r') as handle:
                for record in Prosite.parse(handle):
                    dominios.write(record.name + '\t' + record.accession +
                                   '\t' + record.description + '\t' +
                                   record.pattern + '\n')
        return (dominios)
    # Still best-effort, but no longer swallows KeyboardInterrupt or
    # SystemExit the way a bare "except:" does.
    except Exception:
        print('No se ha podido leer el archivo: ' + prositedat +
              '. Abortando módulo...')
Esempio n. 13
0
def Prosite_parser():
    """Report the PROSITE patterns found in every BLAST-hit sequence.

    Each file in ``Results/Blast_Hits/`` is read as alternating lines: a
    header line (its first character is dropped to get the name) followed
    by a sequence line. For every pair a file
    ``Results/Protein_Domains/<name>_domains`` is written with the name,
    accession, description and pattern of each record of
    ``Prosite_DB/prosite.dat`` whose pattern is found in the sequence.

    Returns:
        None.
    """
    substitutions = {
        "-": "",
        "{": "[^",
        "}": "]",
        "(": "{",
        ")": "}",
        "X": ".",
        "x": ".",
        "<": "^",
        ">": "$"
    }

    # Load and translate the database once. Prosite.parse returns a one-shot
    # iterator, so looping over it directly for every sequence (as the
    # original did) leaves all sequences after the first with no records.
    domains = []
    with open("Prosite_DB/prosite.dat", "r") as prosite_handle:
        for record in Prosite.parse(prosite_handle):
            pattern = record.pattern[:-1]  # drop the last character
            if not pattern:
                # Records without a pattern would give an empty regex,
                # which matches every sequence.
                continue
            for old, new in substitutions.items():
                pattern = pattern.replace(old, new)
            domains.append((re.compile(pattern), record))

    os.makedirs("Results/Protein_Domains", exist_ok=True)

    for file in os.listdir("Results/Blast_Hits/"):
        with open('Results/Blast_Hits/' + file, 'r') as file_handle:
            cds_seqs = file_handle.read().splitlines()

        for i in range(0, len(cds_seqs), 2):
            name = cds_seqs[i][1:]
            print("Parsing domains of " + name)
            result_name = "Results/Protein_Domains/" + name + "_domains"
            with open(result_name, "w") as protein:
                sequence = cds_seqs[i + 1]
                for regex, record in domains:
                    if regex.search(sequence):
                        protein.write(
                            "Name: %s\nAccesion: %s\nDescription: %s\nPattern: %s\n\n"
                            % (record.name, record.accession,
                               record.description, record.pattern))
            print("Domains parsed\n")

    return
Esempio n. 14
0
def domainsearch(filename):
    """Search the PROSITE patterns in the proteins of a FASTA file.

    The patterns of ``../prosite.dat`` are translated to regular
    expressions and searched for in every sequence of ``filename``.

    Args:
        filename: FASTA file containing the protein sequences.

    Returns:
        A list with one entry per protein, in file order. Each entry is a
        list of matches ``[protein_id, matched_text, start, end,
        prosite_pattern]`` (empty when nothing matched).
    """
    with open(filename, "r") as handle:
        fasta = list(SeqIO.parse(handle, "fasta"))

    # (PROSITE token, regex token), applied in this order.
    translation = (
        (".", ""), ("x", "."), ("X", "."), ("-", ""),
        ("{", "[^"), ("}", "]"), ("<", "^"), (">", "$"),
        ("(", "{"), (")", "}"),
    )

    # (compiled regex, original PROSITE pattern) for records with a pattern.
    patterns = []
    with open("../prosite.dat", "r") as handle:  # closed after parsing
        for record in Prosite.parse(handle):
            if not record.pattern:
                continue
            regex = record.pattern
            for old, new in translation:
                regex = regex.replace(old, new)
            patterns.append((re.compile(regex), record.pattern))

    results = []
    for protein in fasta:
        sequence = str(protein.seq)
        protresults = []
        for regex, propattern in patterns:
            for m in regex.finditer(sequence):
                protresults.append(
                    [protein.id, m.group(), m.start(), m.end(), propattern])
        results.append(protresults)

    return results
Esempio n. 15
0
def output_results(prosite_dat, ResultDict, Results_Dir):
    """Write a per-protein report of the PROSITE domains found.

    Args:
        prosite_dat: Path of the PROSITE ``.dat`` file.
        ResultDict: dict mapping a protein identifier (str) to an iterable
            of PROSITE accessions found in that protein.
        Results_Dir: Prefix of the output path; the report is written to
            ``Results_Dir + 'prosite_result.txt'``.

    Returns:
        None.
    """
    # accession -> records of prosite_dat, built on first use so the
    # database is parsed once rather than once per reported domain.
    records_by_accession = None

    # The context manager guarantees the report is flushed and closed.
    with open(Results_Dir + 'prosite_result.txt', "w") as output:
        for protein, dominios in ResultDict.items():
            output.write('Protein ' + protein +
                         ' has the following domains:\n\n')
            for dominio in dominios:
                if records_by_accession is None:
                    records_by_accession = {}
                    with open(prosite_dat, "r") as handle:
                        for record in Prosite.parse(handle):
                            records_by_accession.setdefault(
                                record.accession, []).append(record)

                for record in records_by_accession.get(dominio, ()):
                    output.write("\tDomain name: " + record.name + '\n')
                    output.write("\tDomain accession: " + record.accession +
                                 '\n')
                    output.write("\tDomain description: " +
                                 record.description + '\n')
                    output.write("\tPattern found: " + record.pattern +
                                 '\n\n')

    return
Esempio n. 16
0
def Parsear_prosite(folder_result):
    """Export name, accession, description and pattern of PROSITE records.

    Reads ``prosite.dat`` and writes ``prosite_parser.tsv`` (both in the
    current directory): a header line followed by one tab-separated line
    per record. ``folder_result`` is not used.
    """
    header = "\t".join(("Name", "Accesion", "Desctiption", "Pattern")) + "\n"

    with open("prosite.dat", "r") as data_base:
        with open("prosite_parser.tsv", "w") as result_file:
            result_file.write(header)

            for entry in Prosite.parse(data_base):
                columns = (entry.name, entry.accession, entry.description,
                           entry.pattern)
                result_file.write("\t".join(columns) + "\n")
Esempio n. 17
0
def Parsing_prosite(handle="prosite.dat"):
    """Export name, accession, description and pattern of PROSITE records.

    Writes ``prosite_parsed.tsv`` in the current directory: a header line
    followed by one tab-separated line per record.

    Args:
        handle: Path of the PROSITE ``.dat`` file, or an already open
            file-like object. The original ignored this argument and always
            read "prosite.dat"; the default keeps that behaviour. A file
            object passed in is left open for the caller to close.
    """
    opened_here = not hasattr(handle, "read")
    source = open(handle, "r") if opened_here else handle
    try:
        with open("prosite_parsed.tsv", "w") as parsed_out:
            parsed_out.write("NAME\tACCESSION\tDESCRIPTION\tPATTERN\n")

            for record in Prosite.parse(source):
                parsed_out.write("%s\t%s\t%s\t%s\n" % (
                    record.name,
                    record.accession,
                    record.description,
                    record.pattern))
    finally:
        # Only close what this function opened.
        if opened_here:
            source.close()
Esempio n. 18
0
def parsear():
    """
    Parse prosite.dat and extract name, accession, description and pattern.

    Makes sure the directory ``results/prosite`` exists, then writes one
    tab-separated line per record to ``results/prositedatabase``.
    """
    inpfile = 'prosite.dat'
    path1 = "results/prosite"
    # Creates missing parent directories too and does nothing when the
    # directory already exists (replaces os.stat + bare except + os.mkdir).
    os.makedirs(path1, exist_ok=True)

    # NOTE(review): path1 + "database" has no separator, so the file lands
    # next to the directory just created rather than inside it. Kept as is
    # because other code may read this exact path; confirm the intent.
    with open(path1 + "database", "w") as out, \
            open(inpfile, "r") as handle:
        for record in Prosite.parse(handle):
            out.write(record.name + "\t")
            out.write(record.accession + "\t")
            out.write(record.description + "\t")
            out.write(record.pattern + "\n")
def encuentra(filtrado):
    """Collect the PROSITE record fields and hand them to ``busca``.

    Reads ``prosite.dat`` from the current directory into four parallel
    lists (pattern, name, accession, description), translates every pattern
    to ``re`` syntax and calls ``busca`` with them.

    Args:
        filtrado: Passed through unchanged as the first argument of
            ``busca``.

    Returns:
        Whatever ``busca(filtrado, patterns, names, accessions,
        descriptions)`` returns.
    """
    # Patterns to search for
    patrones = []
    # Names of the PROSITE domains
    nombres_prosite = []
    # Descriptions
    descripcion = []
    # Accessions
    accesion = []
    with open("prosite.dat", "r") as handle:
        for record in Prosite.parse(handle):
            patrones.append(record.pattern)
            nombres_prosite.append(record.name)
            accesion.append(record.accession)
            descripcion.append(record.description)

    # Rewrite the patterns so they can be searched with re. Order matters:
    # "{...}" must become "[^...]" before "(...)" becomes "{...}".
    translation = (
        ("x", "."), ("X", "."),
        ("{", "[^"), ("}", "]"),
        ("(", "{"), (")", "}"),
        ("<", "^"), (">", "$"),
        ("-", ""),
    )
    for index, prosite_pattern in enumerate(patrones):
        regex = prosite_pattern[:-1]  # drop the last character
        for old, new in translation:
            regex = regex.replace(old, new)
        # Always stored: the original only saved the value when it met a
        # "-", losing whatever was translated after the last one.
        patrones[index] = regex

    return busca(filtrado, patrones, nombres_prosite, accesion, descripcion)
Esempio n. 20
0
def domaininfo(keydomains):
    """Extend PROSITE domain matches with record and documentation data.

    Args:
        keydomains: A list (one entry per protein) of lists of matches.
            Each match is a list whose last item is the PROSITE pattern
            that produced it. The index ``5`` used below assumes five
            items per match on input; verify against the caller.

    For every match whose pattern equals that of a record of
    ``../prosite.dat``, the record's accession, name and description are
    appended, followed by the text of every ``../prosite.doc`` entry whose
    ``prosite_refs`` mention that accession.

    Returns:
        ``keydomains``, modified in place.
    """
    with open("../prosite.dat", "r") as dat_handle:
        for record in Prosite.parse(dat_handle):
            for protein_matches in keydomains:
                for match in protein_matches:
                    if record.pattern != match[-1]:
                        continue
                    match.append(record.accession)
                    match.append(record.name)
                    match.append(record.description)
                    # Separate, properly closed handle; the original
                    # re-bound "handle" and never closed either file.
                    with open("../prosite.doc") as doc_handle:
                        for info in Prodoc.parse(doc_handle):
                            if str(match[5]) in str(info.prosite_refs):
                                match.append(info.text)

    return keydomains
Esempio n. 21
0
def findDomains(multifasta, output = '') :
    """Search PROSITE domains in every protein of a multi-FASTA file.

    Args:
        multifasta: Path of the file with all the proteins to scan.
        output: Name of the query; the report is written to
            ``results/prosite/dominios_<output>.txt``.

    For each header line the protein title is written. Each other line is
    searched on its own for every pattern of ``prosite.dat`` (translated by
    ``repl``), and the hits are listed. The report ends with the
    ``prosite.doc`` text of every entry whose first PROSITE reference is
    one of the accessions found.

    Returns:
        None.
    """
    # Accessions of all domains found; a set gives de-duplication and fast
    # membership tests for the prodoc lookup below.
    accession = set()

    with open(multifasta, 'r') as file:
        # makedirs also creates "results" when missing.
        os.makedirs('results/prosite', exist_ok=True)
        # One report per query
        output_file = str('results/prosite/dominios_' + output + '.txt')

        with open(output_file, 'w') as result:
            for line in file:
                if line.startswith('>'):
                    result.write('*************************************************************************************************************'+'\n')
                    # Title: name of the protein
                    result.write(line.replace('>', '') + '\n')
                    continue

                with open('prosite.dat', 'r') as handle:
                    for record in Prosite.parse(handle):
                        patron = repl(record.pattern)  # translate pattern
                        # Only when a pattern exists and is found
                        if len(patron) != 0 and re.search(patron, line):
                            result.write('Patron: ' + record.pattern + '\nName: ' + record.name + '\nAccession: ' + record.accession + '\nDescription: ' + record.description + '\n\n')
                            # Needed later to look the entry up in prodoc
                            accession.add(record.accession)

            result.write('\n\n\n\nINFORMACION DE LOS DOMINIOS\n\n')
            with open('prosite.doc', 'r') as handle:
                for record in Prodoc.parse(handle):
                    # The entry's first PROSITE reference was found above
                    if len(record.prosite_refs) != 0 and record.prosite_refs[0][0] in accession:
                        result.write(record.text + '\n\nAccession prodoc: ' + record.accession + '\nAccession prosite: ' + record.prosite_refs[0][0] + '\n')
                        result.write('**************************************************************************************\n\n\n')
def dataparse(Contador):
    """Search PROSITE domains in every BLAST-hit sequence.

    Reads ``blast_hits<Contador>.fasta`` and writes
    ``domain_prosite<Contador>``. For each sequence its name is written,
    followed by the name, accession, description and translated pattern of
    every ``prosite.dat`` record whose pattern (translated by
    ``correct_pattern``) is found in it.

    Args:
        Contador: Value formatted into the input and output file names.

    Returns:
        None.
    """
    print("\nEsta parte puede tardar un tiempo, por favor espere")

    with open("blast_hits{}.fasta".format(Contador), "r") as input_handle, \
            open("domain_prosite{}".format(Contador), "w") as output_handle:
        for record in SeqIO.parse(input_handle, "fasta"):
            sequence = str(record.seq)
            output_handle.write("\n" + record.name + "\n" + "\n")
            # Closed after each sequence; the original leaked one handle
            # per sequence.
            with open("prosite.dat", "r") as handle:
                for hey in Prosite.parse(handle):
                    Final = correct_pattern(str(hey.pattern))
                    # Test for an empty pattern first so it is never searched.
                    if Final != "" and re.search(Final, sequence):
                        output_handle.write("name:" + hey.name + "\n")
                        output_handle.write("accession:" + hey.accession +
                                            "\n")
                        output_handle.write("description:" +
                                            hey.description + "\n")
                        output_handle.write("pattern:" + Final + "\n" + "\n")
Esempio n. 23
0
    def busqueda_dom(sec_prot, id_prot, prosite_dat, name_query):
        """Check a protein sequence against the patterns of prosite.dat.

        Every record of ``prosite_dat`` that has a pattern is converted to
        ``re`` syntax and searched for in ``sec_prot``. Each hit is printed
        as a table row and appended to
        ``<name_query>_dominios_proteicos.tsv``.

        Args:
            sec_prot: String with the protein sequence to analyse.
            id_prot: String with the identifier of the protein.
            prosite_dat: Path of the prosite.dat file.
            name_query: Name of the query being analysed; prefix of the
                output file name.

        Side effects:
            - Appends a header line plus one line per hit to the TSV file.
              Columns: protein id, domain name, domain accession, PDOC
              accession, domain description, PROSITE pattern, regex.
            - Appends the PDOC accession of each hit to ``lista_hits``, a
              list defined outside this function, skipping duplicates.

        Returns:
            None.
        """

        def patron_RE(pPRO):
            """Translate a PROSITE pattern into ``re`` module syntax.

            Args:
                pPRO: String with a pattern in PROSITE format.

            Returns:
                String with the equivalent regular expression.
            """
            # Applied in order. ">]" has to come before the bare ">" rule;
            # with ">" first (as in the original) ">]" was already "$]" and
            # its own rule could never fire.
            reglas = (
                ("-", ""),
                (".", ""),
                ("x", "."),
                ("{", "[^"),
                ("}", "]"),
                ("(", "{"),
                (")", "}"),
                ("<", "^"),
                (">]", "]?$"),
                (">", "$"),
            )
            pRE = pPRO
            for prosite_token, re_token in reglas:
                pRE = pRE.replace(prosite_token, re_token)

            return pRE

        # Open the .dat file and the output TSV (append mode); both are
        # closed on exit, including on error.
        with open(prosite_dat, "r", encoding="utf8") as dat, \
                open(name_query + "_dominios_proteicos.tsv",
                     "a",
                     encoding="utf8") as output:

            # Print the table header on screen
            print()
            print(
                "_\tPROTEÍNA_ID\tNOMBRE_DOMINIO\tACCESSION_DOMINIO\tPDOC_ACCESSION\tDESCRIPCION_DOMINIO\tPATRON_Prosite\tPATRON_Re"
            )
            print()

            # Write the table header to the output file
            output.write(
                "PROTEÍNA_ID\tNOMBRE_DOMINIO\tACCESSION_DOMINIO\tPDOC_ACCESSION\tDESCRIPCION_DOMINIO\tPATRON_Prosite\tPATRON_Re\n"
            )

            # Row counter for the table printed on screen
            contador_hits = 0

            # Walk the .dat file looking at every existing pattern
            for dom in Prosite.parse(dat):
                # Skip records without a pattern
                if len(dom.pattern) == 0:
                    continue

                pRE = patron_RE(dom.pattern)
                if not re.search(pRE, sec_prot):
                    continue

                name_dom = dom.name
                accession_dom = dom.accession
                accession_pdoc = dom.pdoc
                descrip_dom = dom.description
                ProPattern = dom.pattern

                contador_hits += 1

                # Values to look up later in the .doc file. The membership
                # test uses the PDOC accession, the value actually stored;
                # the original tested the domain accession, so duplicates
                # were never detected.
                if accession_pdoc not in lista_hits:
                    lista_hits.append(accession_pdoc)

                # Print the table row on screen
                print(
                    str(contador_hits) + "\t" + id_prot + "\t" + name_dom +
                    "\t" + accession_dom + "\t" + accession_pdoc + "\t" +
                    descrip_dom + "\t" + ProPattern + "\t" + pRE)
                print()
                output.write(id_prot + "\t" + name_dom + "\t" +
                             accession_dom + "\t" + accession_pdoc + "\t" +
                             descrip_dom + "\t" + ProPattern + "\t" + pRE +
                             "\n")
Esempio n. 24
0
def domain_parser():
    """Search the PROSITE domains of each hit in blast_hits.fasta.

    For every sequence a table (name, accession, description, pattern,
    repetitions) of the matching ``prosite.dat`` records is written to
    ``domains_hits.txt``. The texts returned by ``DocParser`` for the
    detected PDOC accessions are written after the tables.

    Returns:
        A tuple ``(domains_dict, max_seq_len, accession_list)``:
        ``domains_dict`` maps a running counter (from 1) to
        ``[hit id, hit length, domain name, match start, match end]``,
        ``max_seq_len`` is the length of the longest hit and
        ``accession_list`` holds the distinct ``pdoc`` values of the
        records found.
    """
    input_handle = "blast_hits.fasta"
    # File with: (1) the domains of each hit and (2) domains information
    domains = "domains_hits.txt"
    # Wildcard replacement: any of the twenty standard amino acids
    AAS = "[ARNDCQEGHILKMFPSTWYV]"

    # List of prosite.doc accessions of the domains that had been found
    accession_list = []
    domains_dict = dict()  # dictionary that saves matches
    count = 1
    max_seq_len = 0  # Keep larger sequence to plot x-axe

    # The context manager guarantees the report is flushed and closed.
    with open(domains, "w") as output_handle:
        output_handle.write(
            "#This file contains the domains of each hit.\n#At the bottom, you will find detail information of all the domains detected.\n"
        )
        output_handle.write(
            "#We strongly recommend to open this file with Visual Studio Code.\n#Because when the names of the domains are too large, in regular editors the table looks awful.\n"
        )
        output_handle.write(
            "#Here it is only showed how many times a pattern is present.\n#In the figure of the domains you will find the position of each domain.\n\n"
        )

        # Loop to go through hits
        for seq_record in SeqIO.parse(input_handle, "fasta"):
            output_handle.write(str(seq_record.id) + "\n")
            output_handle.write(str(seq_record.seq) + "\n")
            if len(seq_record.seq) > max_seq_len:
                max_seq_len = len(seq_record.seq)
            target = str(seq_record.seq).upper()

            # One table per hit
            x = PrettyTable()
            x.field_names = [
                "name", "accession", "description", "pattern", "repetitions"
            ]

            # Loop to go through prosite domains; the handle is closed
            # after every hit instead of being leaked.
            with open("prosite.dat", "r") as handle:
                for record in Prosite.parse(handle):
                    # The pattern is upper-cased first, so only "X" can
                    # occur as a wildcard below.
                    pattern = record.pattern.upper()
                    # {} -> [^]
                    pattern = pattern.replace("{", "[^")
                    pattern = pattern.replace("}", "]")
                    # - -> "" and . -> ""
                    pattern = pattern.replace("-", "")
                    pattern = pattern.replace(".", "")
                    # X -> any amino acid
                    pattern = pattern.replace("X", AAS)
                    # () -> {}
                    pattern = pattern.replace("(", "{")
                    pattern = pattern.replace(")", "}")
                    # >] -> ]?$  (must precede the bare ">" rule)
                    pattern = pattern.replace(">]", "]?$")
                    #  <  -> ^   and   >  -> $
                    pattern = pattern.replace("<", "^")
                    pattern = pattern.replace(">", "$")
                    if pattern == "":
                        continue

                    # Single scan: collect all matches, skip when none.
                    matches = list(re.finditer(pattern, target))
                    if not matches:
                        continue

                    if record.pdoc not in accession_list:
                        accession_list.append(record.pdoc)

                    # Save all matches in a dictionary to plot them later
                    for match in matches:
                        domains_dict[count] = [
                            seq_record.id,
                            len(seq_record.seq), record.name,
                            match.start(),
                            match.end()
                        ]
                        count = count + 1

                    # Add found domain to table
                    x.add_row([
                        record.name, record.accession, record.description,
                        record.pattern, len(matches)
                    ])

            # Add table of hit to domains_hits.txt
            output_handle.write(str(x) + "\n")

        # After the tables, write the information of all domains found
        output_handle.write("\n")
        record_text_list = DocParser(accession_list)
        for text in record_text_list:
            output_handle.write(text)

    return (domains_dict, max_seq_len, accession_list)
Esempio n. 25
0
from Bio.ExPASy import Prosite
from urllib.request import urlopen
# Demo script: walk through the records of a local prosite.dat with the
# Biopython Prosite parser. The file is opened in a context manager so the
# handle is closed even if one of the next() calls raises.
with open("/home/koreanraichu/prosite.dat") as handle:
    records = Prosite.parse(handle)

    # Fetch a single record.
    record = next(records)
    print(record.accession)

    # Fetch the next five records with a for loop.
    for _ in range(5):
        record = next(records)
        print(record.accession)

    # The same thing again, this time with a while loop.
    i = 0
    while i < 5:
        record = next(records)
        print(record.accession)
        i = i + 1

    # NOTE: all three snippets share one iterator, so run together they read
    # record 1, then records 2-6, then records 7-11. Run each snippet on its
    # own to see the same records every time.

    # Count the records that are still left in the iterator (everything
    # after the eleven consumed above).
    n = 0
    for record in records:
        n += 1
    print(n)
Esempio n. 26
0
def make_domain(result_path):
    '''
    Write the PROSITE domains found in the filtered protein multifasta.

    Input:
        - result_path: directory that contains the input multifasta
                       (MultifaFiltered.fasta) and where the result
                       (Domains.txt) is written

    Output: Domains.txt, holding for each protein a header followed by one
            tab-separated row per matching domain:
            "domain name, accession, description, matched sequence".
            Only the first match of each pattern (re.search) is reported.

            For the later domain plot, a file Temporal/<record id> is also
            written per protein with the rows
            "domain name, match start, match end, sequence length".
            Proteins without any hit do not keep their temporary file.

    Always returns None.
    '''
    # Temporary folder for the per-protein plot data. exist_ok avoids a
    # FileExistsError when the function is run again in the same directory.
    os.makedirs('Temporal', exist_ok=True)

    # The Prosite parser is a one-shot iterator, so instead of re-opening and
    # re-parsing prosite.dat for every protein, read it once and keep only
    # what is needed: the descriptive fields plus the pattern, already
    # adapted to `re` syntax and compiled. Records without a pattern are
    # skipped.
    with open('prosite.dat', 'r') as prosite_file:
        domains = [
            (domain.name, domain.accession, domain.description,
             re.compile(adapt_pattern(domain.pattern)))
            for domain in Prosite.parse(prosite_file)
            if domain.pattern
        ]

    # Open the output file and the multifasta that is parsed.
    with open(result_path+'/Domains.txt', 'w') as output:
        with open(result_path+'/MultifaFiltered.fasta','r') as handle:
            for record in SeqIO.parse(handle, 'fasta'):
                sequence = str(record.seq)
                temporal_path = 'Temporal/'+record.id
                # Assume no pattern matches until one does; a protein
                # without hits gets its temporary file removed below.
                has_matched = False

                # Header of this protein in the domains file.
                output.write('>'+record.id+"\n-------------\n")
                output.write('Nombre dominio\tAccesión\tDescripción\tSecuencia\n')

                with open(temporal_path, 'w') as temporal_output:
                    for name, accession, description, regex in domains:
                        match = regex.search(sequence)
                        if not match:
                            continue
                        # On a hit, write the row to the result file and
                        # the coordinates to the temporary plot file.
                        output.write(name+'\t'+accession+'\t'+description
                                     +'\t'+match.group()+'\n')
                        temporal_output.write(name+'\t'+str(match.start())
                                              +'\t'+str(match.end())+'\t'
                                              +str(len(sequence))+'\n')
                        has_matched = True

                    # Blank line separating this protein from the next one.
                    output.write('\n')

                # The temporary file is closed at this point; drop it if the
                # protein matched nothing.
                if not has_matched:
                    os.remove(temporal_path)

    return None
Esempio n. 27
0
def _prosite_to_regex(pattern):
    """Translate a PROSITE pattern string into an equivalent ``re`` pattern."""
    # Order matters: the trailing '.' and the '-' separators are dropped
    # before 'x' becomes '.', and '{...}' (exclusion) becomes '[^...]' before
    # '(...)' (repetition) becomes '{...}'.
    for old, new in ((".", ""), ("-", ""), ("x", "."), ("X", "."),
                     ("{", "[^"), ("}", "]"), ("(", "{"), (")", "}")):
        pattern = pattern.replace(old, new)
    # A '>' inside square brackets (e.g. '[G>]') means "one of these residues
    # OR the C-terminus". Turning it into '$' inside a character class would
    # match a literal '$', so expand it to an alternation instead.
    pattern = re.sub(r"\[([^\[\]]+)>\]", r"(?:[\1]|$)", pattern)
    # Remaining anchors: '<' is the N-terminus, '>' the C-terminus.
    return pattern.replace("<", "^").replace(">", "$")


def _load_prosite_patterns(path):
    """Parse *path* once; return (record, compiled regex) pairs for records that have a pattern."""
    with open(path, "r") as handle:
        return [(record, re.compile(_prosite_to_regex(record.pattern)))
                for record in Prosite.parse(handle)
                if record.pattern != ""]


def patfinder(fasta, output):
    """Search every sequence of a FASTA file for PROSITE pattern hits.

    fasta: name of the FASTA file with the sequences to search domains on (str).
    output: name of the file where the results will be stored (str).

    For each sequence the screen is cleared, every pattern of ``prosite.dat``
    (read from the current directory) is searched, and the hits are printed
    and appended to ``output`` together with their start position and matched
    text. After each sequence the user is prompted; answering "Y" prints the
    ``prosite.doc`` entries whose references mention the found accessions.

    Returns None. The function is interactive: it blocks on ``input()``.
    """

    for x in range(0, 4):
        print(">> Starting Prosite pattern search " + "." * x, end="\r")
        sleep(0.2)
    print()

    # Count number of proteins to analyze (one '>' header line per protein).
    with open(fasta, "r") as file:
        nseqs = sum(1 for line in file if line.startswith(">"))

    print("\n> %d proteins to analyze." % nseqs)
    print(
        "\nThe results will be shown sequence by sequence. Each query sequence (name ended by _QUERYSEQ) will be followed by its blastp hits."
    )
    print(
        "On the results folder, you'll find also a txt file with all the results, including also the exact sequence of the domains on the proteins and their position."
    )
    print(
        "You'll be given also the option to see further information of the found domains of your choice."
    )
    input("\nPRESS ENTER TO CONTINUE\n")

    # prosite.dat does not change between sequences: parse it and compile
    # its patterns once instead of once per sequence.
    patterns = _load_prosite_patterns("prosite.dat")

    # Both files are managed by the with-statement, so the output file is
    # flushed and closed even if the search is interrupted by an error.
    with open(output, "w") as out_file, open(fasta, "r") as seqs_handle:
        for j, seq_record in enumerate(SeqIO.parse(seqs_handle, "fasta"),
                                       start=1):
            call('clear')
            seq, seq_id = str(seq_record.seq), seq_record.id
            print("\n---------------------------------------")
            print(">> (%d of %d) Prosite domains on sequence %s" %
                  (j, nseqs, seq_id))
            print("-----------------------------------------")
            out_file.write("\n\n>> Prosite domains on sequence %s" % seq_id)

            total = 0  # number of distinct domains with at least one hit
            results = []  # accessions of those domains
            for record, regex in patterns:
                hits = list(regex.finditer(seq))
                if not hits:
                    continue

                # Show found domains.
                total += 1
                print("\n> Found %d hits for domain %s." %
                      (len(hits), record.name))
                out_file.write("\n> Found %d hits for domain %s:\n" %
                               (len(hits), record.name))
                out_file.write("Pos\tHit sequence\n")
                out_file.write("---\t------------\n")
                for hit in hits:
                    out_file.write("%s\t%s\n" % (hit.start(), hit.group()))

                results.append(record.accession)
                print("Domain accesion id: %s" % record.accession)
                print("Description: %s" % record.description)
                print("Pattern: %s" % record.pattern)
                out_file.write("Domain accesion id: %s\n" % record.accession)
                out_file.write("Description: %s\n" % record.description)
                out_file.write("Pattern: %s\n" % record.pattern)

            if total == 0:
                print("No domains found for this protein.")
                out_file.write("No domains found for this protein.")
                print("\n----------------------------")
                input("PRESS ENTER TO CONTINUE\n")
                continue

            print("\nTotal: %d different domains found.\n" % total)
            out_file.write(
                "\n\nTotal: %d different domains found.\n\n" % total)
            print(
                "\n---------------------------------------------------------"
            )
            print(
                "If you want further information of these domains, press Y."
            )
            print("Press ENTER or any other key to continue.")
            selection = input("> ")
            if selection.upper() == "Y":
                # Parse prosite.doc for further information.
                with open("prosite.doc", "r") as doc_handle:
                    for doc_record in Prodoc.parse(doc_handle):
                        # Substring test against the textual form of the
                        # entry's PROSITE references.
                        refs_text = str(doc_record.prosite_refs)
                        for accession in results:
                            if accession in refs_text:
                                print("> %s domain." % accession)
                                print(doc_record.text)
                    print("\n----------------------------")
                    input("PRESS ENTER TO CONTINUE\n")