Exemple #1
0
def ubiquitin(filepath, fts):
    """Extract ubiquitin-linked feature positions from a flat file into MongoDB.

    The file is read line by line.  Lines are dispatched on their first
    whitespace-separated token: ``AC`` (accession numbers), ``FT`` (feature
    lines) and ``//`` (end of record).  At every ``//`` one document is saved
    through the handle returned by
    ``functions.connectMongoDB('uniprot', 'ubiquitinTable')``; it holds the
    key ``'ac'`` plus one key per feature that was seen at least once, mapped
    to the flattened list of its positions.

    Args:
        filepath: Path of the flat file to parse.
        fts: Dict mapping a feature description (all whitespace and dots
            removed) to a list.  It seeds the accumulator for the first
            record only and is copied, so the caller's dict is left
            untouched.  Every later record uses the built-in set of four
            ubiquitin descriptions.

    Returns:
        None.  Results are written to the database only.
    """
    table = functions.connectMongoDB('uniprot', 'ubiquitinTable')

    # Descriptions that are cut at the end of an FT line and finished on the
    # following FT line; seeing one of these means "wait for the next line".
    wrapped_prefixes = (
        'Glycyllysineisopeptide(Lys-Gly)',
        'Peptide(Met-Gly)(interchainwithG-Cter',
        'Glycylserineester(Ser-Gly)',
        'Glycylcysteinethioester(Cys-Gly)',
    )

    # Private copy: the accumulator is appended to below and must not leak
    # first-record positions back into the caller's dict.
    fts = {name: list(positions) for name, positions in fts.items()}

    accessions = []
    ac_seen = False
    position = []
    record = {}
    awaiting_continuation = False
    ft = ''

    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            fields = data[0].split(" ")
            tag = fields[0]

            if tag == "AC" and not ac_seen:
                # Only the first AC line of a record is used.
                ac_seen = True
                accessions.append(fields[1])
                # data[-1] is whatever follows the last ';' and is skipped.
                # Secondary accessions carry the blank that followed the
                # previous ';', hence the strip().
                accessions.extend(item.strip() for item in data[1:-1])
                record = {'ac': accessions}
            elif tag == "FT":
                if len(fields) > 4 and not awaiting_continuation:
                    ft = ''.join(fields[4:]).replace('.', '')
                    # fields[2] / fields[3] are presumably the start and end
                    # coordinates of the feature -- TODO confirm.
                    position = functions.remove_duplicates(
                        [fields[2], fields[3]])
                    if ft in wrapped_prefixes:
                        awaiting_continuation = True
                        continue
                    if ft in fts:
                        fts[ft].append(position)
                        position = []
                elif awaiting_continuation:
                    # Glue the continuation text onto the truncated
                    # description before looking it up.
                    ft = (ft + ''.join(fields[1:])).replace('.', '')
                    if ft in fts:
                        fts[ft].append(position)
                        position = []
                    awaiting_continuation = False
            elif tag == '//':
                # Drop features that never occurred and flatten each list of
                # position lists into a single flat list.
                found = {
                    name: list(itertools.chain.from_iterable(positions))
                    for name, positions in fts.items() if positions
                }
                record = functions.merge_two_dicts(record, found)
                # NOTE(review): if `table` is a PyMongo collection, save()
                # was removed in PyMongo 4 -- confirm the driver version.
                table.save(record)

                # Fresh accumulator and parser state for the next record.
                fts = {
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Glycylserineester(Ser-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Peptide(Met-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Glycylcysteinethioester(Cys-Gly)(interchainwithG-Cterinubiquitin)': [],
                }
                accessions = []
                ac_seen = False
                position = []
                record = {}
                # A wrapped description whose continuation never arrived must
                # not swallow the first FT line of the next record.
                awaiting_continuation = False
Exemple #2
0
    def segmentate_next_artwork_data(self):
        """Segment every pending artwork description and index the segments.

        Selects all rows of ``processed_artworks`` with ``segmentated=0``.
        For each row whose pickle file
        ``descriptions_processed_mp/<id>.pickle`` exists, the two stored
        descriptions (``description_mp`` and ``description_wikipedia``) are
        segmented with ``self.segmentate`` and merged.  Each resulting
        segment is sent to Solr with ``save_solr_registry``, together with
        the artwork metadata, and the row is flagged ``segmentated=1``.

        Rows without a pickle file are skipped and stay pending.

        Returns:
            None.
        """
        query = "SELECT id,id_wikidata,id_wikipedia,id_museodelprado FROM processed_artworks WHERE segmentated=0"
        artworks = MysqlND.execute_query(query, ())
        for artwork in artworks:
            # Local names avoid shadowing the builtins `id` and `file`.
            artwork_id = artwork[0]
            id_wikidata = artwork[1]
            id_wikipedia = artwork[2]
            id_museodelprado = artwork[3]

            pickle_path = 'descriptions_processed_mp/' + str(artwork_id) + ".pickle"
            print(pickle_path)
            if not exists_pickle(pickle_path):
                continue

            descriptions = get_pickle(pickle_path)
            segments_mp = self.segmentate(
                descriptions['description_mp'], artwork_id, 'mp')
            segments_wikipedia = self.segmentate(
                descriptions['description_wikipedia'], artwork_id, 'wp')
            segmentated = merge_two_dicts(segments_mp, segments_wikipedia)

            dict_solr = {
                'id_wikidata': id_wikidata,
                'id_wikipedia': id_wikipedia,
                'id_museodelprado': id_museodelprado
            }
            data_artwork = self.get_solr_artwork_data(artwork_id)
            dict_solr = self.process_metadata_to_dict(
                artwork_id, dict_solr, data_artwork)

            # .items() instead of .iteritems(): the latter does not exist on
            # Python 3, while .items() iterates the same pairs on 2 and 3.
            for key, value in segmentated.items():
                print(key + ": " + value)
                dict_solr['id'] = key
                dict_solr['text'] = value

                # dict_solr is reused for every segment, so the per-segment
                # lists are emptied before they are filled again.
                dict_solr['list_artworks_segment'] = []
                dict_solr['list_references_segment'] = []
                dict_solr['list_characters_segment'] = []
                dict_solr['list_events_segment'] = []

                dict_solr = self.process_text_to_dict_narrative_elements(
                    value, dict_solr)
                save_solr_registry(
                    dict_solr, core_solr='http://localhost:8983/solr/TFM')
                # NOTE(review): this UPDATE runs once per segment; kept here
                # so an artwork with no segments stays pending, as before.
                MysqlND.execute_query(
                    "UPDATE processed_artworks SET segmentated=1 WHERE id="
                    + str(artwork_id), ())
def tableGeneration(filepath, fts):
    """Build one MongoDB document per flat-file record (id, ac, PTMs, sequence).

    The file is read line by line and dispatched on the first
    whitespace-separated token of each line: ``ID``, ``AC``, ``FT``, ``SQ``
    and the record terminator ``//``.  At every ``//`` a document is saved
    through the handle returned by
    ``functions.connectMongoDB('uniprot', 'table')``.  It contains ``'_id'``
    (second token of the first ID line), ``'ac'`` (accessions of the first AC
    line), one key per feature that occurred (flattened position list) and
    ``'sequence'`` (the sequence text with all blanks removed).

    Args:
        filepath: Path of the flat file to parse.
        fts: Dict mapping a feature description (all whitespace and dots
            removed) to a list.  It seeds the accumulator for the first
            record only and is copied, so the caller's dict is left
            untouched.  Every later record uses the built-in key set.

    Returns:
        None.  Results are written to the database only.
    """
    table = functions.connectMongoDB('uniprot', 'table')

    # Private copy: positions are appended below and must not leak back into
    # the caller's dict.
    fts = {name: list(positions) for name, positions in fts.items()}

    id_seen = False
    ac_seen = False
    in_sequence = False
    accessions = []
    position = []
    record = {}
    awaiting_continuation = False
    ft = ''
    sequence_parts = []

    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            fields = data[0].split(" ")
            tag = fields[0]

            if tag == '//':
                # Drop features that never occurred and flatten each list of
                # position lists into a single flat list.
                found = {
                    name: list(itertools.chain.from_iterable(positions))
                    for name, positions in fts.items() if positions
                }
                record = functions.merge_two_dicts(record, found)
                # `collapsed` lines only contain single blanks as whitespace.
                record['sequence'] = ''.join(sequence_parts).replace(' ', '')
                # NOTE(review): if `table` is a PyMongo collection, save()
                # was removed in PyMongo 4 -- confirm the driver version.
                table.save(record)

                # Fresh accumulator and parser state for the next record.
                fts = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked(GlcNAc)asparagine': [],
                    'S-palmitoylcysteine': [],
                    'Pyrrolidonecarboxylicacid': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)':
                    [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)':
                    [],
                }
                accessions = []
                id_seen = False
                ac_seen = False
                in_sequence = False
                position = []
                record = {}
                sequence_parts = []
                # A wrapped description whose continuation never arrived must
                # not swallow the first FT line of the next record.
                awaiting_continuation = False
            elif in_sequence:
                # Everything between the SQ header and '//' is sequence text.
                # Checking this before the tag dispatch keeps a last line
                # whose first residue group is only 1-2 letters long.
                sequence_parts.append(collapsed)
            elif tag == "ID" and not id_seen:
                id_seen = True
                out_id = fields[1]
            elif tag == "AC" and not ac_seen:
                # Only the first AC line of a record is used; it is expected
                # after the ID line, which binds out_id.
                ac_seen = True
                accessions.append(fields[1])
                # data[-1] is whatever follows the last ';' and is skipped.
                # Secondary accessions carry the blank that followed the
                # previous ';', hence the strip().
                accessions.extend(item.strip() for item in data[1:-1])
                record = {'_id': out_id, 'ac': accessions}
            elif tag == "FT":
                if len(fields) > 4 and not awaiting_continuation:
                    ft = ''.join(fields[4:]).replace('.', '')
                    # fields[2] / fields[3] are presumably the start and end
                    # coordinates of the feature -- TODO confirm.
                    position = functions.remove_duplicates(
                        [fields[2], fields[3]])
                    if ft == 'Glycyllysineisopeptide(Lys-Gly)':
                        # Description is wrapped; it is completed by the
                        # next FT line.
                        awaiting_continuation = True
                        continue
                    if ft in fts:
                        fts[ft].append(position)
                        position = []
                elif awaiting_continuation:
                    ft = (ft + ''.join(fields[1:])).replace('.', '')
                    if ft in fts:
                        fts[ft].append(position)
                        position = []
                    awaiting_continuation = False
            elif tag == "SQ":
                in_sequence = True
            elif len(tag) > 2:
                # Legacy heuristic kept for input without an SQ header: line
                # tags are two characters, so a longer first token is taken
                # to be sequence text.
                sequence_parts.append(collapsed)
Exemple #4
0
def _record_ptm_position(ptms, description, position):
    """Append *position* to ``ptms[key]`` when the cleaned description is a key.

    Cleaning first removes any run of dots directly before a ``)`` and then
    cuts the text at the first ``.``, ``|`` or ``;``.
    """
    cleaned = re.sub(r'(\.*)\)', ')', description)
    key = re.sub(r'[\.|\;].*', '', cleaned)
    if key in ptms:
        ptms[key].append(position)


def tableGeneration(filepath, ptms):
    """Rebuild the 'table' collection from a flat file, one document per record.

    The collection handle comes from
    ``functions.connectMongoDB('uniprot', 'table')`` and is dropped first.
    The file is then read with ``readline`` and dispatched on the first
    whitespace-separated token of each line:

    * ``ID``  -- second token becomes ``'_id'``.
    * ``AC``  -- accession numbers are collected (all AC lines).
    * ``OC``  -- entries are collected as ``'species'``; the document
      skeleton is (re)built on every OC line.
    * ``FT``  -- the whole run of consecutive FT lines is consumed here.
      A line whose third and fourth tokens satisfy ``is_number`` starts a
      new feature; any other FT line is appended to the current
      description.  Completed descriptions that match a key of *ptms* get
      their position recorded.
    * ``SQ``  -- the sequence is read with ``seq_read(fp)``, the document is
      saved and all per-record state is reset.

    Args:
        filepath: Path of the flat file to parse.
        ptms: Dict mapping a PTM description to a list.  It seeds the
            accumulator for the first record only and is copied, so the
            caller's dict is left untouched.  Every later record uses the
            built-in key set.

    Returns:
        None.  Results are written to the database only.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    table.drop()

    # Private copy: positions are appended below and must not leak back into
    # the caller's dict.
    ptms = {name: list(positions) for name, positions in ptms.items()}

    out_id = ""
    out_ac = []
    out_position = []
    out_data = dict()
    check = []

    # The context manager closes the file even when parsing raises.
    with open(filepath) as fp:
        # readline() (not iteration) is required: tell()/seek() are used
        # below to push back the line that ends an FT run.
        line = fp.readline()
        while line:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            info = data[0].split(" ")
            tag = info[0]

            if tag == "ID":
                out_id = info[1]
            elif tag == "AC":
                out_ac.append(info[1])
                # data[-1] is whatever follows the last ';' and is skipped.
                out_ac.extend(item.lstrip() for item in data[1:-1])
            elif tag == "OC":
                check.append(info[1].lstrip())
                check.extend(item.lstrip() for item in data[1:-1])
                out_data = {"_id": out_id, "ac": out_ac, "species": check}
            elif tag == "FT":
                # First FT line of the run: only the text before the first
                # ';' is used for the description.
                out_position = functions.remove_duplicates([info[2], info[3]])
                temp_ptm = " ".join(info[4:])

                prev_fp_pos = fp.tell()
                info = ' '.join(fp.readline().split()).split(" ")
                while info[0] == "FT":
                    if (len(info) > 3 and is_number(info[2])
                            and is_number(info[3])):
                        # A new feature starts: settle the previous one.
                        _record_ptm_position(ptms, temp_ptm, out_position)
                        out_position = functions.remove_duplicates(
                            [info[2], info[3]])
                        temp_ptm = " ".join(info[4:])
                    else:
                        # Continuation line: glued on without a separator.
                        temp_ptm = temp_ptm + " ".join(info[1:])
                    prev_fp_pos = fp.tell()
                    info = ' '.join(fp.readline().split()).split(" ")

                # Settle the last feature of the run.
                _record_ptm_position(ptms, temp_ptm, out_position)
                # Drop PTMs that never occurred and flatten each list of
                # position lists into a single flat list.
                ptms = dict([(k, list(itertools.chain.from_iterable(v)))
                             for k, v in ptms.items() if len(v) > 0])
                # Rewind to the start of the first non-FT line so the outer
                # loop reads it again.
                fp.seek(prev_fp_pos)
            elif tag == "SQ":
                sequence = seq_read(fp)
                out_data = functions.merge_two_dicts(out_data, ptms)
                out_data['sequence'] = sequence
                # NOTE(review): if `table` is a PyMongo collection, save()
                # was removed in PyMongo 4 -- confirm the driver version.
                table.save(out_data)

                # Fresh accumulator and per-record state for the next record.
                ptms = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'Dimethylated arginine': [],
                    'Symmetric dimethylarginine': [],
                    'Asymmetric dimethylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked (GlcNAc) asparagine': [],
                    'S-palmitoyl cysteine': [],
                    'Pyrrolidone carboxylic acid': [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in SUMO)':
                    [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in ubiquitin)':
                    [],
                }
                out_data = {}
                out_ac = []
                out_position = []
                check = []

            line = fp.readline()