def ubiquitin(filepath, fts):
    """Store the ubiquitin-related FT positions of a UniProt-style flat file.

    The file is read line by line.  Each entry ends with a ``//`` line, at
    which point one document is saved to the ``ubiquitinTable`` collection
    of the ``uniprot`` database.  The document holds the accessions of the
    entry's first ``AC`` line under ``'ac'`` plus one key per matched
    feature description, mapped to a flat list of its positions.

    Args:
        filepath: Path of the flat file to parse.
        fts: Dict mapping feature descriptions (written without whitespace
            and without dots) to lists of positions.  It selects the
            features collected for the first entry; every later entry uses
            the built-in ubiquitin feature set.  The dict is copied and
            never modified.

    Returns:
        None.
    """
    table = functions.connectMongoDB('uniprot', 'ubiquitinTable')
    # Descriptions that stop here are completed by the following FT line,
    # so the position is kept and matched once the full text is known.
    specials = [
        'Glycyllysineisopeptide(Lys-Gly)',
        'Peptide(Met-Gly)(interchainwithG-Cter',
        'Glycylserineester(Ser-Gly)',
        'Glycylcysteinethioester(Cys-Gly)',
    ]
    # Work on a private copy: appending to the caller's lists would leak the
    # first entry's positions into any later call that reuses the same dict.
    fts = {key: list(positions) for key, positions in fts.items()}
    ac_flag = 0
    out_ac = []
    out_position = []
    out_data = dict()
    special = 0
    ft = ''
    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            tag = parsed_1[0]
            if tag == "AC" and ac_flag == 0:
                # Only the first AC line of an entry is used.
                ac_flag = 1
                out_ac.append(parsed_1[1])
                # data[-1] is whatever follows the last ';' and is skipped;
                # the accessions in between carry a leading blank.
                out_ac.extend(acc.lstrip() for acc in data[1:-1])
                out_data = {'ac': out_ac}
            elif tag == "FT":
                if len(parsed_1) > 4 and special == 0:
                    ft = ''.join(parsed_1[4:]).replace('.', '')
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft in specials:
                        # Description continues on the next FT line.
                        special = 1
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                elif special == 1:
                    # Continuation line: everything after the FT tag
                    # completes the pending description.
                    ft = (ft + ''.join(parsed_1[1:])).replace('.', '')
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                    special = 0
            elif tag == '//':
                # Flatten the per-feature position lists and drop features
                # that were not found in this entry.
                fts = {
                    key: list(itertools.chain.from_iterable(hits))
                    for key, hits in fts.items() if hits
                }
                out_data = functions.merge_two_dicts(out_data, fts)
                # NOTE(review): save() is the legacy pymongo call; confirm
                # the collection returned by connectMongoDB still offers it.
                table.save(out_data)
                # Reset all per-entry state, including a continuation that
                # was still pending, so nothing leaks into the next entry.
                fts = {
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Glycylserineester(Ser-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Peptide(Met-Gly)(interchainwithG-Cterinubiquitin)': [],
                    'Glycylcysteinethioester(Cys-Gly)(interchainwithG-Cterinubiquitin)': [],
                }
                out_data = dict()
                out_ac = []
                ac_flag = 0
                out_position = []
                special = 0
def segmentate_next_artwork_data(self):
    """Segment the descriptions of all pending artworks and index them.

    Every row of ``processed_artworks`` with ``segmentated=0`` is looked up.
    When a pickle with its processed descriptions exists, both descriptions
    are segmented, each segment is enriched and sent to the Solr core, and
    the row is then flagged with ``segmentated=1``.

    Returns:
        None.
    """
    query = "SELECT id,id_wikidata,id_wikipedia,id_museodelprado FROM processed_artworks WHERE segmentated=0"
    artworks = MysqlND.execute_query(query, ())
    for artwork in artworks:
        artwork_id = artwork[0]
        id_wikidata = artwork[1]
        id_wikipedia = artwork[2]
        id_museodelprado = artwork[3]

        pickle_path = 'descriptions_processed_mp/' + str(artwork_id) + ".pickle"
        print(pickle_path)
        if not exists_pickle(pickle_path):
            continue

        descriptions = get_pickle(pickle_path)
        segments_mp = self.segmentate(
            descriptions['description_mp'], artwork_id, 'mp')
        segments_wp = self.segmentate(
            descriptions['description_wikipedia'], artwork_id, 'wp')
        segmentated = merge_two_dicts(segments_mp, segments_wp)

        dict_solr = {
            'id_wikidata': id_wikidata,
            'id_wikipedia': id_wikipedia,
            'id_museodelprado': id_museodelprado
        }
        data_artwork = self.get_solr_artwork_data(artwork_id)
        dict_solr = self.process_metadata_to_dict(
            artwork_id, dict_solr, data_artwork)

        # items() instead of the Python-2-only iteritems() so the loop runs
        # on both interpreter lines.
        for key, value in segmentated.items():
            print(key + ": " + value)
            # The artwork metadata is shared by all segments; only the
            # segment-specific fields are overwritten on each pass.
            dict_solr['id'] = key
            dict_solr['text'] = value
            dict_solr['list_artworks_segment'] = []
            dict_solr['list_references_segment'] = []
            dict_solr['list_characters_segment'] = []
            dict_solr['list_events_segment'] = []
            dict_solr = self.process_text_to_dict_narrative_elements(
                value, dict_solr)
            save_solr_registry(
                dict_solr, core_solr='http://localhost:8983/solr/TFM')

        # artwork_id comes from the SELECT above, not from external input.
        MysqlND.execute_query(
            "UPDATE processed_artworks SET segmentated=1 WHERE id=" +
            str(artwork_id), ())
def tableGeneration(filepath, fts):
    """Store id, accessions, FT positions and sequence of each flat-file entry.

    The UniProt-style flat file is read line by line.  Each entry ends with
    a ``//`` line, at which point one document is saved to the ``table``
    collection of the ``uniprot`` database.  The document holds the second
    token of the first ``ID`` line under ``'_id'``, the accessions of the
    first ``AC`` line under ``'ac'``, one key per matched feature
    description mapped to a flat list of its positions, and the sequence
    with all whitespace removed under ``'sequence'``.

    Args:
        filepath: Path of the flat file to parse.
        fts: Dict mapping feature descriptions (written without whitespace
            and without dots) to lists of positions.  It selects the
            features collected for the first entry; every later entry uses
            the built-in feature set.  The dict is copied and never
            modified.

    Returns:
        None.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    # Work on a private copy: appending to the caller's lists would leak the
    # first entry's positions into any later call that reuses the same dict.
    fts = {key: list(positions) for key, positions in fts.items()}
    id_flag = 0
    ac_flag = 0
    out_ac = []
    out_position = []
    out_data = dict()
    special = 0
    ft = ''
    in_sequence = False
    sequence_parts = []
    with open(filepath) as fp:
        for line in fp:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            parsed_1 = data[0].split(" ")
            tag = parsed_1[0]
            if in_sequence and tag != '//':
                # Everything between the SQ header and '//' is sequence.
                # Checking this first keeps a final line of only one or two
                # residues, which a length test on the first token drops.
                sequence_parts.append(collapsed)
            elif tag == "ID" and id_flag == 0:
                id_flag = 1
                out_id = parsed_1[1]
            elif tag == "AC" and ac_flag == 0:
                # Only the first AC line of an entry is used.
                ac_flag = 1
                out_ac.append(parsed_1[1])
                # data[-1] is whatever follows the last ';' and is skipped;
                # the accessions in between carry a leading blank.
                out_ac.extend(acc.lstrip() for acc in data[1:-1])
                out_data = {'_id': out_id, 'ac': out_ac}
            elif tag == "FT":
                if len(parsed_1) > 4 and special == 0:
                    ft = ''.join(parsed_1[4:]).replace('.', '')
                    out_position = functions.remove_duplicates(
                        [parsed_1[2], parsed_1[3]])
                    if ft == 'Glycyllysineisopeptide(Lys-Gly)':
                        # Description continues on the next FT line.
                        special = 1
                        continue
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                elif special == 1:
                    # Continuation line: everything after the FT tag
                    # completes the pending description.
                    ft = (ft + ''.join(parsed_1[1:])).replace('.', '')
                    if ft in fts:
                        fts[ft].append(out_position)
                    out_position = []
                    special = 0
            elif tag == "SQ":
                in_sequence = True
            elif len(tag) > 2:
                # Line codes such as RT, DR, FT or SQ are two characters
                # long, so a longer first token is taken as sequence data
                # even when no SQ header was seen.
                sequence_parts.append(collapsed)
            elif tag == '//':
                # Flatten the per-feature position lists and drop features
                # that were not found in this entry.
                fts = {
                    key: list(itertools.chain.from_iterable(hits))
                    for key, hits in fts.items() if hits
                }
                out_data = functions.merge_two_dicts(out_data, fts)
                out_data['sequence'] = ''.join(''.join(sequence_parts).split())
                # NOTE(review): save() is the legacy pymongo call; confirm
                # the collection returned by connectMongoDB still offers it.
                table.save(out_data)
                # Reset all per-entry state, including a continuation that
                # was still pending, so nothing leaks into the next entry.
                fts = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked(GlcNAc)asparagine': [],
                    'S-palmitoylcysteine': [],
                    'Pyrrolidonecarboxylicacid': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-CterinSUMO)': [],
                    'Glycyllysineisopeptide(Lys-Gly)(interchainwithG-Cterinubiquitin)': []
                }
                out_ac = []
                id_flag = 0
                ac_flag = 0
                out_position = []
                special = 0
                in_sequence = False
                sequence_parts = []
def tableGeneration(filepath, ptms):
    """Rebuild the ``table`` collection from a UniProt-style flat file.

    The ``table`` collection of the ``uniprot`` database is dropped first.
    The file is then read line by line and one document is saved each time
    an ``SQ`` line is reached.  The document holds the second token of the
    ``ID`` line under ``'_id'``, all accessions of the ``AC`` lines under
    ``'ac'``, the values of the ``OC`` lines under ``'species'``, the
    collected feature positions, and the value returned by ``seq_read``
    under ``'sequence'``.

    Args:
        filepath: Path of the flat file to parse.
        ptms: Dict mapping feature descriptions to lists of positions.  It
            selects the features collected for the first entry; every later
            entry uses the built-in set.  The dict is copied and never
            modified.

    Returns:
        None.
    """
    table = functions.connectMongoDB('uniprot', 'table')
    table.drop()

    # Raw strings keep the pattern values unchanged while avoiding the
    # invalid-escape warnings the non-raw spellings trigger.
    dots_before_paren = re.compile(r'(\.*)\)')
    trailing_detail = re.compile(r'[\.|\;].*')

    def record_ptm(targets, description, position):
        """Append position to the target named by the cleaned description."""
        name = dots_before_paren.sub(')', description)
        name = trailing_detail.sub('', name)
        if name in targets:
            targets[name].append(position)

    # Work on a private copy: appending to the caller's lists would leak the
    # first entry's positions into any later call that reuses the same dict.
    ptms = {name: list(hits) for name, hits in ptms.items()}
    out_id = ""
    out_ac = []
    out_position = []
    out_data = dict()
    check = []
    with open(filepath) as fp:
        # readline() is used instead of iterating the file because the FT
        # branch relies on tell()/seek() to step back one line.
        line = fp.readline()
        while line:
            collapsed = ' '.join(line.split())
            data = collapsed.split(";")
            info = data[0].split(" ")
            tag = info[0]
            if tag == "ID":
                out_id = info[1]
            elif tag == "AC":
                out_ac.append(info[1])
                # data[-1] is whatever follows the last ';' and is skipped.
                out_ac.extend(acc.lstrip() for acc in data[1:-1])
            elif tag == "OC":
                check.append(info[1].lstrip())
                check.extend(taxon.lstrip() for taxon in data[1:-1])
                out_data = {"_id": out_id, "ac": out_ac, "species": check}
            elif tag == "FT":
                out_position = functions.remove_duplicates([info[2], info[3]])
                temp_ptm = " ".join(info[4:])
                # Consume the whole run of consecutive FT lines here.
                prev_fp_pos = fp.tell()
                info = ' '.join(fp.readline().split()).split(" ")
                while info[0] == "FT":
                    if len(info) > 3 and is_number(info[2]) and is_number(info[3]):
                        # A new feature starts: file the finished one.
                        record_ptm(ptms, temp_ptm, out_position)
                        out_position = functions.remove_duplicates(
                            [info[2], info[3]])
                        temp_ptm = " ".join(info[4:])
                    else:
                        # Continuation line: extend the pending description.
                        temp_ptm = temp_ptm + " ".join(info[1:])
                    prev_fp_pos = fp.tell()
                    info = ' '.join(fp.readline().split()).split(" ")
                record_ptm(ptms, temp_ptm, out_position)
                # Flatten the position lists and drop features without hits.
                # NOTE(review): this runs once per run of consecutive FT
                # lines; a second run inside the same entry would flatten
                # the already flat lists again.
                ptms = {
                    name: list(itertools.chain.from_iterable(hits))
                    for name, hits in ptms.items() if hits
                }
                # The line that ended the FT run was already read; step back
                # so the outer loop processes it.
                fp.seek(prev_fp_pos)
            elif tag == "SQ":
                sequence = seq_read(fp)
                out_data = functions.merge_two_dicts(out_data, ptms)
                out_data['sequence'] = sequence
                # NOTE(review): save() is the legacy pymongo call; confirm
                # the collection returned by connectMongoDB still offers it.
                table.save(out_data)
                # Reset the per-entry state for the next entry.
                ptms = {
                    'Phosphoserine': [],
                    'Phosphothreonine': [],
                    'Phosphotyrosine': [],
                    'N6-acetyllysine': [],
                    'Omega-N-methylarginine': [],
                    'Dimethylated arginine': [],
                    'Symmetric dimethylarginine': [],
                    'Asymmetric dimethylarginine': [],
                    'N6-methyllysine': [],
                    'N6,N6-dimethyllysine': [],
                    'N6,N6,N6-trimethyllysine': [],
                    'N-linked (GlcNAc) asparagine': [],
                    'S-palmitoyl cysteine': [],
                    'Pyrrolidone carboxylic acid': [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in SUMO)': [],
                    'Glycyl lysine isopeptide (Lys-Gly)(interchain with G-Cter in ubiquitin)': []
                }
                out_data = dict()
                out_ac = []
                out_position = []
                check = []
            line = fp.readline()