def convert_paragraph(new_session_maker):
    """Convert regulation summary paragraphs into the new schema.

    Creates/updates Paragraph rows with class_type 'REGULATION' from the flat
    summary file, then deletes any current rows no longer present in the
    source file.

    :param new_session_maker: factory returning a new-schema DB session.
    """
    from model_new_schema.bioentity import Bioentity, Paragraph

    log = logging.getLogger('convert.regulation.paragraph')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Initialized up front so the finally clause cannot hit a NameError
    # if new_session_maker() itself raises (previously it could).
    new_session = None
    try:
        new_session = new_session_maker()

        # Fields compared by create_or_update when deciding whether to update.
        values_to_check = ['text', 'date_created', 'created_by']

        # Cached lookup: bioentity unique key -> Bioentity.
        key_to_bioentity = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()])

        # Current regulation paragraphs, indexed both by id and by unique key.
        current_objs = new_session.query(Paragraph).filter(Paragraph.class_type == 'REGULATION').all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        # Any id still in this set after the conversion loop was not seen in
        # the source file and gets deleted below.
        untouched_obj_ids = set(id_to_current_obj.keys())

        old_objs = break_up_file('/Users/kpaskov/final/Reg_Summary_Paragraphs04282013.txt')
        for old_obj in old_objs:
            # Convert old objects into new ones.
            newly_created_objs = create_paragraph(old_obj, key_to_bioentity)

            if newly_created_objs is not None:
                # Edit or add new objects.
                for newly_created_obj in newly_created_objs:
                    unique_key = newly_created_obj.unique_key()
                    current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                    current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                    create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                    if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_id.id)
                    if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_key.id)

        # Delete untouched objs (rows no longer present in the source).
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        # Commit.
        output_creator.finished()
        new_session.commit()
    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
    log.info('complete')
def convert_evidence(new_session_maker, chunk_size):
    """Convert regulation evidence into the new schema, chunk by chunk.

    Reads the yeastmine regulation TSV, creates/updates Regulationevidence
    rows in id-windowed chunks (committing per chunk), and deletes rows in
    each window that no longer appear in the source.

    :param new_session_maker: factory returning a new-schema DB session.
    :param chunk_size: number of source rows processed per chunk/commit.
    """
    from model_new_schema.regulation import Regulationevidence
    from model_new_schema.evelement import Experiment
    from model_new_schema.bioentity import Bioentity
    from model_new_schema.reference import Reference

    log = logging.getLogger('convert.regulation.evidence')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Initialized up front so the finally clause cannot hit a NameError
    # if new_session_maker() itself raises.
    new_session = None
    try:
        new_session = new_session_maker()

        # Fields compared by create_or_update when deciding whether to update.
        values_to_check = ['experiment_id', 'reference_id', 'strain_id', 'source',
                           'conditions', 'bioentity1_id', 'bioentity2_id',
                           'date_created', 'created_by']

        # Cached lookups built once, reused across all chunks.
        key_to_experiment = dict([(x.unique_key(), x) for x in new_session.query(Experiment).all()])
        key_to_bioent = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()])
        pubmed_to_reference_id = dict([(x.pubmed_id, x.id) for x in new_session.query(Reference).all()])

        # Grab old objects.
        data = break_up_file('/Users/kpaskov/final/yeastmine_regulation.tsv')

        count = len(data)
        # ceil() returns a float; range() needs an int (passing the float
        # only worked via a Py2 deprecation path and breaks on Py3).
        num_chunks = int(ceil(1.0 * count / chunk_size))
        min_id = 0
        j = 0
        for i in range(0, num_chunks):
            # Current objects whose ids fall inside this chunk's id window.
            current_objs = new_session.query(Regulationevidence).filter(
                Regulationevidence.id >= create_evidence_id(min_id)).filter(
                Regulationevidence.id < create_evidence_id(min_id + chunk_size)).all()
            id_to_current_obj = dict([(x.id, x) for x in current_objs])
            key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

            untouched_obj_ids = set(id_to_current_obj.keys())

            old_objs = data[min_id:min_id + chunk_size]
            for old_obj in old_objs:
                # Convert old objects into new ones.
                newly_created_objs = create_evidence(old_obj, j, key_to_experiment, key_to_bioent, pubmed_to_reference_id)

                if newly_created_objs is not None:
                    # Edit or add new objects.
                    for newly_created_obj in newly_created_objs:
                        unique_key = newly_created_obj.unique_key()
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)
                j = j + 1

            # Delete objs in this window that no longer appear in the source.
            for untouched_obj_id in untouched_obj_ids:
                new_session.delete(id_to_current_obj[untouched_obj_id])
                output_creator.removed()

            output_creator.finished(str(i + 1) + "/" + str(int(num_chunks)))
            new_session.commit()
            min_id = min_id + chunk_size

        # Commit.
        output_creator.finished()
        new_session.commit()
    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
    log.info('complete')
def convert_domain(new_session_maker, chunk_size):
    """Convert protein domains into the new schema.

    Loads domains from the yeastmine protein-domains TSV (in chunks) and from
    the JASPAR TF family/class file, creating/updating Domain rows, then
    deletes rows no longer present in either source.

    :param new_session_maker: factory returning a new-schema DB session.
    :param chunk_size: number of TSV rows processed per chunk/commit.
    """
    from model_new_schema.protein import Domain as Domain

    log = logging.getLogger('convert.protein.domain')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Initialized up front so the finally clause cannot hit a NameError
    # if new_session_maker() itself raises.
    new_session = None
    try:
        # Grab all current objects.
        new_session = new_session_maker()
        current_objs = new_session.query(Domain).all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        # Fields compared by create_or_update when deciding whether to update.
        values_to_check = ['display_name', 'description', 'interpro_id', 'interpro_description', 'link']

        untouched_obj_ids = set(id_to_current_obj.keys())

        # Grab old objects.
        data = break_up_file('/Users/kpaskov/final/yeastmine_protein_domains.tsv')

        # Guards against processing the same domain twice across source rows.
        used_unique_keys = set()

        min_id = 0
        count = len(data)
        # ceil() returns a float; range() needs an int (passing the float
        # only worked via a Py2 deprecation path and breaks on Py3).
        num_chunks = int(ceil(1.0 * count / chunk_size))
        for i in range(0, num_chunks):
            old_objs = data[min_id:min_id + chunk_size]
            for old_obj in old_objs:
                # Convert old objects into new ones.
                newly_created_objs = create_domain(old_obj)

                if newly_created_objs is not None:
                    # Edit or add new objects.
                    for newly_created_obj in newly_created_objs:
                        unique_key = newly_created_obj.unique_key()
                        if unique_key not in used_unique_keys:
                            current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                            current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                            create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)
                            used_unique_keys.add(unique_key)

                            if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_id.id)
                            if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_key.id)

            output_creator.finished(str(i + 1) + "/" + str(int(num_chunks)))
            new_session.commit()
            min_id = min_id + chunk_size

        # Grab JASPAR domains from file.
        old_objs = break_up_file('/Users/kpaskov/final/TF_family_class_accession04302013.txt')
        for old_obj in old_objs:
            # Convert old objects into new ones.
            newly_created_objs = create_domain_from_tf_file(old_obj)

            if newly_created_objs is not None:
                # Edit or add new objects.
                for newly_created_obj in newly_created_objs:
                    unique_key = newly_created_obj.unique_key()
                    current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                    current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                    create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)
                    used_unique_keys.add(unique_key)

                    if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_id.id)
                    if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_key.id)

        output_creator.finished("1/1")
        new_session.commit()

        # Delete objs no longer present in either source.
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        # Commit.
        output_creator.finished()
        new_session.commit()
    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
    log.info('complete')
def convert_experiment(old_session_maker, new_session_maker):
    """Convert experiments into the new schema.

    Pulls experiments from the old-schema CV terms (cv_no == 7), then adds
    experiments named in the regulation TSV and the YeTFaSCo binding file,
    and finally deletes experiments no longer present in any source.

    :param old_session_maker: factory returning an old-schema DB session.
    :param new_session_maker: factory returning a new-schema DB session.
    """
    from model_new_schema.evelement import Experiment as NewExperiment
    from model_old_schema.cv import CVTerm as OldCVTerm

    log = logging.getLogger('convert.evelements.experiment')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Initialized up front so the finally clause cannot hit a NameError if a
    # session maker raises before both sessions are assigned (previously it could).
    new_session = None
    old_session = None
    try:
        # Grab all current objects.
        new_session = new_session_maker()
        current_objs = new_session.query(NewExperiment).all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        # Fields compared by create_or_update when deciding whether to update.
        values_to_check = ['display_name', 'link', 'description', 'date_created', 'created_by', 'eco_id']

        untouched_obj_ids = set(id_to_current_obj.keys())

        # Grab old objects.
        old_session = old_session_maker()
        old_objs = old_session.query(OldCVTerm).filter(OldCVTerm.cv_no == 7).all()

        for old_obj in old_objs:
            # Convert old objects into new ones.
            newly_created_objs = create_experiment(old_obj)

            if newly_created_objs is not None:
                # Edit or add new objects.
                for newly_created_obj in newly_created_objs:
                    current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                    current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                    create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                    if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_id.id)
                    if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                        untouched_obj_ids.remove(current_obj_by_key.id)

        # Get experiments from regulation files.
        experiment_names = set()
        rows = break_up_file('/Users/kpaskov/final/yeastmine_regulation.tsv')
        experiment_names.update([(row[4], row[5]) for row in rows])

        i = 0
        for experiment_name, eco_id in experiment_names:
            newly_created_objs = create_experiment_from_reg_row(experiment_name, eco_id, i)
            for newly_created_obj in newly_created_objs:
                current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                    untouched_obj_ids.remove(current_obj_by_id.id)
                if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                    untouched_obj_ids.remove(current_obj_by_key.id)
            i = i + 1

        experiment_names = set()
        # Add experiments from binding files.
        rows = break_up_file('/Users/kpaskov/final/yetfasco_data.txt', delimeter=';')
        for row in rows:
            if len(row) < 10:
                # Malformed row: report it so the source file can be fixed.
                print(row)
        # Fix: exclude the short rows reported above instead of crashing on
        # row[9] with an IndexError (column 9 holds the quoted experiment name).
        experiment_names.update([row[9][1:-1] for row in rows if len(row) >= 10])

        i = 0
        for experiment_name in experiment_names:
            newly_created_objs = create_experiment_from_binding_row(experiment_name, i)
            for newly_created_obj in newly_created_objs:
                current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)

                if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                    untouched_obj_ids.remove(current_obj_by_id.id)
                if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                    untouched_obj_ids.remove(current_obj_by_key.id)
            i = i + 1

        # Delete objs no longer present in any source.
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            print('Removed at end')
            output_creator.removed()

        # Commit.
        output_creator.finished()
        new_session.commit()
    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
        if old_session is not None:
            old_session.close()
    log.info('complete')
def convert_domain_evidence(new_session_maker, chunk_size):
    """Convert domain evidence into the new schema.

    Loads evidence from the yeastmine protein-domains TSV (in chunks) and the
    JASPAR TF family/class file, creating/updating Domainevidence rows, then
    deletes rows no longer present in either source.

    :param new_session_maker: factory returning a new-schema DB session.
    :param chunk_size: number of TSV rows processed per chunk/commit.
    """
    from model_new_schema.protein import Domain, Domainevidence
    from model_new_schema.bioentity import Bioentity
    from model_new_schema.reference import Reference

    log = logging.getLogger('convert.protein.domain_evidence')
    log.info('begin')
    output_creator = OutputCreator(log)

    # Initialized up front so the finally clause cannot hit a NameError
    # if new_session_maker() itself raises.
    new_session = None
    try:
        # Grab all current objects.
        new_session = new_session_maker()
        current_objs = new_session.query(Domainevidence).all()
        id_to_current_obj = dict([(x.id, x) for x in current_objs])
        key_to_current_obj = dict([(x.unique_key(), x) for x in current_objs])

        # Fields compared by create_or_update when deciding whether to update.
        values_to_check = ['reference_id', 'strain_id', 'source', 'date_created',
                           'created_by', 'start', 'end', 'evalue', 'status',
                           'date_of_run', 'protein_id', 'domain_id']

        # Cached lookups built once, reused across all chunks.
        key_to_bioentity = dict([(x.unique_key(), x) for x in new_session.query(Bioentity).all()])
        key_to_domain = dict([(x.unique_key(), x) for x in new_session.query(Domain).all()])
        pubmed_id_to_reference_id = dict([(x.pubmed_id, x.id) for x in new_session.query(Reference).all()])

        untouched_obj_ids = set(id_to_current_obj.keys())

        # Grab old objects.
        data = break_up_file('/Users/kpaskov/final/yeastmine_protein_domains.tsv')

        # Guards against processing duplicate evidence rows across sources.
        used_unique_keys = set()

        j = 0
        min_id = 0
        count = len(data)
        # ceil() returns a float; range() needs an int (passing the float
        # only worked via a Py2 deprecation path and breaks on Py3).
        num_chunks = int(ceil(1.0 * count / chunk_size))
        for i in range(0, num_chunks):
            old_objs = data[min_id:min_id + chunk_size]
            for old_obj in old_objs:
                # Convert old objects into new ones.
                newly_created_objs = create_domain_evidence(old_obj, j, key_to_bioentity, key_to_domain)

                if newly_created_objs is not None:
                    # Edit or add new objects.
                    for newly_created_obj in newly_created_objs:
                        # Dedup key: the fields that make a piece of evidence distinct.
                        unique_key = (newly_created_obj.protein_id, newly_created_obj.domain_id,
                                      newly_created_obj.start, newly_created_obj.end,
                                      newly_created_obj.evalue)
                        if unique_key not in used_unique_keys:
                            current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                            # Bug fix: key_to_current_obj was previously probed with
                            # newly_created_obj.id instead of its unique key, so the
                            # by-key match never fired and matching rows were deleted
                            # as "untouched".
                            current_obj_by_key = None if newly_created_obj.unique_key() not in key_to_current_obj else key_to_current_obj[newly_created_obj.unique_key()]
                            create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)
                            used_unique_keys.add(unique_key)

                            if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_id.id)
                            if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                                untouched_obj_ids.remove(current_obj_by_key.id)
                j = j + 1

            output_creator.finished(str(i + 1) + "/" + str(int(num_chunks)))
            new_session.commit()
            min_id = min_id + chunk_size

        # Grab JASPAR evidence from file.
        old_objs = break_up_file('/Users/kpaskov/final/TF_family_class_accession04302013.txt')
        for old_obj in old_objs:
            # Convert old objects into new ones.
            newly_created_objs = create_domain_evidence_from_tf_file(old_obj, j, key_to_bioentity, key_to_domain, pubmed_id_to_reference_id)

            if newly_created_objs is not None:
                # Edit or add new objects.
                for newly_created_obj in newly_created_objs:
                    unique_key = newly_created_obj.unique_key()
                    if unique_key not in used_unique_keys:
                        current_obj_by_id = None if newly_created_obj.id not in id_to_current_obj else id_to_current_obj[newly_created_obj.id]
                        current_obj_by_key = None if unique_key not in key_to_current_obj else key_to_current_obj[unique_key]
                        create_or_update(newly_created_obj, current_obj_by_id, current_obj_by_key, values_to_check, new_session, output_creator)
                        used_unique_keys.add(unique_key)

                        if current_obj_by_id is not None and current_obj_by_id.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_id.id)
                        if current_obj_by_key is not None and current_obj_by_key.id in untouched_obj_ids:
                            untouched_obj_ids.remove(current_obj_by_key.id)
            j = j + 1

        output_creator.finished("1/1")
        new_session.commit()

        # Delete objs no longer present in either source.
        for untouched_obj_id in untouched_obj_ids:
            new_session.delete(id_to_current_obj[untouched_obj_id])
            output_creator.removed()

        # Commit.
        output_creator.finished()
        new_session.commit()
    except Exception:
        log.exception('Unexpected error:' + str(sys.exc_info()[0]))
    finally:
        if new_session is not None:
            new_session.close()
    log.info('complete')