def go_starter():
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    slim_ids = set()
    for pieces in make_file_starter('src/sgd/convert/data/go_slim_mapping.tab.txt')():
        if len(pieces) >= 6:
            goid = pieces[5]
            slim_ids.add(goid)

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

    for bud_obj in bud_session.query(Go).all():
        go_id = 'GO:' + str(bud_obj.go_go_id).zfill(7)
        yield {'display_name': bud_obj.go_term,
               'source': key_to_source['GO'],
               'description': bud_obj.go_definition,
               'go_id': go_id,
               'go_aspect': abbrev_to_go_aspect[bud_obj.go_aspect],
               'is_slim': 1 if go_id in slim_ids else 0,
               'date_created': bud_obj.date_created,
               'created_by': bud_obj.created_by}

    bud_session.close()
    nex_session.close()
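# A minimal sketch of how a starter generator such as go_starter is consumed,
# following the do_conversion/Json2Obj/Obj2NexDB pattern used at the bottom of
# this module; the task name 'convert.from_bud.go' is illustrative, not taken
# from the original pipeline.
#
#     do_conversion(go_starter,
#                   [Json2Obj(Go),
#                    Obj2NexDB(nex_session_maker, lambda x: x.query(Go),
#                              name='convert.from_bud.go',
#                              delete_untouched=True,
#                              commit_interval=1000)])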
def tag_starter():
    nex_session = nex_session_maker()

    for row in make_file_starter('src/sgd/convert/data/microarray_05_14/SPELL-tags.txt')():
        tag = row[2].strip()
        for t in [x.strip() for x in tag.split('|')]:
            if t != '':
                yield {
                    'display_name': t,
                    'description': definitions.get(t)
                }

    nex_session.close()
def bioitem_tag_starter():
    nex_session = nex_session_maker()

    key_to_dataset = dict([(x.unique_key(), x) for x in nex_session.query(Dataset).all()])
    key_to_tag = dict([(x.unique_key(), x) for x in nex_session.query(Tag).all()])

    for row in make_file_starter('src/sgd/convert/data/microarray_05_14/SPELL-tags.txt')():
        dataset_key = (row[1].strip()[:-4], 'DATASET')
        tags = row[2].strip()
        for t in [x.strip() for x in tags.split('|')]:
            if t != '':
                yield {
                    'bioitem': key_to_dataset[dataset_key],
                    'tag': key_to_tag[create_format_name(t)]
                }

    nex_session.close()
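# Illustrative SPELL-tags.txt row for the two starters above (the values are
# hypothetical; the column layout comes from the code): with
#     row = ['...', 'yeast_stress.pcl', 'stress | heat shock']
# tag_starter yields one tag row each for 'stress' and 'heat shock', and
# bioitem_tag_starter looks up the dataset under ('yeast_stress', 'DATASET')
# and each tag under its create_format_name key.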
def datasetcolumn_starter():
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
    key_to_dataset = dict([(x.unique_key(), x) for x in nex_session.query(Dataset).all()])

    #(geo_id, column description) -> per-column GEO sample id; built twice so
    #both 'delta/sigma'-stripped and raw condition names resolve.
    key_to_GSM = dict([((x[2], x[4].replace('delta', '').replace('sigma', '').strip()), x[5]) for x in make_file_starter(expression_dir + '/pmid_filename_gse_conds_tags_file_20150204.txt')()])
    key_to_GSM.update([((x[2], x[4].strip()), x[5]) for x in make_file_starter(expression_dir + '/pmid_filename_gse_conds_tags_file_20150204.txt')()])

    for path in os.listdir(expression_dir):
        if os.path.isdir(expression_dir + '/' + path):
            for file in os.listdir(expression_dir + '/' + path):
                dataset_key = (file[:-4], 'DATASET')
                if dataset_key in key_to_dataset:
                    f = open(expression_dir + '/' + path + '/' + file, 'r')
                    pieces = f.next().split('\t')
                    f.close()

                    geo_id = key_to_dataset[dataset_key].geo_id
                    i = 0
                    for piece in pieces[3:]:
                        column_name = piece.strip().decode('ascii', 'ignore')
                        if (geo_id, column_name) not in key_to_GSM and geo_id is not None:
                            print (geo_id, column_name)
                        col_geo_id = None if (geo_id, column_name) not in key_to_GSM else key_to_GSM[(geo_id, column_name)]
                        link = None if col_geo_id is None else 'http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=' + col_geo_id
                        yield {
                            'description': column_name,
                            'dataset': key_to_dataset[dataset_key],
                            'source': key_to_source['SGD'],
                            'file_order': i,
                            'geo_id': col_geo_id,
                            'link': link
                        }
                        i += 1

    nex_session.close()
def dataset_starter():
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
    pubmed_id_to_reference = dict([(x.pubmed_id, x) for x in nex_session.query(Reference).all()])

    #Filename to pubmed_id, geo_id, channel_count, tags, and short description
    filename_to_info = dict([(x[1], (int(x[0]), x[2], x[3], x[6], None)) for x in make_file_starter(expression_dir + '/pmid_filename_gse_conds_tags_file_20150204.txt')()])

    for path in os.listdir(expression_dir):
        if os.path.isdir(expression_dir + '/' + path):
            full_description = None
            geo_id = None
            pubmed_id = None
            state = 'BEGIN'
            try:
                for row in make_file_starter(expression_dir + '/' + path + '/README')():
                    if row[0].startswith('Full Description'):
                        state = 'FULL_DESCRIPTION'
                        full_description = row[0][18:].strip()
                    elif row[0].startswith('PMID:'):
                        pubmed_id = int(row[0][6:].strip())
                    elif row[0].startswith('GEO ID:'):
                        geo_id = row[0][8:].strip().split('.')[0].split('GPL')[0]
                    elif row[0].startswith('PCL filename'):
                        state = 'OTHER'
                    elif state == 'FULL_DESCRIPTION':
                        full_description = full_description + row[0].strip()
                    elif state == 'OTHER':
                        pcl_filename = row[0].strip()
                        short_description = row[1].strip()
                        tag = row[3].strip()
                        if pcl_filename in filename_to_info:
                            filename_to_info[pcl_filename] = (filename_to_info[pcl_filename][0], filename_to_info[pcl_filename][1], filename_to_info[pcl_filename][2], filename_to_info[pcl_filename][3], short_description)
                        else:
                            if geo_id == 'N/A':
                                geo_id = None
                            filename_to_info[pcl_filename] = (pubmed_id, geo_id, 1, tag, short_description)

                for file in os.listdir(expression_dir + '/' + path):
                    if file != 'README':
                        f = open(expression_dir + '/' + path + '/' + file, 'r')
                        pieces = f.next().split('\t')
                        f.close()

                        if file in filename_to_info:
                            pubmed_id, geo_id, channel_count, tags, short_description = filename_to_info[file]
                            if pubmed_id not in pubmed_id_to_reference:
                                print 'Warning: pubmed_id not found ' + str(pubmed_id)
                            yield {
                                'description': full_description,
                                'geo_id': geo_id,
                                'pcl_filename': file,
                                'short_description': short_description,
                                'tags': tags,
                                'reference': None if pubmed_id is None or pubmed_id not in pubmed_id_to_reference else pubmed_id_to_reference[pubmed_id],
                                'source': key_to_source['SGD'],
                                'channel_count': channel_count,
                                'condition_count': len(pieces) - 3
                            }
                        else:
                            print 'Filename not in readme: ' + file
            except:
                print 'File ' + expression_dir + '/' + path + '/README' + ' not found.'
                print traceback.format_exc()

    nex_session.close()
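# Rough sketch of the README layout the state machine above expects (inferred
# from the parsing code, not from a documented spec); the prefix widths match
# the slices used above ('Full Description: ' is 18 characters, 'PMID: ' is 6,
# 'GEO ID: ' is 8), and the PMID/GEO values here are made up:
#
#     Full Description: <free text, may continue on following lines>
#     PMID: 12345678
#     GEO ID: GSE1234
#     PCL filename <tab> short description <tab> ... <tab> tag
#     <file>.pcl   <tab> <short description> <tab> ... <tab> <tag>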
def domain_starter():
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

    panther_id_to_description = {}
    for row in make_file_starter('src/sgd/convert/data/PANTHER9.0_HMM_classifications.txt')():
        panther_id_to_description[row[0]] = row[1].lower()

    for row in make_file_starter('src/sgd/convert/data/domains.tab')():
        source_key = row[3].strip()
        if source_key.startswith('ProSite'):
            source_key = 'Prosite'
        if source_key.startswith('SignalP'):
            source_key = 'SignalP'
        if source_key.startswith('Hamap'):
            source_key = 'HAMAP'
        if source_key == 'Coils':
            source_key = '-'
        display_name = row[4].strip()
        description = row[5].strip()

        interpro_id = None
        interpro_description = None
        if len(row) == 13:
            interpro_id = row[11].strip()
            interpro_description = row[12].strip()

        source_key = create_format_name(source_key)
        source = None if source_key not in key_to_source else key_to_source[source_key]
        description = None if description == '' else description
        interpro_description = None if interpro_description == '' else interpro_description
        interpro_id = None if interpro_id == '' else interpro_id

        if source_key == 'PANTHER':
            if display_name in panther_id_to_description:
                yield {'display_name': display_name,
                       'source': source,
                       'description': panther_id_to_description[display_name],
                       'bioitem_type': source_key,
                       'interpro_id': interpro_id,
                       'interpro_description': interpro_description}
        elif source_key is not None:
            yield {'display_name': display_name,
                   'source': source,
                   'description': description if description is not None else interpro_description,
                   'bioitem_type': source_key,
                   'interpro_id': interpro_id,
                   'interpro_description': interpro_description}
        else:
            print 'Source not found: ' + source_key

    for row in make_file_starter('src/sgd/convert/data/TF_family_class_accession04302013.txt')():
        description = 'Class: ' + row[4] + ', Family: ' + row[3]
        yield {'display_name': row[0],
               'source': key_to_source['JASPAR'],
               'description': description,
               'bioitem_type': 'JASPAR'}

    yield {'display_name': 'predicted signal peptide',
           'source': key_to_source['SignalP'],
           'description': 'predicted signal peptide',
           'bioitem_type': 'SignalP'}
    yield {'display_name': 'predicted transmembrane domain',
           'source': key_to_source['TMHMM'],
           'description': 'predicted transmembrane domain',
           'bioitem_type': 'TMHMM'}

    for bud_obj in bud_session.query(Dbxref).filter(or_(Dbxref.dbxref_type == 'PANTHER', Dbxref.dbxref_type == 'Prosite')).all():
        dbxref_type = bud_obj.dbxref_type
        source_key = create_format_name(bud_obj.source)
        source = None if source_key not in key_to_source else key_to_source[source_key]
        if source is None:
            print source_key
            yield None

        bioitem_type = None
        if dbxref_type == 'Prosite ID':
            bioitem_type = 'Prosite'
        elif dbxref_type == 'PANTHER':
            bioitem_type = 'PANTHER'

        if bioitem_type == 'PANTHER':
            if bud_obj.dbxref_id in panther_id_to_description:
                yield {'display_name': bud_obj.dbxref_id,
                       'source': source,
                       'description': panther_id_to_description[bud_obj.dbxref_id],
                       'bioitem_type': bioitem_type}
        else:
            yield {'display_name': bud_obj.dbxref_id,
                   'source': source,
                   'description': bud_obj.dbxref_name,
                   'bioitem_type': bioitem_type}

    bud_session.close()
    nex_session.close()
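# The domains.tab columns used above line up with InterProScan-style tab output
# (an assumption based on the indices, not a documented spec): column 3 is the
# analysis/source, column 4 the signature accession, column 5 its description,
# and, for 13-column rows, columns 11 and 12 carry the InterPro accession and
# description.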
def experiment_starter():
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])

    for bud_obj in make_obo_file_starter('src/sgd/convert/data/eco.obo')():
        description = None if 'def' not in bud_obj else bud_obj['def']
        if description is not None and description.find('[') >= 0:
            description = description[:description.find('[')-1]
        if description is not None and description.find('"') >= 0:
            description = description[1:-1]
        yield {'display_name': bud_obj['name'],
               'source': key_to_source['ECO'],
               'description': description,
               'eco_id': bud_obj['id']}

    for bud_obj in bud_session.query(CVTerm).filter(CVTerm.cv_no == 7).all():
        format_name = create_format_name(bud_obj.name)
        yield {'display_name': bud_obj.name,
               'source': key_to_source['SGD'],
               'description': bud_obj.definition,
               'category': 'large-scale survey' if format_name in large_scale_survey else 'classical genetics' if format_name in classical_genetics else None,
               'date_created': bud_obj.date_created,
               'created_by': bud_obj.created_by}

    for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Venters_Macisaac_Hu05-12-2014_regulator_lines')():
        source_key = row[11].strip()
        if source_key in key_to_source:
            yield {'display_name': row[4] if row[4] != '' else row[5],
                   'source': None if source_key not in key_to_source else key_to_source[source_key],
                   'eco_id': row[5]}
        else:
            print 'Source not found: ' + str(source_key)

    for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/SGD_data_05_14_2014')():
        source_key = row[11].strip()
        if source_key in key_to_source:
            yield {'display_name': row[4] if row[4] != '' else row[5],
                   'source': None if source_key not in key_to_source else key_to_source[source_key],
                   'eco_id': row[5]}
        else:
            print 'Source not found: ' + str(source_key)

    for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Madhani_fixed')():
        if len(row) >= 12:
            source_key = row[11].strip()
            if source_key in key_to_source:
                yield {'display_name': row[5] if row[5] != '' else row[4],
                       'source': None if source_key not in key_to_source else key_to_source[source_key],
                       'eco_id': row[4]}
            else:
                print 'Source not found: ' + str(source_key)

    for row in make_file_starter('src/sgd/convert/data/2014-05-15_reg_data/Pimentel_PMID22616008.txt')():
        if len(row) >= 12:
            source_key = row[11].strip()
            if source_key in key_to_source:
                yield {'display_name': row[4] if row[4] != '' else row[5],
                       'source': None if source_key not in key_to_source else key_to_source[source_key],
                       'eco_id': row[5]}
            else:
                print 'Source not found: ' + str(source_key)

    for row in make_file_starter('src/sgd/convert/data/yetfasco_data.txt', delimeter=';')():
        expert_confidence = row[8][1:-1]
        if expert_confidence == 'High':
            yield {'display_name': row[9][1:-1], 'source': key_to_source['YeTFaSCo']}

    yield {'display_name': 'protein abundance', 'source': key_to_source['SGD']}

    yield {'display_name': 'EXP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/exp-inferred-experiment', 'description': 'Inferred from Experiment'}
    yield {'display_name': 'IDA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ida-inferred-direct-assay', 'description': 'Inferred from Direct Assay'}
    yield {'display_name': 'IPI', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ipi-inferred-physical-interaction', 'description': 'Inferred from Physical Interaction'}
    yield {'display_name': 'IMP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/imp-inferred-mutant-phenotype', 'description': 'Inferred from Mutant Phenotype'}
    yield {'display_name': 'IGI', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/igi-inferred-genetic-interaction', 'description': 'Inferred from Genetic Interaction'}
    yield {'display_name': 'IEP', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iep-inferred-expression-pattern', 'description': 'Inferred from Expression Pattern'}
    yield {'display_name': 'ISS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iss-inferred-sequence-or-structural-similarity', 'description': 'Inferred from Sequence or Structural Similarity'}
    yield {'display_name': 'ISA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/isa-inferred-sequence-alignment', 'description': 'Inferred from Sequence Alignment'}
    yield {'display_name': 'ISO', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iso-inferred-sequence-orthology', 'description': 'Inferred from Sequence Orthology'}
    yield {'display_name': 'ISM', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ism-inferred-sequence-model', 'description': 'Inferred from Sequence Model'}
    yield {'display_name': 'IGC', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/igc-inferred-genomic-context', 'description': 'Inferred from Genomic Context'}
    yield {'display_name': 'IBA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/iba-inferred-biological-aspect-ancestor', 'description': 'Inferred from Biological aspect of Ancestor'}
    yield {'display_name': 'IBD', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ibd-inferred-biological-aspect-descendent', 'description': 'Inferred from Biological aspect of Descendent'}
    yield {'display_name': 'IKR', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ikr-inferred-key-residues', 'description': 'Inferred from Key Residues'}
    yield {'display_name': 'IRD', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ird-inferred-rapid-divergence', 'description': 'Inferred from Rapid Divergence'}
    yield {'display_name': 'RCA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/rca-inferred-reviewed-computational-analysis', 'description': 'Inferred from Reviewed Computational Analysis'}
    yield {'display_name': 'TAS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/tas-traceable-author-statement', 'description': 'Traceable Author Statement'}
    yield {'display_name': 'NAS', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/nas-non-traceable-author-statement', 'description': 'Non-traceable Author Statement'}
    yield {'display_name': 'IC', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/ic-inferred-curator', 'description': 'Inferred by Curator'}
    yield {'display_name': 'ND', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/nd-no-biological-data-available', 'description': 'No Biological Data Available'}
    yield {'display_name': 'IEA', 'source': key_to_source['GO'], 'link': 'http://www.geneontology.org/page/automatically-assigned-evidence-codes', 'description': 'Inferred from Electronic Annotation'}

    bud_session.close()
    nex_session.close()
def paragraph_reference_starter():
    nex_session = nex_session_maker()

    references = nex_session.query(Reference).all()
    key_to_paragraph = dict([(x.unique_key(), x) for x in nex_session.query(Paragraph).all()])
    pubmed_id_to_reference = dict([(x.pubmed_id, x) for x in references])
    sgdid_to_reference = dict([(x.sgdid, x) for x in references])

    #LSP
    for paragraph in key_to_paragraph.values():
        if paragraph.category == 'LSP':
            sgdids = [x.split('>')[0] for x in paragraph.text.split('<reference:')]
            for sgdid in sgdids:
                if sgdid in sgdid_to_reference:
                    reference = sgdid_to_reference[sgdid]
                    yield {
                        'paragraph_id': paragraph.id,
                        'reference_id': reference.id
                    }
                else:
                    if sgdid != '<p':
                        print 'Reference not found: ' + sgdid

    #Regulation
    file_names = ['src/sgd/convert/data/regulationSummaries',
                  'src/sgd/convert/data/15-8regulationSummaries.txt',
                  'src/sgd/convert/data/15-9regulationSummaries.txt',
                  'src/sgd/convert/data/15-10regulationSummaries.txt',
                  'src/sgd/convert/data/15-11regulationSummaries.txt',
                  'src/sgd/convert/data/16-1regulationSummaries.txt',
                  'src/sgd/convert/data/16-2regulationSummaries.txt',
                  'src/sgd/convert/data/16-3regulationSummaries.txt',
                  'src/sgd/convert/data/16-4regulationSummaries.txt',
                  'src/sgd/convert/data/16-5regulationSummaries.txt']
    for file_name in file_names:
        for row in make_file_starter(file_name)():
            paragraph_key = (row[0], 'BIOENTITY', 'REGULATION')
            for pubmed_id in [int(x) for x in row[3].strip().split('|') if x != 'references' and x != '']:
                if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                    yield {
                        'paragraph_id': key_to_paragraph[paragraph_key].id,
                        'reference_id': pubmed_id_to_reference[pubmed_id].id,
                    }
                else:
                    print 'Paragraph or reference not found: ' + str(paragraph_key) + ' ' + str(pubmed_id)
                    yield None

    #Strain
    for strain_key, paragraph in strain_paragraphs.iteritems():
        paragraph_key = (strain_key, 'STRAIN', None)
        for pubmed_id in paragraph[1]:
            if paragraph_key in key_to_paragraph and pubmed_id in pubmed_id_to_reference:
                yield {
                    'paragraph_id': key_to_paragraph[paragraph_key].id,
                    'reference_id': pubmed_id_to_reference[pubmed_id].id,
                }
            else:
                print 'Paragraph or reference not found: ' + str(paragraph_key) + ' ' + str(pubmed_id)
                yield None

    nex_session.close()
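# Worked example of the LSP reference extraction above (the SGDID is made up):
# a paragraph text like
#     '<p>Studied in <reference:S000000001>detail.</p>'
# splits on '<reference:' into ['<p>Studied in ', 'S000000001>detail.</p>'];
# taking split('>')[0] of each piece gives ['<p', 'S000000001'], and the
# leading '<p' fragment is suppressed by the sgdid != '<p' check.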
def bioentity_paragraph_starter():
    bud_session = bud_session_maker()
    nex_session = nex_session_maker()

    key_to_source = dict([(x.unique_key(), x) for x in nex_session.query(Source).all()])
    key_to_bioentity = dict([(x.unique_key(), x) for x in nex_session.query(Locus).all()])
    id_to_bioentity = dict([(x.id, x) for x in nex_session.query(Locus).all()])
    sgdid_to_reference = dict([(x.sgdid, x) for x in nex_session.query(Reference).all()])
    sgdid_to_bioentity = dict([(x.sgdid, x) for x in nex_session.query(Bioentity).all()])
    goid_to_go = dict([(int(x.go_id[3:]), x) for x in nex_session.query(Go).all()])

    #LSP
    for feature in bud_session.query(Feature).all():
        paragraph_feats = feature.paragraph_feats
        if len(paragraph_feats) > 0 and feature.id in id_to_bioentity:
            paragraph_feats.sort(key=lambda x: x.order)
            paragraph_html, paragraph_text = clean_paragraph(id_to_bioentity[feature.id],
                                                             '<p>' + ('</p><p>'.join([x.paragraph.text for x in paragraph_feats])) + '</p>',
                                                             str([x.paragraph.id for x in paragraph_feats]),
                                                             sgdid_to_reference, sgdid_to_bioentity, goid_to_go)

            #Use the most recent date_edited among this feature's paragraphs.
            date_edited = None
            year = 0
            month = 0
            day = 0
            for paragraph_feat in paragraph_feats:
                my_date = paragraph_feat.paragraph.date_edited
                this_date = str(my_date).split(' ')[0].replace('-0', '-').split('-')
                this_year = int(this_date[0])
                this_month = int(this_date[1])
                this_day = int(this_date[2])
                if date_edited is None or datetime(this_year, this_month, this_day) > datetime(year, month, day):
                    date_edited = my_date
                    year = this_year
                    month = this_month
                    day = this_day

            yield {
                'bioentity': id_to_bioentity[feature.id],
                'source': key_to_source['SGD'],
                'text': paragraph_text,
                'html': paragraph_html,
                'date_edited': date_edited,
                'date_created': paragraph_feats[0].paragraph.date_created,
                'created_by': paragraph_feats[0].paragraph.created_by,
                'category': 'LSP'
            }

    bioentity_key_to_date = dict()

    #Go
    for gofeature in bud_session.query(GoFeature).all():
        bioentity_key = (gofeature.feature.name, 'LOCUS')
        if gofeature.annotation_type == 'manually curated' and bioentity_key not in bioentity_key_to_date:
            bioentity_key_to_date[bioentity_key] = gofeature.date_last_reviewed

    for bioentity_key, date_last_reviewed in bioentity_key_to_date.iteritems():
        if bioentity_key in key_to_bioentity:
            yield {
                'bioentity': key_to_bioentity[bioentity_key],
                'source': key_to_source['SGD'],
                'text': str(date_last_reviewed),
                'html': str(date_last_reviewed),
                'date_created': None,
                'created_by': None,
                'category': 'GODATE'
            }
        else:
            #print 'Bioentity not found: ' + str(bioentity_key)
            yield None

    for pieces in make_file_starter('src/sgd/convert/data/gp_information.559292_sgd')():
        if len(pieces) >= 10:
            sgdid = pieces[8]
            if sgdid.startswith('SGD:'):
                sgdid = sgdid[4:]
            go_annotation = [x[22:].strip() for x in pieces[9].split('|') if x.startswith('go_annotation_summary')]
            if len(go_annotation) == 1:
                if sgdid in sgdid_to_bioentity:
                    yield {
                        'bioentity': sgdid_to_bioentity[sgdid],
                        'source': key_to_source['SGD'],
                        'text': go_annotation[0],
                        'html': go_annotation[0],
                        'date_created': None,
                        'created_by': None,
                        'category': 'GO'
                    }
                else:
                    print 'Bioentity not found: ' + sgdid
                    yield None

    #Regulation
    file_names = ['src/sgd/convert/data/regulationSummaries',
                  'src/sgd/convert/data/15-8regulationSummaries.txt',
                  'src/sgd/convert/data/15-9regulationSummaries.txt',
                  'src/sgd/convert/data/15-10regulationSummaries.txt',
                  'src/sgd/convert/data/15-11regulationSummaries.txt',
                  'src/sgd/convert/data/16-1regulationSummaries.txt',
                  'src/sgd/convert/data/16-2regulationSummaries.txt',
                  'src/sgd/convert/data/16-3regulationSummaries.txt',
                  'src/sgd/convert/data/16-4regulationSummaries.txt',
                  'src/sgd/convert/data/16-5regulationSummaries.txt']
    for file_name in file_names:
        for row in make_file_starter(file_name)():
            bioentity_key = (row[0], 'LOCUS')
            if bioentity_key in key_to_bioentity:
                bioentity = key_to_bioentity[bioentity_key]
                yield {
                    'bioentity': bioentity,
                    'source': key_to_source['SGD'],
                    'text': row[2],
                    'html': link_gene_names(row[2], {bioentity.display_name, bioentity.format_name, bioentity.display_name + 'P', bioentity.format_name + 'P'}, nex_session),
                    'category': 'REGULATION'
                }
            else:
                #print 'Bioentity not found: ' + str(bioentity_key)
                yield None

    #Phenotype
    file_names = ['src/sgd/convert/data/PhenotypeSummaries032015.txt',
                  'src/sgd/convert/data/15-6phenoSummariesTyposFixed.txt',
                  'src/sgd/convert/data/15-7phenoSummaries.txt',
                  'src/sgd/convert/data/15-8phenoSummaries.txt',
                  'src/sgd/convert/data/15-9phenoSummaries.txt',
                  'src/sgd/convert/data/15-10phenoSummaries.txt',
                  'src/sgd/convert/data/15-11phenoSummaries.txt',
                  'src/sgd/convert/data/15-12phenoSummaries.txt',
                  'src/sgd/convert/data/16-1phenoSummaries.txt',
                  'src/sgd/convert/data/16-2phenoSummaries.txt',
                  'src/sgd/convert/data/16-3phenoSummaries.txt',
                  'src/sgd/convert/data/16-4phenoSummaries.txt',
                  'src/sgd/convert/data/16-5phenoSummaries.txt',
                  'src/sgd/convert/data/16-6phenoSummaries.txt',
                  'src/sgd/convert/data/16-7phenoSummaries.txt',
                  'src/sgd/convert/data/16-9phenoSummaries.txt',
                  'src/sgd/convert/data/16-10phenoSummaries.txt']
    for file_name in file_names:
        for row in make_file_starter(file_name)():
            bioentity_key = (row[0], 'LOCUS')
            if bioentity_key in key_to_bioentity:
                bioentity = key_to_bioentity[bioentity_key]
                yield {
                    'bioentity': bioentity,
                    'source': key_to_source['SGD'],
                    'text': row[1],
                    'html': link_gene_names(row[1], {bioentity.display_name, bioentity.format_name, bioentity.display_name + 'P', bioentity.format_name + 'P'}, nex_session),
                    'category': 'PHENOTYPE'
                }
            else:
                #print 'Bioentity not found: ' + str(bioentity_key)
                yield None

    bud_session.close()
    nex_session.close()
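# Note on the link_gene_names calls above: the second argument is the set of
# names to exclude from auto-linking, the locus's display_name and format_name
# plus 'P'-suffixed variants, presumably so a summary does not link mentions of
# its own gene or protein name (e.g. CDC28 / Cdc28p; the gene here is chosen
# purely for illustration).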
clean_up_orphans(nex_session_maker, DNAsequenceevidence, Evidence, 'DNASEQUENCE')

for sequence_filename, coding_sequence_filename, strain_key in new_sequence_files:
    do_conversion(make_new_dna_sequence_evidence_starter(nex_session_maker, strain_key, sequence_filename, coding_sequence_filename),
                  [Json2Obj(DNAsequenceevidence),
                   Obj2NexDB(nex_session_maker,
                             lambda x: x.query(DNAsequenceevidence).filter(DNAsequenceevidence.strain_id == strain_key_to_id[strain_key]).filter(DNAsequenceevidence.dna_type != '1KB'),
                             name='convert.from_bud.evidence.dnasequence',
                             delete_untouched=True,
                             commit_interval=1000)])

update_contig_centromeres(nex_session_maker)
update_contig_reference_alignment(nex_session_maker)

protparam_data = dict([(row[0], row) for row in make_file_starter('src/sgd/convert/data/ProtParam.txt')()])
for sequence_filename, strain_key in protein_sequence_files:
    do_conversion(make_protein_sequence_evidence_starter(nex_session_maker, strain_key, sequence_filename, protparam_data),
                  [Json2Obj(Proteinsequenceevidence),
                   Obj2NexDB(nex_session_maker,
                             lambda x: x.query(Proteinsequenceevidence).filter(Proteinsequenceevidence.strain_id == strain_key_to_id[strain_key]),
                             name='convert.from_bud.evidence.proteinsequence',
                             delete_untouched=True,
                             commit_interval=1000)])
clean_up_orphans(nex_session_maker, Proteinsequenceevidence, Evidence, 'PROTEINSEQUENCE')

do_conversion(make_kb_sequence_starter(nex_session_maker),
              [Json2Obj(DNAsequenceevidence),
               Obj2NexDB(nex_session_maker,
                         lambda x: x.query(DNAsequenceevidence).filter(DNAsequenceevidence.dna_type == '1KB'),
                         name='convert.from_bud.evidence.1kb_dnasequence',
                         delete_untouched=True,
                         commit_interval=1000)])