def load_tsv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    f = open(INPUT_FILE_NAME)
    i = 0
    for line in f:
        val = line.split("\t")
        if val[0] == 'bun path':
            continue
        if len(val) > 0:
            i = i + 1
            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                raw_date = datetime.strptime(val[13], '%Y-%m-%d')
            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'
            bun_path = val[0].strip()
            new_path = val[1].strip()
            if new_path.startswith("datasets/"):
                new_path = "/datasets"
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC', 'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT', 'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                # lines read from a text-mode file are already str in Python 3,
                # so the former bytes.decode('utf-8', 'ignore') call is dropped
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i)
def update_database_load_file_to_s3(nex_session, ontology_file, source_to_id,
                                    edam_to_id):
    gzip_file = ontology_file + ".gz"
    import gzip
    import shutil
    with open(ontology_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    import hashlib
    # NOTE: this hashes the file *name*, not the gzip contents
    go_md5sum = hashlib.md5(ontology_file.encode()).hexdigest()
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        return

    nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').update(
            {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3262')  ## format:3262 OWL/XML

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='Core Gene Ontology in OWL RDF/XML format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                is_public='0',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=go_md5sum)
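# The function above fingerprints uploads by hashing the file *name*
# (ontology_file.encode()), so changed content under an unchanged name is
# skipped as a duplicate. A content-based alternative is sketched below as a
# hypothetical helper (not part of the original module); it hashes in chunks
# so large gzips need not fit in memory.
def content_md5sum(path, chunk_size=8192):
    import hashlib
    md5 = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()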
def update_database_load_file_to_s3(nex_session, data_file, sgd_source_id,
                                    edam_to_id):
    # open in binary mode: hashlib.md5 requires bytes
    local_file = open(data_file, mode='rb')

    import hashlib
    dx_md5sum = hashlib.md5(local_file.read()).hexdigest()
    dx_row = nex_session.query(Filedbentity).filter_by(
        md5sum=dx_md5sum).one_or_none()

    if dx_row is not None:
        return

    log.info("Uploading the file to S3...")

    data_file = data_file.split('/').pop()

    nex_session.query(Dbentity).filter_by(
        display_name=data_file, dbentity_status='Active').update(
            {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2872')    ## data:2872 ID list
    topic_id = edam_to_id.get('EDAM:3345')   ## topic:3345 Data identity and mapping
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=data_file,
                file_extension='txt',
                description='subset of NCBI gene2accession file for taxon ID 559292',
                display_name=data_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                is_public='0',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=sgd_source_id)
def load_csv_disease_dbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o, delimiter='\t')
    for i, val in enumerate(reader):
        if i >= 0:
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return
            obj = {
                'sgdid': val[0].strip().replace("SGD:", ""),
                'symbol': val[1].strip(),
                'summary': val[2].strip()
            }
            upload_db(obj, i)
def load_csv_disease_dbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o, delimiter='\t')
    for i, val in enumerate(reader):
        if i > 0:
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return
            obj = {
                'taxon': val[0].strip().replace("taxon:", "TAX:"),
                'sgdid': val[2].strip().replace("SGD:", ""),
                'symbol': val[3].strip(),
                'association': val[8].strip(),
                'doid': val[10].strip(),
                'hgnc': val[11].strip(),
                'evidence_codes': val[16],
                'pmid': val[18].strip().replace("PMID:", ""),
                'date_assigned': val[19].strip(),
                'source': val[20]
            }
            upload_db(obj, i)
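# A minimal driver sketch (assumed wiring, not part of the original script)
# for the loader above; NEX2_URI and INPUT_FILE_NAME are expected to be
# module-level constants, as in the other loaders in this collection.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    load_csv_disease_dbentities()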
from src.models import DBSession, Base, Colleague, ColleagueLocus, Dbentity, \
    Locusdbentity, LocusUrl, LocusAlias, Dnasequenceannotation, So, \
    Locussummary, Phenotypeannotation, PhenotypeannotationCond, Phenotype, \
    Goannotation, Go, Goslimannotation, Goslim, Apo, Straindbentity, \
    Strainsummary, Reservedname, GoAlias, Referencedbentity, \
    Referencedocument, Referenceauthor, ReferenceAlias, Chebi
from sqlalchemy import create_engine, and_, inspect
import os
import json
import re
import time
import sys
from random import randint
#from pycallgraph import PyCallGraph
#from pycallgraph.output import GraphvizOutput
from datetime import datetime
from threading import Thread
import concurrent.futures

engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
DBSession.configure(bind=engine)
Base.metadata.bind = engine


# populate text file with sgdids to be used to retrieve panther data
def get_sgdids_for_panther():
    new_data = Locusdbentity.get_s288c_genes()
    temp = []
    for loc in new_data:
        temp.append(loc.sgdid)
    result = json.dumps(temp, ensure_ascii=False)
    with open('./scripts/bgi_json/data_dump/sgd_ids_for_panther.txt',
              'w+') as res_file:
        res_file.write(
            result.replace('"', '').replace('[', '').replace(']', ''))
from src.models import (
    DBSession,  # the head of this import list is cut off in the source;
    Base,       # DBSession and Base are assumed present since both are used below
    Reservedname,
    GoAlias,
    Goannotation,
    Referencedbentity,
    Referencedocument,
    Referenceauthor,
    ReferenceAlias,
)
from sqlalchemy import create_engine, and_
from elasticsearch import Elasticsearch
from mapping import mapping
import os
import requests

engine = create_engine(os.environ["NEX2_URI"], pool_recycle=3600)
DBSession.configure(bind=engine)
Base.metadata.bind = engine

INDEX_NAME = "searchable_items_aws"
DOC_TYPE = "searchable_item"
es = Elasticsearch(os.environ["ES_URI"], retry_on_timeout=True)


def delete_mapping():
    print("Deleting mapping...")
    response = requests.delete(os.environ["ES_URI"] + INDEX_NAME + "/")
    if response.status_code != 200:
        print("ERROR: " + str(response.json()))
    else:
        print("SUCCESS")
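# Hedged sketch of the companion step that would recreate the index after
# delete_mapping(); it assumes `mapping` (imported above) is the complete
# index settings/mappings payload. The original excerpt does not show this
# function, so treat it as an illustration of the same requests-based style.
def create_mapping():
    print("Creating mapping...")
    response = requests.put(os.environ["ES_URI"] + INDEX_NAME + "/",
                            json=mapping)
    if response.status_code != 200:
        print("ERROR: " + str(response.json()))
    else:
        print("SUCCESS")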
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id,
                                    edam_to_id):
    local_file = open(gzip_file, mode='rb')

    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()

    if row is not None:
        return

    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

        # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
        # if readme is None:
        #     log.info("ncbi_tbl_files.README is not in the database.")
        #     return
        # readme_file_id = readme.dbentity_id
        readme_file_id = None

        # path.path = /reports/function
        upload_file(CREATED_BY, local_file,
                    filename=gzip_file,
                    file_extension='gz',
                    description='All yeast features in tbl file format',
                    display_name=gzip_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    readme_file_id=readme_file_id,
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=file_md5sum)

        file = nex_session.query(Dbentity).filter_by(
            display_name=gzip_file, dbentity_status='Active').one_or_none()
        if file is None:
            log.info("The " + gzip_file + " is not in the database.")
            return
        file_id = file.dbentity_id

        path = nex_session.query(Path).filter_by(
            path="/reports/function").one_or_none()
        if path is None:
            log.info("The path /reports/function is not in the database.")
            return
        path_id = path.path_id

        x = FilePath(file_id=file_id,
                     path_id=path_id,
                     source_id=source_to_id['SGD'],
                     created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()
def setUp(self):
    self.engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.remove()
    DBSession.configure(bind=self.engine)
    Base.metadata.create_all(self.engine)
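# A matching tearDown sketch (assumed; not shown in the original excerpt) so
# each test run starts from a clean schema and a fresh scoped session:
def tearDown(self):
    DBSession.remove()
    Base.metadata.drop_all(self.engine)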
def update_database_load_file_to_s3(nex_session, data_file, gzip_file,
                                    source_to_id, edam_to_id):
    local_file = open(gzip_file, mode='rb')

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')    # data:3495 RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')   # topic:0099 RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464 JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = rnaFile.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + data_file)
def update_database_load_file_to_s3(nex_session, gaf_file, is_public,
                                    source_to_id, edam_to_id, datestamp):
    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz
    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # open in binary mode: the file is a gzip and hashlib.md5 requires bytes
    local_file = open(gzip_file, mode='rb')

    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})
    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')    ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')   ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

        readme = nex_session.query(Dbentity).filter_by(
            display_name="gene_association.README",
            dbentity_status='Active').one_or_none()
        if readme is None:
            log.info("gene_association.README is not in the database.")
            return
        readme_file_id = readme.dbentity_id

        # path.path = /reports/function
        upload_file(CREATED_BY, local_file,
                    filename=gzip_file,
                    file_extension='gz',
                    description='All GO annotations for yeast genes (protein '
                                'and RNA) in GAF file format',
                    display_name=gzip_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    readme_file_id=readme_file_id,
                    is_public=is_public,
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'])

        gaf = nex_session.query(Dbentity).filter_by(
            display_name=gzip_file, dbentity_status='Active').one_or_none()
        if gaf is None:
            log.info("The " + gzip_file + " is not in the database.")
            return
        file_id = gaf.dbentity_id

        path = nex_session.query(Path).filter_by(
            path="/reports/function").one_or_none()
        if path is None:
            log.info("The path /reports/function is not in the database.")
            return
        path_id = path.path_id

        x = FilePath(file_id=file_id,
                     path_id=path_id,
                     source_id=source_to_id['SGD'],
                     created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()
def upload_all_filedbentities():
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)
    files = DBSession.query(Filedbentity).all()
    for x in files:
        print(x.get_path(), x.topic.display_name)
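# Minimal sketch of invoking the report above from a shell, assuming NEX2_URI
# is set in the environment; despite its name, the function as written only
# prints each file's S3 path and EDAM topic rather than uploading anything.
if __name__ == '__main__':
    upload_all_filedbentities()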
def update_database_load_file_to_s3(nex_session, gpad_file, gpi_file,
                                    source_to_id, edam_to_id):
    import hashlib
    gpad_local_file = open(gpad_file, mode='rb')
    gpi_local_file = open(gpi_file, mode='rb')
    # NOTE: these hash the file *names*, not the file contents
    gpad_md5sum = hashlib.md5(gpad_file.encode()).hexdigest()
    gpi_md5sum = hashlib.md5(gpi_file.encode()).hexdigest()
    gpad_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpad_md5sum).one_or_none()
    gpi_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpi_md5sum).one_or_none()

    if gpad_row is not None and gpi_row is not None:
        return

    if gpad_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpad_file, dbentity_status='Active').update(
                {"dbentity_status": 'Archived'})
        nex_session.commit()
    if gpi_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpi_file, dbentity_status='Active').update(
                {"dbentity_status": 'Archived'})
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if gpad_row is None:
        upload_file(CREATED_BY, gpad_local_file,
                    filename=gpad_file,
                    file_extension='.gz',
                    description='Gene Product Association Data (GPAD)',
                    display_name=gpad_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpad_md5sum)
    if gpi_row is None:
        upload_file(CREATED_BY, gpi_local_file,
                    filename=gpi_file,
                    file_extension='gz',
                    description='Gene Product Information (GPI)',
                    display_name=gpi_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpi_md5sum)
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:
            ### added by Shuai
            if len(val) == 0:
                continue
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return
            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')
            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'
            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC', 'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT', 'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                # csv.reader yields str in Python 3, so the former
                # bytes.decode('utf-8', 'ignore') call is dropped
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i)
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    # open ssh connection to download server
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    username = input('Username for legacy download server: ')
    password = getpass.getpass('Password for %s@%s: ' % (username, HOSTNAME))
    client.connect(HOSTNAME, 22, username, password, gss_auth=False,
                   gss_kex=False)
    sftp_client = client.open_sftp()

    o = open(INPUT_FILE_NAME, 'rU')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:
            ### added by Shuai
            if len(val) == 0:
                continue
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return
            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')
            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'
            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC', 'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT', 'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                # csv.reader yields str in Python 3, so the former
                # bytes.decode('utf-8', 'ignore') call is dropped
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i, sftp_client)
    # close the connection only after all rows are processed
    client.close()
import os  # required below for os.environ and os.makedirs
from src.helpers import upload_file, update_readme_files_with_urls, add_keywords
from src.models import DBSession, Base, Edam, Filedbentity, FileKeyword, \
    FilePath, Keyword, Path, Referencedbentity, ReferenceFile, Source
from sqlalchemy import create_engine, and_
from sqlalchemy.orm import sessionmaker, scoped_session
from zope.sqlalchemy import ZopeTransactionExtension
import transaction
import traceback
import pandas as pd
from operator import itemgetter
import time
from src.aws_helpers import get_zip_files, get_sra_files, get_readme_files, \
    get_file_from_path_collection, multi_part_upload_s3, simple_s3_upload

engine = create_engine(os.environ["NEX2_URI"], pool_recycle=3600)
DBSession.configure(bind=engine, autoflush=False)
Base.metadata.bind = engine

NEX2_URI = os.environ.get('NEX2_URI')
CREATED_BY = os.environ.get('CREATED_BY')
SGD_SOURCE_ID = 834
INPUT_FILE_NAME = os.environ.get('INPUT_FILE_NAME')
LOCAL_FILE_DIRECTORY = os.environ.get('LOCAL_FILE_DIRECTORY')
S3_BUCKET = os.environ['S3_BUCKET']
S3_ACCESS_KEY = os.environ['S3_ACCESS_KEY']
S3_SECRET_KEY = os.environ['S3_SECRET_KEY']
MISSING_FILES = os.environ.get('MISSING_FILES', None)

log_filename = os.environ.get("SCRIPT_LOG_FILE")
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
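# The log directory is created above; a minimal sketch of the logger setup
# that would typically follow. The format string and logger name are
# assumptions, not taken from the original module, though `log.info(...)` is
# the convention used throughout these scripts.
import logging
logging.basicConfig(filename=log_filename,
                    level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger(__name__)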
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file,
                                    source_to_id, edam_to_id):
    local_file = open(gzip_file, mode='rb')

    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    gff_md5sum = hashlib.md5(gzip_file.encode()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')    ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:3068')   ## topic:3068 Literature and language
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="saccharomyces_cerevisiae_gff.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features
    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)
    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gff_file)
def update_database_load_file_to_s3(nex_session, go_file, source_to_id,
                                    edam_to_id, ENGINE_CREATED):
    import hashlib

    desc = "Gene Product Association Data (GPAD)"
    if "gp_information" in go_file:
        desc = "Gene Product Information (GPI)"

    go_local_file = open(go_file, mode='rb')
    go_md5sum = hashlib.md5(go_file.encode()).hexdigest()
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        log.info("The current version of " + go_file +
                 " is already in the database.\n")
        return

    log.info("Adding " + go_file + " to the database.\n")

    if "gp_association" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_association.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    elif "gp_information" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_information.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    elif "noctua_sgd.gpad" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('noctua_sgd.gpad%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')    ## data:2353 Ontology data
    topic_id = edam_to_id.get('EDAM:0089')   ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    # if ENGINE_CREATED == 0:
    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if go_row is None:
        upload_file(CREATED_BY, go_local_file,
                    filename=go_file,
                    file_extension='.gz',
                    description=desc,
                    display_name=go_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=go_md5sum)
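# For reference, a sketch of how the source_to_id / edam_to_id lookups that
# the loaders above consume might be built. The exact attribute names
# (display_name, source_id, format_name, edam_id) are assumptions based on how
# the nex schema is used elsewhere in these scripts, not taken from this
# excerpt; Source and Edam are the models imported above.
def build_lookup_maps(nex_session):
    source_to_id = dict(
        (x.display_name, x.source_id) for x in nex_session.query(Source).all())
    edam_to_id = dict(
        (x.format_name, x.edam_id) for x in nex_session.query(Edam).all())
    return source_to_id, edam_to_id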