def load_tsv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    f = open(INPUT_FILE_NAME)
    i = 0
    for line in f:
        val = line.split("\t")
        if val[0] == 'bun path':
            continue
        if len(val) > 0:
            i = i + 1
            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                raw_date = datetime.strptime(val[13], '%Y-%m-%d')
            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if new_path.startswith("datasets/"):
                new_path = "/datasets"
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC',
                                                        'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT',
                                                          'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i)
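For reference, the index arithmetic above implies a fixed TSV layout. The mapping below is inferred from the code itself, not from a documented schema; unreferenced columns are unknown:

# Column layout inferred from the index accesses in load_tsv_filedbentities()
TSV_COLUMNS = {
    0: 'bun path (legacy download-server path)',
    1: 'new path',
    3: 'display name',
    4: "status ('Archive' is normalized to 'Archived')",
    5: 'source',
    7: 'EDAM topic id',
    9: 'EDAM data id',
    11: 'EDAM format id',
    12: 'file extension',
    13: 'file date (YYYY-MM-DD)',
    15: 'is public (0/1)',
    16: 'is in SPELL (0/1)',
    17: 'is in browser (0/1)',
    18: 'README file name',
    19: 'description',
    20: 'PMIDs',
    21: 'keywords',
}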
Example #2
def update_database_load_file_to_s3(nex_session, ontology_file, source_to_id,
                                    edam_to_id):

    gzip_file = ontology_file + ".gz"
    import gzip
    import shutil
    with open(ontology_file, 'rb') as f_in, gzip.open(gzip_file,
                                                      'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    import hashlib
    # hash the file contents (not the file name) so re-runs detect changes
    with open(ontology_file, 'rb') as f:
        go_md5sum = hashlib.md5(f.read()).hexdigest()
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        return

    nex_session.query(Dbentity).filter_by(display_name=gzip_file,
                                          dbentity_status='Active').update(
                                              {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3262')  ## format:3262 OWL/XML

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='Core Gene Ontology in OWL RDF/XML format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                is_public='0',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=go_md5sum)
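Each of these loaders re-reads an entire file into memory just to compute an md5 digest. A chunked helper along these lines (hypothetical; not part of the original module) yields the same hex digest without holding large archives in memory:

import hashlib

def file_md5sum(path, chunk_size=8192):
    # Stream the file through md5 in fixed-size chunks; equivalent to
    # hashlib.md5(open(path, 'rb').read()).hexdigest() for any file size.
    digest = hashlib.md5()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()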
Example #3
def update_database_load_file_to_s3(nex_session, data_file, sgd_source_id,
                                    edam_to_id):

    local_file = open(data_file, mode='rb')

    import hashlib
    dx_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file() below can re-read the handle
    dx_row = nex_session.query(Filedbentity).filter_by(
        md5sum=dx_md5sum).one_or_none()

    if dx_row is not None:
        return

    log.info("Uploading the file to S3...")

    data_file = data_file.split('/').pop()

    nex_session.query(Dbentity).filter_by(display_name=data_file,
                                          dbentity_status='Active').update(
                                              {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2872')  ## data:2872 ID list
    topic_id = edam_to_id.get(
        'EDAM:3345')  ## topic:3345 Data identity and mapping
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(
        CREATED_BY,
        local_file,
        filename=data_file,
        file_extension='txt',
        description='subset of NCBI gene2accession file for taxon ID 559292',
        display_name=data_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        is_public='0',
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=sgd_source_id)
def load_csv_disease_dbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, newline='')
    reader = csv.reader(o, delimiter='\t')
    for i, val in enumerate(reader):
        if val[0] == '':
            logging.info('Found a blank value, DONE!')
            return
        obj = {
            'sgdid': val[0].strip().replace("SGD:", ""),
            'symbol': val[1].strip(),
            'summary': val[2].strip()
        }
        upload_db(obj, i)
def load_csv_disease_dbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, newline='')
    reader = csv.reader(o, delimiter='\t')
    for i, val in enumerate(reader):
        if i > 0:
            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return
            obj = {
                'taxon': val[0].strip().replace("taxon:", "TAX:"),
                'sgdid': val[2].strip().replace("SGD:", ""),
                'symbol': val[3].strip(),
                'association': val[8].strip(),
                'doid': val[10].strip(),
                'hgnc': val[11].strip(),
                'evidence_codes': val[16],
                'pmid': val[18].strip().replace("PMID:", ""),
                'date_assigned': val[19].strip(),
                'source': val[20]
            }
            upload_db(obj, i)
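As with the file loaders, the column positions above imply the input layout. This mapping is inferred from the index accesses, not from a documented header:

# Disease-association column layout inferred from load_csv_disease_dbentities()
DAF_COLUMNS = {
    0: 'taxon (prefixed taxon:)',
    2: 'SGDID (prefixed SGD:)',
    3: 'gene symbol',
    8: 'association type',
    10: 'DOID',
    11: 'HGNC id',
    16: 'evidence codes',
    18: 'PMID (prefixed PMID:)',
    19: 'date assigned',
    20: 'source',
}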
Example #6
from src.models import (DBSession, Base, Colleague, ColleagueLocus, Dbentity,
                        Locusdbentity, LocusUrl, LocusAlias,
                        Dnasequenceannotation, So, Locussummary,
                        Phenotypeannotation, PhenotypeannotationCond,
                        Phenotype, Goannotation, Go, Goslimannotation, Goslim,
                        Apo, Straindbentity, Strainsummary, Reservedname,
                        GoAlias, Referencedbentity, Referencedocument,
                        Referenceauthor, ReferenceAlias, Chebi)
from sqlalchemy import create_engine, and_, inspect
import os
import json
import re
import time
import sys
from random import randint
#from pycallgraph import PyCallGraph
#from pycallgraph.output import GraphvizOutput
from datetime import datetime
from threading import Thread
import concurrent.futures

engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
DBSession.configure(bind=engine)
Base.metadata.bind = engine


# populate a text file with SGD IDs to be used to retrieve PANTHER data
def get_sgdids_for_panther():
    new_data = Locusdbentity.get_s288c_genes()
    temp = []
    for loc in new_data:
        temp.append(loc.sgdid)
    result = json.dumps(temp, ensure_ascii=False)
    with open('./scripts/bgi_json/data_dump/sgd_ids_for_panther.txt',
              'w+') as res_file:
        res_file.write(
            result.replace('"', '').replace('[', '').replace(']', ''))

from src.models import (
    DBSession,
    Base,
    Reservedname,
    GoAlias,
    Goannotation,
    Referencedbentity,
    Referencedocument,
    Referenceauthor,
    ReferenceAlias,
)
from sqlalchemy import create_engine, and_
from elasticsearch import Elasticsearch
from mapping import mapping
import os
import requests

engine = create_engine(os.environ["NEX2_URI"], pool_recycle=3600)
DBSession.configure(bind=engine)
Base.metadata.bind = engine

INDEX_NAME = "searchable_items_aws"
DOC_TYPE = "searchable_item"
es = Elasticsearch(os.environ["ES_URI"], retry_on_timeout=True)


def delete_mapping():
    print("Deleting mapping...")
    response = requests.delete(os.environ["ES_URI"] + INDEX_NAME + "/")
    if response.status_code != 200:
        print("ERROR: " + str(response.json()))
    else:
        print("SUCCESS")
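delete_mapping() only tears the index down. A companion to recreate it is not shown in the source; a sketch under the assumption of a pre-7.x Elasticsearch (consistent with the DOC_TYPE constant above) might look like:

def create_mapping():
    # Recreate the index with the imported mapping definition.
    print("Creating mapping...")
    response = requests.put(os.environ["ES_URI"] + INDEX_NAME + "/",
                            json={"mappings": {DOC_TYPE: mapping}})
    if response.status_code != 200:
        print("ERROR: " + str(response.json()))
    else:
        print("SUCCESS")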
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id,
                                    edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file() below can re-read the handle
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()

    if row is not None:
        return

    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')

    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')  ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
    # if readme is None:
    #    log.info("ncbi_tbl_files.README is not in the database.")
    #    return
    # readme_file_id = readme.dbentity_id

    readme_file_id = None

    # path.path = /reports/function

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All yeast features in tbl file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    file = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if file is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = file.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
    def setUp(self):
        self.engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.remove()
        DBSession.configure(bind=self.engine)

        Base.metadata.create_all(self.engine)
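A matching tearDown is not shown in this excerpt; a minimal sketch to keep test runs isolated would be:

    def tearDown(self):
        # Drop the tables created in setUp() and release the scoped session
        # so state does not leak between tests.
        DBSession.remove()
        Base.metadata.drop_all(self.engine)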
def update_database_load_file_to_s3(nex_session, data_file, gzip_file,
                                    source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    # hash the file contents (not the file name) so re-runs detect changes
    gff_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind for the upload below
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')  # data:3495    RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')  # topic:0099   RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464  JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return

    file_id = rnaFile.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + data_file)
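Note that this variant expects gzip_file to exist already; data_file only appears in the final log line. A minimal sketch for producing the archive, mirroring the gzip/shutil pattern from Example #2:

import gzip
import shutil

def gzip_copy(src_path, dest_path):
    # Compress src_path into dest_path in streaming fashion.
    with open(src_path, 'rb') as f_in, gzip.open(dest_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)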
Example #11
def update_database_load_file_to_s3(nex_session, gaf_file, is_public,
                                    source_to_id, edam_to_id, datestamp):

    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz

    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file() below can re-read the handle
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})

    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')  ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="gene_association.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("gene_association.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/function

    upload_file(
        CREATED_BY,
        local_file,
        filename=gzip_file,
        file_extension='gz',
        description=
        'All GO annotations for yeast genes (protein and RNA) in GAF file format',
        display_name=gzip_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        readme_file_id=readme_file_id,
        is_public=is_public,
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=source_to_id['SGD'])

    gaf = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gaf is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gaf.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
def upload_all_filedbentities():
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)
    files = DBSession.query(Filedbentity).all()
    for x in files:
        print(x.get_path(), x.topic.display_name)
Example #13
def update_database_load_file_to_s3(nex_session, gpad_file, gpi_file,
                                    source_to_id, edam_to_id):

    import hashlib

    gpad_local_file = open(gpad_file, mode='rb')
    gpi_local_file = open(gpi_file, mode='rb')

    # hash the file contents (not the file names) so re-runs detect changes
    gpad_md5sum = hashlib.md5(gpad_local_file.read()).hexdigest()
    gpad_local_file.seek(0)
    gpi_md5sum = hashlib.md5(gpi_local_file.read()).hexdigest()
    gpi_local_file.seek(0)

    gpad_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpad_md5sum).one_or_none()
    gpi_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpi_md5sum).one_or_none()

    if gpad_row is not None and gpi_row is not None:
        return

    if gpad_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpad_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()
    if gpi_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpi_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if gpad_row is None:
        upload_file(CREATED_BY,
                    gpad_local_file,
                    filename=gpad_file,
                    file_extension='gz',
                    description='Gene Product Association Data (GPAD)',
                    display_name=gpad_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpad_md5sum)

    if gpi_row is None:
        upload_file(CREATED_BY,
                    gpi_local_file,
                    filename=gpi_file,
                    file_extension='gz',
                    description='Gene Product Information (GPI)',
                    display_name=gpi_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpi_md5sum)
Example #14
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    o = open(INPUT_FILE_NAME, newline='')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:

            ### added by Shuai
            if len(val) == 0:
                continue

            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return

            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')

            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC',
                                                        'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT',
                                                          'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i)
def load_csv_filedbentities():
    engine = create_engine(NEX2_URI, pool_recycle=3600)
    DBSession.configure(bind=engine)

    # open ssh connection to download server
    client = paramiko.SSHClient()
    client.load_system_host_keys()
    username = input('Username for legacy download server: ')
    password = getpass.getpass('Password for %s@%s: ' % (username, HOSTNAME))
    client.connect(HOSTNAME,
                   22,
                   username,
                   password,
                   gss_auth=False,
                   gss_kex=False)
    sftp_client = client.open_sftp()

    o = open(INPUT_FILE_NAME, newline='')
    reader = csv.reader(o)
    for i, val in enumerate(reader):
        if i > 0:

            ### added by Shuai
            if len(val) == 0:
                continue

            if val[0] == '':
                logging.info('Found a blank value, DONE!')
                return

            ### added by Shuai
            if len(val) < 14:
                print(val)
                return
            ###
            raw_date = val[13]
            if len(raw_date):
                temp = format_csv_date_string(val[13])
                if temp is not None:
                    raw_date = datetime.strptime(temp, '%Y-%m-%d')
                else:
                    raw_date = datetime.strptime(val[13], '%Y-%m-%d')

            else:
                raw_date = None
            raw_status = val[4].strip()
            if raw_status == 'Archive':
                raw_status = 'Archived'

            bun_path = val[0].strip()
            new_path = val[1].strip()
            if bun_path[0] != '/':
                bun_path = bun_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            if new_path[0] != '/':
                new_path = new_path.replace('genome-sequences/',
                                            '/genome-sequences/')
            readme_file = val[18]
            obj = {
                'bun_path': bun_path,
                'new_path': new_path,
                'display_name': val[3].strip(),
                'status': raw_status,
                'source': val[5].strip(),
                'topic_edam_id': val[7].upper().replace('TOPIC',
                                                        'EDAM').strip(),
                'data_edam_id': val[9].upper().replace('DATA', 'EDAM').strip(),
                'format_edam_id': val[11].upper().replace('FORMAT',
                                                          'EDAM').strip(),
                'file_extension': val[12].strip(),
                'file_date': raw_date,
                'is_public': (val[15] == '1'),
                'is_in_spell': (val[16] == '1'),
                'is_in_browser': (val[17] == '1'),
                'readme_name': readme_file,
                'description': val[19].replace('"', ''),
                'pmids': val[20],
                'keywords': val[21].replace('"', '')
            }
            create_and_upload_file(obj, i, sftp_client)
    client.close()
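One caveat in the SSH setup above: with only load_system_host_keys(), connect() raises when the server's key is absent from known_hosts. If that trade-off is acceptable, paramiko can auto-add unknown keys (a policy choice this script does not make):

client.set_missing_host_key_policy(paramiko.AutoAddPolicy())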
from src.helpers import upload_file, update_readme_files_with_urls, add_keywords
from src.models import DBSession, Base, Edam, Filedbentity, FileKeyword, FilePath, Keyword, Path, Referencedbentity, ReferenceFile, Source
from sqlalchemy import create_engine, and_
from sqlalchemy.orm import sessionmaker, scoped_session
from zope.sqlalchemy import ZopeTransactionExtension
import os
import transaction
import traceback
import time
import pandas as pd
from operator import itemgetter

from src.aws_helpers import get_zip_files, get_sra_files, get_readme_files, get_file_from_path_collection, multi_part_upload_s3, simple_s3_upload

engine = create_engine(os.environ["NEX2_URI"], pool_recycle=3600)
DBSession.configure(bind=engine, autoflush=False)
Base.metadata.bind = engine

NEX2_URI = os.environ.get('NEX2_URI')
CREATED_BY = os.environ.get('CREATED_BY')
SGD_SOURCE_ID = 834
INPUT_FILE_NAME = os.environ.get('INPUT_FILE_NAME')
LOCAL_FILE_DIRECTORY = os.environ.get('LOCAL_FILE_DIRECTORY')

S3_BUCKET = os.environ['S3_BUCKET']
S3_ACCESS_KEY = os.environ['S3_ACCESS_KEY']
S3_SECRET_KEY = os.environ['S3_SECRET_KEY']
MISSING_FILES = os.environ.get('MISSING_FILES', None)

log_filename = os.environ.get("SCRIPT_LOG_FILE")
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
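The log handle used throughout these examples (log.info(...)) is not configured in this excerpt. A minimal setup consuming the SCRIPT_LOG_FILE path computed above might look like:

import logging
logging.basicConfig(filename=log_filename,
                    level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
log = logging.getLogger(__name__)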
Example #17
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')
    
    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    local_file.seek(0)  # rewind in case upload_gff_to_s3() consumed the handle
    gff_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind again for the upload below
    row = nex_session.query(Filedbentity).filter_by(md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(Dbentity.dbentity_status=='Active').update({"dbentity_status":'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')   ## data:3671    Text
    topic_id = edam_to_id.get('EDAM:3068')  ## topic:3068   Literature and language
    format_id = edam_to_id.get('EDAM:3507') ## format:3507  Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)
    
    readme = nex_session.query(Dbentity).filter_by(display_name="saccharomyces_cerevisiae_gff.README", dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').one_or_none()

    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id

    path = nex_session.query(Path).filter_by(path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gff_file)
Example #18
def update_database_load_file_to_s3(nex_session, go_file, source_to_id,
                                    edam_to_id, ENGINE_CREATED):

    import hashlib

    desc = "Gene Product Association Data (GPAD)"
    if "gp_information" in go_file:
        desc = "Gene Product Information (GPI)"

    go_local_file = open(go_file, mode='rb')

    # hash the file contents (not the file name) so re-runs detect changes
    go_md5sum = hashlib.md5(go_local_file.read()).hexdigest()
    go_local_file.seek(0)  # rewind so upload_file() below can re-read the handle

    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        log.info("The current version of " + go_file +
                 " is already in the database.\n")
        return

    log.info("Adding " + go_file + " to the database.\n")

    if "gp_association" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_association.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "gp_information" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_information.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "noctua_sgd.gpad" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('noctua_sgd.gpad%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    # if ENGINE_CREATED == 0:
    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if go_row is None:
        upload_file(CREATED_BY,
                    go_local_file,
                    filename=go_file,
                    file_extension='gz',
                    description=desc,
                    display_name=go_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=go_md5sum)