Example #1
def upload_file_helper(CREATED_BY, remote_file, obj, full_file_path):
    """ Upload a file to S3 and update the database with its s3_url.

    Parameters
    ----------
    CREATED_BY: str
    remote_file: file object
    obj: dict
    full_file_path: str

    """

    try:
        upload_file(CREATED_BY,
                    remote_file,
                    filename=obj['display_name'],
                    file_extension=obj['file_extension'],
                    description=obj['description'],
                    display_name=obj['display_name'],
                    data_id=obj['data_id'],
                    format_id=obj['format_id'],
                    status=obj['status'],
                    topic_id=obj['topic_id'],
                    is_public=obj['is_public'],
                    is_in_spell=obj['is_in_spell'],
                    is_in_browser=obj['is_in_browser'],
                    file_date=obj['file_date'],
                    readme_file_id=obj['readme_file_id'],
                    source_id=obj['source_id'],
                    full_file_path=full_file_path)
    except Exception:
        logging.error("Exception occurred", exc_info=True)
Example #2
def update_database_load_file_to_s3(nex_session, ontology_file, source_to_id,
                                    edam_to_id):

    gzip_file = ontology_file + ".gz"
    import gzip
    import shutil
    with open(ontology_file, 'rb') as f_in, gzip.open(gzip_file,
                                                      'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    local_file = open(gzip_file, mode='rb')

    import hashlib
    # hash the uncompressed file's contents (not its name) so the dedup
    # check below compares actual content
    with open(ontology_file, 'rb') as f:
        go_md5sum = hashlib.md5(f.read()).hexdigest()
    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        return

    nex_session.query(Dbentity).filter_by(display_name=gzip_file,
                                          dbentity_status='Active').update(
                                              {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3262')  ## format:3262 OWL/XML

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='Core Gene Ontology in OWL RDF/XML format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                is_public='0',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=go_md5sum)
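The compress-then-hash steps in the example above recur throughout this page; here is a small standalone sketch of that pattern, with md5_of_file and gzip_copy as illustrative helper names that do not appear in the original code. Hashing in chunks keeps memory flat for large files.

import gzip
import hashlib
import shutil

def md5_of_file(path, chunk_size=8192):
    # Stream the file in chunks so large files never sit fully in memory.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

def gzip_copy(src_path, dst_path):
    # Same binary-read -> gzip-write copy used in the example above.
    with open(src_path, 'rb') as f_in, gzip.open(dst_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)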
Example #3
def update_database_load_file_to_s3(nex_session, data_file, sgd_source_id,
                                    edam_to_id):

    # open in binary mode: hashlib.md5() requires bytes, not str
    local_file = open(data_file, mode='rb')

    import hashlib
    dx_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file reads from the start
    dx_row = nex_session.query(Filedbentity).filter_by(
        md5sum=dx_md5sum).one_or_none()

    if dx_row is not None:
        return

    log.info("Uploading the file to S3...")

    data_file = data_file.split('/').pop()

    nex_session.query(Dbentity).filter_by(display_name=data_file,
                                          dbentity_status='Active').update(
                                              {"dbentity_status": 'Archived'})
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2872')  ## data:2872 ID list
    topic_id = edam_to_id.get(
        'EDAM:3345')  ## topic:3345 Data identity and mapping
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(
        CREATED_BY,
        local_file,
        filename=data_file,
        file_extension='txt',
        description='subset of NCBI gene2accession file for taxon ID 559292',
        display_name=data_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        is_public='0',
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=sgd_source_id)
Example #4
def create_and_upload_file(obj, row_num):
    try:
        # find on local system
        if DATA_DIR:
            remote_file_path = DATA_DIR + '/' + obj['bun_path']
        else:
            remote_file_path = obj['bun_path']
        remote_file_path = remote_file_path + obj['display_name']
        # open in binary mode for byte-accurate size checks and S3 upload
        remote_file = open(remote_file_path, mode='rb')
    except IOError:
        logging.error('error opening file ' + remote_file_path + ' in row ' +
                      str(row_num))
        traceback.print_exc()
        return
    try:
        temp_engine = create_engine(NEX2_URI)
        session_factory = sessionmaker(bind=temp_engine,
                                       extension=ZopeTransactionExtension(),
                                       expire_on_commit=False)
        db_session = scoped_session(session_factory)
        # get README location
        readme_file_id = None
        if len(obj['readme_name']):
            readme = db_session.query(Filedbentity).filter(
                Filedbentity.display_name == obj['readme_name']).one_or_none()
            if readme is None:
                logging.warning('unable to find README ' + obj['readme_name'])
            else:
                readme_file_id = readme.dbentity_id

        # see if already exists, if not create
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()

        source_id = db_session.query(Source.source_id).filter(
            Source.display_name == obj['source']).one_or_none()[0]

        if not existing:
            try:
                data_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['data_edam_id']).one_or_none()[0]
                format_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['format_edam_id']).one_or_none()[0]
                topic_id = db_session.query(Edam.edam_id).filter(
                    Edam.edamid == obj['topic_edam_id']).one_or_none()[0]
            except TypeError:
                logging.error('invalid EDAM id or source in row ' +
                              str(row_num) + ' val in ' + obj['data_edam_id'] +
                              ', ' + obj['format_edam_id'] + ', ' +
                              obj['topic_edam_id'])
                return

            print("remote_file=", remote_file)

            upload_file(CREATED_BY,
                        remote_file,
                        filename=obj['display_name'],
                        file_extension=obj['file_extension'],
                        description=obj['description'].replace('"', ''),
                        display_name=obj['display_name'],
                        data_id=data_id,
                        format_id=format_id,
                        status=obj['status'],
                        topic_id=topic_id,
                        is_public=obj['is_public'],
                        is_in_spell=obj['is_in_spell'],
                        is_in_browser=obj['is_in_browser'],
                        file_date=obj['file_date'],
                        readme_file_id=readme_file_id,
                        source_id=source_id)
            db_session.flush()
        else:
            existing.display_name = obj['display_name']
            existing.description = obj['description']
            existing.status = obj['status']
            existing.is_public = obj['is_public']
            existing.is_in_spell = obj['is_in_spell']
            existing.is_in_browser = obj['is_in_browser']
            existing.source_id = source_id
            # update file size
            if not existing.file_size and existing.s3_url:
                remote_file.seek(0, os.SEEK_END)
                file_size = remote_file.tell()
                remote_file.seek(0)
                existing.file_size = file_size
            if obj['file_date']:
                existing.file_date = obj['file_date']
                existing.year = obj['file_date'].year
            existing.readme_file_id = readme_file_id
            remote_file.seek(0)  # rewind so the S3 upload below starts at the beginning
            transaction.commit()
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            # only upload s3 file if not defined
            if existing.s3_url is None:
                existing.upload_file_to_s3(remote_file, obj['display_name'])
            db_session.flush()
        # add path entries
        existing = db_session.query(Filedbentity).filter(
            Filedbentity.display_name == obj['display_name']).one_or_none()
        if not existing:
            logging.error('error with ' + obj['display_name'] + ' in row ' +
                          str(row_num))
            return
        path = db_session.query(Path).filter_by(
            path=obj['new_path']).one_or_none()
        if path is None:
            logging.warning('Could not find path ' + obj['new_path'] +
                            ' in row ' + str(row_num))
            return
        existing_filepath = db_session.query(FilePath).filter(
            and_(FilePath.file_id == existing.dbentity_id,
                 FilePath.path_id == path.path_id)).one_or_none()
        if not existing_filepath:
            new_filepath = FilePath(file_id=existing.dbentity_id,
                                    path_id=path.path_id,
                                    source_id=SGD_SOURCE_ID,
                                    created_by=CREATED_BY)
            db_session.add(new_filepath)
            transaction.commit()
            db_session.flush()
        # maybe add PMIDs
        if len(obj['pmids']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            pmids = obj['pmids'].split('|')
            for x in pmids:
                x = int(x.strip())
                existing_ref_file = db_session.query(ReferenceFile).filter(
                    ReferenceFile.file_id ==
                    existing.dbentity_id).one_or_none()
                ref = db_session.query(Referencedbentity).filter(
                    Referencedbentity.pmid == x).one_or_none()
                if ref and not existing_ref_file:
                    new_ref_file = ReferenceFile(created_by=CREATED_BY,
                                                 file_id=existing.dbentity_id,
                                                 reference_id=ref.dbentity_id,
                                                 source_id=SGD_SOURCE_ID)
                    db_session.add(new_ref_file)
                transaction.commit()
                db_session.flush()
        # maybe add keywords
        if len(obj['keywords']):
            existing = db_session.query(Filedbentity).filter(
                Filedbentity.display_name ==
                obj['display_name']).one_or_none()
            keywords = obj['keywords'].split('|')
            for x in keywords:
                x = x.strip()
                keyword = db_session.query(Keyword).filter(
                    Keyword.display_name == x).one_or_none()
                if keyword is None:
                    logging.warning('unable to find keyword ' + x +
                                    ' in row ' + str(row_num))
                    continue
                existing_file_keyword = db_session.query(FileKeyword).filter(
                    and_(FileKeyword.file_id == existing.dbentity_id,
                         FileKeyword.keyword_id ==
                         keyword.keyword_id)).one_or_none()
                if not existing_file_keyword:
                    new_file_keyword = FileKeyword(
                        created_by=CREATED_BY,
                        file_id=existing.dbentity_id,
                        keyword_id=keyword.keyword_id,
                        source_id=SGD_SOURCE_ID)
                    db_session.add(new_file_keyword)
                transaction.commit()
                db_session.flush()
        remote_file.close()
        logging.info('finished ' + obj['display_name'] + ', line ' +
                     str(row_num))
    except Exception:
        logging.error('error with ' + obj['display_name'] + ' in row ' +
                      str(row_num))
        traceback.print_exc()
        db_session.rollback()
        db_session.close()
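For reference, a hypothetical spreadsheet row for create_and_upload_file, showing the dict shape it expects; every value here is a placeholder.

row = {
    'bun_path': 'datasets/',                 # placeholder bundle path
    'display_name': 'example_file.txt',
    'file_extension': 'txt',
    'description': 'example description',
    'status': 'Active',
    'is_public': '1',
    'is_in_spell': '0',
    'is_in_browser': '0',
    'file_date': datetime.now(),
    'readme_name': '',                       # empty: no README lookup
    'source': 'SGD',
    'data_edam_id': 'EDAM:2048',             # placeholder EDAM ids
    'format_edam_id': 'EDAM:3475',
    'topic_edam_id': 'EDAM:0085',
    'new_path': '/reports/function',
    'pmids': '',                             # '|'-separated when present
    'keywords': '',                          # '|'-separated when present
}
create_and_upload_file(row, 1)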
Example #5
def update_database_load_file_to_s3(nex_session, gzip_file, source_to_id,
                                    edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    file_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file reads from the start
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=file_md5sum).one_or_none()

    if row is not None:
        return

    if "tbl" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_tbl_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "sqn" in gzip_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_sqn_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    else:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('ncbi_gbf_files.%.tar.gz')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')

    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')  ## data:3671 Text
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3507')  ## format:3507 Document format

    if "tbl" in gzip_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    # readme = nex_session.query(Dbentity).filter_by(display_name="ncbi_tab_files.README", dbentity_status='Active').one_or_none()
    # if readme is None:
    #    log.info("ncbi_tbl_files.README is not in the database.")
    #    return
    # readme_file_id = readme.dbentity_id

    readme_file_id = None

    # path.path = /reports/function

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='All yeast features in tbl file format',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=file_md5sum)

    file = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if file is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = file.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
Example #6
def update_database_load_file_to_s3(nex_session, gaf_file, is_public,
                                    source_to_id, edam_to_id, datestamp):

    # gene_association.sgd.20171204.gz
    # gene_association.sgd-yeastmine.20171204.gz

    # datestamp = str(datetime.now()).split(" ")[0].replace("-", "")
    gzip_file = gaf_file + "." + datestamp + ".gz"
    import gzip
    import shutil
    with open(gaf_file, 'rb') as f_in, gzip.open(gzip_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # open in binary mode: the file is gzipped and hashlib.md5() requires bytes
    local_file = open(gzip_file, mode='rb')

    import hashlib
    gaf_md5sum = hashlib.md5(local_file.read()).hexdigest()
    local_file.seek(0)  # rewind so upload_file reads from the start
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gaf_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    # nex_session.query(Dbentity).filter_by(display_name=gzip_file, dbentity_status='Active').update({"dbentity_status": 'Archived'})

    if is_public == 1:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gene_association.sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2048')  ## data:2048 Report
    topic_id = edam_to_id.get('EDAM:0085')  ## topic:0085 Functional genomics
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    if "yeastmine" not in gaf_file:
        from sqlalchemy import create_engine
        from src.models import DBSession
        engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
        DBSession.configure(bind=engine)

    readme = nex_session.query(Dbentity).filter_by(
        display_name="gene_association.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("gene_association.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/function

    upload_file(
        CREATED_BY,
        local_file,
        filename=gzip_file,
        file_extension='gz',
        description='All GO annotations for yeast genes (protein and RNA) in GAF file format',
        display_name=gzip_file,
        data_id=data_id,
        format_id=format_id,
        topic_id=topic_id,
        status='Active',
        readme_file_id=readme_file_id,
        is_public=is_public,
        is_in_spell='0',
        is_in_browser='0',
        file_date=datetime.now(),
        source_id=source_to_id['SGD'])

    gaf = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()
    if gaf is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gaf.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/function").one_or_none()
    if path is None:
        log.info("The path /reports/function is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()
Example #7
def update_database_load_file_to_s3(nex_session, gpad_file, gpi_file,
                                    source_to_id, edam_to_id):

    import hashlib

    gpad_local_file = open(gpad_file, mode='rb')
    gpi_local_file = open(gpi_file, mode='rb')

    # hash file contents (not the path strings) so the dedup checks
    # below compare actual content, then rewind for upload
    gpad_md5sum = hashlib.md5(gpad_local_file.read()).hexdigest()
    gpi_md5sum = hashlib.md5(gpi_local_file.read()).hexdigest()
    gpad_local_file.seek(0)
    gpi_local_file.seek(0)

    gpad_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpad_md5sum).one_or_none()
    gpi_row = nex_session.query(Filedbentity).filter_by(
        md5sum=gpi_md5sum).one_or_none()

    if gpad_row is not None and gpi_row is not None:
        return

    if gpad_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpad_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()
    if gpi_row is None:
        nex_session.query(Dbentity).filter_by(
            display_name=gpi_file,
            dbentity_status='Active').update({"dbentity_status": 'Archived'})
        nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if gpad_row is None:
        upload_file(CREATED_BY,
                    gpad_local_file,
                    filename=gpad_file,
                    file_extension='gz',
                    description='Gene Product Association Data (GPAD)',
                    display_name=gpad_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpad_md5sum)

    if gpi_row is None:
        upload_file(CREATED_BY,
                    gpi_local_file,
                    filename=gpi_file,
                    file_extension='gz',
                    description='Gene Product Information (GPI)',
                    display_name=gpi_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='0',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=gpi_md5sum)
Example #8
def update_database_load_file_to_s3(nex_session, gff_file, gzip_file, source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')
    
    ### upload a current GFF file to S3 with a static URL for Go Community ###
    upload_gff_to_s3(local_file, "latest/saccharomyces_cerevisiae.gff.gz")
    ##########################################################################

    import hashlib
    # hash the uncompressed GFF's contents (not a file name) so the
    # dedup check below compares actual content
    with open(gff_file, 'rb') as f:
        gff_md5sum = hashlib.md5(f.read()).hexdigest()
    local_file.seek(0)  # rewind after the static-URL upload above
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/curation/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('saccharomyces_cerevisiae.%.gff.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3671')   ## data:3671    Text
    topic_id = edam_to_id.get('EDAM:3068')  ## topic:3068   Literature and language
    format_id = edam_to_id.get('EDAM:3507') ## format:3507  Document format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)
    
    readme = nex_session.query(Dbentity).filter_by(
        display_name="saccharomyces_cerevisiae_gff.README",
        dbentity_status='Active').one_or_none()
    if readme is None:
        log.info("saccharomyces_cerevisiae_gff.README is not in the database.")
        return
    readme_file_id = readme.dbentity_id

    # path.path = /reports/chromosomal-features

    upload_file(CREATED_BY, local_file,
                filename=gzip_file,
                file_extension='gz',
                description='GFF file for yeast genes (protein and RNA)',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=readme_file_id,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    gff = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if gff is None:
        log.info("The " + gzip_file + " is not in the database.")
        return
    file_id = gff.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info("The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + gff_file)
Example #9
def update_database_load_file_to_s3(nex_session, go_file, source_to_id,
                                    edam_to_id, ENGINE_CREATED):

    import hashlib

    desc = "Gene Product Association Data (GPAD)"
    if "gp_information" in go_file:
        desc = "Gene Product Information (GPI)"

    go_local_file = open(go_file, mode='rb')

    # hash the file's contents (not its name) so the dedup check below
    # compares actual content, then rewind for upload
    go_md5sum = hashlib.md5(go_local_file.read()).hexdigest()
    go_local_file.seek(0)

    go_row = nex_session.query(Filedbentity).filter_by(
        md5sum=go_md5sum).one_or_none()

    if go_row is not None:
        log.info("The current version of " + go_file +
                 " is already in the database.\n")
        return

    log.info("Adding " + go_file + " to the database.\n")

    if "gp_association" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_association.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "gp_information" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('gp_information.559292_sgd%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    elif "noctua_sgd.gpad" in go_file:
        nex_session.query(Dbentity).filter(
            Dbentity.display_name.like('noctua_sgd.gpad%')).filter(
                Dbentity.dbentity_status == 'Active').update(
                    {"dbentity_status": 'Archived'},
                    synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:2353')  ## data:2353 Ontology data
    topic_id = edam_to_id.get(
        'EDAM:0089')  ## topic:0089 Ontology and terminology
    format_id = edam_to_id.get('EDAM:3475')  ## format:3475 TSV

    # if ENGINE_CREATED == 0:
    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    if go_row is None:
        upload_file(CREATED_BY,
                    go_local_file,
                    filename=go_file,
                    file_extension='gz',
                    description=desc,
                    display_name=go_file,
                    data_id=data_id,
                    format_id=format_id,
                    topic_id=topic_id,
                    status='Active',
                    is_public='1',
                    is_in_spell='0',
                    is_in_browser='0',
                    file_date=datetime.now(),
                    source_id=source_to_id['SGD'],
                    md5sum=go_md5sum)
Example #10
def update_database_load_file_to_s3(nex_session, data_file, gzip_file,
                                    source_to_id, edam_to_id):

    local_file = open(gzip_file, mode='rb')

    import hashlib
    # hash the uncompressed file's contents (not a file name) so the
    # dedup check below compares actual content
    with open(data_file, 'rb') as f:
        gff_md5sum = hashlib.md5(f.read()).hexdigest()
    row = nex_session.query(Filedbentity).filter_by(
        md5sum=gff_md5sum).one_or_none()

    if row is not None:
        return

    gzip_file = gzip_file.replace("scripts/dumping/ncbi/data/", "")

    nex_session.query(Dbentity).filter(
        Dbentity.display_name.like('RNAcentral.%.json.gz')).filter(
            Dbentity.dbentity_status == 'Active').update(
                {"dbentity_status": 'Archived'}, synchronize_session='fetch')
    nex_session.commit()

    data_id = edam_to_id.get('EDAM:3495')  # data:3495    RNA sequence
    topic_id = edam_to_id.get('EDAM:0099')  # topic:0099   RNA
    format_id = edam_to_id.get('EDAM:3464')  # format:3464  JSON format

    from sqlalchemy import create_engine
    from src.models import DBSession
    engine = create_engine(os.environ['NEX2_URI'], pool_recycle=3600)
    DBSession.configure(bind=engine)

    upload_file(CREATED_BY,
                local_file,
                filename=gzip_file,
                file_extension='gz',
                description='JSON file for yeast RNA genes',
                display_name=gzip_file,
                data_id=data_id,
                format_id=format_id,
                topic_id=topic_id,
                status='Active',
                readme_file_id=None,
                is_public='1',
                is_in_spell='0',
                is_in_browser='0',
                file_date=datetime.now(),
                source_id=source_to_id['SGD'],
                md5sum=gff_md5sum)

    rnaFile = nex_session.query(Dbentity).filter_by(
        display_name=gzip_file, dbentity_status='Active').one_or_none()

    if rnaFile is None:
        log.info("The " + gzip_file + " is not in the database.")
        return

    file_id = rnaFile.dbentity_id

    path = nex_session.query(Path).filter_by(
        path="/reports/chromosomal-features").one_or_none()
    if path is None:
        log.info(
            "The path: /reports/chromosomal-features is not in the database.")
        return
    path_id = path.path_id

    x = FilePath(file_id=file_id,
                 path_id=path_id,
                 source_id=source_to_id['SGD'],
                 created_by=CREATED_BY)

    nex_session.add(x)
    nex_session.commit()

    log.info("Done uploading " + data_file)