Ejemplo n.º 1
0
def main(no_confirm=True):
    """Download the latest UCSC source files and track progress in src_dump.

    Exits the process (sys.exit(0)) when no newer files are available.
    `no_confirm`: when True, run without asking for user confirmation.
    """
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if not download_list:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # record that the download has started
    doc = {
        '_id': 'ucsc',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': latest_lastmodified,
        'logfile': logfile,
        'status': 'downloading',
    }
    src_dump.save(doc)
    started_at = time.time()
    download(download_list, no_confirm)
    # record that the download finished successfully
    src_dump.update({'_id': 'ucsc'}, {'$set': {
        'status': 'success',
        'time': timesofar(started_at),
        'pending_to_upload': True,  # a flag to trigger data uploading
    }})
Ejemplo n.º 2
0
def main(no_confirm=True):
    """Download the latest UCSC source files and track progress in src_dump.

    Exits the process (sys.exit(0)) when no newer files are available.
    `no_confirm`: when True, run without asking for user confirmation.
    """
    src_dump = get_src_dump()
    download_list = get_file_list_for_download()
    if len(download_list) == 0:
        logging.info("No newer file found. Abort now.")
        sys.exit(0)

    doc = src_dump.find_one({'_id': 'ucsc'})
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'ucsc_dump.log')
    setup_logfile(logfile)

    # mark the download starts
    # NOTE(review): `timestamp` and `latest_lastmodified` are not defined in
    # this block -- presumably module-level globals set elsewhere; verify.
    doc = {'_id': 'ucsc',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': latest_lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(download_list, no_confirm)
    # mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ucsc'}, {'$set': _updates})
Ejemplo n.º 3
0
def redo_parse_gbff(path):
    '''Call this function manually to re-start the parsing step and set src_dump.

       This is used when main() broke at the parsing step; after fixing the
       problem, call this to re-run parsing only (no re-download happens).
       `path`: folder containing the already-downloaded gbff files.
    '''
    #mark the download starts
    src_dump = get_src_dump()

    t0 = time.time()
    # NOTE(review): no download is performed here, so t_download is ~0; the
    # 'download' time recorded below does not reflect the original download.
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Ejemplo n.º 4
0
def redo_parse_gbff(path):
    """Manually re-run the Entrez gbff parsing step and update src_dump.

    Use this when main() broke during parsing: after fixing the problem,
    call this to restart parsing only (no re-download happens).
    `path`: folder containing the already-downloaded gbff files.
    """
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)  # no download is re-run, so this is ~0
    t1 = time.time()
    # record that parsing has started
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    # record that everything finished successfully
    timings = {
        'download': t_download,
        'parsing': t_parsing,
        'total': t_total,
    }
    src_dump.update({'_id': 'entrez'}, {'$set': {
        'status': 'success',
        'time': timings,
        'pending_to_upload': True,  # a flag to trigger data uploading
    }})
Ejemplo n.º 5
0
 def get_src_version(self):
     """Return {source _id: version} for every dumped source.

     The version is the record's 'release' field, falling back to its
     'timestamp'; records with neither (or a falsy value) are skipped.
     """
     versions = {}
     collection = get_src_dump(self.src.client)
     for record in collection.find():
         release = record.get('release', record.get('timestamp', None))
         if release:
             versions[record['_id']] = release
     return versions
Ejemplo n.º 6
0
 def get_src_version(self):
     """Return {source _id: version} for every dumped source.

     The version is the record's 'release' field, falling back to its
     'timestamp'; records with neither (or a falsy value) are skipped.
     """
     src_dump = get_src_dump(self.src.client)
     src_version = {}
     for src in src_dump.find():
         version = src.get('release', src.get('timestamp', None))
         if version:
             src_version[src['_id']] = version
     return src_version
Ejemplo n.º 7
0
 def prepare_src_dump(self):
     """Load (creating on first use) this source's src_dump record.

     Sets self.src_doc and returns the src_dump collection.
     """
     collection = get_src_dump()
     query = {'_id': self.main_source}
     self.src_doc = collection.find_one(query)
     if not self.src_doc:
         # no previous dump: initialize a bare record so later updates work
         collection.save({"_id": self.main_source})
         self.src_doc = collection.find_one(query)
     return collection
Ejemplo n.º 8
0
def check_refseq_release():
    """Exit the process when the latest RefSeq release is already downloaded.

    Compares the remote release against the 'refseq' src_dump record and,
    if the recorded release is current and its data file exists on disk,
    logs a message and calls sys.exit(0). Otherwise returns None.
    """
    refseq_release = get_refseq_release()
    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        # NOTE(review): release number "109" is hard-coded in this filename;
        # presumably it should track the recorded release -- verify.
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)
Ejemplo n.º 9
0
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.

       Downloads the latest Ensembl BioMart dump into a release-specific
       folder and tracks progress in the src_dump collection. Exits early
       (sys.exit(0)) when the latest mart release is already downloaded.
    '''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    # skip the whole run if this release was already downloaded
    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        # presumably setup_logfile() redirected stdout to the logfile and this
        # flushes/closes it even on failure -- TODO confirm.
        # NOTE(review): on failure the src_dump status stays 'downloading';
        # no 'failed' status is ever recorded.
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Ejemplo n.º 10
0
 def poll(self):
     """Register a recurring cron job that uploads every resource flagged
     "pending_to_upload" in the src_dump collection.

     Raises ManagerError when self.poll_schedule is not set.
     """
     if not self.poll_schedule:
         raise ManagerError("poll_schedule is not defined")
     src_dump = get_src_dump()
     # NOTE(review): @asyncio.coroutine is deprecated in modern Python
     # (use `async def` instead) -- kept as-is here.
     @asyncio.coroutine
     def check_pending_to_upload():
         # _id is expected to be a user-defined string, not an ObjectId()
         sources = [src['_id'] for src in src_dump.find({'pending_to_upload': True}) if type(src['_id']) == str]
         logging.info("Found %d resources to upload (%s)" % (len(sources),repr(sources)))
         for src_name in sources:
             logging.info("Launch upload for '%s'" % src_name)
             try:
                 self.upload_src(src_name)
             except ResourceNotFound:
                 logging.error("Resource '%s' needs upload but is not registered in manager" % src_name)
     # start=True schedules the job immediately on the manager's event loop
     cron = aiocron.crontab(self.poll_schedule,func=partial(check_pending_to_upload),
             start=True, loop=self.job_manager.loop)
Ejemplo n.º 11
0
def main_cron():
    """Download the latest RefSeq release and track progress in src_dump.

    Exits early (sys.exit(0)) when the latest release is already downloaded
    and its data file is present on disk.
    """
    no_confirm = True   # set it to True for running this script automatically without intervention.

    refseq_release = get_refseq_release()
    # BUGFIX: logging.info() does not accept print()'s `end` keyword -- the
    # original `logging.info("...", end='')` raised TypeError at runtime.
    # Log the label and value in one record instead.
    logging.info("Checking latest refseq release:\t%s", refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        # NOTE(review): release number "109" is hard-coded in this filename;
        # presumably it should track `refseq_release` -- verify.
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'refseq_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        # presumably setup_logfile() redirected stdout to the logfile and this
        # flushes/closes it even on failure -- TODO confirm
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Ejemplo n.º 12
0
def main():
    """Download the Entrez source files, parse them, and track the whole run
    (download + parsing timings, status flags) in the src_dump collection."""
    no_confirm = True  # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit()

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    src_dump = get_src_dump()
    # NOTE(review): `timestamp` is not defined in this block -- presumably a
    # module-level global set elsewhere; verify.
    doc = {
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True  # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Ejemplo n.º 13
0
 def version(self):
     """Return the data version backing self.target_collection.

     For a source collection (DATA_SRC_DATABASE) this is the 'release' of
     its main source's src_dump record; for a merged collection
     (DATA_TARGET_DATABASE) it is the build's '_meta.build_version'.
     Returns None when the information cannot be resolved.
     """
     import biothings.utils.mongo as mongo
     if self.target_collection.database.name == btconfig.DATA_SRC_DATABASE:
         fulln = mongo.get_source_fullname(self.target_collection.name)
         if not fulln:
             return
         mainsrc = fulln.split(".")[0]
         col = mongo.get_src_dump()
         src = col.find_one({"_id": mainsrc})
         # BUGFIX: find_one() returns None when no record exists, which made
         # src.get() raise AttributeError; guard like the build branch below.
         if not src:
             return
         return src.get("release")
     elif self.target_collection.database.name == btconfig.DATA_TARGET_DATABASE:
         col = mongo.get_src_build()
         tgt = col.find_one({"_id": self.target_collection.name})
         if not tgt:
             return
         return tgt.get("_meta", {}).get("build_version")
     else:
         return None
Ejemplo n.º 14
0
 def __init__(self, log_dir=None, date=None, dry_run=False):
     """Set up logging, the Wikidata login, and mondo source metadata.

     log_dir: directory for log files (defaults to the current working dir).
     date: run-date string; defaults to today's date joined as year+month+day
           (note: month/day are not zero-padded).
     dry_run: stored on the instance; presumably suppresses writes -- verify.
     """
     self.log_dir = log_dir if log_dir else os.getcwd()
     d = datetime.now()
     self.date = date if date else "".join(map(str, [d.year, d.month, d.day]))
     self.dry_run = dry_run
     # log into Wikidata using module-level credentials
     self.login_instance = PBB_login.WDLogin(user=WDUSER, pwd=WDPASS)
     self.fast_run_base_filter = {self.DOID_PROP: ''}
     self.info_log_path = None
     self.exc_log_path = None
     self.reference = None
     self.setup_logging()
     self.collection = get_src_db().mondo
     src_dump = get_src_dump()
     # mondo dump metadata; may be absent -> fall back to an empty dict
     src_doc = src_dump.find_one({'_id': 'mondo'}) or {}
     # when the dump started; falls back to "now" if it was never recorded
     self.retrieved = src_doc.get("download", {}).get("started_at", False) or datetime.now()
     # pin the reference URL to the dumped release (default branch: master)
     self.ref_url = "https://github.com/monarch-initiative/monarch-disease-ontology/raw/{}/src/mondo/mondo.obo".format(
         src_doc.get("release", "master"))
     self.create_reference()
Ejemplo n.º 15
0
 def register_status(self,src_name,status,**extra):
     """Record the overall upload status for a resource in src_dump.

     When status is "uploading", the upload record is reset (jobs cleared,
     "pending_to_upload" flag removed); otherwise individual fields are
     merged under "upload." so existing information is preserved.
     """
     src_dump = get_src_dump()
     info = {'status': status}
     info.update(extra)
     if status == "uploading":
         info["jobs"] = {}
         # a fresh upload run: clear the "need upload" flag first
         src_dump.update_one({"_id" : src_name},{"$unset" : {"pending_to_upload":None}})
         src_dump.update_one({"_id" : src_name},{"$set" : {"upload" : info}})
     else:
         # merge field-by-field so previously recorded info is kept
         merged = {"upload.%s" % field: value for field, value in info.items()}
         src_dump.update_one({"_id" : src_name},{"$set" : merged})
Ejemplo n.º 16
0
def main(no_confirm=True):
    """Download the latest ExAC source file and track progress in src_dump.

    Exits the process (sys.exit(0)) when no newer file is available.
    `no_confirm`: when True, run without asking for user confirmation.
    """
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'exac'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        # only the first configured data file is checked for existence
        path, filename = os.path.split(DATAFILES_PATH[0])
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'exac_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    # NOTE(review): `timestamp` is not defined in this block -- presumably a
    # module-level global set elsewhere; verify.
    doc = {
        '_id': 'exac',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'lastmodified': lastmodified,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'exac'}, {'$set': _updates})
Ejemplo n.º 17
0
def main():
    """Dump the Entrez source files, parse them, and track the whole run
    (download + parsing timings, status flags) in the src_dump collection."""
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if os.path.exists(DATA_FOLDER):
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm
                or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()
    else:
        os.makedirs(DATA_FOLDER)

    logfile = os.path.join(DATA_FOLDER, 'entrez_dump.log')
    setup_logfile(logfile)

    # record that the download has started
    src_dump = get_src_dump()
    src_dump.save({
        '_id': 'entrez',
        'timestamp': timestamp,
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading',
    })
    download_start = time.time()
    download(DATA_FOLDER, no_confirm=no_confirm)
    t_download = timesofar(download_start)
    parse_start = time.time()
    # record that parsing has started
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(DATA_FOLDER)
    t_parsing = timesofar(parse_start)
    t_total = timesofar(download_start)

    # record that everything finished successfully
    src_dump.update({'_id': 'entrez'}, {'$set': {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total,
        },
        'pending_to_upload': True,  # a flag to trigger data uploading
    }})
Ejemplo n.º 18
0
def main(no_confirm=True):
    """Download the latest UniProt source file and track progress in src_dump.

    Exits the process (sys.exit(0)) when no newer file is available.
    `no_confirm`: when True, run without asking for user confirmation.
    """
    src_dump = get_src_dump()
    lastmodified = check_lastmodified()
    doc = src_dump.find_one({'_id': 'uniprot'})
    if doc and 'lastmodified' in doc and lastmodified <= doc['lastmodified']:
        path, filename = os.path.split(DATAFILE_PATH)
        data_file = os.path.join(doc['data_folder'], filename)
        if os.path.exists(data_file):
            logging.info("No newer file found. Abort now.")
            sys.exit(0)

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'uniprot_dump.log')
    setup_logfile(logfile)

    #mark the download starts
    # NOTE(review): `timestamp` is not defined in this block -- presumably a
    # module-level global set elsewhere; verify.
    doc = {'_id': 'uniprot',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'lastmodified': lastmodified,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    download(no_confirm)
    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'uniprot'}, {'$set': _updates})
Ejemplo n.º 19
0
 def prepare_src_dump(self):
     """Sync with src_dump collection, collection information (src_doc).

     Sets self.src_doc (None when no record exists for self.main_source)
     and returns the src_dump collection."""
     src_dump = get_src_dump()
     self.src_doc = src_dump.find_one({'_id': self.main_source})
     return src_dump
Ejemplo n.º 20
0
    * with "-d" parameter, it will continue monitoring,
      without "-d", it will quit after all running jobs are done.

'''
from subprocess import Popen, STDOUT, check_output
import time
from datetime import datetime
import sys
import os.path
from biothings.utils.mongo import get_src_dump
from biothings.utils.common import safewfile, timesofar

# make the project's parent folder importable (two levels up from this file)
src_path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0]
sys.path.append(src_path)

# module-level handle on the src_dump collection, shared by the helpers below
src_dump = get_src_dump()


def check_mongo():
    '''Return the _ids of sources flagged "pending_to_upload" in src_dump.

       Only string _ids are kept: a source record's _id is a user-defined
       string, while stray records may carry an ObjectId().
    '''
    pending = src_dump.find({'pending_to_upload': True})
    return [record['_id'] for record in pending if type(record['_id']) == str]


def dispatch(src):
    src_doc = src_dump.find_one({'_id': src})
    datadump_logfile = src_doc.get('logfile', '')
    if datadump_logfile:
        upload_logfile = os.path.join(os.path.split(datadump_logfile)[0], '{}_upload.log'.format(src))
Ejemplo n.º 21
0
 def prepare_src_dump(self):
     """Fetch this source's src_dump record (empty dict when absent)."""
     # Mongo side
     self.src_dump = get_src_dump()
     found = self.src_dump.find_one({'_id': self.src_name})
     self.src_doc = found or {}
Ejemplo n.º 22
0
    * with "-d" parameter, it will continue monitoring,
      without "-d", it will quit after all running jobs are done.

'''
from subprocess import Popen, STDOUT, check_output
import time
from datetime import datetime
import sys
import os.path
from biothings.utils.mongo import get_src_dump
from biothings.utils.common import safewfile, timesofar

# make the project's parent folder importable (two levels up from this file)
src_path = os.path.split(os.path.split(os.path.abspath(__file__))[0])[0]
sys.path.append(src_path)

# module-level handle on the src_dump collection, shared by the helpers below
src_dump = get_src_dump()


def check_mongo():
    '''Check for the "pending_to_upload" flag in the src_dump collection
       and return the list of source _ids that should be uploaded.
    '''
    # filter some more: _id is supposed to be a user-defined string, not an ObjectId()
    return [
        src['_id'] for src in src_dump.find({'pending_to_upload': True})
        if type(src['_id']) == str
    ]


def dispatch(src):
    src_doc = src_dump.find_one({'_id': src})
Ejemplo n.º 23
0
def main_cron(no_confirm=True):
    '''set no_confirm to True for running this script automatically
       without intervention.

       Downloads the latest Ensembl BioMart dump into a release-specific
       folder and tracks progress in the src_dump collection. Exits early
       (sys.exit(0)) when the latest mart release is already downloaded.
    '''

    src_dump = get_src_dump()
    mart_version = chk_latest_mart_version()
    logging.info("Checking latest mart_version:\t%s" % mart_version)

    # skip the whole run if this release was already downloaded
    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'],
                                 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            logging.info("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        # folder exists from a previous run: require confirmation unless empty
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                       DATA_FOLDER) == 'Y'):
            sys.exit(0)

    logfile = os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version)
    setup_logfile(logfile)

    #mark the download starts
    doc = {
        '_id': 'ensembl',
        'release': mart_version,
        'timestamp': time.strftime('%Y%m%d'),
        'data_folder': DATA_FOLDER,
        'logfile': logfile,
        'status': 'downloading'
    }
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(
            os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(
            os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(
            os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        # presumably setup_logfile() redirected stdout to the logfile and this
        # flushes/closes it even on failure -- TODO confirm.
        # NOTE(review): on failure the src_dump status stays 'downloading';
        # no 'failed' status is ever recorded.
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True  # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})