Example 1
    def doc_feeder(self, index_type=None, index_name=None, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
        conn = self.conn
        index_name = index_name or self.ES_INDEX_NAME
        doc_type = index_type or self.ES_INDEX_TYPE

        n = self.count(query=query)['count']
        cnt = 0
        t0 = time.time()
        if verbose:
            print('\ttotal docs: {}'.format(n))

        _kwargs = kwargs.copy()
        _kwargs.update(dict(size=step, index=index_name, doc_type=doc_type))
        res = helpers.scan(conn, query=query, scroll=scroll, **_kwargs)
        t1 = time.time()
        for doc in res:
            if verbose and cnt % step == 0:
                if cnt != 0:
                    print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
                print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
                t1 = time.time()
            yield doc
            cnt += 1
        if verbose:
            print('done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1)))
            print("Finished! [{}]".format(timesofar(t0)))
Example 2
def load_contig(contig):
    '''Save a CADD contig into the MongoDB "cadd" collection.
       fetch_generator(tabix, contig) provides the docs to insert.
    '''
    # if CADD_INPUT == "exome":
    # CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Example 3
def get_genome_in_bit(chr_fa_folder):
    ''' encode each chromosome fasta sequence into a bitarray,
        and store them in a dictionary with chr numbers as keys
        chr_fa_folder is the folder to put all gzipped fasta files:

        fasta files can be downloaded from NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
        chr<i>.fa.gz  (e.g. chr1.fa.gz)

    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()  # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('=' * 20)
    print("Finished. [{}]".format(timesofar(t0)))

    return chr_bit_d
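
`get_genome_in_bit` relies on helpers not shown here, notably `nuc_to_bit`. A plausible sketch, assuming a simple 2-bit-per-nucleotide encoding (the project's real encoding may differ, e.g. to handle ambiguous bases such as N):

from bitarray import bitarray

# Hypothetical 2-bit encoding table; an assumption, not the project's actual scheme.
_NUC_BITS = {'A': bitarray('00'), 'C': bitarray('01'),
             'G': bitarray('10'), 'T': bitarray('11')}

def nuc_to_bit(seq):
    # Encode a nucleotide string into a bitarray; unknown bases fall back to '00'.
    bits = bitarray()
    for nuc in seq.upper():
        bits += _NUC_BITS.get(nuc, bitarray('00'))
    return bits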
Example 4
def redo_parse_gbff(path):
    '''call this function manually to re-start the parsing step and set src_dump.
       This is used when main() is broken at parsing step, then parsing need to be re-started
       after the fix.
    '''
    #mark the download starts
    src_dump = get_src_dump()

    t0 = time.time()
    t_download = timesofar(t0)
    t1 = time.time()
    #mark parsing starts
    src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
    parse_gbff(path)
    t_parsing = timesofar(t1)
    t_total = timesofar(t0)

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example 5
def get_genome_in_bit(chr_fa_folder):
    ''' encode each chromosome fasta sequence into a bitarray,
        and store them in a dictionary with chr numbers as keys
        chr_fa_folder is the folder to put all gzipped fasta files:

        fasta files can be downloaded from NCBI FTP site:

        ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh37.p13/Primary_Assembly/assembled_chromosomes/FASTA/
        chr<i>.fa.gz  (e.g. chr1.fa.gz)

    '''
    chr_bit_d = {}
    chr_range = [str(i) for i in range(1, 23)] + ['X', 'Y', 'MT']
    t0 = time.time()
    for i in chr_range:
        t1 = time.time()
        #file_name = 'hs_ref_GRCh37.p5_chr{}.fa.gz'.format(i)
        file_name = 'chr{}.fa.gz'.format(i)
        print("Loading {}...".format(file_name), end='')
        file_name = os.path.join(chr_fa_folder, file_name)
        with open_anyfile(file_name) as seq_f:
            seq_f.readline()   # skip header
            seq_bit = bitarray()
            for line in seq_f:
                line = line.rstrip('\n')
                line_bit = nuc_to_bit(line)
                seq_bit += line_bit
            chr_bit_d.update({i: seq_bit})
        print("done.[{}]".format(timesofar(t1)))
    print('='*20)
    print("Finished. [{}]".format(timesofar(t0)))

    return chr_bit_d
Example 6
def load_contig(contig):
    '''Save a CADD contig into the MongoDB "cadd" collection.
       fetch_generator(tabix, contig) provides the docs to insert.
    '''
    # if CADD_INPUT == "exome":
    # CADD_INPUT = exome
    tabix = pysam.Tabixfile(whole_genome)
    src_db = get_src_db()
    target_coll = src_db["cadd"]
    t0 = time.time()
    cnt = 0
    docs = (doc for doc in fetch_generator(tabix, contig))
    doc_list = []
    for doc in docs:
        doc_list.append(doc)
        cnt += 1
        if len(doc_list) == 100:
            target_coll.insert(doc_list,
                               manipulate=False,
                               check_keys=False,
                               w=0)
            doc_list = []
        if cnt % 100000 == 0:
            print(cnt, timesofar(t0))
    if doc_list:
        target_coll.insert(doc_list, manipulate=False, check_keys=False, w=0)
    print("successfully loaded cadd chromosome %s into mongodb" % contig)
    print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
Example 7
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}})
       batch_callback is a callback function, called as fn(cnt, t) after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    src = get_src_db()
    if isinstance(collection, str):
        cur = src[collection].find(query, fields)
    else:
        cur = collection.find(query, fields)
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving {} documents from database "{}".'.format(n, collection))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping {} documents.".format(s))
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time()-t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            #Important: need to yield the last batch here
            yield doc_li

        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: {}]'.format(timesofar(t0)))
    finally:
        cur.close()
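
An illustrative way to consume this MongoDB doc_feeder in batch mode; the collection name, query and callback below are placeholders, not part of the original code:

def report_batch(cnt, t):
    # Hypothetical batch_callback: cnt docs processed so far, t seconds for the last batch.
    print("processed {} docs so far (last batch: {:.1f}s)".format(cnt, t))

for batch in doc_feeder('entrez_gene', step=5000, inbatch=True,
                        query={'taxid': 9606}, batch_callback=report_batch):
    for doc in batch:
        pass  # handle each document here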
Example 8
    def doc_feeder(self,
                   step=10000,
                   verbose=True,
                   query=None,
                   scroll='10m',
                   **kwargs):
        q = query if query else {'query': {'match_all': {}}}
        _q_cnt = self.count(q=q, raw=True)
        n = _q_cnt['count']
        n_shards = _q_cnt['_shards']['total']
        assert n_shards == _q_cnt['_shards']['successful']
        _size = int(step / n_shards)
        assert _size * n_shards == step
        cnt = 0
        t0 = time.time()
        if verbose:
            print('\ttotal docs: {}'.format(n))
            t1 = time.time()

        res = self._es.search(self._index,
                              self._doc_type,
                              body=q,
                              size=_size,
                              search_type='scan',
                              scroll=scroll,
                              **kwargs)
        # double check initial scroll request returns no hits
        assert len(res['hits']['hits']) == 0

        while 1:
            if verbose:
                t1 = time.time()
                if cnt < n:
                    print('\t{}-{}...'.format(cnt + 1, min(cnt + step, n)),
                          end='')
            res = self._es.scroll(res['_scroll_id'], scroll=scroll)
            if len(res['hits']['hits']) == 0:
                break
            else:
                for doc in res['hits']['hits']:
                    _doc = doc.get('_source', {})
                    # "_id" field is not stored by default
                    # so it may not be returned in _source
                    _doc.setdefault("_id", doc["_id"])
                    yield _doc
                    cnt += 1
                if verbose:
                    print('done.[%.1f%%,%s]' %
                          (min(cnt, n) * 100. / n, timesofar(t1)))

        if verbose:
            print("Finished! [{}]".format(timesofar(t0)))

        assert cnt == n, "Error: scroll query terminated early [{}, {}], please retry.\nLast response:\n{}".format(
            cnt, n, res)
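
A minimal sketch of driving this scan/scroll feeder, e.g. to dump an index to newline-delimited JSON; `esi` stands for any indexer instance exposing the doc_feeder method above, and the output path is an assumption:

import json

def dump_index(esi, outfile='index_dump.ndjson'):
    # Illustrative only: streams every doc yielded by doc_feeder into a file.
    with open(outfile, 'w') as out:
        for doc in esi.doc_feeder(step=10000, verbose=True):
            out.write(json.dumps(doc) + '\n')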
Example 9
def doc_feeder(collection, step=1000, s=None, e=None, inbatch=False, query=None, batch_callback=None, fields=None):
    '''An iterator for returning docs in a collection, with batch query.
       An additional filter query can be passed via "query", e.g.,
       doc_feeder(collection, query={'taxid': {'$in': [9606, 10090, 10116]}})
       batch_callback is a callback function, called as fn(cnt, t) after every batch.
       fields is an optional parameter passed to find to restrict the fields to return.
    '''
    src = get_src_db()
    cur = src[collection].find(query, fields)
    n = cur.count()
    s = s or 0
    e = e or n
    print('Retrieving {} documents from database "{}".'.format(n, collection))
    t0 = time.time()
    if inbatch:
        doc_li = []
    cnt = 0
    t1 = time.time()
    try:
        if s:
            cur.skip(s)
            cnt = s
            print("Skipping {} documents.".format(s))
        if e:
            cur.limit(e - (s or 0))
        cur.batch_size(step)
        print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        for doc in cur:
            if inbatch:
                doc_li.append(doc)
            else:
                yield doc
            cnt += 1
            if cnt % step == 0:
                if inbatch:
                    yield doc_li
                    doc_li = []
                print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
                if batch_callback:
                    batch_callback(cnt, time.time()-t1)
                if cnt < e:
                    t1 = time.time()
                    print("Processing {}-{} documents...".format(cnt + 1, min(cnt + step, e)), end='')
        if inbatch and doc_li:
            #Important: need to yield the last batch here
            yield doc_li

        #print 'Done.[%s]' % timesofar(t1)
        print('Done.[%.1f%%,%s]' % (cnt * 100. / n, timesofar(t1)))
        print("=" * 20)
        print('Finished.[total time: {}]'.format(timesofar(t0)))
    finally:
        cur.close()
Example 10
    def main(self, index, collection, diff_filepath, validate=False, wait=60):
        self._index = index
        self._esi._index = index
        diff = loadobj(diff_filepath)
        source_collection = diff['source']
        add_list = self.add(source_collection, diff['add'])
        delete_list = self.delete(collection, diff['delete'])
        update_list = self.update(diff['update'])
        t00 = time()
        print('Adding new {} docs...'.format(len(diff['add'])))
        t0 = time()
        bulk(self._es, add_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Deleting {} docs'.format(len(diff['delete'])))
        t0 = time()
        bulk(self._es, delete_list)
        print("Done. [{}]".format(timesofar(t0)))
        print('Updating {} docs'.format(len(diff['update'])))
        t0 = time()
        bulk(self._es, update_list)
        print("Done. [{}]".format(timesofar(t0)))
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        if validate:
            print('Waiting {}s to let ES finish...'.format(wait), end="")
            sleep(wait)
            print("Done.")
            print("Validating...")
            t0 = time()
            q = {
                "query": {
                    "constant_score": {
                        "filter": {
                            "exists": {
                                "field": collection
                            }
                        }
                    }
                }
            }
            data = self._esi.doc_feeder(query=q, _source=collection)
            temp_collection = collection + '_temp_' + get_random_string()
            self._src[temp_collection].drop()
            load_source(temp_collection, src_data=data)
            c1 = get_backend(source_collection, 'mongodb')
            c2 = get_backend(temp_collection, 'mongodb')
            diff_result = diff_collections(c1, c2, use_parallel=False)
            self._src[temp_collection].drop()
            print("Done. [{}]".format(t0))
            return diff_result
Example 11
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print("Processing %d-%d documents..." % (i + 1, min(i + step, n)))
        _ids = id_list[i:i + step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print('Done.[%.1f%%,%s]' % (i * 100. / n, timesofar(t1)))
    print("=" * 20)
    print('Finished.[total time: %s]' % timesofar(t0))
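
A sketch of using `two_docs_iterator` to collect the ids whose documents differ between two backends; `b1`, `b2` and `id_list` are assumed to be prepared elsewhere (e.g. via `get_backend` and a shared id list):

def find_changed_ids(b1, b2, id_list):
    # Illustrative only: compares paired docs and records the ids that differ.
    changed = []
    for doc1, doc2 in two_docs_iterator(b1, b2, id_list):
        if doc1 != doc2:
            changed.append(doc1.get('_id'))
    return changed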
Example 12
 def main(self, index, collection, diff_filepath, validate=False, wait=60):
     self._index = index
     self._esi._index = index
     diff = loadobj(diff_filepath)
     source_collection = diff['source']
     add_list = self.add(source_collection, diff['add'])
     delete_list = self.delete(collection, diff['delete'])
     update_list = self.update(diff['update'])
     t00 = time()
     print('Adding new {} docs...'.format(len(diff['add'])))
     t0 = time()
     bulk(self._es, add_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Deleting {} docs'.format(len(diff['delete'])))
     t0 = time()
     bulk(self._es, delete_list)
     print("Done. [{}]".format(timesofar(t0)))
     print('Updating {} docs'.format(len(diff['update'])))
     t0 = time()
     bulk(self._es, update_list)
     print("Done. [{}]".format(timesofar(t0)))
     print("=" * 20)
     print("Finished! [{}]".format(timesofar(t00)))
     if validate:
         print('Waiting {}s to let ES finish...'.format(wait), end="")
         sleep(wait)
         print("Done.")
         print("Validating...")
         t0 = time()
         q = {
             "query": {
                 "constant_score": {
                     "filter": {
                         "exists": {
                             "field": collection
                         }
                     }
                 }
             }
         }
         data = self._esi.doc_feeder(query=q, _source=collection)
         temp_collection = collection + '_temp_' + get_random_string()
         self._src[temp_collection].drop()
         load_source(temp_collection, src_data=data)
         c1 = get_backend(source_collection, 'mongodb')
         c2 = get_backend(temp_collection, 'mongodb')
         diff_result = diff_collections(c1, c2, use_parallel=False)
         self._src[temp_collection].drop()
         print("Done. [{}]".format(t0))
         return diff_result
Example 13
def two_docs_iterator(b1, b2, id_list, step=10000):
    t0 = time.time()
    n = len(id_list)
    for i in range(0, n, step):
        t1 = time.time()
        print "Processing %d-%d documents..." % (i + 1, min(i + step, n)),
        _ids = id_list[i:i+step]
        iter1 = b1.mget_from_ids(_ids, asiter=True)
        iter2 = b2.mget_from_ids(_ids, asiter=True)
        for doc1, doc2 in zip(iter1, iter2):
            yield doc1, doc2
        print 'Done.[%.1f%%,%s]' % (i*100./n, timesofar(t1))
    print "="*20
    print 'Finished.[total time: %s]' % timesofar(t0)
Example 14
def load_source(collection_name,
                src_module=None,
                src_data=None,
                inbatch=True,
                new_collection=True):
    '''Save src data into a MongoDB collection.
       If src_module is provided, src_data = src_module.load_data();
       otherwise, src_data is used directly and should be an iterable.
       If new_collection is True, the target collection must be empty.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if src_data:
        doc_list = []
        for doc in src_data:
            cnt += 1
            if not inbatch:
                target_coll.insert(doc,
                                   manipulate=False,
                                   check_keys=False,
                                   w=0)
            else:
                doc_list.append(doc)
                if len(doc_list) == 100:
                    target_coll.insert(doc_list,
                                       manipulate=False,
                                       check_keys=False,
                                       w=0)
                    doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        if doc_list:
            target_coll.insert(doc_list,
                               manipulate=False,
                               check_keys=False,
                               w=0)

        print("successfully loaded %s into mongodb" % collection_name)
        print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
    else:
        print("Error: no src data to load.")
Example 15
    def apply_changes(self, changes, verify=True, noconfirm=False):
        if verify:
            self.pre_verify_changes(changes)

        if not (noconfirm or ask('\nContinue to apply changes?') == 'Y'):
            print("Aborted.")
            return -1
        #src = self.get_source_collection(changes)
        step = self.step
        _db = get_target_db()
        source_col = _db[changes['source']]
        src = GeneDocMongoDBBackend(source_col)
        target = GeneDocESBackend(self)
        _timestamp = changes['timestamp']

        def _add_docs(ids):
            i = 0
            for _ids in iter_n(ids, step):
                t1 = time.time()
                _doc_li = src.mget_from_ids(_ids)
                for _doc in _doc_li:
                    _doc['_timestamp'] = _timestamp
                    i += 1
                target.insert(_doc_li)
                print('\t{}\t{}'.format(i, timesofar(t1)))

        t0 = time.time()
        if changes['add']:
            print("Adding {} new docs...".format(len(changes['add'])))
            t00 = time.time()
            _add_docs(changes['add'])
            print("done. [{}]".format(timesofar(t00)))
        if changes['delete']:
            print("Deleting {} discontinued docs...".format(len(changes['delete'])), end='')
            t00 = time.time()
            target.remove_from_ids(changes['delete'], step=step)
            print("done. [{}]".format(timesofar(t00)))
        if changes['update']:
            print("Updating {} existing docs...".format(len(changes['update'])))
            t00 = time.time()
            ids = [x['_id'] for x in changes['update']]
            _add_docs(ids)
            print("done. [{}]".format(timesofar(t00)))

        target.finalize()

        print("\n")
        print("Finished.", timesofar(t0))
Example 16
    def merge(self, step=100000, restart_at=0):
        t0 = time.time()
        self.validate_src_collections()
        self.log_building_start()
        try:
            if self.using_ipython_cluster:
                self._merge_ipython_cluster(step=step)
            else:
                self._merge_local(step=step, restart_at=restart_at)

            if self.target.name == 'es':
                print "Updating metadata...",
                self.update_mapping_meta()

            t1 = round(time.time() - t0, 0)
            t = timesofar(t0)
            self.log_src_build({'status': 'success',
                                'time': t,
                                'time_in_s': t1,
                                'timestamp': datetime.now()})

        finally:
            #do a simple validation here
            if getattr(self, '_stats', None):
                print "Validating..."
                target_cnt = self.target.count()
                if target_cnt == self._stats['total_genes']:
                    print "OK [total count={}]".format(target_cnt)
                else:
                    print "Warning: total count of gene documents does not match [{}, should be {}]".format(target_cnt, self._stats['total_genes'])

            if self.merge_logging:
                sys.stdout.close()
Example 17
def load_x(idx, fieldname, cvt_fn=None):
    print('DATA_FOLDER: ' + DATA_FOLDER)
    DATAFILE = os.path.join(DATA_FOLDER, 'idmapping_selected.tab.gz')
    load_start(DATAFILE)
    t0 = time.time()
    xli = []
    for ld in tabfile_feeder(DATAFILE, header=1):
        ld = listitems(ld, *(2,19,idx))    # GeneID Ensembl(Gene) target_value
        for value in dupline_seperator(dupline=ld,
                                       dup_sep='; '):
            xli.append(value)

    ensembl2geneid = list2dict(list_nondup([(x[1], x[0]) for x in xli if x[0]!='' and x[1]!='']), 0, alwayslist=True)
    xli2 = []
    for entrez_id, ensembl_id, x_value in xli:
        if x_value:
            if cvt_fn:
                x_value = cvt_fn(x_value)
            if entrez_id:
                xli2.append((entrez_id, x_value))
            elif ensembl_id:
                entrez_id = ensembl2geneid.get(ensembl_id, None)
                if entrez_id:
                    for _eid in entrez_id:
                        xli2.append((_eid, x_value))
                else:
                    xli2.append((ensembl_id, x_value))

    gene2x = list2dict(list_nondup(xli2), 0)
    fn = lambda value: {fieldname: sorted(value) if type(value) is types.ListType else value}
    gene2x = value_convert(gene2x, fn, traverse_list=False)
    load_done('[%d, %s]' % (len(gene2x), timesofar(t0)))

    return gene2x
Example 18
def run2():
    from databuild.esbuilder import ESIndexerBase
    esb = ESIndexerBase()
    doc_d = build(sources)
    t0 = time.time()
    esb.build_index(doc_d)
    print 'Done[%s]' % timesofar(t0)
Example 19
def run_jobs_on_ipythoncluster(worker, task_list, shutdown_ipengines_after_done=False):

    t0 = time.time()
    rc = Client(CLUSTER_CLIENT_JSON)
    lview = rc.load_balanced_view()
    print "\t# nodes in use: {}".format(len(lview.targets or rc.ids))
    lview.block = False

    print "\t# of tasks: {}".format(len(task_list))
    print "\tsubmitting...",
    job = lview.map_async(worker, task_list)
    print "done."
    try:
        job.wait_interactive()
    except KeyboardInterrupt:
        #handle "Ctrl-C"
        if ask("\nAbort all submitted jobs?") == 'Y':
            lview.abort()
            print "Aborted, all submitted jobs are cancelled."
        else:
            print "Aborted, but your jobs are still running on the cluster."
        return

    if len(job.result) != len(task_list):
        print "WARNING:\t# of results returned ({}) != # of tasks ({}).".format(len(job.result), len(task_list))
    print "\ttotal time: {}".format(timesofar(t0))

    if shutdown_ipengines_after_done:
        print "\tshuting down all ipengine nodes...",
        lview.shutdown()
        print 'Done.'
    return job.result
Example 20
def main():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit()

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'entrez_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)
    sys.stderr = sys.stdout

    #mark the download starts
    src_dump = get_src_dump()
    doc = {'_id': 'entrez',
           'timestamp': timestamp,
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()
    try:
        download(DATA_FOLDER, no_confirm=no_confirm)
        t_download = timesofar(t0)
        t1 = time.time()
        #mark parsing starts
        src_dump.update({'_id': 'entrez'}, {'$set': {'status': 'parsing'}})
        parse_gbff(DATA_FOLDER)
        t_parsing = timesofar(t1)
        t_total = timesofar(t0)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': {
            'download': t_download,
            'parsing': t_parsing,
            'total': t_total
        },
        'pending_to_upload': True    # a flag to trigger data uploading
    }

    src_dump.update({'_id': 'entrez'}, {'$set': _updates})
Example 21
    def load(self, genedoc_d=None, update_data=True, update_master=True, test=False, step=10000):
        if not self.temp_collection:
            self.make_temp_collection()

        self.temp_collection.drop()       # drop all existing records just in case.

        if update_data:
            genedoc_d = genedoc_d or self.load_genedoc()

            print("Uploading to the DB...", end='')
            t0 = time.time()
            # for doc in self.doc_iterator(genedoc_d, batch=False):
            #     if not test:
            #         doc.save()
            for doc_li in self.doc_iterator(genedoc_d, batch=True, step=step):
                if not test:
                    self.temp_collection.insert(doc_li, manipulate=False, check_keys=False)
            print('Done[%s]' % timesofar(t0))
            self.switch_collection()

            if getattr(self, 'ENTREZ_GENEDOC_ROOT', False):
                print('Uploading "geneid_d" to GridFS...', end='')
                t0 = time.time()
                geneid_d = self.get_geneid_d()
                dump2gridfs(geneid_d, self.__collection__ + '__geneid_d.pyobj', self.db)
                print('Done[%s]' % timesofar(t0))
            if getattr(self, 'ENSEMBL_GENEDOC_ROOT', False):
                print('Uploading "mapping2entrezgene" to GridFS...', end='')
                t0 = time.time()
                x2entrezgene_list = self.get_mapping_to_entrez()
                dump2gridfs(x2entrezgene_list, self.__collection__ + '__2entrezgene_list.pyobj', self.db)
                print('Done[%s]' % timesofar(t0))

        if update_master:
            # update src_master collection
            if not test:
                _doc = {"_id": unicode(self.__collection__),
                        "name": unicode(self.__collection__),
                        "timestamp": datetime.datetime.now()}
                for attr in ['ENTREZ_GENEDOC_ROOT', 'ENSEMBL_GENEDOC_ROOT', 'id_type']:
                    if hasattr(self, attr):
                        _doc[attr] = getattr(self, attr)
                if hasattr(self, 'get_mapping'):
                    _doc['mapping'] = getattr(self, 'get_mapping')()

                conn.GeneDocSourceMaster(_doc).save()
Example 22
def update_index(changes, sync_src, sync_target, noconfirm=False):
    # changes['_add'] = changes['delete']
    # changes['_delete'] = changes['add']
    # changes['delete'] = changes['_delete']
    # changes['add'] = changes['_add']
    # del changes['_add']
    # del changes['_delete']

    print "\t{}\trecords will be added.".format(len(changes['add']))
    print "\t{}\trecords will be deleted.".format(len(changes['delete']))
    print "\t{}\trecords will be updated.".format(len(changes['update']))

    print
    print '\tsync_src:\t{:<45}{}'.format(sync_src.target_collection.name,
                                             sync_src.name)
    print '\tsync_target\t{:<45}{}'.format(sync_target.target_esidxer.ES_INDEX_NAME,
                                               sync_target.name)

    if noconfirm or ask("Continue?")=='Y':
        t00 = time.time()
        es_idxer = sync_target.target_esidxer

        if len(changes['add']) > 0:
            print "Adding {} new records...".format(len(changes['add']))
            t0 = time.time()
            _q = {'_id': {'$in': changes['add']}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['delete']) > 0:
            print "Deleting {} old records...".format(len(changes['delete']))
            t0 = time.time()
            es_idxer.delete_docs(changes['delete'])
            print "Done. [{}]".format(timesofar(t0))

        if len(changes['update']) > 0:
            print "Updating {} existing records...".format(len(changes['update']))
            t0 = time.time()
            ids = [d['_id'] for d in changes['update']]
            _q = {'_id': {'$in': ids}}
            for docs in doc_feeder(sync_src.target_collection, step=1000, inbatch=True, query=_q):
                es_idxer.add_docs(docs)
            print "Done. [{}]".format(timesofar(t0))
        print '='*20
        print 'Finished. [{}]'.format(timesofar(t00))
Example 23
    def _db_upload(self, doc_li, step=10000, verbose=True):
        import time
        from utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        output = []
        t0 = time.time()
        for i in range(0, len(doc_li), step):
            output.extend(self.target_db.update(doc_li[i:i+step]))
            if verbose:
                print('\t%d-%d Done [%s]...' % (i+1, min(i+step, len(doc_li)), timesofar(t0)))

        res = list2dict(list_itemcnt([x[0] for x in output]), 0)
        print("Done![%s, %d OK, %d Error]" % (timesofar(t0), res.get(True, 0), res.get(False, 0)))
        res = listsort(list_itemcnt([x[2].args[0] for x in output if x[0] is False]), 1, reverse=True)
        print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
        if len(res) > 10:
            print("\t%d lines omitted..." % (len(res)-10))
Example 24
    def _db_upload(self, doc_li, step=10000, verbose=True):
        import time
        from utils.common import timesofar
        from utils.dataload import list2dict, list_itemcnt, listsort

        output = []
        t0 = time.time()
        for i in range(0, len(doc_li), step):
            output.extend(self.target_db.update(doc_li[i:i + step]))
            if verbose:
                print('\t%d-%d Done [%s]...' % (i + 1, min(i + step, len(doc_li)), timesofar(t0)))

        res = list2dict(list_itemcnt([x[0] for x in output]), 0)
        print("Done![%s, %d OK, %d Error]" % (timesofar(t0), res.get(True, 0), res.get(False, 0)))
        res = listsort(list_itemcnt([x[2].args[0] for x in output if x[0] is False]), 1, reverse=True)
        print('\n'.join(['\t%s\t%d' % x for x in res[:10]]))
        if len(res) > 10:
            print("\t%d lines omitted..." % (len(res) - 10))
Example 25
def main(daemon=False):
    running_processes = {}
    while 1:
        src_to_update_li = check_mongo()
        if src_to_update_li:
            print '\nDispatcher:  found pending jobs ', src_to_update_li
            for src_to_update in src_to_update_li:
                if src_to_update not in running_processes:
                    mark_upload_started(src_to_update)
                    p = dispatch(src_to_update)
                    src_dump.update({'_id': src_to_update}, {"$set": {"upload.pid": p.pid}})
                    p.t0 = time.time()
                    running_processes[src_to_update] = p

        jobs_finished = []
        if running_processes:
            print 'Dispatcher:  {} active job(s)'.format(len(running_processes))
            print get_process_info(running_processes)

        for src in running_processes:
            p = running_processes[src]
            returncode = p.poll()
            if returncode is not None:
                t1 = round(time.time()-p.t0, 0)
                d = {
                     'upload.returncode': returncode,
                     'upload.timestamp': datetime.now(),
                     'upload.time_in_s': t1,
                     'upload.time': timesofar(p.t0),
                     'upload.logfile': p.logfile,
                     }
                if returncode == 0:
                    print 'Dispatcher:  {} finished successfully with code {} (time: {}s)'.format(src, returncode, t1)
                    d['upload.status'] = "success"
                else:
                    print 'Dispatcher:  {} failed with code {} (time: {}s)'.format(src, returncode, t1)
                    d['upload.status'] = "failed"

                mark_upload_done(src, d)
                jobs_finished.append(src)
                p.log_f.close()
            else:
                p.log_f.flush()
        for src in jobs_finished:
            del running_processes[src]

        if running_processes:
            time.sleep(10)
        else:
            if daemon:
                #continue monitor src_dump collection
                print '\b'*50,
                for i in range(100):
                    print '\b'*2+[unichr(8212), '\\', '|', '/'][i%4],
                    time.sleep(0.1)
            else:
                break
Example 26
 def _add_docs(ids):
     i = 0
     for _ids in iter_n(ids, step):
         t1 = time.time()
         _doc_li = src.mget_from_ids(_ids)
         for _doc in _doc_li:
             _doc['_timestamp'] = _timestamp
             i += 1
         target.insert(_doc_li)
         print('\t{}\t{}'.format(i, timesofar(t1)))
Example 27
    def doc_feeder(self, step=1000, s=None, e=None, inbatch=False, query=None, **kwargs):
        '''An iterator for returning docs in an ES index with batch query.
           An additional filter query can be passed via "query", e.g.,
           doc_feeder(query='taxid:9606')
           other parameters can be passed via "**kwargs":
                fields, from, size, etc.
        '''
        if query:
            q = StringQuery(query)
        else:
            q = MatchAllQuery()
        raw_res = None

        cnt = 0
        t0 = time.time()
        while 1:
            t1 = time.time()
            if raw_res is None:
                raw_res = self.conn.search_raw(q, self._index, self._doc_type,
                      start=s, size=step, scan=True, scroll='5m', **kwargs)
                n = raw_res['hits']['total']
                print 'Retrieving %d documents from index "%s/%s".' % (n, self._index, self._doc_type)
            else:
                raw_res = self.conn.search_scroll(raw_res._scroll_id, scroll='5m')
            hits_cnt = len(raw_res['hits']['hits'])
            if hits_cnt == 0:
                break
            else:

                print "Processing %d-%d documents..." % (cnt+1, cnt+hits_cnt) ,
                res = self._cleaned_res(raw_res)
                if inbatch:
                    yield res
                else:
                    for hit in res:
                        yield hit
                cnt += hits_cnt
                print 'Done.[%.1f%%,%s]' % (cnt*100./n, timesofar(t1))
                if e and cnt > e:
                    break

        print "="*20
        print 'Finished.[total docs: %s, total time: %s]' % (cnt, timesofar(t0))
Example 28
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    src_dump = get_src_dump()
    print("Checking latest mart_version:\t", end=' ')
    mart_version = chk_latest_mart_version()
    print(mart_version)

    doc = src_dump.find_one({'_id': 'ensembl'})
    if doc and 'release' in doc and mart_version <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'gene_ensembl__gene__main.txt')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(ENSEMBL_FOLDER, str(mart_version))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'ensembl_mart_%s.log' % mart_version), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'ensembl',
           'release': mart_version,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        BM = BioMart()
        BM.species_li = get_all_species(mart_version)
        BM.get_gene__main(os.path.join(DATA_FOLDER, 'gene_ensembl__gene__main.txt'))
        BM.get_translation__main(os.path.join(DATA_FOLDER, 'gene_ensembl__translation__main.txt'))
        BM.get_xref_entrezgene(os.path.join(DATA_FOLDER, 'gene_ensembl__xref_entrezgene__dm.txt'))

        BM.get_profile(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_profile__dm.txt'))
        BM.get_interpro(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_interpro__dm.txt'))
        BM.get_pfam(os.path.join(DATA_FOLDER, 'gene_ensembl__prot_pfam__dm.txt'))
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'ensembl'}, {'$set': _updates})
Example 29
    def doc_feeder(self, step=10000, verbose=True, query=None, scroll='10m', **kwargs):
        q = query if query else {'query': {'match_all': {}}}
        _q_cnt = self.count(q=q, raw=True)
        n = _q_cnt['count']
        n_shards = _q_cnt['_shards']['total']
        assert n_shards == _q_cnt['_shards']['successful']
        _size = int(step / n_shards)
        assert _size * n_shards == step
        cnt = 0
        t0 = time.time()
        if verbose:
            print('\ttotal docs: {}'.format(n))
            t1 = time.time()

        res = self._es.search(self._index, self._doc_type, body=q,
                              size=_size, search_type='scan', scroll=scroll, **kwargs)
        # double check initial scroll request returns no hits
        assert len(res['hits']['hits']) == 0

        while 1:
            if verbose:
                t1 = time.time()
                if cnt < n:
                    print('\t{}-{}...'.format(cnt+1, min(cnt+step, n)), end='')
            res = self._es.scroll(res['_scroll_id'], scroll=scroll)
            if len(res['hits']['hits']) == 0:
                break
            else:
                for doc in res['hits']['hits']:
                    _doc = doc.get('_source', {})
                    # "_id" field is not stored by default
                    # so it may not be returned in _source
                    _doc.setdefault("_id", doc["_id"])
                    yield _doc
                    cnt += 1
                if verbose:
                    print('done.[%.1f%%,%s]' % (min(cnt, n)*100./n, timesofar(t1)))

        if verbose:
            print("Finished! [{}]".format(timesofar(t0)))

        assert cnt == n, "Error: scroll query terminated early [{}, {}], please retry.\nLast response:\n{}".format(cnt, n, res)
Example 30
def build(sources, batch=True):
    entrez_root = ENTREZ_ROOT in sources
    ensembl_root = ENSEMBL_ROOT in sources
    print "Preparing root nodes...",
    t0 = time.time()
    if entrez_root and ensembl_root:
        root_nodes = merge_root_nodes()
    elif entrez_root:
        root_nodes = list(get_src(ENTREZ_ROOT).find())
    elif ensembl_root:
        root_nodes = list(get_src(ENSEMBL_ROOT).find())
    else:
        raise ValueError, "You need at least one source with root nodes."
    print 'Done[%s, %s]' % (len(root_nodes), timesofar(t0))

    print "Merging other sources with root nodes...",
    t0 = time.time()
    _sources = copy.copy(sources)
    if entrez_root:
        _sources.remove(ENTREZ_ROOT)
    if ensembl_root:
        _sources.remove(ENSEMBL_ROOT)
    src_collections = [get_src(src) for src in _sources]
    out_d = {}
    if not batch:
        for _id in root_nodes:
            vli = [root_nodes[_id]]
            for sc in src_collections:
                v = sc.get_from_id(_id)
                if v:
                    vli.append(v)
            v_merged = dict_attrmerge(vli)
            out_d[_id] = v_merged
    else:
        for doc_d in _doc_feeder(src_collections, root_nodes.keys(), step=10000, asdict=True):
            _id, vli = doc_d.items()[0]
            vli = [root_nodes[_id]] + [v for v in vli if v]
            v_merged = dict_attrmerge(vli)
            out_d[_id] = v_merged

    print 'Done[%s, %s]' % (len(out_d), timesofar(t0))
    return out_d
Example 31
def load_source(collection_name, src_module=None, src_data=None, inbatch=True, new_collection=True, step=1000):
    '''Save src data into a MongoDB collection.
       If src_module is provided, src_data = src_module.load_data();
       otherwise, src_data is used directly and should be an iterable.
       If new_collection is True, the target collection must be empty.
    '''
    src_db = get_src_db()
    target_coll = src_db[collection_name]
    if new_collection and target_coll.count() > 0:
        print("Error: target collection {} exists.".format(collection_name))
        return

    t0 = time.time()
    cnt = 0
    if src_module:
        src_data = src_module.load_data()
    if src_data:
        doc_list = []
        for doc in src_data:
            cnt += 1
            if not inbatch:
                try:
                    target_coll.insert_one(doc)
                except:
                    print('One duplicate id exists, id is {}'.format(doc['_id']))
                    continue
            else:
                doc_list.append(doc)
                if len(doc_list) == step:
                    target_coll.insert_many(doc_list)
                    doc_list = []
            if cnt % 100000 == 0:
                print(cnt, timesofar(t0))
        if doc_list:
            target_coll.insert_many(doc_list)

        print("successfully loaded %s into mongodb" % collection_name)
        print("total docs: {}; total time: {}".format(cnt, timesofar(t0)))
    else:
        print("Error: no src data to load.")
Example 32
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv

    t0 = time.time()
    bdr = DataBuilder(backend='mongodb')
    bdr.load_build_config(config)
    bdr.using_ipython_cluster = use_parallel
    bdr.merge()
    print "Finished.", timesofar(t0)
Example 33
def load_ucsc_exons():
    print('DATA_FOLDER: ' + DATA_FOLDER)
    species_li = os.listdir(DATA_FOLDER)
    print "Found {} species folders.".format(len(species_li))
    t0 = time.time()
    gene2exons = {}
    for species in species_li:
        print species, '...'
        gene2exons.update(load_exons_for_species(species))

    load_done('[%d, %s]' % (len(gene2exons), timesofar(t0)))

    return gene2exons
Example 34
def main():
    if len(sys.argv) > 1:
        config = sys.argv[1]
    else:
        config = 'mygene_allspecies'
    use_parallel = '-p' in sys.argv
    noconfirm = '-b' in sys.argv
    if config == 'clean':
        clean_target_collection()
    else:
        t0 = time.time()
        build_index(config, use_parallel=use_parallel, noconfirm=noconfirm)
        print "Finished.", timesofar(t0)
Example 35
def upload(docs, collection):
	'''do the actual upload docs to the db.'''
	print 'Uploading to DB...',
	t0 = time.time()
	if  type(docs) is types.DictType:
		doc_li = docs.values()
	else:
		doc_li = docs

	db = get_db()   #database for merged data
	coll = db[collection]
	for i in range(0, len(doc_li), 10000):
		coll.insert(doc_li[i:i+10000])
	print 'Done[%s]' % timesofar(t0)
Example 36
def parse_vcf(vcf_infile,
              compressed=True,
              verbose=True,
              by_id=True,
              **tabix_params):
    t0 = time.time()
    compressed = vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)
    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(rec)
        if by_id:
            # one hgvs id, one doc
            if doc['_id']:
                if isinstance(doc['_id'], list):
                    for i, _id in enumerate(doc['_id']):
                        _doc = copy.copy(doc)
                        _doc['alt'] = doc['alt'][i]
                        _doc[POS_KEY] = doc[POS_KEY][i]
                        _doc['_id'] = _id
                        yield _doc
                        cnt_2 += 1
                        if verbose:
                            print(_doc['rsid'], '\t', _doc['_id'])

                else:
                    yield doc
                    cnt_2 += 1
                    if verbose:
                        print(doc['rsid'], '\t', doc['_id'])
            else:
                cnt_3 += 1
        else:
            # one rsid, one doc
            if doc['_id']:
                yield doc
                cnt_2 += 1
                if verbose:
                    print(doc['rsid'], '\t', doc['_id'])
            else:
                cnt_3 += 1
        cnt_1 += 1
    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
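
Since `parse_vcf` is a generator yielding one document per variant id, a hedged loader that batches its output into MongoDB could look like the sketch below; the collection handle and batch size are illustrative, and insert_many assumes a pymongo 3+ client rather than the older insert() calls seen elsewhere in these examples:

def load_vcf(vcf_infile, collection, batch_size=1000):
    # Illustrative only: batch-insert parse_vcf output into a pymongo collection.
    batch = []
    for doc in parse_vcf(vcf_infile, verbose=False, by_id=True):
        batch.append(doc)
        if len(batch) >= batch_size:
            collection.insert_many(batch)
            batch = []
    if batch:
        collection.insert_many(batch)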
Example 37
def doc_feeder0(collection, step=1000, s=None, e=None, inbatch=False):
    '''An iterator for returning docs in a collection, with batch query.'''
    n = collection.count()
    s = s or 1
    e = e or n
    print 'Found %d documents in database "%s".' % (n, collection.name)
    for i in range(s - 1, e + 1, step):
        print "Processing %d-%d documents..." % (i + 1, i + step),
        t0 = time.time()
        res = collection.find(skip=i, limit=step, timeout=False)
        if inbatch:
            yield res
        else:
            for doc in res:
                yield doc
        print 'Done.[%s]' % timesofar(t0)
Example 38
    def handle_genedoc_merged(self, **kwargs):
        for config in ('mygene', 'mygene_allspecies'):
            t0 = time.time()
            p = Popen(['python', '-m', 'databuild.sync', config, '-p', '-b'], cwd=src_path)
            returncode = p.wait()
            t = timesofar(t0)
            if returncode == 0:
                msg = 'Dispatcher:  "{}" syncer finished successfully with code {} (time: {})'.format(config, returncode, t)
            else:
                msg = 'Dispatcher:  "{}" syncer failed with code {} (time: {})'.format(config, returncode, t)
            print(msg)
            if hipchat_msg:
                msg += '<a href="http://su07:8000/log/sync/{}">sync log</a>'.format(config)
                hipchat_msg(msg, message_format='html')

            assert returncode == 0, "Subprocess failed. Check error above."
Example 39
def main():
    no_confirm = True  # set it to True for running this script automatically without intervention.
    src_dump = get_src_dump()
    (file_name, release) = get_newest_release()
    doc = src_dump.find_one({'_id': 'clinvar'})
    if new_release_available(doc['release']):
        data_file = os.path.join(doc['data_folder'], file_name)
        if os.path.exists(data_file):
            print("No newer file found. Abort now.")
            return

        if not os.path.exists(DATA_FOLDER):
            os.makedirs(DATA_FOLDER)
        else:
            if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0
                    or ask('DATA_FOLDER (%s) is not empty. Continue?' %
                           DATA_FOLDER) == 'Y'):
                return

        log_f, logfile = safewfile(os.path.join(DATA_FOLDER,
                                                'clinvar_dump.log'),
                                   prompt=(not no_confirm),
                                   default='O')
        sys.stdout = LogPrint(log_f, timestamp=True)

        # mark the download starts
        doc = {
            '_id': 'clinvar',
            'timestamp': timestamp,
            'data_folder': DATA_FOLDER,
            'release': release,
            'logfile': logfile,
            'status': 'downloading'
        }
        src_dump.save(doc)
        t0 = time.time()
        try:
            download_ftp_file(no_confirm)
        finally:
            sys.stdout.close()
        # mark the download finished successfully
        _updates = {
            'status': 'success',
            'time': timesofar(t0),
            'pending_to_upload': True  # a flag to trigger data uploading
        }
        src_dump.update({'_id': 'clinvar'}, {'$set': _updates})
Example 40
def main_cron():
    no_confirm = True   # set it to True for running this script automatically without intervention.

    print("Checking latest refseq release:\t", end='')
    refseq_release = get_refseq_release()
    print(refseq_release)

    src_dump = get_src_dump()
    doc = src_dump.find_one({'_id': 'refseq'})
    if doc and 'release' in doc and refseq_release <= doc['release']:
        data_file = os.path.join(doc['data_folder'], 'complete.109.rna.gbff.gz')
        if os.path.exists(data_file):
            print("No newer release found. Abort now.")
            sys.exit(0)

    DATA_FOLDER = os.path.join(REFSEQ_FOLDER, str(refseq_release))
    if not os.path.exists(DATA_FOLDER):
        os.makedirs(DATA_FOLDER)
    else:
        if not (no_confirm or len(os.listdir(DATA_FOLDER)) == 0 or ask('DATA_FOLDER (%s) is not empty. Continue?' % DATA_FOLDER) == 'Y'):
            sys.exit(0)

    log_f, logfile = safewfile(os.path.join(DATA_FOLDER, 'refseq_dump.log'), prompt=(not no_confirm), default='O')
    sys.stdout = LogPrint(log_f, timestamp=True)

    #mark the download starts
    doc = {'_id': 'refseq',
           'release': refseq_release,
           'timestamp': time.strftime('%Y%m%d'),
           'data_folder': DATA_FOLDER,
           'logfile': logfile,
           'status': 'downloading'}
    src_dump.save(doc)
    t0 = time.time()

    try:
        download(DATA_FOLDER, refseq_release, no_confirm=no_confirm)
    finally:
        sys.stdout.close()

    #mark the download finished successfully
    _updates = {
        'status': 'success',
        'time': timesofar(t0),
        'pending_to_upload': True    # a flag to trigger data uploading
    }
    src_dump.update({'_id': 'refseq'}, {'$set': _updates})
Example 41
def load_collection(database, input_file_list, collection_name):
    """
    :param database: MongoDB connection URL
    :param input_file_list: list of file paths containing variant docs
    :param collection_name: annotation source name
    """
    conn = pymongo.MongoClient(database)
    db = conn.variantdoc
    posts = db[collection_name]
    t1 = time.time()
    cnt = 0
    input_file_list = getFileList()
    for doc in load_data(input_file_list):
        posts.insert(doc, manipulate=False, check_keys=False, w=0)
        cnt += 1
        if cnt % 100000 == 0:
            print cnt, timesofar(t1)
    print "successfully loaded %s into mongodb" % collection_name
Example 42
def sync_from_one_diff(index, collection, diff_filepath, validate=False, wait=60, dryrun=False, returncnt=False, save2file=None):
    sync = ESSyncer(index=index)
    #sync._index = index
    #sync._esi._index = index
    diff = loadobj(diff_filepath)
    source_collection = diff['source']
    add_iter = sync.add(source_collection, diff['add'])
    delete_iter = sync.delete(collection, diff['delete'])
    update_iter = sync.update2(diff['update'], collection, source_collection)
    t00 = time()
    if save2file:
        from itertools import chain
        import json
        for op in chain(add_iter, delete_iter, update_iter):
            json.dump(op, save2file)
        print("="*20)
        print("Finished! [{}]".format(timesofar(t00)))
        return

    print('Adding new {} docs...'.format(len(diff['add'])))
    t0 = time()
    if not dryrun:
        try:
            bulk(sync._es, add_iter)
        except:
            pass
    print("Done. [{}]".format(timesofar(t0)))

    print('Deleting {} docs'.format(len(diff['delete'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, delete_iter)
    print("Done. [{}]".format(timesofar(t0)))

    print('Updating {} docs'.format(len(diff['update'])))
    t0 = time()
    if not dryrun:
        bulk(sync._es, update_iter)
    print("Done. [{}]".format(timesofar(t0)))

    # add flush and refresh
    try:
        res = sync._es.indices.flush()
        print("Flushing...", res)
        res = sync._es.indices.refresh()
        print("Refreshing...", res)
    except:
        pass

    print("="*20)
    print("Finished! [{}]".format(timesofar(t00)))

    if returncnt:
        cnt = {
            'add': len(diff['add']),
            'delete': len(diff['delete']),
            'update': len(diff['update'])
        }
        return cnt

    if validate:
        print('Waiting {}s to let ES finish...'.format(wait), end="")
        sleep(wait)
        print("Done.")
        print("Validating...")
        t0 = time()
        q = {
            "query": {
                "constant_score": {
                    "filter": {
                        "exists": {
                            "field": 'clinvar'
                        }
                    }
                }
            }
        }
        data = sync._esi.doc_feeder(query=q, _source=collection)
        temp_collection = collection + '_temp_' + get_random_string()
        sync._src[temp_collection].drop()
        load_source(temp_collection, src_data=data)
        c1 = get_backend(source_collection, 'mongodb')
        c2 = get_backend(temp_collection, 'mongodb')
        diff_result = diff_collections(c1, c2, use_parallel=False)
        sync._src[temp_collection].drop()
        print("Done. [{}]".format(t0))
        return diff_result