def process_wrapper(in_queue, out_queue, plock, disease_file, ndo=0): dx = bd.BeeStringDict(marketscan+'DX', keysize = 9, readonly=True) idx = bd.BeeStringDict(marketscan+'indices', keysize = 51, readonly=True) code2dx = pickle.load(open(disease_file)) for (county, index, tot_cty) in iter( in_queue.get, None ): if index % 50000 == 0: plock.acquire() print 'AT {:1.2f} %\n'.format(index/float(tot_cty)*100) sys.stdout.flush() plock.release() ret = {} ## Do DX for M and F separately for gender in ['M','F']: county_ids = set() try: county_ids = idx[gender + ' ' + county] except KeyError: print 'indices does not have key: F ' + county continue if ndo > 0: county_ids = set(list(county_ids)[:ndo]) ret[gender] = do_county(dx, county_ids, code2dx) if len(ret.keys()) > 0: out_queue.put((county, ret)) out_queue.put(None)
def main(): rx = bd.BeeStringDict('RX', keysize=9, readonly=1) rx_set = bd.BeeStringDict('RXset', keysize=9, readonly=0) count = -1 empty = 0 keys = sorted(rx.keys()) print len(keys) for key in keys: line = rx.get(key, '') if len(line) > 9: subs = line.split('|') codes = [] for sub in subs: code = sub.split(':')[0] if len(code) > 4: codes.append(code) #print codes rx_set[key] = set(codes) rx_set.commit() else: empty += 1 count += 1 if count % 10000 == 0: print rich_string_wrap(num2comma2(count),'y',0,'k',0), \ rich_string_wrap(num2comma2(empty),'r',0,'k',0), ('%4.3f ' % (float(empty+0.5)/float(count+0.5)))
def __init__(self, filename=None, type=None, index_attrs=None, key_sizes=None, from_dump=False):
    """Initialize either from a pickled dump (from_dump=True) or fresh.

    type        -- the record class being indexed (NOTE: shadows the builtin)
    index_attrs -- attribute names to build BeeStringDict index trees for
    key_sizes   -- per-attribute key sizes, parallel to index_attrs
    from_dump   -- load all state from storage/<type>/dumb.p instead
    """
    if from_dump:
        # BUG FIX: `path` was only defined in commented-out code, so this
        # branch raised NameError.  Restore the dump-path construction.
        path = os.getcwd() + '/storage/' + type.__name__ + '/' + 'dumb.p'
        f = open(path, 'rb')
        try:
            c = pickle.load(f)
        finally:
            f.close()
        self.filename = c.filename
        self.name = type.__name__
        self.gd = c.gd
        self.pp = c.pp
        self.type = c.type
        self.counter = c.counter
        self.size = c.size
        self.m = c.m
        self.index_attrs = c.index_attrs
        self.key_sizes = c.key_sizes
        self.trees = {}
        print("Extracting ")
        print("Extracting done ")
        for index_attr, key_size in zip(self.index_attrs, self.key_sizes):
            # BUG FIX: type(type).__name__ is always 'type' (the metaclass
            # name); use the record class's own name, matching self.name.
            tree = BeeDict.BeeStringDict(
                os.getcwd() + '/storage/' + type.__name__ + index_attr,
                keysize=key_size)
            self.trees[index_attr] = tree
    else:
        self.filename = filename
        self.gd = 0
        self.pp = [0]
        self.type = type
        self.name = type.__name__
        self.counter = 0
        self.size = 0
        self.m = {}
        if index_attrs is not None and key_sizes is not None:
            self.index_attrs = index_attrs
            self.key_sizes = key_sizes
            self.trees = {}
            for attr, key_size in zip(index_attrs, key_sizes):
                # Same fix as above: index the record class's name.
                tree = BeeDict.BeeStringDict(
                    os.getcwd() + '/storage/' + type.__name__ + attr,
                    keysize=key_size)
                self.trees[attr] = tree
def process_pat_worker(in_queue, out_queue, plock, process_function, db_list, **kwargs): for db in db_list: kwargs[db] = bd.BeeStringDict(db_list[db][0] + db, keysize=db_list[db][1], readonly=True) #people_dx = bd.BeeStringDict('kanix_matches_F_Breast_Cancer.txt_demo_bd', # keysize = 9, readonly=True) #kwargs['person_dx_info'] = people_dx ### BADDD #rx = bd.BeeStringDict(marketscan+'RX', keysize = 9, readonly=True) #dx = bd.BeeStringDict(marketscan+'DX', keysize = 9, readonly=True) #demo = bd.BeeStringDict(marketscan+'ID_demographics', keysize = 9, readonly=True) #rxset = bd.BeeStringDict(marketscan+'RXSet', keysize = 9, readonly=True) #dx = bd.BeeStringDict(marketscan+'DX', keysize = 9, readonly=True) ## iter = reads until get "None" for (patline, index, tot_pat) in iter(in_queue.get, None): if index % 50000 == 0: plock.acquire() print 'AT {:1.2f} %\n'.format(index / float(tot_pat) * 100) sys.stdout.flush() plock.release() result = process_function(patline, **kwargs) if not result is None: out_queue.put(result) out_queue.put(None) plock.acquire() print 'child: is in q empty? ' + str(in_queue.empty()) plock.release()
def main(): global drug2set drug2set = read_drug_sets() clean = False names = [] for name in sorted(drug2set.keys()): imya = name.replace(" ", "_") names.append(imya) system('clear') from mx.BeeBase import BeeDict as bd rx_set = bd.BeeStringDict('../RXset', keysize=9, readonly=1) all_pats = sorted(rx_set.keys()) ndrugs = len(names) num = 0 for i in range(num_procs): procs.append(multiprocessing.Process(target=worker)) procs[-1].daemon = True procs[-1].start() num = 0 alle = float(len(all_pats)) for item in all_pats: drug_set = rx_set.get(item, set([])) if num < len(all_pats): num += 1 if len(drug_set) > 0: q.put((drug_set, num, item, alle)) else: break if num < ndrugs: q.join() for p in procs: q.put(None) q.join() for p in procs: p.join() print "Finished everything...." print "num active children:", multiprocessing.active_children()
def page_test():
    """Insert a handful of students into an Ipage, persist it to page.txt,
    and copy it into a BeeStringDict tree keyed by name."""
    studs = get_dataset()
    page = Ipage()
    # Truncate any previous page file.
    open('page.txt', 'w').close()
    from mx.BeeBase import BeeDict
    tree_path = os.getcwd() + '/storage/' + student.__name__ + 'name'
    tree = BeeDict.BeeStringDict(tree_path, keysize=256)
    for stud in studs[:10]:
        page.insert(stud)
    page.store('page.txt', 0)
    page.store_to_tree(tree, student, 'name', 'page.txt')
    print(zip(tree.keys(), tree.values()))
    tree.close()
def main(): global drug2set drug2set = read_drug_sets() clean = False names = [] for name in sorted(drug2set.keys()): imya = name.replace(" ", "_") names.append(imya) system('clear') from mx.BeeBase import BeeDict as bd rx_set = bd.BeeStringDict('../RXset', keysize=9, readonly=1) all_pats = sorted(rx_set.keys()) print num2comma2(len(all_pats))
def tree_test():
    """Open the example BeeStringDict (populated by an earlier bulk-load
    pass) and walk a cursor positioned at a known key, printing what it
    finds."""
    from mx.BeeBase import BeeDict
    # A simple file-based string dictionary with fixed-size 26-char keys.
    d = BeeDict.BeeStringDict('storage/BeeStringDict.example', keysize=26)
    martha = d.cursor(key='Martha Morrow')
    print(martha.next())
    print(martha.key, d[martha.key])
from mx.BeeBase import BeeDict as bd import pandas as pd from itertools import chain data = '/Volumes/Macintosh HD-1/Users/Data/' data = '/Users/Data/' px = bd.BeeStringDict(data + 'PX', keysize=9, readonly=True) rx = bd.BeeStringDict(data + 'RXset', keysize=9, readonly=True) dx = bd.BeeStringDict(data + 'DX', keysize=9, readonly=True) demo = bd.BeeStringDict(data + 'ID_demographics', keysize=9, readonly=True) from pat_weeks import * import billing_codes [icd9, cpt] = billing_codes.load_icd9_cpt() import time t0 = time.time() bcpat = open('ab').read().strip().split('\n') ndo = 0 #100000 import numpy as np lens = np.zeros((len(bcpat), 5)) for (i, pat) in enumerate(bcpat): wkinfo = get_pat_weeks(pat, dx, px) wnums = [int(w) for w in wkinfo['d'].keys()] brca_diagnos_wk = [ int(w) for w in get_diagnosis_weeks(wkinfo, ('174', '239.3', '238.3', '233')) ] lens[i, :] = [ len(wkinfo['d']), len(wkinfo['p']), max(wnums) - min(wnums),