def get_preprocessed_patients(sample_size=25, rebuild_cache=False):
    cache_file = '/PHShome/ju601/crt/data/patient_cache.json'

    # Build cache
    if not os.path.isfile(cache_file) or rebuild_cache:
        patients_out = []
        delta_efs_out = []
        patient_nums = range(906)
        for i in patient_nums:
            if i % 100 == 0:
                logger.info(str(i) + '/' + str(patient_nums[-1]))
            patient_data = get_data([i])[0]
            if patient_data is not None:
                ef_delta = get_ef_delta(patient_data)
                if ef_delta is not None:
                    patients_out.append(patient_data['NEW_EMPI'])
                    delta_efs_out.append(ef_delta)
        with open(cache_file, 'w') as cache:
            cache_obj = {
                'patients': patients_out,
                'delta_efs': delta_efs_out
            }
            json.dump(cache_obj, cache)

    # Load from cache
    with open(cache_file, 'r') as f:
        cached = json.load(f)
    n = min(sample_size, len(cached['patients']))
    return cached['patients'][:n], cached['delta_efs'][:n]

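# Illustrative usage sketch (not part of the original source): the first call
# builds the JSON cache at the hard-coded path above; subsequent calls read it
# back, unless rebuild_cache=True forces a rebuild.
patients, delta_efs = get_preprocessed_patients(sample_size=50)
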
def __iter__(self):
    for i in self.patient_list:
        p = get_data([i])[0]
        self.status.write(p['NEW_EMPI'] + '\n')
        for category in categories:
            if category in p:
                for idx, doc in enumerate(p[category]):
                    # tag format: <EMPI>_<category>_<doc index>
                    # (no trailing newline, so tag lookups stay clean)
                    tag = p['NEW_EMPI'] + '_' + category + '_' + str(idx)
                    yield LabeledSentence(words=doc['free_text'].split(),
                                          tags=[tag])

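# Hedged sketch of how an iterator like the one above is typically consumed.
# This assumes an older gensim release where LabeledSentence exists (newer
# versions renamed it TaggedDocument); PatientCorpus is a hypothetical class
# wrapping the __iter__ shown above, and its constructor arguments are
# illustrative.
from gensim.models import Doc2Vec

corpus = PatientCorpus(patient_list, status_file)  # hypothetical wrapper
model = Doc2Vec(corpus, min_count=5, workers=4)
model.save('patient_doc2vec.model')
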
def preprocess(i):
    print "\nPreprocessing Medications - " + str(i)
    p = get_data([i])[0]
    # annotate each medication entry with its RxNorm name and classes
    # (plain iteration; the original enumerate index shadowed the patient
    # index argument i and was never used)
    for m in p['Med']:
        (name, rxclasses) = get_rx_classes(m['Medication'], include_name=True)
        m['RXNORM_NAME'] = name
        m['RXNORM_CLASSES'] = rxclasses
    save(p)

def remove_medication_preprocessing(num_patients):
    for i in range(num_patients):
        print "Removing Medication Preprocessing - " + str(i)
        p = get_data([i])[0]
        for m in p['Med']:
            del m['RXNORM_NAME']
            del m['RXNORM_CLASSES']
        save(p)

def jsonify_text(person_id):
    person = loader.get_data([person_id])[0]
    for key in person.keys():
        if lp.is_note_doc(key):
            for i in range(len(person[key])):
                doc = person[key][i]
                data = lp.parse_note_header(doc, key)
                data['free_text'] = doc
                person[key][i] = data
    with open('./data/patients/FAKE_EMPI_' + str(person_id) + '.json',
              'w') as outfile:
        json.dump(person, outfile)
    print 'JSONIFIED PERSON ' + str(person_id)

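# Hypothetical batch driver for jsonify_text; num_patients is assumed to be
# defined as in the other snippets in this collection.
for pid in range(num_patients):
    jsonify_text(pid)
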
def preprocess_medications(num_patients):
    for i in range(num_patients):
        print "\nPreprocessing Medications - " + str(i) + " - progress: ",
        p = get_data([i])[0]
        # med_idx rather than i, so the patient index above isn't shadowed
        for (med_idx, m) in enumerate(p['Med']):
            if med_idx % 100 == 0:
                print ", " + str(med_idx) + '/' + str(len(p['Med'])),
            (name, rxclasses) = get_rx_classes(m['Medication'],
                                               include_name=True)
            m['RXNORM_NAME'] = name
            m['RXNORM_CLASSES'] = rxclasses
        save(p)

def load_and_write_data(artifact: Artifact, key: str, location: str):
    """Loads data and writes it to the artifact if not already present.

    Parameters
    ----------
    artifact
        The artifact to write to.
    key
        The entity key associated with the data to write.
    location
        The location associated with the data to load and the artifact
        to write to.

    """
    if key in artifact:
        logger.debug(f'Data for {key} already in artifact. Skipping...')
    else:
        logger.debug(f'Loading data for {key} for location {location}.')
        data = loader.get_data(key, location)
        logger.debug(f'Writing data for {key} to artifact.')
        artifact.write(key, data)
    return artifact.load(key)

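# Hypothetical usage sketch: the artifact path, entity key, and location
# string below are illustrative stand-ins, not taken from the original
# source.
art = Artifact('./output/simulation_artifact.hdf')
population_data = load_and_write_data(art, 'population.structure', 'Kenya')
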
def generate_train_test(levels):
    mask, lats, lons = get_mask_data()
    sat_data = get_data(lats.ravel(), lons.ravel(), levels)
    # contrast-normalize each satellite layer (CLAHE)
    sat_layers = [
        exposure.equalize_adapthist(item[0].reshape(lats.shape).T,
                                    clip_limit=0.03)
        for item in sat_data
    ]
    m, n = mask.shape
    image_num = 0
    # slide a fixed-size window over the scene, writing a matching
    # image/label tile pair at each offset
    for x in range(0, m - SLIDING_WINDOW_SIZE, SLIDING_INCREMENT):
        for y in range(0, n - SLIDING_WINDOW_SIZE, SLIDING_INCREMENT):
            prepared_layers = [
                layer[x:x + SLIDING_WINDOW_SIZE, y:y + SLIDING_WINDOW_SIZE]
                for layer in sat_layers
            ]
            prepared_mask = mask[x:x + SLIDING_WINDOW_SIZE,
                                 y:y + SLIDING_WINDOW_SIZE]
            stacked = np.dstack(prepared_layers)
            imsave(os.path.join(IMAGES_PATH, str(image_num) + '.png'),
                   np.flipud(stacked))
            imsave(os.path.join(LABELS_PATH, str(image_num) + '.png'),
                   np.flipud(prepared_mask))
            image_num += 1

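# Worked example of the tiling arithmetic above (numbers are illustrative):
# with a 1024x1024 mask, SLIDING_WINDOW_SIZE = 256 and SLIDING_INCREMENT = 64,
# each axis yields len(range(0, 1024 - 256, 64)) = 12 offsets, so the loops
# write 12 * 12 = 144 image/label pairs.
assert len(range(0, 1024 - 256, 64)) == 12
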
def make_test_file(filepath, env):
    '''
    make test file into X for LSTM model

    Parameters
    ----------
    filepath : string
        filepath of current test data
    env : Env
        Environment of system

    Returns
    -------
    X : 3d matrix (batch_size, timestep(=lookback), 1(=feature))
        matrix of X which can be used for LSTM model
    dataX : matrix (feature * timestep)
        original data X
    Y : matrix, returned only when the system debug flag is 1
        labels built from the label file
    '''
    feature = env.get_config("data", "feature", type="list")
    time_slice = env.get_config("data", "time_slice", type="int")
    is_debug = env.get_config("system", "debug", type="int")

    # load data
    dataX = ld.get_data(filepath, feature)

    # simplify data
    # dataX = pp.mean_simplify(dataX, len(feature), time_slice)

    # make LSTM data
    X = make_LSTM_X(dataX, env)

    if is_debug == 1:
        label_path = env.get_config("path", "label_path")
        labeldata = lb.load_label(label_path)
        label = lb.data_labeling(dataX, filepath, labeldata)
        Y = make_LSTM_Y(label, env)
        return X, dataX, Y
    else:
        return X, dataX

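# Hypothetical call site for make_test_file; the Env construction and file
# path are illustrative, and X is shaped (batch, lookback, n_features) as the
# docstring above describes.
env = Env('./config/system.ini')  # hypothetical config wrapper
X, dataX = make_test_file('./data/test/sample.csv', env)
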
import extractor
import loader
from sklearn.ensemble import RandomForestClassifier
from os.path import join
import os
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.base import clone
from time import time

extractor.extract_features_from_yt_audioset()
extractor.extract_features_from_augmented_audioset()

x, y = loader.get_data()
print(len(x))
print(len(x[0]))
print(len(y))

n_components = 8
print('Using PCA on dataset: keeping %s features' % n_components)
pca = PCA(n_components=n_components)
pca.fit(x, y)
x = pca.transform(x)
print('Explained variance ratio: %s' % pca.explained_variance_ratio_)
print('\n\n')

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3)

fix_params = dict(min_samples_split=3,

dead_patients = []
gender = []
age = []
is_cancer = []
consult_time = []
cancer_time = []
cancer_mgh_time = []
cardio_mgh_time = []
noncancer_time = []
utilization = []
utilization_cancer = []
utilization_mgh_cancer = []
utilization_noncancer = []
utilization_mgh_cardio = []

for i in range(num_patients):
    p = get_data([i])[0]

    # Filter to only dead patients
    if p['Vital_status'] == 'Date of Death reported from SS Death Master File':
        if p['Consult_Date'] not in [None, '']:
            dead_patients.append(p['EMPI'])
            gender.append(p['Gender'])

            # Dates
            dob = datetime.strptime(p['Date_of_Birth'], "%m/%d/%Y")
            # this key retains a trailing '\r' from the raw export's
            # line endings
            dod = datetime.strptime(p['Date_Of_Death\r'], "%m/%d/%Y")
            doc = datetime.strptime(p['Consult_Date'], "%m/%d/%Y")
            age.append((dod - dob).days / 365.0)
            timing = dod - doc
            consult_time.append(timing.days)

            # Diagnoses

def plot_num_docs(patient_range=range(90)):
    rel_dates = dict()
    keyword_counts = dict()
    keywords = ['ef\w+(.+)%', 'ejection fraction:\w*(.+)%', 'ef of (.+)%',
                'ejection fraction of (.+)%', 'ef is (.+)%', 'ef:\w*(.+)%',
                'ejection fraction is (.+)%', 'ef:\w*(.+)%']
    overall_counts = dict()
    for i in patient_range:
        if i % 25 == 0:
            print i
        data = get_data([i])[0]
        rel_dates = get_doc_rel_dates(data, rel_dates, True)
        #keyword_counts = get_doc_keywords(data, keywords, keyword_counts, True)
        #ef_occurances = get_ef_values(data, car_only = True)
        if False and len(ef_occurances) > 2:  # REMOVE FALSE TO SEE PLOTS
            dates, efs = zip(*ef_occurances)
            pl.figure()
            pl.scatter(dates, efs)
            pl.show()

    #for doc in keyword_counts:
    #    s = 0
    #    for key in keyword_counts[doc]:
    #        s += len(keyword_counts[doc][key])
    #
    #    if not doc in overall_counts:
    #        overall_counts[doc] = [s]
    #    else:
    #        overall_counts[doc] += [s]

    # print overall_counts['Car']
    # pl.figure()
    # pl.hist(overall_counts['Car'])
    # pl.show()

    #for keyword in keyword_counts:
    #    print keyword, ": ", str(sum(keyword_counts[keyword]))

    #for doc in keyword_counts:
    #    print doc
    #    for keyword in keyword_counts[doc]:
    #        print "\t", keyword, ": ", str(sum(keyword_counts[doc][keyword]))

    note_deltas = []
    struct_deltas = []
    for doc_type in rel_dates:
        if is_note_doc(doc_type):
            note_deltas += [x.days for x in rel_dates[doc_type]]
        else:
            struct_deltas += [x.days for x in rel_dates[doc_type]]
    # no-op while keyword_counts stays empty (its population above is
    # commented out); doc_type here is whatever the loop above left behind
    for word in keyword_counts:
        keyword_counts[word] = [x.days for x in rel_dates[doc_type]]

    bins = 100
    print
    print "Notes: ", len(note_deltas)
    print "Structs: ", len(struct_deltas)
    pl.figure()
    h = pl.hist([note_deltas, struct_deltas], bins, stacked=True,
                color=['blue', 'red'],
                label=['Number of sentences in\nunstructured notes',
                       'Number of structured entries'])
    pl.legend(loc=2)
    pl.title("Frequency of Occurrences of New Data in Patient")
    pl.xlabel("Days Since Implant Procedure")
    pl.ylabel("Number of Pieces of Information")
    pl.show()

    for word in keyword_counts:
        pl.figure()
        pl.hist(keyword_counts[word], bins, color=['blue'])
        pl.title("Occurrences of " + word + " in corpus at time from procedure")
        pl.show()

def get_dataset():
    # get training data and test data from MNIST dataset
    (train_x_from_dataset, train_y_from_dataset,
     test_x_from_dataset, test_y_from_dataset) = mnist.get_data()

    # preprocess for training data
    # get data with label 6 and 9
    index_6 = np.where(train_y_from_dataset == 6)
    index_9 = np.where(train_y_from_dataset == 9)
    # shuffle data
    index = np.concatenate([index_6[0], index_9[0]])
    np.random.seed(1)
    np.random.shuffle(index)
    # get data that we want (data with label 6 and 9)
    train_y = train_y_from_dataset[index]
    train_x = train_x_from_dataset[index]
    # if data's label == 6 set 0, and if data's label == 9 set 1
    train_y[np.where(train_y == 6)] = 0
    train_y[np.where(train_y == 9)] = 1

    # preprocess for test data
    index_6 = np.where(test_y_from_dataset == 6)
    index_9 = np.where(test_y_from_dataset == 9)
    index = np.concatenate([index_6[0], index_9[0]])
    np.random.shuffle(index)
    test_y = test_y_from_dataset[index]
    test_x = test_x_from_dataset[index]
    test_y[np.where(test_y == 6)] = 0
    test_y[np.where(test_y == 9)] = 1

    return train_x, train_y, test_x, test_y

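# Hedged baseline sketch on the 6-vs-9 split above: a quick logistic
# regression (standard scikit-learn usage; the reshape assumes
# mnist.get_data() returns 2-D image arrays per sample, which is an
# assumption, not confirmed by the source).
from sklearn.linear_model import LogisticRegression

train_x, train_y, test_x, test_y = get_dataset()
clf = LogisticRegression(max_iter=1000)
clf.fit(train_x.reshape(len(train_x), -1), train_y)
print(clf.score(test_x.reshape(len(test_x), -1), test_y))
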
            after_date = rel_date
            dist_from_thresh = dist

    if before is not None and after is not None:
        return (after - before, before, after, before_date, after_date)
    else:
        return (None, None, None, None, None)


# Collect statistics
has_procedure = 0
has_baseline = 0
no_baseline = []
has_followup = 0
stats = defaultdict(list)
total = 1056
for i in range(total - 1):
    p = get_data([i])[0]
    print str(i) + " - " + p['EMPI']
    procedure_date = get_operation_date(p)
    if procedure_date:
        has_procedure += 1
    (ef_delta, baseline_ef, followup_ef, baseline_date,
     followup_date) = get_ef_delta(p)
    if not baseline_ef:
        no_baseline.append(p['EMPI'])
    if baseline_ef and baseline_date > -60:
        has_baseline += 1
    if followup_date > 100 and followup_date < 500:
        has_followup += 1
    stats['procedure_date'].append(procedure_date)
    stats['baseline_days'].append(baseline_date)
    stats['followup_days'].append(followup_date)

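# Illustrative follow-up using the counters populated above (print syntax
# matches the script's Python 2 style):
print "Has procedure: " + str(has_procedure) + "/" + str(total)
print "Has baseline: " + str(has_baseline)
print "Has follow-up: " + str(has_followup)
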