def test_save_subset(capsys):
    ch = get_cache(DOCTEST_SESSION)

    inputArray = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                           110, 120, 130, 140, 150, 160, 170, 180, 190, 200])
    randomSubset = np.array([0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1])
    inputHash = ch.hashArray('input_array', inputArray, 'feature')
    subsetHash = ch.hashArray('subset_hash', randomSubset, 'subset')

    # Missing input hash should be handled gracefully
    save_subset(None, None, CODEX_ROOT + '/uploads/', session=ch)

    # Test scenario of not applying a subset mask
    outputHash, resultingName = save_subset(inputHash['hash'], False, CODEX_ROOT + '/uploads/save_subset_output_test.h5', session=ch)
    readingHash = codex_read_hd5(CODEX_ROOT + '/uploads/save_subset_output_test.h5', [resultingName], "feature", session=ch)
    assert outputHash == readingHash[0][0]

    # Test scenario of applying subset mask. Save full feature.
    outputHash, resultingName = save_subset(inputHash['hash'], subsetHash['hash'], CODEX_ROOT + '/uploads/save_subset_output_test.h5', session=ch)
    readingHash = codex_read_hd5(CODEX_ROOT + '/uploads/save_subset_output_test.h5', [resultingName], "feature", session=ch)
    assert outputHash == readingHash[0][0]
def test_update_data(capsys, testData):
    cache = get_cache(DOCTEST_SESSION)
    message = {'routine': 'arrange', 'hashType': 'feature', 'field': 'name',
               'old': 'TiO2', 'new': 'updated_name', 'cid': '8vrjn',
               'sessionkey': DOCTEST_SESSION}
    result = update_data(message, {})
    #assert result['message'] == 'success'
def test_codex_read_csv(capsys):
    ch = get_cache(DOCTEST_SESSION)

    # Missing file should be handled gracefully
    featureList = ['TiO2', 'FeOT', 'SiO2', 'Total']
    hashList = codex_read_csv(CODEX_ROOT + '/uploads/missing.csv', featureList, "feature", session=ch)

    # Unknown feature name should be handled gracefully
    featureList = ['fake_feature', 'FeOT', 'SiO2', 'Total']
    hashList = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv', featureList, "feature", session=ch)
def codex_read_npy(file, featureList, hashType, session=None):
    '''
    Inputs: file - path to the .npy file to read
            featureList - ignored; feature names are generated as "feature_<column>"
            hashType - hash type to store the features under
            session - cache session to use
    Outputs: hashList - list of feature hashes
             featureList - list of generated feature names
    Notes:
    '''
    cache = get_cache(session)
    hashList = []

    try:
        data = np.load(file)
    except BaseException:
        logging.warning("ERROR: codex_read_npy - cannot open file")
        return None

    samples, features = data.shape
    featureList = []
    for x in range(0, features):
        feature_name = "feature_" + str(x)
        try:
            feature_data = data[:, x].astype(float)
        except BaseException:
            feature_data = string2token(data[:, x])
            logging.info("Log: codex_read_npy: Tokenized " + feature_name)
        featureList.append(feature_name)
        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    return hashList, featureList
def save_subset(inputHash, subsetHash, saveFilePath, session=None):
    '''
    Inputs: inputHash - hash of the feature to save
            subsetHash - hash of the subset mask to apply, or False to save the full feature
            saveFilePath - path of the HDF5 file to write
            session - cache session to use
    Outputs: hash and name of the saved feature
    '''
    cache = get_cache(session)

    returnHash = cache.findHashArray("hash", inputHash, "feature")
    if returnHash is None:
        logging.warning("Hash not found. Returning!")
        return

    data = returnHash['data']
    feature_name = returnHash['name']

    if subsetHash is not False:
        data, subsetName = cache.applySubsetMask(data, subsetHash)
        newFeatureName = feature_name + "_" + subsetName
    else:
        newFeatureName = feature_name

    newHash = cache.hashArray(newFeatureName, data, 'feature')

    h5f = h5py.File(saveFilePath, 'w')
    h5f.create_dataset(newFeatureName, data=data)
    h5f.close()

    return newHash['hash'], newFeatureName
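# Illustrative usage sketch for save_subset (not part of the test suite). It assumes a
# doctest cache session; the feature name, mask name, and output paths under
# CODEX_ROOT/uploads are hypothetical.
def _example_save_subset_usage():
    ch = get_cache(DOCTEST_SESSION)
    feature = ch.hashArray('example_feature', np.arange(20, dtype=float), 'feature')
    mask = ch.hashArray('example_mask', np.tile(np.array([0, 1]), 10), 'subset')

    # Save the full feature (no subset mask applied)
    full_hash, full_name = save_subset(feature['hash'], False,
                                       CODEX_ROOT + '/uploads/example_full.h5', session=ch)

    # Save only the masked rows; the saved dataset name has the subset name appended
    sub_hash, sub_name = save_subset(feature['hash'], mask['hash'],
                                     CODEX_ROOT + '/uploads/example_subset.h5', session=ch)
    return full_name, sub_name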
def test_get_sessions(capsys, testData):
    cache = get_cache(DOCTEST_SESSION)
    message = {'session_name': 'AUTOSAVE',
               'state': {'windows': [{'data': {'features': ['SiO2', 'TiO2']},
                                      'height': 500, 'width': 500,
                                      'x': 0, 'y': 0,
                                      'windowType': 'Scatter'}]},
               'sessionkey': DOCTEST_SESSION}
    result = get_sessions({}, {}, CODEX_ROOT)
    assert "AUTOSAVE" in result['sessions']
def test_clustering(capsys, testData):
    ch = get_cache(DOCTEST_SESSION)
    parms = {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}

    # Unsupported algorithm name should fail gracefully
    result = clustering(testData['inputHash'], testData['hashList'], None, False, "kmean",
                        False, parms, None, "direct", None, {}, ch).run()
    assert result['message'] == 'failure'
    assert result['WARNING'] == 'kmean algorithm not supported'

    # Each supported algorithm should run successfully
    algorithms = ["kmeans", "mean_shift", "birch", "ward", "spectral", "dbscan",
                  "agglomerative", "affinity_propagation"]
    for algorithm in algorithms:
        result = clustering(testData['inputHash'], testData['hashList'], None, False, algorithm,
                            False, parms, None, "direct", None, {}, ch).run()
        assert result['message'] == 'success'
def testData(session=None):
    '''
    Inputs: optional cache session; defaults to the doctest session
    Outputs: dictionary of hashes and feature names used by the unit tests
    Notes: doctest function to streamline data ingestion for use in clustering unit tests
           TODO - labels are currently stashed in features due to front-end limitations.
           Need to convert here when they get moved to their own class.
    '''
    from api.sub.hash import get_cache, DOCTEST_SESSION
    cache = get_cache(DOCTEST_SESSION if session is None else session)

    featureList = ['TiO2', 'FeOT', 'SiO2', 'Total']
    hashList, featureList = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv', featureList, "feature", session=cache)

    # merge 1d arrays to nd-array
    data = cache.mergeHashResults(hashList)
    samples, features = data.shape
    inputHash = cache.hashArray('Merged', data, "feature")

    template = np.zeros(samples)
    templateHashDictionary = cache.hashArray("template", template, "feature")
    templateHash = templateHashDictionary['hash']

    labelHash = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv', ["labels"], "feature", session=cache)
    labelHash = labelHash[0][0]

    regrLabelData = []
    random.seed(50)
    for j in range(samples):
        regrLabelData.append(random.randint(0, 10))
    regrLabelData = np.asarray(regrLabelData)
    regrLabelDictionary = cache.hashArray("regrLabelHash", regrLabelData, "feature")
    regrLabelHash = regrLabelDictionary['hash']

    return {
        "inputHash": inputHash['hash'],
        'featureNames': featureList,
        "hashList": hashList,
        "templateHash": templateHash,
        "classLabelHash": labelHash,
        "regrLabelHash": regrLabelHash
    }
def load_session(msg, result, loadPath):
    '''
    Inputs:
    Outputs:
    '''
    try:
        cache = get_cache(msg['sessionkey'])
        session_name = msg['session_name']
        session_path = os.path.join(loadPath, 'sessions', session_name)
        result['session_name'] = msg['session_name']

        if os.path.exists(session_path):
            result['session_data'] = cache.unpickle_data(session_name, loadPath)
        else:
            result["WARNING"] = session_name + " does not exist."
    except:
        logging.warning(traceback.format_exc())

    return result
def save_session(msg, result, savePath):
    '''
    Inputs:
    Outputs:
    '''
    try:
        cache = get_cache(msg['sessionkey'])
        session_name = msg['session_name']

        if session_name == "AUTOSAVE":
            cache.pickle_data(session_name, msg['state'], savePath)
        else:
            session_path = os.path.join(savePath, 'sessions', session_name)
            if not os.path.exists(session_path):
                cache.pickle_data(session_name, msg['state'], savePath)
            else:
                result["WARNING"] = "{session_name} already exists.".format(
                    session_name=session_name)
    except:
        logging.warning(traceback.format_exc())

    return result
def codex_server_memory_check(verbose=False, session=None):
    '''
    Inputs:
    Outputs:
    Notes: Value returned in MB
    '''
    from api.sub.hash import get_cache  # defer import to try to circumvent circular import
    cache = get_cache(session)

    allowed_ram = 4096
    current_ram = get_codex_memory_usage()
    if verbose:
        logging.info("RAM Usage: " + str(current_ram) + "/" + str(allowed_ram))

    while current_ram > allowed_ram:
        last_ram = current_ram

        status = cache.remove_stale_data()
        if status != True:
            return

        current_ram = get_codex_memory_usage()
        if verbose:
            logging.info("RAM Usage: " + str(current_ram) + "/" + str(allowed_ram))

        if math.isclose(current_ram, last_ram, abs_tol=10):
            return
def test_dimension_reduction(capsys, testData):
    ch = get_cache(DOCTEST_SESSION)

    result = dimension_reduction(testData['inputHash'], testData['hashList'], None, False, "PCA",
                                 False, {"n_components": 2}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = dimension_reduction(testData['inputHash'], testData['hashList'], None, False, "ICA",
                                 False, {"n_components": 2}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
def test_codex_read_hd5(capsys):
    ch = get_cache(DOCTEST_SESSION)

    featureList = ['L2/RetrievalGeometry/retrieval_latitude/', 'L2/RetrievalResults/xco2']
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5', featureList, "feature", session=ch)
    assert result == (['314f2860593b8d3a5c8612693aed9232874210a3', '5d3d72c3ad2afcccb86d1693fd1a4b3bb39f407a'],
                      ['L2/RetrievalGeometry/retrieval_latitude/', 'L2/RetrievalResults/xco2'])

    featureList = ['L2/RetrievalGeometry/retrieval_latitude/', 'L2/RetrievalResults/xco2', 'missing_feature']
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5', featureList, "feature", session=ch)
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_1000.h5', featureList, "feature", session=ch)
def test_peak_detection(capsys, testData):
    ch = get_cache(DOCTEST_SESSION)
    result = peak_detection(testData['inputHash'], testData['hashList'], None, False, "cwt", False,
                            {"peak_width": 5, "gap_threshold": 2, "min_snr": 1, "noise_perc": 3},
                            None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
def test_normalize(capsys, testData):
    ch = get_cache(DOCTEST_SESSION)

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "test", False,
                       {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9},
                       None, "direct", None, {}, ch).run()
    assert result['message'] == 'failure'
    assert result['WARNING'] == 'test algorithm not supported'

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "normalize", False,
                       {}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "normalize", False,
                       {}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
def codex_read_csv(file, featureList, hashType, session=None):
    '''
    Inputs:
    Outputs:
    '''
    cache = get_cache(session)
    #cache.logReturnCode(inspect.currentframe())
    hashList = []
    columns = defaultdict(list)

    try:
        with open(file) as f:
            reader = csv.DictReader(f)
            for row in reader:
                for (k, v) in row.items():
                    columns[k].append(v)
    except BaseException:
        logging.warning("codex_read_csv - cannot open file")
        return None

    if featureList is None:
        featureList = columns.keys()

    for feature_name in featureList:
        try:
            feature_data = columns[feature_name][:]
        except BaseException:
            logging.warning("codex_read_csv: Feature not found.")
            return None

        if isinstance(feature_data, list):
            feature_data = np.asarray(feature_data)

        try:
            feature_data = feature_data.astype(float)
        except BaseException:
            logging.info("Tokenizing {f}.".format(f=feature_name))
            feature_data = string2token(feature_data)

        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    return hashList, list(featureList)
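# Illustrative usage sketch for codex_read_csv (not part of the test suite). Column names
# below are the ones used by the doctest CSV elsewhere in this module; session handling
# mirrors the tests.
def _example_codex_read_csv_usage():
    ch = get_cache(DOCTEST_SESSION)

    # Read specific columns; non-numeric columns are tokenized via string2token
    hashList, names = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv',
                                     ['TiO2', 'FeOT'], "feature", session=ch)

    # Passing featureList=None reads every column in the file
    all_hashes, all_names = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv',
                                           None, "feature", session=ch)
    return names, all_names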
def codex_read_hd5(file, featureList, hashType, session=None):
    '''
    Inputs: file - path to the HDF5 file to read
            featureList - list of dataset paths to read, or None to read every dataset
            hashType - hash type to store the features under
            session - cache session to use
    Outputs: hashList - list of feature hashes
             featureList - list of feature names that were read
    Notes:
    '''
    cache = get_cache(session)
    hashList = []

    try:
        f = h5py.File(file, 'r+')
    except BaseException:
        logging.warning("ERROR: codex_read_hd5 - cannot open file")
        return None

    if featureList is None:
        featureList = list(traverse_datasets(file))

    for feature_name in featureList:
        try:
            feature_data = f[feature_name][:]
        except BaseException:
            logging.warning("Error: codex_read_hd5: Feature not found.")
            return

        try:
            feature_data = feature_data.astype(float)
        except BaseException:
            feature_data = string2token(feature_data)
            logging.info("Log: codex_read_hd5: Tokenized " + feature_name)

        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    f.close()
    return hashList, list(featureList)
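# Illustrative usage sketch for codex_read_hd5 (not part of the test suite). The dataset
# paths mirror those used in test_codex_read_hd5; passing featureList=None walks every
# dataset in the file via traverse_datasets.
def _example_codex_read_hd5_usage():
    ch = get_cache(DOCTEST_SESSION)
    hashList, names = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5',
                                     ['L2/RetrievalResults/xco2'], "feature", session=ch)
    all_hashes, all_names = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5',
                                           None, "feature", session=ch)
    return names, all_names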
def test_downsample(capsys):
    ch = get_cache(DOCTEST_SESSION)
    array = np.random.rand(200)

    result = downsample(array, percentage=10, session=ch)
    assert len(result) == 20

    result = downsample(array, samples=50, session=ch)
    assert len(result) == 50

    # More samples than in array
    result = downsample(array, samples=250, session=ch)
    assert len(result) == 200

    ch.resetCacheList("downsample")
    result1 = downsample(array, samples=50, session=ch)
    result2 = downsample(array, samples=50, session=ch)
    assert np.array_equal(result1, result2) == True

    result3 = downsample(array, percentage=120, session=ch)
    result4 = downsample(array, session=ch)
def download_code(msg, result, savePath):
    '''
    Inputs:
    Outputs:
    '''
    try:
        cache = get_cache(msg['sessionkey'])
        saveFile = os.path.join(savePath, "returned_code.py")
        cache.dump_code_to_file(saveFile)

        f = open(saveFile, "r")
        lines = f.readlines()
        outString = "".join(lines)
        outStringEncoded = outString.encode('ascii')
        result['code'] = str(base64.b64encode(outStringEncoded).decode('utf-8'))
        result['message'] = 'success'
        f.close()
    except:
        logging.warning(traceback.format_exc())

    return result
def explain_this(inputHash, featureNames, dataSelections, result, session=None):
    '''
    Inputs:
    Outputs:
    Notes: Only works for binary classification. 0 class should be main data,
           1 class should be isolated data to explain.
    '''
    ch = get_cache(session)
    startTime = time.time()
    result = {"WARNING": None}

    returnHash = ch.findHashArray("hash", inputHash, "feature")
    if returnHash is None:
        warning("ERROR: explain_this: Hash not found. Returning.")
        return None

    data = returnHash['data']
    if data is None:
        return None

    if data.ndim < 2:
        warning("ERROR: explain_this - insufficient data dimensions")
        return None

    X, y = create_data_from_indices(dataSelections, data)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    result['tree_sweep'] = []
    samples_, features_ = X.shape
    max_depth = 5
    for i in range(1, max_depth):

        # train and fit the model
        parameters = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'min_samples_split': range(2, 10),
            'min_samples_leaf': range(1, 10),
            'max_features': range(1, features_)
        }
        clf = DecisionTreeClassifier(max_depth=i)
        random_search = RandomizedSearchCV(estimator=clf,
                                           param_distributions=parameters,
                                           n_iter=100,
                                           cv=3,
                                           verbose=2,
                                           random_state=42,
                                           n_jobs=-1)
        random_search.fit(X_train, y_train)
        best_tree = random_search.best_estimator_

        # generate the interpretation of the model
        dictionary = {}
        proportion_tree_sums = get_proportion_tree_sums(best_tree, X_test, y_test)
        feature_weights, feature_rank = zip(*sorted(
            zip(best_tree.feature_importances_, featureNames), reverse=True))
        json_tree = export_json_tree(best_tree, featureNames[::-1],
                                     ["Main Data", "Isolated Data"], proportion_tree_sums)

        # rotate the tree here
        rotated_tree = rotate_tree(json_tree)
        dictionary['json_tree'] = rotated_tree
        dictionary["score"] = np.round(best_tree.score(X_test, y_test) * 100)
        dictionary["max_features"] = best_tree.max_features_
        feature_weights = np.asarray(feature_weights).astype(float)
        dictionary["feature_rank"] = feature_rank
        dictionary["feature_weights"] = (np.round(feature_weights * 100)).tolist()

        result["tree_sweep"].append(dictionary)

    return result
def label_swap(labels, dataHash, session=None):
    '''
    Inputs:
    Outputs:
    '''
    from api.sub.hash import get_cache
    cache = get_cache(session)

    test_uniq_labels = np.unique(labels)
    num_labels = labels.size

    tmp = cache.findHashArray("name", dataHash, "label")
    if tmp is None:
        return labels

    saved_labels = tmp["data"]
    ref_uniq_labels = np.unique(saved_labels)
    num_saved_labels = saved_labels.size

    # If reference labels are from DBSCAN, don't attempt to use them
    if np.any(ref_uniq_labels[:] == -1):
        return labels

    # If test labels are from DBSCAN, don't attempt to use them
    if np.any(test_uniq_labels[:] == -1):
        return labels

    # Do not attempt to remap label colors if the k values of the
    # test and reference label sets have a delta larger than 5
    if abs(ref_uniq_labels.size - test_uniq_labels.size) > 5:
        logging.info(
            "Difference between test and reference labels is too high, returning original labels"
        )
        return labels

    finalMap = {}
    for z in range(0, test_uniq_labels.size):
        finalMap[str(z)] = None

    l_saved_labels = saved_labels.tolist()
    l_labels = labels.tolist()
    used = []

    # For each k-label
    for x in range(0, test_uniq_labels.size):

        whileCount = 0
        whileMax = 15
        while finalMap[str(x)] is None:

            if x in ref_uniq_labels:
                # Find the first index of the list with that k-label
                # Compute on shuffled list so first index is always different
                shuffle(l_saved_labels)
                ref_ind = l_saved_labels.index(x)

                # Get the label currently being used in new data at ref_ind location
                cur_label = labels[ref_ind]

                # If the incoming label hasn't been used yet, use it. Else, try again.
                if finalMap[str(x)] is None:
                    if cur_label not in used:
                        finalMap[str(x)] = cur_label
                        used.append(cur_label)

            # Exceeded reference label k-count. Move on and fill in later
            else:
                break

            # Non-convergence break for while loop
            if whileCount > whileMax:
                break
            whileCount += 1

    # If test has more clusters than reference, some will not yet be filled (None)
    # Fill them in incrementally with k-labels which have not yet been used.
    for x in range(0, test_uniq_labels.size):
        if finalMap[str(x)] is None:
            found = False
            for y in range(0, test_uniq_labels.size):
                if y not in used and found == False:
                    finalMap[str(x)] = y
                    used.append(y)
                    found = True
            if found == False:
                newMax = max(used)
                finalMap[str(x)] = newMax + 1
                used.append(newMax + 1)

    # Apply the "finalMap" translation dictionary on outgoing labels
    for j in range(0, labels.size):
        label = labels[j]
        newLabel = finalMap[str(label)]
        labels[j] = int(newLabel)

    return labels
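# Illustrative sketch of label_swap (not part of the test suite). It remaps newly computed
# cluster labels so that, where possible, they reuse the label values of a previously cached
# clustering ("dataHash" is looked up by name in the label cache), keeping cluster colors
# stable across re-runs. The saved label name below is hypothetical.
def _example_label_swap_usage():
    ch = get_cache(DOCTEST_SESSION)
    new_labels = np.array([1, 1, 0, 0, 2, 2, 1, 0])
    remapped = label_swap(new_labels, "previous_kmeans_labels", session=ch)
    # If no labels are cached under that name, the input labels come back unchanged
    return remapped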
def find_more_like_this(inputHash, featureList, dataSelections, similarityThreshold, result, session=None):
    ch = get_cache(session)
    startTime = time.time()
    result = {"WARNING": None}

    returnHash = ch.findHashArray("hash", inputHash, "feature")
    if returnHash is None:
        warning("ERROR: general_classifier: Hash not found. Returning.")
        return None

    data = returnHash['data']
    if data is None:
        return None

    # handles the data and training for the positive-unlabeled learning bagging classifier
    # get the formatted data
    dataSelectionsValues = list(dataSelections)

    # get a full mask of all data [0, 1, 0 ... 0, 0, 1] of length(num data)
    data_mask = np.zeros(np.shape(data)[0])
    for index in dataSelectionsValues:
        data_mask[index] = 1

    # get the data corresponding to all positive examples
    positive_data = []
    for index in dataSelectionsValues:
        positive_data.append(data[index])

    # train the classifier with the formatted data as the positive examples and
    # a random sample from the other data with replacement as the negatives
    # this is done several times and the models are bagged
    votes = np.zeros(np.shape(data)[0])

    # parameter search
    n_estimators = np.arange(3, 15)
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = np.arange(2, 5)
    # Minimum number of samples required to split a node
    min_samples_split = np.arange(2, 10)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = np.arange(1, 10)
    # Method of selecting samples for training each tree
    bootstrap = [True, False]

    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }

    num_classifiers = 1
    for i in range(num_classifiers):
        # each iteration train a classifier on the positive data
        # and a random subsample of the other data as negative examples
        # choose the same number of negative as positive examples you have
        negative_data_indices = np.random.choice(
            [index for index in range(len(data)) if data_mask[index] == 0],
            replace=True,
            size=len(positive_data))
        negative_data = [data[index] for index in negative_data_indices]

        X = positive_data + negative_data

        # create labels now: mark the positive examples as 1
        Y = np.zeros(len(X))
        for j in range(len(positive_data)):
            Y[j] = 1

        rf = RandomForestClassifier()
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=random_grid,
                                       n_iter=30,
                                       cv=3,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        rf_random.fit(X, Y)
        prediction = rf_random.predict(data)

        # make predictions and add them to votes
        votes += prediction

    votes = votes / num_classifiers

    # return all points labeled with a positive being when half or more of the bagged
    # classifiers vote on a given piece of data
    like_this_indices = [
        index for index, value in enumerate(votes) if value > similarityThreshold
    ]

    # also include all of the positive inputs in the like_this_indices array
    result["like_this"] = like_this_indices
    return result
def on_message(self, message):
    global fileChunks

    msg = json.loads(message)
    result = {}

    filename = msg["filename"]
    filepath = os.path.join(CODEX_ROOT, "uploads", filename)

    if msg["done"] == True:
        stop_cache_server()
        codex_hash_server = make_cache_process()
        codex_hash_server.start()

        logging.info('Finished file transfer, initiating save...')
        cache = get_cache(msg['sessionkey'], timeout=None)

        f = open(filepath, 'wb')
        for chunk in fileChunks:
            f.write(chunk)
        f.close()
        fileChunks = []

        fileExtension = filename.split(".")[-1]
        if fileExtension == "csv":
            hashList, featureList = cache.import_csv(filepath)
        elif fileExtension == "h5":
            hashList, featureList = cache.import_hd5(filepath)
        elif fileExtension == "npy":
            hashList, featureList = cache.import_npy(filepath)
        else:
            result['message'] = "Currently unsupported filetype"
            stringMsg = json.dumps(result)
            self.write_message(stringMsg)
            return

        sentinel_values = cache.getSentinelValues(featureList)
        nan = sentinel_values["nan"]
        inf = sentinel_values["inf"]
        ninf = sentinel_values["ninf"]
    else:
        contents = base64.decodebytes(str.encode(msg["chunk"]))
        fileChunks.append(contents)

    if msg['done']:
        result['status'] = 'complete'
        result['feature_names'] = featureList
        result["nan"] = nan
        result["inf"] = inf
        result["ninf"] = ninf
        logging.info('Finished file save.')
    else:
        result['status'] = 'streaming'

    stringMsg = json.dumps(result)
    self.write_message(stringMsg)
def algorithm_call(msg, result):
    '''
    Inputs:
    Outputs:
    '''
    try:
        ch = get_cache(msg['sessionkey'])

        parms = msg['parameters']
        downsampled = msg["downsampled"]
        algorithmName = msg['algorithmName']
        algorithmType = msg["algorithmType"]

        featureList = msg["dataFeatures"]
        featureList = get_featureList(featureList)

        subsetHashName = msg["dataSelections"]
        if subsetHashName != []:
            subsetHashName = subsetHashName[0]
        else:
            subsetHashName = False

        try:
            labelName = msg["labelName"]
            labelHash = ch.findHashArray("name", labelName, "feature")['hash']
        except:
            labelHash = None

        try:
            cross_val = msg["cross_val"]
        except:
            cross_val = None

        try:
            search_type = msg["search_type"]
        except:
            search_type = 'direct'

        try:
            scoring = msg["scoring"]
        except:
            scoring = None

        try:
            activeLabels = msg["activeLabels"]
        except:
            activeLabels = None

        hashList = ch.feature2hashList(featureList)
        data = ch.mergeHashResults(hashList)
        inputHash = ch.hashArray('Merged', data, "feature")
        if inputHash != None:
            inputHash = inputHash["hash"]

        if downsampled != False:
            downsampled = int(downsampled)

        if algorithmType == "clustering":
            pca = dimension_reduction(inputHash, activeLabels, featureList, hashList, labelHash,
                                      subsetHashName, "PCA", downsampled, {"n_components": 2},
                                      scoring, search_type, cross_val, result, ch).run()
            result = clustering(inputHash, activeLabels, featureList, hashList, labelHash,
                                subsetHashName, algorithmName, downsampled, parms,
                                scoring, search_type, cross_val, result, ch).run()
            result['data'] = pca['data']
        elif algorithmType == "dimensionality_reduction":
            result = dimension_reduction(inputHash, activeLabels, featureList, hashList, labelHash,
                                         subsetHashName, algorithmName, downsampled, parms,
                                         scoring, search_type, cross_val, result, ch).run()
        elif algorithmType == "normalize":
            result = normalize(inputHash, activeLabels, featureList, hashList, labelHash,
                               subsetHashName, algorithmName, downsampled, parms,
                               scoring, search_type, cross_val, result, ch).run()
        elif algorithmType == "peak_detect":
            result = peak_detection(inputHash, activeLabels, featureList, hashList, labelHash,
                                    subsetHashName, algorithmName, downsampled, parms,
                                    scoring, search_type, cross_val, result, ch).run()
        elif algorithmType == "regression":
            result = regression(inputHash, activeLabels, featureList, hashList, labelHash,
                                subsetHashName, algorithmName, downsampled, parms,
                                scoring, search_type, cross_val, result, ch).run()
        elif algorithmType == "template_scan":
            result = template_scan(inputHash, activeLabels, featureList, hashList, labelHash,
                                   subsetHashName, algorithmName, downsampled, parms,
                                   scoring, search_type, cross_val, result, ch).run()
        elif algorithmType == "correlation":
            result = correlation(inputHash, activeLabels, featureList, hashList, labelHash,
                                 subsetHashName, algorithmName, downsampled, parms,
                                 scoring, search_type, cross_val, result, ch).run()
        else:
            result['message'] = "Cannot parse algorithmType"
    except:
        logging.warning(traceback.format_exc())

    return result
def test_codex_server_memory_check(capsys):
    ch = get_cache(DOCTEST_SESSION)
    codex_server_memory_check(session=ch)
def downsample(inputArray, samples=0, percentage=0.0, session=None, algorithm="simple"): ''' Inputs: inputArray - numpy array - array to be downsampled samples - int (optional) - number of samples requested in output array percentage - float (optional) - Outputs: outputArray - numpy array - resulting downsampled array Notes: If one wishes to do a percentage, do the percentage to samples calculation in the calling function ''' cache = get_cache(session) # first, create a hash of the input array, don't save inputHash = cache.hashArray("NOSAVE", inputArray, "NOSAVE") inputHashCode = inputHash["hash"] inputArray = impute( inputArray ) # TODO - mblib spanning seems to have problems with NaNs. Impute until fixed. totalPoints = inputArray.shape[0] # if number of samples is provided, use if (samples > 0): usedSamples = samples elif (percentage != 0): if (percentage <= 100 and percentage >= 0): usedSamples = int(float(percentage / 100) * totalPoints) else: logging.warning( "ERROR: downsample - perceange out of bounds 0-100") usedSamples = totalPoints else: logging.warning("ERROR: downsample - samples and percentage both 0.") usedSamples = totalPoints # first, check if this downsampling has already been done before existingHashCheck = cache.findHashArray("name", inputHashCode, "downsample") # Check if raw length is already less than requested downsample rate. # If it is, use that, otherwise, resample. if (existingHashCheck is not None and existingHashCheck["samples"] == usedSamples): outputArray = existingHashCheck["data"] else: try: if algorithm == "simple": outputArray = inputArray[np.random.choice(inputArray.shape[0], usedSamples, replace=False)] elif algorithm == "spanning": if inputArray.ndim == 1: inputList = [inputArray.tolist()] else: inputList = inputArray.T.tolist() mask_, array_ = mask_spanning_subset(inputList, usedSamples) outputArray = inputArray[mask_] else: logging.warning( "Unknown downsampling algorithm: {algorithm}".format( algorithm=algorithm)) outputArray = inputArray except BaseException: logging.warning( "downsample - failed to downsample.\n\n{trace}".format( trace=traceback.format_exc())) outputArray = inputArray # Hash the downsampled output, using the hash of the input in place of the name. # Later look up using this, w.r.t origin data outputHash = cache.hashArray(inputHashCode, outputArray, "downsample") return outputArray
def test_get_data_metrics(capsys, testData):
    cache = get_cache(DOCTEST_SESSION)
    message = {'routine': 'arrange', 'hashType': 'feature', 'activity': 'metrics',
               'name': ['TiO2'], 'cid': '8vrjn', 'sessionkey': DOCTEST_SESSION}
    result = get_data_metrics(message, {})
def export_contents(msg, result, savePath):
    '''
    Inputs:
    Outputs:
    '''
    try:
        if msg["type"] == "code":
            cache = get_cache(msg['sessionkey'])
            saveFile = os.path.join(savePath, "returned_code.py")
            cache.dump_code_to_file(saveFile)

            f = open(saveFile, "r")
            lines = f.readlines()
            outString = "".join(lines)
            outStringEncoded = outString.encode('ascii')
            result['data'] = str(base64.b64encode(outStringEncoded).decode('utf-8'))
            result['filename'] = 'codex_code.py'
            result['message'] = 'success'
            result['content'] = True
            f.close()

        elif msg["type"] == "features":
            cache = get_cache(msg['sessionkey'])
            data = cache.return_data()

            names = []
            features = []
            for item in data['features']:
                names.append(item['name'])
                features.append(item['data'])

            if features:
                features = np.column_stack(features)
                header = ",".join(names)

                saveFile = os.path.join(savePath, "features.csv")
                np.savetxt(saveFile, features, delimiter=',', header=header)

                f = open(saveFile, "r")
                lines = f.readlines()
                outString = "".join(lines)
                outStringEncoded = outString.encode('ascii')
                result['data'] = str(base64.b64encode(outStringEncoded).decode('utf-8'))
                result['filename'] = 'codex_features.csv'
                result['content'] = True
                f.close()
            else:
                result['content'] = False

            result['message'] = 'success'

        elif msg["type"] == "selections":
            cache = get_cache(msg['sessionkey'])
            data = cache.return_data()

            names = []
            selections = []
            for item in data['subsets']:
                names.append(item['name'])
                selections.append(item['data'])

            if selections:
                selections = np.column_stack(selections)
                header = ",".join(names)

                saveFile = os.path.join(savePath, "selections.csv")
                np.savetxt(saveFile, selections, delimiter=',', header=header)

                f = open(saveFile, "r")
                lines = f.readlines()
                outString = "".join(lines)
                outStringEncoded = outString.encode('ascii')
                result['data'] = str(base64.b64encode(outStringEncoded).decode('utf-8'))
                result['filename'] = 'codex_selections.csv'
                result['content'] = True
                f.close()
            else:
                result['content'] = False

            result['message'] = 'success'

        else:
            result['WARNING'] = 'export type not supported. code|features|selections are supported.'
            result['message'] = 'failure'
    except:
        logging.warning(traceback.format_exc())

    return result
def run(self):
    self.cache = get_cache(self.session)
    startTime = time.time()
    self.result = {
        'algorithm': self.algorithmName,
        'downsample': self.downsampled,
        'WARNING': None
    }

    returnHash = self.cache.findHashArray("hash", self.inputHash, "feature")
    if returnHash is None:
        logging.warning("Input hash not found: {inputHash}".format(inputHash=self.inputHash))
        self.result["WARNING"] = "Input hash not found: {inputHash}".format(inputHash=self.inputHash)
        self.result['message'] = "failure"
        return self.result

    self.X = returnHash['data']
    if self.X is None:
        self.result['message'] = "failure"
        logging.warning("X returned None")
        return self.result

    ret = self.check_valid()
    if not ret:
        self.result['message'] = "failure"
        return self.result

    if self.X.ndim == 1:
        full_samples = self.X.shape[0]
        full_features = 1
    else:
        full_samples, full_features = self.X.shape

    self.result['eta'] = getComputeTimeEstimate(self.__class__.__name__,
                                                self.algorithmName,
                                                full_samples,
                                                full_features)

    if self.subsetHashName is not False:
        self.X = self.cache.applySubsetMask(self.X, self.subsetHashName)
        if self.X is None:
            logging.warning("Subset hash not found: {subsetHash}".format(subsetHash=self.subsetHashName))
            self.result['message'] = "failure"
            return self.result

    if self.downsampled is not False:
        self.X = downsample(self.X, samples=self.downsampled, session=self.cache, algorithm='simple')
        logging.info("Downsampled to {samples} samples".format(samples=len(self.X)))

    # TODO - labels are currently cached under features
    if self.labelHash:
        labelHash_dict = self.cache.findHashArray("hash", self.labelHash, "feature")
        if labelHash_dict is None:
            logging.warning("Label hash not found: {labelHash}".format(labelHash=self.labelHash))
            self.result['message'] = "failure"
            return self.result
        else:
            self.y = labelHash_dict['data']
            self.result['y'] = self.y.tolist()

    if self.X.ndim == 1:
        computed_samples = self.X.shape[0]
        computed_features = 1
    else:
        computed_samples, computed_features = self.X.shape

    self.X = impute(self.X)
    self.result['data'] = self.X.tolist()

    self.algorithm = self.get_algorithm()
    if self.algorithm == None:
        self.result['message'] = "failure"
        self.result['WARNING'] = "{alg} algorithm not supported".format(alg=self.algorithmName)
        return self.result

    self.fit_algorithm()

    # TODO - The front end should specify a save name for the model
    model_name = self.algorithmName + "_" + str(random.random())
    if self.search_type == 'direct':
        model_dict = self.cache.saveModel(model_name, self.algorithm, "regressor")
    else:
        model_dict = self.cache.saveModel(model_name, self.algorithm.best_estimator_, "regressor")

    if not model_dict:
        self.result['WARNING'] = "Model could not be saved."
    else:
        self.result['model_name'] = model_dict['name']
        self.result['model_hash'] = model_dict['hash']

    endTime = time.time()
    computeTime = endTime - startTime
    logTime(self.__class__.__name__, self.algorithmName, computeTime, computed_samples, computed_features)

    self.result['message'] = "success"
    return self.result
def test_add_data(capsys, testData):
    cache = get_cache(DOCTEST_SESSION)
    message = {'routine': 'arrange', 'hashType': 'feature', 'activity': 'add',
               'name': 'TiO2', 'data': [1, 2, 3, 4], 'sessionkey': DOCTEST_SESSION}
    results = add_data(message, {})