Example #1
def test_save_subset(capsys):

    ch = get_cache(DOCTEST_SESSION)
    inputArray = np.array([10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200])
    randomSubset = np.array([0,1,0,1,0,1,1,0,0,0,0,0,1,1,1,0,0,1,0,1])
    inputHash = ch.hashArray('input_array', inputArray, 'feature')
    subsetHash = ch.hashArray('subset_hash', randomSubset, 'subset')
    outputHash,resultingName = save_subset(inputHash['hash'], False, CODEX_ROOT + '/uploads/save_subset_output_test.h5', session=ch)
    readingHash = codex_read_hd5(CODEX_ROOT + '/uploads/save_subset_output_test.h5', [resultingName], "feature", session=ch)

    save_subset(None, None, CODEX_ROOT + '/uploads/', session=ch)

    inputArray = np.array([10,20,30,40,50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200])
    randomSubset = np.array([0,1,0,1,0,1,1,0,0,0,0,0,1,1,1,0,0,1,0,1])

    inputHash = ch.hashArray('input_array', inputArray, 'feature')
    subsetHash = ch.hashArray('subset_hash', randomSubset, 'subset')

    # Test scenario of not applying a subset mask
    outputHash,resultingName = save_subset(inputHash['hash'], False, CODEX_ROOT + '/uploads/save_subset_output_test.h5', session=ch)
    readingHash = codex_read_hd5(CODEX_ROOT + '/uploads/save_subset_output_test.h5', [resultingName], "feature", session=ch)

    assert outputHash == readingHash[0][0]

    # Test scenario of applying subset mask.  Save full feature.
    outputHash,resultingName = save_subset(inputHash['hash'], subsetHash['hash'], CODEX_ROOT + '/uploads/save_subset_output_test.h5', session=ch)
    readingHash = codex_read_hd5(CODEX_ROOT + '/uploads/save_subset_output_test.h5', [resultingName], "feature", session=ch)

    assert outputHash == readingHash[0][0]
Example #2
def test_update_data(capsys, testData):

    cache = get_cache(DOCTEST_SESSION)

    message = {'routine': 'arrange', 'hashType': 'feature', 'field': 'name', 'old': 'TiO2', 'new':'updated_name', 'cid': '8vrjn', 'sessionkey': DOCTEST_SESSION}
    result = update_data(message, {})
    #assert result['message'] == 'success'
Example #3
def test_codex_read_csv(capsys):

    ch = get_cache(DOCTEST_SESSION)
    featureList = ['TiO2','FeOT','SiO2','Total']
    hashList = codex_read_csv(CODEX_ROOT + '/uploads/missing.csv',featureList, "feature", session=ch)
    featureList = ['fake_feature','FeOT','SiO2','Total']
    hashList = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv',featureList, "feature", session=ch)
Example #4
def codex_read_npy(file, featureList, hashType, session=None):
    '''
    Inputs:

    Outputs:

    Notes:

    '''
    cache = get_cache(session)

    hashList = []

    try:
        data = np.load(file)
    except BaseException:
        logging.warning("ERROR: codex_read_npy - cannot open file")
        return None

    samples, features = data.shape
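    # .npy arrays carry no column names; the passed featureList is ignored and
    # synthetic feature_<index> names are generated instead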
    featureList = []
    for x in range(0, features):

        feature_name = "feature_" + str(x)

        try:
            feature_data = data[:, x].astype(float)
        except BaseException:
            feature_data = string2token(data[:, x])
            logging.info("Log: codex_read_npy: Tokenized " + feature_name)

        featureList.append(feature_name)
        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    return hashList, featureList
Example #5
def save_subset(inputHash, subsetHash, saveFilePath, session=None):
    '''
    Inputs:

    Outputs:

    '''
    cache = get_cache(session)
    returnHash = cache.findHashArray("hash", inputHash, "feature")
    if(returnHash is None):
        logging.warning("Hash not found. Returning!")
        return

    data = returnHash['data']
    feature_name = returnHash['name']

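    # A subsetHash of False means the full, unmasked feature is saved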
    if(subsetHash is not False):
        data, subsetName = cache.applySubsetMask(data, subsetHash)
        newFeatureName = feature_name + "_" + subsetName
    else:
        newFeatureName = feature_name

    newHash = cache.hashArray(newFeatureName, data, 'feature')

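    # Persist the (possibly masked) feature to HDF5 under its new name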
    h5f = h5py.File(saveFilePath, 'w')
    h5f.create_dataset(newFeatureName, data=data)
    h5f.close()

    return newHash['hash'], newFeatureName
Example #6
def test_get_sessions(capsys, testData):

    cache = get_cache(DOCTEST_SESSION)

    message =  {'session_name': 'AUTOSAVE', 'state': {'windows': [{'data': {'features': ['SiO2', 'TiO2']}, 'height': 500, 'width': 500, 'x': 0, 'y': 0, 'windowType': 'Scatter'}]}, 'sessionkey': DOCTEST_SESSION}
    result = get_sessions({}, {}, CODEX_ROOT)
    assert "AUTOSAVE" in result['sessions']
Example #7
def test_clustering(capsys, testData):

    ch = get_cache(DOCTEST_SESSION)

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "kmean",                False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'failure'
    assert result['WARNING'] == 'kmean algorithm not supported'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "kmeans",               False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "mean_shift",           False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "birch",                False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "ward",                 False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "spectral",             False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "dbscan",               False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "agglomerative",        False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = clustering(testData['inputHash'], testData['hashList'], None, False, "affinity_propagation", False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
Example #8
def testData(session=None):
    '''
    Inputs:
        None

    Outputs:
        None

    Notes:
        doctest function to streamline data ingestion for use
        in clustering unit tests

        TODO - labels are currently stashed in features due to front-end limitations.  
                Need to convert here when they get moved to their own class.

    '''
    from api.sub.hash import get_cache, DOCTEST_SESSION

    cache = get_cache(DOCTEST_SESSION if session is None else session)

    featureList = ['TiO2', 'FeOT', 'SiO2', 'Total']
    hashList, featureList = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv',
                                           featureList,
                                           "feature",
                                           session=cache)

    # merge 1d arrays to nd-array
    data = cache.mergeHashResults(hashList)
    samples, features = data.shape

    inputHash = cache.hashArray('Merged', data, "feature")

    template = np.zeros(samples)
    templateHashDictionary = cache.hashArray("template", template, "feature")
    templateHash = templateHashDictionary['hash']

    labelHash = codex_read_csv(CODEX_ROOT + '/uploads/doctest.csv', ["labels"],
                               "feature",
                               session=cache)
    labelHash = labelHash[0][0]

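    # Build a reproducible random label vector for the regression tests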
    regrLabelData = []
    random.seed(50)

    for j in range(samples):
        regrLabelData.append(random.randint(0, 10))

    regrLabelData = np.asarray(regrLabelData)
    regrLabelDictionary = cache.hashArray("regrLabelHash", regrLabelData,
                                          "feature")
    regrLabelHash = regrLabelDictionary['hash']

    return {
        "inputHash": inputHash['hash'],
        'featureNames': featureList,
        "hashList": hashList,
        "templateHash": templateHash,
        "classLabelHash": labelHash,
        "regrLabelHash": regrLabelHash
    }
Example #9
def load_session(msg, result, loadPath):
    '''
    Inputs:

    Outputs:

    '''
    try:

        cache = get_cache(msg['sessionkey'])

        session_name = msg['session_name']
        session_path = os.path.join(loadPath, 'sessions', session_name)
        result['session_name'] = msg['session_name']

        if os.path.exists(session_path):
            result['session_data'] = cache.unpickle_data(
                session_name, loadPath)
        else:
            result["WARNING"] = session_name + " does not exist."

    except:
        logging.warning(traceback.format_exc())

    return result
Example #10
def save_session(msg, result, savePath):
    '''
    Inputs:

    Outputs:

    '''
    try:

        cache = get_cache(msg['sessionkey'])

        session_name = msg['session_name']

        if session_name == "AUTOSAVE":
            cache.pickle_data(session_name, msg['state'], savePath)
        else:
            session_path = os.path.join(savePath, 'sessions', session_name)

            if not os.path.exists(session_path):
                cache.pickle_data(session_name, msg['state'], savePath)
            else:
                result["WARNING"] = "{session_name} already exists.".format(
                    session_name=session_name)

    except:
        logging.warning(traceback.format_exc())

    return result
Example #11
def codex_server_memory_check(verbose=False, session=None):
    '''
    Inputs:

    Outputs:

    Notes:
        Value returned in MB

    '''
    from api.sub.hash import get_cache # defer import to try to circumvent circular import

    cache = get_cache(session)
    allowed_ram = 4096
    current_ram = get_codex_memory_usage()

    if(verbose):
        logging.info("RAM Usage: " + str(current_ram) + "/" + str(allowed_ram))

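    # Evict stale cache entries until usage drops below the allowed budget,
    # or until eviction stops making progress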
    while(current_ram > allowed_ram):
        last_ram = current_ram
        status = cache.remove_stale_data()
        if(status is not True):
            return

        current_ram = get_codex_memory_usage()

        if(verbose):
            logging.info("RAM Usage: " + str(current_ram) + "/" + str(allowed_ram))

        if(math.isclose(current_ram, last_ram, abs_tol=10)):
            return
Example #12
def test_dimension_reduction(capsys, testData):

    ch = get_cache(DOCTEST_SESSION)

    result = dimension_reduction(testData['inputHash'], testData['hashList'], None, False, "PCA", False, {"n_components":2}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = dimension_reduction(testData['inputHash'], testData['hashList'], None, False, "ICA", False, {"n_components":2}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
Example #13
def test_codex_read_hd5(capsys):

    ch = get_cache(DOCTEST_SESSION)
    featureList = ['L2/RetrievalGeometry/retrieval_latitude/','L2/RetrievalResults/xco2']
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5',featureList, "feature", session=ch)
    assert result == (['314f2860593b8d3a5c8612693aed9232874210a3', '5d3d72c3ad2afcccb86d1693fd1a4b3bb39f407a'], ['L2/RetrievalGeometry/retrieval_latitude/', 'L2/RetrievalResults/xco2'])

    featureList = ['L2/RetrievalGeometry/retrieval_latitude/','L2/RetrievalResults/xco2','missing_feature']
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_10000.h5',featureList, "feature", session=ch)
    result = codex_read_hd5(CODEX_ROOT + '/uploads/lnd_glint_subsample_1000.h5', featureList, "feature", session=ch)
Example #14
def test_peak_detection(capsys, testData):

    ch = get_cache(DOCTEST_SESSION)

    result = peak_detection(testData['inputHash'], testData['hashList'], None,
                            False, "cwt", False, {
                                "peak_width": 5,
                                "gap_threshold": 2,
                                "min_snr": 1,
                                "noise_perc": 3
                            }, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
Example #15
def test_normalize(capsys, testData):

    ch = get_cache(DOCTEST_SESSION)

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "test", False, {'k': 3, 'eps': 0.7, 'n_neighbors': 10, 'quantile': 0.5, 'damping': 0.9}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'failure'
    assert result['WARNING'] == 'test algorithm not supported'

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "normalize", False, {}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'

    result = normalize(testData['inputHash'], testData['hashList'], None, False, "normalize", False, {}, None, "direct", None, {}, ch).run()
    assert result['message'] == 'success'
Example #16
def codex_read_csv(file, featureList, hashType, session=None):
    '''
    Inputs:

    Outputs:

    '''
    cache = get_cache(session)
    #cache.logReturnCode(inspect.currentframe())

    hashList = []
    columns = defaultdict(list)

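    # Read the CSV column-wise: map each header to the list of its string values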
    try:
        with open(file) as f:
            reader = csv.DictReader(f)
            for row in reader:
                for (k, v) in row.items():
                    columns[k].append(v)
    except BaseException:
        logging.warning("codex_read_csv - cannot open file")
        return None

    if(featureList is None):
        featureList = columns.keys()

    for feature_name in featureList:
        if feature_name not in columns:
            logging.warning("codex_read_csv: Feature not found.")
            return None
        feature_data = columns[feature_name][:]

        if(isinstance(feature_data, list)):
            feature_data = np.asarray(feature_data)

        try:
            feature_data = feature_data.astype(float)
        except BaseException:
            logging.info("Tokenizing {f}.".format(f=feature_name))
            feature_data = string2token(feature_data)

        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    return hashList, list(featureList)
Example #17
def codex_read_hd5(file, featureList, hashType, session=None):
    '''
    Inputs:

    Outputs:

    Notes:

    '''
    cache = get_cache(session)

    hashList = []

    try:
        f = h5py.File(file, 'r')
    except BaseException:
        logging.warning("ERROR: codex_read_hd5 - cannot open file")
        return None

    if(featureList is None):
        featureList = list(traverse_datasets(file))

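    # Read each requested dataset, cast it to float (tokenizing strings), and cache it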
    for feature_name in featureList:
        try:
            feature_data = f[feature_name][:]
        except BaseException:
            logging.warning("Error: codex_read_hd5: Feature not found.")
            return

        try:
            feature_data = feature_data.astype(float)
        except BaseException:
            feature_data = string2token(feature_data)
            logging.info("Log: codex_read_hd5: Tokenized " + feature_name)

        feature_hash = cache.hashArray(feature_name, feature_data, hashType)
        hashList.append(feature_hash['hash'])

    f.close()
    return hashList, list(featureList)
Example #18
def test_downsample(capsys):

    ch = get_cache(DOCTEST_SESSION)
    array = np.random.rand(200)

    result = downsample(array, percentage=10, session=ch)
    assert len(result) == 20

    result = downsample(array, samples=50, session=ch)
    assert len(result) == 50

    # More samples than in array
    result = downsample(array, samples=250, session=ch)
    assert len(result) == 200

    ch.resetCacheList("downsample")
    result1 = downsample(array, samples=50, session=ch)
    result2 = downsample(array, samples=50, session=ch)
    assert np.array_equal(result1, result2) == True

    result3 = downsample(array, percentage=120, session=ch)

    result4 = downsample(array, session=ch)
Example #19
def download_code(msg, result, savePath):
    '''
    Inputs:

    Outputs:

    '''
    try:
        cache = get_cache(msg['sessionkey'])
        saveFile = os.path.join(savePath, "returned_code.py")
        cache.dump_code_to_file(saveFile)
        f = open(saveFile, "r")
        lines = f.readlines()
        outString = "".join(lines)
        outStringEncoded = outString.encode('ascii')
        result['code'] = str(
            base64.b64encode(outStringEncoded).decode('utf-8'))
        result['message'] = 'success'
        f.close()

    except:
        logging.warning(traceback.format_exc())

    return result
Example #20
def explain_this(inputHash,
                 featureNames,
                 dataSelections,
                 result,
                 session=None):
    '''
    Inputs:

    Outputs:

    Notes: Only works for binary classification.  Class 0 should be the main data; class 1 should be the isolated data to explain.

    '''
    ch = get_cache(session)

    startTime = time.time()
    result = {"WARNING": None}

    returnHash = ch.findHashArray("hash", inputHash, "feature")
    if returnHash is None:
        warning("ERROR: explain_this: Hash not found. Returning.")
        return None

    data = returnHash['data']
    if data is None:
        return None

    if data.ndim < 2:
        warning("ERROR: explain_this - insufficient data dimensions")
        return None

    X, y = create_data_from_indices(dataSelections, data)

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)

    result['tree_sweep'] = []

    samples_, features_ = X.shape

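    # Sweep decision trees of increasing depth; each depth contributes one
    # randomized-search-tuned tree to result['tree_sweep']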
    max_depth = 5
    for i in range(1, max_depth):
        #train and fit the model
        parameters = {
            'criterion': ['gini', 'entropy'],
            'splitter': ['best', 'random'],
            'min_samples_split': range(2, 10),
            'min_samples_leaf': range(1, 10),
            'max_features': range(1, features_)
        }

        clf = DecisionTreeClassifier(max_depth=i)

        random_search = RandomizedSearchCV(estimator=clf,
                                           param_distributions=parameters,
                                           n_iter=100,
                                           cv=3,
                                           verbose=2,
                                           random_state=42,
                                           n_jobs=-1)

        random_search.fit(X_train, y_train)

        best_tree = random_search.best_estimator_

        #generate the interpretation of the model
        dictionary = {}

        proportion_tree_sums = get_proportion_tree_sums(
            best_tree, X_test, y_test)

        feature_weights, feature_rank = zip(*sorted(
            zip(best_tree.feature_importances_, featureNames), reverse=True))

        json_tree = export_json_tree(best_tree, featureNames[::-1],
                                     ["Main Data", "Isolated Data"],
                                     proportion_tree_sums)
        #rotate the tree here
        rotated_tree = rotate_tree(json_tree)

        dictionary['json_tree'] = rotated_tree

        dictionary["score"] = np.round(best_tree.score(X_test, y_test) * 100)
        dictionary["max_features"] = best_tree.max_features_

        feature_weights = np.asarray(feature_weights).astype(float)
        dictionary["feature_rank"] = feature_rank
        dictionary["feature_weights"] = (np.round(feature_weights *
                                                  100)).tolist()

        result["tree_sweep"].append(dictionary)

    return result
Example #21
def label_swap(labels, dataHash, session=None):
    '''
    Inputs:

    Outputs:

    '''
    from api.sub.hash import get_cache
    cache = get_cache(session)

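    # Remap the incoming cluster labels so they line up with a previously saved
    # reference labeling for this dataHash, keeping cluster ids (and colors)
    # stable across repeated clustering runs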
    test_uniq_labels = np.unique(labels)
    num_labels = labels.size

    tmp = cache.findHashArray("name", dataHash, "label")
    if (tmp is None):
        return labels

    saved_labels = tmp["data"]
    ref_uniq_labels = np.unique(saved_labels)
    num_saved_labels = saved_labels.size

    # If reference labels are from DBSCAN, don't attempt to use them
    if (np.any(ref_uniq_labels[:] == -1)):
        return labels

    # If test labels are from DBSCAN, don't attempt to use them
    if (np.any(test_uniq_labels[:] == -1)):
        return labels

    # Do not attempt to remap label colors if the k values of the
    #    test and reference label sets have a delta larger than 5
    if (abs(ref_uniq_labels.size - test_uniq_labels.size) > 5):
        logging.info(
            "Difference between test and reference labels is too high, returning original labels"
        )
        return labels

    finalMap = {}
    for z in range(0, test_uniq_labels.size):
        finalMap[str(z)] = None

    l_saved_labels = saved_labels.tolist()
    l_labels = labels.tolist()
    used = []

    # For each k-label
    for x in range(0, test_uniq_labels.size):

        whileCount = 0
        whileMax = 15
        while (finalMap[str(x)] is None):

            if (x in ref_uniq_labels):

                # Find the first index of the list with that k-label
                #	Compute on shuffled list so first index is always different
                shuffle(l_saved_labels)
                ref_ind = l_saved_labels.index(x)

                # Get the label currently being used in new data at ref_ind
                # location
                cur_label = labels[ref_ind]

                # If the incoming label hasn't been used yet, use it. Else, try
                # again.
                if (finalMap[str(x)] is None):
                    if (cur_label not in used):
                        finalMap[str(x)] = cur_label
                        used.append(cur_label)

            # Exceeded reference label k-count. Move on and fill in later
            else:
                break

            # Non-convergence break for while loop
            if (whileCount > whileMax):
                break
            whileCount += 1

    # If test has more clusters than reference, some will not yet be filled (None)
    # 	Fill them in incrementally with k-labels which have not yet been used.
    for x in range(0, test_uniq_labels.size):
        if (finalMap[str(x)] is None):
            found = False
            for y in range(0, test_uniq_labels.size):
                if (y not in used and found == False):
                    finalMap[str(x)] = y
                    used.append(y)
                    found = True
            if (found == False):
                newMax = max(used)
                finalMap[str(x)] = newMax + 1
                used.append(newMax + 1)

    # Apply the "finalMap" translation dictionary on outgoing labels
    for j in range(0, labels.size):

        label = labels[j]
        newLabel = finalMap[str(label)]
        labels[j] = int(newLabel)

    return labels
Example #22
def find_more_like_this(inputHash,
                        featureList,
                        dataSelections,
                        similarityThreshold,
                        result,
                        session=None):

    ch = get_cache(session)

    startTime = time.time()
    result = {"WARNING": None}

    returnHash = ch.findHashArray("hash", inputHash, "feature")
    if returnHash is None:
        warning("ERROR: general_classifier: Hash not found. Returning.")
        return None

    data = returnHash['data']
    if data is None:
        return None

    #handles the data and training for the positive-unlabeled learning bagging classifier
    #get the formatted data
    dataSelectionsValues = list(dataSelections)
    #get a full mask of all data [0, 1, 0 ... 0, 0, 1] of length(num data)
    data_mask = np.zeros(np.shape(data)[0])
    for index in dataSelectionsValues:
        data_mask[index] = 1
    #get the data corresponding to all positive examples
    positive_data = []
    for index in dataSelectionsValues:
        positive_data.append(data[index])

    #train the classifier with the formatted data as the positive examples and
    #a random sample from the other data with replacement as the negatives
    #this is done several times and the models are bagged
    votes = np.zeros(np.shape(data)[0])

    #parameter search
    n_estimators = np.arange(3, 15)
    # Number of features to consider at every split
    max_features = ['auto', 'sqrt']
    # Maximum number of levels in tree
    max_depth = np.arange(2, 5)
    # Minimum number of samples required to split a node
    min_samples_split = np.arange(2, 10)
    # Minimum number of samples required at each leaf node
    min_samples_leaf = np.arange(1, 10)
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap
    }

    num_classifiers = 1
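    # num_classifiers controls the size of the bagged ensemble; with 1, the
    # loop below trains a single classifier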
    for i in range(num_classifiers):
        #each iteration train a classifier on the positive data
        #and a random subsample of the other data as negative examples
        #choose the same number of negative as positive examples you have
        negative_data_indices = np.random.choice(
            [index for index in range(len(data)) if data_mask[index] == 0],
            replace=True,
            size=len(positive_data))

        negative_data = [data[index] for index in negative_data_indices]

        X = positive_data + negative_data

        #create labels now
        Y = np.zeros(len(X))
        #mark the positive examples with label 1
        for i in range(len(positive_data)):
            Y[i] = 1

        rf = RandomForestClassifier()

        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=random_grid,
                                       n_iter=30,
                                       cv=3,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)

        rf_random.fit(X, Y)

        prediction = rf_random.predict(data)

        #make predictions and add them to votes
        votes += prediction

    votes = votes / num_classifiers

    #return all points labeled positive when the classifiers' averaged vote
    #exceeds the similarity threshold
    like_this_indices = [
        index for index, value in enumerate(votes)
        if value > similarityThreshold
    ]
    #also include all of the positive inputs in the like_this_indices array

    result["like_this"] = like_this_indices

    return result
Example #23
    def on_message(self, message):

        global fileChunks

        msg = json.loads(message)
        result = {}

        filename = msg["filename"]
        filepath = os.path.join(CODEX_ROOT, "uploads", filename)

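        # Chunked upload: the client streams base64-encoded chunks; the final
        # message sets done=True, which flushes the chunks to disk and imports
        # the file into the session cache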
        if (msg["done"] == True):

            stop_cache_server()
            codex_hash_server = make_cache_process()
            codex_hash_server.start()

            logging.info('Finished file transfer, initiating save...')
            cache = get_cache(msg['sessionkey'], timeout=None)

            f = open(filepath, 'wb')
            for chunk in fileChunks:
                f.write(chunk)
            f.close()
            fileChunks = []

            fileExtension = filename.split(".")[-1]
            if (fileExtension == "csv"):
                hashList, featureList = cache.import_csv(filepath)

            elif (fileExtension == "h5"):
                hashList, featureList = cache.import_hd5(filepath)

            elif (fileExtension == "npy"):
                hashList, featureList = cache.import_npy(filepath)

            else:
                result['message'] = "Currently unsupported filetype"
                stringMsg = json.dumps(result)
                self.write_message(stringMsg)
                return

            sentinel_values = cache.getSentinelValues(featureList)
            nan = sentinel_values["nan"]
            inf = sentinel_values["inf"]
            ninf = sentinel_values["ninf"]

        else:
            contents = base64.decodebytes(str.encode(msg["chunk"]))
            fileChunks.append(contents)

        if msg['done']:
            result['status'] = 'complete'
            result['feature_names'] = featureList
            result["nan"] = nan
            result["inf"] = inf
            result["ninf"] = ninf
            logging.info('Finished file save.')
        else:
            result['status'] = 'streaming'

        stringMsg = json.dumps(result)
        self.write_message(stringMsg)
Example #24
def algorithm_call(msg, result):
    '''
    Inputs:

    Outputs:

    '''
    try:

        ch = get_cache(msg['sessionkey'])

        parms = msg['parameters']
        downsampled = msg["downsampled"]
        algorithmName = msg['algorithmName']
        algorithmType = msg["algorithmType"]

        featureList = msg["dataFeatures"]
        featureList = get_featureList(featureList)

        subsetHashName = msg["dataSelections"]
        if (subsetHashName != []):
            subsetHashName = subsetHashName[0]
        else:
            subsetHashName = False

        try:
            labelName = msg["labelName"]
            labelHash = ch.findHashArray("name", labelName, "feature")['hash']
        except:
            labelHash = None

        cross_val = msg.get("cross_val")
        search_type = msg.get("search_type", 'direct')
        scoring = msg.get("scoring")
        activeLabels = msg.get("activeLabels")


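        # Merge the selected features into a single matrix and cache it as the
        # input array for the requested algorithm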
        hashList = ch.feature2hashList(featureList)

        data = ch.mergeHashResults(hashList)
        inputHash = ch.hashArray('Merged', data, "feature")

        if (inputHash is not None):
            inputHash = inputHash["hash"]

        if (downsampled is not False):
            downsampled = int(downsampled)

        if (algorithmType == "clustering"):
            pca = dimension_reduction(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, "PCA", downsampled, {"n_components":2}, scoring, search_type, cross_val, result, ch).run()
            result =       clustering(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()
            result['data'] = pca['data']

        elif (algorithmType == "dimensionality_reduction"):
            result = dimension_reduction(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        elif (algorithmType == "normalize"):
            result = normalize(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        elif (algorithmType == "peak_detect"):
            result = peak_detection(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        elif (algorithmType == "regression"):
            result = regression(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        elif (algorithmType == "template_scan"):
            result = template_scan(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        elif (algorithmType == "correlation"):
            result = correlation(inputHash, activeLabels, featureList, hashList, labelHash, subsetHashName, algorithmName, downsampled, parms, scoring, search_type, cross_val, result, ch).run()

        else:
            result['message'] = "Cannot parse algorithmType"


    except:
        logging.warning(traceback.format_exc())

    return result
Example #25
def test_codex_server_memory_check(capsys):

    ch = get_cache(DOCTEST_SESSION)
    codex_server_memory_check(session=ch)
Example #26
def downsample(inputArray,
               samples=0,
               percentage=0.0,
               session=None,
               algorithm="simple"):
    '''
    Inputs:
        inputArray  - numpy array       - array to be downsampled
        samples     - int (optional)    - number of samples requested in output array
        percentage  - float (optional)  - percentage of the input points to keep (0-100)

    Outputs:
        outputArray - numpy array - resulting downsampled array

    Notes:
        If one wishes to do a percentage, do the
        percentage to samples calculation in the calling function
    '''
    cache = get_cache(session)

    # first, create a hash of the input array, don't save
    inputHash = cache.hashArray("NOSAVE", inputArray, "NOSAVE")
    inputHashCode = inputHash["hash"]
    inputArray = impute(
        inputArray
    )  # TODO - mblib spanning seems to have problems with NaNs.  Impute until fixed.
    totalPoints = inputArray.shape[0]

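    # When both samples and percentage are given, samples takes precedence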
    # if number of samples is provided, use
    if (samples > 0):
        usedSamples = samples

    elif (percentage != 0):
        if (percentage <= 100 and percentage >= 0):
            usedSamples = int(float(percentage / 100) * totalPoints)
        else:
            logging.warning(
                "ERROR: downsample - percentage out of bounds 0-100")
            usedSamples = totalPoints

    else:
        logging.warning("ERROR: downsample - samples and percentage both 0.")
        usedSamples = totalPoints

    # first, check if this downsampling has already been done before
    existingHashCheck = cache.findHashArray("name", inputHashCode,
                                            "downsample")

    # Reuse the cached result if this array was already downsampled
    #   to the same number of samples; otherwise resample below.
    if (existingHashCheck is not None
            and existingHashCheck["samples"] == usedSamples):
        outputArray = existingHashCheck["data"]

    else:

        try:

            if algorithm == "simple":

                outputArray = inputArray[np.random.choice(inputArray.shape[0],
                                                          usedSamples,
                                                          replace=False)]

            elif algorithm == "spanning":

                if inputArray.ndim == 1:
                    inputList = [inputArray.tolist()]
                else:
                    inputList = inputArray.T.tolist()
                mask_, array_ = mask_spanning_subset(inputList, usedSamples)
                outputArray = inputArray[mask_]

            else:
                logging.warning(
                    "Unknown downsampling algorithm: {algorithm}".format(
                        algorithm=algorithm))
                outputArray = inputArray

        except BaseException:
            logging.warning(
                "downsample - failed to downsample.\n\n{trace}".format(
                    trace=traceback.format_exc()))
            outputArray = inputArray

    # Hash the downsampled output, using the hash of the input in place of the name.
    #	Later look up using this, w.r.t origin data
    outputHash = cache.hashArray(inputHashCode, outputArray, "downsample")
    return outputArray
Example #27
def test_get_data_metrics(capsys, testData):

    cache = get_cache(DOCTEST_SESSION)

    message = {'routine': 'arrange', 'hashType': 'feature', 'activity': 'metrics', 'name': ['TiO2'], 'cid': '8vrjn', 'sessionkey': DOCTEST_SESSION}
    result = get_data_metrics(message, {})
Example #28
def export_contents(msg, result, savePath):
    '''
    Inputs:

    Outputs:

    '''
    try:

        if (msg["type"] == "code"):
            cache = get_cache(msg['sessionkey'])
            saveFile = os.path.join(savePath, "returned_code.py")
            cache.dump_code_to_file(saveFile)
            f = open(saveFile, "r")
            lines = f.readlines()
            outString = "".join(lines)
            outStringEncoded = outString.encode('ascii')
            result['data'] = str(
                base64.b64encode(outStringEncoded).decode('utf-8'))
            result['filename'] = 'codex_code.py'
            result['message'] = 'success'
            result['content'] = True
            f.close()

        elif (msg["type"] == "features"):
            cache = get_cache(msg['sessionkey'])
            data = cache.return_data()
            names = []
            features = []
            for item in data['features']:
                names.append(item['name'])
                features.append(item['data'])
            if features:
                features = np.column_stack(features)
                header = ",".join(names)
                saveFile = os.path.join(savePath, "features.csv")
                np.savetxt(saveFile, features, delimiter=',', header=header)
                f = open(saveFile, "r")
                lines = f.readlines()
                outString = "".join(lines)
                outStringEncoded = outString.encode('ascii')
                result['data'] = str(
                    base64.b64encode(outStringEncoded).decode('utf-8'))
                result['filename'] = 'codex_features.csv'
                result['content'] = True
                f.close()
            else:
                result['content'] = False
            result['message'] = 'success'

        elif (msg["type"] == "selections"):
            cache = get_cache(msg['sessionkey'])
            data = cache.return_data()
            names = []
            selections = []
            for item in data['subsets']:
                names.append(item['name'])
                selections.append(item['data'])
            if selections:
                selections = np.column_stack(selections)
                header = ",".join(names)
                saveFile = os.path.join(savePath, "selections.csv")
                np.savetxt(saveFile, selections, delimiter=',', header=header)
                f = open(saveFile, "r")
                lines = f.readlines()
                outString = "".join(lines)
                outStringEncoded = outString.encode('ascii')
                result['data'] = str(
                    base64.b64encode(outStringEncoded).decode('utf-8'))
                result['filename'] = 'codex_selections.csv'
                result['content'] = True
                f.close()
            else:
                result['content'] = False
            result['message'] = 'success'
        else:
            result[
                'WARNING'] = 'export type not supported. code|features|selections are supported.'
            result['message'] = 'failure'

    except:
        logging.warning(traceback.format_exc())

    return result
Example #29
    def run(self):

        self.cache = get_cache(self.session)

        startTime = time.time()
        self.result = {
            'algorithm': self.algorithmName,
            'downsample': self.downsampled,
            'WARNING': None
        }

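        # Look up the input feature matrix for this run in the session cache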
        returnHash = self.cache.findHashArray("hash", self.inputHash,
                                              "feature")
        if returnHash is None:
            logging.warning("Input hash not found: {inputHash}".format(
                inputHash=self.inputHash))
            self.result[
                "WARNING"] = "Input hash not found: {inputHash}".format(
                    inputHash=self.inputHash)
            self.result['message'] = "failure"
            return self.result

        self.X = returnHash['data']
        if self.X is None:
            self.result['message'] = "failure"
            logging.warning("X returned None")
            return self.result

        ret = self.check_valid()
        if not ret:
            self.result['message'] = "failure"
            return self.result

        if self.X.ndim == 1:
            full_samples = self.X.shape[0]
            full_features = 1
        else:
            full_samples, full_features = self.X.shape

        self.result['eta'] = getComputeTimeEstimate(self.__class__.__name__,
                                                    self.algorithmName,
                                                    full_samples,
                                                    full_features)

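        # Restrict the data to the active selection when a subset mask was provided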
        if self.subsetHashName is not False:
            self.X = self.cache.applySubsetMask(self.X, self.subsetHashName)
            if (self.X is None):
                logging.warning("Subset hash not found: {subsetHash}".format(
                    subsetHash=self.subsetHashName))
                self.result['message'] = "failure"
                return self.result

        if self.downsampled is not False:
            self.X = downsample(self.X,
                                samples=self.downsampled,
                                session=self.cache,
                                algorithm='simple')
            logging.info(
                "Downsampled to {samples} samples".format(samples=len(self.X)))

        # TODO - labels are currently cached under features
        if self.labelHash:
            labelHash_dict = self.cache.findHashArray("hash", self.labelHash,
                                                      "feature")
            if labelHash_dict is None:
                logging.warning("Label hash not found: {labelHash}".format(
                    labelHash=self.labelHash))
                self.result['message'] = "failure"
                return self.result
            else:
                self.y = labelHash_dict['data']
                self.result['y'] = self.y.tolist()

        if self.X.ndim == 1:
            computed_samples = self.X.shape[0]
            computed_features = 1
        else:
            computed_samples, computed_features = self.X.shape

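        # Impute missing values before fitting; the prepared matrix is returned
        # to the caller in result['data']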
        self.X = impute(self.X)
        self.result['data'] = self.X.tolist()

        self.algorithm = self.get_algorithm()
        if self.algorithm is None:
            self.result['message'] = "failure"
            self.result['WARNING'] = "{alg} algorithm not supported".format(
                alg=self.algorithmName)
            return self.result

        self.fit_algorithm()

        # TODO - The front end should specify a save name for the model
        model_name = self.algorithmName + "_" + str(random.random())
        if self.search_type == 'direct':
            model_dict = self.cache.saveModel(model_name, self.algorithm,
                                              "regressor")
        else:
            model_dict = self.cache.saveModel(model_name,
                                              self.algorithm.best_estimator_,
                                              "regressor")
        if not model_dict:
            self.result['WARNING'] = "Model could not be saved."
        else:
            self.result['model_name'] = model_dict['name']
            self.result['model_hash'] = model_dict['hash']

        endTime = time.time()
        computeTime = endTime - startTime
        logTime(self.__class__.__name__, self.algorithmName, computeTime,
                computed_samples, computed_features)

        self.result['message'] = "success"
        return self.result
Example #30
def test_add_data(capsys, testData):

    cache = get_cache(DOCTEST_SESSION)

    message = {'routine': 'arrange', 'hashType': 'feature', 'activity': 'add', 'name': 'TiO2', 'data': [1, 2, 3, 4], 'sessionkey': DOCTEST_SESSION}
    results = add_data(message, {})