Example 1
def main():
    # 1) TWO BODIES (TBs)
    trainTBs = twoBBdf(path=train_path, dict=TB_dict)
    testTBs = twoBBdf(path=test_path, dict=TB_dict)
    trainpromisingTBs, testpromisingTBs = firstStage(train_TBs=trainTBs,
                                                     test_TBs=testTBs,
                                                     threshold=0.16,
                                                     random_seed=42)

    # 2) EXTRA TRACKS (ETs)
    trainETs = twoBBdf(path=train_path,
                       dict=ET_dict,
                       specific_TBs=trainpromisingTBs.index)
    testETs = twoBBdf(path=test_path,
                      dict=ET_dict,
                      specific_TBs=testpromisingTBs.index)
    trainpromisingETs, testpromisingETs = secondStage(train_ETs=trainETs,
                                                      test_ETs=testETs,
                                                      threshold=0.5,
                                                      random_seed=42)
    trainETs.specific_ETs = trainpromisingETs.index
    testETs.specific_ETs = testpromisingETs.index

    # 2.5) Combine TBs and ETs and apply LOF calculation
    trainTAG_df = combine(TB_COM_df=LOF(trainTBs), ET_COM_df=LOF(trainETs))
    testTAG_df = combine(TB_COM_df=LOF(testTBs), ET_COM_df=LOF(testETs))
    trainTAG_df.to_csv('trainTAG_df.csv')
    testTAG_df.to_csv('testTAG_df.csv')

    # 3) LOF, combine TBs+ETs and then feed into tagger
    TAGs = thirdStage(train_TAG_df=trainTAG_df,
                      test_TAG_df=testTAG_df,
                      train_TB_scores=trainpromisingTBs,
                      test_TB_scores=testpromisingTBs,
                      train_path=train_path,
                      test_path=test_path,
                      random_seed=42)
    return TAGs
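A minimal entry-point sketch for running the pipeline above (assuming train_path, test_path, TB_dict, and ET_dict are defined at module level, as the code implies):

if __name__ == '__main__':
    TAGs = main()  # runs stages 1-3 and returns the final tagger output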
Example 2
def lof_calculation():
    # try/except statement for error handling
    try:
        # start calculating, as no previous responses are cached to the filesystem
        # checking parameter existence before calculation
        # getting x axis, used for plotting only, not in the calculation
        if responsee["x"]:
            x = responsee["x"]
            # getting y axis, used for plotting only, not in the calculation
            if responsee["y"]:
                y = responsee["y"]
                print(x)
                print(y)
                """Retrieving Features Types"""

                if responsee["result"]["fields"]:
                    fields = responsee["result"]["fields"]
                    # logging feature types map
                    logger.debug('fields : ' + json.dumps(fields))

                    # initializing global variables to hold features by data type
                    # categorical datatype features list
                    global categorical_features_list
                    categorical_features_list = []

                    # numeric datatype features list
                    global numaric_features_list
                    numaric_features_list = []

                    # integer datatype features list
                    global int_features_list
                    int_features_list = []

                    # consuming provided features, or auto-detecting them if not provided
                    # check if the variable exists
                    if "analysisFeatures" in responsee:
                        features = responsee["analysisFeatures"]

                        # consuming provided features, or auto-detecting them if not provided
                        # check if it contains values
                        if len(features) != 0:

                            for feature in features:
                                if feature:
                                    print "feature is:  " + feature
                                    for field in fields:
                                        if field["id"] == feature:
                                            if field["type"] == "text":
                                                categorical_features_list.append(
                                                    field["id"])
                                            if field["type"] == "int4":
                                                # excluding ID from features
                                                if field["id"] != "_id":
                                                    int_features_list.append(
                                                        field["id"])
                                            if field["type"] == "numeric":
                                                numaric_features_list.append(
                                                    field["id"])

                        # auto-detecting features since none were provided
                        else:
                            automatic_detection_of_features(fields)

                    # auto-detecting features since none were provided
                    else:
                        automatic_detection_of_features(fields)

                    # logging information on the detected features

                    logger.debug('features considered for LOF analysis are: ')
                    logger.debug(categorical_features_list)
                    logger.debug(numaric_features_list)
                    logger.debug(int_features_list)
                    """Creating global pre-Analysis (with _ids) and Pre- Response
                    (this will keep track of original data) Data Set """
                    # initializing global variables
                    global analysis_ready_list
                    analysis_ready_list = []
                    global response_ready_full_list
                    response_ready_full_list = []

                    # populating the global variables
                    """DATA VALIDATION, TRANSFORMATION, AND DUPLICATE/NULL HANDLING STAGES"""
                    for rec in records:
                        analysis_ready_list.append((rec["_id"], ))
                        response_ready_full_list.append((rec["_id"], ))

                    if len(categorical_features_list) > 0:
                        textual_to_numarical_categorical(
                            records, categorical_features_list)

                    if len(int_features_list) > 0:
                        numeric_to_float(records, int_features_list)

                    if len(numaric_features_list) > 0:
                        numeric_to_float(records, numaric_features_list)

                    # logging information for the populated lists
                    logger.debug('analysis_ready_list : ' +
                                 json.dumps(analysis_ready_list))
                    logger.debug('response_ready_full_list : ' +
                                 json.dumps(response_ready_full_list))
                    """real_analysis_ready_data after removing _id as ID IS NOT USED FOR ANALYSIS"""
                    # the REMOVING IDS STAGE TO CREATE ANALYSIS DATASET
                    real_analysis_ready_data = [
                        tuple(cols[1:]) for cols in analysis_ready_list
                    ]
                    # logging information for data going to analysis
                    logger.debug('analysis_ready_list : ' +
                                 json.dumps(analysis_ready_list))
                    logger.debug('real_analysis_ready_data: ' +
                                 json.dumps(real_analysis_ready_data))
                    """preparing LOF Model"""
                    # passing data to LOF model
                    lof = LOF(real_analysis_ready_data)

                    # preparing the LOF anomaly response list
                    response_results_list = []
                    """Using LOF MODEL"""
                    # looping over values (tuples of readings) to get each one's anomaly score against the model
                    i = 0
                    for instance in real_analysis_ready_data:
                        # 10 is the number of neighbours considered for the calculation
                        value = lof.local_outlier_factor(10, instance)
                        """
                        # Sending only outliers
                        # if(value>1):
                        """
                        # tagging readings as normal, local, or global based on their anomaly scores
                        if i < len(records):
                            if value <= 1:
                                response_results_list.append(
                                    (value, records[i][x], records[i][y],
                                     "normal"))
                            elif value <= 2:
                                response_results_list.append(
                                    (value, records[i][x], records[i][y],
                                     "local"))
                            else:
                                response_results_list.append(
                                    (value, records[i][x], records[i][y],
                                     "global"))
                            i = i + 1

                    # initializing and preparing response
                    response_data = {}
                    response_data['success'] = True
                    response_data['result'] = []

                    # adding anomaly results messages
                    for record in response_results_list:
                        response_data['result'].append(record)

                    # adding service messages
                    response_data['messages'] = messages

                    # casting response map to json
                    json_response = json.dumps(response_data)
                    # logging information for the service client response
                    logger.debug('response : ' + json.dumps(response_data))
                    # storing the new response to the filesystem
                    store_responses_to_file_system("lof", json_response)

    # in case an exception was raised
    except Exception as e:
        # logging error
        logger.error(e)
        # adding error to response message
        json_response = error_resonse(traceback.format_exc())
        # printing full error trace
        traceback.print_exc()
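For reference, a minimal self-contained sketch of the LOF API these examples rely on (LOF(instances) plus local_outlier_factor(k, instance), matching the calls above); the sample points below are made up for illustration:

from lof import LOF  # assumption: the pure-Python lof module providing this API

# made-up 2-D readings; the last tuple sits far from the cluster
points = [(1.0, 1.1), (0.9, 1.0), (1.1, 0.9), (1.0, 0.8), (8.0, 8.0)]
lof = LOF(points)

for point in points:
    score = lof.local_outlier_factor(3, point)  # 3 neighbours for this tiny sample
    # same tagging scheme as lof_calculation(): <=1 normal, <=2 local, >2 global
    tag = "normal" if score <= 1 else "local" if score <= 2 else "global"
    print(point, round(score, 3), tag)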
Example 3
        value = (k - len(rc))**2
        if value < min_value:
            min_value = value
            k_min = k

    #k_new = int((k_min + len(rc)) / 2)
    k_new = len(rc)
    if k_new not in kList:
        kList.append(k_new)
    kList.sort()
    # print("New klist is \n",kList)


while True:
    print("New instances is \n", instances)
    lof = LOF(instances)
    M = [[] for i in range(len(instances))]
    pre_c1 = []
    kinf = []

    for i in range(len(instances)):
        pre_c1.append(0)
        kinf.append(0)
    for k in kList:
        if k > len(instances):
            kList = kList[0:kList.index(k)]
            break
    if len(instances) == 1 and len(kList) == 0:
        kList.append(1)
    print("kList is", kList)
Example 4
import numpy as np
import matplotlib.pyplot as plt

# Generate some outliers
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.r_[X_inliers, X_outliers]

n_outliers = len(X_outliers)
ground_truth = np.ones(len(X), dtype=int)
ground_truth[-n_outliers:] = -1


y = np.zeros(200, dtype=int)
y_outlier = np.ones(20, dtype=int)
y = np.append(y, y_outlier)

# use my class
lof = LOF()
coef = lof.fit_predict(X)
coef = (coef - coef.min()) / (coef.max() - coef.min())
#print(coef)

from PyDBOD.base import Base
'''
probedata = clf.fit_predict(data)
print(clf.threshold_)
'''
color = np.array(['k','b'])


plt.title("Local Outlier Factor (LOF)")
plt.scatter(X[:, 0], X[:, 1], color=color[y], s=3., label='Data points')
# plot circles with radius proportional to the outlier scores
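# A possible continuation (not from the original snippet): circles whose radius
# scales with the normalized LOF score in coef, as the comment above suggests.
plt.scatter(X[:, 0], X[:, 1], s=1000 * coef, edgecolors='r',
            facecolors='none', label='Outlier scores')
plt.legend()
plt.show()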