Example #1
def calc_python_spark(ask_parm, crowd_id):
    D = ask_parm['D_F']
    F = ask_parm['F_F']
    F_Group = ask_parm['F_Group']
    option_dict, option_list = utils.option_dict_list(F)
    factor_list = list(option_list)  # factor keys only
    option_list.append(D)            # also collect the dependent variable's values

    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))

    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")

    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1], option_list))

    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT  codeID, organID, data_json, examTime, rowKey, examID FROM base_standard")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID, crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]

    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(stdDataFrame.data_json)
    rsDataFrameGArrs = {}

    # Build one value list per collected key; the original re-created each
    # list inside the row loop, which kept only the last row's value.
    for colKey in option_list:
        rsDataFrameGArrs[colKey] = []
    for row in rsDataFrame.collect():
        data_json = row.data_json
        for colKey in option_list:
            rsDataFrameGArrs[colKey].append(data_json.get(colKey))

    frame_Arrs = []
    for colKey in factor_list:
        group_list = F_Group[colKey]
        frame_Arrs.append(get_df_group(colKey, group_list, rsDataFrameGArrs))

    formula_list = []
    frame = DataFrame()
    frame['D'] = rsDataFrameGArrs[D]  # dependent-variable column (the original referenced an undefined G1)

    for i, key in enumerate(factor_list, start=1):
        formula_list.append('F' + str(i))
        frame['F' + str(i)] = frame_Arrs[i - 1]

    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula
    anova_results = anova_lm(ols(formula, frame).fit())


    result = {}
    import json
    result['1'] = anova_results.to_json()

    print "anova_results", anova_results
    sc.stop()
    return json.dumps(result, ensure_ascii=False)
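Every example in this listing funnels into the same statsmodels call: assemble a pandas frame with a dependent column D and factor columns F1..Fn, build a formula string, fit an OLS model, and run anova_lm on the fit. A minimal self-contained sketch of that pattern, with synthetic data standing in for the Spark pipeline's output:

import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Synthetic stand-in for the frame the Spark pipeline produces.
frame = pd.DataFrame({
    'D':  [5.1, 6.3, 5.8, 7.2, 6.9, 7.5],
    'F1': [1, 1, 1, 2, 2, 2],
    'F2': [1, 2, 1, 2, 1, 2],
})

# Numeric factors are treated as continuous regressors; wrap them in
# C(...) in the formula to force categorical (classic ANOVA) treatment.
formula = "D~ " + "+".join(['F1', 'F2'])
anova_results = anova_lm(ols(formula, frame).fit())
print anova_results.to_json()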
Example #2
def calc_python_spark(ask_parm, crowd_id):
    group1 = ask_parm["group1"]
    group2 = ask_parm["group2"]

    print 'group1', group1, ', group2', group2

    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict
    option_list.append("390")
    group1["parmId"] = "390"
    group2["parmId"] = "390"
    sc = Tsparkcore.createSparkContext("crowd_Pair_test")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))

    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")

    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row(kv[1], group1, option_list))

    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, examID, examTime, data_json, group_Type FROM base_standard").filter("group_Type=true")
    # stdDataFrame.show()

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID, crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]

    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(crowdDataFrame.codeID, crowdDataFrame.organID,
                                                                          crowdDataFrame.examID, stdDataFrame.examTime,
                                                                          stdDataFrame.data_json)

    rsDataFrameG1 = rsDataFrame.filter(stdDataFrame.data_json["390"] > 17).orderBy(crowdDataFrame.codeID,
                                                                                   stdDataFrame.examTime)

    rsDataFrameG2 = rsDataFrame.filter(stdDataFrame.data_json["390"] < 17).orderBy(crowdDataFrame.codeID,
                                                                                   stdDataFrame.examTime)

    rsDataFrameG1RDD = rsDataFrameG1.map(lambda p: p.data_json)
    rsDataFrameG2RDD = rsDataFrameG2.map(lambda p: p.data_json)

    rsDataFrameG1Arrs = get_key_arrs(rsDataFrameG1RDD, option_list)
    rsDataFrameG2Arrs = get_key_arrs(rsDataFrameG2RDD, option_list)
    print "rsDataFrameG1Arrs ", rsDataFrameG1Arrs
    print "rsDataFrameG2Arrs ", rsDataFrameG2Arrs

    result = {}
    for colKey in option_list:
        if colKey is "390":
            break
        name = option_dict[colKey]
        print "name =>", name
        result[name] = get_result_arrs(rsDataFrameG1Arrs[colKey], rsDataFrameG2Arrs[colKey])

    print result
    sc.stop()
    return json.dumps(result, ensure_ascii=False)
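The two helpers used above, get_key_arrs and get_result_arrs, are not part of this listing. A plausible sketch of both, assuming the RDD yields plain dicts (how Spark 1.x returns a MapType column) and that "crowd_Pair_test" means a paired-sample t-test; both the implementations and the result shape are assumptions:

from scipy import stats

def get_key_arrs(rdd, option_list):
    # Collect the data_json maps and build one value list per option key.
    rows = rdd.collect()
    return dict((key, [r.get(key) for r in rows]) for key in option_list)

def get_result_arrs(arr1, arr2):
    # Pair the two groups positionally, dropping pairs with a missing side,
    # and run a paired-sample t-test on what remains.
    pairs = [(a, b) for a, b in zip(arr1, arr2)
             if a is not None and b is not None]
    xs = [float(a) for a, _ in pairs]
    ys = [float(b) for _, b in pairs]
    t_stat, p_value = stats.ttest_rel(xs, ys)
    return {'t': t_stat, 'p': p_value, 'n': len(pairs)}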
Example #3
def calc_python_spark(ask_parm, crowd_id):
    ask_parm['D_F'] = ask_parm['D_F'].split(":")[0]
    option_dict, option_list = utils.option_dict_list(ask_parm['F_F'])
    arrs = ask_parm['F_F'].split(",")
    list = []
    for key in arrs:
        list.append(key.split(":")[0])
    ask_parm['F_F'] =','.join(list)
    D = ask_parm['D_F']
    F = ask_parm['F_F']

    F_Arrs = F.split(',')

    print "option_dict", option_dict
    option_list.append(D)
    arrs = F.split(",")
    for arr in arrs:
        option_list.append(arr)

    # Hard-coded test groups: group1 and group2 are identical here, and
    # group2 fills both remaining slots of group_all["390"] below.
    group1 = {}
    group1['parmId'] = 390
    group1['parmVal'] = "12-14"
    group1['parmType'] = 1

    group2 = {}
    group2['parmId'] = 390
    group2['parmVal'] = "12-14"
    group2['parmType'] = 1

    group_all = {}
    group_all["390"] = [group1, group2, group2]


    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))

    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")

    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1], option_list))

    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT  codeID, organID, data_json, examTime, rowKey, examID FROM base_standard")

    cond = [crowdDataFrame.codeID == stdDataFrame.codeID, crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]

    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(crowdDataFrame.codeID, crowdDataFrame.organID,
                                                                          crowdDataFrame.examID, stdDataFrame.examTime,
                                                                          stdDataFrame.data_json)

    rsDataFramePD= None
    # for key in group_all :
    #     groups = group_all[key]
    #     list = []
    #     k=0
    #
    #     for group in groups:
    #         rsDataFrameG = get_df_group(group, rsDataFrame)
    #         if rsDataFrameG is  None :
    #             continue
    #
    #         rsDataFrameG = rsDataFrameG.withColumn(key + "_1", rsDataFrameG.data_json["390"]*0 + k)
    #         if rsDataFramePD is None :
    #             rsDataFramePD = rsDataFrameG
    #         else :
    #             rsDataFramePD = rsDataFramePD.unionAll(rsDataFrameG)
    #
    #         print "k=", k
    #         k = k + 1
    #
    #     print "rsDataFramePD==》", rsDataFramePD

            # list.append(rsDataFrameG)

        # get_mult_group(list, key)



    formula_list = []
    rsDataFrameG1 = get_df_group(group_all["390"][0], rsDataFrame)
    rsDataFrameG2 = get_df_group(group_all["390"][1], rsDataFrame)
    rsDataFrameG3 = get_df_group(group_all["390"][2], rsDataFrame)
    frame = DataFrame()
    G1 = rsDataFrameG1.select(rsDataFrameG1.data_json["390"]).toPandas()
    G2 = rsDataFrameG2.select(rsDataFrameG2.data_json["390"]).toPandas()
    G3 = rsDataFrameG3.select(rsDataFrameG3.data_json["390"]).toPandas()  # was rsDataFrameG2's column


    frame['D'] = G1
    frame_Arrs = []
    frame_Arrs.append(G2)
    frame_Arrs.append(G3)
    for i, key in enumerate(F_Arrs, start=1):
        formula_list.append('F' + str(i))
        frame['F' + str(i)] = frame_Arrs[i - 1]

    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula
    anova_results = anova_lm(ols(formula, frame).fit())


    result = {}
    import json
    result['1'] = anova_results.to_json()

    print "anova_results", anova_results
    sc.stop()
    return json.dumps(result, ensure_ascii=False)
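get_df_group is called throughout with a group dict of parmId/parmVal/parmType but never defined in this listing (Example #1 even uses a three-argument variant of the same name). A hypothetical reconstruction of the two-argument form, assuming parmType 1 encodes an inclusive "low-high" range over data_json[parmId]; the parmType semantics and the empty-result handling are guesses:

def get_df_group(group, df):
    # Hypothetical: filter df on the data_json entry named by parmId.
    parm_id = str(group['parmId'])
    if group['parmType'] == 1:
        # parmType 1: "low-high" range, e.g. "12-14" (assumed encoding).
        low, high = group['parmVal'].split('-')
        out = df.filter(df.data_json[parm_id] >= float(low)) \
                .filter(df.data_json[parm_id] <= float(high))
    else:
        # Otherwise treat parmVal as an exact match (also an assumption).
        out = df.filter(df.data_json[parm_id] == group['parmVal'])
    # Example #3 checks the return value against None, so map an empty
    # result to None here.
    return out if out.count() > 0 else None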
Example #4
def calc_python_spark(ask_parm, crowd_id):
    ask_parm['D'] = '9'
    ask_parm['F'] = '390,9'
    ask_parm['A'] = '390,9'
    ask_parm['C'] = '390,9'
    ask_parm['WLS'] = '390,9'

    D = ask_parm['D']
    F_Arrs = ask_parm['F'].split(',')
    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict

    sc = Tsparkcore.createSparkContext("crowd_ANOVA_TWO_WAY")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd,
                                       "EXAM_LIST",
                                       crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(
        lambda kv: Tsparkcore.map_crowd_Row(kv[1]))

    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")

    output_data_rdd = data_rdd.map(
        lambda kv: Tsparkcore.map_std_Row_Two_Way_anova(kv[1], option_list))

    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT  codeID, organID, data_json, examTime, rowKey, examID, has_Type FROM base_standard"
    ).filter('has_Type=True')

    # print stdDataFrame.show()

    cond = [
        crowdDataFrame.codeID == stdDataFrame.codeID,
        crowdDataFrame.organID == stdDataFrame.organID,
        crowdDataFrame.examID == stdDataFrame.examID
    ]

    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(
        crowdDataFrame.codeID, crowdDataFrame.organID, crowdDataFrame.examID,
        stdDataFrame.examTime, stdDataFrame.data_json)
    # Alias the dependent variable and each factor straight out of the
    # data_json map (avoid shadowing the builtin "list").
    select_cols = [rsDataFrame.data_json[D].alias('D')]
    formula_list = []

    for i, key in enumerate(F_Arrs, start=1):
        formula_list.append('F' + str(i))
        select_cols.append(rsDataFrame.data_json[key].alias('F' + str(i)))
    df_D = rsDataFrame.select(select_cols).toPandas()

    print 'df_D', df_D

    formula = "D~ " + "+".join(formula_list)
    print 'formula', formula
    anova_results = anova_lm(ols(formula, df_D).fit())
    print anova_results

    result = {}

    import json
    result['1'] = anova_results.to_json()  # the raw DataFrame is not JSON-serialisable

    sc.stop()
    return json.dumps(result, ensure_ascii=False)
Example #5
def calc_python_spark(ask_parm, crowd_id):
    group1 = ask_parm["group1"]
    group2 = ask_parm["group2"]

    print 'group1', group1, ', group2', group2

    print "parmId", type(group1['parmId'])
    print "parmVal", type(group1['parmVal'])
    print "parmType", type(group1['parmType'])

    print "parmId", type(group2['parmId'])
    print "parmVal", type(group2['parmVal'])
    print "parmType", type(group2['parmType'])



    option_dict, option_list = utils.option_dict_list(ask_parm['optionList'])
    print "option_dict", option_dict

    sc = Tsparkcore.createSparkContext("crowd_Pair_test")
    sqlctx = SQLContext(sc)

    hbase_rdd = Tsparkcore.createSparkRDD(sc, HBASE_CROWD)
    hbase_rdd = Tsparkcore.calHbaseRDD(hbase_rdd, "EXAM_LIST", crowdID=crowd_id)
    output_hbase_rdd = hbase_rdd.map(lambda kv: Tsparkcore.map_crowd_Row(kv[1]))

    schemaCrowd = sqlctx.createDataFrame(output_hbase_rdd, samplingRatio=0.2)
    schemaCrowd.registerTempTable("statics_crowd_details")
    crowdDataFrame = sqlctx.sql("SELECT codeID, organID, examID FROM statics_crowd_details")

    data_rdd = Tsparkcore.createSparkRDD(sc, HBASE_STANDARD)
    data_rdd = Tsparkcore.calHbaseRDD(data_rdd, "EXAM_JSON")

    output_data_rdd = data_rdd.map(lambda kv: Tsparkcore.map_std_Row_test(kv[1],  option_list))

    schemaStd = sqlctx.createDataFrame(output_data_rdd, samplingRatio=0.2)
    schemaStd.registerTempTable("base_standard")
    stdDataFrame = sqlctx.sql(
        "SELECT codeID, organID, examID, examTime, data_json FROM base_standard")



    cond = [crowdDataFrame.codeID == stdDataFrame.codeID, crowdDataFrame.organID == stdDataFrame.organID,
            crowdDataFrame.examID == stdDataFrame.examID]

    rsDataFrame = crowdDataFrame.join(stdDataFrame, cond, "inner").select(crowdDataFrame.codeID, crowdDataFrame.organID,
                                                                          crowdDataFrame.examID, stdDataFrame.examTime,
                                                                          stdDataFrame.data_json )

    rsDataFrameG1 = get_df_group(group1, rsDataFrame)
    rsDataFrameG2 = get_df_group(group2, rsDataFrame)

    result = {}

    for colKey in option_list:
        print "colKey", colKey
        name = option_dict[colKey]
        print "name =>", name, ", colKey=>" + colKey
        # Select the current option's column; the original hard-coded "390",
        # which made every iteration test the same field.
        G1 = rsDataFrameG1.select(rsDataFrameG1.data_json[colKey]).toPandas()
        G2 = rsDataFrameG2.select(rsDataFrameG2.data_json[colKey]).toPandas()

        result[name] = get_result_arrs(G1, G2)

    import json
    print result

    sc.stop()
    return json.dumps(result, ensure_ascii=False)
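Finally, utils.option_dict_list is the helper every example relies on. From the way Example #3 re-splits the same comma-separated "id:name" string, a plausible reconstruction, matching the observed usage option_dict[id] -> name plus an ordered list of ids (the fallback for entries without a ":name" part is an assumption):

def option_dict_list(option_str):
    # Hypothetical: parse "id:name,id:name,..." into ({id: name}, [ids]).
    option_dict = {}
    option_list = []
    for item in option_str.split(','):
        parts = item.split(':')
        key = parts[0]
        option_dict[key] = parts[1] if len(parts) > 1 else parts[0]
        option_list.append(key)
    return option_dict, option_list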