def audit_seller(input_file_name, output_file_name):
    """Predict categories for a comma-separated seller file.

    Every input line must split on "," into exactly five fields
    (deal_srl, option_srl, cat_info, p_info, dt). The text
    ``cat_info + ' ' + p_info`` is sent to ``predict.predict`` and the
    top-ranked result is written as one tab-separated row: the five input
    fields, the predicted category code, the level 1-6 category codes and
    the level 1-3 scores.

    Args:
        input_file_name: path of the comma-separated input file.
        output_file_name: path of the tab-separated file to (over)write.

    Raises:
        ValueError: if a line does not split into exactly five fields.
    """
    logger.info("audit seller started")

    with open(input_file_name, 'r') as inf, \
            open(output_file_name, 'w') as ouf:
        for line in inf:
            deal_srl, option_srl, cat_info, p_info, dt = line.split(",")

            # Only the best of the three requested candidates is reported.
            best = predict.predict(model_lv1,
                                   model_lv2s,
                                   model_lv3s,
                                   cat_info + ' ' + p_info,
                                   top=3,
                                   product_id=deal_srl)[0]

            columns = [
                deal_srl,
                option_srl,
                cat_info,
                p_info,
                # dt is the last field of the line, so it carries the newline.
                dt.strip('\n'),
                str(best.get_catecode()),
                str(best.get_lv1_catecode()),
                str(best.get_lv2_catecode()),
                str(best.get_lv3_catecode()),
                str(best.get_lv4_catecode()),
                str(best.get_lv5_catecode()),
                str(best.get_lv6_catecode()),
                str(best.get_lv1_score()),
                str(best.get_lv2_score()),
                str(best.get_lv3_score()),
            ]
            ouf.write('\t'.join(columns) + '\n')

    logger.info("audit seller completed!")
def audit(input_file_name, output_file_name):
    """Write original and predicted categories side by side for each product.

    Every input line must split on a tab into exactly eight fields: product
    id, six original category fields and the input string. Two tab-separated
    rows are written per input line: an "Original" row with the six original
    categories (empty levels 4-6 inherit the nearest filled level above
    them) and a "Predict" row with the top-ranked prediction's level 1-6
    codes, level 1-3 scores and predict type.

    Args:
        input_file_name: path of the tab-separated input file.
        output_file_name: path of the tab-separated file to (over)write.

    Raises:
        ValueError: if a line does not split into exactly eight fields.
    """
    logger.info("audit started")

    with open(input_file_name, 'r') as inf, \
            open(output_file_name, 'w') as ouf:
        for line in inf:
            (product_id, original_cate1, original_cate2, original_cate3,
             original_cate4, original_cate5, original_cate6,
             input_string) = line.split("\t")

            # An empty deeper level falls back to the level directly above.
            formatted_cate4 = original_cate4 if original_cate4 else original_cate3
            formatted_cate5 = original_cate5 if original_cate5 else formatted_cate4
            formatted_cate6 = original_cate6 if original_cate6 else formatted_cate5

            # Only the best of the three requested candidates is reported.
            result = predict.predict(model_lv1,
                                     model_lv2s,
                                     model_lv3s,
                                     input_string,
                                     top=3,
                                     product_id=product_id)[0]

            # Both rows start with the same three identifying columns.
            shared_columns = [
                product_id,
                result.get_input_string(),
                result.get_normalized_input_string(),
            ]

            original_row = shared_columns + [
                'Original',
                original_cate1,
                original_cate2,
                original_cate3,
                formatted_cate4.strip('\n'),
                formatted_cate5.strip('\n'),
                formatted_cate6.strip('\n'),
            ]
            predict_row = shared_columns + [
                'Predict',
                str(result.get_lv1_catecode()),
                str(result.get_lv2_catecode()),
                str(result.get_lv3_catecode()),
                str(result.get_lv4_catecode()),
                str(result.get_lv5_catecode()),
                str(result.get_lv6_catecode()),
                str(result.get_lv1_score()),
                str(result.get_lv2_score()),
                str(result.get_lv3_score()),
                result.get_predict_type(),
            ]

            ouf.write('\t'.join(original_row) + '\n')
            ouf.write('\t'.join(predict_row) + '\n')

    logger.info("audit completed!")
def evaluate(input_file_name, output_file_name):
    """Score predictions against labelled data and log per-level accuracy.

    Every input line must split on a tab into exactly sixteen fields: type
    tag, product id, input string, the full correct category code, the six
    per-level correct category codes (``correct_catecode1..6``) and six
    further ``correct_cate1..6`` columns. For each line the top-ranked
    prediction is written to ``output_file_name`` as one tab-separated row
    holding the inputs, the predicted level 1-6 codes, the level 1-3 and
    final scores, the predict type and one ``LVn_CORRECT`` / ``LVn_WRONG``
    flag per level.

    After the file is processed, overall accuracy per level (the ROOT row)
    and accuracy for levels 2-6 grouped by the correct level-1 code are
    logged, largest group first. An empty input file logs a ROOT row with
    size 0 and accuracies of 0.0 instead of raising ZeroDivisionError.

    Args:
        input_file_name: path of the tab-separated labelled input file.
        output_file_name: path of the tab-separated file to (over)write.

    Raises:
        ValueError: if a line does not split into exactly sixteen fields.
        KeyError: if a correct level-1 code is missing from
            ``const.CATE_INT_NAMES``.
    """
    logger.info("evaluate started")

    levels = (1, 2, 3, 4, 5, 6)
    grouped_levels = (2, 3, 4, 5, 6)

    total_count = 0
    correct_count = dict.fromkeys(levels, 0)
    total_by_correct_lv1 = dict()
    # The per-category report leaves its LV1 column blank, so hits grouped by
    # the correct level-1 code are only tracked for levels 2-6.
    correct_by_correct_lv1 = dict((level, dict()) for level in grouped_levels)

    def _accuracy(hit_count, size):
        # Share of hits rounded to 4 places; 0.0 when there is nothing to
        # divide by (empty input file).
        if not size:
            return 0.0
        return round(float(hit_count) / size, 4)

    with open(input_file_name, 'r') as inf, \
            open(output_file_name, 'w') as ouf:
        for line in inf:
            (t_type, product_id, input_string, correct_catecode,
             correct_catecode1, correct_catecode2, correct_catecode3,
             correct_catecode4, correct_catecode5, correct_catecode6,
             correct_cate1, correct_cate2, correct_cate3, correct_cate4,
             correct_cate5, correct_cate6) = line.split("\t")

            correct_codes = (correct_catecode1, correct_catecode2,
                             correct_catecode3, correct_catecode4,
                             correct_catecode5, correct_catecode6)
            correct_cates = (correct_cate1, correct_cate2, correct_cate3,
                             correct_cate4, correct_cate5, correct_cate6)

            # Only the best of the three requested candidates is scored.
            result = predict.predict(model_lv1,
                                     model_lv2s,
                                     model_lv3s,
                                     input_string,
                                     top=3,
                                     product_id=product_id)[0]

            fields = [
                t_type,
                product_id,
                result.get_input_string(),
                result.get_normalized_input_string(),
            ]
            fields.extend(code.strip('\n') for code in correct_codes)
            fields.extend(cate.strip('\n') for cate in correct_cates)

            predicted_codes = (
                str(result.get_lv1_catecode()),
                str(result.get_lv2_catecode()),
                str(result.get_lv3_catecode()),
                str(result.get_lv4_catecode()),
                str(result.get_lv5_catecode()),
                str(result.get_lv6_catecode()),
            )
            fields.extend(predicted_codes)
            fields.extend([
                str(result.get_lv1_score()),
                str(result.get_lv2_score()),
                str(result.get_lv3_score()),
                str(result.get_final_score()),
                result.get_predict_type(),
            ])

            # Codes are compared as read from the file (unstripped) against
            # the stringified prediction.
            hits = [
                expected == predicted
                for expected, predicted in zip(correct_codes, predicted_codes)
            ]
            fields.extend(
                "LV%d_%s" % (level, "CORRECT" if hit else "WRONG")
                for level, hit in zip(levels, hits))

            ouf.write('\t'.join(fields) + '\n')

            total_count += 1
            total_by_correct_lv1[correct_catecode1] = \
                total_by_correct_lv1.get(correct_catecode1, 0) + 1

            for level, hit in zip(levels, hits):
                if not hit:
                    continue
                correct_count[level] += 1
                if level in correct_by_correct_lv1:
                    per_lv1 = correct_by_correct_lv1[level]
                    per_lv1[correct_catecode1] = \
                        per_lv1.get(correct_catecode1, 0) + 1

    # Largest level-1 group first; reversing the stable ascending sort keeps
    # the tie order the report has always had.
    ordered_total_by_correct_lv1 = OrderedDict(
        reversed(sorted(total_by_correct_lv1.items(), key=lambda x: x[1])))

    logger.info("\tCATEGORY\tSIZE\tLV1\tLV2\tLV3\tLV4\tLV5\tLV6")
    root_values = (int(total_count),) + tuple(
        _accuracy(correct_count[level], total_count) for level in levels)
    logger.info("\tROOT\t%d\t%f\t%f\t%f\t%f\t%f\t%f" % root_values)

    for cate1 in ordered_total_by_correct_lv1:
        group_size = total_by_correct_lv1[cate1]
        group_values = (const.CATE_INT_NAMES[cate1], int(group_size)) + tuple(
            _accuracy(correct_by_correct_lv1[level].get(cate1, 0), group_size)
            for level in grouped_levels)
        # The doubled tab keeps the LV1 column empty for per-category rows.
        logger.info("\t%s\t%d\t\t%f\t%f\t%f\t%f\t%f" % group_values)

    logger.info("evaluate completed!")
Exemple #4
0
 def post(self):
     """Predict 'passed_success' from the parsed request arguments.

     Reads seven arguments from ``parser.parse_args()``, passes them in a
     fixed order to ``predict`` and returns the first prediction as a JSON
     string.
     """
     parsed = parser.parse_args()
     # The order of the feature vector is fixed by this tuple.
     feature_keys = ('posteamint', 'down', 'ydstogo', 'yrdline100',
                     'ScoreDiff', 'TimeSecs', 'play')
     feature_vector = [parsed[key] for key in feature_keys]
     first_prediction = predict(feature_vector, 'passed_success')[0]
     return json.dumps(first_prediction.tolist())
Exemple #5
0
def prediction():
    """Handle a category-prediction request and answer with JSON.

    The request body is read with ``request.get_json(force=True)`` (and
    parsed once more if that yields a string). For each product under the
    root element the name, optional brand, optional attribute values
    (ordered by attribute key) and optional description are joined with
    single spaces and sent to ``predict.predict``. Each product receives the
    raw text, the normalized text and a ranked list of predictions.

    On success the root element carries the products, the effective ``top``
    value and a success flag of True. On any exception the success flag is
    False, the error text is ``str(exception)`` and the traceback is
    printed; if the root element could not be read, an empty one is created
    to carry the error.

    Returns:
        The response produced by ``jsonify(root)``.
    """
    root = None
    sub_root = None

    try:
        root = request.get_json(force=True)
        if type(root) is str:
            root = json.loads(root)

        sub_root = get_json(root, const.JSON_KEY_ROOT)
        products = get_json(sub_root, const.JSON_KEY_PRODUCTS)

        # Without an explicit request value a single prediction is returned.
        if const.JSON_KEY_TOP in sub_root:
            top = get_json(sub_root, const.JSON_KEY_TOP)
        else:
            top = 1

        # Cap the number of candidates at the configured maximum.
        top = const.MAX_TOP if top >= const.MAX_TOP else top

        for product in products:
            product_id = get_json(product, const.JSON_KEY_PRODUCT_ID)

            # Product name followed by the (optional) brand.
            product_name = get_json(product, const.JSON_KEY_PRODUCT_NAME)
            if const.JSON_KEY_PRODUCT_BRAND in product:
                product_brand = get_json(product,
                                         const.JSON_KEY_PRODUCT_BRAND)
            else:
                product_brand = ''
            text_raw = product_name + ' ' + product_brand

            # Attribute values follow, ordered alphabetically by their key.
            if const.JSON_KEY_PRODUCT_ATTRIBUTES in product:
                attribute_entries = get_json(
                    product, const.JSON_KEY_PRODUCT_ATTRIBUTES)
            else:
                attribute_entries = ''
            merged_attributes = {}
            for entry in attribute_entries:
                for attribute_key, attribute_value in entry.items():
                    merged_attributes[attribute_key] = attribute_value
            for attribute_key in sorted(merged_attributes):
                text_raw = text_raw + ' ' + merged_attributes[attribute_key]

            # The (optional) description always comes last.
            if const.JSON_KEY_PRODUCT_DESCRIPTION in product:
                product_description = get_json(
                    product, const.JSON_KEY_PRODUCT_DESCRIPTION)
            else:
                product_description = ''
            text_raw = text_raw + ' ' + product_description

            # Echo the concatenated text back in the response.
            product[const.JSON_KEY_TEXT_RAW] = text_raw

            pred = predict.predict(model_lv1,
                                   model_lv2s,
                                   model_lv3s,
                                   text_raw,
                                   top=top,
                                   product_id=product_id)

            # One entry per returned candidate, ranked from 1.
            pred_list = []
            for rank, candidate in enumerate(pred, 1):
                if candidate.get_predict_error():
                    raise Exception(
                        "There is error in prediction result. Please check your input text."
                    )

                pred_list.append({
                    const.JSON_KEY_RANK: rank,
                    const.JSON_KEY_CATE1: candidate.get_lv1_catecode(),
                    const.JSON_KEY_SCORE_LV1: str(candidate.get_lv1_score()),
                    const.JSON_KEY_CATE2: candidate.get_lv2_catecode(),
                    const.JSON_KEY_SCORE_LV2: str(candidate.get_lv2_score()),
                    const.JSON_KEY_CATE3: candidate.get_lv3_catecode(),
                    const.JSON_KEY_SCORE_LV3: str(candidate.get_lv3_score()),
                    const.JSON_KEY_CATE4: candidate.get_lv4_catecode(),
                    const.JSON_KEY_CATE5: candidate.get_lv5_catecode(),
                    const.JSON_KEY_CATE6: candidate.get_lv6_catecode(),
                    const.JSON_KEY_SCORE_FINAL: str(
                        candidate.get_final_score()),
                })

            product[const.JSON_KEY_PREDICT] = pred_list
            product[const.JSON_KEY_TEXT_NORM] = \
                pred[0].get_normalized_input_string()

        sub_root[const.JSON_KEY_PRODUCTS] = products
        sub_root[const.JSON_KEY_TOP] = top
        sub_root[const.JSON_KEY_SUCCESS] = True

    except Exception as exc:
        # If the failure happened before the root element was read, build an
        # empty one so the error can still be reported.
        if sub_root is None:
            root = json.loads('{"' + const.JSON_KEY_ROOT + '": {}}')
            sub_root = get_json(root, const.JSON_KEY_ROOT)

        sub_root[const.JSON_KEY_SUCCESS] = False
        sub_root[const.JSON_KEY_ERROR] = str(exc)

        traceback.print_tb(exc.__traceback__)

    return jsonify(root)
def create_cqi_output(filename):
    """Re-predict categories for one CQI avro file and write the result as avro.

    Reads ``filename`` from the location returned by
    ``const.get_cqi_input_file_path()``, runs ``predict.predict`` on each
    record's ``normalizedAttr`` and writes a file of the same name under
    ``const.get_cqi_output_file_path()``.

    Row handling, in the order applied by the loop:

    * rows whose ``originalAttr`` is empty keep all default values;
    * rows whose ``categoryNameLv1`` contains "OLD" (case-insensitive)
      always receive the predicted codes and scores;
    * all other rows keep empty predicted codes when ``excludeType``
      contains JIKGU, BOOK or DVD, when the product is flagged ``'1'`` in
      the cleansed-product dictionary (``OPERATOR_MODEL`` is then appended
      to ``excludeType``), when the prediction reports an error, or when
      the final score is below 0.25 (in that last case the scores are still
      recorded).

    Args:
        filename: name of the avro file; it is appended directly to the
            input and output path prefixes.
    """

    lst = list()
    # read in one avro file
    with open(const.get_cqi_input_file_path() + filename, 'rb') as fo:
        reader = fastavro.reader(fo)

        for record in reader:
            lst.append([
                record['itemId'], record['productId'], record['categoryCode'],
                record['originalAttr'], record['normalizedAttr'],
                record['excludeType'], record['categoryCodeLv1'],
                record['categoryNameLv1']
            ])

    # noinspection PyUnresolvedReferences
    df = pd.DataFrame(lst,
                      columns=[
                          'itemId', 'productId', 'categoryCode',
                          'originalAttr', 'normalizedAttr', 'excludeType',
                          'categoryCodeLv1', 'categoryNameLv1'
                      ])
    # Drop the reference to the intermediate list; the DataFrame holds the data.
    lst = None

    # Output columns, initialised to "no prediction" defaults.
    df['originCateCode'] = df['categoryCode']
    df['originString'] = df['originalAttr']
    df['cleanseString'] = ''
    df['predCateCode'] = ''
    df['predCateCode1'] = ''
    df['predCateCode2'] = ''
    df['predCateCode3'] = ''
    df['predCateCode4'] = ''
    df['predCateCode5'] = ''
    df['predCateCode6'] = ''
    df['scoreCateCode1'] = 0.0
    df['scoreCateCode2'] = 0.0
    df['scoreCateCode3_6'] = 0.0
    df['scoreFinal'] = 0.0
    df['success'] = 0

    # Tab-separated dictionary of product ids with an 'isCleansed' marker.
    # NOTE(review): pandas documents ``dtype`` as a type name or a dict of
    # column -> type; confirm this list-of-tuples form is accepted by the
    # pandas version in use.
    # noinspection PyUnresolvedReferences
    cleansed_prod_df = pd.read_csv(
        const.get_cleansed_prod_dictionary_file_name(),
        names=['productId', 'isCleansed'],
        sep='\t',
        dtype=[('productId', 'long'), ('isCleansed', 'str')])

    # df = pd.merge(df, book_cate_df, on='originCateCode', how='left')
    # df = pd.merge(df, jikgu_prod_df, on='productId', how='left')
    # Left merge: products missing from the dictionary get no 'isCleansed'
    # value and therefore never compare equal to '1' below.
    # noinspection PyUnresolvedReferences
    df = pd.merge(df, cleansed_prod_df, on='productId', how='left')

    for i, row in df.iterrows():
        # Nothing to predict from an empty original string.
        if not df.at[i, 'originString'] or len(df.at[i, 'originString']) == 0:
            continue

        # Only the first (top-ranked) prediction is used.
        pred = predict.predict(
            model_lv1,
            model_lv2s,
            model_lv3s,
            df.at[i,
                  'normalizedAttr'],  # input already garbage filtered string
            product_id=df.at[i, 'productId'],
            item_id=df.at[i, 'itemId'],
            garbage_filter=False)[0]

        df.at[i, 'cleanseString'] = pred.get_normalized_input_string()

        # The exclusion rules below are skipped entirely for rows whose
        # level-1 category name contains "OLD"; those rows always fall
        # through to the assignment block after this ``if``.
        if "OLD" not in str(df.at[i, 'categoryNameLv1']).upper():

            # NOTE(review): these ``in`` checks assume excludeType is a
            # string (a None value would raise TypeError), while the code
            # further down wraps it in str() — confirm it can never be null.
            if "JIKGU" in df.at[i, 'excludeType']:
                continue

            if "BOOK" in df.at[i, 'excludeType']:
                continue

            if "DVD" in df.at[i, 'excludeType']:
                continue

            # Flagged products keep their original category; the reason is
            # recorded by appending OPERATOR_MODEL to excludeType.
            if df.at[i, 'isCleansed'] == '1':
                if len(str(df.at[i, 'excludeType'])) == 0:
                    df.at[i, 'excludeType'] = 'OPERATOR_MODEL'
                else:
                    df.at[i, 'excludeType'] = str(
                        df.at[i, 'excludeType']) + ',OPERATOR_MODEL'
                continue

            if pred.get_predict_error() is True:
                continue

            # Below the 0.25 threshold only the scores are kept; the
            # predicted codes stay empty.
            if pred.get_final_score() < 0.25:
                df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
                df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
                df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
                df.at[i, 'scoreFinal'] = pred.get_final_score()
                continue

        # Reached by "OLD" rows and by rows that passed every check above.
        df.at[i, 'predCateCode'] = pred.get_catecode()
        df.at[i, 'predCateCode1'] = pred.get_lv1_catecode()
        df.at[i, 'predCateCode2'] = pred.get_lv2_catecode()
        df.at[i, 'predCateCode3'] = pred.get_lv3_catecode()
        df.at[i, 'predCateCode4'] = pred.get_lv4_catecode()
        df.at[i, 'predCateCode5'] = pred.get_lv5_catecode()
        df.at[i, 'predCateCode6'] = pred.get_lv6_catecode()
        df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
        df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
        df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
        df.at[i, 'scoreFinal'] = pred.get_final_score()
        # NOTE: 'success' is filled in here but is not among the columns
        # written to the output file below.
        if pred.get_predict_error() is True:
            df.at[i, 'success'] = 0
        else:
            df.at[i, 'success'] = 1

    # write result out to avro file
    schema = {
        'name':
        'topLevelRecord',
        'type':
        'record',
        'fields': [{
            'name': 'itemId',
            'type': ['long', 'null']
        }, {
            'name': 'productId',
            'type': ['long', 'null']
        }, {
            'name': 'originCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'originString',
            'type': 'string'
        }, {
            'name': 'cleanseString',
            'type': 'string'
        }, {
            'name': 'predCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode1',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode2',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode3',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode4',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode5',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode6',
            'type': ['string', 'null']
        }, {
            'name': 'scoreCateCode1',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode2',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode3_6',
            'type': ['float', 'null']
        }, {
            'name': 'scoreFinal',
            'type': ['float', 'null']
        }, {
            'name': 'excludeType',
            'type': 'string'
        }]
    }

    # Keep only the columns named in the schema, in schema order.
    output = df[[
        'itemId', 'productId', 'originCateCode', 'originString',
        'cleanseString', 'predCateCode', 'predCateCode1', 'predCateCode2',
        'predCateCode3', 'predCateCode4', 'predCateCode5', 'predCateCode6',
        'scoreCateCode1', 'scoreCateCode2', 'scoreCateCode3_6', 'scoreFinal',
        'excludeType'
    ]]

    # Round-trip through JSON to get a list of plain dicts for fastavro
    # (presumably also to turn pandas/numpy values into JSON-native ones —
    # TODO confirm).
    records = output.to_json(orient='records')
    records = json.loads(records)
    with open(const.get_cqi_output_file_path() + filename, 'wb') as out:
        fastavro.writer(out, schema, records)

    logger.info("Successfully write " + filename)