def audit_seller(input_file_name, output_file_name):
    """Predict a category for every seller deal row and dump the result as TSV.

    Each input line must hold exactly five comma-separated fields
    (deal_srl, option_srl, cat_info, p_info, dt).  The text handed to the
    predictor is ``cat_info + ' ' + p_info``; only the first prediction of
    the ``top=3`` result is reported.

    Every output row contains the five input fields (the trailing newline of
    ``dt`` removed), the predicted category code, the level 1-6 category
    codes and the level 1-3 scores, separated by tabs.

    :param input_file_name: path of the comma-separated input file
    :param output_file_name: path of the tab-separated file to (over)write
    :raises ValueError: if a line does not split into exactly five fields
    """
    logger.info("audit seller started")
    with open(input_file_name, 'r') as source, \
            open(output_file_name, 'w') as sink:
        for raw_line in source:
            deal_srl, option_srl, cat_info, p_info, dt = raw_line.split(",")
            best = predict.predict(
                model_lv1,
                model_lv2s,
                model_lv3s,
                cat_info + ' ' + p_info,
                top=3,
                product_id=deal_srl)[0]
            row = [
                deal_srl,
                option_srl,
                cat_info,
                p_info,
                dt.strip('\n'),
                str(best.get_catecode()),
                str(best.get_lv1_catecode()),
                str(best.get_lv2_catecode()),
                str(best.get_lv3_catecode()),
                str(best.get_lv4_catecode()),
                str(best.get_lv5_catecode()),
                str(best.get_lv6_catecode()),
                str(best.get_lv1_score()),
                str(best.get_lv2_score()),
                str(best.get_lv3_score()),
            ]
            sink.write('\t'.join(row))
            sink.write('\n')
    logger.info("audit seller completed!")
def audit(input_file_name, output_file_name):
    """Write an 'Original' and a 'Predict' TSV row for every input product.

    Each input line must hold exactly eight tab-separated fields: the
    product id, six original category levels and the input string.  An empty
    level 4, 5 or 6 falls back to the nearest filled level above it
    (level 4 -> level 3, level 5 -> formatted level 4, level 6 -> formatted
    level 5).

    For every product two rows are emitted, both starting with the product
    id, the predictor's input string and its normalized input string:

    * ``Original`` followed by the six (formatted) original categories;
    * ``Predict`` followed by the predicted level 1-6 codes, the level 1-3
      scores and the predict type.

    Only the first prediction of the ``top=3`` result is used.

    :param input_file_name: path of the tab-separated input file
    :param output_file_name: path of the tab-separated file to (over)write
    :raises ValueError: if a line does not split into exactly eight fields
    """
    logger.info("audit started")
    with open(input_file_name, 'r') as source, \
            open(output_file_name, 'w') as sink:
        for raw_line in source:
            (product_id, original_cate1, original_cate2, original_cate3,
             original_cate4, original_cate5, original_cate6,
             input_string) = raw_line.split("\t")

            # An empty string is falsy, so `or` picks the parent level.
            formatted_cate4 = original_cate4 or original_cate3
            formatted_cate5 = original_cate5 or formatted_cate4
            formatted_cate6 = original_cate6 or formatted_cate5

            best = predict.predict(
                model_lv1,
                model_lv2s,
                model_lv3s,
                input_string,
                top=3,
                product_id=product_id)[0]

            original_row = [
                product_id,
                best.get_input_string(),
                best.get_normalized_input_string(),
                'Original',
                original_cate1,
                original_cate2,
                original_cate3,
                formatted_cate4.strip('\n'),
                formatted_cate5.strip('\n'),
                formatted_cate6.strip('\n'),
            ]
            sink.write('\t'.join(original_row))
            sink.write('\n')

            predict_row = [
                product_id,
                best.get_input_string(),
                best.get_normalized_input_string(),
                'Predict',
                str(best.get_lv1_catecode()),
                str(best.get_lv2_catecode()),
                str(best.get_lv3_catecode()),
                str(best.get_lv4_catecode()),
                str(best.get_lv5_catecode()),
                str(best.get_lv6_catecode()),
                str(best.get_lv1_score()),
                str(best.get_lv2_score()),
                str(best.get_lv3_score()),
                best.get_predict_type(),
            ]
            sink.write('\t'.join(predict_row))
            sink.write('\n')
    logger.info("audit completed!")
def _evaluate_ratio(numerator, denominator):
    """Return numerator / denominator rounded to 4 places, or 0.0 if the denominator is zero."""
    if not denominator:
        return 0.0
    return round(float(numerator) / float(denominator), 4)


def evaluate(input_file_name, output_file_name):
    """Predict every labelled row, dump a per-row TSV and log accuracy tables.

    Each input line must hold exactly sixteen tab-separated fields: type,
    product id, input string, the full correct category code, the correct
    level 1-6 codes and the correct level 1-6 names.

    For every row the output file receives: type, product id, the
    predictor's input and normalized input strings, the correct level 1-6
    codes and names, the predicted level 1-6 codes, the level 1-3 scores,
    the final score, the predict type and one ``LVn_CORRECT`` /
    ``LVn_WRONG`` flag per level.  Only the first prediction of the
    ``top=3`` result is used.

    After all rows are processed an accuracy table is logged: one ``ROOT``
    line over all rows (levels 1-6) and one line per correct level-1
    category (levels 2-6), largest category first.  Every level is tallied
    independently of the other levels.

    An empty input file no longer raises ``ZeroDivisionError``; the ``ROOT``
    line then reports size 0 and accuracy 0.0 for every level.

    :param input_file_name: path of the tab-separated labelled input file
    :param output_file_name: path of the tab-separated file to (over)write
    :raises ValueError: if a line does not split into exactly sixteen fields
    """
    logger.info("evaluate started")
    level_count = 6
    total_count = 0
    # correct_counts[k]: rows whose predicted level-(k+1) code matches the label.
    correct_counts = [0] * level_count
    # Row count per correct level-1 code.
    total_by_correct_lv1 = {}
    # correct_by_correct_lv1[k]: correct level-1 code -> hits at level k+1.
    # Slot 0 is never filled because no per-category LV1 figure is reported.
    correct_by_correct_lv1 = [{} for _ in range(level_count)]

    with open(input_file_name, 'r') as inf, \
            open(output_file_name, 'w') as ouf:
        for line in inf:
            # The full tuple unpack doubles as a field-count check.
            (t_type, product_id, input_string, correct_catecode,
             correct_catecode1, correct_catecode2, correct_catecode3,
             correct_catecode4, correct_catecode5, correct_catecode6,
             correct_cate1, correct_cate2, correct_cate3, correct_cate4,
             correct_cate5, correct_cate6) = line.split("\t")
            expected_codes = (correct_catecode1, correct_catecode2,
                              correct_catecode3, correct_catecode4,
                              correct_catecode5, correct_catecode6)
            expected_names = (correct_cate1, correct_cate2, correct_cate3,
                              correct_cate4, correct_cate5, correct_cate6)

            result = predict.predict(
                model_lv1,
                model_lv2s,
                model_lv3s,
                input_string,
                top=3,
                product_id=product_id)[0]
            predicted_codes = (
                str(result.get_lv1_catecode()),
                str(result.get_lv2_catecode()),
                str(result.get_lv3_catecode()),
                str(result.get_lv4_catecode()),
                str(result.get_lv5_catecode()),
                str(result.get_lv6_catecode()),
            )
            # Compared against the raw (unstripped) field, as before.
            hits = [
                expected == predicted
                for expected, predicted in zip(expected_codes, predicted_codes)
            ]

            columns = [
                t_type,
                product_id,
                result.get_input_string(),
                result.get_normalized_input_string(),
            ]
            columns.extend(code.strip('\n') for code in expected_codes)
            columns.extend(name.strip('\n') for name in expected_names)
            columns.extend(predicted_codes)
            columns.extend([
                str(result.get_lv1_score()),
                str(result.get_lv2_score()),
                str(result.get_lv3_score()),
                str(result.get_final_score()),
                result.get_predict_type(),
            ])
            columns.extend(
                "LV%d_%s" % (level, "CORRECT" if hit else "WRONG")
                for level, hit in enumerate(hits, start=1))
            ouf.write('\t'.join(columns))
            ouf.write('\n')

            total_count += 1
            total_by_correct_lv1[correct_catecode1] = \
                total_by_correct_lv1.get(correct_catecode1, 0) + 1
            for index, hit in enumerate(hits):
                if not hit:
                    continue
                correct_counts[index] += 1
                if index > 0:
                    per_lv1 = correct_by_correct_lv1[index]
                    per_lv1[correct_catecode1] = \
                        per_lv1.get(correct_catecode1, 0) + 1

    # Largest level-1 category first; reversed(sorted(...)) keeps the
    # original ordering of equally sized categories.
    ordered_total_by_correct_lv1 = list(
        reversed(sorted(total_by_correct_lv1.items(), key=lambda x: x[1])))

    logger.info("\tCATEGORY\tSIZE\tLV1\tLV2\tLV3\tLV4\tLV5\tLV6")
    root_ratios = tuple(
        _evaluate_ratio(count, total_count) for count in correct_counts)
    logger.info("\tROOT\t%d\t%f\t%f\t%f\t%f\t%f\t%f" %
                ((int(total_count),) + root_ratios))
    for cate1, cate1_total in ordered_total_by_correct_lv1:
        cate1_ratios = tuple(
            _evaluate_ratio(per_lv1.get(cate1, 0), cate1_total)
            for per_lv1 in correct_by_correct_lv1[1:])
        logger.info("\t%s\t%d\t\t%f\t%f\t%f\t%f\t%f" %
                    ((const.CATE_INT_NAMES[cate1], int(cate1_total)) +
                     cate1_ratios))
    logger.info("evaluate completed!")
def post(self):
    """Predict from the parsed request arguments and return the result as JSON text.

    Seven arguments are read from ``parser.parse_args()`` in a fixed order
    and passed to ``predict`` together with the ``'passed_success'`` key.
    The first element of the prediction is converted with ``tolist()`` and
    serialized with ``json.dumps``.
    """
    args = parser.parse_args()
    # The order matters: it is the feature order handed to predict().
    feature_names = ('posteamint', 'down', 'ydstogo', 'yrdline100',
                     'ScoreDiff', 'TimeSecs', 'play')
    features = [args[name] for name in feature_names]
    first_prediction = predict(features, 'passed_success')[0]
    return json.dumps(first_prediction.tolist())
def prediction():
    """Predict categories for every product in the JSON request body.

    The request JSON (parsed again if it arrives as a JSON-encoded string)
    is expected to hold a root object with a product list and an optional
    top-k value.  For each product the name, brand, attribute values
    (ordered by attribute key) and description are concatenated into one raw
    text, which is stored on the product and passed to the predictor.  The
    ranked predictions and the normalized text are written back onto the
    product.

    On success the root object carries the products, the effective top-k
    and a success flag set to True.  On any exception the success flag is
    False, the error message is stored, the traceback is printed and the
    (possibly freshly created) root is still returned.

    :return: the ``jsonify`` response built from the root object
    """
    root = None
    sub_root = None
    try:
        root = request.get_json(force=True)
        if type(root) is str:
            root = json.loads(root)
        sub_root = get_json(root, const.JSON_KEY_ROOT)
        products = get_json(sub_root, const.JSON_KEY_PRODUCTS)

        # Default to a single prediction when the request sets no top-k.
        if const.JSON_KEY_TOP in sub_root:
            top = get_json(sub_root, const.JSON_KEY_TOP)
        else:
            top = 1
        # Cap top-k at the configured maximum (risk of under performance).
        if top >= const.MAX_TOP:
            top = const.MAX_TOP

        for product in products:
            product_id = get_json(product, const.JSON_KEY_PRODUCT_ID)

            # Raw text starts with "<name> <brand>"; brand may be absent.
            product_name = get_json(product, const.JSON_KEY_PRODUCT_NAME)
            if const.JSON_KEY_PRODUCT_BRAND in product:
                product_brand = get_json(product,
                                         const.JSON_KEY_PRODUCT_BRAND)
            else:
                product_brand = ''
            text_raw = product_name + ' ' + product_brand

            # Attribute values follow, ordered by attribute key; a key that
            # occurs in several attribute objects keeps its last value.
            if const.JSON_KEY_PRODUCT_ATTRIBUTES in product:
                attribute_entries = get_json(
                    product, const.JSON_KEY_PRODUCT_ATTRIBUTES)
                merged_attributes = {}
                for entry in attribute_entries:
                    for key, value in entry.items():
                        merged_attributes[key] = value
                for _, value in sorted(merged_attributes.items()):
                    text_raw += ' ' + value

            # The description, when present, goes last.
            if const.JSON_KEY_PRODUCT_DESCRIPTION in product:
                product_description = get_json(
                    product, const.JSON_KEY_PRODUCT_DESCRIPTION)
                text_raw += ' ' + product_description

            # Expose the concatenated text in the JSON output.
            product[const.JSON_KEY_TEXT_RAW] = text_raw

            candidates = predict.predict(
                model_lv1,
                model_lv2s,
                model_lv3s,
                text_raw,
                top=top,
                product_id=product_id)

            # One entry per returned prediction, ranked from 1.
            ranked = []
            for rank, candidate in enumerate(candidates, start=1):
                if candidate.get_predict_error():
                    raise Exception(
                        "There is error in prediction result. Please check your input text."
                    )
                ranked.append({
                    const.JSON_KEY_RANK: rank,
                    const.JSON_KEY_CATE1: candidate.get_lv1_catecode(),
                    const.JSON_KEY_SCORE_LV1: str(candidate.get_lv1_score()),
                    const.JSON_KEY_CATE2: candidate.get_lv2_catecode(),
                    const.JSON_KEY_SCORE_LV2: str(candidate.get_lv2_score()),
                    const.JSON_KEY_CATE3: candidate.get_lv3_catecode(),
                    const.JSON_KEY_SCORE_LV3: str(candidate.get_lv3_score()),
                    const.JSON_KEY_CATE4: candidate.get_lv4_catecode(),
                    const.JSON_KEY_CATE5: candidate.get_lv5_catecode(),
                    const.JSON_KEY_CATE6: candidate.get_lv6_catecode(),
                    const.JSON_KEY_SCORE_FINAL:
                    str(candidate.get_final_score()),
                })
            product[const.JSON_KEY_PREDICT] = ranked
            product[const.JSON_KEY_TEXT_NORM] = \
                candidates[0].get_normalized_input_string()

        sub_root[const.JSON_KEY_PRODUCTS] = products
        sub_root[const.JSON_KEY_TOP] = top
        sub_root[const.JSON_KEY_SUCCESS] = True
    except Exception as e:
        # Without a usable root object, build an empty one so the error
        # can still be reported in the usual response shape.
        if sub_root is None:
            root = json.loads('{"' + const.JSON_KEY_ROOT + '": {}}')
            sub_root = get_json(root, const.JSON_KEY_ROOT)
        sub_root[const.JSON_KEY_SUCCESS] = False
        sub_root[const.JSON_KEY_ERROR] = str(e)
        traceback.print_tb(e.__traceback__)
    return jsonify(root)
def create_cqi_output(filename):
    """Predict categories for one CQI avro input file and write an avro output file.

    The input is read from ``const.get_cqi_input_file_path() + filename`` and
    the output is written to ``const.get_cqi_output_file_path() + filename``.

    Per row:

    * rows with an empty ``originString`` are skipped untouched;
    * the predictor runs on ``normalizedAttr`` with ``garbage_filter=False``
      and its normalized input string is stored in ``cleanseString``;
    * for rows whose level-1 category name does not contain "OLD", rows with
      an exclude type containing JIKGU, BOOK or DVD are skipped, and rows
      flagged ``isCleansed == '1'`` get ``OPERATOR_MODEL`` appended to their
      exclude type and are skipped;
    * rows with a prediction error are skipped;
    * rows with a final score below 0.25 store only the scores;
    * all remaining rows store the predicted codes and the scores.

    Only the 17 columns listed in ``output`` are written; helper columns such
    as ``success`` and ``isCleansed`` stay in the working frame.

    :param filename: file name appended to both the input and output paths
    """
    lst = list()
    # read in one avro file
    with open(const.get_cqi_input_file_path() + filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        for record in reader:
            lst.append([
                record['itemId'], record['productId'], record['categoryCode'],
                record['originalAttr'], record['normalizedAttr'],
                record['excludeType'], record['categoryCodeLv1'],
                record['categoryNameLv1']
            ])
    # noinspection PyUnresolvedReferences
    df = pd.DataFrame(lst,
                      columns=[
                          'itemId', 'productId', 'categoryCode',
                          'originalAttr', 'normalizedAttr', 'excludeType',
                          'categoryCodeLv1', 'categoryNameLv1'
                      ])
    # drop the reference to the raw record list once the frame is built
    lst = None
    # output columns, pre-filled with the "nothing predicted" defaults
    df['originCateCode'] = df['categoryCode']
    df['originString'] = df['originalAttr']
    df['cleanseString'] = ''
    df['predCateCode'] = ''
    df['predCateCode1'] = ''
    df['predCateCode2'] = ''
    df['predCateCode3'] = ''
    df['predCateCode4'] = ''
    df['predCateCode5'] = ''
    df['predCateCode6'] = ''
    df['scoreCateCode1'] = 0.0
    df['scoreCateCode2'] = 0.0
    df['scoreCateCode3_6'] = 0.0
    df['scoreFinal'] = 0.0
    df['success'] = 0
    # NOTE(review): dtype is passed as a list of (name, type) tuples; the
    # pandas documentation describes a single type or a dict of
    # column -> type -- confirm this form works on the pandas version in use.
    # noinspection PyUnresolvedReferences
    cleansed_prod_df = pd.read_csv(
        const.get_cleansed_prod_dictionary_file_name(),
        names=['productId', 'isCleansed'],
        sep='\t',
        dtype=[('productId', 'long'), ('isCleansed', 'str')])
    # df = pd.merge(df, book_cate_df, on='originCateCode', how='left')
    # df = pd.merge(df, jikgu_prod_df, on='productId', how='left')
    # left join: every input row is kept, isCleansed is filled where the
    # product id appears in the cleansed-product dictionary
    # noinspection PyUnresolvedReferences
    df = pd.merge(df, cleansed_prod_df, on='productId', how='left')
    for i, row in df.iterrows():
        # nothing to predict for rows without an original string
        if not df.at[i, 'originString'] or len(df.at[i, 'originString']) == 0:
            continue
        pred = predict.predict(
            model_lv1,
            model_lv2s,
            model_lv3s,
            df.at[i, 'normalizedAttr'],  # input already garbage filtered string
            product_id=df.at[i, 'productId'],
            item_id=df.at[i, 'itemId'],
            garbage_filter=False)[0]
        df.at[i, 'cleanseString'] = pred.get_normalized_input_string()
        # NOTE(review): the four exclusion checks below are assumed to all
        # sit under this "OLD" guard -- confirm against the intended layout.
        if "OLD" not in str(df.at[i, 'categoryNameLv1']).upper():
            if "JIKGU" in df.at[i, 'excludeType']:
                continue
            if "BOOK" in df.at[i, 'excludeType']:
                continue
            if "DVD" in df.at[i, 'excludeType']:
                continue
            if df.at[i, 'isCleansed'] == '1':
                # tag the row as operator-cleansed, keeping any existing tags
                if len(str(df.at[i, 'excludeType'])) == 0:
                    df.at[i, 'excludeType'] = 'OPERATOR_MODEL'
                else:
                    df.at[i, 'excludeType'] = str(
                        df.at[i, 'excludeType']) + ',OPERATOR_MODEL'
                continue
        if pred.get_predict_error() is True:
            continue
        # below the 0.25 threshold only the scores are kept; the predicted
        # codes stay at their empty defaults
        if pred.get_final_score() < 0.25:
            df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
            df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
            df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
            df.at[i, 'scoreFinal'] = pred.get_final_score()
            continue
        df.at[i, 'predCateCode'] = pred.get_catecode()
        df.at[i, 'predCateCode1'] = pred.get_lv1_catecode()
        df.at[i, 'predCateCode2'] = pred.get_lv2_catecode()
        df.at[i, 'predCateCode3'] = pred.get_lv3_catecode()
        df.at[i, 'predCateCode4'] = pred.get_lv4_catecode()
        df.at[i, 'predCateCode5'] = pred.get_lv5_catecode()
        df.at[i, 'predCateCode6'] = pred.get_lv6_catecode()
        df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
        df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
        df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
        df.at[i, 'scoreFinal'] = pred.get_final_score()
        # rows with a prediction error were already skipped above, so the
        # error branch here is not expected to run
        if pred.get_predict_error() is True:
            df.at[i, 'success'] = 0
        else:
            df.at[i, 'success'] = 1
    # write result out to avro file
    schema = {
        'name': 'topLevelRecord',
        'type': 'record',
        'fields': [{
            'name': 'itemId',
            'type': ['long', 'null']
        }, {
            'name': 'productId',
            'type': ['long', 'null']
        }, {
            'name': 'originCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'originString',
            'type': 'string'
        }, {
            'name': 'cleanseString',
            'type': 'string'
        }, {
            'name': 'predCateCode',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode1',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode2',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode3',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode4',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode5',
            'type': ['string', 'null']
        }, {
            'name': 'predCateCode6',
            'type': ['string', 'null']
        }, {
            'name': 'scoreCateCode1',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode2',
            'type': ['float', 'null']
        }, {
            'name': 'scoreCateCode3_6',
            'type': ['float', 'null']
        }, {
            'name': 'scoreFinal',
            'type': ['float', 'null']
        }, {
            'name': 'excludeType',
            'type': 'string'
        }]
    }
    output = df[[
        'itemId', 'productId', 'originCateCode', 'originString',
        'cleanseString', 'predCateCode', 'predCateCode1', 'predCateCode2',
        'predCateCode3', 'predCateCode4', 'predCateCode5', 'predCateCode6',
        'scoreCateCode1', 'scoreCateCode2', 'scoreCateCode3_6', 'scoreFinal',
        'excludeType'
    ]]
    # round-trip through JSON to get a list of plain dict records for the
    # avro writer
    records = output.to_json(orient='records')
    records = json.loads(records)
    with open(const.get_cqi_output_file_path() + filename, 'wb') as out:
        fastavro.writer(out, schema, records)
    logger.info("Successfully write " + filename)