def compare_test_ranking_similarity(test_st, test_end, page_num, min_show, min_click):
    test_group = make_test_ranking.get_pic_group(test_st, test_end, page_num)
    test_pic_info = make_test_ranking.get_test_data_ranking(test_st, test_end, test_group, min_show, min_click)
    ranking_click_show = make_test_ranking.sort_rank_by_click_show(test_pic_info)
    ranking_save_click = make_test_ranking.sort_rank_by_save_click(test_pic_info)
    # ranking_save_show = make_test_ranking.sort_rank_by_save_show(test_pic_info)
    list_similarity = []
    print 'click show', ranking_click_show
    print 'save, click', ranking_save_click
    # print 'save, show', ranking_save_show
    for page in ranking_click_show:
        rank_a = ranking_click_show[page]
        rank_b = ranking_save_click[page]
        similarity = calculate_similarity.calculate_similarity(rank_a, rank_b)
        list_similarity.append([page, similarity])
    list_similarity.sort(key=lambda x: x[0], reverse=False)
    print list_similarity

    cf_data = Config('data.conf')
    chart_path = cf_data.get('path', 'chart_result')
    file_name = Name+'_Test_time_'+test_st+'_'+test_end+'_Min_show='+str(min_show)+'_Min_click='+str(min_click)
    fout_html = open(chart_path+file_name+'.html', 'w')
    fout_html.write('<!doctype html><html lang="en"><meta charset="UTF-8"><head><script type="text/javascript" src="http://cdn.hcharts.cn/jquery/jquery-1.8.3.min.js"></script><script type="text/javascript" src="http://cdn.hcharts.cn/highcharts/highcharts.js"></script><script>$(function () { $("#container").highcharts({ chart: { zoomType: "xy" }, title: { text: "Similarity between Click_show and '+Name+'" }, xAxis: [{ tickInterval: 1 }], yAxis: [{ title: { text: "Similarity" }}], tooltip: { shared: true }, series: [{ name: "Click_show vs '+Name+' similarity", data:')
    fout_html.write(str(list_similarity))
    fout_html.write('}] }); }); </script></head><body><div id="container" style="min-width:700px;height:400px"></div></body></html> ')
    fout_html.close()
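# A minimal usage sketch for compare_test_ranking_similarity (the date range, page
# count and thresholds below are hypothetical; the real values come from the experiment setup):
# compare_test_ranking_similarity('2014-11-09', '2014-11-11', page_num=4, min_show=20, min_click=2)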
def init_mongodb_pic_info(behavior):
    """
    This function only needs to initialize the database once; it writes to the pic_info_<behavior> collection in the kdd database
    """
    cf_data = Config('data.conf')
    dataset_path = cf_data.get('path', 'dataset_path')
    if behavior == 'all':
        db_name = 'pic_info_all'
    elif behavior == 'save':
        db_name = 'pic_info_save'
    elif behavior == 'click':
        db_name = 'pic_info_click'
    else:
        return False
    mongo = Mongo('kdd', db_name)
    fin = open(dataset_path+db_name, 'r')
    dict_raw = eval(fin.read())  # the slowest step: the file is fairly large
    for pic in dict_raw:
        each_line = {}
        each_line['pid'] = pic
        for full_time in dict_raw[pic]:
            each_line[full_time] = dict_raw[pic][full_time]
        record = mongo.collection.find({'pid': pic}, {'_id': 0})
        if record.count() == 0:
            mongo.collection.insert(each_line)
        else:
            print 'Info for this picture already exists!'
    fin.close()
    mongo.close()
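# Usage sketch (assumption: the pic_info_all / pic_info_save / pic_info_click files
# written by count_pic_info already sit under dataset_path). Each file holds one
# repr()'d dict shaped like {pid: {'YYYY-MM-DD:HH': [show, click, save, page], ...}},
# which is why the eval(fin.read()) above is the expensive step.
# init_mongodb_pic_info('all')    # one-off import into the kdd.pic_info_all collection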
 def __init__(self, name, act_type):
     cf = Config(name)
     self.max_request_num = int(cf.get('filter_value', 'max_request_num'))
     self.max_sub_seq_length = int(cf.get('filter_value', 'max_sub_seq_length'))
     self.max_click_percent = float(cf.get('filter_value', 'max_click_percent'))
     temp = cf.get('filter_value', 'act_time')
     self.act_time = temp.split(',')
     self.act_type = act_type
def train_data_position_bias(data_name):
    cf_data = Config('data.conf')
    path = cf_data.get('path', 'dataset_path')
    fin = open(path+data_name+'.txt', 'r')
    line = fin.readline()
    list_pic = line.strip('\r\n').split(',')
    fin.close()
    print 'pb, total pictures: ', len(list_pic)
    return list_pic
def filter_data(time_st, time_end):
    list_time = Function.get_time_list(time_st, time_end)
    cf = Config('data.conf')
    raw_data_path = cf.get('path', 'raw_data')
    filter_data_output = cf.get('path', 'position_bias_data')
    act_type = int(cf.get('act_type', 'value'))
    filer_rule = FilterRule('rule.conf', act_type)   # initialize the FilterRule class that handles filtering
    valid_cnt = 0
    invalid = 0

    for day in list_time:
        input_path = raw_data_path + day
        output_path = filter_data_output + day
        if not os.path.exists(output_path):
            os.mkdir(output_path)
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in = input_path + '\\full_' + temp_name + str(i)
                if os.path.exists(file_in):
                    file_pic = output_path + '\\pic_' + temp_name + str(i)
                    file_result = output_path + '\\result_' + temp_name + str(i)
                    fout_pic = open(file_pic, 'w')
                    fout_result = open(file_result, 'w')

                    fin = open(file_in, 'r')
                    dict_file = eval(fin.read())
                    for idfa in dict_file:
                        list_pic = dict_file[idfa]['pic']
                        list_result = dict_file[idfa]['result']
                        length = len(list_result)
                        length_pic = len(list_pic)
                        if length_pic != length:
                            print 'result count does not match picture count!'
                            continue
                        act_time = dict_file[idfa]['act_time']
                        is_valid = filer_rule.check_each_seq(list_result)
                        if is_valid:
                            out_pic = ''
                            out_result = ''
                            for j in range(0, length):
                                out_pic += list_pic[j] + ' '
                                out_result += str(list_result[j]) + ' '
                            out_pic = out_pic.strip(' ')
                            out_result = out_result.strip(' ')
                            fout_pic.write(out_pic + '\n')
                            fout_result.write(out_result + '\n')
                            valid_cnt += 1
                        else:
                            invalid += 1
                    fin.close()
                    fout_pic.close()
                    fout_result.close()
        print 'filter ', day
    return valid_cnt
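# Usage sketch (hypothetical date range): filter_data reads raw_data/<day>/full_HH dumps,
# keeps only the sequences accepted by FilterRule.check_each_seq, and writes the matching
# pic_HH / result_HH files under position_bias_data/<day>.
# kept = filter_data('2014-11-04', '2014-11-08')
# print 'valid sequences kept: ', kept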
def prepare_ndcg_data():
    cf_data = Config('data.conf')
    path = cf_data.get('path', 'dataset_path')
    fin = open(path+'four_ranking.txt', 'r')
    dict_train_ranking = {}
    while True:
        line = fin.readline()
        if not line:
            break
        if line == '\n':
            continue
        name, ranking = line.split('\t')
        name = name.strip(':')
        dict_train_ranking[name] = ranking
    fin.close()

    fin = open(path+'test_ranking.txt', 'r')
    line = fin.readline()
    dict_test_ranking = eval(line)
    fin.close()
    list_click = dict_train_ranking['rank_click']
    similarity_click = []
    for page in dict_test_ranking:
        ideal_ranking = dict_test_ranking[page]
        similarity = ndcg_similarity(ideal_ranking, list_click)
        similarity_click.append([page, similarity])
    similarity_click.sort(key=lambda x: x[0])
    print 'click'

    list_prob = dict_train_ranking['rank_prob']
    similarity_prob = []
    for page in dict_test_ranking:
        ideal_ranking = dict_test_ranking[page]
        similarity = ndcg_similarity(ideal_ranking, list_prob)
        similarity_prob.append([page, similarity])
    similarity_prob.sort(key=lambda x: x[0])
    print 'prob'

    list_pb = dict_train_ranking['rank_pb']
    similarity_pb = []
    for page in dict_test_ranking:
        ideal_ranking = dict_test_ranking[page]
        similarity = ndcg_similarity(ideal_ranking, list_pb)
        similarity_pb.append([page, similarity])
    similarity_pb.sort(key=lambda x: x[0])
    print 'pb'

    list_full = dict_train_ranking['rank_full']
    similarity_full = []
    for page in dict_test_ranking:
        ideal_ranking = dict_test_ranking[page]
        similarity = ndcg_similarity(ideal_ranking, list_full)
        similarity_full.append([page, similarity])
    similarity_full.sort(key=lambda x: x[0])
    print 'full'
    return similarity_click, similarity_prob, similarity_pb, similarity_full
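# The four blocks above differ only in which training ranking they score; a loop is an
# equivalent, shorter sketch (same ndcg_similarity helper and the same dict_test_ranking):
#
# def ndcg_for(dict_test_ranking, train_ranking):
#     scores = [[page, ndcg_similarity(dict_test_ranking[page], train_ranking)]
#               for page in dict_test_ranking]
#     scores.sort(key=lambda x: x[0])
#     return scores
#
# results = [ndcg_for(dict_test_ranking, dict_train_ranking[key])
#            for key in ('rank_click', 'rank_prob', 'rank_pb', 'rank_full')]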
def train_data_full_model(data_name):
    cf_data = Config('data.conf')
    path = cf_data.get('path', 'dataset_path')
    fin = open(path+data_name+'.txt', 'r')
    line = fin.readline()
    list_pic = line.strip('\r\n').split(',')
    fin.close()
    print 'full model, total pictures: ', len(list_pic)
    # data_name = data_name.split('data3')[1]
    return list_pic, data_name
def draw_ndcg(behavior, data_name):
    similarity_click, similarity_prob, similarity_pb, similarity_full = calculate_similarity.prepare_ndcg_data()

    cf_data = Config('data.conf')
    chart_path = cf_data.get('path', 'chart_result')
    file_name = behavior+'_nDCG' + data_name
    fout_html = open(chart_path+file_name+'.html', 'w')
    fout_html.write('<!doctype html><html lang="en"><meta charset="UTF-8"><head><script type="text/javascript" src="http://cdn.hcharts.cn/jquery/jquery-1.8.3.min.js"></script><script type="text/javascript" src="http://cdn.hcharts.cn/highcharts/highcharts.js"></script><script>$(function () { $("#container").highcharts({ chart: { zoomType: "xy" }, title: { text: "nDCG" }, xAxis: [{ tickInterval: 1 }], yAxis: [{ title: { text: "Similarity" }}], tooltip: { shared: true }, series: [{ name: "Picture click count", data:')
    fout_html.write(str(similarity_click))
    fout_html.write('} ,{ name:"Picture click probability", data: ')
    fout_html.write(str(similarity_prob)+'},{ name: "With position bias",data: ')
    fout_html.write(str(similarity_pb)+'},{ data: ')
    fout_html.write(str(similarity_full))
    fout_html.write(',name: "Full model" }] }); }); </script></head><body><div id="container" style="min-width:700px;height:400px"></div></body></html>')
    fout_html.close()
def merge_pic_click():
    cf_data = Config('data.conf')
    dataset_path = cf_data.get('path', 'dataset_path')
    fin = open(dataset_path+'group_pic', 'r')
    dict_group_pic = {}
    while True:
        line = fin.readline()
        if not line:
            break
        group_id, pics = line.strip('\r\n').split('\t')
        if group_id not in dict_group_pic:
            dict_group_pic[group_id] = pics
    fin.close()

    fin = open(dataset_path+'pic_info_all', 'r')
    dict_raw = eval(fin.read())
    dict_output = {}
    for group_id in dict_group_pic:
        dict_output[group_id] = {}
        str_pics = dict_group_pic[group_id]
        list_pic = str_pics.strip(',').split(',')
        for pic in list_pic:
            if pic not in dict_raw:
                print 'pic not in data set: ', pic
            else:
                for full_time in dict_raw[pic]:
                    page = dict_raw[pic][full_time][3]
                    page = str(page)
                    day = full_time.split(':')[0]
                    if page not in dict_output[group_id]:
                        dict_output[group_id][page] = [0, 0, 0, day]
                        dict_output[group_id][page][0] = dict_raw[pic][full_time][0]
                        dict_output[group_id][page][1] = dict_raw[pic][full_time][1]
                        dict_output[group_id][page][2] = dict_raw[pic][full_time][2]
                    else:
                        dict_output[group_id][page][0] += dict_raw[pic][full_time][0]
                        dict_output[group_id][page][1] += dict_raw[pic][full_time][1]
                        dict_output[group_id][page][2] += dict_raw[pic][full_time][2]
                        if day not in dict_output[group_id][page][3]:
                            dict_output[group_id][page][3] += ',' + day
    fin.close()

    fout = open(dataset_path+'group_pic_position_hour', 'w')
    for group in dict_output:
        fout.write(str(group) + '\t')
        fout.write(str(dict_output[group]))
        fout.write('\n')
    fout.close()
Example #10
def b():
    cf = Config('data.conf')
    path = cf.get('path', 'dataset_path')
    name_data2 = '1104-1111_data2_pb'
    name_data3 = '1104-1111_data3_full_normal_turn'
    fin_data2 = open(path+name_data2+'.txt', 'r')
    line = fin_data2.read()
    list_data2 = line.split(',')
    data2 = list_data2[0: 2000]
    fin_data2.close()

    fin_data3 = open(path+name_data3+'.txt', 'r')
    line = fin_data3.read()
    list_data3 = line.split(',')
    data3 = list_data3[0: 2000]
    fin_data3.close()

    similarity = calculate_similarity.calculate_similarity(data2, data3)
    print similarity
Example #12
def init_mongodb_hour_ranking():
    """
    This function only needs to initialize the database once; it writes to the hour_ranking collection in the kdd database
    """
    cf_data = Config('data.conf')
    dataset_path = cf_data.get('path', 'dataset_path')
    mongo = Mongo('kdd', 'hour_ranking')
    fin = open(dataset_path+'hour_ranking', 'r')
    while True:
        line = fin.readline()
        if not line:
            break
        ranking_time, ranking = line.strip('\n').split('\t')
        list_ranking = ranking.strip(' ').split(', ')
        record = mongo.collection.find({'time': ranking_time}, {'_id': 0})
        if record.count() == 0:
            mongo.collection.insert({'time': ranking_time, 'ranking': list_ranking})
        else:
            print 'Ranking for this hour already exists!'
    fin.close()
    mongo.close()
Example #13
def init_group_pic_pb():
    """
    This function only needs to initialize the database once; it writes to the group_pic_pb collection in the kdd database, used for online position-bias lookups for picture groups
    """
    cf_data = Config('data.conf')
    path = cf_data.get('path', 'dataset_path')
    mongo = Mongo('kdd', 'group_pic_pb')
    fin = open(path+'/group_pic_position_hour', 'r')
    while True:
        line = fin.readline()
        if not line:
            break
        # On Linux, lines from Windows files need '\r\n' stripped; writing it this way also works on Windows
        group_id, page_click_info = line.strip('\r\n').split('\t')
        record = mongo.collection.find({'gid': int(group_id)}, {'_id': 0})
        page_click_info = eval(page_click_info)
        if record.count() == 0:
            mongo.collection.insert({'gid': int(group_id), 'pinfo': page_click_info})
        else:
            print str(group_id), ' already exists'
    fin.close()
    mongo.close()
def position_bias(st_time, end_time, behavior, max_req):
    """
    :param behavior:  all - the whole filtered data set; view - sequences without save actions; save - sequences containing save actions
    :return:
    dict_result    {1: [show_num, click_num]}  show_num: total picture impressions on this page; click_num: total clicks on this page
    """
    cf_data = Config('data.conf')
    fin_path = cf_data.get('path', 'position_bias_data')
    if 'all' == behavior:
        act_value = 1
        # fin_path = cf_data.get('path', 'filter_data')
    elif 'view' == behavior:
        act_value = 1
        # fin_path = cf_data.get('path', 'view_data')
    elif 'save' == behavior:
        act_value = 2
        # fin_path = cf_data.get('path', 'save_data')
    else:
        return False
    chart_path = cf_data.get('path', 'chart_result')

    list_time = Function.get_time_list(st_time, end_time)
    # dict_result = {}  # {page: [show_num, click_num]}
    dict_result = {}    # {page: [show_num, click_num, save_num]} adds save counts so save/click can be used as the y-axis
    for day in list_time:
        input_path = fin_path + day
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in_result = input_path + '\\result_' + temp_name + str(i)
                if os.path.exists(file_in_result):
                    fin_result = open(file_in_result, 'r')
                    while True:
                        line_result = fin_result.readline()
                        if not line_result:
                            break
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        for index, item in enumerate(list_result):
                            list_result[index] = int(item)
                        length = len(list_result)
                        request_num = length/36
                        if request_num > max_req:
                            continue
                        # page_num = request_num * 4
                        for j in range(1, 5):
                            if j not in dict_result:
                                dict_result[j] = [0, 0, 0]
                            dict_result[j][0] += 9
                            for k in range((j-1)*9, j*9):
                                if list_result[k] >= act_value:   # count clicks here
                                    dict_result[j][1] += 1
                                if list_result[k] == 2:
                                    dict_result[j][2] += 1
                        for j in range(2, request_num+1):
                            page = j*4 - 3
                            if page not in dict_result:
                                dict_result[page] = [0, 0, 0]
                            dict_result[page][0] += 9
                            for k in range((page-1)*9, page*9):
                                if list_result[k] >= act_value:
                                    dict_result[page][1] += 1
                                if list_result[k] == 2:
                                    dict_result[page][2] += 1
                    fin_result.close()
        print 'page pb: ', day

    ## format dict_result and write it out so it can be drawn as a Highcharts chart
    a1 = []  # show counts
    a2 = []  # click counts
    a3 = []  # save counts
    b = []  # probability per page
    page_num = len(dict_result)*4
    for page in range(1, page_num):
        if page in dict_result:
            a1.append([page, dict_result[page][0]])
            a2.append([page, dict_result[page][1]])
            a3.append([page, dict_result[page][2]])
            prob = float(dict_result[page][2])/dict_result[page][0]  # changed to save/show; originally click/show
            temp = round(prob, 4)
            b.append([page, temp])
    fout = open(chart_path+'4-position-bias-'+behavior+'_Max-'+str(max_req)+'.result', 'w')
    fout.write('show number:\n')
    fout.write(str(a1))
    fout.write('\n\nclick number:\n')
    fout.write(str(a2))
    fout.write('\n\nsave number:\n')
    fout.write(str(a3))
    fout.write('\n\nsave/show:\n')
    fout.write(str(b))
    fout.close()
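# Page bookkeeping used above, as a worked example: each request returns 36 results,
# i.e. 4 pages of 9 pictures. The first request fills pages 1-4; for every later
# request j >= 2 only its first page (page 4*j - 3) is tallied, so a 3-request
# sequence contributes counters for pages 1, 2, 3, 4, 5 and 9.
# for j in range(2, 3+1):
#     print 'request', j, '-> starts at page', j*4 - 3    # prints 5 and 9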
def draw_evaluation(behavior, train_time, pic_num, test_st, test_end, page_num, data2, data3):
    test_group = make_test_ranking.get_pic_group(test_st, test_end, page_num)
    data1_click, data2_prob = make_train_ranking.train_data_baseline_all(behavior, train_time)
    data3_pb = make_train_ranking.train_data_position_bias(data2)
    data4_full, data_name = make_train_ranking.train_data_full_model(data3)

    cf_data = Config('data.conf')
    path = cf_data.get('path', 'dataset_path')
    fout = open(path+'four_ranking.txt', 'w')
    fout.write('rank_click:\t'+str(data1_click)+'\n\n')
    fout.write('rank_prob:\t'+str(data2_prob)+'\n\n')
    fout.write('rank_pb:\t'+str(data3_pb)+'\n\n')
    fout.write('rank_full:\t'+str(data4_full)+'\n\n')
    fout.close()

    ranking_click, ranking_prob, ranking_pb, ranking_full = \
        make_train_ranking.select_share_pic(data1_click, data2_prob, data3_pb, data4_full, pic_num)

    test_pic_info = make_test_ranking.get_test_data_ranking(test_st, test_end, test_group, 0, 0)
    test_ranking = make_test_ranking.sort_rank_by_click_show(test_pic_info)

    # compute the similarity data for the full model
    similarity_full, average_pic_num = calculate_similarity.draw_chart(ranking_full, test_ranking)

    fout = open(path+'test_ranking.txt', 'w')
    fout.write(str(test_ranking))
    fout.close()

    # compute the similarity data for the position-bias model
    similarity_pb, average_pic_num = calculate_similarity.draw_chart(ranking_pb, test_ranking)

    # compute similarity after sorting by prob in descending order
    similarity_prob, average_pic_num = calculate_similarity.draw_chart(ranking_prob, test_ranking)

    # compute similarity after sorting by click count in descending order
    similarity_click, average_pic_num = calculate_similarity.draw_chart(ranking_click, test_ranking)
    print 'average', str(average_pic_num)


    days = train_time.keys()
    days.sort(reverse=True)
    train_st = days[-1]
    train_end = days[0]

    cf_data = Config('data.conf')
    data_path = cf_data.get('path', 'dataset_path')
    chart_path = cf_data.get('path', 'chart_result')
    file_name = behavior+'_train_'+train_st+'_'+train_end+'_K='+str(pic_num)+'_average='+str(average_pic_num)+'_test_'+test_st+'_'+test_end+'_'+data_name
    fout_data = open(data_path+file_name+'.data', 'w')
    fout_data.write('Baseline click data: \n' + str(similarity_click) + '\n\n')
    fout_data.write('Baseline prob data: \n' + str(similarity_prob) + '\n\n')
    fout_data.write('Position bias: \n' + str(similarity_pb) + '\n\n')
    fout_data.write('Full model: \n' + str(similarity_full))
    fout_data.close()

    fout_html = open(chart_path+file_name+'.html', 'w')
    fout_html.write('<!doctype html><html lang="en"><meta charset="UTF-8"><head><script type="text/javascript" src="http://cdn.hcharts.cn/jquery/jquery-1.8.3.min.js"></script><script type="text/javascript" src="http://cdn.hcharts.cn/highcharts/highcharts.js"></script><script>$(function () { $("#container").highcharts({ chart: { zoomType: "xy" }, title: { text: "Similarity Charts" }, xAxis: [{ tickInterval: 1 }], yAxis: [{ title: { text: "Similarity" }}], tooltip: { shared: true }, series: [{ name: "Picture click count", data:')
    fout_html.write(str(similarity_click))
    fout_html.write('} ,{ name:"Picture click probability", data: ')
    fout_html.write(str(similarity_prob)+'},{ name: "With position bias",data: ')
    fout_html.write(str(similarity_pb)+'},{ data: ')
    fout_html.write(str(similarity_full))
    fout_html.write(',name: "Full model" }] }); }); </script></head><body><div id="container" style="min-width:700px;height:400px"></div></body></html>')
    fout_html.close()
def quality(st_time, end_time, min_show):
    cf = Config('data.conf')
    fin_path = cf.get('path', 'filter_data')
    chart_path = cf.get('path', 'chart_result')
    list_time = Function.get_time_list(st_time, end_time)
    dict_pic_quality = {}
    for day in list_time:
        folder_path = fin_path + day
        if os.path.exists(folder_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in_pic = folder_path + '\\pic_' + temp_name + str(i)
                file_in_result = folder_path + '\\result_' + temp_name + str(i)
                if os.path.exists(file_in_result):
                    fin_result = open(file_in_result, 'r')
                    fin_pic = open(file_in_pic, 'r')
                    while True:
                        line_result = fin_result.readline()
                        line_pic = fin_pic.readline()
                        if not line_result:
                            break
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        list_pic = line_pic.strip('\n').strip(' ').split(' ')
                        for index, item in enumerate(list_result):
                            list_result[index] = int(item)
                        length = len(list_result)
                        request_num = length/36
                        for j in range(1, 5):
                            if j not in dict_pic_quality:
                                dict_pic_quality[j] = {}
                            for k in range((j-1)*9, j*9):
                                pic = list_pic[k]
                                if pic not in dict_pic_quality[j]:
                                    dict_pic_quality[j][pic] = [0, 0]
                                if list_result[k] >= 1:
                                    dict_pic_quality[j][pic][1] += 1
                                dict_pic_quality[j][pic][0] += 1
                        for j in range(2, request_num+1):
                            page = j*4 - 3
                            if page not in dict_pic_quality:
                                dict_pic_quality[page] = {}
                            for k in range((page-1)*9, page*9):
                                pic = list_pic[k]
                                if pic not in dict_pic_quality[page]:
                                    dict_pic_quality[page][pic] = [0, 0]
                                if list_result[k] >= 1:
                                    dict_pic_quality[page][pic][1] += 1
                                dict_pic_quality[page][pic][0] += 1
                    fin_result.close()
                    fin_pic.close()

    list_output = []
    for page in dict_pic_quality:
        if page <= 80:
            for pic in dict_pic_quality[page]:
                info = dict_pic_quality[page][pic]
                if info[0] >= min_show:
                    prob = float(info[1])/info[0]
                    list_output.append([page, round(prob, 3)])
    name = '4-1-pic-quality_'+st_time+'_'+end_time+'.result'
    fout = open(chart_path+name, 'w')
    output_str = str(list_output)
    fout.write('Picture quality: \n\n'+output_str)
    fout.close()


# if __name__ == '__main__':
#     quality()
def count_seq_info(st_time, end_time):
    """
    :param st_time:   start date
    :param end_time:   end date
    Main work: compute the maximum length of valid sequences, the longest run of consecutive 1s in a valid sequence,
    the maximum proportion of 1s in a valid sequence, and the distribution of user dwell times,
    then write the resulting filter thresholds to the rule.conf file
    :return:  total number of valid sequences
    """
    list_time = Function.get_time_list(st_time, end_time)
    cf = Config('data.conf')
    raw_data_path = cf.get('path', 'raw_data')
    percent_request_num = float(cf.get('filter_percent', 'request_num'))
    percent_sub_seq_length = float(cf.get('filter_percent', 'sub_seq_length'))
    percent_click_percent = float(cf.get('filter_percent', 'click_percent'))
    act_type = int(cf.get('act_type', 'value'))

    total = 0                                # total number of valid sequences
    dict_request_distribution = {}           # distribution of sequence lengths (in requests), used to find the length covering 95% of sequences  {request: count}
    dict_sub_seq_len_distribution = {}       # distribution of the longest run of consecutive 1s per sequence   {len: count}
    dict_click_percent_distribution = {}     # distribution of the proportion of 1s per sequence                {percent: count}
    dict_act_time_distribution = {}          # dwell time per request; the act_time field holds request-1 dwell times,
                                             # indexed by request number; index 0 is meaningless  {request: [user_num, total_time]}

    for day in list_time:
        input_path = raw_data_path + day
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in = input_path + '\\full_' + temp_name + str(i)
                if os.path.exists(file_in):
                    fin = open(file_in, 'r')
                    dict_file = eval(fin.read())
                    for idfa in dict_file:
                        list_result = dict_file[idfa]['result']
                        act_time = dict_file[idfa]['act_time']
                        click_num = 0
                        for r in list_result:
                            if r >= act_type:
                                click_num += 1
                        if click_num == 0:   # drop all-zero sequences: act_type 1 drops sequences with no clicks, 2 drops sequences with no saves
                            continue
                        length = len(list_result)
                        request = length/36
                        if request not in dict_request_distribution:  # sequence length distribution
                            dict_request_distribution[request] = 0
                        dict_request_distribution[request] += 1
                        temp_value = get_sub_seq_cnt(list_result, act_type, length)
                        if temp_value not in dict_sub_seq_len_distribution:  # distribution of the longest runs of consecutive 1s
                            dict_sub_seq_len_distribution[temp_value] = 0
                        dict_sub_seq_len_distribution[temp_value] += 1
                        temp_value = int(float(click_num)/length*100)
                        if temp_value not in dict_click_percent_distribution:
                            dict_click_percent_distribution[temp_value] = 0
                        dict_click_percent_distribution[temp_value] += 1
                        for j in range(1, request):    # index 0 is meaningless, skip it
                            if j not in dict_act_time_distribution:
                                dict_act_time_distribution[j] = [0, 0]
                            dict_act_time_distribution[j][0] += 1
                            dict_act_time_distribution[j][1] += act_time[j]
                        total += 1
                    fin.close()
        print 'finish ', day

    preserve_request_num = int(total*percent_request_num)
    preserve_sub_seq_length = int(total*percent_sub_seq_length)
    preserve_click_percent = int(total*percent_click_percent)

    cf = Config('rule.conf')   # write the filter thresholds to the rule.conf file
    # find max_request_num
    length = len(dict_request_distribution)
    temp = 0
    for i in range(1, length+1):
        if temp < preserve_request_num:
            temp += dict_request_distribution[i]
        else:
            temp = i-1
            break
    cf.set('filter_value', 'max_request_num', str(temp))
    # end

    # find max_sub_seq_length
    length = len(dict_sub_seq_len_distribution)
    temp = 0
    for i in range(1, length+1):
        if temp < preserve_sub_seq_length:
            temp += dict_sub_seq_len_distribution[i]
        else:
            temp = i-1
            break
    cf.set('filter_value', 'max_sub_seq_length', str(temp))
    # end

    # find max_click_percent
    length = len(dict_click_percent_distribution)
    temp = 0
    for i in range(0, length+1):
        if temp < preserve_click_percent:
            temp += dict_click_percent_distribution[i]
        else:
            temp = i-1
            break
    temp = round(float(temp)/100, 2)  # convert to a fraction
    cf.set('filter_value', 'max_click_percent', str(temp))
    # end

    # compute the distribution of user dwell time per request
    length = len(dict_act_time_distribution)
    temp = ''
    list_average_time = []
    for i in range(1, length+1):
        total_seq = dict_act_time_distribution[i][0]
        total_time = dict_act_time_distribution[i][1]
        average_time = round(float(total_time)/total_seq, 1)
        temp += str(average_time) + ','
        list_average_time.append([i, average_time])
    temp = temp.strip(',')
    cf.set('filter_value', 'act_time', temp)
    # end
    cf.write()          # write the updated thresholds to disk
    return total
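# The three threshold loops above all perform the same percentile scan over a
# {value: count} distribution; a standalone sketch of the idea (hypothetical helper,
# not part of the original module):
# def percentile_cutoff(distribution, keep_fraction, total):
#     covered = 0
#     for value in sorted(distribution):
#         covered += distribution[value]
#         if covered >= total * keep_fraction:
#             return value
#     return max(distribution)
# max_request_num = percentile_cutoff(dict_request_distribution, percent_request_num, total)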
Example #18
import os
import sys
from flask import Flask
from flask_restful import Api
from flask_login import LoginManager
from flask_wtf.csrf import CSRFProtect
from flask_sqlalchemy import SQLAlchemy

## initialize the workpath value
workpath = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
sys.path.append('%s' % (workpath))

## import private pkgs
from lib.Config import Config

## load config
config = Config(workpath)
MARIADB_HOST = config.MARIADB_HOST
MARIADB_PORT = config.MARIADB_PORT
MARIADB_USER = config.MARIADB_USER
MARIADB_PASSWORD = config.MARIADB_PASSWORD
MARIADB_DATABASE = config.MARIADB_DATABASE

## initialize some global values
db = SQLAlchemy()
login_manager = LoginManager()
csrf = CSRFProtect()


## getApp func
def getApp(name):
    ## init flask
class Application( Interface ) :
	def __init__( self , configFile , visible = True ) :
		self.configFile = configFile
		self.config = Config( self.configFile )
		self.config.fetch( )

		self.visible = visible
		self.password = None

		self.__prepareConfig( )

		self.controller = Controller( self )
		self.controller.configure( )

	def getDBA( self , path , query = None ) :
		conn = SQLite( creator = self , filename = query )
		conn.prepare( path )

		return conn

	def prepareSQLite3( self ) :
		self.conn = self.getDBA( self.config[ "db" ][ "path" ] , self.config[ "db" ][ "query" ] )

		for entity in self.conn.config[ "handler" ] :
			consumer = SQLiteConsumer( entity = entity , creator = self.conn )
			setattr( self , entity , consumer )

		return self

	def __prepareConfig( self ) :
		self.config[ "libPath" ] = self.preparePath( self.config[ "libPath" ] )
		self.config[ "driverPath" ] = self.preparePath( self.config[ "driverPath" ] )

		for key in self.config[ "gui" ][ "icon" ] :
			self.config[ "gui" ][ "icon" ][ key ] = self.preparePath( self.config[ "gui" ][ "icon" ][ key ] )

		for key in ( "query" , "path" , "recrypt" ) :
			self.config[ "db" ][ key ] = self.preparePath( self.config[ "db" ][ key ] )

		self.config[ "gui" ][ "widgetPath" ] = self.preparePath( self.config[ "gui" ][ "widgetPath" ] )

		return self

	def preparePath( self , path , add = '..' ) :
		args = [ os.path.dirname( __file__ ) ]

		if add is not None :
			args.append( add )

		args.append( path )
		result = os.path.abspath( os.path.join( * args ) )

		return result

	def prepare( self ) :
		self.ui = UI( None , config = self.config , creator = self , visible = self.visible )

		self.password = Password( self.ui )
		self.password.prepare( )

		result = self.password.login( )
		if not result :
			self.exception( "invalid_password" )

		self.prepareSQLite3( )
		self.ui.prepare( )

		return self

	def execute( self ) :
		if self.visible :
			self.ui.execute( )

		return self

	def action( self , action , * argv , ** kwargs ) :
		error = self.config[ "error" ]
		def onError( exception ) :
			self.ui.message(
				title	= str( exception ) ,
				message	= error[ "%s_comment" % action ].format( * argv ) ,
			)
			raise exception
		try :
			target = lambda : self.controller.action( action , onError , argv , kwargs )
			finish = lambda : self.ui.update( ) and self.ui.message(
				title	= error[ "%s_title" % action ].format( * argv , ** kwargs ) ,
				message	= error[ "%s_comment" % action ].format( * argv , ** kwargs ) ,
			)
			self.run( target = target , finish = finish )
		except Exception as exception :
			onError( exception )

			return False
		return True

	def run( self , target , finish ) :
		if not self.visible :
			target( )

			return finish( )

		onFinish = lambda: self.ui.update( ) and finish( )

		if not self.thread( target = target , finish = onFinish ) :
			self.exception( "unknown" )

		return False
Example #20
 def get(columns,
         ticker,
         dtime_from=None,
         dtime_to=None,
         date=None,
         market_hours=True,
         order='asc'):
     """
     Get minutely quote values for the given time range or date
     :param columns: Iterable set of columns that should be obtained
     :param ticker: Ticker name
     :param dtime_from: When selecting by time range: Time range start
     :param dtime_to: When selecting by time range: Time range end
     :param date: When selecting by date: Date
     :param market_hours: Should the data be obtained only within market hours?
     :param order: Records datetime order: 'asc' - old first, 'desc' - new first
     :return: Iterable generator with tuples for dtime + specified columns
     """
     if Data._db is None:
         Data._db = DB().db
         Data._cursor = Data._db.cursor()
     if isinstance(columns, str):
         # Specified indicators group: need to get all of them
         columns = Config.get('dbstructure.' + columns).keys()
     # MySQL tables that will be used and inner columns
     tables = []
     query_columns = []
     for column in columns:
         for table, table_indicators in Config.get('dbstructure').items():
             if column in table_indicators:
                 if len(query_columns) == 0:
                     query_columns.append('`%s`.`dtime`' % table)
                 query_columns.append('`%s`.`%s`' % (table, column))
                 if table not in tables:
                     tables.append(table)
                 break
     if len(tables) == 0:
         return
     query = 'select ' + ', '.join(query_columns) + ' from `' + '`, `'.join(
         tables) + '` '
     query += 'where `%s`.`ticker` = "%s" ' % (tables[0], ticker)
     if date is not None:
         if isinstance(date, datetime.date):
             date = datetime.datetime.combine(date, datetime.time(0, 0))
         elif isinstance(date, str):
             date = datetime.datetime.strptime(date, '%Y-%m-%d')
         if market_hours:
             dtime_from = tradetime.daystart(date)
             dtime_to = tradetime.dayend(date)
         else:
             dtime_from = date.replace(hour=0, minute=0)
             dtime_to = date.replace(hour=23, minute=59)
     query += 'and `%s`.`dtime` between "%s" and "%s" ' % (
         tables[0], dtime_from, dtime_to)
     if market_hours and date is None:
         query += 'and time(`%s`.`dtime`) between "%s" and "%s" ' % (
             tables[0], tradetime.start, tradetime.end)
     # Joining tables
     for table in tables:
         if table == tables[0]:
             continue
         query += 'and `%s`.`ticker` = `%s`.`ticker` ' % (table, tables[0])
         query += 'and `%s`.`dtime` = `%s`.`dtime` ' % (table, tables[0])
     query += 'order by `dtime` ' + order
     Data._cursor.execute(query)
     for entry in Data._cursor:
         yield entry
     Data._db.commit()
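# Usage sketch (hypothetical ticker and column name; the dbstructure section of the
# config decides which MySQL table each column lives in):
# for dtime, close_price in Data.get(['close'], 'AAPL', date='2017-03-01'):
#     print dtime, close_price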
Example #21
import os, time
from selenium import webdriver
from behave import given, then, when
from lib.Config import Config
from functions.Vimbox_Login import Vimbox_Login
from support.Vimbox_Login_Page import Vimbox_Login_Page
from selenium.common.exceptions import TimeoutException

cf = Config()
log = Vimbox_Login()
lg = Vimbox_Login_Page()

vimboxstaging = cf.get_config('config/config.ini', 'links', 'vimboxstaging')
vimboxprod = cf.get_config('config/config.ini', 'links', 'vimboxprod')


@given(u'I navigate to Vimbox login page')
def step_impl(context):
    if (os.environ['location'] == "staging"):
        context.browser.get(vimboxstaging)
    elif (os.environ['location'] == "prod"):
        context.browser.get(vimboxprod)
    time.sleep(3)
    #print(lg.header_login(context))
    #assert (lg.header_login(context)).is_displayed()


#@when(u'I log in with "*****@*****.**" and "111111"')
@when(u'I log in with "{email}" and "{password}"')
def step_impl(context, email, password):
    try:
Example #22
 def __init__(self):
     self.db = Database()
     self.config = Config().getConfig()
     self.key = self.config['server_settings']['app_key']
def turn_probability(st_time, end_time, behavior, min_page, max_page, back_st, back_end, max_click):
    """
    behavior:   save or view
    min_page:   shortest sequence (in pages) included in the statistics
    max_page:   longest sequence (in pages) included in the statistics
    max_click:  maximum number of clicked pictures to count
    back_st:    start of the look-back window
    back_end:   end of the look-back window
    {req1: {0: [a, b], 1: [a, b]}}  req1: sequence length; 0: number of pictures clicked in the last K pages; a: users who did not page on; b: users who did
    probability = (b)/(a+b)
    """
    cf_data = Config('data.conf')
    fin_path = cf_data.get('path', 'filter_data')
    if 'all' == behavior:
        act_value = 1
        # fin_path = cf_data.get('path', 'filter_data')
    elif 'view' == behavior:
        act_value = 1
        # fin_path = cf_data.get('path', 'view_data')
    elif 'save' == behavior:
        act_value = 2
        # fin_path = cf_data.get('path', 'save_data')
    else:
        return False
    chart_path = cf_data.get('path', 'chart_result')

    list_time = Function.get_time_list(st_time, end_time)
    dict_result = {}
    dict_distribution = {}
    for i in range(min_page, max_page+1):
        dict_result[i] = {}
    for day in list_time:
        input_path = fin_path + day
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in_result = input_path + '\\result_' + temp_name + str(i)
                if os.path.exists(file_in_result):
                    fin_result = open(file_in_result, 'r')
                    while True:
                        line_result = fin_result.readline()
                        if not line_result:
                            break
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        length = len(list_result)
                        for index, item in enumerate(list_result):
                            list_result[index] = int(item)
                        not_null_position = 0
                        for m in range(length-1, length-28, -1):  # only consider the last 3 pages
                            if list_result[m] >= 1:
                                not_null_position = m
                                break
                        if not_null_position == 0:
                            not_null_position = length - 28
                        not_null_position += 1   # the array is 0-indexed; add 1 to make the page calculation easier
                        if not_null_position % 9 == 0:
                            page_num = not_null_position/9
                        else:
                            page_num = not_null_position/9 + 1
                        dict_distribution.setdefault(page_num, 0)
                        dict_distribution[page_num] += 1
                        if page_num < min_page:
                            continue
                        if page_num > max_page:
                            page_num = max_page + 1
                        for page in range(min_page, page_num):  # note page < page_num here, strictly less than
                            start = (page - back_st) * 9
                            end = (page - back_end) * 9
                            click_num = 0
                            for j in range(start, end):
                                if list_result[j] >= act_value:
                                    click_num += 1
                            dict_result[page].setdefault(click_num, [0, 0])
                            dict_result[page][click_num][1] += 1
                        # count the sequence's final action; when a sequence exceeds the counted length, all its actions in the given range are treated as page turns
                        if page_num <= max_page:
                            start = (page_num - back_st) * 9
                            end = (page_num - back_end) * 9
                            click_num = 0
                            for j in range(start, end):
                                if list_result[j] >= act_value:
                                    click_num += 1
                            dict_result[page_num].setdefault(click_num, [0, 0])
                            dict_result[page_num][click_num][0] += 1
                    fin_result.close()
        print 'turn probability:  ', day
    list_distribution = []
    for p in dict_distribution:
        num = dict_distribution[p]
        list_distribution.append([p, num])
    list_distribution.sort(key=lambda x: x[0])
    print list_distribution
    # print
    file_name = '6-turn-probability-start-' + str(back_st) + '-end-' + str(back_end) + '-' + behavior + '.result'
    fout = open(chart_path+file_name, 'w')
    for page in dict_result:
        a = []
        for i in range(0, max_click+1):
            if i in dict_result[page]:
                temp = float(dict_result[page][i][1])/(dict_result[page][i][1]+dict_result[page][i][0])
                a.append([i, round(temp, 3)])
            else:
                print str(i) + ' is not in request: ' + str(page)
        fout.write("{ 'name': 'page= " + str(page) + "', 'data': ")
        fout.write(str(a))
        fout.write('}, \n')
    fout.close()
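# Worked example of the probability above: if, among users whose sequence reached page p,
# 30 clicked exactly 2 pictures in the look-back window and then stopped (a = 30) while
# 70 clicked 2 and paged on (b = 70), then for click_num = 2 on page p:
# probability = float(70) / (30 + 70)    # = 0.7   (the numbers are illustrative only)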
Example #24
def count_pic_info(st_time, end_time, min_show_num, behavior):
    cf_data = Config('data.conf')
    filter_data = cf_data.get('path', 'filter_data')
    dataset_path = cf_data.get('path', 'dataset_path')
    list_time = Function.get_time_list(st_time, end_time)

    # {pic: {'2014-11-04:02': [show, click, page]... }}  hour added: the key is a date + hour combination
    dict_result = {}
    for day in list_time:
        input_path = filter_data + day
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                hour_info = temp_name + str(i)
                file_in_pic = input_path + '\\pic_' + hour_info
                file_in_result = input_path + '\\result_' + hour_info
                if os.path.exists(file_in_result):
                    fin_pic = open(file_in_pic, 'r')
                    fin_result = open(file_in_result, 'r')
                    full_time = day + ':' + hour_info
                    while True:
                        line_pic = fin_pic.readline()
                        line_result = fin_result.readline()
                        if not line_result:
                            break
                        if behavior == 'all':
                            pass
                        elif behavior == 'save':
                            if '2' not in line_result:
                                continue
                        elif behavior == 'click':
                            if '2' in line_result:
                                continue
                        list_pic = line_pic.strip('\n').strip(' ').split(' ')
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        length = len(list_result)
                        for index, item in enumerate(list_result):
                            list_result[index] = int(item)
                        for j in range(0, length):
                            page = j/9 + 1
                            picture = list_pic[j]
                            if picture not in dict_result:
                                dict_result[picture] = {}
                            if full_time not in dict_result[picture]:
                                dict_result[picture][full_time] = [0, 0, 0, page]  # [show, click, save, page]
                            if list_result[j] >= 1:
                                dict_result[picture][full_time][1] += 1
                            if list_result[j] == 2:
                                dict_result[picture][full_time][2] += 1
                            dict_result[picture][full_time][0] += 1
                    fin_pic.close()
                    fin_result.close()
        print 'pic info ' + behavior + ': ', day

    dict_output = {}
    for p in dict_result:
        if p not in dict_output:
            dict_output[p] = {}
        for full_time in dict_result[p]:
            if dict_result[p][full_time][0] > min_show_num:
                if full_time not in dict_output[p]:
                    dict_output[p][full_time] = dict_result[p][full_time]
    fout = open(dataset_path+'pic_info_'+behavior, 'w')
    fout.write(str(dict_output))
    fout.close()
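# Usage sketch (hypothetical date range and threshold): the page index above comes from
# j/9 + 1 (9 pictures per page), and only (picture, hour) entries shown more than
# min_show_num times survive into the pic_info_<behavior> output file.
# count_pic_info('2014-11-04', '2014-11-08', min_show_num=10, behavior='all')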
Example #25
import os
import sys
from lib.Log import Log
from lib.Mail import Mail
from lib.Config import Config

if __name__ == '__main__':

    # initial val
    BASE_PATH = os.path.abspath(os.path.dirname(__file__))

    # read config
    configObj = Config(BASE_PATH)
    logObj = Log(configObj)
    logger = logObj.get_logger()

    #debug print
    logger.debug('MAIL_ENABLE {}'.format(configObj.MAIL_ENABLE))
    logger.debug('MAIL_TO {}'.format(configObj.MAIL_TO))
    logger.debug('MAIL_USER {}'.format(configObj.MAIL_USER))
    logger.debug('MAIL_HOST {}'.format(configObj.MAIL_HOST))
    logger.debug('MAIL_PORT {}'.format(configObj.MAIL_PORT))
    logger.debug('LOG_PATH {}'.format(configObj.LOG_PATH))
    logger.debug('LOG_FILE {}'.format(configObj.LOG_FILE))
    logger.debug('LOG_MAX_SIZE {}'.format(configObj.LOG_MAX_SIZE))
    logger.debug('LOG_BACKUP_COUNT {}'.format(configObj.LOG_BACKUP_COUNT))

    # argc check
    if len(sys.argv) != 2:
        logger.error('Need User')
        sys.exit(configObj.ERROR_PROC_EXIST)
Example #26
def check_pic_info(random_number):
    str_pic = '197527041 196850177 169307649 202538753 170688513 177932801 182428161 212835841 209407745 204205825 214706689 204553729 211788545 196386561 170675457 214011905 207622145 185725441 208763649 210243841 210444033 210747137 207957505 173206785 213307905 190229761 193104897 171358209 206881793 215523585 205202945 201199105 215603969 191766529 214142977 181751041 188125697 209100801 210708481 186520065 209859329 208289281 193782273 202748673 212727041 208311297 175623169 208612865 211126017 211327489 181983233 179341057 187813121 205326593 199076097 214440705 214274817 210979329 206475777 214732289 185527041 202633729 209618689 211166465 200666369 214833153 204839169 175158017 209789441 215783937 208269825 206161921 174278657 212537089 208788225 204054017 214426369 193862401 203181825 200229121 207453697 188764417 175597057 199475457 178886913 177277185 208713985 215192065 215784193 189293569 215895297 209502465 207194625 170940161 193620993 213532417 215925249 200204289 205732609 206236673 206545921 177959169 170391553 180745217 194121729 216031233 187628545 209457921 212970497 187008513 205189121 204934401 197919233 189073409 209239553 202015489 188410881 203777025 200021505 184615681 205920769 178274305 193947905 200229377 208807425 204143617 199070465 214663425 210651905 178637313 210027521 215450625 205592577 212940545 170798337 203943937 184085761 182784769 209170689 178831105 195121153 210738177 178643457 204138241 194606593 185155073 216038145 204730881 211654145 208805889 187714817 175150337 208549889 188665345 213055745 203551489 211592705 186966529 173075713 215181313 202747905 199952897 189408769 181437441 209678849 172144129 174901505 175053057 175577857 201722113 208739841 181532929 209652225 175000065 202756865 202661121 187078913 215435521 186097665 203540737 187592705 199155457 173236225 194833921 205630977 204995073 214218497 188132865 184352001 210410753 173387265 203648001 201346305 207839233 207394817 190121217 189686529 202668033 181878785 174046465 177633537 178672385 195630593 186370817 181519105 214298113 194764801 172657665 183672065 213577985 179901185 210092289 208679425 178843905 174277633 177282817 189972737 171021825 183393537 208600833 212883201 192785921 209618177 182876929 215339009 171900161 204891649 202568193 179253761 190235649 210234113 197861889 187781377 202199553 173247745 209736449 208772865 198964481 178134273 206478337 203645953 211165441 194953473 204486401 193125377 202082817 198943489 192710913 207714049 192530177 171751937 195489793 213115393 188101889 195768065 198128641 184427265 189868289 199888385 197713153 194216193 184646401 172246529 207778561 175603713 178387713 192675585 198859009 184976641 192704769 209923841 180227329 197135361 190587649 201853697 204339201 204552193 208868865 205424129 183856385 175333633 200901121 204067329 193540353 172529153 205302273 182146049 176279041 201914113 187205377 207936001 205519617 202055425 187021057 201612801 209710081 211150849 178468097 204935169 189964289 209219585 203749121 213003265 175523585 172657153 192377345 205426689 206051073 171140097 198384385 184257537 214959361 186596097 212151809 205601793 200412673 199226625 210645505 186173953 196251137 204208385 198212353 184572417 182971905 209527809 173465345 200628993 209422337 67783681 206845441 181028097 200671489 199790081 207805697 212732417 205889537 203145985 213715969 181955585 210722817 175601409 175216129 171403777 203735553 207956737 171098881 177841665 205183489 168609537 193643521 173684993 193317121 204113665 172000769 
199941121 213927169 203932417 191957761 203851009 187266561 199079681 212689409 180775937 207479041 188472065 207668225 186130945 205915905 212568321 181888257 200217857 204106497 212614913 192274689 215193089 194830081 215457281 211421953 181885697 169379073 190236673 204065025 200225025 205492481 200749825 202629633 190964225 204996609 212483585 213862145 205876481 181498881 205777665 205915649 198552321 189969409 212724737 206286849 186893569 209315585 179950337 194948353 201123329 199812609 209227521 173998081 204340993 207058433 174661889 202850049 204044289 184683265 214615041 176474625 177023745 210622721 200493057 210856449 209644033 187065601 191379713 197689601 176793089 204053505 206313985 192669953 212480001 211174145 199007489 192608257 173867521 171339265 206557441 204728321 209442817 193135873 189960193 187291905 178274817 210712833 210639105 189111297 191942401 187475201 203026433 195970049 181888001 211237121 190508801 208119041 190825985 198075393 203735297 180745473 190447873 200061441 208529409 169531137 203305729 207370241 205424897 205189889 189741057 210743041 170919425 205840385 205603841 200981505 198593537 213958913 169309441 193891841 186752257 193892609 187699969 202201601 204018177 190360321 203957761 208784897 205733377 205425409 209210625 192709121 201198337 170069249 203027201 203646721 195597825 172328193 184493825 182272513 174164737 192759809 214009601 198318337 204115713 201954305 210756353 208796673 170219009 173443841 209505793 192376321 204731137 194499585 191666945 169351425 209379329 204486913 181393153 211377409 191767809 199684097 194491649 169311745 211424769 198640129 178276097 197444609 209424641 214196993 213983489 214011137 205728001 189389569 184243457 193984257 212098561 184994561 173236993 200446465 179312385 205728257 173303041 176171009 196543745 172039681 213793025 198550785 210169601 215025665 211439873 181516033 200689921 199961857 213717761 204733697 182245121 174135041 209171713 214657281 207742465 181792257 206326273 211166209 199238401 215495425 215894273 213496577 187684353 202212097 187663105 203263489 179849473 184833537 209895169 200193793 203943169 207122945 207053313 207377153 211024129 183643905 189710849 179088641 179092737 215559937 171033089 204151041 179952641 170220033 206644481 205005057 190533633 203273217 214822913 200256001 188034305 189415937 215558913 205322497 194461441 174089729 195262721 200254465 186843905 203079425 186581761 197744129 188500737 206826753 174026497 169319425 173207041 209326849 205417217 191591425 204059905 189897217 201300737 186430465'

    list_pic = str_pic.strip(' ').split(' ')
    set_random = set()  # 20 randomly selected pictures in total
    dict_check_result = {}   # count total impressions and clicks per picture, not broken down by page
    for i in range(0, random_number):
        random_pic = random.choice(list_pic)
        set_random.add(random_pic)
    for pic in set_random:
        if pic not in dict_check_result:
            dict_check_result[pic] = [0, 0]

    cf_data = Config('data.conf')
    filter_data = cf_data.get('path', 'filter_data')
    list_time = Function.get_time_list('2014-11-04', '2014-11-08')
    for day in list_time:
        input_path = filter_data + day
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in_pic = input_path + '\\pic_' + temp_name + str(i)
                file_in_result = input_path + '\\result_' + temp_name + str(i)
                if os.path.exists(file_in_result):
                    fin_pic = open(file_in_pic, 'r')
                    fin_result = open(file_in_result, 'r')
                    while True:
                        line_pic = fin_pic.readline()
                        line_result = fin_result.readline()
                        if not line_result:
                            break
                        list_pic = line_pic.strip('\n').strip(' ').split(' ')
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        length = len(list_result)
                        for j in range(0, length):
                            pic = list_pic[j]
                            if pic in dict_check_result:
                                dict_check_result[pic][0] += 1
                                if int(list_result[j]) > 0:
                                    dict_check_result[pic][1] += 1
                    fin_pic.close()
                    fin_result.close()
        print 'finish: ', day

    dict_mongo = {}
    mongo = Mongo('kdd', 'pic_click_info')
    for pic in dict_check_result:
        if pic not in dict_mongo:
            dict_mongo[pic] = [0, 0]
        record = mongo.collection.find({'pid': pic}, {'_id': 0})
        if record.count() > 0:
            record = record[0]
            for day in list_time:
                if day in record:
                    for page in record[day]:
                        dict_mongo[pic][0] += record[day][str(page)][0]
                        dict_mongo[pic][1] += record[day][str(page)][1]
        else:
            print 'miss', pic
    mongo.close()

    # compare
    valid = 0
    for pic in dict_check_result:
        if dict_mongo[pic][0] == dict_check_result[pic][0] and dict_mongo[pic][1] == dict_check_result[pic][1]:
            valid += 1
        else:
            print 'pic not valid: ', pic
    print '========= valid: ', str(valid)

    print dict_check_result
    print dict_mongo
def split_dataset(st_time, end_time):
    cf_data = Config('data.conf')
    fin_path = cf_data.get('path', 'filter_data')
    view_path = cf_data.get('path', 'view_data')
    save_path = cf_data.get('path', 'save_data')

    list_time = Function.get_time_list(st_time, end_time)
    view_cnt = 0
    save_cnt = 0
    for day in list_time:
        input_path = fin_path + day
        view_output_path = view_path + day
        save_output_path = save_path + day
        if not os.path.exists(view_output_path):
            os.mkdir(view_output_path)
        if not os.path.exists(save_output_path):
            os.mkdir(save_output_path)
        if os.path.exists(input_path):
            for i in range(0, 24):
                temp_name = ''
                if i < 10:
                    temp_name = '0'
                file_in_pic = input_path + '\\pic_' + temp_name + str(i)
                file_in_result = input_path + '\\result_' + temp_name + str(i)
                if os.path.exists(file_in_result):
                    file_view_pic = view_output_path + '\\pic_' + temp_name + str(i)
                    file_view_result = view_output_path + '\\result_' + temp_name + str(i)
                    file_save_pic = save_output_path + '\\pic_' + temp_name + str(i)
                    file_save_result = save_output_path + '\\result_' + temp_name + str(i)
                    fout_view_pic = open(file_view_pic, 'w')
                    fout_view_result = open(file_view_result, 'w')
                    fout_save_pic = open(file_save_pic, 'w')
                    fout_save_result = open(file_save_result, 'w')

                    fin_pic = open(file_in_pic, 'r')
                    fin_result = open(file_in_result, 'r')
                    while True:
                        line_pic = fin_pic.readline()
                        line_result = fin_result.readline()
                        if not line_result:
                            break
                        list_result = line_result.strip('\n').strip(' ').split(' ')
                        for index, item in enumerate(list_result):
                            list_result[index] = int(item)
                        if 2 in list_result:   # a 2 in the result row marks a save session; otherwise the whole row is view
                            fout_save_pic.write(line_pic)   # line_pic already ends with a newline, so no explicit '\n' is needed
                            fout_save_result.write(line_result)
                            save_cnt += 1
                        else:
                            fout_view_pic.write(line_pic)
                            fout_view_result.write(line_result)
                            view_cnt += 1
                    fout_view_pic.close()
                    fout_view_result.close()
                    fout_save_pic.close()
                    fout_save_result.close()
                    fin_pic.close()
                    fin_result.close()
        print 'split ', day
    print 'view: ', str(view_cnt)
    print 'save: ', str(save_cnt)
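
# Minimal usage sketch (illustrative addition, not from the original example): the
# arguments are 'YYYY-MM-DD' date strings, matching what Function.get_time_list expects.
if __name__ == '__main__':
    split_dataset('2014-11-04', '2014-11-08')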
Example #28
from flask import Flask, render_template, redirect, request, jsonify, session, url_for

from lib.Config import Config
from lib.Database import Database
from lib.Users import Users

app = Flask(__name__)

# Initial configuration
if __name__ == "__main__":
    config = Config().getConfig()
    app.secret_key = config['server']['appKey']
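
# Note: Flask's session support requires app.secret_key; as written it is only
# assigned when this module is run directly (inside the __main__ guard above).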


# Routes
@app.route("/")
def index():
    if 'username' not in session:
        return redirect(url_for('login'))
    return render_template("index.html", title="Main page")


@app.route("/login", methods=['GET', 'POST'])
def login():
    error = request.args.get('error') or None
    if 'username' in session:
        return redirect(url_for('index'))

    if request.method == 'POST':
        user = request.form['username']
        passw = request.form['password']
Example #29
import os, time, random
from selenium import webdriver
from behave import given, then, when
from support.Gmail_Page import Login
from support.Gmail_Page import Inbox
from functions.Gmail_Login import Gmail_Login
from lib.Config import Config

cf = Config()
gmail = Gmail_Login()
inbox = Inbox()
login = Login()

url = cf.get_config('config/config.ini', 'gmail', 'url')
logout = cf.get_config('config/config.ini', 'gmail', 'logout')

@given(u'I sign in to Gmail')
def impl(context):
    gmail.signin(context)
    time.sleep(3)

@when(u'I click on the Compose Button')
def step_impl(context):
    inbox.composebutton(context)
    time.sleep(2)
    assert inbox.composedialog(context)

@then(u'I sign-Out from the gmail account')
def step_impl(context):
    gmail.logout(context)
Example #30
 def __init__(self):
     self.config = Config().getConfig()
Example #31
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# send_msg.py
import pika
import json
from lib.Config import Config
from lib.rabbitMq_login import rabbitMq_login
ini = Config('../conf/dataSource.ini')

credentials = pika.PlainCredentials('omp_test', '!QAZ@WSX')  # note the RabbitMQ username and password here
connection = pika.BlockingConnection(pika.ConnectionParameters(
                                     '10.165.196.104',
                                     5672,
                                     '/omp_test_host',
                                     credentials))
channel = connection.channel()

arguments = {"x-message-ttl":604800000}
channel.queue_declare(queue='p4p_adv_audit_queue.yml',durable='True',arguments=arguments)
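# x-message-ttl is given in milliseconds: 604800000 ms = 7 days, after which
# unconsumed messages in the queue expire.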
body_dict =  {"agentId":1007,
         "agentName":"北京派瑞威行广告有限公司(luye)",
         "channel":2,
         "credentials":[
             {
                 "cid":1,
                 "isPerpetual":1,
                 "name":"组织机构代码证",
                 "relationId":1793,
                 "status":0,
Example #32
import redis

from lib.Config import Config  # assumed import path, consistent with the other examples


class RedisApi:

    ini = Config(r'E:\python_code\ApiAutoTest\conf\dataSource.ini')
    dict_item = {}
    list_item = ini.get_item_by_section('redis')
    print(list_item)
    for k, v in list_item:
        dict_item[k] = v

    def __init__(self):
        host = self.dict_item['redis_host']
        port = self.dict_item['redis_port']
        password = self.dict_item['redis_password']
        self.pool = redis.ConnectionPool(host=host,
                                         port=port,
                                         password=password)
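
    # Design note: one ConnectionPool is created per RedisApi instance; the client
    # returned by connect() borrows connections from this pool instead of opening a
    # new TCP connection for every command.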

    def connect(self):
        self.redis = redis.Redis(connection_pool=self.pool)
        return self.redis

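    # ConnectionPool.disconnect() closes every connection currently held by the pool.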
    def disconnect(self):
        return self.pool.disconnect()

    '''
    String operations
    '''

    # set a value in redis, with an optional TTL in seconds
    def set(self, key, value, ttl=None):
        if not isinstance(key, str):
            print("the key is not a string")
        self.redis.set(key, value, ex=ttl)

    # get a value
    def get(self, key):
        if not isinstance(key, str):
            print("the key is not a string")
        if not self.connect().exists(key):
            print("the key does not exist")
        val = self.redis.get(key)
        return val

    # set multiple key/value pairs at once
    def mset(self, **kwargs):
        self.redis.mset(**kwargs)

    # get multiple values at once
    def mget(self, keys):
        return self.redis.mget(keys)

    # set a new value and return the previous value
    def getset(self, key, value):
        if not isinstance(key, str):
            print("the key is not a string")
        old_value = self.redis.getset(key, value)
        return old_value

    # get a substring of the stored value by byte offsets
    def getrange(self, key, start, end):
        if not isinstance(key, str):
            print("the key is not a string")
        return self.redis.getrange(key, start, end)

    def keys(self):
        flag = self.connect().keys()
        return flag

    '''
    Hash operations
    '''

    # get the value stored under key in the hash called name
    def hget(self, name, key):
        if not isinstance(name, str):
            print("the name is not a string")
        if not self.redis.exists(name):
            print("the name does not exist")
        val = self.redis.hget(name, key)
        return val