Example 1
def get_view_profile_rowkeys(begin, end, mode='news'):
    # roughly 300k topic rows and 800k news rows per month; this does not need to be fast
    print 'begin', begin, timestamp2datetime(begin)
    print 'end', end, timestamp2datetime(end)

    # pick the solr collection
    solr = solr_np if mode == 'news' else solr_tp

    # normalize the timestamps to milliseconds
    begin = ensure_m_timestamp(begin)
    end = ensure_m_timestamp(end)
    print begin
    print end

    # estimate how many rows to request
    days = (end - begin) / 86400 / 1000
    rows = days * every_day_topic_cnt if mode == 'topic' else days * every_day_news_cnt
    print 'rows', rows
    # build the solr query
    solr_query = SolrQuery()
    q = '*:*'
    solr_query.set('q', q)
    solr_query.add('fq', 'event_time:[%s TO %s]' % (begin, end))
    solr_query.set('rows', rows)
    solr_query.set('fl', ['id'])

    rowkey_list = [
        item['id'] for item in solr.search(**solr_query.get_query_dict())
    ]
    print len(rowkey_list)
    return rowkey_list
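A minimal usage sketch (an assumption, not part of the original module): the caller can pass second-resolution epochs, since the function normalizes them with ensure_m_timestamp before querying solr.

import time

# hypothetical call: collect the news rowkeys indexed during the last 24 hours
end = time.time()
begin = end - 86400
rowkeys = get_view_profile_rowkeys(begin, end, mode='news')
print 'got %d rowkeys' % len(rowkeys)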
Example 2
def get_user_qa_content(uid, begin, end):
    # no longer used; the data is fetched from hbase instead
    # fetch the full text of all QA records for a user within a time range
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))

    all_qa_text = []

    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
           % (
               uid, begin_ds, end_ds
           )

    o1 = get_medicaldb_handler().do_one(sql1)
    if o1 is None or len(o1) == 0:
        return all_qa_text

    for item in o1:
        problem_id = item[0]
        sql = 'select content from ask_problemcontent where problem_id=%s;' % problem_id
        o = get_medicaldb_handler().do_one(sql)
        if o is None or len(o) == 0:
            continue

        content = o[0][0]
        content_dict = json.loads(content)[0]
        if content_dict['type'] != 'text':
            continue
        text = content_dict['text']
        all_qa_text.append(text)

    return all_qa_text
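These snippets lean on a few helpers from general_utils.time_utils that are not shown on this page. A minimal sketch of plausible implementations, inferred only from how they are used above (an assumption, not the project's actual code):

import datetime

def ensure_second_timestamp(ts):
    # assumed: accept second- or millisecond-resolution epochs (int, float or str)
    # and return seconds as a float
    ts = float(ts)
    return ts / 1000.0 if ts > 1e11 else ts

def ensure_m_timestamp(ts):
    # assumed: the millisecond counterpart of the helper above
    ts = float(ts)
    return int(ts) if ts > 1e11 else int(ts * 1000)

def timestamp2datetime(ts):
    # assumed: second-resolution epoch -> 'YYYY-MM-DD HH:MM:SS' string
    return datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')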
Example 3
def test1():
    uid = sys.argv[2]
    end = sys.argv[3]
    interval = sys.argv[4]
    end = ensure_second_timestamp(end)
    begin = end - int(interval) * 61.0
    end += int(interval) * 61.0
    print "begin", timestamp2datetime(begin)
    print "end", timestamp2datetime(end)
    user_time_event(uid, begin, end)
def get_qa_uids(begin, end):
    # fetch the user_id of every QA created between begin and end
    begin_dt = timestamp2datetime(ensure_second_timestamp(begin))
    end_dt = timestamp2datetime(ensure_second_timestamp(end))
    sql = 'select distinct user_id from ask_problem where created_time>"%s" and created_time<"%s";' % (
        begin_dt, end_dt)
    o = get_medicaldb_handler().dbhandler.do_one(sql)
    uids = set()
    for item in o:
        uid = item[0]
        uids.add(int(uid))
    return uids
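A short usage sketch with illustrative epochs: the uids of everyone who created a QA in the last day.

import time

end = time.time()
uids = get_qa_uids(end - 86400, end)
print 'active qa uids', len(uids)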
def get_view_news_data(row_prefix):
    # scan the cy_event hbase table and collect, per news/topic id, the set of
    # uids that viewed it during the last 180 days; results are dumped to
    # cy_event_<prefix>.json as one JSON object per line
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=SMALL_TIMEOUT)
    table = connection.table("cy_event")
    news_viewers = defaultdict(set)
    cnt = 0
    print time.time()
    last_ts = None
    now = time.time()
    start = now - 86400 * 180

    focused_type = ('view_news', 'view_topic')
    if row_prefix not in focused_type:
        return
    all_types = defaultdict(int)
    for key, data in table.scan(row_prefix=row_prefix):

        try:
            action_type, ts, uid = key.split('|')
        except ValueError:
            # malformed row key, skip it
            continue
        all_types[action_type] += 1

        if action_type not in focused_type:
            continue

        last_ts = ensure_second_timestamp(ts)
        if last_ts < start:
            continue
        news_id = data[CY_REAL_TIME_EVENT_ATTR_MAP[action_type]]
        news_viewers[news_id].add(uid)
        cnt += 1
        if cnt % 1000 == 0:
            print timestamp2datetime(time.time()), cnt, len(news_viewers)
    print time.time()
    print 'last_ts', last_ts

    print len(news_viewers)
    for x in all_types:
        print x, all_types[x]
    with open('cy_event_%s.json' % row_prefix, 'w') as f:
        for news_id in news_viewers:
            line = json.dumps({
                'id': news_id,
                'uids': list(news_viewers[news_id]),
                'len': len(news_viewers[news_id])
            }) + '\n'
            f.write(line)
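A short usage sketch: 'view_news' and 'view_topic' are the only prefixes the scan accepts, so a driver could simply loop over both (hypothetical wrapper, not from the original module).

# dump one cy_event_<prefix>.json file per supported event type
for prefix in ('view_news', 'view_topic'):
    get_view_news_data(prefix)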
Example 6
    def get_ask(self, uid, begin, end):
        # fetch the user's QA opening questions within a time range; begin and
        # end are timestamps and must be converted to datetime strings
        begin = timestamp2datetime(begin)
        end = timestamp2datetime(end)
        sql = 'select ask from ask_problem where user_id=%s and created_time > "%s" and created_time < "%s";' % (
            uid, begin, end)
        print "get_ask sql", sql
        o = self.dbhandler.do_one(sql)
        all_ask = []
        if o is None:
            return all_ask
        for item in o:
            ask = item[0]  # unicode
            all_ask.append(ask)
        return all_ask
Example 7
def user_time_event(uid, begin, end):
    begin = ensure_second_timestamp(begin)
    end = ensure_second_timestamp(end)
    connection = happybase.Connection('hbase_server',
                                      compat='0.90',
                                      port=19090,
                                      timeout=30000)
    table = connection.table("cy_real_time_event")
    for key, value in table.scan(row_prefix=str(uid) + '|'):
        print "all key", key
        _, ts, event_type = key.split('|')
        ts = ensure_second_timestamp(ts)
        print "all time", timestamp2datetime(ts)
        if ts >= begin and ts <= end:
            print "shoot key", key, value
            print "shoot time", timestamp2datetime(ts)
Example 8
    def get_ask_by_timestamp(self, uid, timestamp):
        if isinstance(timestamp, int):
            # integer inputs are treated as millisecond epochs; normalize to seconds
            timestamp /= 1000.0
        dt = timestamp2datetime(timestamp)  # exact created_time value to match against
        sql = 'select id,ask from ask_problem where user_id=%s and created_time="%s";' % (
            uid, dt)

        o = self.dbhandler.do_one(sql)
        if o is None or len(o) == 0:
            return None, None
        return o[0][0], o[0][1]
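A usage sketch, assuming the enclosing class is the medical-db handler wrapper used in the other examples (the handler accessor and values below are illustrative only):

handler = get_medicaldb_handler()  # hypothetical: any object exposing get_ask_by_timestamp
problem_id, ask = handler.get_ask_by_timestamp(12345, 1515153580000)  # an int is treated as milliseconds
if problem_id is None:
    print 'no problem created at that exact datetime'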
Example 9
def get_user_qa_content2(uid, begin, end):
    # fetch the full text of all QA records for a user within a time range from the hbase problem2 table
    begin_ds = timestamp2datetime(ensure_second_timestamp(begin))
    end_ds = timestamp2datetime(ensure_second_timestamp(end))

    all_qa_text = []

    sql1 = 'select id from ask_problem where user_id=%s and created_time>"%s" and created_time<"%s";' \
           % (
               uid, begin_ds, end_ds
           )

    o1 = get_medicaldb_handler().do_one(sql1)
    if o1 is None or len(o1) == 0:
        return all_qa_text

    for item in o1:
        problem_id = item[0]
        qa_texts = get_qa_texts_by_pid(problem_id)
        all_qa_text.extend(qa_texts)
    return all_qa_text
def test_insert():
    fields = [
        "created_date", "news_all_input_qa", "news_all_output_qa",
        "news_no_info_qa", "news_filtered_by_preprocessing_qa",
        "news_empty_res_qa", "news_bad_res_qa", "news_all_input_bs",
        "news_all_output_bs", "news_no_info_bs",
        "news_filtered_by_preprocessing_bs", "news_empty_res_bs",
        "news_bad_res_bs"
    ]
    test_ds = timestamp2datetime(0)
    data = [[
        test_ds, 22968, 912, 8197, 1039, 185, 674, 22960, 812, 8197, 1039, 185,
        674
    ]]
    bdp_manager = BDPManager(TABLE_NAME)
    bdp_manager.insert_data(fields, data)
    bdp_manager.commit()
def insert_one_day():
    # the id field must be set explicitly; otherwise it defaults to '' and overwrites earlier rows
    log_file_name1, ana_file_name1, bdp_file_name1, log_file_name2, ana_file_name2, bdp_file_name2, bdp_file_name = get_yesterday_log_filename(
    )
    bdp_data = pickle_from_file(bdp_file_name)
    now = time.time()
    yesterday = now - 86400.0
    ds = timestamp2datetime(now)
    date_int = int(timestamp2date(yesterday))
    fields = ["created_date", "id"]
    data0 = [ds, date_int]

    for field in bdp_data:
        cnt = bdp_data[field]
        print field, type(field)
        print cnt, type(cnt)
        fields.append(field)
        data0.append(cnt)
    print "fields", fields
    print "data0", data0

    data = [data0]
    insert_kernel(fields, data)
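The pickled bdp_data consumed by insert_one_day is assumed to be a flat dict mapping counter name to integer, matching the field list shown in test_insert above (this shape is inferred from how the dict is iterated, not confirmed by the source):

# hypothetical shape of the pickled counters read by insert_one_day
bdp_data = {
    'news_all_input_qa': 22968,
    'news_all_output_qa': 912,
    # ... one integer per counter field listed in test_insert
}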
Example 12
def test19():
    import time
    from general_utils.db_utils import get_medicaldb_handler
    from general_utils.time_utils import timestamp2datetime
    uid = sys.argv[2]
    print 'uid', uid
    t1 = time.time()
    sql = 'select id from ask_problem where user_id=%s and created_time>"%s";' % (
        uid, timestamp2datetime(time.time() - 180 * 86400))

    o = get_medicaldb_handler().do_one(sql)
    if o is None or len(o) == 0:
        print 'nothing'
        return
    all_content = []
    for item in o:
        id = item[0]
        print id
        sql1 = 'select content from ask_problemcontent where problem_id=%s;' % id
        o1 = get_medicaldb_handler().do_one(sql1)
        all_content.append(o1)

    t2 = time.time()
    print 'time', t2 - t1
Example 13
def main1():
    uids = []

    # collect all uids
    for i in (0, 1, 2, 3):
        uid_filename = get_parti_uid_filename(part=i, mode='news')
        with open(uid_filename, 'r') as f:
            ls = f.readlines()
            t_uids = [int(item.strip('\n')) for item in ls]
            uids.extend(t_uids)

    #
    output_filename = '20180312_user_event_and_recommend_news.csv'
    yesterday_begin, yesterday_end = get_yesterday_timestamp()
    yesterday_begin = int(yesterday_begin * 1000)
    yesterday_end = int(yesterday_end * 1000)

    #

    fo = open(output_filename, 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'is_app_user', 'event_datetime', 'event_type', 'event_obj',
        'recommended_news'
    ]
    csvwriter.writerow(first_line)

    all_cnt = 0
    good_cnt = 0

    shuffle(uids)
    for uid in uids[:1000]:
        all_cnt += 1
        is_app = is_app_user(uid)
        print '+' * 10, uid, '+' * 10
        user_action_list = cy_time_event_one_user_kernel2(
            uid, yesterday_begin, yesterday_end)
        recommended_news_ids = get_caled_user_topn_news(uid)
        recommended_news_ids = new_newsids_check(recommended_news_ids, 2)
        if recommended_news_ids:
            good_cnt += 1

        cnt = 0
        for i in range(max([len(user_action_list),
                            len(recommended_news_ids)])):
            if cnt == 0:
                user_id = str(uid)

            else:
                user_id = ''
                is_app = ''
            try:
                event_datetime = timestamp2datetime(user_action_list[i][0] /
                                                    1000.0)
                event_type = user_action_list[i][2]
                event_obj = user_action_list[i][1]
                if event_type == 'vn':
                    title = nat_get_title('news_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                elif event_type == 'vt':
                    title = nat_get_title('topic_' + str(event_obj))
                    event_obj_str = str(event_obj) + '|' + title
                else:
                    event_obj_str = event_obj

            except:
                event_datetime = ''
                event_obj_str = ''
                event_type = ''

            try:
                recommended_news_id = recommended_news_ids[i]
                title = nat_get_title('news_' + str(recommended_news_id))
                recommend_str = str(recommended_news_id) + '|' + title

            except:
                recommend_str = ''

            line = convert2gbk([
                user_id,
                str(is_app), event_datetime, event_type, event_obj_str,
                recommend_str
            ])
            csvwriter.writerow(line)

            cnt += 1

    line = ['all', 'good']
    csvwriter.writerow(line)
    csvwriter.writerow([str(all_cnt), str(good_cnt)])
    fo.close()
def main():
    today_zero, today_end = get_today_timestamp()
    ds = timestamp2datetime(0)
    print ds, type(ds)
def a1(log_file_name, ana_file_name, bdp_file_name):
    START = "==========start======="
    # For 2017-11-08: for each successful recommendation, record the user uid,
    # trigger time, pushed article id, push time, whether the article was
    # viewed after the push that day, and the view time.

    today_zero, today_end = get_today_timestamp(time.time() - 86400.0)

    def get_uid(l):
        return l.split("=uid=")[1].split('=')[0]

    fi = open(log_file_name, 'r')
    fo = open(ana_file_name, "w")
    csvwriter = csv.writer(fo, dialect='excel')
    # CSV header (in Chinese): uid, trigger time, trigger type, user full text,
    # user tags, user population, article id, article title, article tags,
    # article category, return time, click time
    first_line = [u"uid", u"触发时间", u"触发类型", u"用户全文", u"用户tag", u"用户人群",
                  u"文章id", u"文章标题", u"文章tag", u"文章分类", u"返回时间", u"点击时间"]
    csvwriter.writerow(convert2gbk(first_line))
    uid = None
    uni_key0 = None
    trigger_time = None
    trigger_type = None
    caled = set()
    all = set()  # all triggered requests
    reason = None

    all_qa = defaultdict(set)
    all_bs = defaultdict(set)
    cnt = 0
    for l in fi:
        # if not l.startswith("2017-11-08"):
        #     continue
        cnt += 1
        # if cnt > 10000:
        #     continue

        if START in l:
            # record the outcome of the previous request
            if reason and uni_key0 and trigger_type:
                if trigger_type == "bs":
                    all_bs["all"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["all"].add(uni_key0)

            if uni_key0 and trigger_type and not reason:
                if trigger_type == "bs":
                    all_bs["failed"].add(uni_key0)
                elif trigger_type == "qa":
                    all_qa["failed"].add(uni_key0)

            uid = get_uid(l)
            trigger_time = l.split(',')[0]
            uni_key0 = uid + '|' + trigger_time

            if "pid=None" in l:
                trigger_type = "bs"

            else:
                trigger_type = "qa"
                all_qa["all"].add(uni_key0)

            reason = None

            # all.add(uni_key0)

            trigger_ts = datetime_str2timestamp(trigger_time)
            print "uni_key", uni_key0
            print "ts", trigger_ts

        # if "=trigger=" in l:
        #     trigger_type0 = l.split("=trigger=")[1].split('=')[0]

        if "=special_population=" in l:
            special_population0 = l.split("=special_population=")[1].split("=")[0]

        if "=texts=" in l:
            texts0 = l.split("=texts=")[1].split("=")[0]

        if "=tags=" in l:
            tags0 = l.split("=tags=")[1].split("=")[0]

        if "failed in recommend==" in l:
            reason = l.split("failed in recommend==")[1].split("=")[0]
            if trigger_type == "qa":

                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":

                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)

        if "succeed in recommend==========" in l:
            reason = "succeed"
            if trigger_type == "qa":
                if reason not in all_qa:
                    all_qa[reason] = set([uni_key0])
                else:
                    all_qa[reason].add(uni_key0)
            elif trigger_type == "bs":
                if reason not in all_bs:
                    all_bs[reason] = set([uni_key0])
                else:
                    all_bs[reason].add(uni_key0)

            return_time = l.split(',')[0]
            uni_key = uid + return_time
            if uni_key in caled:
                continue
            print 'WWWWWWWWWWWWWWWWWWWWWWWWWWWWWWWW'

            caled.add(uni_key)
            return_ts = datetime_str2timestamp(return_time)
            nid = l.split("=====id=")[1].split("=")[0]
            ntitle = l.split("===title=")[1].split("=")[0]

            # news_title = get_db_data_local_handler().get_news_title(nid)
            news_type = get_db_data_local_handler().get_news_type(nid)
            news_tags = u'|||'.join(get_news_tags_from_solr("news_" + str(nid)))

            print uid
            print trigger_type

            print nid
            print return_ts
            print type(ntitle)

            # first_line = [u"uid", u"触发时间", u"触发类型",u"用户全文",u"用户tag",u"用户人群",
            #    u"文章id", u"文章标题",u"文章tag",u"文章分类",u"返回时间", u"点击时间"]
            views = cy_time_event_one_user_viewnews(uid, begin=return_ts, end=today_end)
            print views

            rows = [str(uid), trigger_time, trigger_type, texts0, tags0, special_population0,
                    str(nid), ntitle, news_tags, news_type, return_time,
                    str(timestamp2datetime(views.get(nid, -1)))]
            rows = convert2gbk(rows)

            csvwriter.writerow(rows)

    csvwriter.writerow([u"所有uid".encode("gbk"), u"推了的uid".encode("gbk")])  # "all uids", "recommended uids"
    rows = [str(len(all)), str(len(caled))]
    rows = convert2gbk(rows)
    csvwriter.writerow(rows)

    fi.close()
    fo.close()

    for x in all_qa:
        print x + "|||" + str(len(all_qa[x]))

    for x in all_bs:
        print x + "|||" + str(len(all_bs[x]))

    with open(bdp_file_name, "w") as f:
        f.write("news_all_input_qa|||" + str(len(all_qa["all"])) + "\n")
        f.write("news_all_output_qa|||" + str(len(all_qa["succeed"])) + "\n")
        f.write("news_no_info_qa|||" + str(len(all_qa["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_qa|||" + str(len(all_qa["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_qa|||" + str(len(all_qa["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_qa|||" + str(len(all_qa["best_score so low"])) + "\n")
        f.write("qa_failed|||" + str(len(all_qa["failed"])) + "\n")

        f.write("news_all_input_bs|||" + str(len(all_bs["all"])) + "\n")
        f.write("news_all_output_bs|||" + str(len(all_bs["succeed"])) + "\n")
        f.write("news_no_info_bs|||" + str(len(all_bs["user_info is None "])) + "\n")
        f.write("news_filtered_by_preprocessing_bs|||" + str(len(all_bs["filter_user_info bad "])) + "\n")
        f.write("news_empty_res_bs|||" + str(len(all_bs["topn_ids_scores empty"])) + "\n")
        f.write("news_bad_res_bs|||" + str(len(all_bs["best_score so low"])) + "\n")
        f.write("bs_failed|||" + str(len(all_bs["failed"])) + "\n")
Example 16
def g1():
    '''
    Compare the coverage rate without hot-sale tag expansion against the
    coverage rate with hot-sale tag expansion.

    Numerator: users that can be matched to a hot-sale tag; denominator: users
    active within one day (cy_event).
    '''

    from general_utils.hbase_utils import get_user_query, get_user_query2
    from general_utils.solr_utils import get_last_login_uids
    from recommend.manager.recommend_tags_data_helper import get_relation_plan3
    from general_utils.db_utils import get_db_data_local_handler
    from general_utils.hbase_utils import get_sp_duration_active_userid

    from general_utils.time_utils import timestamp2datetime, ensure_second_timestamp
    # user sampling time window
    # user sampling hit rate

    end_ds0 = '2018-01-21 23:59:40'
    end0 = datetime_str2timestamp(end_ds0)
    begin0 = end0 - 86400 * 1

    # data collection window for each selected user
    end_ds = '2018-01-22 23:59:40'
    end = datetime_str2timestamp(end_ds)
    begin = end - 86400 * 180.0  # half a year

    # users whose last login falls in the week before 2018-01-21 23:59:40
    # test_uids = get_last_login_uids(begin0, end0)
    # test_uids = get_sp_duration_active_userid(begin0,end0)
    test_uids = get_one_day_uid_from_file('log_event_20180122')
    print "test_uids num", len(test_uids)

    # shuffle and take a sample of 3000 uids
    random.shuffle(test_uids)
    selected_uids = test_uids[:3000]

    all_good_cnt = 0
    all_cnt = 0
    app_cnt = 0
    good_app_cnt = 0

    text_empty_cnt = 0
    fo = open('180129_rp_1.csv', 'w')
    csvwriter = csv.writer(fo)
    first_line = [
        'uid', 'username', 'is_app', 'last_info_time', 'use_tags',
        'systag_ids', 'tag_names', 't', 'is_tangsai'
    ]
    csvwriter.writerow(first_line)
    # status_dict = {
    #     1: "qa and query",
    #     2: "view actions",
    #     3: "search_doctor clinic_no",
    #     0: ""
    # }

    total_time = {}
    for uid in selected_uids:
        print '==============uid=%s=======================' % uid
        username = get_username(uid)
        is_app = is_app_user(uid)

        all_cnt += 1
        if is_app:
            app_cnt += 1

        t1 = time.time()
        res = get_relation_plan3(uid, test=True)
        t2 = time.time()
        t = t2 - t1
        total_time[uid] = t
        status = res['status']
        is_tangsai = False
        if status:
            all_good_cnt += 1
            if is_app:
                good_app_cnt += 1
            systag_ids = res['ids']
            if 96 in systag_ids:
                is_tangsai = True
            tagnames = [
                get_db_data_local_handler().get_systagid_name(id)
                for id in systag_ids
            ]
            if status in (1, 2, 4):
                info0 = res['systag_id_dict']
                record_info = '~'.join(info0.keys())
            elif status == 3:
                info0 = res['clinic_no']
                record_info = '~'.join(info0)
            last_ts = res['last_ts']
            last_info_time = timestamp2datetime(
                ensure_second_timestamp(last_ts))

        else:
            systag_ids = []
            tagnames = []
            record_info = ''
            last_info_time = ''

        systag_ids_str = '~'.join([str(x) for x in systag_ids])
        tagnames_str = '~'.join(tagnames)

        line = convert2gbk([
            str(uid), username,
            str(is_app), last_info_time, record_info, systag_ids_str,
            tagnames_str,
            str(t),
            str(is_tangsai)
        ])
        csvwriter.writerow(line)

    line = [str(all_cnt), str(all_good_cnt), str(app_cnt), str(good_app_cnt)]
    csvwriter.writerow(line)
    s_total_time = sorted(total_time.iteritems(),
                          key=lambda x: x[1],
                          reverse=True)
    times = total_time.values()
    line = [str(min(times)), str(max(times)), str(sum(times) / len(times))]
    csvwriter.writerow(line)
    for uid, t in s_total_time[:10]:
        line = [str(uid), str(t)]
        csvwriter.writerow(line)

    fo.close()

    print str(max(times))
    print all_good_cnt
Example 17
def format_content(res):
    last_value = res['last_value']
    return "最后时间戳 = %s" % timestamp2datetime(last_value / 1000)