def multi_days_time_seq(time_seq_dict, period_length=7):
    """
    Flatten per-day series into one continuous 1-D array over the record window.

    The items of ``time_seq_dict`` are sorted by key and their values are
    concatenated in that order. Zeros are inserted wherever days are missing:

    * before the first date, back to the window start
      (``PERIOD_START`` shifted by ``-(DAY_RANGE - 1)`` days);
    * between two consecutive dates that are more than one day apart;
    * after the last date, up to the window end (``PERIOD_START``).

    Each missing day contributes ``HOURS_IN_ONE_DAY`` zeros.

    NOTE(review): ``differate_one_day_more(a, b)`` is assumed to return the
    number of whole days strictly between ``a`` and ``b`` (day difference
    minus one, negative when ``b`` is not after ``a``) -- inferred from how
    its result is used here; confirm against the helper.

    :param time_seq_dict: mapping of date key -> series for that day. Keys are
        passed through ``str()`` before being given to the date helpers.
    :param period_length: length of the examined period in days. Currently
        unused; kept so existing callers keep working.
    :return: 1-D numpy array holding the zero-padded, date-ordered series.
    :raises IndexError: if ``time_seq_dict`` is empty.
    """
    ordered_days = sorted(time_seq_dict.items())

    # Collect the pieces and concatenate once at the end; calling np.append
    # per piece would re-copy the whole accumulated array every time.
    # The leading empty float array keeps the result dtype identical to an
    # accumulation that starts from ``np.array([])``.
    pieces = [np.array([])]

    # Leading padding: if the first recorded date is after the window start,
    # fill the window start day and every day up to (not including) it.
    first_date = str(ordered_days[0][0])  # keys may be numpy.int64, hence str()
    window_start = days_offset(change_date_str_format(PERIOD_START),
                               -1 * (DAY_RANGE - 1))
    leading_gap = differate_one_day_more(window_start, first_date)
    if leading_gap >= 0:
        pieces.append(np.zeros(HOURS_IN_ONE_DAY * (leading_gap + 1)))

    previous_date = None
    for date_key, day_seq in ordered_days:
        current_date = str(date_key)
        if previous_date is not None:
            inner_gap = differate_one_day_more(previous_date, current_date)
            # Days missing between two recorded dates had no queries at all,
            # so each of them is filled with zeros.
            if inner_gap >= 0:
                pieces.append(np.zeros(HOURS_IN_ONE_DAY * inner_gap))
        # np.ravel mirrors the flattening np.append applies to its values.
        pieces.append(np.ravel(day_seq))
        previous_date = current_date

    # Trailing padding: if the last recorded date is before the window end,
    # fill every day after it up to and including the window end day.
    last_date = str(ordered_days[-1][0])  # keys may be numpy.int64, hence str()
    window_end = change_date_str_format(PERIOD_START)
    trailing_gap = differate_one_day_more(last_date, window_end)
    if trailing_gap >= 0:
        pieces.append(np.zeros(HOURS_IN_ONE_DAY * (trailing_gap + 1)))

    return np.concatenate(pieces)
# Example no. 2
# 0
def date_older_than_start_date(date_str):
    """
    Report whether ``date_str`` fails the window-start check.

    The reference date is ``START_DAY`` (converted with
    ``change_date_str_format``) shifted by ``-DAY_RANGE`` days through
    ``days_offset``.

    NOTE(review): ``multi_days_time_seq`` derives its window start from
    ``PERIOD_START`` and ``-(DAY_RANGE - 1)``; confirm the different constant
    and offset used here are intentional.

    :param date_str: date string to test, handed to ``differate_one_day_more``.
    :return: False when ``differate_one_day_more(reference, date_str)`` is
        >= 0, True otherwise.
    """
    shift_days = -1 * DAY_RANGE
    reference_date = days_offset(change_date_str_format(START_DAY), shift_days)
    gap = differate_one_day_more(reference_date, date_str)
    return not (gap >= 0)
# Example no. 3
# 0
def date_younger_than_start_date(date_str):
    """
    Report whether ``date_str`` passes the check against ``START_DAY``.

    ``START_DAY`` is converted with ``change_date_str_format`` and compared to
    ``date_str`` through ``differate_one_day_more``.

    :param date_str: date string to test.
    :return: True when ``differate_one_day_more(reference, date_str)`` is
        >= 0, False otherwise.
    """
    reference_date = change_date_str_format(START_DAY)
    gap = differate_one_day_more(reference_date, date_str)
    return bool(gap >= 0)
def count_alive_days(domain_list, domain_bad):
    """
    Compute age figures for each domain from its whois record and export them.

    For every domain in ``domain_list`` the first matching whois record is
    read and three values are derived (each is
    ``differate_one_day_more(...) + 1``):

    * ``REGISTER_DAYS`` -- from the creation date to today;
    * ``UPDATE_DAYS``   -- from the last update date to today;
    * ``ALIVE_DAYS``    -- from the creation date to the expiry date.

    A date that is missing (empty) in the record is replaced by today's date.
    Domains without a record, domains whose three values are all zero, and
    domains whose processing raises are left out. The collected rows are
    handed to ``days_gap2csv`` for writing.

    :param domain_list: iterable of domain names to look up.
    :param domain_bad: key selecting the database in ``db_whois_dict``; also
        forwarded to ``days_gap2csv``.
    :return: None
    """
    db = db_whois_dict[domain_bad]
    data_dict_list = []
    time_format = "%Y%m%d"
    today = datetime.now().strftime(time_format)
    for domain in domain_list:
        print("=================================================")
        print("domain: %s" % (domain))
        query_body = {DOMAIN_2ND_FIELD: domain}
        recs = db[whois_mongo_index].find(query_body)
        # Ask for the count once and reuse it instead of querying twice.
        # NOTE(review): if ``db`` is a PyMongo database, Cursor.count() was
        # deprecated in 3.7 and removed in 4.0 -- confirm the driver version.
        rec_count = recs.count()
        print("recs.count(): %s" % rec_count)
        if not rec_count > 0:
            continue

        rec = recs[0]
        create_date = rec.get(CREATE_DATE, "")
        update_date = rec.get(UPDATE_DATE, "")
        expiry_date = rec.get(EXPIRY_DATE, "")
        # Best-effort per domain: any failure (e.g. an unexpected field type
        # or an unparsable date) is reported and the domain is skipped.
        try:
            print("creat_date: %s, update_date: %s, expiry_date: %s" %
                  (len(create_date), len(update_date), len(expiry_date)))
            print("creat_date: %s, update_date: %s, expiry_date: %s" %
                  (create_date, update_date, expiry_date))

            # Missing dates fall back to today.
            create_date = (format_date_string(create_date)
                           if len(create_date) > 0 else today)
            update_date = (format_date_string(update_date)
                           if len(update_date) > 0 else today)
            expiry_date = (format_date_string(expiry_date)
                           if len(expiry_date) > 0 else today)

            print("12232 create: ", create_date, " expiry_date: ",
                  expiry_date, " update: ", update_date)

            days_gap1 = differate_one_day_more(create_date, today) + 1
            days_gap2 = differate_one_day_more(update_date, today) + 1
            days_gap3 = differate_one_day_more(create_date,
                                               expiry_date) + 1
            print("days_gap1:%s, days_gap2: %s, days_gap3: %s" %
                  (days_gap1, days_gap2, days_gap3))

            # All three being zero means none of the dates carried usable
            # information, so the domain is ignored.
            if days_gap1 or days_gap2 or days_gap3:
                data_dict_list.append({
                    DOMAIN_2ND_FIELD: domain,
                    REGISTER_DAYS: days_gap1,
                    UPDATE_DAYS: days_gap2,
                    ALIVE_DAYS: days_gap3
                })
        except Exception as e:
            print("error: ", e)
    columns = [DOMAIN_2ND_FIELD, REGISTER_DAYS, ALIVE_DAYS, UPDATE_DAYS]
    days_gap2csv(data_dict_list, columns, domain_bad)