Beispiel #1
0
 def testSpawnThreads(self):
     """Check how many tasks a job with pattern "*/2 * * * *" spawns.

     After loading and indexing the job, hour slot 15 of the hour index
     must hold exactly one entry.  Spawning tasks up to 16:00:00 must
     yield 15 tasks when the window starts at 15:30:00 and 14 tasks when
     it starts at 15:30:23.
     """
     manager = self.jobmanager
     manager.loadJob(Job(1, "*/2 * * * *", "echo haha", 5))
     manager.indexJobs()

     jobs_in_hour_15 = manager.hour_index[15]
     self.assertEqual(1, len(jobs_in_hour_15))

     window_end_text = "2014-02-18 16:00:00"
     expectations = (
         ("2014-02-18 15:30:00", 15),
         ("2014-02-18 15:30:23", 14),
     )
     for window_start_text, expected_count in expectations:
         spawned = manager.spwanTasks(DateUtil.datetime(window_start_text),
                                      DateUtil.datetime(window_end_text))
         self.assertEqual(expected_count, len(spawned))
Beispiel #2
0
 def testSpawnThreads2(self):
     """Check that a job with pattern "* * * * *" spawns 60 tasks in one hour.

     The window runs from 2014-02-19 15:00:00 to 2014-02-19 16:00:00.  The
     execution time reported by the first spawned task is printed for
     manual inspection; it is not asserted.
     """
     self.jobmanager.loadJob(Job(1, "* * * * *", "echo haha", 5))
     self.jobmanager.indexJobs()

     threads = self.jobmanager.spwanTasks(DateUtil.datetime("2014-02-19 15:00:00"), DateUtil.datetime("2014-02-19 16:00:00"))
     self.assertEqual(60, len(threads))
     # Parenthesised single-argument form: prints the same text on Python 2
     # and is valid syntax on Python 3 (the bare print statement is not).
     print(threads[0].args[0].get_exc_time())
Beispiel #3
0
    def transfer_texts_2_tfRecord(self, start_date, end_date, raw_feature_file_path, tf_record_file_path,
                                  column_names,
                                  label_name, need_feature_cols, negative_ratio=None, var_length_cols=None,
                                  col_preprocess_func=None):

        """
        Convert the per-date text files of a folder into tfRecord files.

        For every date returned by ``DateUtil.get_every_date(start_date, end_date)``
        the input ``<raw_feature_file_path>/<folder name>_<date>`` is passed to
        ``transfer_single_text_2_tfRecord`` together with the output
        ``<tf_record_file_path>/<folder name>_<date>.tfrecord`` and the path
        ``<tf_record_file_path>/data_info.csv``.  The output folder is created
        when it does not exist yet.

        All remaining parameters are forwarded unchanged; see
        ``transfer_single_text_2_tfRecord`` for their meaning.
        """
        source_template = os.path.join(
            raw_feature_file_path,
            os.path.basename(raw_feature_file_path) + "_%s")
        target_template = os.path.join(
            tf_record_file_path,
            os.path.basename(tf_record_file_path) + "_%s.tfrecord")
        info_csv = os.path.join(tf_record_file_path, "data_info.csv")

        output_dir_exists = os.path.isdir(tf_record_file_path)
        if not output_dir_exists:
            os.makedirs(tf_record_file_path)

        for day in DateUtil.get_every_date(start_date, end_date):
            print(day)
            source_file = source_template % day
            target_file = target_template % day
            self.transfer_single_text_2_tfRecord(
                source_file, target_file, info_csv,
                column_names, label_name, need_feature_cols,
                negative_ratio, var_length_cols, col_preprocess_func)
def load_wechat_2_dict(start_date, end_date):
    """Load the aggregated wechat segment files of a date range.

    Returns an ``OrderedDict`` keyed by date, in the order produced by
    ``DateUtil.get_every_date``.  Each value is an ``OrderedDict`` mapping
    the first tab-separated column of a line (the ``opp_id``) to the
    JSON-decoded second column.  Lines that do not have exactly two columns
    are skipped; lines whose second column cannot be decoded are logged and
    skipped.
    """
    all_dates = DateUtil.get_every_date(start_date, end_date)
    loaded = OrderedDict()

    segment_dir = os.path.join(home_dir, "project_data/xgb",
                               "raw_data/aggregated_wechat_segment_data")
    segment_file_template = os.path.join(segment_dir, "aggregated_wechat_segment_data_%s")

    log.info(all_dates)
    for day in all_dates:
        log.info(day)
        per_day = OrderedDict()
        loaded[day] = per_day
        with codecs.open(segment_file_template % day, "r", "utf-8") as reader:
            for raw_line in reader:
                columns = raw_line.strip().split("\t")
                if len(columns) == 2:
                    opp_id = columns[0].strip()
                    encoded_chats = columns[1].strip()
                    try:
                        chats = json.loads(encoded_chats)
                    except Exception as err:
                        log.info(err)
                    else:
                        per_day[opp_id] = chats
    return loaded
def load_multi_day_order(start_date, end_date):
    """Merge the per-day order dicts of a date range into a single dict.

    The dates are visited in descending sort order, so when a key occurs on
    several days the entry loaded for the smallest date is the one kept.
    """
    merged = {}
    # Descending order: a key seen on several days ends up with the value
    # from the smallest date, because later updates overwrite earlier ones.
    for day in sorted(DateUtil.get_every_date(start_date, end_date), reverse=True):
        log.info(day)
        merged.update(load_one_day_order(day))
    return merged
Beispiel #6
0
def get_hist_wechat_segment(start_date, end_date):
    """Run ``get_one_day_wechat_segment`` for each date in the range.

    After each day the elapsed wall-clock time (seconds, from ``time.time``)
    is printed together with the date.
    """
    for day in DateUtil.get_every_date(start_date, end_date):
        began_at = time.time()
        get_one_day_wechat_segment(day)
        elapsed = time.time() - began_at
        print("{0} wechat segment cost time: {1}".format(day, elapsed))
Beispiel #7
0
 def testNextScheduleSectionOverLongBatch(self):
     '''
     Exercise _getNextScheduleSection with an over-long interval.

     The clock is filled with _fillclock(11, 4) and the next schedule
     section is requested for 2014-02-14 15:06:34 with the value 23
     (presumably the over-long interval -- confirm against the engine).
     Nothing is asserted: the test only fails if the call raises.
     '''
     self.engine._fillclock(11, 4)  # Get clock list [3, 7, 11, 15, 19, 23]
     moment = DateUtil.datetime('2014-02-14 15:06:34')
     # Any exception propagates unchanged and fails the test.
     self.engine._getNextScheduleSection(moment, 23)
Beispiel #8
0
    def update_employee(self, emp: Employee):
        """Update the stored employee row whose id is ``emp.emp_id``.

        The current record is fetched with ``fetch_employee_by_id``.  When it
        already equals ``emp`` nothing is written.  Otherwise, if
        ``DateUtil.check_date_of_birth`` accepts ``emp.dob``, the ``fname``,
        ``lname``, ``dob`` and ``dept_id`` columns are updated with a
        parameterised query and the change is committed.

        The outcome is reported with ``print``; errors raised while fetching
        or updating are caught and printed instead of being propagated.
        Nothing is returned.

        Args:
            emp: Employee carrying the new field values and the ``emp_id`` of
                the row to update.
        """

        try:
            emp_result = self.fetch_employee_by_id(emp.emp_id)
            try:
                # Compare with ``==`` instead of calling ``__eq__`` directly:
                # ``__eq__`` may return ``NotImplemented``, which is truthy
                # and would report success without ever running the update.
                if emp == emp_result:
                    # Stored record already matches; nothing to write.
                    print("Successfully Updated Employee- " +
                          str(emp.emp_id) + "\n")
                else:
                    date_util = DateUtil()
                    if date_util.check_date_of_birth(emp.dob):
                        self.db.execute_dynamic_query(
                            "update employee set fname = ?, lname =?, dob = ? , dept_id = ? where emp_id=?",
                            emp.fname, emp.lname, emp.dob, emp.dept.dept_id,
                            emp.emp_id)
                        self.db.connection.commit()
                        # str() keeps a non-string id from raising here and
                        # being misreported as a failed update after commit.
                        print("Successfully Updated Employee- " +
                              str(emp.emp_id) + "\n")
                    else:
                        print("Sorry!! Unable to update Employee- " +
                              str(emp.emp_id) + "\n")

            except AttributeError as exc:
                # The message of the AttributeError names the missing field.
                message = str(exc)
                if "fname" in message:
                    print("First Name cannot be null")
                elif "lname" in message:
                    print("Last Name cannot be null")
                elif "dob" in message:
                    print("Date of birth is not in correct format")
                else:
                    # Do not swallow attribute errors about other fields.
                    print(message)
            except ValueError as exc:
                print(exc)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are not intercepted.
            print("Unable to update employee Id - " + str(emp.emp_id) +
                  ". Check employee Id.\n")
            print(sys.exc_info())
            print("\n")
Beispiel #9
0
def load_hist_wechat_record_dict(date):
    """Aggregate the wechat full-sentence files of the days before ``date``.

    The date window is built from two calls to
    ``DateUtil.get_relative_delta_time_str``: one with
    ``day=-HISTORY_WECHAT_RECORD_DELTA_DAY`` and one with ``-1`` (presumably
    N days before and the day before ``date`` -- confirm against DateUtil).
    Each file line must have five tab-separated columns: opp_id, student
    chat count, teacher chat count, total chat count and chat content;
    other lines are skipped.

    Returns a ``defaultdict`` mapping opp_id to a dict with
    ``"stat_info"`` (the three counts summed over all days) and
    ``"chat_content"`` (the contents concatenated in ascending date order).
    """
    history = defaultdict(list)
    window_start = DateUtil.get_relative_delta_time_str(
        date, day=-HISTORY_WECHAT_RECORD_DELTA_DAY)
    window_end = DateUtil.get_relative_delta_time_str(date, -1)
    # Ascending order, so that later chats are appended after earlier ones.
    ordered_days = sorted(DateUtil.get_every_date(window_start, window_end))

    sentence_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                "wechat_full_sentence_data")
    sentence_file_template = os.path.join(sentence_dir,
                                          "wechat_full_sentence_data_%s")
    for day in ordered_days:
        log.info(day)
        with codecs.open(sentence_file_template % day, 'r', 'utf-8') as reader:
            for raw_line in reader:
                fields = raw_line.strip().split("\t")
                if len(fields) != 5:
                    continue
                opp_id, chat_content = fields[0], fields[4]
                day_counts = [int(text) for text in fields[1:4]]

                record = history.get(opp_id)
                if record is None:
                    record = {"stat_info": [0, 0, 0], "chat_content": ""}
                    history[opp_id] = record
                record["chat_content"] = record["chat_content"] + chat_content
                record["stat_info"] = [
                    today + so_far
                    for today, so_far in zip(day_counts, record["stat_info"])
                ]
    return history
Beispiel #10
0
 def testMatchTimePattern(self):
     """Check TimeMatcher.matchTimePattern against one fixed timestamp.

     Every pattern below is matched against 2014-02-17 20:28:35.  All of
     them are expected to match except "*/3 * * * *".
     """
     moment_text = "2014-02-17 20:28:35"
     cases = (
         ("* * * * *", True),
         ("28 * * * *", True),
         ("* 20 * * *", True),
         ("* * 17 * *", True),
         ("* * * 2 *", True),
         ("* * * * 1", True),
         ("28 20 17 2 1", True),
         ("*/2 * * * *", True),
         ("*/3 * * * *", False),
     )
     for pattern_text, expected in cases:
         matched = TimeMatcher.matchTimePattern(TimePattern(pattern_text),
                                                DateUtil.datetime(moment_text))
         self.assertEqual(expected, matched)
def get_hist_wechat_tf_feature(start_date, end_date):
    """Extract the wechat tf feature for every date in the range.

    Dates are ``%Y%m%d`` strings.  The segment data of the
    ``PAST_DAYS_LIMIT`` days up to ``start_date`` is loaded once with
    ``load_wechat_2_dict``.  After each day has been processed by
    ``get_one_day_wechat_tf_feature`` the loaded data is passed to
    ``update_wechat_dict`` with the date to delete and the date to add,
    unless the next day lies beyond ``end_date``.
    """
    all_dates = DateUtil.get_every_date(start_date, end_date)
    date_format = "%Y%m%d"

    def shifted(day_text, days):
        """Return ``day_text`` moved by ``days`` days, formatted as %Y%m%d."""
        moved = datetime.strptime(day_text, date_format) + timedelta(days=days)
        return moved.strftime(date_format)

    log.info("initial past n day wechat segment data...")
    window_start = shifted(start_date, -PAST_DAYS_LIMIT)
    window_end = start_date
    log.info("load past days wechat segment data...")
    window = load_wechat_2_dict(window_start, window_end)

    log.info("extract tf feature...")
    for day in all_dates:
        log.info("extract %s tf feature..." % day)
        began_at = time.time()
        get_one_day_wechat_tf_feature(day, window)
        log.info("extract {0} wechat tf feature cost time:{1}".format(day, time.time() - began_at))

        expired_day = shifted(day, -PAST_DAYS_LIMIT)
        next_day = shifted(day, 1)
        if next_day > end_date:
            # Nothing left to process after this day; keep the window as is.
            continue
        log.info("update past days wechat segment data [del %s, add %s]..." % (expired_day, next_day))
        window = update_wechat_dict(window, expired_day, next_day)
        log.info("=======" * 3)
Beispiel #12
0
def gen_bench_mark_multi_day(start_date, end_date):
    """Split each day's wechat basic feature file into per-chat-count benchmarks.

    For every date in the range the lines of
    ``wechat_basic_feature_<date>`` are grouped by the sum of their
    ``today_student_chat_num`` and ``hist_student_chat_num`` columns; only
    sums listed in ``chat_num_ls`` are kept.  Each non-empty group is written
    to ``total_chat_num_<n>/total_chat_num_<n>_<date>`` under
    ``bench_mark_text_file_path`` and then converted with
    ``tfRecorder.transfer_texts_2_tfRecord_default`` into the folder obtained
    by replacing "feature_file" with "tf_record" in the text folder path.
    """
    output_template = os.path.join(bench_mark_text_file_path, "total_chat_num_%s", "total_chat_num_%s_%s")
    columns = ["label", "opp_id", "acc_id", "create_time", "today_student_chat_num",
               "today_teacher_chat_num", "today_total_chat_num", "hist_student_chat_num",
               "hist_teacher_chat_num", "hist_total_chat_num", "chat_content"]
    for day in DateUtil.get_every_date(start_date, end_date):
        print(day)
        # Group the lines by chat_num
        feature_file = os.path.join(raw_feature_path, "wechat_basic_feature_%s" % day)
        grouped = {wanted_num: [] for wanted_num in chat_num_ls}

        with codecs.open(feature_file, 'r', 'utf-8') as reader:
            for raw_line in reader:
                stripped = raw_line.strip()
                row = dict(zip(columns, stripped.split("\t")))
                today_num = int(row["today_student_chat_num"])
                hist_num = int(row["hist_student_chat_num"])
                student_total = today_num + hist_num
                if student_total in grouped:
                    grouped[student_total].append(stripped)

        for chat_num, group_lines in grouped.items():
            if group_lines:
                text_file = output_template % (chat_num, chat_num, day)
                text_folder = os.path.dirname(text_file)
                record_folder = text_folder.replace("feature_file", "tf_record")

                if not os.path.isdir(text_folder):
                    os.makedirs(text_folder)

                with codecs.open(text_file, "w", "utf-8") as writer:
                    for text in group_lines:
                        writer.write(text + "\n")
                # Convert the freshly written benchmark text file to a tfRecord file
                tfRecorder.transfer_texts_2_tfRecord_default(day, day, text_folder, record_folder)
Beispiel #13
0
# Directory under PROJECT_DATA_DIR in which get_logger places its log files.
LOG_PATH = os.path.join(PROJECT_DATA_DIR, "log")


def get_logger(path):
    """Return the project logger, logging to a rotating file and the console.

    ``logging.getLogger`` returns the same logger object for the same name,
    so every call works on one shared logger.  A handler is therefore only
    attached when an equivalent one is missing; attaching new handlers on
    every call would emit each record once per call.

    Args:
        path: Log file name; it is joined onto ``LOG_PATH``.

    Returns:
        The ``logging.Logger`` named ``PROJECT_NAME``, set to DEBUG level,
        with a ``RotatingFileHandler`` for the requested file (1 MiB per
        file, 5 backups) and one console ``StreamHandler``.
    """
    # RotatingFileHandler opens its file on construction, which fails when
    # the directory does not exist yet.
    if not os.path.isdir(LOG_PATH):
        os.makedirs(LOG_PATH)

    # File handlers store the absolute path in ``baseFilename``; use the
    # same form so that existing handlers can be compared against it.
    log_file = os.path.abspath(os.path.join(LOG_PATH, path))
    logger = logging.getLogger(PROJECT_NAME)  # name of the program's top-level directory
    logger.setLevel(logging.DEBUG)

    fmt = '[%(asctime)s] - %(filename)s:%(lineno)s - %(name)s - %(message)s'
    formatter = logging.Formatter(fmt)

    has_file_handler = False
    has_console_handler = False
    for existing in logger.handlers:
        # FileHandler subclasses StreamHandler, so test for it first.
        if isinstance(existing, logging.FileHandler):
            if getattr(existing, "baseFilename", None) == log_file:
                has_file_handler = True
        elif isinstance(existing, logging.StreamHandler):
            has_console_handler = True

    if not has_file_handler:
        handler = logging.handlers.RotatingFileHandler(log_file,
                                                       maxBytes=1024 * 1024,
                                                       backupCount=5)
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    if not has_console_handler:
        consoleHandle = logging.StreamHandler()
        consoleHandle.setFormatter(formatter)
        logger.addHandler(consoleHandle)

    return logger


# Shared module-level logger, created at import time.  The log file name is
# "log_" followed by the value of DateUtil.get_relative_delta_time_str()
# called without arguments (presumably the current date -- TODO confirm).
G_LOG = get_logger("log_%s" % DateUtil.get_relative_delta_time_str())

if __name__ == "__main__":
    # Manual smoke test: request a logger for a fixed file name and emit a
    # few records at INFO and DEBUG level.
    logger = get_logger("log_20170523")
    logger.info('first info message')
    logger.debug('first debug message')
    logger.debug('-----------')
Beispiel #14
0
def get_hist_wechat_full_sentence(start_date, end_date):
    """Run ``get_one_day_wechat_full_sentence`` for each date in the range.

    Each date is logged before it is processed.
    """
    for day in DateUtil.get_every_date(start_date, end_date):
        log.info("extract %s basic feature..." % day)
        get_one_day_wechat_full_sentence(day)
Beispiel #15
0
def get_one_day_wechat_basic_feature(date):
    """Generate the wechat basic feature file for one day.

    Reads ``raw_data/aggregated_wechat_data/aggregated_wechat_data_<date>``
    under ``PROJECT_DATA_DIR`` (one ``opp_id<TAB>JSON chat list`` per line)
    and writes at most one tab-separated sample per input line to
    ``feature_file/wechat_basic_feature/wechat_basic_feature_<date>``.

    Per input line:
      * lines without exactly two columns, or whose second column is not
        valid JSON, are skipped (the JSON error is logged);
      * opp_ids present in the historical order dict are skipped;
      * one chat whose ``send_type`` is ``"1"`` (a student message, per the
        comments below) is drawn at random as the sample point; lines
        without such a chat are skipped;
      * the label is ``judge_label(order_time, create_time)``; samples
        labelled ``"-1"`` are dropped;
      * the chats up to and including the sample point are cleaned with
        ``clear_sentence`` and counted with ``stat_sentence``; when a
        historical chat record exists its content is prepended and its
        counts are used as the historical statistics.

    Output columns: label, opp_id, account, create_time, today's statistics,
    historical statistics, chat content.

    Args:
        date: Day to process, in the string form used in the data file names.
    """
    log.info("get %s wechat basic feature..." % date)
    wechat_basic_feature_dir = os.path.join(PROJECT_DATA_DIR, "feature_file",
                                            "wechat_basic_feature")
    aggregated_wechat_data_dir = os.path.join(PROJECT_DATA_DIR, "raw_data",
                                              "aggregated_wechat_data")

    wechat_basic_feature_data = os.path.join(wechat_basic_feature_dir,
                                             "wechat_basic_feature_%s" % date)
    aggregated_wechat_data = os.path.join(aggregated_wechat_data_dir,
                                          "aggregated_wechat_data_%s" % date)
    log.info("prepare hist wechat chat dict...")
    hist_wechat_chat_dict = load_hist_wechat_record_dict(date)

    log.info("prepare hist and future order dict...")
    hist_order_dict = load_multi_day_order(
        DateUtil.get_relative_delta_time_str(date,
                                             day=-HISTORY_ORDER_DELTA_DAY),
        DateUtil.get_relative_delta_time_str(date, day=-1))
    future_order_dict = load_multi_day_order(
        date,
        DateUtil.get_relative_delta_time_str(date, day=FUTURE_ORDER_DELTA_DAY))

    log.info("start 2 gen wechat basic feature...")
    with codecs.open(aggregated_wechat_data, "r", "utf-8") as fin, \
            codecs.open(wechat_basic_feature_data, "w", "utf-8") as fout:
        for line in fin:
            arr = line.strip().split("\t")
            if len(arr) != 2:
                continue

            opp_id = arr[0].strip()
            chat_ls = arr[1].strip()
            try:
                # No ``encoding`` keyword: the text is already decoded by
                # codecs.open, and json.loads rejects that keyword with a
                # TypeError since Python 3.9.  Together with a broad except
                # that made every line get skipped silently.
                chat_ls = json.loads(chat_ls)
            except ValueError as e:
                # Malformed JSON (JSONDecodeError is a ValueError subclass).
                log.info(e)
                continue
            if opp_id in hist_order_dict:
                continue

            student_chat_idx = [
                idx for idx, chat_dict in enumerate(chat_ls)
                if chat_dict["send_type"] == "1"
            ]
            if not student_chat_idx:
                continue

            # Randomly pick one student message as the sample generation point
            sample_idx = np.random.choice(student_chat_idx, 1)[0]
            sample_chat = chat_ls[sample_idx]
            create_time = sample_chat["create_time"]
            account = sample_chat["account"]
            order_time = future_order_dict.get(opp_id, None)  # ordered recently?
            label = judge_label(order_time, create_time)
            if label == "-1":
                continue
            sample_chat_ls = chat_ls[:sample_idx + 1]
            cleared_chat_sentence = clear_sentence(sample_chat_ls)
            chat_stat_ls = stat_sentence(sample_chat_ls)
            hist_chat_stat_ls = [0, 0, 0]

            hist_wechat_chat = hist_wechat_chat_dict.get(opp_id,
                                                         None)  # any historical chat record?

            if hist_wechat_chat:  # prepend the historical chat information
                hist_chat_stat_ls = hist_wechat_chat["stat_info"]
                cleared_chat_sentence = hist_wechat_chat[
                    "chat_content"] + cleared_chat_sentence

            today_stat_str = "\t".join(map(str, chat_stat_ls))
            hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            result = "\t".join([
                label, opp_id, account, create_time, today_stat_str,
                hist_stat_str, cleared_chat_sentence
            ])
            fout.write(result + "\n")

            # Alternative without sampling: walk through every chat and
            # generate one sample each time the student speaks.
            # for idx, chat_dict in enumerate(chat_ls):
            #     send_type = chat_dict["send_type"]
            #     create_time = chat_dict["create_time"]
            #     accout = chat_dict["account"]
            #
            #     if send_type == "0":  # teacher messages are not sample points
            #         continue
            #
            #     label = judge_label(order_time, create_time)
            #     if label == "-1":
            #         continue
            #
            #     cleared_chat_sentence = clear_sentence(chat_ls[:idx + 1])
            #     chat_stat_ls = stat_sentence(chat_ls[:idx + 1])
            #     hist_chat_stat_ls = [0, 0, 0]
            #
            #     if hist_wechat_chat:  # prepend the historical chat information
            #         hist_chat_stat_ls = hist_wechat_chat["stat_info"]
            #         cleared_chat_sentence = hist_wechat_chat["chat_content"] + cleared_chat_sentence
            #
            #     today_stat_str = "\t".join(map(str, chat_stat_ls))
            #     hist_stat_str = "\t".join(map(str, hist_chat_stat_ls))
            #     result = "\t".join(
            #         [label, opp_id, accout, create_time, today_stat_str, hist_stat_str, cleared_chat_sentence])
            #     fout.write(result + "\n")
    log.info("finished, write feature to file : %s" %
             wechat_basic_feature_data)