def test_get_data_directory(self):
    """Check get_data_directory for the "answer" and "question" kinds.

    For each kind the returned directory must exist on disk and its
    path must contain the expected ``sda/data/zhihu/<kind>`` fragment.
    """
    expectations = (
        ("answer", "sda/data/zhihu/answer"),
        ("question", "sda/data/zhihu/question"),
    )
    for kind, expected_fragment in expectations:
        resolved_dir = zhihu_util.get_data_directory(kind)
        # The directory has to exist, not merely be a well-formed path.
        self.assertTrue(os.path.exists(resolved_dir))
        self.assertTrue(expected_fragment in resolved_dir)
def flush_buffer(write_buffer, suffix, ts, thread_index, mode="finish"):
    """Write ``write_buffer`` to a file inside the "user" data directory.

    The file is named ``<suffix><USER_FILE_DELIMITER>-<int(ts)>-<thread_index>``.
    When ``mode`` is ``"doing"`` the name additionally gets a ``.tmp``
    extension; any other mode leaves the name as is.

    The actual write is delegated to ``zhihu_util.write_buffer_file``,
    which also receives ``USER_FIELD_DELIMITER``.
    """
    # Single parenthesised argument: prints the same text whether ``print``
    # is the Python 2 statement or the function.
    print("...write buffer into disk...")

    target_dir = zhihu_util.get_data_directory("user")
    name_parts = (target_dir, suffix, USER_FILE_DELIMITER, int(ts), thread_index)
    target_path = "%s/%s%s-%s-%s" % name_parts
    if mode == "doing":
        target_path = target_path + ".tmp"

    zhihu_util.write_buffer_file(write_buffer, target_path, USER_FIELD_DELIMITER)
def generate_write_bloomfilter(dir_name, capacity=1000000, error_rate=0.01):
    """Build a BloomFilter from the first field of every data-file line.

    The files are those listed by ``zhihu_util.get_file_list`` for the
    data directory of ``dir_name``. Each line is split on
    ``USER_FIELD_DELIMITER`` and its first field is added to the filter.

    ``capacity`` and ``error_rate`` are passed straight to ``BloomFilter``.
    Returns the populated filter.
    """
    bloom = BloomFilter(capacity, error_rate)
    source_dir = zhihu_util.get_data_directory(dir_name)

    for path in zhihu_util.get_file_list(source_dir):
        with open(path, "r") as handle:
            for record in handle:
                first_field = record.split(USER_FIELD_DELIMITER)[0]
                # Skip lines whose first field is empty or whitespace-only.
                if first_field.strip() == '':
                    continue
                # NOTE: the value is added as-is (not stripped), so any
                # surrounding whitespace is part of the stored key.
                bloom.add(str(first_field))

    return bloom
def init_user_access():
    """Return the set of name prefixes of the files in the "user" data dir.

    Each name reported by ``zhihu_util.get_file_names`` is split on
    ``USER_FILE_DELIMITER`` and only the part before the first delimiter
    is kept; duplicates collapse because the result is a set.
    """
    user_dir = zhihu_util.get_data_directory("user")
    prefixes = set()
    for name in zhihu_util.get_file_names(user_dir):
        prefixes.add(name.split(USER_FILE_DELIMITER)[0])
    return prefixes