def __init__(self, use_redis=False, debug=False, file_name_head="", filename="", analysis=False):
    """
    Set up the result-file paths for one user and optionally load the data.

    :param use_redis: accepted for signature compatibility; not read here
    :param debug: stored on ``self.debug``
    :param file_name_head: per-user directory name appended to ``BASE_DIR``
    :param filename: explicit data file; when it is empty and
        ``file_name_head`` is not, it defaults to ``mood_data.csv`` inside
        the user's result directory
    :param analysis: when falsy, ``read_data_from_csv()`` is called at the
        end of construction
    """
    self.debug = debug
    self.df = None
    self.file_name_head = file_name_head
    self.filename = filename

    result_dir = BASE_DIR + file_name_head + '/data/result/'
    util.check_dir_exist(result_dir)
    self.N_E_FILE_NAME = result_dir + 'n_E_mood_data.csv'
    self.CMT_RESULT_NAMES = result_dir + 'cmt_result_names.csv'

    # Fall back to the default data file only when the caller gave a
    # user directory but no explicit file.
    needs_default_file = self.filename == '' and self.file_name_head != ''
    if needs_default_file:
        self.filename = result_dir + 'mood_data.csv'

    if analysis:
        return
    self.read_data_from_csv()
def __init__(self, use_redis=False, debug=False, analysis=False, recover=False, username='', mood_begin=0,
             mood_num=-1, stop_time='-1', from_web=True, nickname='', no_delete=True, cookie_text='',
             export_excel=False, export_csv=True):
    """
    Initialise the base spider, log in when needed, and set up friend-data paths.

    :param use_redis: whether to use redis (forwarded to ``QQZoneSpider.__init__``)
    :param debug: whether debug mode is enabled (forwarded to ``QQZoneSpider.__init__``)
    :param analysis: ``login()`` is called only when this equals ``False``
        and ``self.g_tk`` is 0.
        NOTE(review): the previous docstring said "true = run the crawler,
        then the analysis; false = analysis only", which looks at odds with
        the login condition below - confirm the intended meaning.
    :param export_excel: stored on ``self.export_excel``
    :param export_csv: stored on ``self.export_csv``

    All remaining parameters are forwarded unchanged to
    ``QQZoneSpider.__init__``.
    """
    spider_options = dict(recover=recover, username=username, mood_num=mood_num,
                          mood_begin=mood_begin, stop_time=stop_time, from_web=from_web,
                          nickname=nickname, no_delete=no_delete, cookie_text=cookie_text)
    QQZoneSpider.__init__(self, use_redis, debug, **spider_options)

    # Kept as an equality test (not ``not analysis``) so that non-bool
    # values behave exactly as before.
    login_required = self.g_tk == 0 and analysis == False
    if login_required:
        self.login()

    friend_prefix = BASE_DIR + 'friend/' + self.file_name_head
    self.FRIEND_LIST_FILE_NAME = friend_prefix + '_friend_list.json'
    self.FRIEND_DETAIL_FILE_NAME = friend_prefix + '_friend_detail.json'
    self.FRIEND_DETAIL_LIST_FILE_NAME = friend_prefix + '_friend_detail_list.csv'
    self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_prefix + '_friend_detail_list.xlsx'

    # Avatars are downloaded into the web app's static folder so that the
    # web pages can reference them.
    self.FRIEND_HEADER_IMAGE_PATH = '../web/static/image/header/' + self.file_name_head + '/'
    util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)

    self.friend_detail, self.friend_list = [], []
    self.friend_df = pd.DataFrame()
    # The redis connection is opened regardless of ``use_redis``.
    self.re = self.connect_redis()
    self.friend_thread_list = []
    self.export_excel = export_excel
    self.export_csv = export_csv
def init_log(self):
    """
    Return the shared ``'log'`` logger with a file handler under ``BASE_DIR/logs/``.

    The log file is ``<logging_dir><get_now_time()>.log``, opened in append
    mode as UTF-8, and records are formatted with ``LOGGING_FORMAT``. Both
    the logger and the handler are set to ``INFO``.

    ``logging.getLogger('log')`` returns the same object on every call, so a
    handler is only attached when no ``FileHandler`` for the same file is
    already present; otherwise repeated calls would write every record
    several times and keep extra file handles open.

    :return: the configured ``logging.Logger``
    """
    logging_dir = os.path.join(BASE_DIR, "logs/")
    if self.debug:
        print("logging_dir:", logging_dir)
    check_dir_exist(logging_dir)

    logger = logging.getLogger('log')
    logger.setLevel(logging.INFO)
    log_path = logging_dir + get_now_time() + ".log"

    # FileHandler stores the absolute path in ``baseFilename``; compare
    # against the same normalised form.
    target = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        # NOTE: a TimedRotatingFileHandler (when='S', backupCount=5,
        # suffix "%Y%m%d.log") was tried here but had a bug: it could not
        # split the log by day. Reference:
        # https://blog.csdn.net/weixin_38107388/article/details/90639151
        file_handler = logging.FileHandler(log_path, encoding='utf-8', mode='a')
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(logging.Formatter(LOGGING_FORMAT))
        logger.addHandler(file_handler)
    return logger
def init_log(self):
    """
    Return the shared ``'log'`` logger writing to the user's log directory.

    The log file is ``<self.USER_BASE_DIR>log/<self.username>.log`` (UTF-8).
    The logger level is ``INFO``; the file handler itself is set to
    ``DEBUG``, so the logger level is the effective filter.

    ``logging.getLogger('log')`` returns the same object on every call, so a
    handler is only attached when no ``FileHandler`` for the same file is
    already present; otherwise repeated calls would write every record
    several times and keep extra file handles open.

    :return: the configured ``logging.Logger``
    """
    import os  # local import: only needed to normalise the path below

    logging_dir = self.USER_BASE_DIR + 'log/'
    if self.debug:
        print("logging_dir:", logging_dir)
    util.check_dir_exist(logging_dir)

    log_path = logging_dir + self.username + '.log'
    logger = logging.getLogger('log')
    logger.setLevel(logging.INFO)

    # FileHandler stores the absolute path in ``baseFilename``; compare
    # against the same normalised form.
    target = os.path.abspath(log_path)
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == target
        for handler in logger.handlers
    )
    if not already_attached:
        formatter = logging.Formatter(
            '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s'
        )
        file_handler = logging.FileHandler(log_path, encoding='utf-8')
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
def init_file_name(self):
    """
    Configure file logging and derive every per-user path used by this object.

    Sets ``self.USER_BASE_DIR`` to ``BASE_DIR/<username>/``, points
    ``logging.basicConfig`` at ``log/<username>.log`` inside it (file mode
    ``w+``, level ``INFO``), fills in the data, error and image path
    attributes, and runs ``util.check_dir_exist`` on each directory.
    Progress is printed unconditionally.
    """
    user_root = BASE_DIR + self.username + '/'
    self.USER_BASE_DIR = user_root

    log_dir = user_root + 'log/'
    print("logging_dir:", log_dir)
    util.check_dir_exist(log_dir)
    logging.basicConfig(
        filename=log_dir + self.username + '.log',
        filemode='w+',
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S')
    logging.info('file_name_head:' + self.username)

    # Downloaded data files.
    data_dir = user_root + 'data/'
    self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
    self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
    self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
    self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

    # Error records, plus the matching unikey lists.
    error_dir = user_root + 'error/'
    self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
    self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
    self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
    self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
    self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
    self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

    self.SMALL_IMAGE_DIR = user_root + 'qq_image/'
    self.BIG_IMAGE_DIR = user_root + 'qq_big_image/'

    for directory in (data_dir, error_dir, self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
        util.check_dir_exist(directory)
    print("Init file Name Finish:", self.USER_BASE_DIR)
def init_analysis_path(self):
    """
    Derive the paths used by the analysis step and check their directories.

    Reads ``self.username`` and ``self.USER_BASE_DIR`` (which must already
    be set). Note that ``self.friend_dir`` holds the path of the
    ``friend_detail_list.csv`` file, not a directory, and that
    ``self.label_path`` is the same directory the label files live in.
    """
    friend_root = BASE_DIR + self.username + '/friend/'
    self.friend_dir = friend_root + 'friend_detail_list.csv'
    self.history_like_agree_file_name = friend_root + 'history_like_list.json'

    result_root = self.USER_BASE_DIR + "data/result/"
    self.MOOD_DATA_FILE_NAME = result_root + 'mood_data.csv'
    self.MOOD_DATA_EXCEL_FILE_NAME = result_root + 'mood_data.xlsx'

    label_root = self.USER_BASE_DIR + "data/label/"
    self.LABEL_FILE_CSV = label_root + 'label_data.csv'
    self.LABEL_FILE_EXCEL = label_root + 'label_data.xlsx'

    self.label_path = self.USER_BASE_DIR + 'data/label/'
    self.image_path = self.USER_BASE_DIR + 'image/'

    # The label directory is checked twice (once per name), as before.
    for directory in (result_root, label_root, self.label_path, self.image_path):
        util.check_dir_exist(directory)
def init_file_name(self):
    """
    Derive the data, error and image paths under ``self.USER_BASE_DIR``.

    ``self.USER_BASE_DIR`` and ``self.file_name_head`` must already be set.
    Each directory is passed to ``util.check_dir_exist`` and a completion
    message is printed unconditionally.
    """
    logging.info('file_name_head:' + self.file_name_head)
    user_root = self.USER_BASE_DIR

    # Downloaded data files.
    data_dir = user_root + 'data/'
    self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
    self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
    self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
    self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

    # Error records, plus the matching unikey lists.
    error_dir = user_root + 'error/'
    self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
    self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
    self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
    self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
    self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
    self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

    self.SMALL_IMAGE_DIR = user_root + 'qq_image/'
    self.BIG_IMAGE_DIR = user_root + 'qq_big_image/'

    for directory in (data_dir, error_dir, self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR):
        util.check_dir_exist(directory)
    print("Init file Name Finish:", self.USER_BASE_DIR)
def init_file_name(self):
    """
    Initialise all file names.

    Sets ``self.USER_BASE_DIR`` to ``BASE_DIR/<username>/``, points
    ``logging.basicConfig`` at ``log/<username>.log`` inside it, derives the
    data, error, image and friend paths, derives the web static image paths
    under ``BASE_PATH``, runs ``util.check_dir_exist`` on each directory,
    and finally calls ``init_analysis_path()``. Console output is only
    produced when ``self.debug`` is truthy.

    :return: None
    """
    user_root = BASE_DIR + self.username + '/'
    self.USER_BASE_DIR = user_root

    log_dir = user_root + 'log/'
    if self.debug:
        print("logging_dir:", log_dir)
    util.check_dir_exist(log_dir)
    logging.basicConfig(
        filename=log_dir + self.username + '.log',
        filemode='w+',
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S')
    logging.info('file_name_head:' + self.username)

    # Downloaded data files.
    data_dir = user_root + 'data/'
    self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
    self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
    self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
    self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

    # Error records, plus the matching unikey lists.
    error_dir = user_root + 'error/'
    self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
    self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
    self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
    self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
    self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
    self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

    self.SMALL_IMAGE_DIR = user_root + 'qq_image/'
    self.BIG_IMAGE_DIR = user_root + 'qq_big_image/'

    # Same check order as before: sub-directories first, then the user root.
    for directory in (data_dir, error_dir, self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR, user_root):
        util.check_dir_exist(directory)

    friend_dir = user_root + 'friend/'
    self.FRIEND_LIST_FILE_NAME = friend_dir + 'friend_list.json'
    self.FRIEND_DETAIL_FILE_NAME = friend_dir + 'friend_detail.json'
    self.FRIEND_DETAIL_LIST_FILE_NAME = friend_dir + 'friend_detail_list.csv'
    self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_dir + 'friend_detail_list.xlsx'

    # Avatars are downloaded into the web app's static folder so that the
    # web pages can reference them.
    web_image_root = BASE_PATH + '/src/web/static/image/' + self.username + '/'
    self.FRIEND_HEADER_IMAGE_PATH = web_image_root + 'header/'
    self.web_image_bash_path = web_image_root

    util.check_dir_exist(friend_dir)
    util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)
    self.init_analysis_path()
    if self.debug:
        print("Init file Name Finish:", self.USER_BASE_DIR)
def init_file_name(self):
    """
    Initialise all file names.

    Sets ``self.USER_BASE_DIR`` to ``BASE_DIR/<username>/``, stores the
    logger returned by ``self.init_log()`` on ``self.logging``, derives the
    data, error, image and friend paths, derives the web static image paths
    under ``BASE_PATH``, runs ``util.check_dir_exist`` on each directory,
    and finally calls ``init_analysis_path()``.

    :return: None
    """
    user_root = BASE_DIR + self.username + '/'
    # Must be assigned before init_log() is called.
    self.USER_BASE_DIR = user_root
    self.logging = self.init_log()
    self.logging.info('file_name_head:' + self.username)

    # Downloaded data files.
    data_dir = user_root + 'data/'
    self.CONTENT_FILE_NAME = data_dir + 'QQ_content.json'
    self.LIKE_DETAIL_FILE_NAME = data_dir + 'QQ_like_detail.json'
    self.LIKE_LIST_NAME_FILE_NAME = data_dir + 'QQ_like_list_name.json'
    self.MOOD_DETAIL_FILE_NAME = data_dir + 'QQ_mood_detail.json'

    # Error records, plus the matching unikey lists.
    error_dir = user_root + 'error/'
    self.ERROR_LIKE_DETAIL_FILE_NAME = error_dir + 'QQ_like_detail_error.json'
    self.ERROR_LIKE_LIST_NAME_FILE_NAME = error_dir + 'QQ_like_list_name_error.json'
    self.ERROR_MOOD_DETAIL_FILE_NAME = error_dir + 'QQ_mood_detail_error.json'
    self.ERROR_LIKE_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_like_detail_error_unikey.txt'
    self.ERROR_LIKE_LIST_NAME_UNIKEY_FILE_NAME = error_dir + 'QQ_like_list_error_unikey.txt'
    self.ERROR_MOOD_DETAIL_UNIKEY_FILE_NAME = error_dir + 'QQ_mood_detail_error_unikey.txt'

    self.SMALL_IMAGE_DIR = user_root + 'qq_image/'
    self.BIG_IMAGE_DIR = user_root + 'qq_big_image/'

    # Same check order as before: sub-directories first, then the user root.
    for directory in (data_dir, error_dir, self.SMALL_IMAGE_DIR, self.BIG_IMAGE_DIR, user_root):
        util.check_dir_exist(directory)

    friend_dir = user_root + 'friend/'
    self.FRIEND_LIST_FILE_NAME = friend_dir + 'friend_list.json'
    self.FRIEND_DETAIL_FILE_NAME = friend_dir + 'friend_detail.json'
    self.FRIEND_DETAIL_LIST_FILE_NAME = friend_dir + 'friend_detail_list.csv'
    self.FRIEND_DETAIL_EXCEL_FILE_NAME = friend_dir + 'friend_detail_list.xlsx'

    # Avatars are downloaded into the web app's static folder so that the
    # web pages can reference them.
    web_image_root = BASE_PATH + '/src/web/static/image/' + self.username + '/'
    self.FRIEND_HEADER_IMAGE_PATH = web_image_root + 'header/'
    self.web_image_bash_path = web_image_root

    util.check_dir_exist(friend_dir)
    util.check_dir_exist(self.FRIEND_HEADER_IMAGE_PATH)
    self.init_analysis_path()
    if self.debug:
        print("Init file Name Finish:", self.USER_BASE_DIR)
def __init__(self, username):
    """
    Record the user's temp directory and run ``check_dir_exist`` on it.

    :param username: per-user directory name; the temp directory is
        ``BASE_DIR + username + '/temp/'``
    """
    self.temp_dir = user_temp_dir = BASE_DIR + username + '/temp/'
    check_dir_exist(user_temp_dir)
def __init__(self, use_redis=False, debug=False, mood_begin=0, mood_num=-1, stop_time='-1',
             download_small_image=False, download_big_image=False,
             download_mood_detail=True, download_like_detail=True, download_like_names=True, recover=False,
             cookie_text=None, from_web=False, username='', nickname='', no_delete=True, pool_flag='127.0.0.1'):
    """
    Store the crawl options, resolve the account, and set up logging and user info.

    :param use_redis: when truthy, ``self.re`` is set from ``connect_redis()``
    :param debug: stored on ``self.debug``
    :param mood_begin: stored on ``self.mood_begin``
    :param mood_num: stored on ``self.mood_num``
    :param stop_time: ``'-1'`` stores ``-1`` on ``self.stop_time``; any other
        value is converted with ``util.get_mktime``
    :param download_small_image: stored on ``self.download_small_image``
    :param download_big_image: stored on ``self.download_big_image``
    :param download_mood_detail: stored on ``self.download_mood_detail``
    :param download_like_detail: stored on ``self.download_like_detail``
    :param download_like_names: stored on ``self.download_like_names``
    :param recover: stored on ``self.recover``
    :param cookie_text: stored on ``self.cookie_text``
    :param from_web: when truthy, ``username``/``nickname`` are taken from
        the arguments; otherwise they come from ``get_username_password()``
    :param username: account name used when ``from_web`` is truthy
    :param nickname: nickname used when ``from_web`` is truthy
    :param no_delete: stored on ``self.no_delete``
    :param pool_flag: stored on ``self.pool_flag``
    """
    # Download options.
    self.mood_begin = mood_begin
    self.mood_num = mood_num
    self.recover = recover
    self.download_small_image = download_small_image
    self.download_big_image = download_big_image
    self.download_mood_detail = download_mood_detail
    self.download_like_detail = download_like_detail
    self.download_like_names = download_like_names
    self.thread_num = 5
    self.thread_list = []
    self.no_delete = no_delete
    if stop_time != '-1':
        self.stop_time = util.get_mktime(stop_time)
    else:
        self.stop_time = -1
    self.begin_time = datetime.datetime.now()
    self.host = 'https://user.qzone.qq.com'
    self.h5_host = 'h5.qzone.qq.com'
    self.http_host = 'http://user.qzone.qq.com'
    self.use_redis = use_redis
    self.debug = debug
    self.cookie_text = cookie_text
    self.pool_flag = pool_flag

    if from_web:
        self.username = username
        self.file_name_head = username
        self.nickname = nickname
    else:
        self.username, self.password, self.file_name_head, self.nick_name = self.get_username_password()
        # This branch historically only set ``nick_name``, but the
        # user_info setup below reads ``self.nickname`` and would raise
        # AttributeError for a new user. Expose both spellings.
        self.nickname = self.nick_name

    self.mood_host = self.http_host + '/' + self.username + '/mood/'
    # While crawling friends' moods, ``username`` is switched to the
    # friend's QQ number, so keep a copy of the original here.
    self.raw_username = deepcopy(self.username)

    self.headers = {
        'host': 'user.qzone.qq.com',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.8',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:66.0) Gecko/20100101 Firefox/66.0',
        'connection': 'keep-alive'
    }
    # Same headers for the h5 endpoints, with only the host swapped.
    self.h5_headers = deepcopy(self.headers)
    self.h5_headers['host'] = self.h5_host

    self.USER_BASE_DIR = BASE_DIR + self.username + '/'
    logging_dir = self.USER_BASE_DIR + 'log/'
    util.check_dir_exist(logging_dir)
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                        datefmt='%a, %d %b %Y %H:%M:%S',
                        filename=logging_dir + self.username + '.log',
                        filemode='w+')
    if use_redis:
        self.re = self.connect_redis()

    # Reuse previously saved user info when load() returns one; otherwise
    # start a fresh record seeded with the account name and nickname.
    self.user_info = UserInfo(self.username).load()
    if self.user_info is None:
        self.user_info = UserInfo(self.username)
        self.user_info.QQ = self.username
        self.user_info.nickname = self.nickname
def check_dirs(self):
    """
    Run ``check_dir_exist`` on ``DATA_DIR`` and on ``BASE_DIR/download_image/``.

    The image directory is built with a two-argument ``os.path.join`` so the
    separator is handled by the standard library. The previous code
    concatenated the path by hand and passed the single result to
    ``os.path.join``, which made the call a no-op.
    """
    check_dir_exist(DATA_DIR)
    check_dir_exist(os.path.join(BASE_DIR, "download_image/"))
def test_check_dir(self):
    """
    Smoke test: call ``check_dir_exist`` on ``BASE_DIR/test1``.

    Nothing is asserted; the test only fails if the call raises.
    """
    check_dir_exist(os.path.join(BASE_DIR, 'test1'))
def __init__(self):
    """
    Run ``check_dir_exist`` on ``self.temp_dir``.

    ``temp_dir`` is not assigned here, so it must already be available on
    the instance (presumably as a class attribute - not visible from this
    method).
    """
    temp_dir = self.temp_dir
    check_dir_exist(temp_dir)