def __init__(self):
    """Set up default settings, the logger, the lock and empty work lists."""
    # Settings: scan interval in seconds and the log file handed to LogHandle.
    self.set_monitor_period_sec = 3
    log_path = 'monitor_files.log'
    self.set_log_file_path = log_path
    self.logHandle = LogHandle(log_path)

    # Destination folder for stored files; empty until configured elsewhere.
    self.set_store_path = ''

    # Shared state: the lock plus the watched / changed entry lists.
    self.lock = threading.Lock()
    self.monitor_file_list = []
    self.change_file_list = []

    # Worker bookkeeping: live thread counter and the quit request flag.
    self.alive_thread_cnt = 0
    self.need_quit = False
def do_init(self):
    """Open the database, create the logger, reset counters and task lists,
    and make sure the image folder exists."""
    # Database: load the file first, then register the table on it.
    db = DBHandler()
    self.db_handler = db
    db.load('sex.db')
    db.add_table('sex')

    self.set_store_img_path = 'img'
    self.flag_quit = False

    # Logger: keep the handler and expose its bound ``log`` method directly.
    logger = LogHandle('sex_srap.log')
    self.log_handler = logger
    self.log = logger.log

    # Worker settings and run statistics.
    self.set_thread_cnt = 6
    self.info_run_thread_cnt = self.info_succeed_cnt = self.info_failed_cnt = 0

    # Task queues and the re-entrant lock stored alongside them.
    self.task_row_list = []
    self.task_update_row_list = []
    self.lock = threading.RLock()

    img_dir = self.set_store_img_path
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)
pass def arg_parser_init(): arg_parse = MyArgParse() arg_parse.add_option('-parse', [0, 1], 'parse img url') arg_parse.add_option('-url', [0, 1], 'parse img url') arg_parse.add_option('-download', [0, 1], 'download imgs') arg_parse.add_option('-thread', [1], 'set thread count') arg_parse.add_option('-d', [1], 'set img store folder') arg_parse.add_option('-h', [0], 'print help') return arg_parse gLogHandler = LogHandle('sex_scrap.log') def main(): arg_handler = arg_parser_init() if not arg_handler.parse(sys.argv) or arg_handler.check_option('-h'): print arg_handler return front_page_node = PPFrontPageNode() start_url = 'http://www.sex.com/' if arg_handler.check_option('-d'): front_page_node.set_download_folder( arg_handler.get_option_args('-d')[0]) if arg_handler.check_option('-thread'):
import os
import sys
import time
import subprocess

from common_lib import LogHandle

gLogHandle = LogHandle('copy.log')


def do_move(from_path, to_path):
    """Move ``from_path`` to ``to_path`` with the external ``mv`` command.

    Blocks until ``mv`` exits and returns its exit status.
    """
    # subprocess.call waits for the child directly, so the result is
    # available as soon as ``mv`` finishes instead of on the next
    # one-second poll tick.
    return subprocess.call(['mv', from_path, to_path])


def do_copy(from_dir, to_dir):
    """Collect the regular files directly inside ``from_dir`` and log the count.

    Returns True when there is nothing to process.
    """
    # NOTE(review): only the part of this function up to the empty-directory
    # early return is visible in this excerpt; ``to_dir`` is not used in it.
    # Confirm against the complete file before relying on this definition.
    candidates = (os.path.join(from_dir, item) for item in os.listdir(from_dir))
    file_list = [path for path in candidates if os.path.isfile(path)]
    gLogHandle.log('Total file cnt [%d]' % len(file_list))
    if not file_list:
        gLogHandle.log('All done')
        return True
import time import imghdr import sys import platform import multiprocessing import sqlite3 import datetime import ConfigParser import multiprocessing import socket import mutex import Queue from multiprocessing import Process, Pipe from common_lib import MyArgParse from common_lib import LogHandle gstLogHandler = LogHandle('hkpic.log') Config_Path = 'config.ini' Cookie_Path = 'cookie' class LoginMethod: def __init__(self, set_get_cookie=False): self.log = gstLogHandler.log self.login_url = 'http://hk-pic2.xyz/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1' #http://hkpic-forum.xyz/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes&inajax=1 self.status_url = 'http://hk-pic2.xyz/forum-18-2.html' #http://hkpic-forum.xyz/forum.php self.config_file_path = Config_Path self.cookie_file_path = Cookie_Path self.username = '' self.password = ''
class MonitorFile:
    """Poll a list of files and store a timestamped copy of each one whose
    modification time has changed since the previous scan.

    Typical use: ``set_store_folder`` and ``add_monitor_file``, then
    ``start`` to launch the polling thread and ``stop`` to end it.
    """

    # Number of bytes read per chunk while copying a changed file.
    _COPY_CHUNK_SIZE = 1 * 1024 * 1024

    def __init__(self):
        """Create a monitor with a 3 second period and no watched files."""
        self.set_monitor_period_sec = 3          # seconds slept between scans
        self.set_log_file_path = 'monitor_files.log'
        self.logHandle = LogHandle(self.set_log_file_path)
        self.set_store_path = ''                 # folder receiving the copies
        self.lock = threading.Lock()             # guards the two lists below
        self.monitor_file_list = list()          # dicts: file_path, last_change_time
        self.change_file_list = list()           # entries found changed in a scan
        self.alive_thread_cnt = 0                # running monitor_thread calls
        self.need_quit = False                   # set by stop() to end the loop

    def add_monitor_file(self, file_path):
        """Start watching ``file_path``; its first scan counts as a change."""
        dict_item = {'file_path': file_path[:], 'last_change_time': None}
        with self.lock:
            self.monitor_file_list.append(dict_item)

    def delete_monitor_file(self, file_path):
        """Stop watching ``file_path``; unknown paths are ignored."""
        with self.lock:
            self.monitor_file_list = [
                item for item in self.monitor_file_list
                if item['file_path'] != file_path
            ]

    def set_monitor_period(self, time_sec):
        """Set the number of seconds slept between two scans."""
        self.set_monitor_period_sec = time_sec

    def set_store_folder(self, folder_path):
        """Select the folder that receives copies, creating it if missing."""
        if not os.path.exists(folder_path):
            # makedirs also creates missing parent folders.
            os.makedirs(folder_path)
        self.set_store_path = folder_path[:]

    def get_new_file_path(self, old_file_path):
        """Return the store path for a copy of ``old_file_path``.

        The result is the store folder joined with the original base name,
        followed by every digit of the current local date and time.
        """
        file_name = os.path.basename(old_file_path)
        stamp = ''.join(
            c for c in str(datetime.datetime.now()) if '0' <= c <= '9')
        return os.path.join(self.set_store_path, file_name) + stamp

    def do_if_change(self, file_path):
        """Copy ``file_path`` into the store folder under a timestamped name.

        An existing destination is never overwritten; a warning is logged
        instead. Raises IOError/OSError when the source cannot be read or
        the destination cannot be written.
        """
        new_file_name = self.get_new_file_path(file_path)
        if os.path.exists(new_file_name):
            self.logHandle.log(
                'Warning, back file already exist [%s]' % new_file_name)
            return
        # Binary mode keeps the copy byte-identical. The source is opened
        # first so an unreadable source does not leave an empty copy behind.
        with open(file_path, 'rb') as r_fd:
            with open(new_file_name, 'wb') as w_fd:
                while True:
                    content = r_fd.read(self._COPY_CHUNK_SIZE)
                    if not content:
                        break
                    w_fd.write(content)
        self.logHandle.log('Copy [%s] to [%s]' % (file_path, new_file_name))

    def start(self):
        """Launch ``monitor_thread`` in a new thread."""
        pro = threading.Thread(target=self.monitor_thread)
        pro.start()

    def stop(self):
        """Request the polling loop to quit and wait until it has ended."""
        self.need_quit = True
        while self.alive_thread_cnt:
            self.logHandle.log('waiting for thread quit...')
            time.sleep(0.5)
        self.logHandle.log('Quit now')

    def monitor_thread(self):
        """Scan the watched files once per period until ``need_quit`` is set."""
        self.alive_thread_cnt += 1
        try:
            while True:
                # ``with`` releases the lock even if a scan raises.
                with self.lock:
                    self._scan_once()
                if self.need_quit:
                    break
                time.sleep(self.set_monitor_period_sec)
                if self.need_quit:
                    break
        finally:
            # Always give the count back, otherwise stop() would wait forever.
            self.alive_thread_cnt -= 1

    def _scan_once(self):
        """Detect changed files, then copy each one. Caller holds ``lock``."""
        for file_item in self.monitor_file_list:
            file_path = file_item['file_path']
            self.logHandle.log('Check files [%s]' % file_path)
            try:
                cur_time_info = os.stat(file_path).st_mtime
            except OSError:
                # Missing or inaccessible right now: skip until a later scan.
                continue
            if cur_time_info != file_item['last_change_time']:
                self.change_file_list.append(file_item)
                file_item['last_change_time'] = cur_time_info
                self.logHandle.log('Add File [%s] to change list' % file_path)

        # Take the pending entries and reset the shared list up front so a
        # failing copy cannot leave stale entries for the next scan.
        pending, self.change_file_list = self.change_file_list, list()
        for file_item in pending:
            file_path = file_item['file_path']
            self.logHandle.log('File [%s] will be processed' % file_path)
            try:
                self.do_if_change(file_path)
            except (IOError, OSError) as err:
                # One bad file must not end monitoring of the others.
                self.logHandle.log(
                    'Failed to back up [%s]: %s' % (file_path, err))
import os import sys import json import requests import re from common_lib import LogHandle gstLogHandler = LogHandle('bing.log') gImgJsonTmpFile = 'img_list_tmp.json' class BingPic: def __init__(self): self.log = gstLogHandler.log self.m_set_store_folder = '' self.m_last_json_content = '' self.m_page_idx = 0 self.m_set_imgs_json_url = 'https://cn.bing.com/HPImageArchive.aspx?format=js&idx=6&n=8' self.m_session_handler = requests.session() self.do_init() pass def do_init(self): self.set_store_folder('Imgs') set_headers = \ {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0', 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3'} self.m_session_handler.headers = set_headers pass def get_url_to_parse(self, last_json_content=None):
import requests import os import sys import json from lxml import etree import re import datetime import threading import hashlib import time from common_lib import MyArgParse, LogHandle, ThreadHandler from sqlite_util import DBRowHuaBan, DBHandler, DBRow, DBItem gstLogHandle = LogHandle('geo.log') class DBRowGeo(DBRow): def do_init(self): self.item_list.append(DBItem('pageUrl', 'CHAR')) self.item_list.append(DBItem('title', 'CHAR')) self.item_list.append(DBItem('profileUrl', 'CHAR')) self.item_list.append(DBItem('altText', 'CHAR')) self.item_list.append(DBItem('url', 'CHAR')) self.item_list.append(DBItem('url_hash', 'CHAR', is_primary=True)) self.item_list.append(DBItem('is_done', 'INT')) pass def generate_select_cmd_str(self, table_name): ret_str = ' select ' for idx, item in enumerate(self.item_list): ret_str += table_name + '.' + item.name