def get_comic_id_with_status():
    """Fetch the id of every ``Comic`` row whose ``status`` is 0.

    Returns:
        The result of ``cursor.fetchall()`` for the query: one row per
        matching comic, with the id in column 0.
    """
    find_comic_id_list = "select id from Comic where status = 0"
    conn = comic_hentai_data_source.get_conn()
    # NOTE(review): closing the connection here assumes get_conn() hands out
    # an independent connection per call, as insert_into_category (which
    # closes its own) suggests -- confirm against comic_hentai_data_source.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(find_comic_id_list)
            return cursor.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
    finally:
        conn.close()
def get_classified_id_list():
    """Fetch the id of every row in the ``Classified`` table.

    Returns:
        The result of ``cursor.fetchall()`` for the query: one row per
        classification, with the id in column 0.
    """
    find_classified_id_list = "select id from Classified"
    conn = comic_hentai_data_source.get_conn()
    # NOTE(review): closing the connection here assumes get_conn() hands out
    # an independent connection per call, as insert_into_category (which
    # closes its own) suggests -- confirm against comic_hentai_data_source.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(find_classified_id_list)
            return cursor.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()
    finally:
        conn.close()
def insert_into_category(sample_size=20):
    """Link a random selection of comics to every classification.

    For each id returned by ``get_classified_id_list()``, up to
    ``sample_size`` rows are drawn at random from
    ``get_comic_id_with_status()`` and one ``Category`` row is inserted per
    (classification, comic) pair. The inserts are committed together after
    all classifications have been processed; if anything fails, nothing is
    committed and the connection is still closed.

    Args:
        sample_size: How many comics to link to each classification. When
            fewer comics are available, all of them are used instead of
            letting ``random.sample`` raise ``ValueError``.
    """
    print("开始获取漫画和用户ID")
    # %s is the placeholder style of the common Python MySQL drivers.
    # NOTE(review): assumes get_conn() returns such a connection -- confirm.
    insert_sql = (
        "insert into Category(classifiedId, targetId, targetType, created, "
        "updated, isDeleted, status)values "
        "(%s, %s, 0, UNIX_TIMESTAMP(), UNIX_TIMESTAMP(), 0, 0)"
    )
    conn = comic_hentai_data_source.get_conn()
    try:
        comic_id_list = get_comic_id_with_status()
        classified_id_list = get_classified_id_list()
        # random.sample refuses to draw more items than the population holds.
        pick_count = min(sample_size, len(comic_id_list))
        # One cursor serves every insert instead of one cursor per row.
        cursor = conn.cursor()
        try:
            for classified_id in classified_id_list:
                print("开始添加分类ID为" + str(classified_id[0]) + "的漫画")
                comic_id_list_child = random.sample(comic_id_list, pick_count)
                print("分类漫画获取完成,开始添加")
                for comic_id in comic_id_list_child:
                    cursor.execute(insert_sql, (classified_id[0], comic_id[0]))
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()
def get_comic():
    """Send the id and title of every ``Comic`` row to the search index.

    Each row is serialised to a JSON object with the keys ``id`` and
    ``title`` and sent with ``curl -XPUT`` to
    ``http://db.hope6537.com:9200/comichentai/comic/<id>``. The equivalent
    command line is printed before each request. A non-zero curl exit status
    is reported and the loop continues; if curl cannot be started at all the
    function reports that once and stops.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    find_comic_simple_info = "select id, title from Comic"
    conn = comic_hentai_data_source.get_conn()
    # The rows are fetched up front, so the database resources can be
    # released before the (slow) HTTP requests start.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(find_comic_simple_info)
            result = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()

    for r in result:
        comic_id = str(r[0])
        title = str(r[1])
        data = json.dumps({
            "id": comic_id,
            "title": title
        })
        url = "http://db.hope6537.com:9200/comichentai/comic/" + comic_id
        # Printed for the log only; the quoting shown here is cosmetic.
        print("curl -XPUT '" + url + "' -d '" + data + "'")
        try:
            # Arguments go to curl as an argv list, never through a shell, so
            # quotes or shell metacharacters in a title cannot alter the
            # command that is run.
            process = subprocess.Popen(
                ["curl", "-XPUT", url, "-d", data],
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
            )
            # curl's output is captured and discarded.
            process.communicate()
        except OSError as err:
            # curl could not be started; every later row would fail the same
            # way, so stop instead of repeating the error.
            print("unable to run curl: " + str(err))
            return
        if process.returncode != 0:
            print("curl exited with status " + str(process.returncode)
                  + " for " + url)
def write_comic_data_to_db(comic_list):
    """Insert the comics in ``comic_list`` that are not yet in ``Capture``.

    The ``comicId`` of every entry is looked up in ``Capture.foreignId``; the
    rows that already exist are passed, together with ``comic_list``, to
    ``flush_comic_data``. Each comic in the list it returns is written with
    ``insert_comic_data`` and then ``insert_capture_data``.

    Args:
        comic_list: Sequence of mappings, each providing a ``comicId`` key.
            An empty sequence is accepted and results in no database access.
    """
    foreign_ids = [comic['comicId'] for comic in comic_list]
    if not foreign_ids:
        # "foreignId in ()" is not valid SQL, so skip the lookup entirely.
        print(now() + "经过数据去重后,要插入的漫画数量为" + str(len(foreign_ids)))
        print(now() + "完成写入")
        return

    # One placeholder per id: the values are sent as parameters instead of
    # being spliced into the SQL text.
    # NOTE(review): assumes the driver uses the %s paramstyle -- confirm.
    placeholders = ",".join(["%s"] * len(foreign_ids))
    find_sql = ("select foreignId from Capture where foreignId in ("
                + placeholders + ")")

    conn = comic_hentai_data_source.get_conn()
    try:
        # Step 1: find the ids that are already stored.
        cursor = conn.cursor()
        try:
            cursor.execute(find_sql, tuple(foreign_ids))
            # Every row returned here is a duplicate to be excluded.
            values = cursor.fetchall()
        finally:
            cursor.close()

        # Step 2: filter the input against the duplicates.
        comic_list = flush_comic_data(comic_list, values)
        print(now() + "经过数据去重后,要插入的漫画数量为" + str(len(comic_list)))

        # Step 3: write each remaining comic, then its Capture row.
        # NOTE(review): no commit is issued here; presumably the insert
        # helpers commit on the connection they receive -- confirm.
        for comic in comic_list:
            comic_id = insert_comic_data(comic, conn)
            insert_capture_data(comic, comic_id, conn)
        print(now() + "完成写入")
    finally:
        conn.close()
def mysql_connect():
    """Read the ``ComicHentai`` schema and pass each table to ``initAll``.

    The table names of the schema are listed from
    ``information_schema.columns``. ``TestComic`` and ``TestUser`` are
    skipped. For every other table the column name, data type, nullability
    and comment of each column are fetched, the columns ``id``, ``created``,
    ``updated``, ``isDeleted`` and ``status`` are dropped, and
    ``initAll(table, columns)`` is called with what remains.
    """
    schema = 'ComicHentai'
    skipped_tables = ('TestComic', 'TestUser')
    excluded_columns = ('id', 'created', 'updated', 'isDeleted', 'status')

    conn = comic_hentai_data_source.get_conn()
    try:
        cursor = conn.cursor()
        try:
            # All tables currently present in the schema.
            # NOTE(review): assumes the driver uses the %s paramstyle.
            cursor.execute(
                "select distinct table_name from information_schema.columns "
                "where table_schema = %s order by table_schema,table_name",
                (schema,))
            tables = cursor.fetchall()
            print(tables)
            for row in tables:
                table = row[0]
                if table in skipped_tables:
                    continue
                # Restrict to the same schema so that a table with the same
                # name in another schema cannot contribute columns, and order
                # by declared column position for a deterministic result.
                cursor.execute(
                    "select column_name,data_type,is_nullable,column_comment "
                    "from information_schema.columns "
                    "where table_schema = %s and table_name = %s "
                    "order by ordinal_position",
                    (schema, table))
                columns = [column for column in cursor.fetchall()
                           if column[0] not in excluded_columns]
                print(columns)
                initAll(table, columns)
        finally:
            cursor.close()
    finally:
        conn.close()
# encoding:utf-8 import json import os # 收集已经完成下载的漫画ID import time import comic_hentai_data_source import oss2 conn = comic_hentai_data_source.get_conn() def now(): return time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime(int(time.time()))) if not conn: print(now() + "数据库连接初始化失败,停止操作") exit(0) else: print(now() + "数据库连接初始化成功") # 收集下载好的数据 def collect_already_download_comic_id(): print(now() + "开始收集数据") comic_id_list = [] for files in os.listdir("./ComicData"): if files.count(".json"): comic_id = files.split(".json")[0] # 查看下有没有这个文件夹
# encoding:utf-8 import json import os # 收集已经完成下载的漫画ID import time import comic_hentai_data_source import oss2 conn = comic_hentai_data_source.get_conn() def now(): return time.strftime("[%Y-%m-%d %H:%M:%S]", time.localtime(int(time.time()))) if not conn: print(now() + "数据库连接初始化失败,停止操作") exit(0) else: print(now() + "数据库连接初始化成功") # 收集下载好的数据 def collect_already_download_comic_id(): print(now() + "开始收集数据") comic_id_list = [] for files in os.listdir("./ComicData"): if files.count(".json"): comic_id = files.split(".json")[0] # 查看下有没有这个文件夹 if os.path.exists(os.getcwd() + "/ComicData/" + comic_id):