def nodes_need_to_remove(*node_ids):
    for node in node_ids:
        assert isinstance(node, int)
    Subject.update(locked=True).where(Subject.id.in_(node_ids)).execute()
    Relation.update(removed=True).where(
        Relation.source.in_(node_ids)
        | Relation.target.in_(node_ids)).execute()
def rebuild_map(map_id=None, item_id=None):
    if item_id:
        map_id = Subject.get_by_id(item_id).map
    if map_id:
        Subject.update(map=None).where(Subject.map == map_id).execute()
        Relation.update(map=None).where(Relation.map == map_id).execute()
def pre_remove():
    print('pre remove')
    pre_remove_relation()
    nodes_need_to_remove(91493, 102098, 228714, 231982, 932, 84944, 78546)
    relations_need_to_remove([
        (91493, 8),
        (8108, 35866),
        (446, 123207),
        (123207, 27466),
        (123217, 4294),  # Gundam / Three Kingdoms
    ])

    # Lock every music subject, then drop all relations touching locked subjects.
    id_to_remove = []
    Subject.update(locked=1).where(Subject.subject_type == 'Music').execute()
    for s in Subject.select(Subject.id).where(Subject.locked == 1):
        id_to_remove.append(s.id)
    Relation.update(removed=1).where(
        Relation.source.in_(id_to_remove)
        | Relation.target.in_(id_to_remove)).execute()

    # Drop relations that reference subject ids not present in the database.
    for chunk in chunk_iter_list(list(range(SUBJECT_ID_START, SUBJECT_ID_END))):
        db_data = list(
            Subject.select(Subject.id, Subject.subject_type,
                           Subject.locked).where(
                               Subject.id.in_(chunk)
                               & (Subject.subject_type != 'Music')
                               & (Subject.locked == 0)))
        for s in db_data:
            assert s.subject_type != 'Music'
            assert s.locked == 0
        non_exists_ids = list(set(chunk) - {x.id for x in db_data})
        Relation.update(removed=1).where(
            Relation.source.in_(non_exists_ids)
            | Relation.target.in_(non_exists_ids)).execute()

    # Drop one-way edges: keep an edge only if the reverse edge also exists.
    for i in tqdm.tqdm(range(SUBJECT_ID_START, SUBJECT_ID_END, CHUNK_SIZE)):
        relation_id_need_to_remove = set()
        source_to_target = defaultdict(dict)
        sources = list(Relation.select().where(
            (((Relation.source >= i) & (Relation.source < i + CHUNK_SIZE))
             | ((Relation.target >= i) & (Relation.target < i + CHUNK_SIZE)))
            & (Relation.removed == 0)))
        for edge in sources:
            source_to_target[edge.source][edge.target] = True
        for edge in sources:
            if not source_to_target[edge.target].get(edge.source):
                relation_id_need_to_remove.add(edge.id)
        for chunk in chunk_iter_list(list(relation_id_need_to_remove)):
            Relation.update(removed=1).where(Relation.id.in_(chunk)).execute()
    print('finish pre remove')
def subject_json(subject_id):
    try:
        s = Subject.get_by_id(subject_id)
        if not s.map:
            # Subjects without a map (including music subjects) are treated as missing.
            raise DoesNotExist
        return return_map_json(map_id=str(s.map))
    except DoesNotExist:
        return '没找到', 404  # "not found"
def do_insert(self, cursor, item):
    # the cursor is taken from the db pool
    # perform the actual insert
    if isinstance(item, SubjectItem):
        if not item['name']:
            item['name'] = item['name_cn']
        # if not item['name_cn']:
        #     item['name_cn'] = item['name']
        insert_sql = Subject.insert(**item).on_conflict(preserve=(
            Subject.name_cn,
            Subject.name,
            Subject.image,
            Subject.tags,
            Subject.locked,
            Subject.info,
            Subject.score_details,
            Subject.score,
            Subject.wishes,
            Subject.done,
            Subject.doings,
            Subject.on_hold,
            Subject.dropped,
        )).sql()
    elif isinstance(item, RelationItem):
        insert_sql = Relation.insert(
            id=f'{item["source"]}-{item["target"]}',
            **item).on_conflict(preserve=(Relation.relation, )).sql()
    elif isinstance(item, SubjectJsonItem):
        insert_sql = SubjectJson.insert(
            id=item['id'],
            info=json.dumps(dict(item))).on_conflict(
                preserve=(SubjectJson.info, )).sql()
    else:
        return
    cursor.execute(*insert_sql)
def worker(start_job=None, work_fun=None):
    if not isinstance(work_fun, types.FunctionType):
        raise ValueError('work_fun must be a function')
    yield_job = []
    if start_job is None:
        start_job = [
            x.id
            for x in Subject.select(Subject.id).where(Subject.map.is_null())
        ]

    def do(j):
        # time.sleep(0.1)
        if j in done_id:
            return
        # work_fun yields follow-up job ids, which are drained before the
        # remaining start jobs are picked up.
        for node in work_fun(j):
            yield_job.append(node)
        done_id.add(j)

    i = 0
    while True:
        if i % 1000 == 0:
            print(len(yield_job) + len(start_job), end='|')
        if yield_job:
            j = yield_job.pop()
            do(j)
        elif start_job:
            j = start_job.pop()
            do(j)
        else:
            break
        i += 1
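# Illustrative only: a minimal sketch of how worker() can be driven, using a
# hypothetical work_fun (not part of the project) that yields follow-up ids.
def _example_work_fun(subject_id):
    # Pretend each subject points at the next one; stop at id 5.
    if subject_id < 5:
        yield subject_id + 1


# worker([1], _example_work_fun) would process ids 1 through 5, adding each to
# the shared done_id set before following the ids the function yields.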
def return_map_json(map_id):
    if (not str.isdecimal(map_id)) or (map_id == '0'):
        return '', 400
    # "Music" subjects are always excluded; ?book=false etc. excludes other types.
    not_type = [
        "Music",
    ]
    for key, value in request.args.items():
        key = key.capitalize()
        if key in ["Book", "Anime", "Game", "Real"]:
            if value and value == 'false':
                not_type.append(key)
    subjects = Subject.select(
        Subject.id, Subject.map, Subject.name, Subject.image, Subject.name_cn,
        Subject.info, Subject.subject_type).where(
            (Subject.map == map_id)
            & (Subject.subject_type.not_in(not_type)))
    relations = Relation.select().where(Relation.map == map_id)
    data = format_data({
        'edges': [model_to_dict(x) for x in relations],
        'nodes': [{
            'id': x.id,
            'name': x.name,
            'image': x.image,
            'name_cn': x.name_cn,
            # "放送开始" is the air date; "发售日" is the release date.
            'begin': x.info.get("放送开始",
                                [x.info.get('发售日', [None])[0]])[0],
            'subject_type': x.subject_type.lower(),
        } for x in subjects],
    })
    return jsonify(data)
def start_requests(self):
    for chunk in chunk_iter_list(
            list(range(SUBJECT_ID_START, SUBJECT_ID_END)), 29000):
        for i in Subject.select(Subject.id).where(
                (Subject.subject_type == 'Anime') & (Subject.id.in_(chunk))):
            yield Request(('https://mirror.api.bgm.rin.cat/subject/'
                           '{}?responseGroup=large').format(i.id))
def start_requests(self):
    start = int(os.getenv('SPIDER_START', '1'))
    end = os.getenv(
        'SPIDER_END',
        str(Subject.select(pw.fn.MAX(Subject.id)).scalar() + 2000),
    )
    end = int(end)
    if os.getenv('SPIDER_DONT_CACHE'):
        meta = {'dont_cache': True}
    else:
        meta = {}
    for i in range(start, end):
        yield Request(url_from_id(i), meta=meta)
def remove_nodes(node_id, rebuild=True):
    Subject.delete_by_id(node_id)
    Relation.delete().where((Relation.source == node_id)
                            | (Relation.target == node_id)).execute()
def first_run():
    # Load every unlocked, non-music subject into memory, keyed by id.
    subjects = {}  # type: Dict[int, Subject]
    for i in tqdm.tqdm(range(SUBJECT_ID_START, SUBJECT_ID_END, CHUNK_SIZE)):
        for s in Subject.select().where((Subject.id >= i)
                                        & (Subject.id < i + CHUNK_SIZE)
                                        & (Subject.locked == 0)
                                        & (Subject.subject_type != 'Music')):
            assert s.subject_type != 'Music'
            assert s.locked == 0
            s.map = 0
            subjects[s.id] = s
    print('total', len(subjects), 'subjects')

    # Index every non-removed edge by both of its endpoints.
    relation_from_id = defaultdict(set)
    edge_count = 0
    for i in range(SUBJECT_ID_START, SUBJECT_ID_END, CHUNK_SIZE):
        for edge in Relation.select().where(
                (Relation.source >= i)
                & (Relation.source < i + CHUNK_SIZE)
                & (Relation.removed == 0)):
            assert i <= edge.source < i + CHUNK_SIZE
            assert subjects[edge.target]
            assert subjects[edge.source]
            edge_count += 1
            edge.map = 0
            relation_from_id[edge.source].add(edge)
            relation_from_id[edge.target].add(edge)
    print('total', edge_count, 'edges')

    def deal_with_node(source_id):
        # Reuse a map id if any incident edge already has one, otherwise create
        # a new map; then yield the neighbours so worker() spreads the same map
        # id across the whole connected component.
        s = subjects.get(source_id)
        if not s:
            return
        edges = relation_from_id[source_id]
        map_id = None
        for edge in edges:
            if edge.map:
                map_id = edge.map
                break
        if not map_id:
            m = MAP.create()
            map_id = m.id
        for edge in edges:
            edge.map = map_id
        s.map = map_id
        done_id.add(source_id)
        for edge in edges:
            yield edge.target

    worker(list(subjects.keys()), deal_with_node)
    print('finish work, start save to db')

    # Persist the computed map ids in a single transaction.
    with db.atomic() as txn:
        try:
            maps = defaultdict(list)
            for s in subjects.values():
                maps[s.map].append(s.id)
            for map_id, ids in tqdm.tqdm(maps.items(), total=len(maps)):
                Subject.update(map=map_id).where(Subject.id.in_(ids)).execute()

            maps = defaultdict(set)
            for source, edges in relation_from_id.items():
                s = subjects[source]
                for x in edges:
                    maps[s.map].add(x.id)
            for map_id, ids in tqdm.tqdm(maps.items(), total=len(maps)):
                for chunk in chunk_iter_list(list(ids)):
                    Relation.update(map=map_id).where(
                        Relation.id.in_(chunk)).execute()
        except Exception as e:
            txn.rollback()
            raise e
    print('finish save to db')
import copy
import types
from collections import defaultdict
from typing import Dict

import tqdm
import peewee as pw

from bgm.models import db, Subject, Relation

SUBJECT_ID_START = 1
SUBJECT_ID_END = Subject.select(pw.fn.MAX(Subject.id)).scalar()
# SUBJECT_ID_END = 6000
CHUNK_SIZE = 5000

# Shared set of ids already processed by worker() / first_run().
done_id = set()


class MAP:
    """In-memory map-id generator; each create() hands out the next id."""

    count = 1

    def __init__(self):
        self.id = MAP.count

    @classmethod
    def create(cls):
        cls.count += 1
        return cls()


def chunk_iter_list(raw_list, chunk_size=CHUNK_SIZE):
    # Yield successive chunk_size-sized slices of raw_list.
    ds = copy.copy(raw_list)
    while ds:
        yield ds[:chunk_size]
        ds = ds[chunk_size:]
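# Illustrative only: chunk_iter_list() walks a list in fixed-size slices,
# presumably to keep the big IN (...) queries above at a manageable size.
# The ids below are made up for the example.
if __name__ == '__main__':
    example_ids = list(range(1, 12))
    for chunk in chunk_iter_list(example_ids, chunk_size=5):
        print(chunk)
    # -> [1, 2, 3, 4, 5]
    # -> [6, 7, 8, 9, 10]
    # -> [11]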