Ejemplo n.º 1
0
def pre_remove():
    """Flag every relation that must be ignored, by setting ``removed=1``.

    The steps run in this order:

    1. Run the hand-maintained removals (``pre_remove_relation``,
       ``nodes_need_to_remove``, ``relations_need_to_remove``).
    2. Lock all Music subjects, then flag every relation that touches a
       locked subject.
    3. For each chunk of ids, flag relations touching an id that has no
       unlocked, non-Music subject row.
    4. For each id window, flag relations that have no reverse relation
       (an edge ``a -> b`` without a matching ``b -> a``).
    """
    print('pre remove')
    pre_remove_relation()
    nodes_need_to_remove(91493, 102098, 228714, 231982, 932, 84944, 78546)
    relations_need_to_remove([
        (91493, 8),
        (8108, 35866),
        (446, 123207),
        (123207, 27466),
        (123217, 4294),  # Gundam / Three Kingdoms
    ])

    # Music subjects are locked first so that the "locked" query below also
    # covers them.
    Subject.update(locked=1).where(Subject.subject_type == 'Music').execute()
    locked_ids = [
        s.id for s in Subject.select(Subject.id).where(Subject.locked == 1)
    ]
    Relation.update(removed=1).where(
        Relation.source.in_(locked_ids)
        | Relation.target.in_(locked_ids)).execute()

    for chunk in chunk_iter_list(list(range(SUBJECT_ID_START,
                                            SUBJECT_ID_END))):
        usable = list(
            Subject.select(Subject.id, Subject.subject_type,
                           Subject.locked).where(
                               Subject.id.in_(chunk)
                               & (Subject.subject_type != 'Music')
                               & (Subject.locked == 0)))
        for s in usable:
            assert s.subject_type != 'Music'
            assert s.locked == 0
        # Ids of this chunk without a usable subject row.
        missing_ids = list(set(chunk) - {s.id for s in usable})
        Relation.update(removed=1).where(
            Relation.source.in_(missing_ids)
            | Relation.target.in_(missing_ids)).execute()

    for window_start in tqdm.tqdm(
            range(SUBJECT_ID_START, SUBJECT_ID_END, CHUNK_SIZE)):
        window_end = window_start + CHUNK_SIZE
        # Loading by source OR target guarantees that, for every edge with
        # one end inside the window, its reverse edge is loaded as well.
        edges = list(Relation.select().where((
            ((Relation.source >= window_start)
             & (Relation.source < window_end))
            | ((Relation.target >= window_start)
               & (Relation.target < window_end)))
                                             & (Relation.removed == 0)))

        known_pairs = {(edge.source, edge.target) for edge in edges}
        one_way_ids = {
            edge.id
            for edge in edges
            if (edge.target, edge.source) not in known_pairs
        }

        # A distinct loop name avoids shadowing the window variable.
        for id_chunk in chunk_iter_list(list(one_way_ids)):
            Relation.update(removed=1).where(
                Relation.id.in_(id_chunk)).execute()
    print('finish pre remove')
Ejemplo n.º 2
0
def worker(start_job=None, work_fun=None):
    """Drain a job stack, feeding every job id to ``work_fun``.

    ``work_fun(job)`` must return an iterable of follow-up job ids.
    Follow-up jobs are always processed before the remaining ``start_job``
    entries, and both stacks are consumed last-in-first-out. Jobs already
    present in the module-level ``done_id`` set (defined outside this
    function) are skipped; each processed job is added to it.

    :param start_job: list of initial job ids; it is consumed in place.
        When ``None``, the ids of all subjects whose ``map`` is NULL are used.
    :param work_fun: a plain Python function (``types.FunctionType``).
    :raises ValueError: if ``work_fun`` is not a plain function.
    """
    if not isinstance(work_fun, types.FunctionType):
        raise ValueError('work_fun must be a function')

    discovered = []
    if start_job is None:
        unmapped = Subject.select(Subject.id).where(Subject.map.is_null())
        start_job = [row.id for row in unmapped]

    step = 0
    while True:
        # Progress output: number of queued jobs, once every 1000 steps.
        if step % 1000 == 0:
            print(len(discovered) + len(start_job), end='|')

        # Prefer follow-up jobs; fall back to the initial ones.
        pending = discovered or start_job
        if not pending:
            break

        job = pending.pop()
        if job not in done_id:
            discovered.extend(work_fun(job))
            done_id.add(job)
        step += 1
Ejemplo n.º 3
0
def return_map_json(map_id):
    """Return the nodes and edges of one map as a JSON response.

    ``map_id`` must be a decimal string other than ``'0'``; otherwise an
    empty body with status 400 is returned.

    Query arguments whose capitalized name is Book, Anime, Game or Real and
    whose value is the literal string ``'false'`` exclude subjects of that
    type. Music subjects are always excluded. Relations are selected by map
    id only and are not filtered by subject type.
    """
    if not (str.isdecimal(map_id) and map_id != '0'):
        return '', 400

    def begin_of(info):
        # The release-date fallback is looked up unconditionally, exactly as
        # when it is passed as the default argument of ``dict.get``.
        fallback = info.get('发售日', [None])[0]
        return info.get("放送开始", [fallback])[0]

    switchable = ["Book", "Anime", "Game", "Real"]
    excluded_types = ["Music"]
    for raw_name, flag in request.args.items():
        type_name = raw_name.capitalize()
        if type_name in switchable and flag == 'false':
            excluded_types.append(type_name)

    in_map = Subject.map == map_id
    type_allowed = Subject.subject_type.not_in(excluded_types)
    subjects = Subject.select(
        Subject.id, Subject.map, Subject.name, Subject.image, Subject.name_cn,
        Subject.info, Subject.subject_type,
    ).where(in_map & type_allowed)
    relations = Relation.select().where(Relation.map == map_id)

    edges = [model_to_dict(relation) for relation in relations]
    nodes = []
    for subject in subjects:
        nodes.append({
            'id': subject.id,
            'name': subject.name,
            'image': subject.image,
            'name_cn': subject.name_cn,
            'begin': begin_of(subject.info),
            'subject_type': subject.subject_type.lower(),
        })

    payload = format_data({'edges': edges, 'nodes': nodes})
    return jsonify(payload)
 def start_requests(self):
     """Yield one API request per Anime subject in the configured id range.

     The ids from ``SUBJECT_ID_START`` up to (not including)
     ``SUBJECT_ID_END`` are split into chunks of 29000, and each chunk is
     resolved to Anime subjects with a single query.
     """
     url_template = ('https://mirror.api.bgm.rin.cat/subject/'
                     '{}?responseGroup=large')
     all_ids = list(range(SUBJECT_ID_START, SUBJECT_ID_END))
     for id_batch in chunk_iter_list(all_ids, 29000):
         is_anime = Subject.subject_type == 'Anime'
         in_batch = Subject.id.in_(id_batch)
         anime_rows = Subject.select(Subject.id).where(is_anime & in_batch)
         for row in anime_rows:
             yield Request(url_template.format(row.id))
Ejemplo n.º 5
0
 def start_requests(self):
     """Yield one request per subject id in ``[start, end)``.

     Environment variables:

     * ``SPIDER_START`` - first id to request (default ``1``).
     * ``SPIDER_END`` - exclusive upper bound. When unset, it is the
       largest stored subject id plus 2000 (or 2000 if no subject is
       stored yet).
     * ``SPIDER_DONT_CACHE`` - when set to a non-empty value, every request
       carries ``meta={'dont_cache': True}``.

     :raises ValueError: if ``SPIDER_START`` or ``SPIDER_END`` is not an
         integer literal.
     """
     start = int(os.getenv('SPIDER_START', '1'))

     end = os.getenv('SPIDER_END')
     if end is None:
         # Query the database only when no explicit bound is configured;
         # MAX() yields None on an empty table, which is treated as 0.
         max_id = Subject.select(pw.fn.MAX(Subject.id)).scalar()
         end = (max_id or 0) + 2000
     end = int(end)

     meta = {'dont_cache': True} if os.getenv('SPIDER_DONT_CACHE') else {}
     for i in range(start, end):
         yield Request(url_from_id(i), meta=meta)
Ejemplo n.º 6
0
def first_run():
    """Give every usable subject and relation a map id and store it.

    1. Load all unlocked, non-Music subjects and all non-removed relations
       into memory, one id window at a time.
    2. Walk the relations with ``worker`` so that subjects connected by
       relations receive the same map id; a new ``MAP`` is created for a
       subject none of whose relations has a map id yet.
    3. Write the map ids back to ``Subject`` and ``Relation`` inside a
       single ``db.atomic()`` transaction.

    :raises KeyError: if a non-removed relation references a subject that
        was not loaded in step 1.
    """
    # ``+ 1`` keeps SUBJECT_ID_END itself covered when it falls exactly on a
    # window boundary; otherwise the produced windows are unchanged.
    window_starts = range(SUBJECT_ID_START, SUBJECT_ID_END + 1, CHUNK_SIZE)

    subjects = {}  # type: Dict[int, Subject]
    for i in tqdm.tqdm(window_starts):
        for s in Subject.select().where((Subject.id >= i)
                                        & (Subject.id < i + CHUNK_SIZE)
                                        & (Subject.locked == 0)
                                        & (Subject.subject_type != 'Music')):
            assert s.subject_type != 'Music'
            assert s.locked == 0
            s.map = 0  # 0 means "no map assigned yet"
            subjects[s.id] = s
    print('total', len(subjects), 'subjects')

    # subject id -> every relation object that starts or ends at it
    relation_from_id = defaultdict(set)
    edge_count = 0
    for i in window_starts:
        for edge in Relation.select().where((Relation.source >= i)
                                            &
                                            (Relation.source < i + CHUNK_SIZE)
                                            & (Relation.removed == 0)):
            assert i <= edge.source < i + CHUNK_SIZE
            # Both ends of a non-removed relation must have been loaded.
            assert subjects[edge.target]
            assert subjects[edge.source]
            edge_count += 1
            edge.map = 0
            relation_from_id[edge.source].add(edge)
            relation_from_id[edge.target].add(edge)
    print('total', edge_count, 'edges')

    def deal_with_node(source_id):
        """Label one subject and its relations; yield the relation targets."""
        s = subjects.get(source_id)
        if not s:
            return
        edges = relation_from_id[source_id]
        # Reuse the map id of an already labelled relation, if there is one.
        map_id = next((edge.map for edge in edges if edge.map), None)
        if not map_id:
            map_id = MAP.create().id
        for edge in edges:
            edge.map = map_id
        s.map = map_id
        done_id.add(source_id)
        for edge in edges:
            yield edge.target

    worker(list(subjects), deal_with_node)
    print('finish work, start save to db')

    # db.atomic() rolls the transaction back by itself when the block raises,
    # so no explicit rollback/re-raise is needed.
    with db.atomic():
        subject_ids_by_map = defaultdict(list)
        for s in subjects.values():
            subject_ids_by_map[s.map].append(s.id)
        for map_id, ids in tqdm.tqdm(subject_ids_by_map.items(),
                                     total=len(subject_ids_by_map)):
            # Chunked like the relation update below to bound the IN list.
            for chunk in chunk_iter_list(ids):
                Subject.update(map=map_id).where(
                    Subject.id.in_(chunk)).execute()

        relation_ids_by_map = defaultdict(set)
        for source, edges in relation_from_id.items():
            relation_ids_by_map[subjects[source].map].update(
                edge.id for edge in edges)
        for map_id, ids in tqdm.tqdm(relation_ids_by_map.items(),
                                     total=len(relation_ids_by_map)):
            for chunk in chunk_iter_list(list(ids)):
                Relation.update(map=map_id).where(
                    Relation.id.in_(chunk)).execute()

    print('finish save to db')
Ejemplo n.º 7
0
import copy
import types
from collections import defaultdict
from typing import Dict

import tqdm
import peewee as pw
from bgm.models import db, Subject, Relation

# Lower bound of the subject id ranges built from these constants.
SUBJECT_ID_START = 1
# Largest subject id currently stored; fetched with one query when this
# module is imported.
# NOTE(review): presumably None when the Subject table is empty - confirm.
SUBJECT_ID_END = Subject.select(pw.fn.MAX(Subject.id)).scalar()
# Alternative fixed bound, currently disabled:
# SUBJECT_ID_END = 6000
# Number of ids per chunk / id window; default chunk size of chunk_iter_list.
CHUNK_SIZE = 5000


class MAP:
    """In-memory map record whose ids come from a class-level counter.

    ``count`` starts at 1 and is incremented by ``create`` before the new
    instance is built, so the first created map has id 2. Instantiating the
    class directly reuses the current counter value without advancing it.
    """

    # Last id handed out by ``create`` (1 before any map was created).
    count = 1

    def __init__(self):
        current = MAP.count
        self.id = current

    @classmethod
    def create(cls):
        """Advance the counter and return an instance carrying the new id."""
        cls.count = cls.count + 1
        instance = cls()
        return instance


def chunk_iter_list(raw_list, chunk_size=CHUNK_SIZE):
    ds = copy.copy(raw_list)
    while ds:
        yield ds[:chunk_size]