def is_doc_exists(title):
    """Check whether a WikiPage row with the given title is stored.

    Args:
        title: Value compared against ``WikiPage.title``.

    Returns:
        A ``(found, page_id)`` tuple: ``(True, <row id>)`` when a row with
        that title exists, otherwise ``(False, None)``.
    """
    by_title = db_session.query(WikiPage).filter(WikiPage.title == title)
    page = by_title.first()
    if page:
        return True, page.id
    return False, None
def insert_to_wiki_page(title, state):
    """Insert a WikiPage row for ``title`` or update the state of an existing one.

    Args:
        title: Title used to look up (and, if missing, create) the row.
        state: Value stored in the row's ``state`` attribute.

    Returns:
        The ``id`` of the existing or newly created row.
    """
    existing = db_session.query(WikiPage).filter(WikiPage.title == title).first()
    if existing:
        # Row already present: only its state is overwritten.
        existing.state = state
        return existing.id

    page = WikiPage(title=title, state=state)
    db_session.add(page)
    # Flush before reading ``page.id`` (presumably a DB-generated key).
    db_session.flush()
    return page.id
def execute(self, context):
    """Run a single crawl pass over the pages that still have ``state == False``.

    When the WikiPage table has no rows at all, the pass starts from the
    hard-coded seed document instead. Progress is printed every 10 documents.

    Args:
        context: Not used by this method.
    """
    for depth in range(1):
        any_page = db_session.query(WikiPage).first()
        if any_page:
            pending = db_session.query(WikiPage).filter(
                WikiPage.state == False  # noqa: E712 - SQL expression, not a Python comparison
            ).all()
            titles = [as_dict(page).get('title') for page in pending]
            docs = {name: f'https://namu.wiki/w/{name}' for name in titles}
        else:
            # Empty table: start from the seed document.
            docs = {'사과': 'https://namu.wiki/w/사과'}

        total = len(docs)
        print(f'{depth}번째 depth docs 갯수: {total}\n')

        for idx, (doc_title, doc_url) in enumerate(docs.items()):
            if idx % 10 == 0:
                print(f'{idx}/{total}')
            crawl_doc(title=doc_title, url=doc_url)
def execute(self, context):
    """Crawl pages depth by depth until no page with ``state == False`` remains.

    Each pass collects the titles of all WikiPage rows whose ``state`` is
    False and hands every title to ``crawl_doc``. When the table is completely
    empty, the pass starts from the hard-coded seed title instead. The loop
    ends as soon as the table is non-empty and no row has ``state == False``.

    NOTE(review): termination relies on ``crawl_doc`` (defined elsewhere)
    eventually leaving no rows with ``state == False`` - confirm.

    Args:
        context: Not used by this method.
    """
    # mongo connector
    mc = MongoConnector()
    finder = mc.make_finder()

    # The depth counter has its own name: reusing one variable for both the
    # depth and the per-document index let the inner loop overwrite the depth,
    # so the reported depth was wrong from the second pass on.
    depth = 0
    while True:
        depth += 1
        if not db_session.query(WikiPage).first():
            # Empty table: start from the seed document.
            docs = ['사과']
        else:
            pending = db_session.query(WikiPage).filter(
                WikiPage.state == False  # noqa: E712 - SQL expression, not a Python comparison
            ).all()
            if not pending:
                break
            docs = [as_dict(page).get('title') for page in pending]

        total = len(docs)
        print(f'{depth}번째 depth docs 갯수: {total}\n')

        for idx, key in enumerate(docs):
            if idx % 10 == 0:
                print(f'{idx}/{total}')
            crawl_doc(key=key, finder=finder)
def insert_to_wiki_page_relation(parent_id, child_id, updated_at):
    """Insert a parent/child WikiPageRelation row or refresh its timestamp.

    Args:
        parent_id: Value matched against / stored in ``parent_id``.
        child_id: Value matched against / stored in ``child_id``.
        updated_at: Value written to ``updated_at`` on insert and on update.

    Returns:
        None. A new row is only added to the session; this function does not
        flush or commit.
    """
    same_parent = WikiPageRelation.parent_id == parent_id
    same_child = WikiPageRelation.child_id == child_id
    relation = (
        db_session.query(WikiPageRelation)
        .filter(same_parent)
        .filter(same_child)
        .first()
    )

    if relation:
        # Relation already recorded: only the timestamp changes.
        relation.updated_at = updated_at
        return

    db_session.add(
        WikiPageRelation(
            parent_id=parent_id,
            child_id=child_id,
            updated_at=updated_at,
        )
    )