def get_weight(tag1, tag2, method=WeightFuncs.inter_divide_union):
    """Compute overlap statistics and two weights for a pair of tags.

    The question collections of both tags are fetched through
    ``QuestionsCDN.get_tag_questions`` and combined with ``|``, so they are
    expected to be set-like.

    :param tag1: first tag.
    :param tag2: second tag.
    :param method: callable invoked as ``method(n1, n2, nu)`` where ``n1`` and
        ``n2`` are the two collection sizes and ``nu`` is the size of their union.
    :return: ``[n1, n1 + n2 - nu, nu, method(n1, n2, nu),
        WeightFuncs.inter_divide_min(n1, n2, nu)]``.
    """
    first_questions = QuestionsCDN.get_tag_questions(tag1)
    second_questions = QuestionsCDN.get_tag_questions(tag2)

    first_count = len(first_questions)
    second_count = len(second_questions)
    union_count = len(first_questions | second_questions)

    chosen_weight = method(first_count, second_count, union_count)
    inter_min_weight = WeightFuncs.inter_divide_min(first_count, second_count, union_count)

    # Inclusion-exclusion: |A & B| = |A| + |B| - |A | B|.
    shared_count = first_count + second_count - union_count
    return [first_count, shared_count, union_count, chosen_weight, inter_min_weight]
def get_questions(self, tag):
    """Return the question set held for ``tag``, topping it up when it is small.

    An empty set is registered in ``self.tag_questions`` for unseen tags. When
    the stored set has fewer than ``self.min_question_num`` entries, more
    questions are requested from ``QuestionsCDN.get_tag_questions`` and merged
    into the stored set.

    :param tag: tag whose questions are wanted.
    :return: the entry of ``self.tag_questions`` for ``tag``.
    """
    known = self.tag_questions.setdefault(tag, set())
    if len(known) >= self.min_question_num:
        # Enough questions already held: no lookup needed.
        return self.tag_questions[tag]

    # NOTE(review): ``save`` is fed from ``self.from_db`` -- confirm this is
    # intended rather than a dedicated "save" flag.
    fetched = QuestionsCDN.get_tag_questions(
        tag,
        save=self.from_db,
        min_num=self.min_question_num,
        from_db=self.from_db,
        from_api=self.from_api,
        from_cache=self.from_cache,
    )
    self.tag_questions[tag] |= fetched
    return self.tag_questions[tag]
def index_tags_questions(min_questions=100, save=True, msg=None):
    """Build the tag -> questions and question -> tags indexes and hand them to ``save_step_data``.

    :param min_questions: forwarded to
        ``QuestionsCDN.get_tag_questions_filtered`` as ``min_question``.
    :param save: forwarded to ``save_step_data``.
    :param msg: forwarded to ``save_step_data``.
    :return: tuple ``(tag_questions_index, question_tags_index)``; the second
        maps every question to the list of tags it was found under.
    """
    # step2: index questions and tags to find questions of tag and tags of question
    tag_questions_index = QuestionsCDN.get_tag_questions_filtered(min_question=min_questions)

    # Invert the mapping: each question collects every tag that lists it.
    question_tags_index = {}
    for tag in tag_questions_index:
        for question in tag_questions_index[tag]:
            question_tags_index.setdefault(question, []).append(tag)

    outputs = (
        (tag_questions_index, 'tag_questions.json'),
        (question_tags_index, 'question_tags.json'),
    )
    for payload, step_name in outputs:
        save_step_data(payload, step=step_name, save=save, msg=msg)

    question_total = len(question_tags_index.keys())
    tag_total = len(tag_questions_index.keys())
    logger.info(f'index tag questions finished, questions: {question_total}, tags: {tag_total}')
    return tag_questions_index, question_tags_index
def __init__(self, min_question_num=100, min_weight=0.1, update=False, from_db=True,
             from_api=True, from_cache=True, related_cache=False, save_db=True):
    """Store the configuration flags and load the tag classification and question data.

    Every keyword argument is stored unchanged on the instance under the same
    name. In addition:

    * ``tag_clf`` is whatever ``CoreTagClfCache.get()`` returns;
    * ``core_tag_index`` maps each tag listed under a key of ``tag_clf`` back
      to that key;
    * ``tag_questions`` is whatever
      ``QuestionsCDN.get_tag_questions_cached()`` returns.
    """
    # Plain settings, kept exactly as passed in.
    self.min_question_num = min_question_num
    self.min_weight = min_weight
    self.update = update
    self.from_db = from_db
    self.from_api = from_api
    self.from_cache = from_cache
    self.related_cache = related_cache
    self.save_db = save_db

    # Loaded data: classification first, then the question sets.
    self.tag_clf = CoreTagClfCache.get()
    # Reverse lookup; a tag listed under several keys keeps the last one seen.
    self.core_tag_index = {
        tag: core
        for core in self.tag_clf
        for tag in self.tag_clf[core]
    }
    self.tag_questions = QuestionsCDN.get_tag_questions_cached()
def save_to_db():
    """Delegate to ``QuestionsCDN.save_to_db()``; nothing is returned."""
    # Disabled step, kept for reference:
    # QuestionsCDN.load_to_cache()
    QuestionsCDN.save_to_db()
def save_to_db():
    """Delegate to ``QuestionsCDN.save_to_db()``; nothing is returned."""
    # Disabled step, kept for reference:
    # QuestionsCDN.load_to_cache()
    QuestionsCDN.save_to_db()


if __name__ == '__main__':
    # Command-line entry point: one positional ``command`` selects the action.
    # Commands other than the four handled below do nothing.
    cli = argparse.ArgumentParser(description='cache tools')
    cli.add_argument('command')
    command = cli.parse_args().command

    if command == 'save':
        save()
    elif command == 'save_to_db':
        save_to_db()
    elif command == 'tags_have_question':
        # Report how many cached tags reach each question-count threshold.
        tag_questions = QuestionsCDN.get_tag_questions_cached()
        sizes = [len(tag_questions[tag]) for tag in tag_questions]
        n100 = sum(1 for size in sizes if size >= 100)
        n50 = sum(1 for size in sizes if size >= 50)
        n0 = sum(1 for size in sizes if size > 0)
        print(f'more than 100 questions: {n100}')
        print(f'more than 50 questions: {n50}')
        print(f'more than 0 questions: {n0}')
    elif command == 'tag_related':
        # The result is only bound to a name; nothing is printed.
        tag_related = TagRelatedCDN.get_related_weight_all()
import asyncio
import data.config.config
from utils.date_transfer import *
from data.cdn.sof_cdn import QuestionsCDN
import logging

if __name__ == '__main__':
    # Manual check: run ``dld_pages_parallel`` for pages 1 to 5 with
    # parallel_num=5 and save=False, then print the result and its length.
    downloaded = QuestionsCDN.dld_pages_parallel(
        page=1,
        max_page=5,
        parallel_num=5,
        save=False,
    )
    print(downloaded)
    print(len(downloaded))
from data.cdn.sof_cdn import QuestionsCDN, TagsCDN

if __name__ == '__main__':
    # Request the questions of every tag returned by ``TagsCDN.get_tags(10000)``,
    # printing a running counter before each request.
    fetch_options = {'save': False, 'from_db': False, 'min_num': 100}
    all_tags = TagsCDN.get_tags(10000)
    for position, tag in enumerate(all_tags):
        print(f'count {position}')
        QuestionsCDN.get_tag_questions(tag, **fetch_options)
    # Single-tag variant, kept for reference:
    # QuestionsCDN.get_tag_questions('map', save=False, from_db=False, min_num=100)
from data.cdn.sof_cdn import QuestionsCDN

if __name__ == '__main__':
    # Re-key four cache entries: the questions looked up under the
    # brace-wrapped key are added under the plain tag name, and afterwards
    # ``hdel`` is called on the brace-wrapped key (presumably removing it --
    # confirm against the cache implementation).
    rekeyed = (
        ('reactjs', "{'reactjs'}"),
        ('jquery', "{'jquery'}"),
        ('react-redux', "{'react-redux'}"),
        ('node.js', "{'node.js'}"),
    )

    # All additions happen before any ``hdel`` call, as in the original order.
    for plain_tag, wrapped_key in rekeyed:
        QuestionsCDN.cache.add_tag_questions(
            plain_tag, QuestionsCDN.get_tag_questions(wrapped_key))

    for _, wrapped_key in rekeyed:
        QuestionsCDN.cache.hdel(wrapped_key)