コード例 #1
0
def get_weight(tag1, tag2, method=WeightFuncs.inter_divide_union):
    """Compute size statistics and two weights for a pair of tags.

    Args:
        tag1: First tag; its questions are fetched through
            ``QuestionsCDN.get_tag_questions``.
        tag2: Second tag, fetched the same way.
        method: Callable invoked as ``method(n1, n2, nu)`` to produce the
            first weight. Defaults to ``WeightFuncs.inter_divide_union``.

    Returns:
        A five-element list ``[n1, n1 + n2 - nu, nu, weight1, weight2]``
        where ``n1`` and ``n2`` are the sizes of the two question
        collections, ``nu`` is the size of their union, ``weight1`` comes
        from ``method`` and ``weight2`` always comes from
        ``WeightFuncs.inter_divide_min``.
    """
    first_questions = QuestionsCDN.get_tag_questions(tag1)
    second_questions = QuestionsCDN.get_tag_questions(tag2)
    n1, n2 = len(first_questions), len(second_questions)
    nu = len(first_questions | second_questions)
    # Presumably the collections are sets, in which case this is the size of
    # their intersection (|A| + |B| - |A ∪ B|).
    overlap = n1 + n2 - nu
    return [
        n1,
        overlap,
        nu,
        method(n1, n2, nu),
        WeightFuncs.inter_divide_min(n1, n2, nu),
    ]
コード例 #2
0
 def get_questions(self, tag):
     """Return the questions known for ``tag``, fetching more if too few.

     An empty set is registered in ``self.tag_questions`` when the tag has
     no entry yet. If the stored collection already holds at least
     ``self.min_question_num`` items it is returned as is; otherwise
     ``QuestionsCDN.get_tag_questions`` is called and its result is merged
     (``|=``) into the stored collection before returning it.

     Args:
         tag: Key into ``self.tag_questions``.

     Returns:
         The (possibly extended) entry ``self.tag_questions[tag]``.
     """
     known = self.tag_questions.setdefault(tag, set())
     if len(known) >= self.min_question_num:
         # Enough questions already available; no fetch needed.
         return self.tag_questions[tag]

     # NOTE(review): ``save`` is driven by ``self.from_db`` here; confirm
     # this is intended rather than a dedicated "save" setting.
     fetch_options = dict(
         save=self.from_db,
         min_num=self.min_question_num,
         from_db=self.from_db,
         from_api=self.from_api,
         from_cache=self.from_cache,
     )
     fetched = QuestionsCDN.get_tag_questions(tag, **fetch_options)
     self.tag_questions[tag] |= fetched
     return self.tag_questions[tag]
コード例 #3
0
def index_tags_questions(min_questions=100, save=True, msg=None):
    """Build the tag -> questions index and its inverse question -> tags.

    Args:
        min_questions: Forwarded to
            ``QuestionsCDN.get_tag_questions_filtered`` as ``min_question``.
        save: Forwarded to ``save_step_data`` for both indexes.
        msg: Forwarded to ``save_step_data`` for both indexes.

    Returns:
        A tuple ``(tag_questions_index, question_tags_index)``. The second
        element is a dict mapping each question to the list of tags whose
        entry in the first index contains it.
    """
    # step2: index questions and tags to find questions of tag and tags of question
    tag_questions_index = QuestionsCDN.get_tag_questions_filtered(min_question=min_questions)

    # Invert the index: every question collects the tags it appears under.
    question_tags_index = {}
    for tag in tag_questions_index:
        for question in tag_questions_index[tag]:
            question_tags_index.setdefault(question, []).append(tag)

    # Hand both indexes to save_step_data under their step names.
    for data, step_name in (
        (tag_questions_index, 'tag_questions.json'),
        (question_tags_index, 'question_tags.json'),
    ):
        save_step_data(data, step=step_name, save=save, msg=msg)

    logger.info(f'index tag questions finished, questions: {len(question_tags_index.keys())}, tags: {len(tag_questions_index.keys())}')
    return tag_questions_index, question_tags_index
コード例 #4
0
 def __init__(self,
              min_question_num=100,
              min_weight=0.1,
              update=False,
              from_db=True,
              from_api=True,
              from_cache=True,
              related_cache=False,
              save_db=True):
     """Store the configuration and load the tag lookup structures.

     Every argument is stored unchanged on the instance under the same
     name. In addition:

     * ``tag_clf`` is loaded from ``CoreTagClfCache.get()``.
     * ``core_tag_index`` maps each tag listed under a ``tag_clf`` key
       back to that key.
     * ``tag_questions`` is loaded from
       ``QuestionsCDN.get_tag_questions_cached()``.
     """
     # Plain configuration values, kept exactly as passed in.
     self.min_question_num = min_question_num
     self.min_weight = min_weight
     self.update = update
     self.from_db = from_db
     self.from_api = from_api
     self.from_cache = from_cache
     self.related_cache = related_cache
     self.save_db = save_db

     # Lookup structures loaded from the caches.
     self.tag_clf = CoreTagClfCache.get()
     # Reverse index: tag -> the tag_clf key it is listed under. If a tag
     # appears under several keys, the last one iterated wins.
     self.core_tag_index = {
         tag: category
         for category in self.tag_clf
         for tag in self.tag_clf[category]
     }
     self.tag_questions = QuestionsCDN.get_tag_questions_cached()
コード例 #5
0
def save_to_db():
    """Delegate to ``QuestionsCDN.save_to_db()``.

    The cache pre-load step (``QuestionsCDN.load_to_cache()``) is
    intentionally left disabled, as in the original code.
    """
    # QuestionsCDN.load_to_cache()
    QuestionsCDN.save_to_db()
コード例 #6
0
def save_to_db():
    """Delegate to ``QuestionsCDN.save_to_db()``.

    The cache pre-load step (``QuestionsCDN.load_to_cache()``) is
    intentionally left disabled, as in the original code.
    """
    # QuestionsCDN.load_to_cache()
    QuestionsCDN.save_to_db()


if __name__ == '__main__':
    # Command-line entry point: one positional ``command`` selects the action.
    # Unrecognised commands fall through without doing anything.
    parser = argparse.ArgumentParser(description='cache tools')
    parser.add_argument('command')
    args = parser.parse_args()
    command = args.command

    if command == 'save':
        save()
    elif command == 'save_to_db':
        save_to_db()
    elif command == 'tags_have_question':
        # Report how many tags reach each question-count threshold.
        tag_questions = QuestionsCDN.get_tag_questions_cached()
        sizes = [len(tag_questions[tag]) for tag in tag_questions]
        n100 = sum(1 for size in sizes if size >= 100)
        n50 = sum(1 for size in sizes if size >= 50)
        n0 = sum(1 for size in sizes if size > 0)
        # NOTE(review): the labels say "more than", but the 50 and 100
        # thresholds are inclusive (>=); confirm which is intended.
        print(f'more than 100 questions: {n100}')
        print(f'more than 50 questions: {n50}')
        print(f'more than 0 questions: {n0}')
    elif command == 'tag_related':
        # The result is only bound to a name here; nothing is printed.
        tag_related = TagRelatedCDN.get_related_weight_all()
コード例 #7
0
import asyncio

import data.config.config

from utils.date_transfer import *
from data.cdn.sof_cdn import QuestionsCDN

import logging

if __name__ == '__main__':
    # Manual run: call QuestionsCDN.dld_pages_parallel with fixed options and
    # print the result followed by its length.
    download_options = {
        'parallel_num': 5,
        'save': False,
        'page': 1,
        'max_page': 5,
    }
    result = QuestionsCDN.dld_pages_parallel(**download_options)
    print(result)
    print(len(result))
コード例 #8
0
from data.cdn.sof_cdn import QuestionsCDN, TagsCDN

if __name__ == '__main__':
    # Walk the tags returned by TagsCDN.get_tags(10000), printing a running
    # counter and requesting each tag's questions with save and from_db
    # turned off.
    count = 0
    for tag in TagsCDN.get_tags(10000):
        print(f'count {count}')
        QuestionsCDN.get_tag_questions(tag, save=False, from_db=False, min_num=100)
        count += 1
    # Single-tag variant, kept disabled:
    # QuestionsCDN.get_tag_questions('map', save=False, from_db=False, min_num=100)
コード例 #9
0
from data.cdn.sof_cdn import QuestionsCDN

if __name__ == '__main__':
    # Pairs of (plain tag key, set-literal-style key). For each pair the
    # questions fetched under the second key are added to the cache under the
    # first key; afterwards the second key is removed with hdel.
    # Presumably a one-off clean-up of malformed cache keys; verify before
    # re-running.
    key_pairs = (
        ('reactjs', "{'reactjs'}"),
        ('jquery', "{'jquery'}"),
        ('react-redux', "{'react-redux'}"),
        ('node.js', "{'node.js'}"),
    )

    # First pass: copy every entry. All copies happen before any deletion,
    # matching the original statement order.
    for tag, old_key in key_pairs:
        QuestionsCDN.cache.add_tag_questions(
            tag, QuestionsCDN.get_tag_questions(old_key))

    # Second pass: drop the old keys.
    for _, old_key in key_pairs:
        QuestionsCDN.cache.hdel(old_key)