def test_simple(self):
    """Three nearby Genesis 1 verses, clustered with dist_threshold=2, must yield exactly one cluster."""
    verses = [Ref(citation) for citation in ('Genesis 1:1', 'Genesis 1:2', 'Genesis 1:4')]
    # No per-ref payload is needed here, so every data slot is None.
    payloads = [None] * len(verses)
    clusters = RecommendationEngine.cluster_close_refs(verses, payloads, dist_threshold=2)
    assert len(clusters) == 1
def test_two_clusters(self):
    """
    Four Genesis 1 verses, clustered with dist_threshold=2, must yield two clusters
    whose first members are Genesis 1:1 and Genesis 1:5 respectively.
    """
    verses = [
        Ref(citation)
        for citation in ('Genesis 1:1', 'Genesis 1:2', 'Genesis 1:5', 'Genesis 1:7')
    ]
    # No per-ref payload is needed here, so every data slot is None.
    payloads = [None] * len(verses)
    clusters = RecommendationEngine.cluster_close_refs(verses, payloads, dist_threshold=2)
    assert len(clusters) == 2
    first_cluster, second_cluster = clusters[0], clusters[1]
    assert first_cluster[0]['ref'].normal() == 'Genesis 1:1'
    assert second_cluster[0]['ref'].normal() == 'Genesis 1:5'
def generate_all_topic_links_from_sheets(topic=None):
    """
    Processes all public source sheets to create topic links.

    Steps:
      1. Scan public sheets (skipping copies and assignments) and, per topic slug,
         record which owners used each segment-level ref and each co-occurring topic.
      2. Build related-topic links for topic pairs used together by at least 2 owners.
      3. Build source links from refs used by at least 3 owners: nearby refs are
         clustered, and a cluster is split wherever a count deviates from the running
         mean of the current range by more than STD_DEV_CUTOFF standard deviations.
      4. When `topic` is falsy, delete all links generated by "sheet-topic-aggregator"
         and insert the freshly computed ones.

    :param topic: optional topic slug. When given, only sheets tagged with that slug
        are scanned and nothing is written to the database.
    :return: None

    NOTE(review): the original formatting of this function was lost; the final
    delete/insert section is assumed to sit entirely under `if not topic:`
    (otherwise `sheet_links` would be undefined when `topic` is given) - confirm.
    """
    from sefaria.system.database import db
    from sefaria.recommendation_engine import RecommendationEngine
    from tqdm import tqdm
    from statistics import mean, stdev

    STD_DEV_CUTOFF = 2

    all_topics = {}
    # ignore sheets that are copies or were assignments
    query = {"status": "public", "viaOwner": {"$exists": 0}, "assignment_id": {"$exists": 0}}
    if topic:
        query['topics.slug'] = topic
    projection = {"topics": 1, "includedRefs": 1, "owner": 1}
    sheet_list = db.sheets.find(query, projection)
    for sheet in tqdm(sheet_list, desc="aggregating sheet topics"):
        sheet_topics = sheet.get("topics", [])
        if not sheet_topics:
            continue

        # Expand the sheet's refs once; the result is identical for every topic on the sheet.
        expanded_trefs = []
        for tref in sheet.get("includedRefs", []):
            try:
                sub_trefs = [sub_oref.normal() for sub_oref in Ref(tref).range_list()]
            except Exception:
                # Best effort: a ref that cannot be parsed or expanded is skipped.
                # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
                continue
            expanded_trefs.extend(sub_trefs)

        for topic_dict in sheet_topics:
            slug = topic_dict['slug']
            if slug not in all_topics:
                all_topics[slug] = {
                    "topic": slug,
                    "sources_dict": defaultdict(set),
                    "related_topics_dict": defaultdict(set)
                }
            topic_blob = all_topics[slug]
            for sub_tref in expanded_trefs:
                topic_blob["sources_dict"][sub_tref].add(sheet['owner'])
            for related_topic_dict in sheet_topics:
                if slug != related_topic_dict['slug']:
                    topic_blob["related_topics_dict"][related_topic_dict['slug']].add(sheet['owner'])

    # unordered topic pairs that already have a related link
    already_created_related_links = set()
    related_links = []
    source_links = []
    for slug, blob in tqdm(all_topics.items(), desc="creating sheet topic links"):
        if topic is not None and slug != topic:
            continue

        for related_topic, user_votes in blob['related_topics_dict'].items():
            # filter related topics with less than 2 users who voted for it
            if len(user_votes) < 2 or related_topic == slug:
                continue
            # order the pair so that (a, b) and (b, a) share one key
            key = (related_topic, slug) if related_topic > slug else (slug, related_topic)
            if key in already_created_related_links:
                continue
            already_created_related_links.add(key)
            related_links.append({
                'a': related_topic,
                'b': slug,
                'user_votes': len(user_votes)
            })

        # filter sources with less than 3 users who added it,
        # and transform data to more convenient (Ref, owner count) pairs
        sources = [
            (Ref(tref), len(owners))
            for tref, owners in blob['sources_dict'].items()
            if len(owners) >= 3
        ]
        if not sources:
            continue

        # cluster refs that are close to each other and break up clusters where counts
        # differ by more than 2 standard deviations
        temp_sources = []
        refs, counts = zip(*sources)
        clustered = RecommendationEngine.cluster_close_refs(refs, counts, 2)
        for cluster in clustered:
            ref_counts = [(x['ref'], x['data']) for x in cluster]
            curr_range_start = 0
            for icount, (_, temp_count) in enumerate(ref_counts):
                temp_counts = [x[1] for x in ref_counts[curr_range_start:icount]]
                if len(temp_counts) < 2:
                    # variance requires two data points
                    continue
                count_xbar = mean(temp_counts)
                # floor the deviation so the allowed band is never narrower than +/- 1
                count_std = max(1 / STD_DEV_CUTOFF, stdev(temp_counts, count_xbar))
                deviation = STD_DEV_CUTOFF * count_std
                if temp_count > (deviation + count_xbar) or temp_count < (count_xbar - deviation):
                    # close the current range just before the outlier and start a new one at it
                    temp_range = ref_counts[curr_range_start][0].to(ref_counts[icount - 1][0])
                    temp_sources.append(
                        (temp_range.normal(), [r.normal() for r in temp_range.range_list()], count_xbar)
                    )
                    curr_range_start = icount
            # flush whatever range is still open at the end of the cluster
            temp_counts = [x[1] for x in ref_counts[curr_range_start:]]
            count_xbar = mean(temp_counts)
            temp_range = ref_counts[curr_range_start][0].to(ref_counts[-1][0])
            temp_sources.append(
                (temp_range.normal(), [r.normal() for r in temp_range.range_list()], count_xbar)
            )
        sources = temp_sources

        # create links
        if not topic:
            for source in sources:
                source_links.append({
                    "class": "refTopic",
                    "toTopic": slug,
                    "ref": source[0],
                    "expandedRefs": source[1],
                    "linkType": "about",
                    "is_sheet": False,
                    "dataSource": "sefaria-users",
                    "generatedBy": "sheet-topic-aggregator",
                    "order": {"user_votes": source[2]}
                })

    if not topic:
        final_related_links = calculate_tfidf_related_sheet_links(related_links)
        sheet_links = generate_sheet_topic_links()

        # now that we've gathered all the new links, delete old ones and insert new ones
        RefTopicLinkSet({"generatedBy": "sheet-topic-aggregator"}).delete()
        IntraTopicLinkSet({"generatedBy": "sheet-topic-aggregator"}).delete()
        db.topic_links.insert_many(sheet_links + source_links + final_related_links, ordered=False)
def generate_all_topic_links_from_sheets(topic=None):
    """
    Processes all public source sheets to create topic links.

    Steps:
      1. Scan public sheets (skipping copies and assignments). For every
         (ref, topic, owner) triple keep the largest weight seen, where a sheet
         contributes 1 / (number of topics on that sheet); also record which owners
         used each pair of topics together.
      2. Build related-topic links for topic pairs used together by at least 2 owners.
      3. Score each (ref, topic) pair with a TF-IDF style value and keep pairs with a
         score of at least TFIDF_CUTOFF that were used by at least OWNER_THRESH owners.
      4. Cluster nearby refs per topic and split a cluster wherever a count deviates
         from the running mean of the current range by more than STD_DEV_CUTOFF
         standard deviations.

    Refs that cannot be parsed are skipped with a logged warning instead of aborting
    the whole run.

    :param topic: optional topic slug. When given, only sheets tagged with that slug
        are scanned and no links are returned.
    :return: tuple (source_links, related_links, sheet_links) of link objects when
        `topic` is falsy; otherwise None.

    NOTE(review): the original formatting of this function was lost; the final
    conversion/return section is assumed to sit entirely under `if not topic:`
    (otherwise `sheet_links` would be undefined when `topic` is given) - confirm.
    """
    from sefaria.recommendation_engine import RecommendationEngine
    from statistics import mean, stdev
    import logging
    import math

    log = logging.getLogger(__name__)

    OWNER_THRESH = 3
    TFIDF_CUTOFF = 0.15
    STD_DEV_CUTOFF = 2

    # slug -> related slug -> owners who used both on one sheet
    all_related_topics = defaultdict(lambda: defaultdict(set))
    # tref -> slug -> owner -> weight
    all_related_refs = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
    # slug -> tref -> number of sheets pairing them
    topic_ref_counts = defaultdict(lambda: defaultdict(int))
    # ignore sheets that are copies or were assignments
    query = {"status": "public", "viaOwner": {"$exists": 0}, "assignment_id": {"$exists": 0}}
    if topic:
        query['topics.slug'] = topic
    projection = {"topics": 1, "expandedRefs": 1, "owner": 1}
    sheet_list = db.sheets.find(query, projection)
    for sheet in tqdm(sheet_list, desc="aggregating sheet topics"):
        sheet_topics = sheet.get("topics", [])
        for topic_dict in sheet_topics:
            slug = topic_dict['slug']
            for tref in sheet.get("expandedRefs", []):
                owner_weights = all_related_refs[tref][slug]
                # a sheet with many topics says less about each one of them
                owner_weights[sheet['owner']] = max(
                    1 / len(sheet_topics), owner_weights.get(sheet['owner'], 0)
                )
                topic_ref_counts[slug][tref] += 1
            for related_topic_dict in sheet_topics:
                if slug != related_topic_dict['slug']:
                    all_related_topics[slug][related_topic_dict['slug']].add(sheet['owner'])

    # unordered topic pairs that already have a related link
    already_created_related_links = set()
    related_links = []
    source_links = []
    for slug, related_topics_to_slug in tqdm(all_related_topics.items(),
                                             desc="creating sheet related topic links"):
        if topic is not None and slug != topic:
            continue

        for related_topic, user_votes in related_topics_to_slug.items():
            # filter related topics with less than 2 users who voted for it
            if len(user_votes) < 2 or related_topic == slug:
                continue
            # order the pair so that (a, b) and (b, a) share one key
            key = (related_topic, slug) if related_topic > slug else (slug, related_topic)
            if key in already_created_related_links:
                continue
            already_created_related_links.add(key)
            related_links.append({
                'a': related_topic,
                'b': slug,
                'user_votes': len(user_votes)
            })

    topic_idf_dict = {
        slug: math.log2(len(all_related_refs) / len(ref_dict))
        for slug, ref_dict in topic_ref_counts.items()
    }
    raw_topic_ref_links = defaultdict(list)
    for tref, related_topics_to_tref in tqdm(all_related_refs.items(),
                                             desc="creating sheet related ref links"):
        # filter sources with less than 3 users who added it and tfidf of at least 0.15
        numerator_list = [sum(owner_map.values()) for owner_map in related_topics_to_tref.values()]
        denominator = sum(numerator_list)
        passing = []
        for (slug, owner_map), numerator in zip(related_topics_to_tref.items(), numerator_list):
            tfidf = (numerator / denominator) * topic_idf_dict[slug]
            owners = len(owner_map)
            if tfidf >= TFIDF_CUTOFF and owners >= OWNER_THRESH:
                passing.append((slug, owners))
        if not passing:
            # nothing to link, so there is no need to parse the ref at all
            continue

        # transform data to more convenient format
        try:
            oref = Ref(tref)
        except Exception:
            # One bad ref must not abort the whole aggregation; report it and move on.
            log.warning("Skipping unparseable ref %r while generating sheet topic links", tref)
            continue
        for slug, owners in passing:
            raw_topic_ref_links[slug].append((oref, owners))

    for slug, sources in tqdm(raw_topic_ref_links.items()):
        # cluster refs that are close to each other and break up clusters where counts
        # differ by more than 2 standard deviations
        temp_sources = []
        if not sources:
            continue
        refs, counts = zip(*sources)
        clustered = RecommendationEngine.cluster_close_refs(refs, counts, 2)
        for cluster in clustered:
            ref_counts = [(x['ref'], x['data']) for x in cluster]
            curr_range_start = 0
            for icount, (_, temp_count) in enumerate(ref_counts):
                temp_counts = [x[1] for x in ref_counts[curr_range_start:icount]]
                if len(temp_counts) < 2:
                    # variance requires two data points
                    continue
                count_xbar = mean(temp_counts)
                # floor the deviation so the allowed band is never narrower than +/- 1
                count_std = max(1 / STD_DEV_CUTOFF, stdev(temp_counts, count_xbar))
                deviation = STD_DEV_CUTOFF * count_std
                if temp_count > (deviation + count_xbar) or temp_count < (count_xbar - deviation):
                    # close the current range just before the outlier and start a new one at it
                    temp_range = ref_counts[curr_range_start][0].to(ref_counts[icount - 1][0])
                    temp_sources.append(
                        (temp_range.normal(), [r.normal() for r in temp_range.range_list()], count_xbar)
                    )
                    curr_range_start = icount
            # flush whatever range is still open at the end of the cluster
            temp_counts = [x[1] for x in ref_counts[curr_range_start:]]
            count_xbar = mean(temp_counts)
            temp_range = ref_counts[curr_range_start][0].to(ref_counts[-1][0])
            temp_sources.append(
                (temp_range.normal(), [r.normal() for r in temp_range.range_list()], count_xbar)
            )
        sources = temp_sources

        # create links
        if not topic:
            for source in sources:
                source_links.append({
                    "class": "refTopic",
                    "toTopic": slug,
                    "ref": source[0],
                    "expandedRefs": source[1],
                    "linkType": "about",
                    "is_sheet": False,
                    "dataSource": "sefaria-users",
                    "generatedBy": "sheet-topic-aggregator",
                    "order": {"user_votes": source[2]}
                })

    if not topic:
        related_links = calculate_tfidf_related_sheet_links(related_links)
        sheet_links = generate_sheet_topic_links()

        # convert to objects
        source_links = [RefTopicLink(l) for l in source_links]
        related_links = [IntraTopicLink(l) for l in related_links]
        sheet_links = [RefTopicLink(l) for l in sheet_links]
        return source_links, related_links, sheet_links