def create_paragraphs(self):
    """Collect the paragraphs of every news item and save them in bulk.

    Every object yielded by ``self.iterate()`` contributes whatever its
    ``create_paragraphs()`` returns; the combined list is passed to
    ``self.bulk`` with ``model=NewsParagraph`` and ``chunk_size=100``.
    """
    # ts/tc/tp/td are helpers defined elsewhere -- presumably timing and
    # progress logging; confirm against their definitions.
    collected = []
    ts('main loop')
    for entry in self.iterate():
        tc(100)
        collected.extend(entry.create_paragraphs())
    tp()
    td('Adding paragraphs of DB')
    self.bulk(collected, model=NewsParagraph, chunk_size=100)
def create_keyword_items(self, several_news_ids=None):
    """Build keyword items for news and save them as ``TitleKeywordItem``.

    :param several_news_ids: when truthy, only objects matching
        ``self.filter(base_id__in=several_news_ids)`` are processed;
        otherwise everything yielded by ``self.iterate()`` is used.
    """
    source = (self.filter(base_id__in=several_news_ids)
              if several_news_ids else self.iterate())
    # ts/tc/tp are helpers defined elsewhere -- presumably timing and
    # progress counters; confirm against their definitions.
    ts('main loop')
    collected = []
    for entry in source:
        tc(100)
        collected.extend(entry.create_keyword_items())
    tp()
    self.bulk(collected, TitleKeywordItem, 10000)
def create_stems(self):
    """Create a stemmed object for every news item and save them in bulk.

    Each object yielded by ``self.iterate()`` supplies one result of
    ``create_stemmed()``; the list is passed to ``self.bulk`` with
    ``model=self.stemmed_model`` and ``chunk_size=50``.
    """
    # ts/tc/tp/td are helpers defined elsewhere -- presumably timing and
    # progress logging; confirm against their definitions.
    ts('main loop')
    stems = []
    for entry in self.iterate():
        tc(100)
        stems.append(entry.create_stemmed())
    tp()
    td('Adding stems of DB')
    self.bulk(stems, model=self.stemmed_model, chunk_size=50)
def load_from_folder(self, news_path):
    """Load every entry of ``news_path`` and save the results in bulk.

    Each name returned by ``os.listdir(news_path)`` is joined to the
    directory and passed to ``self.load_from_xml``; the returned objects
    are handed to ``self.bulk`` with ``model=NewsContent`` and
    ``chunk_size=250``.

    :param news_path: directory to read. No filtering by extension or
        file type is applied, so every entry is given to
        ``load_from_xml``.
    """
    # ts/tc/tp/dt are helpers defined elsewhere -- presumably timing,
    # progress and timestamp utilities; confirm against their definitions.
    ts('@ Loading files, adding news to DB, creating news_contests')
    items = []
    for filename in os.listdir(news_path):
        tc(100)
        # os.path.join avoids a doubled separator when news_path already
        # ends with one.
        items.append(self.load_from_xml(os.path.join(news_path, filename)))
    tp()
    # print is given one pre-formatted string so the line parses under
    # both Python 2 and Python 3 while producing the same output.
    print('%s %s %s' % (dt(), '-> Total entries:', len(items)))
    print('%s %s' % (dt(), '@ Adding news_contests to DB'))
    self.bulk(items, model=NewsContent, chunk_size=250)
def load_news_keywords(self):
    """Fill ``self.valid_keywords`` with the keyword words of selected news.

    Queries ``NewsKeywordItem`` rows whose ``base`` is in
    ``self.several_news_ids`` and stores them as
    ``{base_id: [word, ...]}`` on ``self.valid_keywords``.

    :raises MissedValueError: when ``self.several_news_ids`` is falsy.
    """
    if not self.several_news_ids:
        raise MissedValueError('several_news_ids',
                               'load_several_clustered_news')
    queryset = NewsKeywordItem.objects.filter(
        base__in=self.several_news_ids).only('word', 'base')
    # ts/tp are helpers defined elsewhere -- presumably they time the
    # query evaluation forced by list(); confirm.
    ts('query')
    rows = list(queryset)
    tp()
    grouped = self.valid_keywords = {}
    for row in rows:
        grouped.setdefault(row.base_id, []).append(row.word)
def create_keywords(self, stop_words=None, angry_mode=False):
    """Create a keywords object for every news item and save them in bulk.

    :param stop_words: forwarded unchanged to each item's
        ``create_keywords``.
    :param angry_mode: forwarded unchanged to each item's
        ``create_keywords``.

    Items come from ``self.iterate(100)``; the results are passed to
    ``self.bulk`` with ``model=self.keywords_model`` and
    ``chunk_size=250``.
    """
    # ts/tc/tp/td are helpers defined elsewhere -- presumably timing and
    # progress logging; confirm against their definitions.
    td('Extract list of valid keywords from stemmed data')
    ts('main loop')
    built = []
    for entry in self.iterate(100):
        tc(500)
        built.append(entry.create_keywords(stop_words, angry_mode))
    tp()
    td('Adding news_keywords to DB')
    self.bulk(built, model=self.keywords_model, chunk_size=250)
def create_keyword_items(self, alpha, beta, several_news_ids=None,
                         title_keywords=None, gen_report=False):
    """Build keyword items for news, then either report or save them.

    :param alpha: forwarded to each item's ``create_keyword_items``; also
        used in the report file name.
    :param beta: forwarded to each item's ``create_keyword_items``; also
        used in the report file name.
    :param several_news_ids: when truthy, only objects matching
        ``self.filter(base_id__in=several_news_ids)`` are processed;
        otherwise everything yielded by ``self.iterate()`` is used.
    :param title_keywords: forwarded unchanged to each item.
    :param gen_report: when true, a report file
        ``.results/filter_ab_<alpha>_<beta>.txt`` (relative to the current
        working directory) is opened for writing and passed to each item,
        and nothing is saved through ``self.bulk``. When false, the
        collected items are saved with ``NewsKeywordItem`` in chunks of
        10000.
    """
    report = None
    if gen_report:
        report_name = '.results/filter_ab_%.2f_%.2f.txt' % (alpha, beta)
        report = open(report_name, 'w')
    try:
        if several_news_ids:
            items = self.filter(base_id__in=several_news_ids)
        else:
            items = self.iterate()
        keywords = []
        # NOTE(review): the ts() started here is never followed by tp(),
        # unlike most other loops in this file -- confirm it is intended.
        ts('main loop')
        for news in items:
            tc(1)
            keywords += news.create_keyword_items(alpha, beta,
                                                  title_keywords, report)
    finally:
        # Close the report even when the loop raises, so the file handle
        # is not leaked and buffered output is flushed.
        if report is not None:
            report.close()
    if not gen_report:
        self.bulk(keywords, NewsKeywordItem, 10000)
def news_calculate_cosinuses(self, docs, news_docs, doc_ids=None):
    """Compute ``vector_cos`` for every pair of news and store the results.

    :param docs: mapping indexed by news id; ``docs[news_id]`` is stored
        in the ``doc_1`` / ``doc_2`` fields of each result row.
    :param news_docs: mapping indexed by doc id; ``news_docs[doc_id]`` is
        used as the news id to select when ``doc_ids`` is given.
    :param doc_ids: when truthy, only the news mapped from these doc ids
        are compared and results go to ``CosResultSeveral``. Otherwise
        everything from ``self.iterate()`` is compared, results go to
        ``CosResult``, and work resumes after the newest ``CosResult``
        row (highest pk).

    Result rows are written with ``bulk_create`` every 10000 pairs and
    once more at the end.
    """
    # ts/tc/tp/td are helpers defined elsewhere -- presumably timing and
    # progress logging; confirm against their definitions.
    data = dict()
    last1 = last2 = 0
    if doc_ids:
        news_ids = [news_docs[doc_id] for doc_id in set(doc_ids)]
        items = self.filter(base_id__in=news_ids)
        ts('get selected news')
        items = list(items)
        tp()
        model = CosResultSeveral
    else:
        # Only the newest row is needed for the resume point. Slicing
        # keeps the query to one row; truth-testing the unsliced queryset
        # would load the whole table.
        newest = list(CosResult.objects.order_by('-pk')[:1])
        if newest:
            last1 = newest[0].news_1_id
            last2 = newest[0].news_2_id
        items = self.iterate()
        model = CosResult

    ts('loading news words data')
    for item in items:
        tc(10000)
        # base_id reads the stored foreign-key value without fetching the
        # related object for every row.
        data.setdefault(item.base_id, dict())[item.word] = item.weight
    tp()

    # The resume checks below compare ids, which is only meaningful when
    # pairs are produced in ascending (news_1, news_2) order. Plain dict
    # iteration gives no such guarantee, so iterate in sorted order.
    ordered = sorted(data.items(), key=lambda pair: pair[0])

    results = []
    j = 0
    ts('main loop for news: %d' % len(data))
    for news_id1, news1 in ordered:
        tc(10)
        if news_id1 < last1:
            continue
        for news_id2, news2 in ordered:
            # Each unordered pair is evaluated once (id2 > id1).
            if news_id2 <= news_id1:
                continue
            if last1 == news_id1 and news_id2 <= last2:
                continue
            cos = vector_cos(news1, news2)
            results.append(
                model(news_1_id=news_id1, doc_1=docs[news_id1],
                      news_2_id=news_id2, doc_2=docs[news_id2],
                      cos=cos))
            j += 1
            if not j % 10000:
                model.objects.bulk_create(results)
                td('added cos data: %d' % j)
                results = []
                gc.collect()
    model.objects.bulk_create(results)
    tp()
    td('total added cos data: %d' % j)
def news_calculate_cosinuses(self, docs, news_docs, doc_ids=None):
    """Compute ``vector_cos`` for every pair of news and store the results.

    :param docs: mapping indexed by news id; ``docs[news_id]`` is stored
        in the ``doc_1`` / ``doc_2`` fields of each result row.
    :param news_docs: mapping indexed by doc id; ``news_docs[doc_id]`` is
        used as the news id to select when ``doc_ids`` is given.
    :param doc_ids: when truthy, only the news mapped from these doc ids
        are compared and results go to ``CosResultSeveral``. Otherwise
        everything from ``self.iterate()`` is compared, results go to
        ``CosResult``, and work resumes after the newest ``CosResult``
        row (highest pk).

    Result rows are written with ``bulk_create`` every 10000 pairs and
    once more at the end.
    """
    # ts/tc/tp/td are helpers defined elsewhere -- presumably timing and
    # progress logging; confirm against their definitions.
    data = dict()
    last1 = last2 = 0
    if doc_ids:
        news_ids = [news_docs[doc_id] for doc_id in set(doc_ids)]
        items = self.filter(base_id__in=news_ids)
        ts('get selected news')
        items = list(items)
        tp()
        model = CosResultSeveral
    else:
        # Only the newest row is needed for the resume point. Slicing
        # keeps the query to one row; truth-testing the unsliced queryset
        # would load the whole table.
        newest = list(CosResult.objects.order_by('-pk')[:1])
        if newest:
            last1 = newest[0].news_1_id
            last2 = newest[0].news_2_id
        items = self.iterate()
        model = CosResult

    ts('loading news words data')
    for item in items:
        tc(10000)
        # base_id reads the stored foreign-key value without fetching the
        # related object for every row.
        data.setdefault(item.base_id, dict())[item.word] = item.weight
    tp()

    # The resume checks below compare ids, which is only meaningful when
    # pairs are produced in ascending (news_1, news_2) order. Plain dict
    # iteration gives no such guarantee, so iterate in sorted order.
    ordered = sorted(data.items(), key=lambda pair: pair[0])

    results = []
    j = 0
    ts('main loop for news: %d' % len(data))
    for news_id1, news1 in ordered:
        tc(10)
        if news_id1 < last1:
            continue
        for news_id2, news2 in ordered:
            # Each unordered pair is evaluated once (id2 > id1).
            if news_id2 <= news_id1:
                continue
            if last1 == news_id1 and news_id2 <= last2:
                continue
            cos = vector_cos(news1, news2)
            results.append(
                model(news_1_id=news_id1, doc_1=docs[news_id1],
                      news_2_id=news_id2, doc_2=docs[news_id2],
                      cos=cos))
            j += 1
            if not j % 10000:
                model.objects.bulk_create(results)
                td('added cos data: %d' % j)
                results = []
                gc.collect()
    model.objects.bulk_create(results)
    tp()
    td('total added cos data: %d' % j)
def paragraph_calculate_cosinuses(self, docs, min_global_cos, several=True,
                                  save_good_news=True):
    """Compare paragraphs of already-scored news pairs and store the best.

    For every news pair in the cos-results table whose ``cos`` is neither
    0 nor 1, paragraph keyword vectors of the two news are compared with
    ``vector_cos``. The pair of paragraphs with the highest value is saved
    to the paragraph cos-results table. If any paragraph pair exceeds
    ``min_global_cos`` and ``save_good_news`` is true, the news pair is
    also saved to the "after paragraph" table.

    :param docs: mapping indexed by news id; ``docs[news_id]`` is stored
        in ``doc_1`` / ``doc_2`` of saved good pairs.
    :param min_global_cos: a news pair counts as good when at least one
        paragraph comparison is strictly greater than this value.
    :param several: selects the ``*Several`` model variants when true,
        the plain ones otherwise.
    :param save_good_news: when false, good news pairs are not saved.

    Work resumes after the newest row (highest pk) of the paragraph
    cos-results table. Rows are flushed with ``bulk_create`` every 100
    entries and once more at the end.
    """
    # ts/tc/tp/td/dt are helpers defined elsewhere -- presumably timing,
    # progress and timestamp utilities; confirm against their definitions.
    if several:
        cos_results_model = CosResultSeveral
        paragraph_cos_results_model = ParagraphCosResultSeveral
        good_cos_results_model = CosResultAfterParagraphSeveral
    else:
        cos_results_model = CosResult
        paragraph_cos_results_model = ParagraphCosResult
        good_cos_results_model = CosResultAfterParagraph

    # Only the newest row is needed for the resume point. Slicing keeps
    # the query to one row; truth-testing the unsliced queryset would
    # load the whole table.
    last1 = last2 = 0
    newest = list(paragraph_cos_results_model.objects.order_by('-pk')[:1])
    if newest:
        last1 = newest[0].news_1_id
        last2 = newest[0].news_2_id

    # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
    ts('get news_ids from cos-table')
    items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
    pairs = []
    news_ids = set()
    for item in items:
        pairs.append((item.news_1_id, item.news_2_id, item.cos))
        news_ids.add(item.news_1_id)
        news_ids.add(item.news_2_id)
    tp()

    # data[news_id][paragraph_id][word] = weight
    ts('get paragraph words (using news_id from prev step)')
    data = dict()
    items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
    for item in items:
        tc(10000)
        paragraphs = data.setdefault(item.news_id, dict())
        paragraphs.setdefault(item.base_id, dict())[item.word] = item.weight
    tp()

    best_results = []
    pairs_ok = []
    j = p = c = 0
    # NOTE(review): this ts() is never followed by tp() -- confirm it is
    # intended.
    ts('main loop, pairs: %d' % len(pairs))
    # Ascending (news_1, news_2) order is what the resume checks rely on.
    pairs.sort(key=itemgetter(0, 1))
    for news_id_1, news_id_2, news_cos in pairs:
        if last1:
            if news_id_1 < last1:
                continue
            if last1 == news_id_1 and news_id_2 <= last2:
                continue
        pair_ok = False
        # -1 marks "no paragraph comparison happened for this pair".
        max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
        tc(10)
        # A news id without ParagraphKeywordItem rows has no entry in
        # data; treat it as having no paragraphs instead of raising
        # KeyError and aborting the whole run.
        paragraphs_1 = data.get(news_id_1, {})
        paragraphs_2 = data.get(news_id_2, {})
        for paragraph_id_1, paragraph_1 in paragraphs_1.items():
            for paragraph_id_2, paragraph_2 in paragraphs_2.items():
                # NOTE(review): the two paragraphs belong to different
                # news, so this skip would drop valid comparisons whenever
                # news 2 owns a lower paragraph id -- confirm the intent.
                if paragraph_id_2 <= paragraph_id_1:
                    continue
                cos = vector_cos(paragraph_1, paragraph_2)
                if cos > min_global_cos:
                    pair_ok = True
                if cos > max_local_cos:
                    max_local_cos = cos
                    best_paragraph_1 = paragraph_id_1
                    best_paragraph_2 = paragraph_id_2
                # todo: calc and save max paragraph cos
                j += 1
                if not j % 10000:
                    td('calculated cos: %d' % j)
                    gc.collect()
        if max_local_cos != -1:
            c += 1
            best_results.append(paragraph_cos_results_model(
                news_1_id=news_id_1, paragraph_1_id=best_paragraph_1,
                news_2_id=news_id_2, paragraph_2_id=best_paragraph_2,
                cos=max_local_cos))
            if not c % 100:
                paragraph_cos_results_model.objects.bulk_create(best_results)
                td('best cos of paragraphs added: %d' % c)
                best_results = []
        if save_good_news and pair_ok:
            p += 1
            pairs_ok.append(good_cos_results_model(
                news_1_id=news_id_1, news_2_id=news_id_2,
                doc_1=docs[news_id_1], doc_2=docs[news_id_2],
                cos=news_cos))
            if not p % 100:
                good_cos_results_model.objects.bulk_create(pairs_ok)
                # print is given one pre-formatted string so the line
                # parses under both Python 2 and Python 3 while producing
                # the same output.
                print('%s %s %s' % (dt(), ' good pairs of news added:', p))
                pairs_ok = []

    paragraph_cos_results_model.objects.bulk_create(best_results)
    print('%s %s %s' % (dt(), ' best cos of paragraphs added:', c))
    if save_good_news:
        good_cos_results_model.objects.bulk_create(pairs_ok)
        print('%s %s %s' % (dt(), '-> good pairs of news added:', p))
def paragraph_calculate_cosinuses(self, docs, min_global_cos, several=True,
                                  save_good_news=True):
    """Compare paragraphs of already-scored news pairs and store the best.

    For every news pair in the cos-results table whose ``cos`` is neither
    0 nor 1, paragraph keyword vectors of the two news are compared with
    ``vector_cos``. The pair of paragraphs with the highest value is saved
    to the paragraph cos-results table. If any paragraph pair exceeds
    ``min_global_cos`` and ``save_good_news`` is true, the news pair is
    also saved to the "after paragraph" table.

    :param docs: mapping indexed by news id; ``docs[news_id]`` is stored
        in ``doc_1`` / ``doc_2`` of saved good pairs.
    :param min_global_cos: a news pair counts as good when at least one
        paragraph comparison is strictly greater than this value.
    :param several: selects the ``*Several`` model variants when true,
        the plain ones otherwise.
    :param save_good_news: when false, good news pairs are not saved.

    Work resumes after the newest row (highest pk) of the paragraph
    cos-results table. Rows are flushed with ``bulk_create`` every 100
    entries and once more at the end.
    """
    # ts/tc/tp/td/dt are helpers defined elsewhere -- presumably timing,
    # progress and timestamp utilities; confirm against their definitions.
    if several:
        cos_results_model = CosResultSeveral
        paragraph_cos_results_model = ParagraphCosResultSeveral
        good_cos_results_model = CosResultAfterParagraphSeveral
    else:
        cos_results_model = CosResult
        paragraph_cos_results_model = ParagraphCosResult
        good_cos_results_model = CosResultAfterParagraph

    # Only the newest row is needed for the resume point. Slicing keeps
    # the query to one row; truth-testing the unsliced queryset would
    # load the whole table.
    last1 = last2 = 0
    newest = list(paragraph_cos_results_model.objects.order_by('-pk')[:1])
    if newest:
        last1 = newest[0].news_1_id
        last2 = newest[0].news_2_id

    # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
    ts('get news_ids from cos-table')
    items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
    pairs = []
    news_ids = set()
    for item in items:
        pairs.append((item.news_1_id, item.news_2_id, item.cos))
        news_ids.add(item.news_1_id)
        news_ids.add(item.news_2_id)
    tp()

    # data[news_id][paragraph_id][word] = weight
    ts('get paragraph words (using news_id from prev step)')
    data = dict()
    items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
    for item in items:
        tc(10000)
        paragraphs = data.setdefault(item.news_id, dict())
        paragraphs.setdefault(item.base_id, dict())[item.word] = item.weight
    tp()

    best_results = []
    pairs_ok = []
    j = p = c = 0
    # NOTE(review): this ts() is never followed by tp() -- confirm it is
    # intended.
    ts('main loop, pairs: %d' % len(pairs))
    # Ascending (news_1, news_2) order is what the resume checks rely on.
    pairs.sort(key=itemgetter(0, 1))
    for news_id_1, news_id_2, news_cos in pairs:
        if last1:
            if news_id_1 < last1:
                continue
            if last1 == news_id_1 and news_id_2 <= last2:
                continue
        pair_ok = False
        # -1 marks "no paragraph comparison happened for this pair".
        max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
        tc(10)
        # A news id without ParagraphKeywordItem rows has no entry in
        # data; treat it as having no paragraphs instead of raising
        # KeyError and aborting the whole run.
        paragraphs_1 = data.get(news_id_1, {})
        paragraphs_2 = data.get(news_id_2, {})
        for paragraph_id_1, paragraph_1 in paragraphs_1.items():
            for paragraph_id_2, paragraph_2 in paragraphs_2.items():
                # NOTE(review): the two paragraphs belong to different
                # news, so this skip would drop valid comparisons whenever
                # news 2 owns a lower paragraph id -- confirm the intent.
                if paragraph_id_2 <= paragraph_id_1:
                    continue
                cos = vector_cos(paragraph_1, paragraph_2)
                if cos > min_global_cos:
                    pair_ok = True
                if cos > max_local_cos:
                    max_local_cos = cos
                    best_paragraph_1 = paragraph_id_1
                    best_paragraph_2 = paragraph_id_2
                # todo: calc and save max paragraph cos
                j += 1
                if not j % 10000:
                    td('calculated cos: %d' % j)
                    gc.collect()
        if max_local_cos != -1:
            c += 1
            best_results.append(paragraph_cos_results_model(
                news_1_id=news_id_1, paragraph_1_id=best_paragraph_1,
                news_2_id=news_id_2, paragraph_2_id=best_paragraph_2,
                cos=max_local_cos))
            if not c % 100:
                paragraph_cos_results_model.objects.bulk_create(best_results)
                td('best cos of paragraphs added: %d' % c)
                best_results = []
        if save_good_news and pair_ok:
            p += 1
            pairs_ok.append(good_cos_results_model(
                news_1_id=news_id_1, news_2_id=news_id_2,
                doc_1=docs[news_id_1], doc_2=docs[news_id_2],
                cos=news_cos))
            if not p % 100:
                good_cos_results_model.objects.bulk_create(pairs_ok)
                # print is given one pre-formatted string so the line
                # parses under both Python 2 and Python 3 while producing
                # the same output.
                print('%s %s %s' % (dt(), ' good pairs of news added:', p))
                pairs_ok = []

    paragraph_cos_results_model.objects.bulk_create(best_results)
    print('%s %s %s' % (dt(), ' best cos of paragraphs added:', c))
    if save_good_news:
        good_cos_results_model.objects.bulk_create(pairs_ok)
        print('%s %s %s' % (dt(), '-> good pairs of news added:', p))