def load_several_clustered_news(self):
    """Load the clustered doc ids and resolve them to news ids.

    Reads the doc ids from ``.conf/clustered.txt`` through ``read_lines``
    and converts each entry to ``int``.  Afterwards:

    * ``self.several_doc_ids`` is the set of distinct doc ids;
    * ``self.several_news_ids`` is a list holding, for every entry read
      (in the order read, duplicates kept), the value stored under that
      doc id in ``self.news_by_docs``.

    The original note on this method reads "get 704 news".

    Raises:
        MissedValueError: if ``self.news_by_docs`` is empty (presumably
            ``build_docs_news_dependencies`` has to run first -- confirm).
        Exception: if ``read_lines`` returns nothing for the file.
        ValueError: if an entry cannot be converted to ``int``.
        KeyError: if a doc id is missing from ``self.news_by_docs``.
    """
    if not self.news_by_docs:
        raise MissedValueError('news_by_docs', 'build_docs_news_dependencies')

    raw_ids = read_lines('.conf/clustered.txt')
    if not raw_ids:
        raise Exception(u'Ошибка загрузки файла clustered.txt')

    # Materialize the ids once.  On Python 3 map() is a one-shot iterator:
    # feeding it to set() first would leave nothing for the lookup below.
    doc_ids = [int(raw_id) for raw_id in raw_ids]

    self.several_doc_ids = set(doc_ids)

    news_by_docs = self.news_by_docs
    self.several_news_ids = [news_by_docs[doc_id] for doc_id in doc_ids]
def load_stop_words(self):
    """Read the stop-word list into ``self.stop_words``.

    The words are read from ``.conf/stop_words.txt`` through
    ``read_lines``, with ``'cp1251'`` passed as its second argument
    (presumably the file encoding -- confirm against ``read_lines``).
    """
    stop_words_path = '.conf/stop_words.txt'
    self.stop_words = read_lines(stop_words_path, 'cp1251')
## create NewsKeyword and ParagraphKeyword
# NewsStemmed.objects.create_keywords(stop_words, angry_mode=True)
# ParagraphStemmed.objects.create_keywords(stop_words, angry_mode=True)

# create NewsStats and ParagraphStats
# NewsKeywords.objects.create_stats()
# ParagraphKeywords.objects.create_stats()

# Disabled report generation, kept as an inert string literal.
'''
## gen_reports
coefficients = [(0, 100)]
for alpha, beta in coefficients:
    print dt(), 'alpha=%.2f, beta=%.2f' % (alpha, beta)
    NewsKeywords.objects.create_keyword_items(alpha, beta, gen_report=True)
'''

# get 704 news
# Load the clustered doc ids.  list() keeps this a real, reusable list on
# Python 3 too, where a bare map() is a one-shot iterator that is always
# truthy (which would defeat the emptiness check below).
several_doc_ids = list(map(int, read_lines('.conf/clustered.txt')))

# Two-way mapping between news primary keys and their doc ids.
docs = {}
news_by_docs = {}
for news in News.objects.only('doc_id'):
    docs[news.pk] = news.doc_id
    news_by_docs[news.doc_id] = news.pk

# Resolve the distinct clustered doc ids to news primary keys; a doc id
# missing from news_by_docs raises KeyError.
several_news_ids = []
if several_doc_ids and news_by_docs:
    several_doc_ids = set(several_doc_ids)
    several_news_ids = [news_by_docs[doc_id] for doc_id in several_doc_ids]
    # NOTE(review): assumed to report a successful load, hence placed inside
    # the ``if`` -- confirm against the intended nesting.
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('loaded clustered ids')

# Paragraphs of the selected news, restricted to the 'news' field.
items = NewsParagraph.objects.filter(news__in=several_news_ids).only('news')
## create NewsKeyword and ParagraphKeyword
# NewsStemmed.objects.create_keywords(stop_words, angry_mode=True)
# ParagraphStemmed.objects.create_keywords(stop_words, angry_mode=True)

# create NewsStats and ParagraphStats
# NewsKeywords.objects.create_stats()
# ParagraphKeywords.objects.create_stats()

# Disabled report generation, kept as an inert string literal.
'''
## gen_reports
coefficients = [(0, 100)]
for alpha, beta in coefficients:
    print dt(), 'alpha=%.2f, beta=%.2f' % (alpha, beta)
    NewsKeywords.objects.create_keyword_items(alpha, beta, gen_report=True)
'''

# get 704 news
# Load the clustered doc ids.  list() keeps this a real, reusable list on
# Python 3 too, where a bare map() is a one-shot iterator that is always
# truthy (which would defeat the emptiness check below).
several_doc_ids = list(map(int, read_lines('.conf/clustered.txt')))

# Two-way mapping between news primary keys and their doc ids.
docs = {}
news_by_docs = {}
for news in News.objects.only('doc_id'):
    docs[news.pk] = news.doc_id
    news_by_docs[news.doc_id] = news.pk

# Resolve the distinct clustered doc ids to news primary keys; a doc id
# missing from news_by_docs raises KeyError.
several_news_ids = []
if several_doc_ids and news_by_docs:
    several_doc_ids = set(several_doc_ids)
    several_news_ids = [news_by_docs[doc_id] for doc_id in several_doc_ids]
    # NOTE(review): assumed to report a successful load, hence placed inside
    # the ``if`` -- confirm against the intended nesting.
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('loaded clustered ids')

# Paragraphs of the selected news, restricted to the 'news' field.
items = NewsParagraph.objects.filter(news__in=several_news_ids).only('news')