Exemple #1
0
 def create_paragraphs(self):
     """Collect the paragraphs of every news item and bulk-save them.

     Each object yielded by ``self.iterate()`` contributes whatever its own
     ``create_paragraphs()`` returns; the combined list is passed to
     ``self.bulk`` with ``NewsParagraph`` as the model, in chunks of 100.
     """
     collected = []
     ts('main loop')
     for entry in self.iterate():
         tc(100)
         collected.extend(entry.create_paragraphs())
     tp()
     td('Adding paragraphs of DB')
     self.bulk(collected, model=NewsParagraph, chunk_size=100)
 def create_keyword_items(self, several_news_ids=None):
     """Create keyword items for news and bulk-save them.

     :param several_news_ids: optional collection of news ids.  When it is
         truthy only rows matching ``base_id__in=several_news_ids`` are
         processed; otherwise everything from ``self.iterate()`` is.

     The lists returned by each row's ``create_keyword_items()`` are
     concatenated and passed to ``self.bulk`` together with
     ``TitleKeywordItem`` and ``10000``.
     """
     source = (self.filter(base_id__in=several_news_ids)
               if several_news_ids else self.iterate())
     ts('main loop')
     collected = []
     for entry in source:
         tc(100)
         collected.extend(entry.create_keyword_items())
     tp()
     self.bulk(collected, TitleKeywordItem, 10000)
Exemple #3
0
 def create_stems(self):
     """Create a stemmed object for every news item and bulk-save them.

     Calls ``create_stemmed()`` on each object from ``self.iterate()`` and
     hands the resulting list to ``self.bulk`` using ``self.stemmed_model``
     as the model, in chunks of 50.
     """
     stems = []
     ts('main loop')
     for entry in self.iterate():
         tc(100)
         stems.append(entry.create_stemmed())
     tp()
     td('Adding stems of DB')
     self.bulk(stems, model=self.stemmed_model, chunk_size=50)
 def create_stems(self):
     """Create a stemmed object for every news item and bulk-save them.

     Calls ``create_stemmed()`` on each object from ``self.iterate()`` and
     hands the resulting list to ``self.bulk`` using ``self.stemmed_model``
     as the model, in chunks of 50.
     """
     stems = []
     ts('main loop')
     for entry in self.iterate():
         tc(100)
         stems.append(entry.create_stemmed())
     tp()
     td('Adding stems of DB')
     self.bulk(stems, model=self.stemmed_model, chunk_size=50)
 def load_from_folder(self, news_path):
     ts('@ Loading files, adding news to DB, creating news_contests')
     files = os.listdir(news_path)
     items = []
     for filename in files:
         tc(100)
         news_content = \
             self.load_from_xml("{}/{}".format(news_path, filename))
         items.append(news_content)
     tp()
     print dt(), '-> Total entries:', len(items)
     print dt(), '@ Adding news_contests to DB'
     self.bulk(items, model=NewsContent, chunk_size=250)
Exemple #6
0
 def load_from_folder(self, news_path):
     ts('@ Loading files, adding news to DB, creating news_contests')
     files = os.listdir(news_path)
     items = []
     for filename in files:
         tc(100)
         news_content = \
             self.load_from_xml("{}/{}".format(news_path, filename))
         items.append(news_content)
     tp()
     print dt(), '-> Total entries:', len(items)
     print dt(), '@ Adding news_contests to DB'
     self.bulk(items, model=NewsContent, chunk_size=250)
 def load_news_keywords(self):
     """Fill ``self.valid_keywords`` with the keyword words of each news.

     Reads the ``NewsKeywordItem`` rows whose ``base`` is in
     ``self.several_news_ids`` (only the ``word`` and ``base`` fields) and
     groups their words into a dict keyed by ``base_id``.

     :raises MissedValueError: when ``self.several_news_ids`` is empty or
         otherwise falsy.
     """
     if not self.several_news_ids:
         raise MissedValueError('several_news_ids',
                                'load_several_clustered_news')
     queryset = NewsKeywordItem.objects.filter(
         base__in=self.several_news_ids).only('word', 'base')
     ts('query')
     rows = list(queryset)
     tp()
     self.valid_keywords = {}
     for row in rows:
         words = self.valid_keywords.setdefault(row.base_id, [])
         words.append(row.word)
 def load_news_keywords(self):
     """Fill ``self.valid_keywords`` with the keyword words of each news.

     Reads the ``NewsKeywordItem`` rows whose ``base`` is in
     ``self.several_news_ids`` (only the ``word`` and ``base`` fields) and
     groups their words into a dict keyed by ``base_id``.

     :raises MissedValueError: when ``self.several_news_ids`` is empty or
         otherwise falsy.
     """
     if not self.several_news_ids:
         raise MissedValueError('several_news_ids',
                                'load_several_clustered_news')
     queryset = NewsKeywordItem.objects.filter(
         base__in=self.several_news_ids).only('word', 'base')
     ts('query')
     rows = list(queryset)
     tp()
     self.valid_keywords = {}
     for row in rows:
         words = self.valid_keywords.setdefault(row.base_id, [])
         words.append(row.word)
Exemple #9
0
 def create_keywords(self, stop_words=None, angry_mode=False):
     """Create a keywords object for every news item and bulk-save them.

     :param stop_words: forwarded unchanged to each item's
         ``create_keywords``.
     :param angry_mode: forwarded unchanged to each item's
         ``create_keywords``.

     Items come from ``self.iterate(100)``; the results are saved through
     ``self.bulk`` with ``self.keywords_model`` as the model, in chunks of
     250.
     """
     td('Extract list of valid keywords from stemmed data')
     keyword_rows = []
     ts('main loop')
     for entry in self.iterate(100):
         tc(500)
         keyword_rows.append(entry.create_keywords(stop_words, angry_mode))
     tp()
     td('Adding news_keywords to DB')
     self.bulk(keyword_rows, model=self.keywords_model, chunk_size=250)
 def create_keywords(self, stop_words=None, angry_mode=False):
     """Create a keywords object for every news item and bulk-save them.

     :param stop_words: forwarded unchanged to each item's
         ``create_keywords``.
     :param angry_mode: forwarded unchanged to each item's
         ``create_keywords``.

     Items come from ``self.iterate(100)``; the results are saved through
     ``self.bulk`` with ``self.keywords_model`` as the model, in chunks of
     250.
     """
     td('Extract list of valid keywords from stemmed data')
     keyword_rows = []
     ts('main loop')
     for entry in self.iterate(100):
         tc(500)
         keyword_rows.append(entry.create_keywords(stop_words, angry_mode))
     tp()
     td('Adding news_keywords to DB')
     self.bulk(keyword_rows, model=self.keywords_model, chunk_size=250)
 def news_calculate_cosinuses(self, docs, news_docs, doc_ids=None):
     """Compute pairwise cosine values between news items and save them.

     The word/weight rows are grouped into one ``{word: weight}`` dict per
     news id.  For every pair of news ids with ``id2 > id1`` the two dicts
     are compared with ``vector_cos`` and a result row is created.  Rows
     are written with ``bulk_create`` in batches of 10000, and whatever is
     left over is written at the end.

     :param docs: mapping indexed by news id; its values are stored in the
         ``doc_1`` / ``doc_2`` fields of each result row.
     :param news_docs: mapping from document id to news id; read only when
         ``doc_ids`` is given.
     :param doc_ids: optional iterable of document ids.  When truthy, only
         the matching news are compared and rows are stored as
         ``CosResultSeveral``.  Otherwise everything from
         ``self.iterate()`` is compared, rows are stored as ``CosResult``
         and pairs up to and including the newest saved ``CosResult`` row
         are skipped, so an interrupted run can be continued.
     """
     data = dict()
     last1 = last2 = 0
     if doc_ids:
         news_ids = [news_docs[doc_id] for doc_id in set(doc_ids)]
         items = self.filter(base_id__in=news_ids)
         ts('get selected news')
         items = list(items)
         tp()
         model = CosResultSeveral
     else:
         # Slice before evaluating: testing the truth value of the
         # unsliced queryset would load the whole table just to read its
         # newest row.
         last_rows = list(CosResult.objects.order_by('-pk')[:1])
         if last_rows:
             last1 = last_rows[0].news_1_id
             last2 = last_rows[0].news_2_id
         items = self.iterate()
         model = CosResult
     ts('loading news words data')
     for item in items:
         tc(10000)
         # base_id is the foreign-key value already stored on the row;
         # going through item.base may cost one extra query per item.
         data.setdefault(item.base_id, dict())[item.word] = item.weight
     tp()
     results = []
     j = 0
     ts('main loop for news: %d' % len(data))
     # data is not modified below, so its items can be taken once.
     news_vectors = data.items()
     for news_id1, news1 in news_vectors:
         tc(10)
         if news_id1 < last1:
             continue
         for news_id2, news2 in news_vectors:
             # Each unordered pair is handled once, as (smaller, larger).
             if news_id2 <= news_id1:
                 continue
             # Skip pairs that were already stored by a previous run.
             if last1 == news_id1 and news_id2 <= last2:
                 continue
             results.append(
                 model(news_1_id=news_id1,
                       doc_1=docs[news_id1],
                       news_2_id=news_id2,
                       doc_2=docs[news_id2],
                       cos=vector_cos(news1, news2)))
             j += 1
             if not j % 10000:
                 model.objects.bulk_create(results)
                 td('added cos data: %d' % j)
                 results = []
         gc.collect()
     model.objects.bulk_create(results)
     tp()
     td('total added cos data: %d' % j)
 def news_calculate_cosinuses(self, docs, news_docs, doc_ids=None):
     """Compute pairwise cosine values between news items and save them.

     The word/weight rows are grouped into one ``{word: weight}`` dict per
     news id.  For every pair of news ids with ``id2 > id1`` the two dicts
     are compared with ``vector_cos`` and a result row is created.  Rows
     are written with ``bulk_create`` in batches of 10000, and whatever is
     left over is written at the end.

     :param docs: mapping indexed by news id; its values are stored in the
         ``doc_1`` / ``doc_2`` fields of each result row.
     :param news_docs: mapping from document id to news id; read only when
         ``doc_ids`` is given.
     :param doc_ids: optional iterable of document ids.  When truthy, only
         the matching news are compared and rows are stored as
         ``CosResultSeveral``.  Otherwise everything from
         ``self.iterate()`` is compared, rows are stored as ``CosResult``
         and pairs up to and including the newest saved ``CosResult`` row
         are skipped, so an interrupted run can be continued.
     """
     data = dict()
     last1 = last2 = 0
     if doc_ids:
         news_ids = [news_docs[doc_id] for doc_id in set(doc_ids)]
         items = self.filter(base_id__in=news_ids)
         ts('get selected news')
         items = list(items)
         tp()
         model = CosResultSeveral
     else:
         # Slice before evaluating: testing the truth value of the
         # unsliced queryset would load the whole table just to read its
         # newest row.
         last_rows = list(CosResult.objects.order_by('-pk')[:1])
         if last_rows:
             last1 = last_rows[0].news_1_id
             last2 = last_rows[0].news_2_id
         items = self.iterate()
         model = CosResult
     ts('loading news words data')
     for item in items:
         tc(10000)
         # base_id is the foreign-key value already stored on the row;
         # going through item.base may cost one extra query per item.
         data.setdefault(item.base_id, dict())[item.word] = item.weight
     tp()
     results = []
     j = 0
     ts('main loop for news: %d' % len(data))
     # data is not modified below, so its items can be taken once.
     news_vectors = data.items()
     for news_id1, news1 in news_vectors:
         tc(10)
         if news_id1 < last1:
             continue
         for news_id2, news2 in news_vectors:
             # Each unordered pair is handled once, as (smaller, larger).
             if news_id2 <= news_id1:
                 continue
             # Skip pairs that were already stored by a previous run.
             if last1 == news_id1 and news_id2 <= last2:
                 continue
             results.append(
                 model(news_1_id=news_id1,
                       doc_1=docs[news_id1],
                       news_2_id=news_id2,
                       doc_2=docs[news_id2],
                       cos=vector_cos(news1, news2)))
             j += 1
             if not j % 10000:
                 model.objects.bulk_create(results)
                 td('added cos data: %d' % j)
                 results = []
         gc.collect()
     model.objects.bulk_create(results)
     tp()
     td('total added cos data: %d' % j)
    def paragraph_calculate_cosinuses(self, docs, min_global_cos,
                                      several=True, save_good_news=True):
        # print dt(), 'calculate_cosinuses'
        data = dict()
        i = 0
        # last1 = last2 = 0
        # last = CosResult.objects.order_by('-pk')
        # if last:
        #     last = last[0]
        #     last1 = last.news_1_id
        #     last2 = last.news_2_id
        # items = CosResult.objects.iterate()
        if several:
            cos_results_model = CosResultSeveral
            paragraph_cos_results_model = ParagraphCosResultSeveral
            good_cos_results_model = CosResultAfterParagraphSeveral
        else:
            cos_results_model = CosResult
            paragraph_cos_results_model = ParagraphCosResult
            good_cos_results_model = CosResultAfterParagraph

        last1 = last2 = 0
        last = paragraph_cos_results_model.objects.order_by('-pk')
        if last:
            last = last[0]
            last1 = last.news_1_id
            last2 = last.news_2_id

        # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
        # items = cos_results_model.objects.filter(cos__gt=min_cos)
        ts('get news_ids from cos-table')
        items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
        # print dt(), '-> filter by min_cos (and getting news)'
        pairs = list()
        news_ids = list()
        for item in items:
            # if item.cos < min_news_cos:
            #     continue
            pairs.append((item.news_1_id, item.news_2_id, item.cos))
            news_ids.append(item.news_1_id)
            news_ids.append(item.news_2_id)
        news_ids = set(news_ids)
        tp()
        # print dt(), '-> filter paragraph keyword items by that news'
        ts('get paragraph words (using news_id from prev step)')
        items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
        for item in items:
            news_id = item.news_id
            paragraph_id = item.base_id
            tc(10000)
            data.setdefault(news_id, dict())
            data[news_id].setdefault(paragraph_id, dict())
            data[news_id][paragraph_id][item.word] = item.weight
            # if news_id > 50:
            #     break
        tp()
        results = []
        best_results = []
        j = p = c = 0
        pairs_ok = list()
        ts('main loop, pairs: %d' % len(pairs))
        pairs = sorted(pairs, key=itemgetter(1))
        pairs = sorted(pairs, key=itemgetter(0))
        for news_id_1, news_id_2, news_cos in pairs:
            if last1:
                if news_id_1 < last1:
                    continue
                if last1 == news_id_1 and news_id_2 <= last2:
                    continue
            pair_ok = False
            max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
            tc(10)
            for paragraph_id_1, paragraph_1 in data[news_id_1].items():
                # if news_id1 < last1:
                #     continue
                for paragraph_id_2, paragraph_2 in data[news_id_2].items():
                    if paragraph_id_2 <= paragraph_id_1:
                        continue
                    # if last1 == news_id1 and news_id2 <= last2:
                    #     continue
                    cos = vector_cos(paragraph_1, paragraph_2)
                    # results.append(paragraph_cos_results_model(
                    #     news_1_id=news_id_1, paragraph_1_id=paragraph_id_1,
                    #     news_2_id=news_id_2, paragraph_2_id=paragraph_id_2,
                    #     cos=cos))
                    if cos > min_global_cos:
                        pair_ok = True
                    if cos > max_local_cos:
                        max_local_cos = cos
                        best_paragraph_1 = paragraph_id_1
                        best_paragraph_2 = paragraph_id_2
                    # todo: calc and save max paragraph cos
                    j += 1
                    if not j % 10000:
                    #     paragraph_cos_results_model.objects.bulk_create(results)
                    #     print dt(), '   paragraph cos added:', j
                        td('calculated cos: %d' % j)
                    #     results = []
                gc.collect()
            if max_local_cos != -1:
                c += 1
                best_results.append(paragraph_cos_results_model(
                    news_1_id=news_id_1, paragraph_1_id=best_paragraph_1,
                    news_2_id=news_id_2, paragraph_2_id=best_paragraph_2,
                    cos=max_local_cos))
                if not c % 100:
                    paragraph_cos_results_model.objects.bulk_create(best_results)
                    # print dt(), '   best cos of paragraphs added:', c
                    td('best cos of paragraphs added: %d' % c)
                    best_results = []
            if save_good_news and pair_ok:
                p += 1
                pairs_ok.append(good_cos_results_model(
                    news_1_id=news_id_1, news_2_id=news_id_2,
                    doc_1=docs[news_id_1], doc_2=docs[news_id_2],
                    cos=news_cos))
                if not p % 100:
                    good_cos_results_model.objects.bulk_create(pairs_ok)
                    print dt(), '   good pairs of news added:', p
                    pairs_ok = []
        # paragraph_cos_results_model.objects.bulk_create(results)
        # print dt(), '-> paragraph cos added:', j
        paragraph_cos_results_model.objects.bulk_create(best_results)
        print dt(), '   best cos of paragraphs added:', c
        if save_good_news:
            good_cos_results_model.objects.bulk_create(pairs_ok)
            print dt(), '-> good pairs of news added:', p
    def paragraph_calculate_cosinuses(self,
                                      docs,
                                      min_global_cos,
                                      several=True,
                                      save_good_news=True):
        # print dt(), 'calculate_cosinuses'
        data = dict()
        i = 0
        # last1 = last2 = 0
        # last = CosResult.objects.order_by('-pk')
        # if last:
        #     last = last[0]
        #     last1 = last.news_1_id
        #     last2 = last.news_2_id
        # items = CosResult.objects.iterate()
        if several:
            cos_results_model = CosResultSeveral
            paragraph_cos_results_model = ParagraphCosResultSeveral
            good_cos_results_model = CosResultAfterParagraphSeveral
        else:
            cos_results_model = CosResult
            paragraph_cos_results_model = ParagraphCosResult
            good_cos_results_model = CosResultAfterParagraph

        last1 = last2 = 0
        last = paragraph_cos_results_model.objects.order_by('-pk')
        if last:
            last = last[0]
            last1 = last.news_1_id
            last2 = last.news_2_id

        # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
        # items = cos_results_model.objects.filter(cos__gt=min_cos)
        ts('get news_ids from cos-table')
        items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
        # print dt(), '-> filter by min_cos (and getting news)'
        pairs = list()
        news_ids = list()
        for item in items:
            # if item.cos < min_news_cos:
            #     continue
            pairs.append((item.news_1_id, item.news_2_id, item.cos))
            news_ids.append(item.news_1_id)
            news_ids.append(item.news_2_id)
        news_ids = set(news_ids)
        tp()
        # print dt(), '-> filter paragraph keyword items by that news'
        ts('get paragraph words (using news_id from prev step)')
        items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
        for item in items:
            news_id = item.news_id
            paragraph_id = item.base_id
            tc(10000)
            data.setdefault(news_id, dict())
            data[news_id].setdefault(paragraph_id, dict())
            data[news_id][paragraph_id][item.word] = item.weight
            # if news_id > 50:
            #     break
        tp()
        results = []
        best_results = []
        j = p = c = 0
        pairs_ok = list()
        ts('main loop, pairs: %d' % len(pairs))
        pairs = sorted(pairs, key=itemgetter(1))
        pairs = sorted(pairs, key=itemgetter(0))
        for news_id_1, news_id_2, news_cos in pairs:
            if last1:
                if news_id_1 < last1:
                    continue
                if last1 == news_id_1 and news_id_2 <= last2:
                    continue
            pair_ok = False
            max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
            tc(10)
            for paragraph_id_1, paragraph_1 in data[news_id_1].items():
                # if news_id1 < last1:
                #     continue
                for paragraph_id_2, paragraph_2 in data[news_id_2].items():
                    if paragraph_id_2 <= paragraph_id_1:
                        continue
                    # if last1 == news_id1 and news_id2 <= last2:
                    #     continue
                    cos = vector_cos(paragraph_1, paragraph_2)
                    # results.append(paragraph_cos_results_model(
                    #     news_1_id=news_id_1, paragraph_1_id=paragraph_id_1,
                    #     news_2_id=news_id_2, paragraph_2_id=paragraph_id_2,
                    #     cos=cos))
                    if cos > min_global_cos:
                        pair_ok = True
                    if cos > max_local_cos:
                        max_local_cos = cos
                        best_paragraph_1 = paragraph_id_1
                        best_paragraph_2 = paragraph_id_2
                    # todo: calc and save max paragraph cos
                    j += 1
                    if not j % 10000:
                        #     paragraph_cos_results_model.objects.bulk_create(results)
                        #     print dt(), '   paragraph cos added:', j
                        td('calculated cos: %d' % j)
                    #     results = []
                gc.collect()
            if max_local_cos != -1:
                c += 1
                best_results.append(
                    paragraph_cos_results_model(
                        news_1_id=news_id_1,
                        paragraph_1_id=best_paragraph_1,
                        news_2_id=news_id_2,
                        paragraph_2_id=best_paragraph_2,
                        cos=max_local_cos))
                if not c % 100:
                    paragraph_cos_results_model.objects.bulk_create(
                        best_results)
                    # print dt(), '   best cos of paragraphs added:', c
                    td('best cos of paragraphs added: %d' % c)
                    best_results = []
            if save_good_news and pair_ok:
                p += 1
                pairs_ok.append(
                    good_cos_results_model(news_1_id=news_id_1,
                                           news_2_id=news_id_2,
                                           doc_1=docs[news_id_1],
                                           doc_2=docs[news_id_2],
                                           cos=news_cos))
                if not p % 100:
                    good_cos_results_model.objects.bulk_create(pairs_ok)
                    print dt(), '   good pairs of news added:', p
                    pairs_ok = []
        # paragraph_cos_results_model.objects.bulk_create(results)
        # print dt(), '-> paragraph cos added:', j
        paragraph_cos_results_model.objects.bulk_create(best_results)
        print dt(), '   best cos of paragraphs added:', c
        if save_good_news:
            good_cos_results_model.objects.bulk_create(pairs_ok)
            print dt(), '-> good pairs of news added:', p