def create_keyword_items(self, all_paragraphs=None, news_by_paragraph=None,
                          valid_keywords=None):
     print dt(), '@ Create paragraph keywords'
     i = 0
     if all_paragraphs:
         items = self.filter(base_id__in=all_paragraphs)
     else:
         items = self.iterate()
     for news in items:
         i += 1
         if not i % 100:
             print dt(), '-> processed:', i
         news.create_keyword_items(news_by_paragraph, valid_keywords)
 def load_from_folder(self, news_path):
     ts('@ Loading files, adding news to DB, creating news_contests')
     files = os.listdir(news_path)
     items = []
     for filename in files:
         tc(100)
         news_content = \
             self.load_from_xml("{}/{}".format(news_path, filename))
         items.append(news_content)
     tp()
     print dt(), '-> Total entries:', len(items)
     print dt(), '@ Adding news_contests to DB'
     self.bulk(items, model=NewsContent, chunk_size=250)
Beispiel #3
0
 def load_from_folder(self, news_path):
     ts('@ Loading files, adding news to DB, creating news_contests')
     files = os.listdir(news_path)
     items = []
     for filename in files:
         tc(100)
         news_content = \
             self.load_from_xml("{}/{}".format(news_path, filename))
         items.append(news_content)
     tp()
     print dt(), '-> Total entries:', len(items)
     print dt(), '@ Adding news_contests to DB'
     self.bulk(items, model=NewsContent, chunk_size=250)
 def create_stats(self):
     print dt(), '@ Create stats'
     i = 0
     items = []
     for news in self.iterate():
         i += 1
         if not i % 1000:
             print dt(), '-> processed:', i
         stats = news.create_stats()
         if stats:
             items.append(stats)
     print dt(), '@ Adding stats to DB'
     self.bulk(items, model=self.stats_model, chunk_size=1000)
    def paragraph_calculate_cosinuses(self, docs, min_global_cos,
                                      several=True, save_good_news=True):
        # print dt(), 'calculate_cosinuses'
        data = dict()
        i = 0
        # last1 = last2 = 0
        # last = CosResult.objects.order_by('-pk')
        # if last:
        #     last = last[0]
        #     last1 = last.news_1_id
        #     last2 = last.news_2_id
        # items = CosResult.objects.iterate()
        if several:
            cos_results_model = CosResultSeveral
            paragraph_cos_results_model = ParagraphCosResultSeveral
            good_cos_results_model = CosResultAfterParagraphSeveral
        else:
            cos_results_model = CosResult
            paragraph_cos_results_model = ParagraphCosResult
            good_cos_results_model = CosResultAfterParagraph

        last1 = last2 = 0
        last = paragraph_cos_results_model.objects.order_by('-pk')
        if last:
            last = last[0]
            last1 = last.news_1_id
            last2 = last.news_2_id

        # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
        # items = cos_results_model.objects.filter(cos__gt=min_cos)
        ts('get news_ids from cos-table')
        items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
        # print dt(), '-> filter by min_cos (and getting news)'
        pairs = list()
        news_ids = list()
        for item in items:
            # if item.cos < min_news_cos:
            #     continue
            pairs.append((item.news_1_id, item.news_2_id, item.cos))
            news_ids.append(item.news_1_id)
            news_ids.append(item.news_2_id)
        news_ids = set(news_ids)
        tp()
        # print dt(), '-> filter paragraph keyword items by that news'
        ts('get paragraph words (using news_id from prev step)')
        items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
        for item in items:
            news_id = item.news_id
            paragraph_id = item.base_id
            tc(10000)
            data.setdefault(news_id, dict())
            data[news_id].setdefault(paragraph_id, dict())
            data[news_id][paragraph_id][item.word] = item.weight
            # if news_id > 50:
            #     break
        tp()
        results = []
        best_results = []
        j = p = c = 0
        pairs_ok = list()
        ts('main loop, pairs: %d' % len(pairs))
        pairs = sorted(pairs, key=itemgetter(1))
        pairs = sorted(pairs, key=itemgetter(0))
        for news_id_1, news_id_2, news_cos in pairs:
            if last1:
                if news_id_1 < last1:
                    continue
                if last1 == news_id_1 and news_id_2 <= last2:
                    continue
            pair_ok = False
            max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
            tc(10)
            for paragraph_id_1, paragraph_1 in data[news_id_1].items():
                # if news_id1 < last1:
                #     continue
                for paragraph_id_2, paragraph_2 in data[news_id_2].items():
                    if paragraph_id_2 <= paragraph_id_1:
                        continue
                    # if last1 == news_id1 and news_id2 <= last2:
                    #     continue
                    cos = vector_cos(paragraph_1, paragraph_2)
                    # results.append(paragraph_cos_results_model(
                    #     news_1_id=news_id_1, paragraph_1_id=paragraph_id_1,
                    #     news_2_id=news_id_2, paragraph_2_id=paragraph_id_2,
                    #     cos=cos))
                    if cos > min_global_cos:
                        pair_ok = True
                    if cos > max_local_cos:
                        max_local_cos = cos
                        best_paragraph_1 = paragraph_id_1
                        best_paragraph_2 = paragraph_id_2
                    # todo: calc and save max paragraph cos
                    j += 1
                    if not j % 10000:
                    #     paragraph_cos_results_model.objects.bulk_create(results)
                    #     print dt(), '   paragraph cos added:', j
                        td('calculated cos: %d' % j)
                    #     results = []
                gc.collect()
            if max_local_cos != -1:
                c += 1
                best_results.append(paragraph_cos_results_model(
                    news_1_id=news_id_1, paragraph_1_id=best_paragraph_1,
                    news_2_id=news_id_2, paragraph_2_id=best_paragraph_2,
                    cos=max_local_cos))
                if not c % 100:
                    paragraph_cos_results_model.objects.bulk_create(best_results)
                    # print dt(), '   best cos of paragraphs added:', c
                    td('best cos of paragraphs added: %d' % c)
                    best_results = []
            if save_good_news and pair_ok:
                p += 1
                pairs_ok.append(good_cos_results_model(
                    news_1_id=news_id_1, news_2_id=news_id_2,
                    doc_1=docs[news_id_1], doc_2=docs[news_id_2],
                    cos=news_cos))
                if not p % 100:
                    good_cos_results_model.objects.bulk_create(pairs_ok)
                    print dt(), '   good pairs of news added:', p
                    pairs_ok = []
        # paragraph_cos_results_model.objects.bulk_create(results)
        # print dt(), '-> paragraph cos added:', j
        paragraph_cos_results_model.objects.bulk_create(best_results)
        print dt(), '   best cos of paragraphs added:', c
        if save_good_news:
            good_cos_results_model.objects.bulk_create(pairs_ok)
            print dt(), '-> good pairs of news added:', p
    def paragraph_calculate_cosinuses(self,
                                      docs,
                                      min_global_cos,
                                      several=True,
                                      save_good_news=True):
        # print dt(), 'calculate_cosinuses'
        data = dict()
        i = 0
        # last1 = last2 = 0
        # last = CosResult.objects.order_by('-pk')
        # if last:
        #     last = last[0]
        #     last1 = last.news_1_id
        #     last2 = last.news_2_id
        # items = CosResult.objects.iterate()
        if several:
            cos_results_model = CosResultSeveral
            paragraph_cos_results_model = ParagraphCosResultSeveral
            good_cos_results_model = CosResultAfterParagraphSeveral
        else:
            cos_results_model = CosResult
            paragraph_cos_results_model = ParagraphCosResult
            good_cos_results_model = CosResultAfterParagraph

        last1 = last2 = 0
        last = paragraph_cos_results_model.objects.order_by('-pk')
        if last:
            last = last[0]
            last1 = last.news_1_id
            last2 = last.news_2_id

        # todo: get all (!!!) pairs (except cos=0 and cos>0.95)
        # items = cos_results_model.objects.filter(cos__gt=min_cos)
        ts('get news_ids from cos-table')
        items = cos_results_model.objects.exclude(cos=0).exclude(cos=1)
        # print dt(), '-> filter by min_cos (and getting news)'
        pairs = list()
        news_ids = list()
        for item in items:
            # if item.cos < min_news_cos:
            #     continue
            pairs.append((item.news_1_id, item.news_2_id, item.cos))
            news_ids.append(item.news_1_id)
            news_ids.append(item.news_2_id)
        news_ids = set(news_ids)
        tp()
        # print dt(), '-> filter paragraph keyword items by that news'
        ts('get paragraph words (using news_id from prev step)')
        items = ParagraphKeywordItem.objects.filter(news_id__in=news_ids)
        for item in items:
            news_id = item.news_id
            paragraph_id = item.base_id
            tc(10000)
            data.setdefault(news_id, dict())
            data[news_id].setdefault(paragraph_id, dict())
            data[news_id][paragraph_id][item.word] = item.weight
            # if news_id > 50:
            #     break
        tp()
        results = []
        best_results = []
        j = p = c = 0
        pairs_ok = list()
        ts('main loop, pairs: %d' % len(pairs))
        pairs = sorted(pairs, key=itemgetter(1))
        pairs = sorted(pairs, key=itemgetter(0))
        for news_id_1, news_id_2, news_cos in pairs:
            if last1:
                if news_id_1 < last1:
                    continue
                if last1 == news_id_1 and news_id_2 <= last2:
                    continue
            pair_ok = False
            max_local_cos = best_paragraph_1 = best_paragraph_2 = -1
            tc(10)
            for paragraph_id_1, paragraph_1 in data[news_id_1].items():
                # if news_id1 < last1:
                #     continue
                for paragraph_id_2, paragraph_2 in data[news_id_2].items():
                    if paragraph_id_2 <= paragraph_id_1:
                        continue
                    # if last1 == news_id1 and news_id2 <= last2:
                    #     continue
                    cos = vector_cos(paragraph_1, paragraph_2)
                    # results.append(paragraph_cos_results_model(
                    #     news_1_id=news_id_1, paragraph_1_id=paragraph_id_1,
                    #     news_2_id=news_id_2, paragraph_2_id=paragraph_id_2,
                    #     cos=cos))
                    if cos > min_global_cos:
                        pair_ok = True
                    if cos > max_local_cos:
                        max_local_cos = cos
                        best_paragraph_1 = paragraph_id_1
                        best_paragraph_2 = paragraph_id_2
                    # todo: calc and save max paragraph cos
                    j += 1
                    if not j % 10000:
                        #     paragraph_cos_results_model.objects.bulk_create(results)
                        #     print dt(), '   paragraph cos added:', j
                        td('calculated cos: %d' % j)
                    #     results = []
                gc.collect()
            if max_local_cos != -1:
                c += 1
                best_results.append(
                    paragraph_cos_results_model(
                        news_1_id=news_id_1,
                        paragraph_1_id=best_paragraph_1,
                        news_2_id=news_id_2,
                        paragraph_2_id=best_paragraph_2,
                        cos=max_local_cos))
                if not c % 100:
                    paragraph_cos_results_model.objects.bulk_create(
                        best_results)
                    # print dt(), '   best cos of paragraphs added:', c
                    td('best cos of paragraphs added: %d' % c)
                    best_results = []
            if save_good_news and pair_ok:
                p += 1
                pairs_ok.append(
                    good_cos_results_model(news_1_id=news_id_1,
                                           news_2_id=news_id_2,
                                           doc_1=docs[news_id_1],
                                           doc_2=docs[news_id_2],
                                           cos=news_cos))
                if not p % 100:
                    good_cos_results_model.objects.bulk_create(pairs_ok)
                    print dt(), '   good pairs of news added:', p
                    pairs_ok = []
        # paragraph_cos_results_model.objects.bulk_create(results)
        # print dt(), '-> paragraph cos added:', j
        paragraph_cos_results_model.objects.bulk_create(best_results)
        print dt(), '   best cos of paragraphs added:', c
        if save_good_news:
            good_cos_results_model.objects.bulk_create(pairs_ok)
            print dt(), '-> good pairs of news added:', p