Esempio n. 1
0
    def build_name_ngrams(self):
        """Index every office by the bigrams and word stems of its name.

        While iterating over ``RUSSIA.iterate_offices()`` this method fills:

        * ``self.office_squeezes`` -- office id -> dict with the office
          ``name``, ``region`` (0 when the office has no region id) and
          ``parent_id``;
        * ``self.office_name_bigrams`` -- result of
          ``ngrams_from_default_dict`` over the bigram -> office ids map;
        * ``self.office_name_unigrams`` -- result of
          ``ngrams_from_default_dict`` over the stem -> office ids map.
        """
        self.logger.info("build bigrams")
        ids_by_bigram = defaultdict(set)
        ids_by_stem = defaultdict(set)
        self.office_squeezes = {}
        office: TOfficeInMemory
        for office in RUSSIA.iterate_offices():
            office_id = office.office_id
            office_name = office.name
            raw_region = office.region_id
            self.office_squeezes[office_id] = {
                'name': office_name,
                # Offices without a region are stored under region 0.
                'region': 0 if raw_region is None else raw_region,
                'parent_id': office.parent_id,
            }
            for bigram in self.get_bigrams(office_name):
                ids_by_bigram[bigram].add(office_id)
            stems = TOfficePredictIndex.get_word_stems(
                office_name, add_starter_and_enders=False)
            for stem in stems:
                ids_by_stem[stem].add(office_id)

        self.office_name_bigrams = self.ngrams_from_default_dict(ids_by_bigram)
        bigrams_count = self.get_bigrams_count()
        self.logger.info("bigrams count = {}".format(bigrams_count))

        # NOTE(review): the meaning of the second argument (3) is defined by
        # ngrams_from_default_dict, which is not visible here -- confirm.
        self.office_name_unigrams = self.ngrams_from_default_dict(ids_by_stem, 3)
        unigrams_count = self.get_unigrams_count()
        self.logger.info("unigrams count = {}".format(unigrams_count))
 def build_regional_tax_offices(self):
     """Return a dict mapping region id to the id of a tax office there.

     Only offices whose ``rubric_id`` equals ``TOfficeRubrics.Tax`` are
     considered. When several such offices share a region id, the one
     yielded last by ``RUSSIA.iterate_offices()`` is kept.

     Raises AssertionError when no tax office is found at all.
     """
     tax_offices = {
         office.region_id: office.office_id
         for office in RUSSIA.iterate_offices()
         if office.rubric_id == TOfficeRubrics.Tax
     }
     assert len(tax_offices) > 0
     return tax_offices
Esempio n. 3
0
 def build_offices_sitemap(self):
     """Write the office sitemap and register its file name.

     An office gets an ``office/<id>`` entry only when its group statistics
     exist and report more than 10 source documents. The collected paths
     are handed to ``self.write_sitemap`` with priority 0.4, and the base
     name of the sitemap file is appended to ``self.sitemaps``.
     """
     self.logger.info("build_offices_sitemaps")
     module_dir = os.path.dirname(__file__)
     sitemap_path = os.path.join(
         module_dir, "../../../disclosures/static/sitemap-office.xml")
     url_paths = []
     for office in RUSSIA.iterate_offices():
         stats = RUSSIA.calc_data_current.office_stats.get_group_data(
             office.office_id)
         if stats is None:
             continue
         documents = stats.source_document_count
         # Skip offices with an unknown or too small document count.
         if documents is None or not documents > 10:
             continue
         url_paths.append("office/{}".format(office.office_id))
     self.write_sitemap(url_paths, sitemap_path, priority=0.4)
     sitemap_name = os.path.basename(sitemap_path)
     self.sitemaps.append(sitemap_name)
    def build_aux_office_params(self, office_data: TGroupStatDataList):
        """Fill auxiliary per-office statistics into ``office_data``.

        Three passes are made; each one creates the office's group data in
        ``office_data`` on demand via ``get_or_create_group_data``:

        1. For every (office, income year) pair with income year between
           2009 and ``self.last_year`` inclusive, the number of sections is
           stored as ``declarants_count`` of that year's snapshot.
        2. For every office, the number of distinct source documents
           referenced by its sections is stored as
           ``source_document_count``.
        3. For every office from ``RUSSIA.iterate_offices()``, the fields
           ``child_office_examples`` (at most five child office ids),
           ``child_offices_count``, ``section_count`` (sum of
           ``declarants_count`` over all year snapshots) and ``urls`` (urls
           of the office web sites for which ``can_communicate()`` is
           true) are set.

        :param office_data: container that receives the statistics; it is
            modified in place.
        """
        # ignore self.income_stat_start_year: the lower bound 2009 is
        # hard-coded in the query below.
        query = """
            select o.id, min(s.income_year), count(s.id) 
            from declarations_office o
            join declarations_section s on s.office_id = o.id
            where s.income_year >= 2009 and s.income_year <= {}
            group by o.id, s.income_year
        """.format(self.last_year)
        with connection.cursor() as cursor:
            self.logger.info("execute {}".format(query.replace("\n", " ")))
            cursor.execute(query)
            self.logger.info("read data")
            # Rows are grouped by (office, year), so min(income_year) is
            # simply the year of the group.
            for office_id, income_year, section_count in cursor:
                ys = office_data.get_or_create_group_data(
                    office_id).get_or_create_year_snapshot(income_year)
                ys.declarants_count = section_count

        query = """
                    select o.id, count(distinct d.id) 
                    from declarations_office o
                    join declarations_section s on s.office_id = o.id
                    join declarations_source_document d on d.id = s.source_document_id
                    group by o.id
                """
        with connection.cursor() as cursor:
            self.logger.info("execute {}".format(query.replace("\n", " ")))
            cursor.execute(query)
            for office_id, cnt in cursor:
                oi = office_data.get_or_create_group_data(office_id)
                oi.source_document_count = cnt

        offices = RUSSIA.offices_in_memory
        child_offices = offices.get_child_offices_dict()
        for office in RUSSIA.iterate_offices():
            office_id = office.office_id
            oi = office_data.get_or_create_group_data(office_id)
            # NOTE(review): examples are left empty for offices WITHOUT a
            # parent (top-level offices) and filled only for offices that
            # have one -- this looks possibly inverted; confirm the intent.
            if office.parent_id is None:
                oi.child_office_examples = []
            else:
                oi.child_office_examples = [
                    c.office_id for c in child_offices[office_id][:5]]
            oi.child_offices_count = len(child_offices[office_id])
            oi.section_count = sum(s.declarants_count
                                   for s in oi.year_snapshots.values())
            oi.urls = [x.url for x in office.office_web_sites
                       if x.can_communicate()]
Esempio n. 5
0
    def gen_documents(self):
        """Yield one index document per office.

        Each yielded dict carries the office id as ``_id``,
        ``self.index_name`` as ``_index`` and the office fields under
        ``_source``. ``source_document_count`` is 0 when the office has no
        group statistics; otherwise it is whatever the statistics report.
        """
        for office in RUSSIA.iterate_offices():
            stats = RUSSIA.calc_data_current.office_stats.get_group_data(
                office.office_id)
            doc_cnt = 0 if stats is None else stats.source_document_count

            payload = {
                'id': office.office_id,
                'name': office.name,
                'parent_id': office.parent_id,
                'source_document_count': doc_cnt,
                'rubric_id': office.rubric_id,
                'region_id': office.region_id,
            }
            yield {
                "_id": office.office_id,
                "_index": self.index_name,
                "_source": payload,
            }