Example #1
    def handle(self, *args, **options):
        all_decls = NACPDeclaration.search().query('match_all')
        if options["to"] is not None:
            all_decls = all_decls[options["from"]:options["to"]].execute()
        elif options["from"]:
            all_decls = all_decls[options["from"]:].execute()
        else:
            all_decls = all_decls.scan()

        w = DictWriter(options["outfile"],
                       fieldnames=["_id"] + options["field"])

        w.writeheader()

        for decl in all_decls:
            decl_dict = decl.to_dict()

            row = {
                field: self.fetch_field(decl_dict, field)
                for field in options["field"]
            }

            row["_id"] = decl.meta.id

            w.writerow(row)
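Example #1 calls a `fetch_field` helper that the snippet does not show. A minimal sketch of what it might look like, assuming the requested fields are dot-separated paths into the nested declaration dict (only the name and call signature come from the snippet; the traversal logic is an assumption):

    def fetch_field(self, doc, path):
        # Hypothetical helper: walk a dot-separated path such as
        # "intro.declaration_year" through nested dicts, returning ""
        # as soon as a segment is missing.
        value = doc
        for key in path.split("."):
            if not isinstance(value, dict):
                return ""
            value = value.get(key, "")
        return value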
Example #2
    def handle(self, *args, **options):
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists",
                                                   field=AGGREGATED_FIELD_NAME)

        if not options["export_all"]:
            to_export = to_export.query(
                "bool",
                must=[Q("term", intro__doc_type="Щорічна")],
                must_not=[Q("exists", field="corrected_declarations")])

        if options["filter_future_declarations"]:
            to_export = to_export.query(
                "range", intro__declaration_year={"lt": datetime.now().year})

        w = None
        with open(options["destination"], "w") as fp:
            for i, d in enumerate(to_export.scan()):
                row = d[AGGREGATED_FIELD_NAME].to_dict()
                row['id'] = d.meta.id

                if not w:
                    w = DictWriter(fp, fieldnames=row.keys())
                    w.writeheader()

                w.writerow(row)
                if i % 10000 == 0 and i:
                    self.stdout.write("{} declarations exported".format(i))
Example #3
    def get_raw_data(self, year, order_by, limit=10000):
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists", field=AGGREGATED_FIELD_NAME)

        to_export = to_export.query(
            "bool",
            must=[
                Q("term", intro__doc_type="Щорічна"),
                Q("term", intro__declaration_year=year)
            ],
            must_not=[
                Q("exists", field="corrected_declarations"),
                Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # F*****g Melnytchuk
                Q("term", _id="nacp_c67549d0-abc0-48fe-b529-9185efe1a3ce"),  # F*****g idiots
                Q("term", _id="nacp_2e07bb01-5ca8-4188-97c6-6297f7a4d2ad"),  # F*****g idiots
                Q("term", _id="nacp_f1b25e4d-e691-48d6-99b1-758e94764b91"), # F*****g Motsyor
                Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
            ]
        ).sort(
            {'{}.{}'.format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
        )[:limit]

        res = []

        for d in to_export.execute():
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            if row[order_by] > 10000000000:
                continue

            row["id"] = d._id
            res.append(row)

        return res
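The `**{"{}__outlier".format(AGGREGATED_FIELD_NAME): True}` expansion is elasticsearch_dsl's usual field-name shortcut built dynamically: double underscores in keyword arguments are serialized as dots. A small illustration, assuming AGGREGATED_FIELD_NAME is "aggregated" (Example #5 hardcodes that prefix in its sort call):

from elasticsearch_dsl import Q

AGGREGATED_FIELD_NAME = "aggregated"  # assumption, inferred from Example #5

q = Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
# The double underscore becomes a dot in the serialized query:
# {'term': {'aggregated.outlier': True}}
print(q.to_dict())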
Example #4
def load_declarations(new_ids, limit=LOAD_DECLS_LIMIT):
    fields = ['meta.id', 'general.*', 'intro.declaration_year']

    if len(new_ids) > limit:
        logger.error("load new_ids %d limit %d exceed", len(new_ids), limit)
        new_ids = new_ids[:limit]

    decl_list = NACPDeclaration.mget(new_ids,
                                     raise_on_error=False,
                                     missing='skip',
                                     _source=fields)

    if not decl_list:
        decl_list = []

    if len(decl_list) < len(new_ids):
        add_list = Declaration.mget(new_ids,
                                    raise_on_error=False,
                                    missing='skip',
                                    _source=fields)
        if add_list:
            decl_list.extend(add_list)

    if len(decl_list) < len(new_ids):
        logger.error("load new_ids %d docs not found",
                     len(new_ids) - len(decl_list))

    return decl_list
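load_declarations relies on a module-level logger and a LOAD_DECLS_LIMIT cap that are defined elsewhere; a minimal sketch of those assumed definitions (the limit value is purely illustrative):

import logging

logger = logging.getLogger(__name__)
LOAD_DECLS_LIMIT = 100  # illustrative; the real cap lives elsewhere in the project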
Example #5
    def get_raw_data(self, year, order_by, limit=10000):
        to_export = NACPDeclaration.search().source(
            include=[AGGREGATED_FIELD_NAME]).query("exists",
                                                   field=AGGREGATED_FIELD_NAME)

        to_export = to_export.query(
            "bool",
            must=[
                Q("term", intro__doc_type="Щорічна"),
                Q("term", intro__declaration_year=year)
            ],
            must_not=[
                Q("exists", field="corrected_declarations"),
                Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # F*****g Melnytchuk
                Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
            ]
        ).sort(
            {'aggregated.{}'.format(order_by): {"order": "desc"}}
        )[:limit]

        res = []

        for d in to_export.execute():
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            row["id"] = d._id
            res.append(row)

        return res
Example #6
def populate_declarant_id(apps, schema_editor):
    Declaration = apps.get_model('landings', 'Declaration')
    for d in Declaration.objects.filter(user_declarant_id__isnull=True):
        d.user_declarant_id = d.source["infocard"].get("user_declarant_id",
                                                       None)
        if d.user_declarant_id is None:
            es_decl = NACPDeclaration.get(id=d.declaration_id)
            d.user_declarant_id = getattr(es_decl.intro, "user_declarant_id",
                                          None)

        d.save()
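populate_declarant_id has the (apps, schema_editor) signature of a Django data migration, so wiring it in would look roughly like this (the app label matches the get_model() call above; the dependency is a placeholder):

from django.db import migrations


class Migration(migrations.Migration):
    dependencies = [
        ("landings", "0001_initial"),  # placeholder dependency
    ]

    operations = [
        # noop reverse lets the migration be unapplied cleanly
        migrations.RunPython(populate_declarant_id, migrations.RunPython.noop),
    ]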
Example #7
    def handle(self, *args, **options):
        all_decls = (NACPDeclaration.search().query("match_all").source([
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1",
        ]))

        all_decls = all_decls.filter(
            "range",
            intro__date={
                "gte": date(options["year_since"], 1, 1),
                "lt": datetime.now().replace(
                    hour=0, minute=0, second=0, microsecond=0),
            },
        )

        w = DictWriter(
            options["outfile"],
            fieldnames=[
                "id",
                "declaration.url",
                "intro.date",
                "intro.doc_type",
                "nacp_orig.step_1.postCategory",
                "nacp_orig.step_1.postType",
            ],
        )

        for decl in tqdm(all_decls.scan(), total=all_decls.count()):
            w.writerow({
                "id": decl.meta.id,
                "declaration.url": decl.declaration.url,
                "intro.date": decl.intro.date.date(),
                "intro.doc_type": decl.intro.doc_type,
                "nacp_orig.step_1.postCategory":
                    getattr(decl.nacp_orig.step_1, "postCategory", ""),
                "nacp_orig.step_1.postType":
                    getattr(decl.nacp_orig.step_1, "postType", ""),
            })
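Note that, unlike Example #1, this command never writes a header row. If one is wanted, a single call between building the DictWriter and the scan loop would add it:

        w.writeheader()  # optional: emit the fieldnames row before the data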
Example #8
    def handle(self, *args, **options):
        try:
            base_dir = options['file_path']
            corrected_file = options['corrected_file']
        except KeyError:
            raise CommandError(
                'First argument must be a path to the source files and the second is the file name of a CSV with corrected declarations')

        self.stdout.write("Gathering JSON documents from {}".format(base_dir))
        self.jsons = list(glob2.glob(os.path.join(base_dir, "**/*.json")))
        self.stdout.write("Gathered {} JSON documents".format(len(self.jsons)))

        corrected = set()
        with open(corrected_file, "r") as fp:
            r = DictReader(fp)
            for row in r:
                corrected.add(row["uuid"])

        DeclarationStaticObj.corrected = corrected

        NACPDeclaration.init()
        counter = 0

        my_tiny_pool = Pool(self.number_of_processes)

        if not options["update_all_docs"]:
            self.stdout.write("Obtaining uuids of already indexed documents")

            s = NACPDeclaration.search().source([])
            existing_guids = set(
                h.meta.id.replace("nacp_", "") for h in s.scan())
            self.stdout.write("{} uuids are currently in index".format(
                len(existing_guids)))

            incoming_files = dict(
                filter(
                    None,
                    my_tiny_pool.map(parse_guid_from_fname, self.jsons)
                )
            )

            incoming_guids = set(incoming_files.keys())

            self.stdout.write("{} uuids are found in input folder".format(
                len(incoming_guids)))

            self.jsons = [
                incoming_files[k] for k in incoming_guids - existing_guids
            ]

            self.stdout.write("{} uuids left after the filtering".format(
                len(self.jsons)))

        for ix in range(0, len(self.jsons), self.chunk_size):
            chunk = self.jsons[ix:ix + self.chunk_size]

            result = list(
                filter(
                    None,
                    my_tiny_pool.map(DeclarationStaticObj.parse, chunk)
                )
            )

            counter += len(result)

            bulk(self.es, result)

            if ix:
                self.stdout.write(
                    'Loaded {} items to persistence storage'.format(ix))

        self.stdout.write(
            'Finished loading {} items to persistence storage'.format(counter))
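parse_guid_from_fname is not shown; since its results are passed through filter(None, ...) and into dict(), it presumably returns a (guid, filename) pair, or None for files it cannot handle. A minimal sketch under that assumption, treating the basename without its extension as the uuid:

import os

def parse_guid_from_fname(json_fname):
    # Hypothetical: derive the declaration uuid from the file name,
    # e.g. ".../e46bba0c-....json" -> ("e46bba0c-...", json_fname).
    guid, ext = os.path.splitext(os.path.basename(json_fname))
    if ext == ".json" and guid:
        return guid.lower(), json_fname
    return None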
Example #9
    def _parse_me(cls, base_fname):
        json_fname = "{}.json".format(base_fname)
        html_fname = "{}.html".format(base_fname)
        resp = {
            "intro": {},
            "declaration": {}
        }

        try:
            with open(json_fname, "r") as fp:
                data = json.load(fp)

            with open(html_fname, "r") as fp:
                raw_html = fp.read()
                html = Selector(raw_html)
        except ValueError:
            print(
                "File {} or its HTML counterpart cannot be parsed".format(json_fname))
            return None
        except FileNotFoundError:
            print(
                "File {} or its HTML counterpart cannot be found".format(json_fname))
            return None

        id_ = data.get("id")
        created_date = data.get("created_date")

        raw_html_lowered = raw_html.lower()
        for chunk in cls.dangerous_chunks:
            if chunk in raw_html_lowered:
                raise BadHTMLData("Dangerous fragment found: {}, {}".format(
                    id_, base_fname))

        try:
            data = data["data"]
        except KeyError:
            raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

        if "step_0" not in data:
            raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname))

        resp["_id"] = "nacp_{}".format(id_)
        resp["ft_src"] = "\n".join(cls.extract_textual_data(html))
        resp["nacp_orig"] = data
        resp["declaration"]["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_)
        resp["declaration"]["source"] = "NACP"
        resp["declaration"]["basename"] = os.path.basename(base_fname)

        resp["intro"]["corrected"] = id_ in cls.corrected
        resp["intro"]["date"] = cls.parse_date(created_date)

        if "declarationType" not in data["step_0"] or "changesYear" in data["step_0"]:
            resp["intro"]["doc_type"] = "Форма змін"

            if "changesYear" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(data["step_0"]["changesYear"])
        else:
            resp["intro"]["doc_type"] = cls.declaration_types[data["step_0"]["declarationType"]]
            if "declarationYearTo" in data["step_0"]:
                resp["intro"]["declaration_year_to"] = cls.parse_date(data["step_0"]["declarationYearTo"])

            if "declarationYearFrom" in data["step_0"]:
                resp["intro"]["declaration_year_from"] = cls.parse_date(data["step_0"]["declarationYearFrom"])
                resp["intro"]["declaration_year"] = resp["intro"]["declaration_year_from"].year

            if "declarationYear1" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear1"])

            if "declarationYear3" in data["step_0"] and data["step_0"]["declarationYear3"]:
                resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear3"])

            if "declarationYear4" in data["step_0"] and data["step_0"]["declarationYear4"]:
                resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear4"])

        resp["general"] = {
            "last_name": replace_apostrophes(title(data["step_1"]["lastname"])),
            "name": replace_apostrophes(title(data["step_1"]["firstname"])),
            "patronymic": replace_apostrophes(title(data["step_1"]["middlename"])),
            "full_name": replace_apostrophes("{} {} {}".format(
                title(data["step_1"]["lastname"]),
                title(data["step_1"]["firstname"]),
                title(data["step_1"]["middlename"]),
            )),
            "post": {
                "post": replace_apostrophes(data["step_1"].get("workPost", "")),
                "post_type": replace_apostrophes(data["step_1"].get("postType", "")),
                "office": replace_apostrophes(data["step_1"].get("workPlace", "")),
                "actual_region": replace_apostrophes(cls.region_types.get(data["step_1"].get("actual_region", ""), "")),
                "region": replace_apostrophes(cls.region_types.get(data["step_1"].get("region", ""), "")),
            }
        }

        if "step_2" in data:
            family = data["step_2"]

            if isinstance(family, dict):
                resp["general"]["family"] = []

                for member in family.values():
                    if not isinstance(member, dict):
                        continue

                    resp["general"]["family"].append({
                        "family_name": replace_apostrophes("{} {} {}".format(
                            title(member.get("lastname", "")),
                            title(member.get("firstname", "")),
                            title(member.get("middlename", "")),
                        )),

                        "relations": member.get("subjectRelation", "")
                    })

        # get regions from estate list
        if "step_3" in data and isinstance(data["step_3"], dict) and data["step_3"]:
            if "estate" not in resp:
                resp["estate"] = []
            for estate in data["step_3"].values():
                if isinstance(estate, dict) and "region" in estate:
                    region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), ""))
                    if region:
                        resp["estate"].append({"region": region})

        if "step_4" in data and isinstance(data["step_4"], dict) and data["step_4"]:
            if "estate" not in resp:
                resp["estate"] = []
            for estate in data["step_4"].values():
                if isinstance(estate, dict) and "region" in estate:
                    region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), ""))
                    if region:
                        resp["estate"].append({"region": region})

        if "estate" in resp:
            estate_list = html.css(
                "table:contains('Місцезнаходження') td:contains('Населений пункт') span::text"
            ).extract()

            for estate in estate_list:
                region = cls.decode_region(estate)
                if region:
                    resp["estate"].append({"region": region})

        resp['general']['full_name_suggest'] = [
            {
                'input': resp['general']['full_name'],
                'weight': 5
            },
            {
                'input': ' '.join(
                    [
                        resp['general']['name'],
                        resp['general']['patronymic'],
                        resp['general']['last_name']
                    ]
                ),
                'weight': 3
            },
            {
                'input': ' '.join(
                    [
                        resp['general']['name'],
                        resp['general']['last_name']
                    ]
                ),
                'weight': 3
            }
        ]

        resp['general']['full_name_for_sorting'] = keyword_for_sorting(resp['general']['full_name'])

        if not resp["general"]["post"]["region"]:
            region_html = html.css(
                "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"
            ).extract()
            if len(region_html) > 1:
                resp["general"]["post"]["region"] = cls.decode_region(region_html[1])

        if not resp["general"]["post"]["actual_region"]:
            region_html = html.css(
                "fieldset:contains('Місце фактичного проживання') .person-info:contains('Місто')::text"
            ).extract()
            if len(region_html) > 1:
                resp["general"]["post"]["actual_region"] = cls.decode_region(region_html[1])

        # if set only one region use it value for second one
        if not resp["general"]["post"]["actual_region"] and resp["general"]["post"]["region"]:
            resp["general"]["post"]["actual_region"] = resp["general"]["post"]["region"]
        elif not resp["general"]["post"]["region"] and resp["general"]["post"]["actual_region"]:
            resp["general"]["post"]["region"] = resp["general"]["post"]["actual_region"]

        resp["index_card"] = concat_fields(resp, NACPDeclaration.INDEX_CARD_FIELDS)

        return NACPDeclaration(**resp).to_dict(True)
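Helpers such as title and replace_apostrophes come from elsewhere in the project. A plausible sketch of the apostrophe normalization, assuming its job is to map the apostrophe look-alikes that occur in Ukrainian names onto one canonical character (the exact character set is an assumption):

def replace_apostrophes(value):
    # Hypothetical normalization: map apostrophe look-alikes to U+2019.
    for ch in "'`‘ʼ":
        value = value.replace(ch, "’")
    return value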
Example #10
    def _parse_me(cls, base_fname):
        json_fname = "{}.json".format(base_fname)
        html_fname = "{}.html".format(base_fname)
        resp = {"intro": {}, "declaration": {}}

        try:
            with open(json_fname, "r") as fp:
                data = json.load(fp)

            with open(html_fname, "r") as fp:
                raw_html = fp.read()
                html = Selector(raw_html)
        except ValueError:
            print("File {} or its HTML counterpart cannot be parsed".format(
                json_fname))
            return None
        except FileNotFoundError:
            print("File {} or its HTML counterpart cannot be found".format(
                json_fname))
            return None

        id_ = data.get("id")
        created_date = data.get("created_date")

        raw_html_lowered = raw_html.lower()
        for chunk in cls.dangerous_chunks:
            if chunk in raw_html_lowered:
                raise BadHTMLData("Dangerous fragment found: {}, {}".format(
                    id_, base_fname))

        try:
            data = data["data"]
        except KeyError:
            raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname))

        if "step_0" not in data:
            raise BadJSONData("Bad header format: {}, {}".format(
                id_, base_fname))

        resp["_id"] = "nacp_{}".format(id_)
        resp["nacp_src"] = "\n".join(cls.extract_textual_data(html))
        resp["nacp_orig"] = data
        resp["declaration"][
            "url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_)
        resp["declaration"]["source"] = "NACP"
        resp["declaration"]["basename"] = os.path.basename(base_fname)

        resp["intro"]["corrected"] = id_ in cls.corrected
        resp["intro"]["date"] = cls.parse_date(created_date)

        if "declarationType" not in data["step_0"] or "changesYear" in data[
                "step_0"]:
            resp["intro"]["doc_type"] = "Форма змін"

            if "changesYear" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(
                    data["step_0"]["changesYear"])
        else:
            resp["intro"]["doc_type"] = cls.declaration_types[
                data["step_0"]["declarationType"]]
            if "declarationYearTo" in data["step_0"]:
                resp["intro"]["declaration_year_to"] = cls.parse_date(
                    data["step_0"]["declarationYearTo"])

            if "declarationYearFrom" in data["step_0"]:
                resp["intro"]["declaration_year_from"] = cls.parse_date(
                    data["step_0"]["declarationYearFrom"])
                resp["intro"]["declaration_year"] = resp["intro"][
                    "declaration_year_from"].year

            if "declarationYear1" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(
                    data["step_0"]["declarationYear1"])

            if "declarationYear3" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(
                    data["step_0"]["declarationYear3"])

            if "declarationYear4" in data["step_0"]:
                resp["intro"]["declaration_year"] = int(
                    data["step_0"]["declarationYear4"])

        resp["general"] = {
            "last_name":
            replace_apostrophes(title(data["step_1"]["lastname"])),
            "name":
            replace_apostrophes(title(data["step_1"]["firstname"])),
            "patronymic":
            replace_apostrophes(title(data["step_1"]["middlename"])),
            "full_name":
            replace_apostrophes("{} {} {}".format(
                title(data["step_1"]["lastname"]),
                title(data["step_1"]["firstname"]),
                title(data["step_1"]["middlename"]),
            )),
            "post": {
                "post":
                replace_apostrophes(data["step_1"].get("workPost", "")),
                "office":
                replace_apostrophes(data["step_1"].get("workPlace", "")),
                "region":
                replace_apostrophes(
                    cls.region_types.get(
                        data["step_1"].get("actual_region", ""), "")),
            }
        }

        if "step_2" in data:
            family = data["step_2"]

            if isinstance(family, dict):
                resp["general"]["family"] = []

                for member in family.values():
                    if not isinstance(member, dict):
                        continue

                    resp["general"]["family"].append({
                        "family_name":
                        replace_apostrophes("{} {} {}".format(
                            title(member.get("lastname", "")),
                            title(member.get("firstname", "")),
                            title(member.get("middlename", "")),
                        )),
                        "relations":
                        member.get("subjectRelation", "")
                    })

        resp['general']['full_name_suggest'] = [
            {
                'input': resp['general']['full_name'],
                'weight': 5
            },
            {
                'input': ' '.join([
                    resp['general']['name'],
                    resp['general']['patronymic'],
                    resp['general']['last_name']
                ]),
                'weight': 3
            },
            {
                'input': ' '.join([
                    resp['general']['name'],
                    resp['general']['last_name']
                ]),
                'weight': 3
            }
        ]

        if not resp["general"]["post"]["region"]:
            region_html = html.css(
                "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text"
            ).extract()
            if len(region_html) > 1:
                chunks = region_html[1].split("/")
                if len(chunks) > 1:
                    resp["general"]["post"]["region"] = chunks[-2].strip()
                else:
                    pass

        if resp["general"]["post"]["region"].lower() in cls.region_mapping:
            resp["general"]["post"]["region"] = cls.region_mapping[
                resp["general"]["post"]["region"].lower()]
        else:
            resp["general"]["post"]["region"] = ""

        return NACPDeclaration(**resp).to_dict(True)
Example #11
    def pull_declarations(self):
        def get_search_clause(kwd):
            if "область" not in kwd:
                return Q(
                    "multi_match",
                    query=kwd,
                    operator="or",
                    minimum_should_match=1,
                    fields=[
                        "general.post.region",
                        "general.post.office",
                        "general.post.post",
                        "general.post.actual_region",
                    ],
                )
            else:
                return Q(
                    "multi_match",
                    query=kwd,
                    fields=[
                        "general.post.region", "general.post.actual_region"
                    ],
                )

        search_clauses = [
            get_search_clause(x) for x in filter(
                None, map(str.strip, self.body.keywords.split("\n")))
        ]

        q = "{} {}".format(self.name, self.extra_keywords)

        if search_clauses:
            for sc in search_clauses:
                first_pass = NACPDeclaration.search().query(
                    "bool",
                    must=[
                        Q("match",
                          general__full_name={"query": q, "operator": "and"})
                    ],
                    should=[sc],
                    minimum_should_match=1,
                )[:100].execute()

                if first_pass:
                    break
        else:
            first_pass = NACPDeclaration.search().query(
                "bool",
                must=[
                    Q("match",
                      general__full_name={"query": q, "operator": "and"})
                ],
            )[:100].execute()

        Declaration.objects.create_declarations(self, first_pass)

        user_declarant_ids = set(
            filter(
                None,
                self.declarations.exclude(exclude=True).values_list(
                    "user_declarant_id", flat=True),
            ))

        if user_declarant_ids:
            second_pass = NACPDeclaration.search().filter(
                "terms",
                **{"intro.user_declarant_id": list(user_declarant_ids)})

            second_pass = second_pass.execute()

        if not user_declarant_ids or not second_pass:
            obj_ids_to_find = set(
                chain(*self.declarations.exclude(
                    exclude=True).values_list("obj_ids", flat=True)))

            second_pass = NACPDeclaration.search().query(
                "bool",
                must=[
                    Q("match",
                      general__full_name={"query": q, "operator": "or"}),
                    Q("match", obj_ids=" ".join(list(obj_ids_to_find)[:512])),
                ],
            )[:100]

            second_pass = second_pass.execute()

        Declaration.objects.create_declarations(self, second_pass)
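The should=[sc] plus minimum_should_match=1 combination in the first pass makes the keyword clause effectively mandatory: the bool query only matches documents that satisfy both the full-name must clause and at least one should clause. A minimal illustration (the query strings are placeholders):

from elasticsearch_dsl import Q

name_clause = Q("match",
                general__full_name={"query": "Іван Іванов", "operator": "and"})
region_clause = Q("match", general__post__region="Київ")

# With minimum_should_match=1 the should clause stops being optional.
q = Q("bool", must=[name_clause], should=[region_clause],
      minimum_should_match=1)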
Example #12
    def handle(self, *args, **options):
        corrected = NACPDeclaration.search().filter("term",
                                                    intro__corrected=True)

        cntr = 0
        success_rate = 0
        for i, d in enumerate(corrected.scan()):
            must = [
                ConstantScore(
                    query=Q(
                        "multi_match",
                        query=d.general.full_name,
                        operator="and",
                        fields=[
                            "general.last_name",
                            "general.name",
                            "general.patronymic",
                            "general.full_name",
                        ],
                    ),
                    boost=10)
            ]

            should = [
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__post={
                            "query": d.general.post.post,
                            "minimum_should_match": "50%"
                        },
                    ),
                    boost=2),
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__office={
                            "query": d.general.post.office,
                            "minimum_should_match": "50%"
                        },
                    ),
                    boost=2),
                ConstantScore(
                    query=Q(
                        "match",
                        general__post__region={
                            "query": d.general.post.region.replace(" область", ""),
                            "minimum_should_match": "60%"
                        },
                    ),
                    boost=1)
            ]

            for fam in getattr(d.general, "family", []):
                should.append(
                    ConstantScore(
                        query=Q(
                            "multi_match",
                            query=fam.family_name,
                            operator="and",
                            fields=["general.family.family_name"]),
                        boost=2))

            candidates = NACPDeclaration.search() \
                .query(
                    FunctionScore(
                        query=Q("bool", must=must, should=should),
                        score_mode="sum"
                    )
                ) \
                .filter("term",
                    intro__declaration_year=d.intro.declaration_year) \
                .query(~Q('term', _id=d.meta.id)) \
                .filter("term", intro__corrected=False) \
                .query(
                    ConstantScore(
                        query=Q("term", intro__doc_type=d.intro.doc_type),
                        boost=0
                    )
                )

            if options["store_matches"]:
                candidates = candidates \
                    .highlight_options(
                        order='score', fragment_size=500,
                        number_of_fragments=100, pre_tags=['||!'],
                        post_tags=["||"]) \
                    .highlight(
                        "general.full_name", "general.post.region",
                        "general.post.office", "general.post.post",
                        "general.family.family_name")

            candidates = candidates.execute()

            success = self.store_example(
                d,
                candidates,
                debug=options["debug"],
                store_matches=options["store_matches"])

            if success:
                success_rate += 1

            cntr += 1

            if cntr and cntr % 5000 == 0:
                self.stdout.write("%s declarations processed, SR: %s%%" %
                                  (cntr, success_rate / cntr * 100))

        self.stdout.write("%s declarations processed, SR: %s%%" %
                          (cntr, success_rate / cntr * 100))

        if options["store_matches"]:
            self.save_to_excel(options["store_matches"])
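For reference, the query wrappers used in this command come from elasticsearch_dsl; assuming the standard package layout, the relevant imports are:

from elasticsearch_dsl import Q
from elasticsearch_dsl.query import ConstantScore, FunctionScore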