def handle(self, *args, **options):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
        "exists", field=AGGREGATED_FIELD_NAME)

    if not options["export_all"]:
        to_export = to_export.query(
            "bool",
            must=[Q("term", intro__doc_type="Щорічна")],
            must_not=[Q("exists", field="corrected_declarations")])

    if options["filter_future_declarations"]:
        to_export = to_export.query(
            "range", intro__declaration_year={"lt": datetime.now().year})

    w = None
    with open(options["destination"], "w") as fp:
        for i, d in enumerate(to_export.scan()):
            row = d[AGGREGATED_FIELD_NAME].to_dict()
            row["id"] = d.meta.id

            if not w:
                w = DictWriter(fp, fieldnames=row.keys())
                w.writeheader()

            w.writerow(row)

            if i % 10000 == 0 and i:
                self.stdout.write("{} declarations exported".format(i))
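# A minimal sketch of the imports and module-level names these snippets appear
# to rely on; it is an assumption, not the original module header. In
# particular, the value of AGGREGATED_FIELD_NAME is assumed here to be
# "aggregated", judging by how it is used as a sort-key prefix below.
from csv import DictWriter, DictReader
from datetime import datetime, date

from elasticsearch_dsl import Q

# assumed constant: the document field holding precomputed per-declaration aggregates
AGGREGATED_FIELD_NAME = "aggregated"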
def get_raw_data(self, year, order_by, limit=10000):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
        "exists", field=AGGREGATED_FIELD_NAME)

    to_export = to_export.query(
        "bool",
        must=[
            Q("term", intro__doc_type="Щорічна"),
            Q("term", intro__declaration_year=year)
        ],
        must_not=[
            Q("exists", field="corrected_declarations"),
            # Manually excluded broken/garbage declarations
            Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),  # Melnytchuk
            Q("term", _id="nacp_c67549d0-abc0-48fe-b529-9185efe1a3ce"),
            Q("term", _id="nacp_2e07bb01-5ca8-4188-97c6-6297f7a4d2ad"),
            Q("term", _id="nacp_f1b25e4d-e691-48d6-99b1-758e94764b91"),  # Motsyor
            Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
        ]
    ).sort(
        {"{}.{}".format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
    )[:limit]

    res = []
    for d in to_export.execute():
        row = d[AGGREGATED_FIELD_NAME].to_dict()

        # Skip implausibly large values (over 10 billion) that would distort the top list
        if row[order_by] > 10000000000:
            continue

        row["id"] = d.meta.id
        res.append(row)

    return res
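# A hedged usage sketch, not part of the original code: one way get_raw_data
# could feed a CSV of top declarations for a given year. The helper name
# dump_top_declarations and the order_by value "incomes.declarant" are
# illustrative assumptions.
from csv import DictWriter

def dump_top_declarations(provider, year, order_by="incomes.declarant"):
    rows = provider.get_raw_data(year, order_by, limit=100)
    if not rows:
        return

    with open("top_{}_{}.csv".format(year, order_by.replace(".", "_")), "w") as fp:
        w = DictWriter(fp, fieldnames=rows[0].keys())
        w.writeheader()
        for row in rows:
            w.writerow(row)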
def handle(self, *args, **options):
    all_decls = NACPDeclaration.search().query("match_all")

    if options["to"] is not None:
        all_decls = all_decls[options["from"]:options["to"]].execute()
    elif options["from"]:
        all_decls = all_decls[options["from"]:].execute()
    else:
        all_decls = all_decls.scan()

    w = DictWriter(options["outfile"], fieldnames=["_id"] + options["field"])
    w.writeheader()

    for decl in all_decls:
        decl_dict = decl.to_dict()
        row = {
            field: self.fetch_field(decl_dict, field)
            for field in options["field"]
        }
        row["_id"] = decl.meta.id
        w.writerow(row)
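# fetch_field is not shown in this snippet. A plausible implementation (an
# assumption, not the original helper) would walk a dotted path such as
# "intro.declaration_year" through the nested declaration dict and return an
# empty string when any segment is missing:
def fetch_field(self, decl_dict, path):
    value = decl_dict
    for key in path.split("."):
        if not isinstance(value, dict) or key not in value:
            return ""
        value = value[key]
    return value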
def get_raw_data(self, year, order_by, limit=10000):
    to_export = NACPDeclaration.search().source(
        include=[AGGREGATED_FIELD_NAME]).query(
        "exists", field=AGGREGATED_FIELD_NAME)

    to_export = to_export.query(
        "bool",
        must=[
            Q("term", intro__doc_type="Щорічна"),
            Q("term", intro__declaration_year=year)
        ],
        must_not=[
            Q("exists", field="corrected_declarations"),
            # Manually excluded broken declaration (Melnytchuk)
            Q("term", _id="nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"),
            Q("term", **{"{}__outlier".format(AGGREGATED_FIELD_NAME): True})
        ]
    ).sort(
        {"{}.{}".format(AGGREGATED_FIELD_NAME, order_by): {"order": "desc"}}
    )[:limit]

    res = []
    for d in to_export.execute():
        row = d[AGGREGATED_FIELD_NAME].to_dict()
        row["id"] = d.meta.id
        res.append(row)

    return res
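# For orientation only: the chained .query()/.sort()/slice calls above should
# serialize to an Elasticsearch request body roughly along these lines. This
# is a hand-written sketch, not a byte-exact dump; it assumes
# AGGREGATED_FIELD_NAME == "aggregated" and uses illustrative values
# year=2018, order_by="incomes.total".
example_body = {
    "_source": {"include": ["aggregated"]},
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "aggregated"}},
                {"term": {"intro.doc_type": "Щорічна"}},
                {"term": {"intro.declaration_year": 2018}},
            ],
            "must_not": [
                {"exists": {"field": "corrected_declarations"}},
                {"term": {"_id": "nacp_e46bba0c-32d5-4b0d-a290-9fdc4afcc278"}},
                {"term": {"aggregated.outlier": True}},
            ],
        }
    },
    "sort": [{"aggregated.incomes.total": {"order": "desc"}}],
    "size": 10000,
}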
def handle(self, *args, **options):
    all_decls = (
        NACPDeclaration.search()
        .query("match_all")
        .source([
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1",
        ])
    )

    all_decls = all_decls.filter(
        "range",
        intro__date={
            "gte": date(options["year_since"], 1, 1),
            "lt": datetime.now().replace(hour=0, minute=0, second=0, microsecond=0),
        },
    )

    w = DictWriter(
        options["outfile"],
        fieldnames=[
            "id",
            "declaration.url",
            "intro.date",
            "intro.doc_type",
            "nacp_orig.step_1.postCategory",
            "nacp_orig.step_1.postType",
        ],
    )
    # Write the CSV header before streaming rows
    w.writeheader()

    for decl in tqdm(all_decls.scan(), total=all_decls.count()):
        w.writerow({
            "id": decl.meta.id,
            "declaration.url": decl.declaration.url,
            "intro.date": decl.intro.date.date(),
            "intro.doc_type": decl.intro.doc_type,
            "nacp_orig.step_1.postCategory": getattr(
                decl.nacp_orig.step_1, "postCategory", ""),
            "nacp_orig.step_1.postType": getattr(
                decl.nacp_orig.step_1, "postType", ""),
        })
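# The options consumed above ("year_since", "outfile") imply an add_arguments
# roughly like the sketch below. Argument names match the option keys, but the
# types and help texts are assumptions based on how the values are used
# (outfile is handed straight to DictWriter, so an argparse.FileType handle fits).
import argparse

def add_arguments(self, parser):
    parser.add_argument(
        "year_since", type=int,
        help="Export declarations dated from Jan 1 of this year onwards")
    parser.add_argument(
        "outfile", type=argparse.FileType("w"),
        help="Path of the CSV file to write")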
def handle(self, *args, **options):
    try:
        base_dir = options["file_path"]
        corrected_file = options["corrected_file"]
    except KeyError:
        raise CommandError(
            "First argument must be a path to source files and the second "
            "the file name of a CSV with corrected declarations")

    self.stdout.write("Gathering JSON documents from {}".format(base_dir))
    self.jsons = list(glob2.glob(os.path.join(base_dir, "**/*.json")))
    self.stdout.write("Gathered {} JSON documents".format(len(self.jsons)))

    corrected = set()
    with open(corrected_file, "r") as fp:
        r = DictReader(fp)
        for row in r:
            corrected.add(row["uuid"])

    DeclarationStaticObj.corrected = corrected

    NACPDeclaration.init()
    counter = 0

    my_tiny_pool = Pool(self.number_of_processes)

    if not options["update_all_docs"]:
        self.stdout.write("Obtaining uuids of already indexed documents")
        s = NACPDeclaration.search().source([])
        existing_guids = set(
            h.meta.id.replace("nacp_", "") for h in s.scan())
        self.stdout.write("{} uuids are currently in index".format(
            len(existing_guids)))

        incoming_files = dict(
            filter(
                None,
                my_tiny_pool.map(parse_guid_from_fname, self.jsons)
            )
        )
        incoming_guids = set(incoming_files.keys())
        self.stdout.write("{} uuids are found in input folder".format(
            len(incoming_guids)))

        # Index only the documents that are not in the index yet
        self.jsons = [
            incoming_files[k] for k in incoming_guids - existing_guids
        ]
        self.stdout.write("{} uuids left after the filtering".format(
            len(self.jsons)))

    for ix in range(0, len(self.jsons), self.chunk_size):
        chunk = self.jsons[ix:ix + self.chunk_size]

        result = list(
            filter(
                None,
                my_tiny_pool.map(DeclarationStaticObj.parse, chunk)
            )
        )

        counter += len(result)
        bulk(self.es, result)

        if ix:
            self.stdout.write(
                "Loaded {} items to persistence storage".format(ix))

    self.stdout.write(
        "Finished loading {} items to persistence storage".format(counter))
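# parse_guid_from_fname is defined elsewhere. Judging by how its results are
# folded into a dict of {guid: path}, a plausible sketch (an assumption, not
# the original helper) is: derive the uuid from the file name and return a
# (guid, path) pair, or None so that filter(None, ...) drops it.
import os.path

def parse_guid_from_fname(fname):
    """Return (guid, fname) if the file name carries a declaration uuid, else None."""
    guid = os.path.splitext(os.path.basename(fname))[0]
    if guid:
        return guid, fname
    return None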
def pull_declarations(self):
    def get_search_clause(kwd):
        if "область" not in kwd:
            return Q(
                "multi_match",
                query=kwd,
                operator="or",
                minimum_should_match=1,
                fields=[
                    "general.post.region",
                    "general.post.office",
                    "general.post.post",
                    "general.post.actual_region",
                ],
            )
        else:
            return Q(
                "multi_match",
                query=kwd,
                fields=[
                    "general.post.region",
                    "general.post.actual_region"
                ],
            )

    search_clauses = [
        get_search_clause(x)
        for x in filter(None, map(str.strip, self.body.keywords.split("\n")))
    ]

    q = "{} {}".format(self.name, self.extra_keywords)

    if search_clauses:
        for sc in search_clauses:
            first_pass = (
                NACPDeclaration.search()
                .query(
                    "bool",
                    must=[
                        Q(
                            "match",
                            general__full_name={"query": q, "operator": "and"},
                        )
                    ],
                    should=[sc],
                    minimum_should_match=1,
                )[:100]
                .execute()
            )

            if first_pass:
                break
    else:
        first_pass = (
            NACPDeclaration.search()
            .query(
                "bool",
                must=[
                    Q("match", general__full_name={"query": q, "operator": "and"})
                ],
            )[:100]
            .execute()
        )

    Declaration.objects.create_declarations(self, first_pass)

    user_declarant_ids = set(
        filter(
            None,
            self.declarations.exclude(exclude=True).values_list(
                "user_declarant_id", flat=True),
        )
    )

    if user_declarant_ids:
        second_pass = NACPDeclaration.search().filter(
            "terms", **{"intro.user_declarant_id": list(user_declarant_ids)})
        second_pass = second_pass.execute()

    if not user_declarant_ids or not second_pass:
        obj_ids_to_find = set(
            chain(
                *self.declarations.exclude(exclude=True).values_list(
                    "obj_ids", flat=True)
            )
        )

        second_pass = NACPDeclaration.search().query(
            "bool",
            must=[
                Q("match", general__full_name={"query": q, "operator": "or"}),
                Q("match", obj_ids=" ".join(list(obj_ids_to_find)[:512])),
            ],
            should=[],
            minimum_should_match=0,
        )[:100]
        second_pass = second_pass.execute()

    Declaration.objects.create_declarations(self, second_pass)
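# pull_declarations first matches by full name combined with post/region
# keywords ("first pass"), then widens the net via user_declarant_id or the
# previously collected obj_ids ("second pass"). A minimal driver sketch,
# assuming a hypothetical queryset of monitored persons exposing this method:
def refresh_monitored_declarations(persons):
    for person in persons:
        person.pull_declarations()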
def handle(self, *args, **options):
    corrected = NACPDeclaration.search().filter("term", intro__corrected=True)

    cntr = 0
    success_rate = 0

    for i, d in enumerate(corrected.scan()):
        must = [
            ConstantScore(
                query=Q(
                    "multi_match",
                    query=d.general.full_name,
                    operator="and",
                    fields=[
                        "general.last_name",
                        "general.name",
                        "general.patronymic",
                        "general.full_name",
                    ],
                ),
                boost=10)
        ]

        should = [
            ConstantScore(
                query=Q(
                    "match",
                    general__post__post={
                        "query": d.general.post.post,
                        "minimum_should_match": "50%"
                    },
                ),
                boost=2),
            ConstantScore(
                query=Q(
                    "match",
                    general__post__office={
                        "query": d.general.post.office,
                        "minimum_should_match": "50%"
                    },
                ),
                boost=2),
            ConstantScore(
                query=Q(
                    "match",
                    general__post__region={
                        "query": d.general.post.region.replace(" область", ""),
                        "minimum_should_match": "60%"
                    },
                ),
                boost=1)
        ]

        for fam in getattr(d.general, "family", []):
            should.append(
                ConstantScore(
                    query=Q(
                        "multi_match",
                        query=fam.family_name,
                        operator="and",
                        fields=["general.family.family_name"]),
                    boost=2))

        candidates = NACPDeclaration.search() \
            .query(
                FunctionScore(
                    query=Q("bool", must=must, should=should),
                    score_mode="sum"
                )
            ) \
            .filter("term", intro__declaration_year=d.intro.declaration_year) \
            .query(~Q("term", _id=d.meta.id)) \
            .filter("term", intro__corrected=False) \
            .query(
                ConstantScore(
                    query=Q("term", intro__doc_type=d.intro.doc_type),
                    boost=0
                )
            )

        if options["store_matches"]:
            candidates = candidates \
                .highlight_options(
                    order="score",
                    fragment_size=500,
                    number_of_fragments=100,
                    pre_tags=["||!"],
                    post_tags=["||"]) \
                .highlight(
                    "general.full_name",
                    "general.post.region",
                    "general.post.office",
                    "general.post.post",
                    "general.family.family_name")

        candidates = candidates.execute()

        success = self.store_example(
            d,
            candidates,
            debug=options["debug"],
            store_matches=options["store_matches"])

        if success:
            success_rate += 1

        cntr += 1
        if cntr and cntr % 5000 == 0:
            self.stdout.write("%s declarations processed, SR: %s%%" %
                              (cntr, success_rate / cntr * 100))

    self.stdout.write("%s declarations processed, SR: %s%%" %
                      (cntr, success_rate / cntr * 100))

    if options["store_matches"]:
        self.save_to_excel(options["store_matches"])
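# The query classes used above come from elasticsearch_dsl; a minimal import
# sketch for this command (the surrounding command class and its other imports
# are omitted here):
from elasticsearch_dsl import Q
from elasticsearch_dsl.query import ConstantScore, FunctionScore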