def __init__(self, *args, **kwargs):
    """Construct a DataHead, stamping a SHA-512 content hash (``inhash``)
    of the keyword arguments and refusing construction for duplicates.

    Raises:
        DuplicateDataError: once the class-level duplicate counter reaches
            the collision limit.
        DuplicateHashError: for a duplicate seen below the collision limit.
    """
    # Mutable cell so the nested closure below can accumulate the
    # hash input across recursive calls.
    block = {'val': ''}

    def validate(value):
        # Recursively walk nested dicts/lists; decode leaf values and
        # fold every leaf into block['val'] for hashing.
        if type(value) is dict:
            for val in value:
                value[val] = validate(value[val])
        elif type(value) in (list, tuple):
            for val in value:
                # NOTE(review): rebinding the loop variable does NOT write
                # the validated value back into the list (unlike the dict
                # branch above); only the hash side effect survives —
                # confirm whether write-back was intended.
                val = validate(val)
        else:
            # Falsy leaves ('' / 0 / None) skip decoding but are still
            # folded into the hash input.
            if value:
                value = self._decode(value)
            block['val'] += self._encode_for_hash(value)
        return value

    validate(kwargs)
    kwargs['inhash'] = hashlib.sha512(block['val']).hexdigest()
    # check_duplicate() is presumably truthy when the hash is NOT yet
    # stored — TODO confirm against its definition.
    if check_duplicate(create_connection(), kwargs['inhash']):
        # Fresh document: reset the shared collision counter.
        self.__class__.error_count = 0
        super(DataHead, self).__init__(*args, **kwargs)
    else:
        self.__class__.error_count += 1
        if self.__class__.error_count >= self.get_collision_limit():
            raise DuplicateDataError
        raise DuplicateHashError
def validate_es_data():
    """Cross-check the row counts of the source data files against the
    total document count reported by Elasticsearch.

    Returns:
        True when the totals agree, False otherwise.
    """
    client = create_connection()
    response = client.search(
        index=INDEX_NAME,
        body={"query": {"match_all": {}}},
        size=50,
    )
    indexed_total = response['hits']['total']

    # Expected total = sum of row counts over every known data file.
    expected_total = sum(get_row_count(data_file) for data_file in data_files)

    print("--------------Checking for errors---------")
    print(str(expected_total) + " | " + str(indexed_total))
    return expected_total == indexed_total
def read_csv(file_name):
    """Load a CSV file, remap/normalize each row's keys to ASCII, and index
    every previously-unseen row into Elasticsearch as a DataHead document.

    Progress (with a running fault counter) is rewritten in place on one
    terminal line via a carriage return; failures are appended to the error
    log and counted rather than aborting the run.
    """
    print("Loading File...")
    row_count = get_row_count(file_name)
    fl_row_count = float(row_count)
    # Context manager guarantees the handle is closed even on error
    # (the original leaked the open file object).
    with open(get_path(file_name), 'rb') as f:
        reader = csv.DictReader(f)
        print("File Loaded.")
        print("Checking Started...")
        es = create_connection()
        error_count = 0
        for row_num, row in enumerate(reader):
            try:
                # Snapshot the keys first: popping while iterating the live
                # keys view raises RuntimeError on Python 3.
                for key in list(row.keys()):
                    row[key_mapping[key]] = row.pop(key).decode(
                        'utf-8', 'ignore').encode(
                        'ascii', 'ignore').decode('ascii')
                if get_res_count(es, row) == 0:
                    obj = DataHead(inhash=get_hash(row), **row)
                    obj.save()
                    print("--------------------------------------------")
                    print(row)
                    print("---------New Entry Created------------------")
            except Exception as e:
                # str(e) instead of e.message: .message was removed in
                # Python 3 and is absent on many exception types even in
                # Python 2, which made the handler itself raise.
                log_error(str(row) + "[" + str(e) + "]["
                          + traceback.format_exc() + "]")
                error_count += 1
            else:
                print("%.2f" % ((row_num / fl_row_count) * 100.00) + ' %'
                      + ' Loaded\t[%s/%s]' % (row_num, row_count)
                      + '[Faults: %s]' % error_count, end='\r')
def search_view_third(request):
    """Free-text search endpoint: pairs a fuzzy and an exact multi_match
    over the main text fields (so exact matches outrank fuzzy ones) and
    returns the raw Elasticsearch hits payload as JSON.

    Query params: keyword (required), start (pagination offset, default 0).
    Responds 400 when no keyword is supplied.
    """
    print request.GET
    start = int(request.GET.get('start', 0))
    # NOTE(review): `end` is parsed but never used — page size is fixed at
    # 100 below; confirm whether size should instead be end - start.
    end = int(request.GET.get('end', 100))
    if request.GET.get('keyword'):
        keyword = request.GET.get('keyword')
        # The following filter params are read but unused in this view
        # (compare search_view_first, which does apply them).
        zone = request.GET.get('zone', None)
        district = request.GET.get('district', None)
        zone_code = request.GET.get('zone_code', None)
        name = request.GET.get('name', None)
        body = {
            "from": start,
            "size": 100,
            "query": {
                "bool": {
                    "must": {
                        "bool": {
                            # Two should-clauses over the same fields: one
                            # fuzzy, one exact — exact hits score higher.
                            "should": [
                                {
                                    "multi_match": {
                                        "query": str(keyword),
                                        "type": "best_fields",
                                        "fields": ["district", "zone", "zone_code", "full_name", "supply", "billing_address"],
                                        "fuzziness": "AUTO"
                                    }
                                },
                                {
                                    "multi_match": {
                                        "query": str(keyword),
                                        "type": "best_fields",
                                        "fields": ["district", "zone", "zone_code", "full_name", "supply", "billing_address"],
                                    }
                                }
                            ]
                        }
                    }
                }
            }
        }
        print body
        es = create_connection()
        res = es.search(
            index=INDEX_NAME,
            body=body
        )
        return JsonResponse(res['hits'])
    else:
        return JsonResponse({"status": "No results"}, status=400)
def search_view_first(request):
    """Filtered search endpoint: combines zone/district/zone_code filters,
    an optional exact-or-fuzzy full_name clause, and a fuzzy+exact keyword
    multi_match over supply/billing_address; returns the raw hits as JSON.

    Query params: keyword (required); zone, district, zone_code, name
    (optional filters); start (offset, default 0). Responds 400 when no
    keyword is supplied.
    """
    print request.GET
    start = int(request.GET.get('start', 0))
    # NOTE(review): `end` is never used — page size is fixed at 100 below.
    end = int(request.GET.get('end', 100))
    if request.GET.get('keyword'):
        keyword = request.GET.get('keyword')
        zone = request.GET.get('zone', None)
        district = request.GET.get('district', None)
        zone_code = request.GET.get('zone_code', None)
        name = request.GET.get('name', None)
        # NOTE(review): start/end are re-read here, duplicating the two
        # assignments above.
        start = int(request.GET.get('start', 0))
        end = int(request.GET.get('end', 100))
        body = {
            "from": start,
            "size": 100,
            "query": {
                "bool": {
                    "must": {
                        "bool": {
                            "filter": [
                                # zone_code filters on whitespace-split
                                # terms; an empty {} clause stands in when
                                # the param is absent.
                                {
                                    "terms": {
                                        "zone_code": zone_code.split(),
                                    }
                                } if zone_code else {},
                                {
                                    "match": {
                                        "district": {
                                            "query": district,
                                            # "all": a missing/empty
                                            # district matches every doc.
                                            "zero_terms_query": "all"
                                        }
                                    }
                                },
                                {
                                    "match": {
                                        "zone": {
                                            "query": zone,
                                            "zero_terms_query": "all"
                                        }
                                    }
                                },
                            ],
                            "must": [
                                # Optional name clause: exact match OR
                                # fuzzy match on full_name.
                                {
                                    'bool': {
                                        'should': [{
                                            "match": {
                                                "full_name": name,
                                            }
                                        }, {
                                            "fuzzy": {
                                                "full_name": name,
                                            }
                                        }]
                                    }
                                } if name else {},
                                # Keyword clause: fuzzy + exact multi_match
                                # so exact matches outrank fuzzy ones.
                                {
                                    'bool': {
                                        'should': [{
                                            "multi_match": {
                                                "query": keyword,
                                                "fields": ["supply", "billing_address"],
                                                "fuzziness": "AUTO"
                                            }
                                        }, {
                                            "multi_match": {
                                                "query": keyword,
                                                "fields": ["supply", "billing_address"],
                                            }
                                        }]
                                    }
                                },
                            ]
                        },
                    }
                }
            }
        }
        es = create_connection()
        res = es.search(index=INDEX_NAME, body=body)
        return JsonResponse(res['hits'])
    else:
        return JsonResponse({"status": "No results"}, status=400)
def csv_first(request): print request.GET if request.GET.get('keyword'): keyword = request.GET.get('keyword') zone = request.GET.get('zone', None) district = request.GET.get('district', None) zone_code = request.GET.get('zone_code', None) name = request.GET.get('name', None) body = { "size": 9999, "query": { "bool": { "must": { "bool": { "filter": [ { "terms": { "zone_code": zone_code.split(), } } if zone_code else {}, { "match": { "district": { "query": district, "zero_terms_query": "all" } } }, { "match": { "zone": { "query": zone, "zero_terms_query": "all" } } }, ], "must": [ { 'bool': { 'should': [{ "match": { "full_name": name, } }, { "fuzzy": { "full_name": name, } }] } } if name else {}, { 'bool': { 'should': [{ "multi_match": { "query": keyword, "fields": ["supply", "billing_address"], "fuzziness": "AUTO" } }, { "multi_match": { "query": keyword, "fields": ["supply", "billing_address"], } }] } }, ] }, } } } } es = create_connection() res = es.search(index=INDEX_NAME, body=body) sample = res['hits'] print("Got %d Hits:" % res['hits']['total']) with open('outputfile.csv', 'wb') as csvfile: #set name of output file here filewriter = csv.writer( csvfile, delimiter=str( u',' ), # we use TAB delimited, to handle cases where freeform text may have a comma quotechar=str(u'|'), quoting=csv.QUOTE_MINIMAL) filewriter.writerow([ "Match Score", "District", "Zone Code", "Zone", "Full name", "vkont", "instlion", "Supply", "Billing Address" ]) #change the column labels here for hit in res['hits']['hits']: col1 = hit["_score"] col2 = hit["_source"]["district"] col3 = hit["_source"]["zone_code"] col4 = hit["_source"]["zone"] col5 = hit["_source"]["full_name"] col6 = hit["_source"]["vkont"] col7 = hit["_source"]["instlion"] col8 = hit["_source"]["supply"] col9 = hit["_source"]["billing_address"] filewriter.writerow( [col1, col2, col3, col4, col5, col6, col7, col8, col9]) data = open('outputfile.csv', 'r').read() resp = HttpResponse(data, content_type='text/csv') 
resp['Content-Disposition'] = 'attachment; filename=outputfile.csv' return resp
def bulk_create(docs):
    """Bulk-index *docs* into Elasticsearch, serializing each document
    via to_dict(True).

    NOTE(review): this definition is shadowed by the later bulk_create
    (which adds chunk_size) defined below in this file.
    """
    actions = (doc.to_dict(True) for doc in docs)
    bulk(create_connection(), actions)
def bulk_create(docs):
    """Bulk-index *docs* into Elasticsearch in CHUNK_SIZE batches,
    serializing each document via to_dict(True).
    """
    connection = create_connection()
    actions = (doc.to_dict(True) for doc in docs)
    bulk(connection, actions, chunk_size=CHUNK_SIZE)