Example #1
0
    def __init__(self, *args, **kwargs):
        """Normalise the keyword arguments, hash them and guard against duplicates.

        Every truthy leaf value found in ``kwargs`` (recursing through dicts,
        lists and tuples) is passed through ``self._decode``.  The
        ``self._encode_for_hash`` form of each decoded leaf is appended to a
        running string, whose SHA-512 hex digest is stored as
        ``kwargs['inhash']`` (replacing any ``inhash`` supplied by the caller).

        Raises:
            DuplicateHashError: ``check_duplicate`` returned a falsy value for
                the computed hash.
            DuplicateDataError: same condition, once the class-wide
                ``error_count`` has reached ``self.get_collision_limit()``.
        """
        # Mutable holder so the nested function can accumulate into it.
        block = {'val': ''}

        def validate(value):
            """Return ``value`` with every truthy leaf decoded, feeding the hash."""
            # Exact type checks are kept on purpose: subclasses of dict/list/
            # tuple continue to be handled as leaves, as before.
            if type(value) is dict:
                for key in value:
                    value[key] = validate(value[key])
                # Return the container; falling through used to yield None,
                # which replaced every nested dict with None in its parent.
                return value
            if type(value) is list:
                # Keep the decoded items instead of discarding them.
                return [validate(item) for item in value]
            if type(value) is tuple:
                return tuple(validate(item) for item in value)
            if value:
                value = self._decode(value)
                block['val'] += self._encode_for_hash(value)
            return value

        # kwargs is updated in place; the return value is not needed here.
        validate(kwargs)

        # NOTE(review): hashing a text string directly only works where
        # sha512 accepts it (Python 2 str) - confirm the target interpreter.
        kwargs['inhash'] = hashlib.sha512(block['val']).hexdigest()

        # NOTE(review): a truthy result from check_duplicate appears to mean
        # "safe to create" - confirm against its definition.
        if check_duplicate(create_connection(), kwargs['inhash']):
            self.__class__.error_count = 0
            super(DataHead, self).__init__(*args, **kwargs)
        else:
            self.__class__.error_count += 1
            if self.__class__.error_count >= self.get_collision_limit():
                raise DuplicateDataError
            raise DuplicateHashError
Example #2
0
def validate_es_data():
    """Compare the indexed document total with the row total of the data files.

    Runs a ``match_all`` search against ``INDEX_NAME``, reads
    ``hits.total`` from the response, sums ``get_row_count`` over
    ``data_files`` and prints both figures.

    Returns:
        bool: True when the two totals are equal, False otherwise.
    """
    client = create_connection()
    match_all_query = {"query": {"match_all": {}}}
    response = client.search(index=INDEX_NAME, body=match_all_query, size=50)

    es_totals = response['hits']['total']

    # Total number of rows across every source file.
    n = sum(get_row_count(data_file) for data_file in data_files)

    print("--------------Checking for errors---------")
    print(str(n) + " | " + str(es_totals))

    return n == es_totals
Example #3
0
def read_csv(file_name):
    """Load a CSV file and create a ``DataHead`` for each row not found by ``get_res_count``.

    Column names are translated through ``key_mapping`` and every value is
    reduced to ASCII text (undecodable/non-ASCII bytes are dropped).  A row is
    saved only when ``get_res_count(es, row)`` returns 0.  Any exception raised
    while handling a row is logged through ``log_error`` and counted; the
    loop then continues with the next row.

    Args:
        file_name: name passed to ``get_row_count`` and ``get_path`` to locate
            the CSV file.
    """
    print("Loading File...")
    row_count = get_row_count(file_name)
    fl_row_count = float(row_count)

    # The context manager guarantees the file is closed, even on error.
    with open(get_path(file_name), 'rb') as f:
        reader = csv.DictReader(f)
        print("File Loaded.")

        print("Checking Started...")
        es = create_connection()
        error_count = 0

        for row_num, row in enumerate(reader):
            try:
                # Build a fresh dict instead of renaming keys in place:
                # in-place renaming clobbers a column that has not been
                # processed yet whenever a mapped name equals its header,
                # and fails outright when keys() is a live view.
                row = {
                    key_mapping[key]: value.decode('utf-8', 'ignore')
                                           .encode('ascii', 'ignore')
                                           .decode('ascii')
                    for key, value in row.items()
                }

                if get_res_count(es, row) == 0:
                    obj = DataHead(inhash=get_hash(row), **row)
                    obj.save()
                    print("--------------------------------------------")
                    print(row)
                    print("---------New Entry Created------------------")
            except Exception as e:
                # Best-effort import: record the failure and keep going.
                log_error(str(row) + "[" + e.message + "][" + traceback.format_exc() + "]")
                error_count += 1
            else:
                # Single-line progress indicator, rewritten in place via '\r'.
                print("%.2f" % ((row_num / fl_row_count) * 100.00) + ' %' + ' Loaded\t[%s/%s]' % (row_num, row_count) +
                      '[Faults: %s]' % error_count, end='\r')
Example #4
0
def search_view_third(request):
    """Search several indexed fields for ``keyword`` and return the hits as JSON.

    Query-string parameters read from ``request.GET``:
        keyword: required search text.
        start: optional offset of the first hit (default 0).

    The search combines a fuzzy and an exact ``multi_match`` over the same
    fields inside a ``should`` clause and requests 100 hits.

    Returns:
        JsonResponse: ``res['hits']`` from the search, or a 400 response with
        ``{"status": "No results"}`` when ``keyword`` is missing or empty.

    Raises:
        ValueError: if ``start`` is not an integer.
    """
    print(request.GET)
    start = int(request.GET.get('start', 0))

    keyword = request.GET.get('keyword')
    if not keyword:
        return JsonResponse({"status": "No results"}, status=400)

    search_fields = ["district", "zone", "zone_code", "full_name", "supply",
                     "billing_address"]

    # The keyword is passed through unchanged: coercing it with str() fails
    # on non-ASCII input where str is a byte string.
    body = {
        "from": start,
        "size": 100,
        "query": {
            "bool": {
                "must": {
                    "bool": {
                        "should": [
                            {
                                "multi_match": {
                                    "query": keyword,
                                    "type": "best_fields",
                                    "fields": search_fields,
                                    "fuzziness": "AUTO"
                                }
                            },
                            {
                                "multi_match": {
                                    "query": keyword,
                                    "type": "best_fields",
                                    "fields": search_fields,
                                }
                            }
                        ]
                    }
                }
            }
        }
    }

    print(body)
    es = create_connection()
    res = es.search(
        index=INDEX_NAME,
        body=body
    )

    return JsonResponse(res['hits'])
Example #5
0
def search_view_first(request):
    """Search ``supply``/``billing_address`` for ``keyword`` with optional filters.

    Query-string parameters read from ``request.GET``:
        keyword: required search text.
        start: optional offset of the first hit (default 0).
        zone_code: optional whitespace-separated list of zone codes.
        district, zone: optional text filters.
        name: optional text matched (exactly or fuzzily) against ``full_name``.

    Returns:
        JsonResponse: ``res['hits']`` from the search, or a 400 response with
        ``{"status": "No results"}`` when ``keyword`` is missing or empty.

    Raises:
        ValueError: if ``start`` is not an integer.
    """
    print(request.GET)
    start = int(request.GET.get('start', 0))

    keyword = request.GET.get('keyword')
    if not keyword:
        return JsonResponse({"status": "No results"}, status=400)

    zone = request.GET.get('zone', None)
    district = request.GET.get('district', None)
    zone_code = request.GET.get('zone_code', None)
    name = request.GET.get('name', None)

    # Only add a clause when its parameter was supplied: an empty ``{}``
    # clause or a match query whose text is None is not valid query DSL.
    filter_clauses = []
    if zone_code:
        filter_clauses.append({"terms": {"zone_code": zone_code.split()}})
    if district:
        filter_clauses.append({
            "match": {
                "district": {"query": district, "zero_terms_query": "all"}
            }
        })
    if zone:
        filter_clauses.append({
            "match": {
                "zone": {"query": zone, "zero_terms_query": "all"}
            }
        })

    must_clauses = []
    if name:
        must_clauses.append({
            "bool": {
                "should": [
                    {"match": {"full_name": name}},
                    {"fuzzy": {"full_name": name}},
                ]
            }
        })
    # The keyword must match, either fuzzily or exactly.
    must_clauses.append({
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": keyword,
                        "fields": ["supply", "billing_address"],
                        "fuzziness": "AUTO"
                    }
                },
                {
                    "multi_match": {
                        "query": keyword,
                        "fields": ["supply", "billing_address"],
                    }
                },
            ]
        }
    })

    inner_bool = {"must": must_clauses}
    if filter_clauses:
        inner_bool["filter"] = filter_clauses

    body = {
        "from": start,
        "size": 100,
        "query": {
            "bool": {
                "must": {
                    "bool": inner_bool,
                }
            }
        }
    }

    es = create_connection()
    res = es.search(index=INDEX_NAME, body=body)

    return JsonResponse(res['hits'])
Example #6
0
def csv_first(request):
    """Run the filtered keyword search and return the hits as a CSV download.

    Query-string parameters read from ``request.GET``:
        keyword: required search text.
        zone_code: optional whitespace-separated list of zone codes.
        district, zone: optional text filters.
        name: optional text matched (exactly or fuzzily) against ``full_name``.

    Up to 9999 hits are requested.  The CSV is comma-delimited, uses ``|`` as
    the quote character and is written straight into the response, so
    concurrent requests never share an on-disk file.

    Returns:
        HttpResponse: a ``text/csv`` attachment named ``outputfile.csv``, or
        a 400 JSON response ``{"status": "No results"}`` when ``keyword`` is
        missing or empty.
    """
    print(request.GET)

    keyword = request.GET.get('keyword')
    if not keyword:
        # Previously the view fell through and returned None here.
        return HttpResponse('{"status": "No results"}',
                            content_type='application/json',
                            status=400)

    zone = request.GET.get('zone', None)
    district = request.GET.get('district', None)
    zone_code = request.GET.get('zone_code', None)
    name = request.GET.get('name', None)

    # Only add a clause when its parameter was supplied: an empty ``{}``
    # clause or a match query whose text is None is not valid query DSL.
    filter_clauses = []
    if zone_code:
        filter_clauses.append({"terms": {"zone_code": zone_code.split()}})
    if district:
        filter_clauses.append({
            "match": {
                "district": {"query": district, "zero_terms_query": "all"}
            }
        })
    if zone:
        filter_clauses.append({
            "match": {
                "zone": {"query": zone, "zero_terms_query": "all"}
            }
        })

    must_clauses = []
    if name:
        must_clauses.append({
            "bool": {
                "should": [
                    {"match": {"full_name": name}},
                    {"fuzzy": {"full_name": name}},
                ]
            }
        })
    # The keyword must match, either fuzzily or exactly.
    must_clauses.append({
        "bool": {
            "should": [
                {
                    "multi_match": {
                        "query": keyword,
                        "fields": ["supply", "billing_address"],
                        "fuzziness": "AUTO"
                    }
                },
                {
                    "multi_match": {
                        "query": keyword,
                        "fields": ["supply", "billing_address"],
                    }
                },
            ]
        }
    })

    inner_bool = {"must": must_clauses}
    if filter_clauses:
        inner_bool["filter"] = filter_clauses

    body = {
        "size": 9999,
        "query": {
            "bool": {
                "must": {
                    "bool": inner_bool,
                }
            }
        }
    }

    es = create_connection()
    res = es.search(index=INDEX_NAME, body=body)

    print("Got %d Hits:" % res['hits']['total'])

    # HttpResponse is file-like, so the csv writer can target it directly.
    resp = HttpResponse(content_type='text/csv')
    resp['Content-Disposition'] = 'attachment; filename=outputfile.csv'

    filewriter = csv.writer(
        resp,
        delimiter=str(u','),
        quotechar=str(u'|'),
        quoting=csv.QUOTE_MINIMAL)

    # Column labels, in the same order as the values written for each hit.
    filewriter.writerow([
        "Match Score", "District", "Zone Code", "Zone", "Full name",
        "vkont", "instlion", "Supply", "Billing Address"
    ])

    source_fields = ("district", "zone_code", "zone", "full_name", "vkont",
                     "instlion", "supply", "billing_address")
    for hit in res['hits']['hits']:
        source = hit["_source"]
        filewriter.writerow(
            [hit["_score"]] + [source[field] for field in source_fields])

    return resp
Example #7
0
 def bulk_create(docs):
     """Send every document in ``docs`` through ``bulk`` in one call.

     Each document is converted lazily with ``to_dict(True)`` as ``bulk``
     consumes the generator.
     """
     connection = create_connection()
     actions = (doc.to_dict(True) for doc in docs)
     bulk(connection, actions)
Example #8
0
 def bulk_create(docs):
     """Send every document in ``docs`` through ``bulk`` in chunks of ``CHUNK_SIZE``.

     Each document is converted lazily with ``to_dict(True)`` as ``bulk``
     consumes the generator.
     """
     connection = create_connection()
     actions = (doc.to_dict(True) for doc in docs)
     bulk(connection, actions, chunk_size=CHUNK_SIZE)