def test_in(self):
    self.assertTrue(
        dq.match({'role': 'admin'}, 'role in ["admin", "observer"]'))
    self.assertTrue(
        dq.match({'age': 18}, 'age in [12, 56, 78, 18, 90, 20]'))
    self.assertFalse(
        dq.match({'role': 'user'}, 'role in ["admin", "observer"]'))
def test_contains(self):
    self.assertTrue(
        dq.match({'roles': ['admin', 'observer']}, 'roles CONTAINS "admin"'))
    self.assertFalse(
        dq.match({'roles': ['admin', 'observer']}, 'roles CONTAINS "user"'))
def test_validation(self):
    data = {}
    with self.assertRaises(DQValidationError):
        dq.match(data, "44 == 44")
    with self.assertRaises(DQValidationError):
        dq.compile("44 == 44")
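# Hedged usage sketch (not part of the original test suite): test_validation
# above relies on dq.compile() rejecting queries that reference no data keys.
# Assuming compile() returns a reusable matcher object with a match() method,
# a query can be parsed once and applied to many records:
#
#   matcher = dq.compile('age >= 18 AND role in ["admin", "observer"]')
#   adults = [r for r in records if matcher.match(r)]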
def test_pars(self):
    data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
    self.assertTrue(dq.match(data, "(a) AND (c)"))
    self.assertTrue(dq.match(data, "((a) AND (c))"))
    self.assertTrue(dq.match(data, "((((a)) AND ((c))))"))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, "(a) AND (c"))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, ")a AND c"))
def test_pars(self):
    data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
    self.assertTrue(match(data, "(`a`) AND (`c`)"))
    self.assertTrue(match(data, "((`a`) AND (`c`))"))
    self.assertTrue(match(data, "((((`a`)) AND ((`c`))))"))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(match(data, "(`a`) AND (`c`"))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(match(data, ")`a` AND `c`"))
def test_key_order(self):
    data1 = {'age': 26}
    data2 = {'x': 12, 'y': 33}
    data3 = {
        'age': 12,
        'friends': [
            {'age': 14},
            {'age': 16},
            {'age': 18},
            {'age': 20},
        ]
    }
    self.assertTrue(dq.match(data1, "26 == age"))
    self.assertTrue(dq.match(data1, "[23, 45, 12, 26] CONTAINS age"))
    self.assertTrue(dq.match(data1, "age == age"))
    self.assertTrue(dq.match(data2, "x < y"))
    self.assertFalse(dq.match(data2, "x >= y"))
    self.assertTrue(dq.match(data2, "x != y"))
    self.assertTrue(dq.match(data3, "age < `friends.age`"))
def test_key_order(self):
    data1 = {'age': 26}
    data2 = {'x': 12, 'y': 33}
    data3 = {
        'age': 12,
        'friends': [
            {'age': 14},
            {'age': 16},
            {'age': 18},
            {'age': 20},
        ]
    }
    self.assertTrue(match(data1, "26 == `age`"))
    self.assertTrue(match(data1, "[23, 45, 12, 26] CONTAIN `age`"))
    self.assertTrue(match(data1, "`age` == `age`"))
    self.assertTrue(match(data2, "`x` < `y`"))
    self.assertFalse(match(data2, "`x` >= `y`"))
    self.assertTrue(match(data2, "`x` != `y`"))
    self.assertTrue(match(data3, "`age` < `friends.age`"))
def test_only_keys(self):
    self.assertTrue(
        dq.match({
            'username': '******',
            'age': 26
        }, "username AND age"))
    self.assertTrue(
        dq.match({
            'username': '******',
            'age': 26
        }, "username"))
    self.assertFalse(dq.match({'username': '******', 'age': 0}, "age"))
    self.assertFalse(dq.match({'username': '******'}, "age"))
    self.assertFalse(
        dq.match({
            'username': '******',
            'age': 0
        }, "username AND age"))
def test_only_keys(self):
    self.assertTrue(
        match({
            'username': '******',
            'age': 26
        }, "`username` AND `age`"))
    self.assertTrue(
        match({
            'username': '******',
            'age': 26
        }, "`username`"))
    self.assertFalse(match({'username': '******', 'age': 0}, "`age`"))
    self.assertFalse(match({'username': '******'}, "`age`"))
    self.assertFalse(
        match({
            'username': '******',
            'age': 0
        }, "`username` AND `age`"))
def test_match(self):
    data = {'username': '******'}
    self.assertTrue(dq.match(data, r'username MATCH /.*admin.*/'))
    self.assertTrue(dq.match(data, r'username MATCH /test.*/'))
    self.assertTrue(dq.match({'age': '98'}, r'age MATCH /\d+/'))
    self.assertFalse(dq.match(data, r'username MATCH /qwerty/'))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, r'/\d+/ MATCH username'))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, r'username MATCH "test"'))
def test_like(self):
    data = {'username': '******'}
    self.assertTrue(dq.match(data, 'username LIKE "*admin*"'))
    self.assertTrue(dq.match(data, 'username LIKE "test*"'))
    self.assertTrue(dq.match(data, 'username LIKE "test?admin?username"'))
    self.assertFalse(dq.match(data, 'username LIKE "test"'))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, 'username LIKE 23'))
    with self.assertRaises(DQSyntaxError):
        self.assertTrue(dq.match(data, '"test" LIKE username'))
def test_now(self):
    utcnow = datetime.utcnow()
    self.assertTrue(
        dq.match({'time': utcnow - timedelta(hours=1)}, "time < NOW"))
    self.assertFalse(
        dq.match({'time': utcnow - timedelta(hours=1)}, "time == NOW"))
def test_lt(self):
    self.assertTrue(match({'age': 18}, '`age` < 20'))
    self.assertFalse(match({'age': 18}, '`age` < 17'))
    self.assertFalse(match({'age': 18}, '`age` < 18'))
def test_gt(self):
    self.assertTrue(match({'age': 18}, '`age` > 12'))
    self.assertFalse(match({'age': 18}, '`age` > 20'))
    self.assertFalse(match({'age': 18}, '`age` > 18'))
def test_gte(self):
    self.assertTrue(dq.match({'age': 18}, 'age >= 12'))
    self.assertTrue(dq.match({'age': 18}, 'age >= 18'))
    self.assertFalse(dq.match({'age': 18}, 'age >= 20'))
def test_equal(self):
    self.assertTrue(match({'age': 18}, '`age` == 18'))
    self.assertFalse(match({'age': 18}, '`age` == 12'))
def split(self, fromfile, options={}):
    """Splits the given file with data into chunks based on chunk size or field value"""
    f_type = get_file_type(
        fromfile) if options['format_in'] is None else options['format_in']
    if options['zipfile']:
        z = zipfile.ZipFile(fromfile, mode='r')
        fnames = z.namelist()
        finfilename = fnames[0]
        if f_type == 'bson':
            infile = z.open(fnames[0], 'rb')
        else:
            infile = z.open(fnames[0], 'r')
    else:
        finfilename = fromfile
        if f_type == 'bson':
            infile = open(fromfile, 'rb')
        else:
            infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
    fields = options['fields'].split(
        ',') if options['fields'] is not None else None
    valuedict = {}
    delimiter = get_option(options, 'delimiter')
    if f_type == 'csv':
        reader = csv.DictReader(infile, delimiter=delimiter)
        n = 0
        chunknum = 1
        if options['fields'] is None:
            # Chunk by fixed record count
            splitname = finfilename.rsplit('.', 1)[0] + '_%d.csv' % (chunknum)
            out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
            writer = csv.DictWriter(out, fieldnames=reader.fieldnames,
                                    delimiter=delimiter)
            writer.writeheader()
            for r in reader:
                n += 1
                if n % 10000 == 0:
                    logging.info('split: processing %d records of %s' % (n, fromfile))
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                writer.writerow(r)
                if n % options['chunksize'] == 0:
                    out.close()
                    chunknum += 1
                    splitname = finfilename.rsplit(
                        '.', 1)[0] + '_%d.csv' % (chunknum)
                    out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
                    writer = csv.DictWriter(out, fieldnames=reader.fieldnames,
                                            delimiter=delimiter)
                    writer.writeheader()
            out.close()
    elif f_type == 'jsonl':
        n = 0
        chunknum = 1
        if options['fields'] is None:
            # Chunk by fixed record count
            splitname = finfilename.rsplit('.', 1)[0] + '_%d.jsonl' % (chunknum)
            out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('split: processing %d records of %s' % (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                out.write(json.dumps(r) + '\n')
                if n % options['chunksize'] == 0:
                    out.close()
                    chunknum += 1
                    splitname = finfilename.rsplit(
                        '.', 1)[0] + '_%d.jsonl' % (chunknum)
                    logging.info('split: new chunk %s' % splitname)
                    out = open(splitname, 'w', encoding=get_option(options, 'encoding'))
        else:
            # Split by distinct values of the first selected field, one
            # output file per value
            for l in infile:
                n += 1
                if n % 10000 == 0:
                    logging.info('split: processing %d records of %s' % (n, fromfile))
                r = json.loads(l)
                if options['filter'] is not None:
                    if not dq.match(r, options['filter']):
                        continue
                try:
                    kx = get_dict_value(r, fields[0].split('.'))[0]
                except IndexError:
                    continue
                v = valuedict.get(kx, None)
                if v is None:
                    splitname = finfilename.rsplit(
                        '.', 1)[0] + '_%s.jsonl' % (kx)
                    valuedict[kx] = open(splitname, 'w', encoding='utf8')
                valuedict[kx].write(l)
            for opened in valuedict.values():
                opened.close()
    elif f_type == 'bson':
        bson_iter = bson.decode_file_iter(infile)
        n = 0
        for r in bson_iter:
            n += 1
            r_selected = strip_dict_fields(r, fields, 0)
            # out.write(json.dumps(r_selected) + '\n')
            if n % 10000 == 0:
                logging.info('split: processing %d records of %s' % (n, fromfile))
    else:
        logging.info('File type not supported')
        return
    logging.debug('split: %d records processed' % (n))
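# Hedged usage sketch for split() above (names and values are illustrative,
# not from the original code). The option keys mirror the ones the method
# reads; 'Processor' stands in for whatever class these methods live on:
#
#   options = {
#       'format_in': None,                # autodetect from the file extension
#       'zipfile': False,
#       'fields': None,                   # None = chunk by size, not by field value
#       'filter': 'status == "active"',   # dictquery filter, or None
#       'chunksize': 50000,
#       'encoding': 'utf8',
#       'delimiter': ',',
#   }
#   Processor().split('data.jsonl', options)   # writes data_1.jsonl, data_2.jsonl, ...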
def uniq(self, fromfile, options={}):
    logging.debug('Processing %s' % fromfile)
    f_type = get_file_type(
        fromfile) if options['format_in'] is None else options['format_in']
    if options['zipfile']:
        z = zipfile.ZipFile(fromfile, mode='r')
        fnames = z.namelist()
        if f_type == 'bson':
            infile = z.open(fnames[0], 'rb')
        else:
            infile = z.open(fnames[0], 'r')
    else:
        if f_type == 'bson':
            infile = open(fromfile, 'rb')
        else:
            infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
    to_file = get_option(options, 'output')
    if to_file:
        to_type = get_file_type(to_file)
        if not to_type:
            logging.debug('Output file type not supported')
            return
        out = open(to_file, 'w', encoding='utf8')
    else:
        to_type = 'csv'
        out = sys.stdout
    fields = options['fields'].split(',')
    logging.info('uniq: looking for fields: %s' % (options['fields']))
    if f_type == 'csv':
        delimiter = get_option(options, 'delimiter')
        uniqval = []
        reader = csv.DictReader(infile, delimiter=delimiter)
        n = 0
        for r in reader:
            n += 1
            if n % 1000 == 0:
                logging.info('uniq: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            k = [r[x] for x in fields]
            if k not in uniqval:
                uniqval.append(k)
    elif f_type == 'jsonl':
        uniqval = []
        n = 0
        for l in infile:
            n += 1
            if n % 10000 == 0:
                logging.info('uniq: processing %d records of %s' % (n, fromfile))
            r = json.loads(l)
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            try:
                # Collect values of every requested field, then build one
                # key per record position across the fields
                allvals = []
                for field in fields:
                    allvals.append(get_dict_value(r, field.split('.')))
                for n1 in range(0, len(allvals[0]), 1):
                    k = []
                    for n2 in range(0, len(allvals)):
                        k.append(str(allvals[n2][n1]))
                    if k not in uniqval:
                        uniqval.append(k)
            except KeyError:
                pass
    elif f_type == 'bson':
        uniqval = []
        bson_iter = bson.decode_file_iter(infile)
        n = 0
        for r in bson_iter:
            n += 1
            if n % 1000 == 0:
                logging.info('uniq: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            try:
                allvals = []
                for field in fields:
                    allvals.append(get_dict_value(r, field.split('.')))
                for n1 in range(0, len(allvals[0]), 1):
                    k = []
                    for n2 in range(0, len(allvals)):
                        k.append(str(allvals[n2][n1]))
                    if k not in uniqval:
                        uniqval.append(k)
            except KeyError:
                pass
    else:
        logging.error('Invalid file format provided')
        return
    infile.close()
    logging.debug('%d unique values found' % (len(uniqval)))
    write_items(fields, uniqval, filetype=to_type, handle=out)
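# Hedged usage sketch for uniq() above (illustrative only): collect unique
# (country, city) pairs from a CSV into uniq.csv. 'Processor' is a stand-in
# for the enclosing class:
#
#   options = {'format_in': None, 'zipfile': False, 'fields': 'country,city',
#              'filter': None, 'output': 'uniq.csv', 'encoding': 'utf8',
#              'delimiter': ','}
#   Processor().uniq('data.csv', options)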
def test_not(self):
    self.assertTrue(dq.match({'age': 18}, 'NOT age == 12'))
def test_eval_order(self):
    data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
    self.assertTrue(dq.match(data, "a == 1 OR c == 0"))
    self.assertFalse(dq.match(data, "a == 0 AND c == 1"))
    self.assertTrue(dq.match(data, "a == 0 AND c == 1 OR z == 0"))
    self.assertFalse(dq.match(data, "a == 0 AND (c == 1 OR z == 0)"))
def test_contain(self):
    self.assertTrue(
        match({'roles': ['admin', 'observer']}, '`roles` CONTAIN "admin"'))
    self.assertFalse(
        match({'roles': ['admin', 'observer']}, '`roles` CONTAIN "user"'))
def test_eval_order(self):
    data = {'a': 1, 'b': 0, 'c': 1, 'x': 0, 'y': 1, 'z': 0}
    self.assertTrue(match(data, "`a` == 1 OR `c` == 0"))
    self.assertFalse(match(data, "`a` == 0 AND `c` == 1"))
    self.assertTrue(match(data, "`a` == 0 AND `c` == 1 OR `z` == 0"))
    self.assertFalse(match(data, "`a` == 0 AND (`c` == 1 OR `z` == 0)"))
def frequency(self, fromfile, options={}):
    """Calculates frequency of the values in the file"""
    f_type = get_file_type(
        fromfile) if options['format_in'] is None else options['format_in']
    if options['zipfile']:
        z = zipfile.ZipFile(fromfile, mode='r')
        fnames = z.namelist()
        if f_type == 'bson':
            infile = z.open(fnames[0], 'rb')
        else:
            infile = z.open(fnames[0], 'r')
    else:
        if f_type == 'bson':
            infile = open(fromfile, 'rb')
        else:
            infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
    to_file = get_option(options, 'output')
    if to_file:
        to_type = get_file_type(to_file)
        if not to_type:
            print('Output file type not supported')
            return
        out = open(to_file, 'w', encoding='utf8')
    else:
        to_type = 'csv'
        out = sys.stdout
    fields = options['fields'].split(',')
    valuedict = {}
    if f_type == 'csv':
        delimiter = get_option(options, 'delimiter')
        reader = csv.DictReader(infile, delimiter=delimiter)
        n = 0
        for r in reader:
            n += 1
            if n % 10000 == 0:
                logging.info('frequency: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            k = [r[x] for x in fields]
            kx = '\t'.join(k)
            v = valuedict.get(kx, 0)
            valuedict[kx] = v + 1
    elif f_type == 'jsonl':
        n = 0
        for l in infile:
            n += 1
            if n % 10000 == 0:
                logging.info('frequency: processing %d records of %s' % (n, fromfile))
            r = json.loads(l)
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            try:
                allvals = []
                for field in fields:
                    allvals.append(get_dict_value(r, field.split('.')))
                for n1 in range(0, len(allvals[0]), 1):
                    k = []
                    for n2 in range(0, len(allvals)):
                        k.append(str(allvals[n2][n1]))
                    kx = '\t'.join(k)
                    v = valuedict.get(kx, 0)
                    valuedict[kx] = v + 1
            except KeyError:
                pass
    elif f_type == 'bson':
        bson_iter = bson.decode_file_iter(infile)
        n = 0
        for r in bson_iter:
            n += 1
            if n % 10000 == 0:
                logging.info('frequency: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            allvals = []
            for field in fields:
                allvals.append(get_dict_value(r, field.split('.')))
            for n1 in range(0, len(allvals[0]), 1):
                k = []
                for n2 in range(0, len(allvals)):
                    k.append(str(allvals[n2][n1]))
                # A list is not hashable, so join the key parts into the
                # tab-separated string used by the other branches
                kx = '\t'.join(k)
                v = valuedict.get(kx, 0)
                valuedict[kx] = v + 1
    else:
        logging.info('File type not supported')
        return
    logging.debug('frequency: %d unique values found' % (len(valuedict)))
    thedict = sorted(valuedict.items(), key=lambda item: item[1], reverse=False)
    output = get_option(options, 'output')
    strkeys = '\t'.join(fields) + '\tcount'
    if output:
        f = open(output, 'w', encoding=get_option(options, 'encoding'))
        f.write(strkeys + '\n')
        for k, v in thedict:
            f.write('%s\t%d\n' % (k, v))
        f.close()
    else:
        print(strkeys)
        for k, v in thedict:
            print('%s\t%d' % (k, v))
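# Hedged usage sketch for frequency() above (illustrative only): count how
# often each value of the nested field 'address.region' occurs in a JSON
# lines file, writing a tab-separated table sorted by count:
#
#   options = {'format_in': None, 'zipfile': False, 'fields': 'address.region',
#              'filter': None, 'output': 'freq.tsv', 'encoding': 'utf8',
#              'delimiter': ','}
#   Processor().frequency('data.jsonl', options)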
def test_notequal(self):
    self.assertTrue(dq.match({'age': 18}, 'age != 12'))
    self.assertFalse(dq.match({'age': 18}, 'age != 18'))
def select(self, fromfile, options={}):
    """Select or re-order columns from file"""
    f_type = get_file_type(
        fromfile) if options['format_in'] is None else options['format_in']
    if options['zipfile']:
        z = zipfile.ZipFile(fromfile, mode='r')
        fnames = z.namelist()
        if f_type == 'bson':
            infile = z.open(fnames[0], 'rb')
        else:
            infile = z.open(fnames[0], 'r')
    else:
        if f_type == 'bson':
            infile = open(fromfile, 'rb')
        else:
            infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
    to_file = get_option(options, 'output')
    if to_file:
        to_type = get_file_type(to_file)
        if not to_type:
            print('Output file type not supported')
            return
        if to_type == 'bson':
            out = open(to_file, 'wb')
        else:
            out = open(to_file, 'w', encoding='utf8')
    else:
        to_type = f_type
        out = sys.stdout
    fields = options['fields'].split(',')
    valuedict = {}
    delimiter = get_option(options, 'delimiter')
    if f_type == 'csv':
        reader = csv.DictReader(infile, delimiter=delimiter)
        if to_type == 'csv':
            writer = csv.DictWriter(out, fieldnames=fields, delimiter=delimiter)
            writer.writeheader()
        n = 0
        for r in reader:
            n += 1
            if n % 10000 == 0:
                logging.info('select: processing %d records of %s' % (n, fromfile))
            item = {}
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            for x in fields:
                item[x] = r[x]
            if to_type == 'csv':
                writer.writerow(item)
            elif to_type == 'jsonl':
                out.write(json.dumps(item) + "\n")
    elif f_type == 'jsonl':
        n = 0
        fields = [field.split('.') for field in fields]
        for l in infile:
            n += 1
            if n % 10000 == 0:
                logging.info('select: processing %d records of %s' % (n, fromfile))
            r = json.loads(l)
            if options['filter'] is not None:
                res = dq.match(r, options['filter'])
                if not res:
                    continue
            r_selected = strip_dict_fields(r, fields, 0)
            out.write(json.dumps(r_selected) + '\n')
    elif f_type == 'bson':
        bson_iter = bson.decode_file_iter(infile)
        n = 0
        fields = [field.split('.') for field in fields]
        for r in bson_iter:
            n += 1
            if n % 10000 == 0:
                logging.info('select: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                res = dq.match(r, options['filter'])
                if not res:
                    continue
            r_selected = strip_dict_fields(r, fields, 0)
            out.write(json.dumps(r_selected) + '\n')
    else:
        logging.info('File type not supported')
        return
    logging.debug('select: %d records processed' % (n))
    out.close()
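# Hedged usage sketch for select() above (illustrative only): keep just two
# columns of a CSV, filtered with a dictquery expression, and write the
# result to a new CSV:
#
#   options = {'format_in': None, 'zipfile': False, 'fields': 'name,age',
#              'filter': 'age >= 18', 'output': 'adults.csv',
#              'encoding': 'utf8', 'delimiter': ','}
#   Processor().select('people.csv', options)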
def test_lte(self):
    self.assertTrue(dq.match({'age': 18}, 'age <= 20'))
    self.assertTrue(dq.match({'age': 18}, 'age <= 18'))
    self.assertFalse(dq.match({'age': 18}, 'age <= 17'))
def validate(self, fromfile, options={}):
    """Validates selected field against validation rule"""
    logging.debug('Processing %s' % fromfile)
    f_type = get_file_type(
        fromfile) if options['format_in'] is None else options['format_in']
    if options['zipfile']:
        z = zipfile.ZipFile(fromfile, mode='r')
        fnames = z.namelist()
        if f_type == 'bson':
            infile = z.open(fnames[0], 'rb')
        else:
            infile = z.open(fnames[0], 'r')
    else:
        if f_type == 'bson':
            infile = open(fromfile, 'rb')
        else:
            infile = open(fromfile, 'r', encoding=get_option(options, 'encoding'))
    to_file = get_option(options, 'output')
    if to_file:
        to_type = get_file_type(to_file)
        if not to_type:
            logging.debug('Output file type not supported')
            return
        out = open(to_file, 'w', encoding='utf8')
    else:
        to_type = 'csv'
        out = sys.stdout
    fields = options['fields'].split(',')
    val_func = VALIDATION_RULEMAP[options['rule']]
    logging.info('validate: looking for fields: %s' % (options['fields']))
    validated = []
    stats = {'total': 0, 'invalid': 0, 'novalue': 0}
    if f_type == 'csv':
        delimiter = get_option(options, 'delimiter')
        reader = csv.DictReader(infile, delimiter=delimiter)
        n = 0
        for r in reader:
            n += 1
            if n % 1000 == 0:
                logging.info('validate: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            res = val_func(r[fields[0]])
            stats['total'] += 1
            if not res:
                stats['invalid'] += 1
            validated.append({
                fields[0]: r[fields[0]],
                fields[0] + '_valid': res
            })
    elif f_type == 'jsonl':
        n = 0
        for l in infile:
            n += 1
            if n % 10000 == 0:
                logging.info('validate: processing %d records of %s' % (n, fromfile))
            r = json.loads(l)
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            stats['total'] += 1
            values = get_dict_value(r, fields[0].split('.'))
            if len(values) > 0:
                res = val_func(values[0])
                if not res:
                    stats['invalid'] += 1
                validated.append({
                    fields[0]: values[0],
                    fields[0] + '_valid': res
                })
            else:
                stats['novalue'] += 1
    elif f_type == 'bson':
        bson_iter = bson.decode_file_iter(infile)
        n = 0
        for r in bson_iter:
            n += 1
            if n % 1000 == 0:
                logging.info('validate: processing %d records of %s' % (n, fromfile))
            if options['filter'] is not None:
                if not dq.match(r, options['filter']):
                    continue
            stats['total'] += 1
            values = get_dict_value(r, fields[0].split('.'))
            if len(values) > 0:
                res = val_func(values[0])
                if not res:
                    stats['invalid'] += 1
                validated.append({
                    fields[0]: values[0],
                    fields[0] + '_valid': res
                })
            else:
                stats['novalue'] += 1
    else:
        logging.error('Invalid file format provided')
        return
    infile.close()
    stats['share'] = 100.0 * stats['invalid'] / stats['total']
    logging.debug(
        'validate: complete, %d records (%.2f%%) not valid and %d (%.2f%%) not found of %d against %s'
        % (stats['invalid'], stats['share'], stats['novalue'],
           100.0 * stats['novalue'] / stats['total'], stats['total'],
           options['rule']))
    if options['mode'] != 'stats':
        writer = csv.DictWriter(
            out,
            fieldnames=[fields[0], fields[0] + '_valid'],
            delimiter=get_option(options, 'delimiter'))
        for row in validated:
            if options['mode'] == 'invalid':
                if not row[fields[0] + '_valid']:
                    writer.writerow(row)
            elif options['mode'] == 'all':
                writer.writerow(row)
    else:
        out.write(json.dumps(stats, indent=4))
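# Hedged usage sketch for validate() above (illustrative only). The 'rule'
# value must be a key of VALIDATION_RULEMAP; 'email' here is an assumed rule
# name, and 'Processor' a stand-in for the enclosing class:
#
#   options = {'format_in': None, 'zipfile': False, 'fields': 'contact.email',
#              'filter': None, 'rule': 'email', 'mode': 'invalid',
#              'output': 'bad_emails.csv', 'encoding': 'utf8',
#              'delimiter': ','}
#   Processor().validate('data.jsonl', options)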
def test_not(self):
    self.assertTrue(match({'age': 18}, 'NOT `age` == 12'))