def consumer_queue(proc_id, queue):
    """Worker loop: drain batches of validation payloads from `queue`.

    Each queue item is an iterable of dicts; every dict is wrapped in a
    Validation model and persisted through a Loader. The string sentinel
    'STOP' terminates the loop and is re-queued so that sibling workers
    also shut down.

    proc_id: integer worker identifier (not otherwise used in this loop).
    queue:   multiprocessing.Queue shared with the producer.

    NOTE(review): `conf` is read from module scope — it must be populated
    before the worker processes are started.
    """
    loader = Loader(conf)
    while True:
        try:
            # BUG FIX: the original called queue.get(proc_id, 1), passing the
            # worker id as Queue.get's `block` flag — worker 0 would spin with
            # block=False and its 1s timeout ignored. Always block with a
            # 1-second timeout instead.
            consumer_data = queue.get(True, 1)
            if consumer_data == 'STOP':
                # Re-queue the sentinel so every other consumer sees it too.
                queue.put('STOP')
                break
            for data in consumer_data:
                v = Validation()
                v.create(data)
                loader.load(v)
        except Empty:
            # Timed out waiting for work; poll again.
            pass
    loader.close()
'/home/ubuntu/semantics_pipeline/pipeline_tests/identified/*.json') with open('local_rds.conf', 'r') as f: conf = json.loads(f.read()) loader = Loader(conf) for i, f in enumerate(files[100:]): with open(f, 'r') as g: data = json.loads(g.read()) fname = f.split('/')[-1].split('_')[0] fmt = data.get('response_datatype', 'unknown') identity = data.get('identity', []) try: r = Response() r.create(data) r_id = loader.load(r) except Exception as ex: print fname, ex continue if identity and r_id: try: d = Identity() data.update({"response_id": r_id}) d.create(data) loader.load(d) except: continue
conf = json.loads(f.read()) reader = ResponseReader(conf) loader = Loader(conf) # get the set, validate, store outputs # but need to paginate because of ram issues for i in xrange(0, 668110, 25): print 'QUERYING {0}:{1}'.format(i, 25) for response in reader.read('', limit=25, offset=i): print response.source_url xml = response.cleaned_content stderr = validate_in_memory(xml) data = { "response_id": response.id, "valid": 'Error at' not in stderr, "validated_on": datetime.now() } if stderr: data.update({"errors": [s.strip() for s in stderr.split('\n\n')]}) print '\t{0}'.format(stderr[:100]) try: v = Validation() v.create(data) loader.load(v) except Exception as ex: print ex
from mpp.models import Response import traceback """ """ # set up the connection with open("local_rds.conf", "r") as f: conf = json.loads(f.read()) loader = Loader(conf) with open("responses_with_federal_schemas.txt", "r") as f: responses = [g.strip() for g in f.readlines() if g] for i, response in enumerate(responses): if i % 5000 == 0: print "finished: ", i with open(response, "r") as f: data = json.loads(f.read()) r = Response() r.create(data) try: loader.load(r) except Exception as ex: print response print "\t", ex traceback.print_exc()
loader = Loader(conf) # get the set, validate, store outputs # but need to paginate because of ram issues for i in xrange(0, 668110, 25): print 'QUERYING {0}:{1}'.format(i, 25) for response in reader.read('', limit=25, offset=i): print response.source_url xml = response.cleaned_content stderr = validate_in_memory(xml) data = { "response_id": response.id, "valid": 'Error at' not in stderr, "validated_on": datetime.now() } if stderr: data.update({ "errors": [s.strip() for s in stderr.split('\n\n')] }) print '\t{0}'.format(stderr[:100]) try: v = Validation() v.create(data) loader.load(v) except Exception as ex: print ex
from mpp.loaders import Loader from mpp.models import Response import traceback ''' ''' # set up the connection with open('local_rds.conf', 'r') as f: conf = json.loads(f.read()) loader = Loader(conf) with open('responses_with_federal_schemas.txt', 'r') as f: responses = [g.strip() for g in f.readlines() if g] for i, response in enumerate(responses): if i % 5000 == 0: print 'finished: ', i with open(response, 'r') as f: data = json.loads(f.read()) r = Response() r.create(data) try: loader.load(r) except Exception as ex: print response print '\t', ex traceback.print_exc()