def consumer_queue(proc_id, queue):
    """Worker loop: drain lists of raw records from ``queue`` and load them.

    Runs until a ``'STOP'`` sentinel is received; the sentinel is re-queued
    so sibling workers shut down too.  Relies on the module-level names
    ``conf``, ``Loader``, ``Validation``, and ``Empty``.

    :param proc_id: worker index (kept for interface compatibility; no
        longer misused as the queue's ``block`` flag -- see note below).
    :param queue: a multiprocessing.Queue carrying lists of raw records,
        terminated by the string ``'STOP'``.
    """
    loader = Loader(conf)
    while True:
        try:
            # BUG FIX: the original called queue.get(proc_id, 1), passing
            # the worker index as the `block` argument.  For proc_id == 0
            # the get was non-blocking, so that worker busy-spun raising
            # Empty.  Block for up to one second instead.
            consumer_data = queue.get(True, 1)
            if consumer_data == 'STOP':
                # Re-queue the sentinel so the other consumers see it too.
                queue.put('STOP')
                break
            for data in consumer_data:
                v = Validation()
                v.create(data)
                loader.load(v)
        except Empty:
            # Timed out waiting for work; keep polling for more.
            pass
    loader.close()
import json import glob import os from mpp.loaders import Loader from mpp.models import Response, Identity from sqlalchemy.exc import IntegrityError files = glob.glob( '/home/ubuntu/semantics_pipeline/pipeline_tests/identified/*.json') with open('local_rds.conf', 'r') as f: conf = json.loads(f.read()) loader = Loader(conf) for i, f in enumerate(files[100:]): with open(f, 'r') as g: data = json.loads(g.read()) fname = f.split('/')[-1].split('_')[0] fmt = data.get('response_datatype', 'unknown') identity = data.get('identity', []) try: r = Response() r.create(data) r_id = loader.load(r) except Exception as ex: print fname, ex continue
# NOTE(review): this fragment references `json` and `Loader` without
# importing them -- presumably those imports (and the tail of the script)
# were lost when this file was captured; confirm against the original.
from mpp.readers import ResponseReader
from mpp.utils import validate_in_memory
from mpp.models import Validation
from datetime import datetime
import traceback

'''
geared to validate just the federal responses
from the wee little metadata ec2, reading and
writing to rds
'''

# set up the connection
with open('local_rds.conf', 'r') as f:
    conf = json.loads(f.read())

reader = ResponseReader(conf)
loader = Loader(conf)

# get the set, validate, store outputs
# but need to paginate because of ram issues
# (668110 looks like the response row count at the time -- TODO confirm)
for i in xrange(0, 668110, 25):
    print 'QUERYING {0}:{1}'.format(i, 25)
    for response in reader.read('', limit=25, offset=i):
        print response.source_url
        # validate the cleaned XML payload in memory; validator problems
        # come back as text on stderr
        xml = response.cleaned_content
        stderr = validate_in_memory(xml)
        # row destined for the validation table
        data = {
            "response_id": response.id,
            # heuristic: the validator reports failures as 'Error at ...'
            "valid": 'Error at' not in stderr,
            "validated_on": datetime.now()
            # NOTE(review): fragment is truncated here -- the dict literal
            # is never closed and no Validation record is built or loaded
#!/anaconda/bin/python import json from mpp.loaders import Loader from mpp.models import Response import traceback """ """ # set up the connection with open("local_rds.conf", "r") as f: conf = json.loads(f.read()) loader = Loader(conf) with open("responses_with_federal_schemas.txt", "r") as f: responses = [g.strip() for g in f.readlines() if g] for i, response in enumerate(responses): if i % 5000 == 0: print "finished: ", i with open(response, "r") as f: data = json.loads(f.read()) r = Response() r.create(data) try: loader.load(r) except Exception as ex:
# NOTE(review): this fragment references `json`, `Loader`, and
# `ResponseReader` without importing any of them -- presumably those
# imports (and the tail of the script) were lost when this file was
# captured; confirm against the original.
from mpp.utils import validate_in_memory
from mpp.models import Validation
from datetime import datetime
import traceback

'''
geared to validate just the federal responses
from the wee little metadata ec2, reading and
writing to rds
'''

# set up the connection
with open('local_rds.conf', 'r') as f:
    conf = json.loads(f.read())

reader = ResponseReader(conf)
loader = Loader(conf)

# get the set, validate, store outputs
# but need to paginate because of ram issues
# (668110 looks like the response row count at the time -- TODO confirm)
for i in xrange(0, 668110, 25):
    print 'QUERYING {0}:{1}'.format(i, 25)
    for response in reader.read('', limit=25, offset=i):
        print response.source_url
        # validate the cleaned XML payload in memory; validator problems
        # come back as text on stderr
        xml = response.cleaned_content
        stderr = validate_in_memory(xml)
        # row destined for the validation table
        data = {
            "response_id": response.id,
            # heuristic: the validator reports failures as 'Error at ...'
            "valid": 'Error at' not in stderr,
            "validated_on": datetime.now()
            # NOTE(review): fragment is truncated here -- the dict literal
            # is never closed and no Validation record is built or loaded
#!/anaconda/bin/python import json from mpp.loaders import Loader from mpp.models import Response import traceback ''' ''' # set up the connection with open('local_rds.conf', 'r') as f: conf = json.loads(f.read()) loader = Loader(conf) with open('responses_with_federal_schemas.txt', 'r') as f: responses = [g.strip() for g in f.readlines() if g] for i, response in enumerate(responses): if i % 5000 == 0: print 'finished: ', i with open(response, 'r') as f: data = json.loads(f.read()) r = Response() r.create(data) try: loader.load(r) except Exception as ex: print response