def get_meta(db_id=None):
    with shared_lock():
        path = safe_join(WRK_DB_DIR, db_id.lower())
        path = os.path.join(path, META_FILE_NAME)
        if not os.path.exists(path) or not os.path.isfile(path):
            raise NotFound()
        with gzip.open(path, 'rt') as f:
            data = f.read()
    data = json.loads(data)
    return jsonify(data)

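# shared_lock() / exclusive_lock() are imported from elsewhere in the project: the
# read-only handlers in this module take the shared side, while the update/index
# jobs take the exclusive side. The helper below is only an illustrative sketch of
# that assumed contract, using a POSIX advisory file lock; its name, its lock-file
# argument and the use of fcntl are assumptions, not the project's implementation.
def _example_lock(lock_path, exclusive=False):
    """Return a context manager holding a shared or exclusive flock on lock_path."""
    import contextlib
    import fcntl  # POSIX-only; for illustration

    @contextlib.contextmanager
    def _hold():
        with open(lock_path, 'a') as lock_file:
            # Shared locks may be held by many readers at once; an exclusive
            # lock waits until all of them are released.
            fcntl.flock(lock_file, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
            try:
                yield
            finally:
                fcntl.flock(lock_file, fcntl.LOCK_UN)

    return _hold()
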
def get_report_list(id):
    try:
        with shared_lock():
            with gzip.open(os.path.join(PARSED_DIR, id, 'index.json.gz'),
                           'rt') as f:
                result = f.read()
    except FileNotFoundError:
        raise NotFound()
    result = json.loads(result)
    return jsonify(result)

def get_db_list(db_id=None):
    with shared_lock():
        with gzip.open(DB_LIST_FILE_NAME, 'rt') as f:
            dbs = f.read()
    dbs = json.loads(dbs)
    if db_id is not None:
        db = next((d for d in dbs if d['id'] == db_id), None)
        if db is None:
            raise NotFound()
        return jsonify(db)
    return jsonify(dbs)

def get_report(cid, rid):
    try:
        with shared_lock():
            with zipfile.ZipFile(
                    os.path.join(PARSED_DIR, cid, 'content.zip'), 'r') as z:
                result = z.read(rid + '.json').decode()
    except (FileNotFoundError, KeyError):
        # Either there is no content.zip for this company, or the archive has
        # no member for this report id.
        raise NotFound()
    result = json.loads(result)
    return jsonify(result)

def update(limit=sys.maxsize, process_count=1):
    os.makedirs(PARSED_DIR, exist_ok=True)
    os.makedirs(RAW_DIR, exist_ok=True)
    ppe = ProcessPoolExecutor(process_count)  # prevents leaks
    for fd in load_report_list():
        if limit == 0:
            break
        # Skip reports already parsed into this company's content.zip.
        with shared_lock():
            try:
                with zipfile.ZipFile(
                        os.path.join(PARSED_DIR, fd['cik'], 'content.zip'),
                        'r') as z:
                    if (fd['file'] + '.json') in z.namelist():
                        continue
            except FileNotFoundError:
                pass
        # Skip reports that were previously rejected; each non-empty line of
        # ignored.txt.gz appears to be "<tag>:<file name>".
        ignored = []
        try:
            with shared_lock():
                with gzip.open(
                        os.path.join(PARSED_DIR, fd['cik'], 'ignored.txt.gz'),
                        'rt') as f:
                    ignored = f.read().split('\n')
                    ignored = [i.split(':', 1)[1] for i in ignored[:-1]]
        except FileNotFoundError:
            pass
        if fd['file'] in ignored:
            continue
        limit -= 1
        ppe.submit(parse_raw_one, [fd])
    ppe.shutdown(wait=True)  # block until the remaining parse jobs finish

def get_company(id=None):
    with shared_lock():
        with gzip.open(os.path.join(PARSED_DIR, 'index.json.gz'), 'rt') as f:
            companies = f.read()
    companies = json.loads(companies)
    if id is not None:
        company = next((i for i in companies if i['cik'] == id), None)
        if company is None:
            raise NotFound()
        return jsonify(company)
    return jsonify(companies)

def update_indexes():
    os.makedirs(PARSED_DIR, exist_ok=True)
    global_index = []
    for i in os.listdir(PARSED_DIR):
        # Each company has its own sub-directory; skip loose files.
        if os.path.isfile(os.path.join(PARSED_DIR, i)):
            continue
        logger.info("reindex: " + i)
        local_index = []
        try:
            with shared_lock():
                with zipfile.ZipFile(
                        os.path.join(PARSED_DIR, i, 'content.zip'), 'r') as z:
                    for j in z.namelist():
                        content = json.loads(z.read(j).decode())
                        local_index.append({
                            'id': content['id'],
                            'name': content['name'],
                            'date': content['date'],
                            'type': content['type'],
                            'facts': len(content['facts'])
                        })
        except FileNotFoundError:
            continue
        except Exception:
            # Unreadable or corrupt archive: log it, drop it, and move on.
            logger.exception('failed to reindex %s, removing content.zip', i)
            with exclusive_lock():
                os.remove(os.path.join(PARSED_DIR, i, 'content.zip'))
            continue
        local_index.sort(key=lambda r: (r['date'], r['id']))
        if len(local_index) == 0:
            continue
        with exclusive_lock():
            with gzip.open(os.path.join(PARSED_DIR, i, 'index.json.gz'),
                           'wt') as f:
                f.write(json.dumps(local_index, indent=1))
        global_index.append({
            'cik': i,
            'name': local_index[-1]['name'],
            'reports': len(local_index),
            'last_date': local_index[-1]['date']
        })
    global_index.sort(key=lambda c: c['cik'])
    with exclusive_lock():
        with gzip.open(os.path.join(PARSED_DIR, 'index.json.gz'), 'wt') as f:
            f.write(json.dumps(global_index, indent=1))

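# For reference, the index artifacts written by update_indexes() have this shape
# (field names taken from the code above; concrete values are illustrative):
#
#   <PARSED_DIR>/<cik>/index.json.gz : [{"id": ..., "name": ..., "date": ...,
#                                        "type": ..., "facts": <fact count>}, ...]
#   <PARSED_DIR>/index.json.gz       : [{"cik": ..., "name": ..., "reports": <count>,
#                                        "last_date": ...}, ...]
#
# get_report_list() and get_company() above serve these files back as-is.
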
def get_files(path=''):
    with shared_lock():
        path = safe_join(PARSED_DIR, path) if len(path) > 0 else PARSED_DIR
        if not os.path.exists(path):
            raise NotFound()
        if os.path.isdir(path):
            lst = [{
                "name": i,
                "type": 'dir' if os.path.isdir(os.path.join(path, i)) else 'file'
            } for i in os.listdir(path)]
            return jsonify(lst)
        return send_file(path, as_attachment=True)

def get_series(db_id=None, series_id=None):
    with shared_lock():
        last_series_id = request.args.get('after')
        db_path = safe_join(WRK_DB_DIR, db_id.lower())
        # Series are stored in chunk files whose names encode the covered id
        # range (fields 1 and 2 when split on FILE_NAME_DELIMITER).
        files = os.listdir(db_path)
        files = [{
            "from": f.split(FILE_NAME_DELIMITER)[1],
            "to": f.split(FILE_NAME_DELIMITER)[2],
            "name": f
        } for f in files if f.startswith(SERIES_PREFIX)]
        files.sort(key=lambda f: f['from'])
        if series_id is not None:
            # Single-series lookup: find the chunk whose id range covers it.
            series_file = next(
                (f for f in files if f['from'] <= series_id <= f['to']), None)
        elif last_series_id is None:
            # First page.
            series_file = files[0] if files else None
        else:
            # Next page: first chunk that ends after the cursor.
            series_file = next(
                (f for f in files if f['to'] > last_series_id), None)
        if series_file is None:
            return jsonify([])
        series_path = os.path.join(db_path, series_file['name'])
        with gzip.open(series_path, 'rt') as f:
            series = f.read()
    series = json.loads(series)
    if series_id is not None:
        series = next((s for s in series if s['id'] == series_id), None)
        if series is None:
            raise NotFound()
        return jsonify(series)
    if last_series_id is not None:
        series = [s for s in series if s['id'] > last_series_id]
    return {
        'data': series,
        'next_page': None
        if not series or files.index(series_file) >= len(files) - 1
        else ('?after=' + series[-1]['id'])
        # TODO better pagination: count, offset, limit
    }

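# Client-side sketch of the pagination contract above (illustration only: it
# assumes the `requests` package and that `series_url` points at the get_series
# endpoint; neither is part of this module). It follows `next_page` until the
# last chunk has been fetched.
def _example_fetch_all_series(series_url):
    import requests
    collected, query = [], ''
    while True:
        page = requests.get(series_url + query).json()
        if isinstance(page, list):
            # No chunk matched (empty db or cursor past the end): bare JSON list.
            return collected + page
        collected.extend(page['data'])
        if not page['next_page']:
            return collected
        query = page['next_page']  # e.g. '?after=<last series id on this page>'
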
def get_data(db_id, series_id=None, kind=None):
    with shared_lock():
        prefix = kind + FILE_NAME_DELIMITER
        db_path = safe_join(WRK_DB_DIR, db_id.lower())
        # Data points are stored in zip chunks named like the series chunks:
        # the id range is in fields 1 and 2 when split on FILE_NAME_DELIMITER.
        files = os.listdir(db_path)
        files = [{
            "from": f.split(FILE_NAME_DELIMITER)[1],
            "to": f.split(FILE_NAME_DELIMITER)[2],
            "name": f
        } for f in files if f.startswith(prefix)]
        data_file = next(
            (f for f in files if f['from'] <= series_id <= f['to']), None)
        if data_file is None:
            raise NotFound()
        path = os.path.join(db_path, data_file['name'])
        with zipfile.ZipFile(path, 'r') as z:
            try:
                content = z.read(series_id + JSON_SUFFIX)
            except KeyError:
                raise NotFound()
    content = json.loads(content.decode())
    return jsonify(content)

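# How these view functions might be wired up (a sketch only: the Flask app object,
# URL rules and parameter names below are assumptions for illustration; the real
# registration lives elsewhere in the project).
def _example_register_routes(app):
    # Work-database endpoints (WRK_DB_DIR).
    app.add_url_rule('/dbs', view_func=get_db_list)
    app.add_url_rule('/dbs/<db_id>', view_func=get_db_list)
    app.add_url_rule('/dbs/<db_id>/meta', view_func=get_meta)
    app.add_url_rule('/dbs/<db_id>/series', view_func=get_series)
    app.add_url_rule('/dbs/<db_id>/series/<series_id>', view_func=get_series)
    app.add_url_rule('/dbs/<db_id>/<kind>/<series_id>', view_func=get_data)
    # Parsed-report endpoints (PARSED_DIR).
    app.add_url_rule('/companies', view_func=get_company)
    app.add_url_rule('/companies/<id>', view_func=get_company)
    app.add_url_rule('/companies/<id>/reports', view_func=get_report_list)
    app.add_url_rule('/companies/<cid>/reports/<rid>', view_func=get_report)
    app.add_url_rule('/files/', view_func=get_files)
    app.add_url_rule('/files/<path:path>', view_func=get_files)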