Example #1
def get_meta(db_id=None):
    with shared_lock():
        path = safe_join(WRK_DB_DIR, db_id.lower())
        path = os.path.join(path, META_FILE_NAME)
        if not os.path.isfile(path):  # isfile() is also False for missing paths
            raise NotFound()
        with gzip.open(path, 'rt') as f:
            data = f.read()
        data = json.loads(data)
        return jsonify(data)
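
All of these examples lean on shared_lock() / exclusive_lock() context managers defined elsewhere in the project. As a rough illustration only, here is a minimal sketch of what such helpers could look like on POSIX, using fcntl.flock on a dedicated lock file; the lock-file path and internals are assumptions, not the project's actual implementation.

import fcntl
from contextlib import contextmanager

LOCK_FILE = '/tmp/wrk.lock'  # hypothetical lock-file path

@contextmanager
def _flock(mode):
    # flock grants shared (LOCK_SH) or exclusive (LOCK_EX) access to the file
    with open(LOCK_FILE, 'a') as f:
        fcntl.flock(f, mode)
        try:
            yield
        finally:
            fcntl.flock(f, fcntl.LOCK_UN)

def shared_lock():
    return _flock(fcntl.LOCK_SH)  # many concurrent readers

def exclusive_lock():
    return _flock(fcntl.LOCK_EX)  # a single writer, no readers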
Example #2
def get_report_list(id):
    try:
        with shared_lock():
            with gzip.open(os.path.join(PARSED_DIR, id, 'index.json.gz'),
                           'rt') as f:
                result = f.read()
    except FileNotFoundError:
        raise NotFound()
    result = json.loads(result)
    return jsonify(result)
Example #3
def get_db_list(db_id=None):
    with shared_lock():
        with gzip.open(DB_LIST_FILE_NAME, 'rt') as f:
            dbs = f.read()
        dbs = json.loads(dbs)
        if db_id is not None:
            db = next((d for d in dbs if d['id'] == db_id), None)
            if db is None:
                raise NotFound()
            return jsonify(db)
        return jsonify(dbs)
Example #4
def get_report(cid, rid):
    try:
        with shared_lock():
            with zipfile.ZipFile(os.path.join(PARSED_DIR, cid, 'content.zip'),
                                 'r') as z:
                result = z.read(rid + '.json').decode()
    except (FileNotFoundError, KeyError):
        # missing archive, or missing member inside it: either way, 404
        raise NotFound()
    result = json.loads(result)
    return jsonify(result)
Example #5
def update(limit=sys.maxsize, process_count=1):
    os.makedirs(PARSED_DIR, exist_ok=True)
    os.makedirs(RAW_DIR, exist_ok=True)

    ppe = ProcessPoolExecutor(process_count)  # worker pool for CPU-bound parsing

    for fd in load_report_list():
        if limit == 0:
            break

        with shared_lock():
            try:
                with zipfile.ZipFile(
                        os.path.join(PARSED_DIR, fd['cik'], 'content.zip'),
                        'r') as z:
                    if (fd['file'] + '.json') in z.namelist():
                        continue
            except FileNotFoundError:
                pass

        ignored = []
        try:
            with shared_lock():
                with gzip.open(
                        os.path.join(PARSED_DIR, fd['cik'], 'ignored.txt.gz'),
                        'rt') as f:
                    ignored = f.read().split('\n')
                    # drop the empty string after the trailing newline, then
                    # keep the field after the first ':' (matched against
                    # fd['file'] below)
                    ignored = [i.split(':', 1)[1] for i in ignored[:-1]]
        except FileNotFoundError:
            pass

        if fd['file'] in ignored:
            continue

        limit -= 1

        ppe.submit(parse_raw_one, fd)  # schedule the parse on the pool

    ppe.shutdown(wait=True)  # block until all queued parses finish
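
For reference, concurrent.futures.ProcessPoolExecutor schedules work with submit() and is drained with shutdown(wait=True); the same pattern reads more idiomatically with the executor as a context manager, which waits for queued work on exit. A small sketch (parse_raw_one is project code, stubbed here):

from concurrent.futures import ProcessPoolExecutor

def parse_raw_one(fd):
    ...  # stand-in for the project's worker function

def run_parses(file_descriptors, process_count=1):
    # the with-block calls shutdown(wait=True) on exit
    with ProcessPoolExecutor(process_count) as ppe:
        futures = [ppe.submit(parse_raw_one, fd) for fd in file_descriptors]
        for fut in futures:
            fut.result()  # re-raises any exception from a worker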
Example #6
def get_company(id=None):
    with shared_lock():
        with gzip.open(os.path.join(PARSED_DIR, 'index.json.gz'), 'rt') as f:
            companies = f.read()
    companies = json.loads(companies)
    if id is not None:
        company = next((i for i in companies if i['cik'] == id), None)
        if company is None:
            raise NotFound()
        else:
            return jsonify(company)
    else:
        return jsonify(companies)
Example #7
def update_indexes():
    os.makedirs(PARSED_DIR, exist_ok=True)

    global_index = []
    for i in os.listdir(PARSED_DIR):
        if os.path.isfile(os.path.join(PARSED_DIR, i)):
            continue
        logger.info("reindex: " + i)
        local_index = []
        try:
            with shared_lock():
                with zipfile.ZipFile(
                        os.path.join(PARSED_DIR, i, 'content.zip'), 'r') as z:
                    for j in z.namelist():
                        content = z.read(j).decode()
                        content = json.loads(content)
                        local_index.append({
                            'id': content['id'],
                            'name': content['name'],
                            'date': content['date'],
                            'type': content['type'],
                            'facts': len(content['facts'])
                        })
        except FileNotFoundError:
            continue
        except Exception:
            logger.exception('failed to index %s; removing corrupt archive', i)
            with exclusive_lock():
                os.remove(os.path.join(PARSED_DIR, i, 'content.zip'))
                continue

        local_index.sort(key=lambda e: (e['date'], e['id']))

        if len(local_index) == 0:
            continue
        with exclusive_lock():
            with gzip.open(os.path.join(PARSED_DIR, i, 'index.json.gz'),
                           'wt') as f:
                f.write(json.dumps(local_index, indent=1))

        global_index.append({
            'cik': i,
            'name': local_index[-1]['name'],
            'reports': len(local_index),
            'last_date': local_index[-1]['date']
        })

    global_index.sort(key=lambda e: e['cik'])
    with exclusive_lock():
        with gzip.open(os.path.join(PARSED_DIR, 'index.json.gz'), 'wt') as f:
            f.write(json.dumps(global_index, indent=1))
Example #8
def get_files(path=''):
    with shared_lock():
        path = safe_join(PARSED_DIR, path) if len(path) > 0 else PARSED_DIR
        if not os.path.exists(path):
            raise NotFound()
        if os.path.isdir(path):
            lst = os.listdir(path)
            lst = [{
                "name": i,
                "type": 'dir' if os.path.isdir(os.path.join(path, i)) else 'file'
            } for i in lst]
            return jsonify(lst)
        else:
            return send_file(path, as_attachment=True)
Example #9
def get_series(db_id=None, series_id=None):
    with shared_lock():
        last_series_id = request.args.get('after')
        db_path = safe_join(WRK_DB_DIR, db_id.lower())

        files = os.listdir(db_path)
        files = [{
            "from": f.split(FILE_NAME_DELIMITER)[1],
            "to": f.split(FILE_NAME_DELIMITER)[2],
            "name": f
        } for f in files if f.startswith(SERIES_PREFIX)]
        files.sort(key=lambda f: f['from'])
        if series_id is not None:
            series_file = next(
                (f for f in files if f['from'] <= series_id <= f['to']), None)
        elif last_series_id is None:
            series_file = files[0] if files else None
        else:
            series_file = next((f for f in files if f['to'] > last_series_id),
                               None)
        if series_file is None:
            return jsonify([])

        series_path = os.path.join(db_path, series_file['name'])
        with gzip.open(series_path, 'rt') as f:
            series = f.read()
        series = json.loads(series)

        if series_id is not None:
            series = next((s for s in series if s['id'] == series_id), None)
            if series is None:
                raise NotFound()
            return jsonify(series)

        if last_series_id is not None:
            series = [s for s in series if s['id'] > last_series_id]

        return {
            'data': series,
            # TODO better pagination: count, offset, limit
            'next_page': None if files.index(series_file) >= len(files) - 1
            else '?after=' + series[-1]['id']
        }
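
A client could walk this cursor scheme by appending the returned '?after=' query string until next_page is None. A hedged sketch (the base URL is made up, and it assumes the paginated dict shape rather than the bare-list responses this handler returns on its other branches):

import requests

def fetch_all_series(base_url):
    out, page = [], ''
    while page is not None:
        resp = requests.get(base_url + page).json()
        out.extend(resp['data'])
        page = resp['next_page']
    return out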
Example #10
def get_data(db_id, series_id=None, kind=None):
    with shared_lock():
        prefix = kind + FILE_NAME_DELIMITER
        db_path = safe_join(WRK_DB_DIR, db_id.lower())
        files = os.listdir(db_path)
        files = [{
            "from": f.split(FILE_NAME_DELIMITER)[1],
            "to": f.split(FILE_NAME_DELIMITER)[2],
            "name": f
        } for f in files if f.startswith(prefix)]
        data_file = next(
            (f for f in files if f['from'] <= series_id <= f['to']), None)
        if data_file is None:
            raise NotFound()
        path = os.path.join(db_path, data_file['name'])
        with zipfile.ZipFile(path, 'r') as z:
            try:
                content = z.read(series_id + JSON_SUFFIX)
            except KeyError:
                raise NotFound()
        content = content.decode()
        content = json.loads(content)
        return jsonify(content)
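
Handlers like these are typically registered as Flask views. A sketch of the wiring, with made-up route rules (the actual URL layout is not shown in these examples):

from flask import Flask

app = Flask(__name__)
app.add_url_rule('/db/<db_id>/meta', view_func=get_meta)
app.add_url_rule('/db/<db_id>/series', view_func=get_series)
app.add_url_rule('/db/<db_id>/series/<series_id>/<kind>', view_func=get_data)

# Exercise a handler without running a server:
# with app.test_client() as c:
#     resp = c.get('/db/some-db/meta')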