def index(file_path, debug=False):
    """Index a file from the repository.

    The file is streamed through ``invoke_tika`` into a temporary file,
    the temporary file is handed to ``clean`` and the base64-encoded
    result is posted to ``<PUBDOCS_ES_URL>/mof/attachment/<name>``.

    If reading the file or the Tika call fails, the error is logged and
    nothing is posted, so an empty or truncated extraction is never stored.

    :param file_path: path of the form ``<section>/<year>/<name>``; a
        leading ``PUBDOCS_FILE_REPO`` directory prefix is stripped first.
    :param debug: forwarded unchanged to ``clean``.
    :raises ValueError: if ``file_path`` does not split into exactly three
        components, or the year / section number is not an integer.
    :raises AssertionError: if the index request answers with a status
        other than 200 or 201.
    """
    from harvest import build_fs_path
    es_url = flask.current_app.config['PUBDOCS_ES_URL']
    # Joining with '' presumably yields the repository directory with a
    # trailing separator, so the prefix is removed together with its slash.
    repo = flask.current_app.config['PUBDOCS_FILE_REPO'] / ''

    section, year, name = file_path.replace(repo, "").split('/')
    fs_path = build_fs_path(file_path)
    with NamedTempFile(mode='w+b', delete=True) as temp:
        start = time()
        try:
            with open(fs_path, 'rb') as pdf_file:
                for chunk in invoke_tika(pdf_file):
                    temp.write(chunk)
                # clean() receives the file *name*, so the buffered writes
                # have to reach the disk first; seeking flushes them.
                temp.seek(0)
        except Exception as exp:
            log.critical(exp)
            # Do not index whatever partial content made it into the
            # temporary file.
            return

        text = clean(temp.name, debug)
        index_data = {
            'file': b64encode(text),
            'path': file_path,
            'year': int(year),
            # The first three characters of the section component are a
            # non-numeric prefix; only the number is stored.
            'section': int(section[3:]),
        }
        index_resp = requests.post(es_url + '/mof/attachment/' + name,
                                   data=flask.json.dumps(index_data))
        # Raised explicitly (rather than via ``assert``) so the check is
        # still performed when Python runs with -O; the exception type is
        # the one callers already see.
        if index_resp.status_code not in (200, 201):
            raise AssertionError(repr(index_resp))

        if index_resp.status_code == 200:
            log.info('Skipping. Already indexed!')
        else:
            duration = time() - start
            log.info('%s[indexed in %f]', fs_path.name, duration)
 def extract_laws_summary(file_path):
     """Extract the law entries listed in a document's first-page summary.

     The PDF is converted to HTML with the external ``pdf2htmlEX`` tool and
     every ``#p1 .b .h3`` element whose text contains ``'Lege privind'`` is
     matched against ``no_pat``, ``name_pat`` and ``interval_pat``.

     :param file_path: repository path, resolved with ``build_fs_path``.
     :returns: a list of ``[no, name, start_pg, end_pg]`` lists, one per
         matching summary entry. A field is ``None`` when its pattern does
         not match that entry.
     :raises subprocess.CalledProcessError: if ``pdf2htmlEX`` exits with a
         non-zero status.
     """
     from harvest import build_fs_path
     laws = []
     fs_path = build_fs_path(file_path)

     # Reserve a unique output file name; pdf2htmlEX overwrites the file.
     with NamedTempFile(mode='w+b', delete=False) as temp:
         html_path = temp.name
     # Write the HTML into the directory the temp file actually lives in,
     # so reading it back works even when that directory is not /tmp.
     dest_dir, html_name = os.path.split(html_path)

     try:
         # Argument list without a shell: paths containing spaces or shell
         # metacharacters are passed through verbatim.
         subprocess.check_call([
             'pdf2htmlEX', '--process-nontext', '0',
             '--dest-dir', dest_dir,
             '%s' % (fs_path,),  # same string conversion as the old command
             html_name,
         ])
         with open(html_path, 'rb') as tmp:
             html = pq(tmp.read())

         for div in html('#p1 .b .h3'):
             sum_entry = div.text_content()
             if 'Lege privind' not in sum_entry:
                 continue

             no_match = no_pat.search(sum_entry)
             name_match = name_pat.search(sum_entry)
             interval_match = interval_pat.search(sum_entry)

             # Fields are computed afresh for every entry so that a value
             # from a previous law never leaks into the current one.
             no = no_match.group(0) if no_match else None
             name = name_match.group(0) if name_match else None
             if interval_match:
                 start_pg = interval_match.group(1)
                 end_pg = interval_match.group(2)
             else:
                 start_pg = end_pg = None
             laws.append([no, name, start_pg, end_pg])
     finally:
         # Remove the generated HTML even if conversion or parsing failed.
         os.remove(html_path)
     return laws
Exemple #3
0
    def index(file_path):
        """Index a file from the repository.

        The raw bytes of the file are base64-encoded and posted, together
        with its path, year and section number, to
        ``<PUBDOCS_ES_URL>/mof/attachment/<name>``.

        :param file_path: path of the form ``<section>/<year>/<name>``.
        :raises ValueError: if ``file_path`` does not split into exactly
            three components, or the year / section number is not an
            integer.
        :raises AssertionError: if the index request does not answer with
            status 201.
        """
        from harvest import build_fs_path
        es_url = flask.current_app.config['PUBDOCS_ES_URL']

        section, year, name = file_path.split('/')
        fs_path = build_fs_path(file_path)
        index_data = {
            'file': b64encode(fs_path.bytes()),
            'path': file_path,
            'year': int(year),
            # The first three characters of the section component are a
            # non-numeric prefix; only the number is stored.
            'section': int(section[3:]),
        }
        index_resp = requests.post(es_url + '/mof/attachment/' + name,
                                   data=flask.json.dumps(index_data))
        # Raised explicitly (rather than via ``assert``) so a failed request
        # is still reported when Python runs with -O; the exception type is
        # the one callers already see.
        if index_resp.status_code != 201:
            raise AssertionError(repr(index_resp))
Exemple #4
0
    def index(file_path):
        """Index a file from the repository.

        Posts a JSON document holding the base64-encoded file content, the
        original path, the year and the section number to
        ``<PUBDOCS_ES_URL>/mof/attachment/<name>``.

        :param file_path: path of the form ``<section>/<year>/<name>``.
        :raises ValueError: if ``file_path`` does not split into exactly
            three components, or the year / section number is not an
            integer.
        :raises AssertionError: if the index request does not answer with
            status 201.
        """
        from harvest import build_fs_path
        es_url = flask.current_app.config['PUBDOCS_ES_URL']

        section, year, name = file_path.split('/')
        fs_path = build_fs_path(file_path)
        index_data = {
            'file': b64encode(fs_path.bytes()),
            'path': file_path,
            'year': int(year),
            # Drop the three-character prefix of the section component and
            # keep only its number.
            'section': int(section[3:]),
        }
        index_resp = requests.post(es_url + '/mof/attachment/' + name,
                                   data=flask.json.dumps(index_data))
        # An ``assert`` statement would be stripped under -O and let failed
        # requests pass silently, so the same exception is raised explicitly.
        if index_resp.status_code != 201:
            raise AssertionError(repr(index_resp))