def index(file_path, debug=False):
    """Index a file from the repository.

    The file is streamed through ``invoke_tika`` into a temporary file, the
    result is passed to ``clean`` and the cleaned text is posted,
    base64-encoded, to ``<PUBDOCS_ES_URL>/mof/attachment/<name>``.

    :param file_path: path of the file to index. Once the
        ``PUBDOCS_FILE_REPO`` prefix is removed it must consist of exactly
        three ``/``-separated components: ``<section>/<year>/<name>``. The
        section is converted to an integer after dropping its first three
        characters; the year is converted to an integer as-is.
    :param debug: forwarded unchanged to ``clean``.
    :raises ValueError: if the path does not split into three components or
        the year/section parts are not integers.
    :raises AssertionError: if the server answers with a status code other
        than 200 or 201.
    """
    from harvest import build_fs_path

    config = flask.current_app.config
    es_url = config['PUBDOCS_ES_URL']
    # Joining with '' is presumably how the path type appends a trailing
    # separator, so the prefix strip below leaves no leading '/' behind --
    # TODO confirm against the type stored in PUBDOCS_FILE_REPO.
    repo = config['PUBDOCS_FILE_REPO'] / ''
    section, year, name = file_path.replace(repo, "").split('/')
    fs_path = build_fs_path(file_path)

    with NamedTempFile(mode='w+b', delete=True) as temp:
        start = time()
        try:
            with open(fs_path, 'rb') as pdf_file:
                for chunk in invoke_tika(pdf_file):
                    temp.write(chunk)
                temp.seek(0)
        except Exception as exp:
            # NOTE(review): extraction failures are only logged; whatever
            # reached the temp file so far is still cleaned and indexed.
            log.critical(exp)
        # ``clean`` receives the file *name*, so it has to run while the
        # temp file still exists (it is deleted when the ``with`` exits).
        text = clean(temp.name, debug)

    index_data = {
        'file': b64encode(text),
        'path': file_path,
        'year': int(year),
        'section': int(section[3:]),
    }
    index_resp = requests.post(es_url + '/mof/attachment/' + name,
                               data=flask.json.dumps(index_data))

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if index_resp.status_code not in (200, 201):
        raise AssertionError(repr(index_resp))

    if index_resp.status_code == 200:
        log.info('Skipping. Already indexed!')
    else:
        duration = time() - start
        log.info('%s[indexed in %f]' % (fs_path.name, duration))
def extract_laws_summary(file_path):
    """Extract the law entries listed in a PDF's summary.

    The PDF is converted to HTML with the external ``pdf2htmlEX`` tool. Every
    element matching the ``#p1 .b .h3`` selector (presumably the first page
    of the converted document -- TODO confirm) whose text contains
    ``'Lege privind'`` produces one entry.

    :param file_path: path handed to ``harvest.build_fs_path`` to locate the
        PDF on disk.
    :returns: a list of ``[no, name, start_pg, end_pg]`` lists. ``no`` and
        ``name`` are the full matches of ``no_pat`` and ``name_pat``;
        ``start_pg`` and ``end_pg`` are groups 1 and 2 of ``interval_pat``.
        A field is ``None`` when its pattern does not match the entry.
    :raises subprocess.CalledProcessError: if ``pdf2htmlEX`` exits with a
        non-zero status.
    """
    from harvest import build_fs_path

    fs_path = build_fs_path(file_path)

    # The temp file only reserves a unique name; pdf2htmlEX overwrites it.
    with NamedTempFile(mode='w+b', delete=False) as temp:
        html_path = temp.name

    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through verbatim. The destination
        # directory is the temp file's own directory so the output always
        # lands on the path that is read back below.
        subprocess.check_call([
            'pdf2htmlEX',
            '--process-nontext', '0',
            '--dest-dir', os.path.dirname(html_path),
            str(fs_path),
            os.path.basename(html_path),
        ])
        with open(html_path, 'rb') as html_file:
            html = pq(html_file.read())
    finally:
        # Remove the temp file even when conversion or parsing fails.
        os.remove(html_path)

    laws = []
    for div in html('#p1 .b .h3'):
        sum_entry = div.text_content()
        if 'Lege privind' not in sum_entry:
            continue

        # Start every entry from a clean slate so that a pattern which does
        # not match cannot inherit the value parsed from a previous entry.
        no = name = start_pg = end_pg = None

        no_match = no_pat.search(sum_entry)
        if no_match is not None:
            no = no_match.group(0)

        name_match = name_pat.search(sum_entry)
        if name_match is not None:
            name = name_match.group(0)

        interval_match = interval_pat.search(sum_entry)
        if interval_match is not None:
            start_pg = interval_match.group(1)
            end_pg = interval_match.group(2)

        laws.append([no, name, start_pg, end_pg])

    return laws
def index(file_path):
    """Index a file from the repository.

    The raw bytes of the file are base64-encoded and posted to
    ``<PUBDOCS_ES_URL>/mof/attachment/<name>``.

    :param file_path: path of the form ``<section>/<year>/<name>`` (exactly
        three ``/``-separated components). The section is converted to an
        integer after dropping its first three characters; the year is
        converted to an integer as-is.
    :raises ValueError: if the path does not split into three components or
        the year/section parts are not integers.
    :raises AssertionError: if the server answers with a status code other
        than 201.
    """
    from harvest import build_fs_path

    es_url = flask.current_app.config['PUBDOCS_ES_URL']
    section, year, name = file_path.split('/')
    fs_path = build_fs_path(file_path)

    index_data = {
        'file': b64encode(fs_path.bytes()),
        'path': file_path,
        'year': int(year),
        'section': int(section[3:]),
    }
    index_resp = requests.post(es_url + '/mof/attachment/' + name,
                               data=flask.json.dumps(index_data))

    # Raised explicitly rather than via ``assert`` so the check is not
    # stripped when Python runs with -O; the exception type is unchanged.
    if index_resp.status_code != 201:
        raise AssertionError(repr(index_resp))