def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, user=None, pdf=False, preserve_color=False, **kwargs):
    """Return one OCR task description (a dict) per page to be processed.

    ``image_file`` may be a ``DAFile``, a ``DAFileList`` or a list.  A
    ``set`` yields an empty list; any other type yields the translated
    message "(Not a DAFile, DAFileList, or list object)" instead of a list.

    The OCR language is chosen in this order: ``language`` itself when it
    is one of the available languages, the ``ocr languages`` configuration
    mapping, the three-letter code reported by ``pycountry``, and finally
    ``'eng'`` (or the first available language).  The two last-resort
    fallbacks are reported on stderr.

    PDF files contribute one task per page; word-processing files are first
    passed through ``pdf_concatenate`` and then treated like PDFs; other
    image files contribute a single task with ``page=None``.  Items without
    an ``extension`` attribute are skipped.

    Raises ``Exception`` when a file has an unsupported extension.
    """
    if isinstance(image_file, set):
        return []
    if not isinstance(image_file, (DAFile, DAFileList, list)):
        return word("(Not a DAFile, DAFileList, or list object)")

    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'

    langs = get_available_languages()
    if language is None:
        language = get_language()

    def last_resort_language():
        # English when installed, otherwise whatever is listed first.
        return 'eng' if 'eng' in langs else langs[0]

    if language in langs:
        lang = language
    else:
        configured = get_config("ocr languages")
        if configured is None:
            configured = {}
        if language in configured and configured[language] in langs:
            lang = configured[language]
        else:
            try:
                three_letter = pycountry.languages.get(alpha_2=language).alpha_3
                if three_letter in langs:
                    lang = three_letter
                else:
                    lang = last_resort_language()
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = last_resort_language()
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")

    def make_task(document, page_number):
        # Every task carries the same settings; only doc and page vary.
        return dict(doc=document, page=page_number, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code, user=user, pdf=pdf, preserve_color=preserve_color)

    documents = [image_file] if isinstance(image_file, DAFile) else image_file
    todo = []
    for doc in documents:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif', 'docx', 'doc', 'odt', 'rtf']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            paged_doc = doc
        elif doc.extension in ("docx", "doc", "odt", "rtf"):
            paged_doc = docassemble.base.util.pdf_concatenate(doc)
        else:
            todo.append(make_task(doc, None))
            continue
        page_count = safe_pypdf_reader(paged_doc.path()).getNumPages()
        for page_number in range(1, page_count + 1):
            todo.append(make_task(paged_doc, page_number))
    return todo
def send_answers():
    """Send the interview's answers to the Community.lawyer endpoint.

    Collects every interview variable with ``all_variables(simplify=False)``,
    rewrites the values listed below, and POSTs the result as form data
    together with the interview metadata, the configured
    ``docassemble api key`` and the logged-in user's info.

    Rewrites applied before serializing:

    * ``DAFile`` / ``DAFileList`` values become their path with ``/tmp/``
      removed, formatted as ``s3://<bucket>/<path>`` when an ``s3``
      configuration section is present.
    * ``DADict`` values whose first value is a ``bool`` become the list of
      keys whose value is ``True``.

    Anything else that ``json`` cannot encode natively is passed through
    ``str``.

    :return: the response object returned by ``requests.post``
    """
    docassemble_api_key = get_config("docassemble api key")
    answers = all_variables(simplify=False)
    endpoint = "https://community.lawyer/docassemble_answers/new/"
    # The S3 settings do not change between files, so read them once.
    s3_config = get_config('s3')

    # Iterate over a snapshot because entries are reassigned in the loop.
    for q, a in list(answers.items()):
        if isinstance(a, (DAFileList, DAFile)):
            local_path = a.path().replace("/tmp/", "")
            if bool(s3_config):
                answers[q] = "s3://%s/%s" % (s3_config['bucket'], local_path)
            else:
                answers[q] = local_path
        elif isinstance(a, DADict):
            # values() may be a non-subscriptable view and may be empty, so
            # peek at the first value through an iterator instead of [0].
            first_value = next(iter(a.values()), None)
            if isinstance(first_value, bool):
                answers[q] = [box_name for box_name, box_value in a.items() if box_value is True]

    answers = json.dumps(answers, default=str)
    metadata = json.dumps(all_variables(special='metadata'), default=str)
    logged_in_user = json.dumps(get_user_info_hash())
    return requests.post(endpoint, data={
        'docassemble_api_key': docassemble_api_key,
        'answers': answers,
        'metadata': metadata,
        'respondent': logged_in_user
    })
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, **kwargs):
    """Return one OCR task description (a dict) per page to be processed.

    ``image_file`` must be a ``DAFile`` or ``DAFileList``; otherwise the
    translated message "(Not a DAFile or DAFileList object)" is returned
    instead of a list.

    The OCR language is chosen in this order: ``language`` itself when it
    is one of the available languages, the ``ocr languages`` configuration
    mapping, the three-letter code reported by ``pycountry``, and finally
    ``'eng'`` (or the first available language).  The two last-resort
    fallbacks are reported on stderr.

    PDF files contribute one task per page (pages are numbered from 1);
    other image files contribute a single task with ``page=None``.  Items
    without an ``extension`` attribute are skipped.

    Raises ``Exception`` when a file has an unsupported extension.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = []
    for doc in image_file:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            # Count the pages while the file is open, then close it; the
            # handle used to be left open for the garbage collector.
            with open(doc.path(), 'rb') as pdf_stream:
                page_count = PdfFileReader(pdf_stream).getNumPages()
            # range() instead of xrange() so this also runs on Python 3.
            for i in range(page_count):
                todo.append(dict(doc=doc, page=i + 1, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
        else:
            todo.append(dict(doc=doc, page=None, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
    return todo
def send_answers(variables_to_reject=None, include_logged_in_user=True):
    '''
    Sends your Docassemble user's serialized answers to your Community.lawyer account

    Before serializing, ``DAFile`` / ``DAFileList`` values are replaced by
    their path with ``/tmp/`` removed (formatted as ``s3://<bucket>/<path>``
    when an ``s3`` configuration section is present), ``DADict`` values
    whose first value is a ``bool`` are replaced by the list of keys whose
    value is ``True``, and any value ``json`` cannot encode is replaced by
    its ``str()``.

    :param list variables_to_reject: A list of variables to exclude from sending to the server, defaults to none
    :param bool include_logged_in_user: When true, the JSON-encoded result of ``get_user_info_hash()`` is sent as ``respondent``; otherwise an empty dict is passed
    :return: the response object returned by ``requests.post``
    '''
    docassemble_api_key = get_config("docassemble api key")
    answers = all_variables(simplify=False)
    endpoint = "https://community.lawyer/docassemble_answers/new/"
    if variables_to_reject:
        # items() rather than iteritems() so this also runs on Python 3.
        answers = {
            k: v
            for k, v in answers.items() if k not in variables_to_reject
        }
    # The S3 settings do not change between files, so read them once.
    s3_config = get_config('s3')

    # Iterate over a snapshot because entries are reassigned in the loop.
    for q, a in list(answers.items()):
        if isinstance(a, (DAFileList, DAFile)):
            local_path = a.path().replace("/tmp/", "")
            if bool(s3_config):
                answers[q] = "s3://%s/%s" % (s3_config['bucket'], local_path)
            else:
                answers[q] = local_path
        elif isinstance(a, DADict):
            # values() may be a non-subscriptable view and may be empty, so
            # peek at the first value through an iterator instead of [0].
            first_value = next(iter(a.values()), None)
            if isinstance(first_value, bool):
                answers[q] = [box_name for box_name, box_value in a.items() if box_value is True]

    for q, a in list(answers.items()):
        try:
            json.dumps(a)
        except Exception:
            # Best effort: anything json cannot encode is sent as text.
            # Unlike a bare except, this lets KeyboardInterrupt and
            # SystemExit propagate.
            answers[q] = str(a)

    answers = json.dumps(answers, default=str)
    metadata = json.dumps(all_variables(special='metadata'), default=str)
    logged_in_user = json.dumps(get_user_info_hash()) if include_logged_in_user else {}
    return requests.post(endpoint, data={
        'docassemble_api_key': docassemble_api_key,
        'answers': answers,
        'metadata': metadata,
        'respondent': logged_in_user
    })
def get_ocr_language(language):
    """Return the OCR language code to use for ``language``.

    When ``language`` is ``None`` the result of ``get_language()`` is used.
    The code is resolved in this order:

    1. ``language`` itself, if it is one of the available OCR languages;
    2. the entry for ``language`` in the ``ocr languages`` configuration
       mapping, if that entry is an available OCR language;
    3. the three-letter code ``pycountry`` reports for the two-letter code
       ``language``, if it is an available OCR language.

    Raises ``Exception`` when none of these apply.  The message names the
    fallback language (``'eng'`` if available, otherwise the first
    available language) and, when the ``pycountry`` lookup itself failed,
    the lookup error.
    """
    langs = get_available_languages()
    if language is None:
        language = get_language()
    ocr_langs = get_config("ocr languages")
    if ocr_langs is None:
        ocr_langs = {}
    if language in langs:
        return language
    if language in ocr_langs and ocr_langs[language] in langs:
        return ocr_langs[language]

    # Only the lookup is guarded.  Previously the "not available" error was
    # raised inside the try block, caught by its own handler and re-wrapped,
    # which duplicated the message.
    lookup_error = None
    lang_three_letter = None
    try:
        lang_three_letter = pycountry.languages.get(alpha_2=language).alpha_3
    except Exception as the_error:
        lookup_error = the_error
    if lookup_error is None and lang_three_letter in langs:
        return lang_three_letter

    if 'eng' in langs:
        lang = 'eng'
    elif langs:
        lang = langs[0]
    else:
        # No OCR languages at all: still report the real problem rather
        # than failing with an IndexError.
        lang = None
    message = "could not get OCR language for language " + str(language) + "; using language " + str(lang)
    if lookup_error is not None:
        message += "; error was " + str(lookup_error)
    raise Exception(message)
def mmdc(input_text, file_format='svg', flags=None):
    """Run the ``mmdc`` command on ``input_text`` and return a ``DAFile``.

    The text is written to a temporary ``.mmd`` file and the command is
    run with ``-p <home>/puppeteer-config.json -i <input> -o <output>``
    plus one ``-<key> <value>`` pair per entry of ``flags``.  The
    executable comes from the ``mmdc path`` configuration key (default
    ``mmdc``).  The input text and the command line are written to stderr.

    :param input_text: the diagram source; non-strings are converted to text
    :param file_format: output extension; lowercase ASCII letters only
    :param flags: optional dict of extra command-line flags
    :return: a committed ``DAFile`` with the command's output
    :raises Exception: if ``flags`` is not a dict, ``file_format`` is
        invalid, the command fails, or the command produces an empty file
    """
    if flags is None:
        flags = {}
    if not isinstance(flags, dict):
        raise Exception("mmdc: flags not a dictionary")
    if not isinstance(file_format, string_types) or re.search(r'[^a-z]', file_format) or len(file_format) == 0:
        raise Exception("mmdc: invalid file format")
    if not isinstance(input_text, string_types):
        input_text = text_type(input_text)
    sys.stderr.write("Writing:\n" + input_text + "\n")
    temp_paths = []
    try:
        with tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix=".mmd", delete=False) as input_file:
            temp_paths.append(input_file.name)
            input_file.write(input_text)
        # Only the name is needed here; mmdc writes the file itself.
        with tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix="." + file_format, delete=False) as output_file:
            temp_paths.append(output_file.name)
        commands = [get_config('mmdc path', 'mmdc'), '-p', os.path.join(expanduser("~"), 'puppeteer-config.json'), '-i', input_file.name, '-o', output_file.name]
        for key, val in flags.items():
            commands.append('-' + str(key))
            # NOTE(review): repr() wraps the value in quote characters, and
            # no shell is involved to strip them, so they reach mmdc as-is.
            # Confirm this is intended.
            commands.append(repr(str(val)))
        sys.stderr.write("Commands are: " + " ".join(commands) + "\n")
        try:
            output = subprocess.check_output(commands, stderr=subprocess.STDOUT).decode()
        except subprocess.CalledProcessError as err:
            output = err.output.decode()
            raise Exception("mmdc: there was an error.  " + output)
        if os.path.getsize(output_file.name) == 0:
            raise Exception("mmdc: the command did not produce any output.  " + output)
        obj = DAFile()
        obj.set_random_instance_name()
        obj.initialize(extension=file_format)
        obj.copy_into(output_file.name)
        obj.commit()
        return obj
    finally:
        # Both files are created with delete=False, so remove them here.
        # Cleanup is best effort and must not mask the real outcome.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass
def markdown_to_docx(text, tpl):
    """Convert Markdown ``text`` for insertion into the DOCX template ``tpl``.

    The text is first rendered to HTML with ``markdown_to_html`` (terms
    disabled).  What happens next depends on the ``new markdown to docx``
    configuration flag:

    * enabled: newlines and inter-tag whitespace are removed, the HTML is
      parsed with ``html.parser`` and walked by ``SoupParser``; the text
      form of the parser is returned.
    * disabled: newlines not preceded by ``>`` become spaces, the HTML is
      parsed with ``lxml``, linearized by ``html_linear_parse`` and fed to
      ``add_to_rt``, whose result (started from an empty ``RichText``) is
      returned.
    """
    use_new_converter = get_config('new markdown to docx', False)
    source_code = docassemble.base.filter.markdown_to_html(text, do_terms=False)
    if use_new_converter:
        source_code = re.sub("\n", ' ', source_code)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        source_code = re.sub(r">\s+<", '><', source_code)
        soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
        parser = SoupParser(tpl)
        for elem in soup.find_all(recursive=False):
            parser.traverse(elem)
        return text_type(parser)
    source_code = re.sub(r'(?<!\>)\n', ' ', source_code)
    soup = BeautifulSoup(source_code, 'lxml')
    html_parsed = html_linear_parse(soup)
    return add_to_rt(tpl, RichText(''), html_parsed)
def mmdc(input_text, file_format='svg', flags=None):
    """Run the ``mmdc`` command on ``input_text`` and return a ``DAFile``.

    The text is written to a temporary ``.mmd`` file and the command is
    run with ``-p <home>/puppeteer-config.json -i <input> -o <output>``
    plus one ``-<key> <value>`` pair per entry of ``flags``.  The
    executable comes from the ``mmdc path`` configuration key (default
    ``mmdc``).  The input text and the command line are written to stderr.

    :param input_text: the diagram source; non-strings are passed to ``str``
    :param file_format: output extension; lowercase ASCII letters only
    :param flags: optional dict of extra command-line flags
    :return: a committed ``DAFile`` with the command's output
    :raises Exception: if ``flags`` is not a dict, ``file_format`` is
        invalid, the command fails, or the command produces an empty file
    """
    if flags is None:
        flags = {}
    if not isinstance(flags, dict):
        raise Exception("mmdc: flags not a dictionary")
    if not isinstance(file_format, str) or re.search(r'[^a-z]', file_format) or len(file_format) == 0:
        raise Exception("mmdc: invalid file format")
    if not isinstance(input_text, str):
        input_text = str(input_text)
    sys.stderr.write("Writing:\n" + input_text + "\n")
    temp_paths = []
    try:
        with tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix=".mmd", delete=False) as input_file:
            temp_paths.append(input_file.name)
            input_file.write(input_text)
        # Only the name is needed here; mmdc writes the file itself.
        with tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix="." + file_format, delete=False) as output_file:
            temp_paths.append(output_file.name)
        commands = [get_config('mmdc path', 'mmdc'), '-p', os.path.join(expanduser("~"), 'puppeteer-config.json'), '-i', input_file.name, '-o', output_file.name]
        for key, val in flags.items():
            commands.append('-' + str(key))
            # NOTE(review): repr() wraps the value in quote characters, and
            # no shell is involved to strip them, so they reach mmdc as-is.
            # Confirm this is intended.
            commands.append(repr(str(val)))
        sys.stderr.write("Commands are: " + " ".join(commands) + "\n")
        try:
            output = subprocess.check_output(commands, stderr=subprocess.STDOUT).decode()
        except subprocess.CalledProcessError as err:
            output = err.output.decode()
            raise Exception("mmdc: there was an error.  " + output)
        if os.path.getsize(output_file.name) == 0:
            raise Exception("mmdc: the command did not produce any output.  " + output)
        obj = DAFile()
        obj.set_random_instance_name()
        obj.initialize(extension=file_format)
        obj.copy_into(output_file.name)
        obj.commit()
        return obj
    finally:
        # Both files are created with delete=False, so remove them here.
        # Cleanup is best effort and must not mask the real outcome.
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                pass
def local_config(param_name: str, default=None, dev_mode: bool = False):
    """Look up ``param_name`` in the ``us-tx-family`` configuration section.

    Returns ``default`` when ``dev_mode`` is true (the configuration is not
    consulted at all), when the section is missing or empty, or when the
    section has no such key.
    """
    section = None if dev_mode else get_config('us-tx-family')
    if section:
        return section.get(param_name, default)
    return default
def get_service(org_slug, service_slug):
    """Fetch a service record from the configured ``exitpage`` API.

    Issues a GET to
    ``<exitpage>/api/v1/organizations/<org_slug>/services/<service_slug>``
    and returns the ``data`` member of the JSON response.

    Returns ``None`` when the status code is not 200, when the body is not
    valid JSON, or when the JSON is empty or not a mapping.
    """
    service_endpoint = (get_config("exitpage") + "/api/v1/organizations/" + str(org_slug) + "/services/" + str(service_slug))
    r = requests.get(service_endpoint)
    if r.status_code != 200:
        return None
    try:
        r_json = r.json()
        if r_json:
            return r_json.get("data")
    except (ValueError, AttributeError, TypeError):
        # Invalid JSON, or JSON that is not a mapping.  A bare except here
        # would also have swallowed KeyboardInterrupt and SystemExit.
        return None
    return None
def markdown_to_docx(text, question, tpl):
    """Convert Markdown ``text`` for insertion into the DOCX template ``tpl``.

    When the ``new markdown to docx`` configuration flag is set, the text
    is rendered to HTML (terms disabled), newlines and inter-tag whitespace
    are removed, the HTML is walked by ``SoupParser`` and the parser's text
    form is passed through ``docx_template_filter`` with ``question``.
    Otherwise the work is delegated to ``inline_markdown_to_docx``.
    """
    if not get_config('new markdown to docx', False):
        return inline_markdown_to_docx(text, question, tpl)
    source_code = docassemble.base.filter.markdown_to_html(text, do_terms=False)
    source_code = re.sub("\n", ' ', source_code)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    source_code = re.sub(r">\s+<", '><', source_code)
    soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
    parser = SoupParser(tpl)
    for elem in soup.find_all(recursive=False):
        parser.traverse(elem)
    output = str(parser)
    return docassemble.base.filter.docx_template_filter(output, question=question)
def get_eligibility_code(eligibility_id):
    """Fetch the eligibility code for ``eligibility_id`` from the ``exitpage`` API.

    Issues a GET to ``<exitpage>/api/v1/eligibility/<eligibility_id>`` and
    returns ``data.code`` from the JSON response when it is present and
    truthy.

    In every other case (non-200 status, invalid JSON, missing or empty
    ``data``/``code``) a default source string defining ``qualify()`` that
    returns ``-1`` is returned.
    """
    eligibility_endpoint = (get_config("exitpage") + "/api/v1/eligibility/" + str(eligibility_id))
    default_code = "def qualify():\n return -1"
    r = requests.get(eligibility_endpoint)
    if r.status_code != 200:
        return default_code
    try:
        r_json = r.json()
        data = r_json.get("data") if r_json else None
        code = data.get("code") if data else None
    except (ValueError, AttributeError, TypeError):
        # Invalid JSON, or JSON whose shape is not mapping-of-mapping.  A
        # bare except here would also have swallowed KeyboardInterrupt and
        # SystemExit.
        return default_code
    return code if code else default_code
def eligibility(name, **kwargs):
    """Evaluate the named eligibility rule through the ``exitpage`` API.

    Issues a GET to ``<exitpage>/api/v1/eligibility`` with query parameters
    ``name`` and ``args`` (the keyword arguments encoded as JSON) and
    returns ``data.value`` from the JSON response.

    Returns ``None`` when the status code is not 200, when the body is not
    valid JSON, or when ``data``/``value`` is missing.  A falsy ``value``
    (``0``, ``False``, ``""``) is also reported as ``None``.
    """
    endpoint = get_config("exitpage") + "/api/v1/eligibility"
    r = requests.get(endpoint, params={
        "name": name,
        "args": json.dumps(kwargs)
    })
    if r.status_code != 200:
        return None
    try:
        r_json = r.json()
        data = r_json.get("data") if r_json else None
        value = data.get("value") if data else None
    except (ValueError, AttributeError, TypeError):
        # Invalid JSON, or JSON whose shape is not mapping-of-mapping.  A
        # bare except here would also have swallowed KeyboardInterrupt and
        # SystemExit.
        return None
    return value if value else None
def markdown_to_docx(text, tpl):
    """Convert Markdown ``text`` for insertion into the DOCX template ``tpl``.

    The text is first rendered to HTML with ``markdown_to_html`` (terms
    disabled).  What happens next depends on the ``new markdown to docx``
    configuration flag:

    * enabled: newlines and inter-tag whitespace are removed, the HTML is
      parsed with ``html.parser`` and walked by ``SoupParser``; the text
      form of the parser is logged with ``logmessage`` and returned.
    * disabled: newlines not preceded by ``>`` become spaces, the HTML is
      parsed with ``lxml``, linearized by ``html_linear_parse`` and fed to
      ``add_to_rt``, whose result (started from an empty ``RichText``) is
      returned.
    """
    use_new_converter = get_config('new markdown to docx', False)
    source_code = docassemble.base.filter.markdown_to_html(text, do_terms=False)
    if use_new_converter:
        source_code = re.sub("\n", ' ', source_code)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        source_code = re.sub(r">\s+<", '><', source_code)
        soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
        parser = SoupParser(tpl)
        for elem in soup.find_all(recursive=False):
            parser.traverse(elem)
        output = text_type(parser)
        logmessage(output)
        return output
    source_code = re.sub(r'(?<!\>)\n', ' ', source_code)
    soup = BeautifulSoup(source_code, 'lxml')
    html_parsed = html_linear_parse(soup)
    return add_to_rt(tpl, RichText(''), html_parsed)
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, **kwargs):
    """Return one OCR task description (a dict) per page to be processed.

    ``image_file`` must be a ``DAFile`` or ``DAFileList``; otherwise the
    translated message "(Not a DAFile or DAFileList object)" is returned
    instead of a list.

    The OCR language is chosen in this order: ``language`` itself when it
    is one of the available languages, the ``ocr languages`` configuration
    mapping, the three-letter code reported by ``pycountry``, and finally
    ``'eng'`` (or the first available language).  The two last-resort
    fallbacks are reported on stderr.

    PDF files contribute one task per page (pages are numbered from 1);
    other image files contribute a single task with ``page=None``.  Items
    without an ``extension`` attribute are skipped.

    Raises ``Exception`` when a file has an unsupported extension.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = []
    for doc in image_file:
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            # Count the pages while the file is open, then close it; the
            # handle used to be left open for the garbage collector.
            with open(doc.path(), 'rb') as pdf_stream:
                page_count = PdfFileReader(pdf_stream).getNumPages()
            for i in range(page_count):
                todo.append(dict(doc=doc, page=i + 1, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
        else:
            todo.append(dict(doc=doc, page=None, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
    return todo
def sendQuery(filename, number=0):
    """Run s(CASP) on ``filename`` and return its parsed output as a dict.

    The executable is ``scasp`` when ``no_docassemble`` is set; otherwise
    the ``location`` entry of the ``scasp`` configuration section, falling
    back to ``/var/www/.ciao/build/bin/scasp``.  It is invoked with
    ``--human --tree -s<number> <filename>``.

    ``daSCASP_<token>`` names in the output are decoded (``__perc__`` ->
    ``%``, ``__plus__`` -> ``+``, then URL-unquoted) before parsing.

    Returns ``{'query': ..., 'result': 'No'}`` when the output ends with
    ``no models``.  Otherwise returns ``{'query': ..., 'result': 'Yes',
    'answers': [...]}`` where each answer groups the models that share the
    same set of bindings::

        {'bindings': [...],
         'models': [{'time': ..., 'model': [...], 'explanations': ...}],
         'nlg_answer': <query text with the bindings substituted>}
    """
    number_flag = "-s" + str(number)
    if no_docassemble:
        scasp_location = "scasp"
    else:
        # Read the config once, and tolerate a section without 'location'.
        scasp_config = get_config('scasp') or {}
        scasp_location = scasp_config.get('location') or '/var/www/.ciao/build/bin/scasp'
    results = subprocess.run(
        [scasp_location, '--human', '--tree', number_flag, filename],
        capture_output=True).stdout.decode('utf-8')

    # Decode every encoded name in a single pass.  Replacing token by token
    # with str.replace corrupted tokens that start with another token.
    pattern = re.compile(r"daSCASP_([^),\s]*)")
    results = pattern.sub(
        lambda m: urllib.parse.unquote_plus(
            m.group(1).replace('__perc__', '%').replace('__plus__', '+')),
        results)

    # If result is no models
    if results.endswith('no models\n\n'):
        query = results.replace('\n\nno models\n\n', '').replace(
            '\n ', '').replace('QUERY:', '').replace('{', '').replace(
                '}', '').replace('% ', '')
        return {'query': query, 'result': 'No'}

    # Divide up the remainder into the query and the individual answers
    query, *answers = results.split("\tANSWER:\t")
    # NOTE(review): replace(' ', ' ') is a no-op as written; it may have
    # been meant to collapse runs of spaces. Confirm against the original.
    query = query.replace('\n', '').replace(' ', ' ').replace(
        'QUERY:', '').replace('% ', '').replace('{', '').replace('}', '')

    parsed_answers = []
    for a in answers:
        # Separate out the time, tree, model, and bindings
        answer_parts = a.split('\n\nJUSTIFICATION_TREE:\n')
        elapsed = answer_parts[0]
        answer_parts = answer_parts[1].split('\n\nMODEL:\n')
        tree = answer_parts[0]
        answer_parts = answer_parts[1].split('\n\nBINDINGS:')
        model = answer_parts[0]
        # The bindings may not exist
        bindings = answer_parts[1].splitlines() if len(answer_parts) > 1 else []

        elapsed = elapsed.replace(' ms)', '').replace('(in ', '').split(' ')[1]
        explanations = display_list(make_tree(tree))
        model = model.replace('{ ', '').replace(' }', '').split(', ')
        bindings = [
            b.replace(' equal ', ': ') for b in bindings
            if b != '' and b != ' '
        ]
        # 'bindings' is always present (possibly empty), so answers without
        # variables no longer raise KeyError during grouping.
        parsed_answers.append({
            'time': elapsed,
            'model': model,
            'bindings': bindings,
            'explanations': explanations
        })

    # Reorganize so that bindings are a level above models and explanations.
    # Bindings are compared with sorted(): list.sort() returns None, so the
    # previous ".sort() == .sort()" test was always true and every answer
    # after the first was merged into, or dropped from, the first group.
    grouped = []
    for answer in parsed_answers:
        key = sorted(answer['bindings'])
        for group in grouped:
            if sorted(group['bindings']) == key:
                break
        else:
            group = {'bindings': answer['bindings'], 'models': []}
            grouped.append(group)
        group['models'].append({
            'time': answer['time'],
            'model': answer['model'],
            'explanations': answer['explanations']
        })

    for group in grouped:
        nlg_answer = query.replace('I would like to know if ', '')
        for b in group['bindings']:
            splitbinding = b.split(': ')
            if len(splitbinding) < 2:
                # Not a "name: value" line; nothing to substitute.
                continue
            nlg_answer = nlg_answer.replace(splitbinding[0], splitbinding[1])
        group['nlg_answer'] = nlg_answer

    return {'query': query, 'result': 'Yes', 'answers': grouped}