Beispiel #1
0
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, user=None, pdf=False, preserve_color=False, **kwargs):
    """Build the list of per-page OCR task descriptions for the given file(s).

    :param image_file: a DAFile, a DAFileList or a list of such objects.  A
        ``set`` yields an empty list; any other type yields the string
        produced by ``word("(Not a DAFile, DAFileList, or list object)")``.
    :param language: language code to OCR in; when None, ``get_language()``
        is used.  It is mapped onto one of ``get_available_languages()``.
    :param psm, x, y, W, H, user_code, user, pdf, preserve_color: copied
        unchanged into every task dictionary.
    :param kwargs: accepted and ignored.
    :return: a list of dicts, one per page for PDFs and word-processing
        documents (``page`` is 1-based) and one per image (``page`` is None).
    :raises Exception: if a document has an unsupported extension.
    """
    if isinstance(image_file, set):
        return []
    if not isinstance(image_file, (DAFile, DAFileList, list)):
        return word("(Not a DAFile, DAFileList, or list object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()

    def fallback_language():
        # Prefer English; otherwise take whatever language is listed first.
        return 'eng' if 'eng' in langs else langs[0]

    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = {}
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                # Treat the code as ISO 639-1 and try its three-letter form.
                three_letter = pycountry.languages.get(alpha_2=language).alpha_3
                if three_letter in langs:
                    lang = three_letter
                else:
                    lang = fallback_language()
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                lang = fallback_language()
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")

    documents = [image_file] if isinstance(image_file, DAFile) else image_file
    # Settings that are identical for every task produced by this call.
    shared = dict(lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code, user=user, pdf=pdf, preserve_color=preserve_color)
    todo = []
    for doc in documents:
        # Items without an ``extension`` attribute are skipped silently.
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif', 'docx', 'doc', 'odt', 'rtf']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            paged_source = doc
        elif doc.extension in ("docx", "doc", "odt", "rtf"):
            # Word-processing files go through pdf_concatenate() first and the
            # tasks then refer to the object it returns.
            paged_source = docassemble.base.util.pdf_concatenate(doc)
        else:
            todo.append(dict(doc=doc, page=None, **shared))
            continue
        for page_index in range(safe_pypdf_reader(paged_source.path()).getNumPages()):
            todo.append(dict(doc=paged_source, page=page_index + 1, **shared))
    return todo
def send_answers():
    """Post the interview's answers to the Community.lawyer endpoint.

    The answers come from ``all_variables(simplify=False)``.  Before they are
    serialized to JSON:

    * DAFile / DAFileList values are replaced by their path with ``/tmp/``
      removed, prefixed by ``s3://<bucket>/`` when an ``s3`` configuration
      is present;
    * DADict values whose first value is a bool are replaced by the list of
      keys whose value is ``True``.

    :return: the response object returned by ``requests.post``.
    """
    docassemble_api_key = get_config("docassemble api key")
    answers = all_variables(simplify=False)
    endpoint = "https://community.lawyer/docassemble_answers/new/"

    # Iterate over snapshots because values are replaced while looping.
    for q, a in list(answers.items()):
        if isinstance(a, (DAFileList, DAFile)):
            s3_config = get_config('s3')
            local_path = a.path().replace("/tmp/", "")
            if s3_config:
                answers[q] = "s3://%s/%s" % (s3_config['bucket'], local_path)
            else:
                answers[q] = local_path
    for q, a in list(answers.items()):
        if not isinstance(a, DADict):
            continue
        # Peek at the first value without indexing: ``values()`` may be a
        # non-subscriptable view, and the dictionary may be empty.
        first_value = next(iter(a.values()), None)
        if isinstance(first_value, bool):
            answers[q] = [
                box_name for box_name, box_value in a.items()
                if box_value is True
            ]

    # Anything json cannot encode natively is sent as its str() form.
    answers = json.dumps(answers, default=str)
    metadata = json.dumps(all_variables(special='metadata'), default=str)
    logged_in_user = json.dumps(get_user_info_hash())

    return requests.post(endpoint,
                         data={
                             'docassemble_api_key': docassemble_api_key,
                             'answers': answers,
                             'metadata': metadata,
                             'respondent': logged_in_user
                         })
Beispiel #3
0
def ocr_page_tasks(image_file, language=None, psm=6, x=None, y=None, W=None, H=None, user_code=None, **kwargs):
    """Build the list of per-page OCR task descriptions for the given file(s).

    :param image_file: a DAFile or DAFileList.  Any other type yields the
        string produced by ``word("(Not a DAFile or DAFileList object)")``.
    :param language: language code to OCR in; when None, ``get_language()``
        is used.  It is mapped onto one of ``get_available_languages()``.
    :param psm, x, y, W, H, user_code: copied unchanged into every task.
    :param kwargs: accepted and ignored.
    :return: a list of dicts, one per PDF page (``page`` is 1-based) and one
        per image file (``page`` is None).
    :raises Exception: if a document has an unsupported extension.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = dict()
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                # Treat the code as ISO 639-1 and try its three-letter form.
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                # Best effort: any lookup failure falls back to a default.
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write("ocr_file: could not get OCR language for language " + str(language) + "; using language " + str(lang) + "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = list()
    for doc in image_file:
        # Items without an ``extension`` attribute are skipped silently.
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension + " is not a readable image file")
        if doc.extension == 'pdf':
            # Close the PDF as soon as the page count is known instead of
            # leaking the file handle.
            with open(doc.path(), 'rb') as pdf_handle:
                num_pages = PdfFileReader(pdf_handle).getNumPages()
            # range() rather than xrange() so this also runs on Python 3.
            for i in range(num_pages):
                todo.append(dict(doc=doc, page=i+1, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
        else:
            todo.append(dict(doc=doc, page=None, lang=lang, ocr_resolution=ocr_resolution, psm=psm, x=x, y=y, W=W, H=H, pdf_to_ppm=pdf_to_ppm, user_code=user_code))
    return todo
Beispiel #4
0
def send_answers(variables_to_reject=None, include_logged_in_user=True):
    '''
  Sends your Docassemble user's serialized answers to your Community.lawyer account

  Before serialization, DAFile / DAFileList values are replaced by their
  path (``/tmp/`` removed, prefixed with ``s3://<bucket>/`` when an ``s3``
  configuration is present), DADict values whose first value is a bool are
  replaced by the list of keys whose value is ``True``, and any value that
  ``json.dumps`` cannot encode is replaced by its ``str()`` form.

  :param list variables_to_reject: A list of variables to exclude from sending to the server, defaults to none
  :param bool include_logged_in_user: When true, the JSON-encoded result of ``get_user_info_hash()`` is sent as ``respondent``; otherwise an empty dict is passed instead
  :return: the response object returned by ``requests.post``
  '''
    docassemble_api_key = get_config("docassemble api key")
    answers = all_variables(simplify=False)
    endpoint = "https://community.lawyer/docassemble_answers/new/"

    if variables_to_reject:
        # .items() rather than the Python-2-only .iteritems().
        answers = {
            k: v
            for k, v in answers.items() if k not in variables_to_reject
        }

    # Iterate over snapshots because values are replaced while looping.
    for q, a in list(answers.items()):
        if isinstance(a, (DAFileList, DAFile)):
            s3_config = get_config('s3')
            local_path = a.path().replace("/tmp/", "")
            if s3_config:
                answers[q] = "s3://%s/%s" % (s3_config['bucket'], local_path)
            else:
                answers[q] = local_path
    for q, a in list(answers.items()):
        if not isinstance(a, DADict):
            continue
        # Peek at the first value without indexing: ``values()`` may be a
        # non-subscriptable view, and the dictionary may be empty.
        first_value = next(iter(a.values()), None)
        if isinstance(first_value, bool):
            answers[q] = [
                box_name for box_name, box_value in a.items()
                if box_value is True
            ]
    for q, a in list(answers.items()):
        try:
            json.dumps(a)
        except Exception:
            # Best effort: fall back to the string form, but let
            # KeyboardInterrupt / SystemExit propagate.
            answers[q] = str(a)

    answers = json.dumps(answers, default=str)
    metadata = json.dumps(all_variables(special='metadata'), default=str)
    logged_in_user = json.dumps(
        get_user_info_hash()) if include_logged_in_user else {}

    return requests.post(endpoint,
                         data={
                             'docassemble_api_key': docassemble_api_key,
                             'answers': answers,
                             'metadata': metadata,
                             'respondent': logged_in_user
                         })
Beispiel #5
0
def get_ocr_language(language):
    """Map a language code onto one of the available OCR languages.

    The code is accepted as-is when it is in ``get_available_languages()``;
    otherwise it is looked up in the ``ocr languages`` configuration, and
    finally converted from a two-letter to a three-letter code through
    ``pycountry``.

    :param language: language code, or None to use ``get_language()``.
    :return: the matching entry of the available OCR languages.
    :raises Exception: whenever none of the lookups succeeds.  A fallback
        language is computed ('eng' if available, else the first available
        one) but it only appears in the error message; it is never returned.
    """
    langs = get_available_languages()
    if language is None:
        language = get_language()
    ocr_langs = get_config("ocr languages")
    if ocr_langs is None:
        ocr_langs = dict()
    if language in langs:
        return language
    if language in ocr_langs and ocr_langs[language] in langs:
        return ocr_langs[language]

    def default_language():
        # Prefer English; otherwise take whatever language is listed first.
        return 'eng' if 'eng' in langs else langs[0]

    try:
        three_letter = pycountry.languages.get(alpha_2=language).alpha_3
        if three_letter in langs:
            lang = three_letter
        else:
            lang = default_language()
            # Caught by the handler below, which re-raises with the
            # original message appended.
            raise Exception(
                "could not get OCR language for language " +
                str(language) + "; using language " + str(lang))
    except Exception as the_error:
        lang = default_language()
        raise Exception("could not get OCR language for language " +
                        str(language) + "; using language " +
                        str(lang) + "; error was " + str(the_error))
    return lang
Beispiel #6
0
def mmdc(input_text, file_format='svg', flags=None):
    """Render diagram source text to a file by running the ``mmdc`` command.

    :param input_text: the diagram source; non-string values are converted
        with ``text_type``.
    :param file_format: output file extension; lowercase letters a-z only.
    :param flags: optional dict of extra command-line options.  Each key is
        passed as ``-<key>`` followed by ``repr(str(value))``.
    :return: a committed DAFile holding the generated output.
    :raises Exception: if ``flags`` is not a dict, the format is invalid,
        the command exits with an error, or the command writes no output.
    """
    if flags is None:
        flags = {}
    if not isinstance(flags, dict):
        raise Exception("mmdc: flags not a dictionary")
    if not isinstance(file_format, string_types) or re.search(r'[^a-z]', file_format) or len(file_format) == 0:
        raise Exception("mmdc: invalid file format")
    if not isinstance(input_text, string_types):
        input_text = text_type(input_text)
    sys.stderr.write("Writing:\n" + input_text + "\n")
    # The temp files are created with delete=False, so they must be removed
    # explicitly once they are no longer needed.
    temp_paths = []
    try:
        input_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix=".mmd", delete=False)
        temp_paths.append(input_file.name)
        try:
            input_file.write(input_text)
        finally:
            input_file.close()
        output_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix="." + file_format, delete=False)
        temp_paths.append(output_file.name)
        output_file.close()
        commands = [get_config('mmdc path', 'mmdc'), '-p', os.path.join(expanduser("~"), 'puppeteer-config.json'), '-i', input_file.name, '-o', output_file.name]
        for key, val in flags.items():
            commands.append('-' + str(key))
            commands.append(repr(str(val)))
        sys.stderr.write("Commands are: " + " ".join(commands) + "\n")
        try:
            output = subprocess.check_output(commands, stderr=subprocess.STDOUT).decode()
        except subprocess.CalledProcessError as err:
            output = err.output.decode()
            raise Exception("mmdc: there was an error.  " + output)
        if os.path.getsize(output_file.name) == 0:
            raise Exception("mmdc: the command did not produce any output.  " + output)
        obj = DAFile()
        obj.set_random_instance_name()
        obj.initialize(extension=file_format)
        # NOTE(review): assumes copy_into() copies the contents, so the
        # temporary output file can be deleted afterwards -- confirm.
        obj.copy_into(output_file.name)
        obj.commit()
        return obj
    finally:
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                # Cleanup is best effort and must not mask the real error.
                pass
Beispiel #7
0
def markdown_to_docx(text, tpl):
    """Convert Markdown text for insertion into a docx template.

    The text is first turned into HTML with
    ``docassemble.base.filter.markdown_to_html(text, do_terms=False)``.

    :param text: the Markdown source.
    :param tpl: the template object handed to ``SoupParser`` or
        ``add_to_rt``.
    :return: when the ``new markdown to docx`` configuration is enabled,
        the ``text_type`` form of a ``SoupParser`` that traversed the HTML;
        otherwise the value returned by ``add_to_rt`` for a fresh
        ``RichText``.
    """
    use_new_converter = get_config('new markdown to docx', False)
    source_code = docassemble.base.filter.markdown_to_html(text,
                                                           do_terms=False)
    if use_new_converter:
        source_code = re.sub("\n", ' ', source_code)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        source_code = re.sub(r">\s+<", '><', source_code)
        soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
        parser = SoupParser(tpl)
        for elem in soup.find_all(recursive=False):
            parser.traverse(elem)
        return text_type(parser)
    # Join lines, except for newlines that directly follow a closing '>'.
    source_code = re.sub(r'(?<!\>)\n', ' ', source_code)
    rt = RichText('')
    soup = BeautifulSoup(source_code, 'lxml')
    html_parsed = html_linear_parse(soup)
    return add_to_rt(tpl, rt, html_parsed)
Beispiel #8
0
def mmdc(input_text, file_format='svg', flags=None):
    """Render diagram source text to a file by running the ``mmdc`` command.

    :param input_text: the diagram source; non-string values are converted
        with ``str``.
    :param file_format: output file extension; lowercase letters a-z only.
    :param flags: optional dict of extra command-line options.  Each key is
        passed as ``-<key>`` followed by ``repr(str(value))``.
    :return: a committed DAFile holding the generated output.
    :raises Exception: if ``flags`` is not a dict, the format is invalid,
        the command exits with an error, or the command writes no output.
    """
    if flags is None:
        flags = {}
    if not isinstance(flags, dict):
        raise Exception("mmdc: flags not a dictionary")
    if not isinstance(file_format, str) or re.search(r'[^a-z]', file_format) or len(file_format) == 0:
        raise Exception("mmdc: invalid file format")
    if not isinstance(input_text, str):
        input_text = str(input_text)
    sys.stderr.write("Writing:\n" + input_text + "\n")
    # The temp files are created with delete=False, so they must be removed
    # explicitly once they are no longer needed.
    temp_paths = []
    try:
        input_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix=".mmd", delete=False)
        temp_paths.append(input_file.name)
        try:
            input_file.write(input_text)
        finally:
            input_file.close()
        output_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="w", suffix="." + file_format, delete=False)
        temp_paths.append(output_file.name)
        output_file.close()
        commands = [get_config('mmdc path', 'mmdc'), '-p', os.path.join(expanduser("~"), 'puppeteer-config.json'), '-i', input_file.name, '-o', output_file.name]
        for key, val in flags.items():
            commands.append('-' + str(key))
            commands.append(repr(str(val)))
        sys.stderr.write("Commands are: " + " ".join(commands) + "\n")
        try:
            output = subprocess.check_output(commands, stderr=subprocess.STDOUT).decode()
        except subprocess.CalledProcessError as err:
            output = err.output.decode()
            raise Exception("mmdc: there was an error.  " + output)
        if os.path.getsize(output_file.name) == 0:
            raise Exception("mmdc: the command did not produce any output.  " + output)
        obj = DAFile()
        obj.set_random_instance_name()
        obj.initialize(extension=file_format)
        # NOTE(review): assumes copy_into() copies the contents, so the
        # temporary output file can be deleted afterwards -- confirm.
        obj.copy_into(output_file.name)
        obj.commit()
        return obj
    finally:
        for temp_path in temp_paths:
            try:
                os.remove(temp_path)
            except OSError:
                # Cleanup is best effort and must not mask the real error.
                pass
def local_config(param_name: str, default=None, dev_mode: bool = False):
    """Look up one entry of the ``us-tx-family`` configuration section.

    :param param_name: key to read from the section.
    :param default: value returned when the key, or the whole section, is
        missing or when ``dev_mode`` is set.
    :param dev_mode: when true, the configuration is not consulted at all.
    :return: the configured value or ``default``.
    """
    if not dev_mode:
        section = get_config('us-tx-family')
        if section:
            return section.get(param_name, default)
    return default
def get_service(org_slug, service_slug):
    """Fetch a service record from the API configured under ``exitpage``.

    :param org_slug: organization identifier placed in the URL.
    :param service_slug: service identifier placed in the URL.
    :return: the ``data`` member of the JSON response, or None when the
        status is not 200, the body is empty, or it cannot be read as a
        JSON object.
    """
    service_endpoint = (get_config("exitpage") + "/api/v1/organizations/" +
                        str(org_slug) + "/services/" + str(service_slug))
    r = requests.get(service_endpoint)
    if r.status_code != 200:
        return None
    try:
        r_json = r.json()
        if r_json:
            return r_json.get("data")
    except Exception:
        # Best effort: a malformed or unexpected body counts as "no data".
        # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit still
        # propagate.
        return None

    return None
Beispiel #11
0
def markdown_to_docx(text, question, tpl):
    """Convert Markdown text for insertion into a docx template.

    :param text: the Markdown source.
    :param question: passed on to ``docx_template_filter`` or
        ``inline_markdown_to_docx``.
    :param tpl: the template object handed to ``SoupParser`` or
        ``inline_markdown_to_docx``.
    :return: when the ``new markdown to docx`` configuration is enabled,
        the result of ``docx_template_filter`` applied to the string form
        of a ``SoupParser`` that traversed the generated HTML; otherwise
        the result of ``inline_markdown_to_docx``.
    """
    if not get_config('new markdown to docx', False):
        return inline_markdown_to_docx(text, question, tpl)
    source_code = docassemble.base.filter.markdown_to_html(text, do_terms=False)
    source_code = re.sub("\n", ' ', source_code)
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    source_code = re.sub(r">\s+<", '><', source_code)
    soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
    parser = SoupParser(tpl)
    for elem in soup.find_all(recursive=False):
        parser.traverse(elem)
    output = str(parser)
    return docassemble.base.filter.docx_template_filter(output, question=question)
def get_eligibility_code(eligibility_id):
    """Fetch eligibility source code from the API configured under ``exitpage``.

    :param eligibility_id: identifier placed in the URL.
    :return: the non-empty ``data.code`` member of the JSON response, or a
        default ``qualify()`` definition returning -1 when the status is
        not 200 or the response has no usable code.
    """
    eligibility_endpoint = (get_config("exitpage") + "/api/v1/eligibility/" +
                            str(eligibility_id))
    default_code = "def qualify():\n  return -1"
    r = requests.get(eligibility_endpoint)
    if r.status_code != 200:
        return default_code
    try:
        r_json = r.json()
        data = r_json.get("data") if r_json else None
        code = data.get("code") if data else None
        if code:
            return code
    except Exception:
        # Best effort: a malformed or unexpected body yields the default.
        # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit still
        # propagate.
        return default_code

    return default_code
def eligibility(name, **kwargs):
    """Query the eligibility API configured under ``exitpage``.

    :param name: sent as the ``name`` query parameter.
    :param kwargs: JSON-encoded and sent as the ``args`` query parameter.
    :return: the ``data.value`` member of the JSON response when it is
        truthy; None otherwise (non-200 status, unreadable body, or a
        missing or falsy value).
    """
    endpoint = get_config("exitpage") + "/api/v1/eligibility"
    r = requests.get(endpoint,
                     params={
                         "name": name,
                         "args": json.dumps(kwargs)
                     })
    if r.status_code != 200:
        return None
    try:
        r_json = r.json()
        data = r_json.get("data") if r_json else None
        value = data.get("value") if data else None
        if value:
            return value
    except Exception:
        # Best effort: a malformed or unexpected body counts as "no value".
        # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit still
        # propagate.
        return None

    return None
Beispiel #14
0
def markdown_to_docx(text, tpl):
    """Convert Markdown text for insertion into a docx template.

    The text is first turned into HTML with
    ``docassemble.base.filter.markdown_to_html(text, do_terms=False)``.

    :param text: the Markdown source.
    :param tpl: the template object handed to ``SoupParser`` or
        ``add_to_rt``.
    :return: when the ``new markdown to docx`` configuration is enabled,
        the ``text_type`` form of a ``SoupParser`` that traversed the HTML
        (also passed to ``logmessage``); otherwise the value returned by
        ``add_to_rt`` for a fresh ``RichText``.
    """
    use_new_converter = get_config('new markdown to docx', False)
    source_code = docassemble.base.filter.markdown_to_html(text, do_terms=False)
    if use_new_converter:
        source_code = re.sub("\n", ' ', source_code)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        source_code = re.sub(r">\s+<", '><', source_code)
        soup = BeautifulSoup('<html>' + source_code + '</html>', 'html.parser')
        parser = SoupParser(tpl)
        for elem in soup.find_all(recursive=False):
            parser.traverse(elem)
        output = text_type(parser)
        logmessage(output)
        return output
    # Join lines, except for newlines that directly follow a closing '>'.
    source_code = re.sub(r'(?<!\>)\n', ' ', source_code)
    rt = RichText('')
    soup = BeautifulSoup(source_code, 'lxml')
    html_parsed = html_linear_parse(soup)
    return add_to_rt(tpl, rt, html_parsed)
Beispiel #15
0
def ocr_page_tasks(image_file,
                   language=None,
                   psm=6,
                   x=None,
                   y=None,
                   W=None,
                   H=None,
                   user_code=None,
                   **kwargs):
    """Build the list of per-page OCR task descriptions for the given file(s).

    :param image_file: a DAFile or DAFileList.  Any other type yields the
        string produced by ``word("(Not a DAFile or DAFileList object)")``.
    :param language: language code to OCR in; when None, ``get_language()``
        is used.  It is mapped onto one of ``get_available_languages()``.
    :param psm, x, y, W, H, user_code: copied unchanged into every task.
    :param kwargs: accepted and ignored.
    :return: a list of dicts, one per PDF page (``page`` is 1-based) and one
        per image file (``page`` is None).
    :raises Exception: if a document has an unsupported extension.
    """
    if not isinstance(image_file, (DAFile, DAFileList)):
        return word("(Not a DAFile or DAFileList object)")
    pdf_to_ppm = get_config("pdftoppm")
    if pdf_to_ppm is None:
        pdf_to_ppm = 'pdftoppm'
    ocr_resolution = get_config("ocr dpi")
    if ocr_resolution is None:
        ocr_resolution = '300'
    langs = get_available_languages()
    if language is None:
        language = get_language()
    if language in langs:
        lang = language
    else:
        ocr_langs = get_config("ocr languages")
        if ocr_langs is None:
            ocr_langs = dict()
        if language in ocr_langs and ocr_langs[language] in langs:
            lang = ocr_langs[language]
        else:
            try:
                # Treat the code as ISO 639-1 and try its three-letter form.
                pc_lang = pycountry.languages.get(alpha_2=language)
                lang_three_letter = pc_lang.alpha_3
                if lang_three_letter in langs:
                    lang = lang_three_letter
                else:
                    lang = 'eng' if 'eng' in langs else langs[0]
                    sys.stderr.write(
                        "ocr_file: could not get OCR language for language " +
                        str(language) + "; using language " + str(lang) + "\n")
            except Exception as the_error:
                # Best effort: any lookup failure falls back to a default.
                lang = 'eng' if 'eng' in langs else langs[0]
                sys.stderr.write(
                    "ocr_file: could not get OCR language for language " +
                    str(language) + "; using language " + str(lang) +
                    "; error was " + str(the_error) + "\n")
    if isinstance(image_file, DAFile):
        image_file = [image_file]
    todo = list()
    for doc in image_file:
        # Items without an ``extension`` attribute are skipped silently.
        if not hasattr(doc, 'extension'):
            continue
        if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
            raise Exception("document with extension " + doc.extension +
                            " is not a readable image file")
        if doc.extension == 'pdf':
            # Close the PDF as soon as the page count is known instead of
            # leaking the file handle.
            with open(doc.path(), 'rb') as pdf_handle:
                num_pages = PdfFileReader(pdf_handle).getNumPages()
            for i in range(num_pages):
                todo.append(
                    dict(doc=doc,
                         page=i + 1,
                         lang=lang,
                         ocr_resolution=ocr_resolution,
                         psm=psm,
                         x=x,
                         y=y,
                         W=W,
                         H=H,
                         pdf_to_ppm=pdf_to_ppm,
                         user_code=user_code))
        else:
            todo.append(
                dict(doc=doc,
                     page=None,
                     lang=lang,
                     ocr_resolution=ocr_resolution,
                     psm=psm,
                     x=x,
                     y=y,
                     W=W,
                     H=H,
                     pdf_to_ppm=pdf_to_ppm,
                     user_code=user_code))
    return todo
def sendQuery(filename, number=0):
    """Run the s(CASP) executable on a file and parse its textual output.

    The executable is ``scasp`` when ``no_docassemble`` is set; otherwise it
    is the ``location`` entry of the ``scasp`` configuration, defaulting to
    ``/var/www/.ciao/build/bin/scasp``.  It is run with ``--human``,
    ``--tree`` and ``-s<number>``.

    :param filename: path handed to the executable.
    :param number: value appended to the ``-s`` flag.
    :return: ``{'query': ..., 'result': 'No'}`` when the output ends with
        "no models".  Otherwise ``{'query': ..., 'result': 'Yes',
        'answers': [...]}`` where each entry groups the answers that share
        the same set of bindings::

            {'bindings': [...],
             'models': [{'time', 'model', 'explanations'}, ...],
             'nlg_answer': <query text with the bindings substituted>}
    """
    number_flag = "-s" + str(number)
    if no_docassemble:
        scasp_location = "scasp"
    else:
        # Read the configuration once; a missing section or a missing/empty
        # 'location' entry falls back to the default install path.
        scasp_config = get_config('scasp')
        scasp_location = ((scasp_config and scasp_config.get('location'))
                          or '/var/www/.ciao/build/bin/scasp')
    results = subprocess.run(
        [scasp_location, '--human', '--tree', number_flag, filename],
        capture_output=True).stdout.decode('utf-8')

    # Decode every daSCASP_-prefixed token in a single pass.  Substituting
    # match by match avoids corrupting a longer token that merely starts
    # with the text of a shorter one.
    results = re.sub(
        r"daSCASP_([^),\s]*)",
        lambda m: urllib.parse.unquote_plus(
            m.group(1).replace('__perc__', '%').replace('__plus__', '+')),
        results)

    # If result is no models
    if results.endswith('no models\n\n'):
        query = results.replace('\n\nno models\n\n', '').replace(
            '\n    ',
            '').replace('QUERY:',
                        '').replace('{', '').replace('}',
                                                     '').replace('% ', '')
        return {'query': query, 'result': 'No'}

    # Divide up the remainder into individual answers; the text before the
    # first answer marker is the query itself.
    raw_answers = results.split("\tANSWER:\t")
    query = raw_answers.pop(0)
    query = query.replace('\n', '').replace('     ', ' ').replace(
        'QUERY:', '').replace('% ', '').replace('{', '').replace('}', '')

    parsed_answers = []
    for raw in raw_answers:
        # Separate out the time, tree, model, and bindings
        parts = raw.split('\n\nJUSTIFICATION_TREE:\n')
        elapsed = parts[0]
        parts = parts[1].split('\n\nMODEL:\n')
        tree = parts[0]
        parts = parts[1].split('\n\nBINDINGS:')
        model = parts[0]
        # The bindings may not exist
        bindings = parts[1].splitlines() if len(parts) > 1 else []

        # Reformat the Time
        elapsed = elapsed.replace(' ms)', '').replace('(in ',
                                                      '').split(' ')[1]
        # Reformat the Tree
        explanations = display_list(make_tree(tree))
        # Reformat the Model
        model = model.replace('{ ', '').replace(' }', '').split(',  ')
        # Reformat the Bindings
        bindings = [
            b.replace(' equal ', ': ') for b in bindings
            if b != '' and b != ' '
        ]

        # 'bindings' is always present (possibly empty) so that the
        # grouping below never hits a missing key.
        parsed_answers.append({
            'time': elapsed,
            'model': model,
            'bindings': bindings,
            'explanations': explanations
        })

    # Reorganize the tree so that bindings are a level above models and
    # explanations.
    new_output = {'query': query, 'result': 'Yes', 'answers': []}
    for answer in parsed_answers:
        # Compare sorted copies: list.sort() returns None, so comparing the
        # results of two .sort() calls would always be true.
        key = sorted(answer['bindings'])
        for group in new_output['answers']:
            if sorted(group['bindings']) == key:
                break
        else:
            group = {'bindings': answer['bindings'], 'models': []}
            new_output['answers'].append(group)
        group['models'].append({
            'time': answer['time'],
            'model': answer['model'],
            'explanations': answer['explanations']
        })

    for group in new_output['answers']:
        nlg_answer = new_output['query'].replace('I would like to know if ',
                                                 '')
        for binding in group['bindings']:
            name, separator, value = binding.partition(': ')
            # Skip lines that are not of the form "<name>: <value>".
            if separator:
                nlg_answer = nlg_answer.replace(name, value)
        group['nlg_answer'] = nlg_answer

    return new_output