from fileReader import FileReader

reader = FileReader()

# Phrases to search for in the document.
to_find = ["Trademarks and Trade Names."]
# Complete path to the file to search; can be a PDF or a Word document.
complete_path = "v5_signage_bad4.docx"

# headers_only=True means find() will return only headings.
result = reader.find(path=complete_path, to_find=to_find, headers_only=True)

# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python 2-only `print line` statement.
for line in result:
    print(line)
Example #2
0
def _strip_label_punctuation(label):
    """Return *label* with every '(', ')' and '.' character removed."""
    return label.replace("(", "").replace(")", "").replace(".", "")


def _resolve_clause_number(reader, clause, cleaned_text):
    """Build the clause-number string for one clause.

    The leading number of *cleaned_text* (forms ``1.``, ``1.2`` or ``1.2.3``)
    is combined with the labels returned by
    ``reader.find_with(clause['text'], [clause['key_text']], ...)``.

    Any exception other than the ``ValueError`` handled below propagates to
    the caller, which uses it as the signal to fall back to a header search.
    """
    match = re.match(
        r"([0-9]+\.[0-9]+\.[0-9]+|[0-9]+\.[0-9]+|[0-9]+\.)",
        cleaned_text)
    if match is not None:
        clause_no = match.group()
        # Dots are dropped before conversion, so "1.2." becomes 12.
        leading_number = int(clause_no.replace('.', ''))
    else:
        clause_no = ""
        leading_number = 0

    found = reader.find_with(clause['text'], [clause['key_text']],
                             headers_only=False)
    if len(found) > 0 and len(found[0]) > 0:
        joined = ','.join(found)
        bare_label = _strip_label_punctuation(found[0])
        try:
            # find() == 0 holds when the label minus its last character is
            # empty or is a prefix of "abc..."; such labels are appended to
            # the leading number.
            if string.ascii_letters.find(bare_label[:-1]) == 0:
                clause_no = clause_no + joined
            elif int(bare_label) != leading_number:
                clause_no = clause_no + joined
            else:
                # First label equals the leading number: use the labels
                # alone instead of repeating the number.
                clause_no = joined
        except ValueError:
            # Label is not an integer (int() failed): append it as well.
            clause_no = clause_no + joined
    return clause_no


def create_table(filename, clauses):
    """Write a 'Clause Extraction' table document and return its row data.

    One table row is added per entry of *clauses*. The document is saved to
    ``settings.BASE_DIR + '/leasingai/ai/temp/' + <name> + '.docx'`` where
    ``<name>`` is *filename* without its extension.

    Args:
        filename: Path of the source document; also passed to
            ``FileReader.find`` when the clause number has to be looked up
            by heading.
        clauses: Iterable of mappings read with the keys ``'action'``,
            ``'clause_name'``, ``'reason'``, ``'text'`` and (when ``'text'``
            is truthy) ``'key_text'``.

    Returns:
        A list of dicts with the keys ``'index'``, ``'action'``,
        ``'clausename'``, ``'reason'``, ``'agree'``, ``'keep'`` and
        ``'clause_no'``. A clause that has text but for which no clause
        number could be determined still gets a table row, but is left out
        of the returned list.
    """
    reader = FileReader()
    complete_path = filename

    extracted = []
    name = splitext(filename)[0]
    document = Document()
    document.add_heading('Clause Extraction', 0)
    table = document.add_table(rows=1, cols=6)
    table.style = 'TableGrid'
    hdr_cells = table.rows[0].cells
    headings = (
        'Amended No.',
        'Clause No.',
        'ClauseName',
        'Lease required amendment',
        'Required action',
        'Lessor response to amendment Agreed/Not agreed',
    )
    for column, heading in enumerate(headings):
        hdr_cells[column].text = heading

    for total_index, clause in enumerate(clauses, 1):
        t = {}
        t['index'] = total_index
        t['action'] = clause['action']
        t['clausename'] = clause['clause_name']
        t['reason'] = clause['reason']
        t['agree'] = ""
        to_find = [clause['clause_name']]

        row_cells = table.add_row().cells
        row_values = (
            str(t['index']),
            "",  # clause number, filled in below once it is known
            t['clausename'],
            t['reason'],
            t['action'],
            t['agree'],
        )
        for column, value in enumerate(row_values):
            row_cells[column].text = value

        if not clause['text']:
            # No clause text at all: keep the entry with an empty number.
            t['clause_no'] = ""
            t['keep'] = 1
            extracted.append(t)
            continue

        cleaned_string = ''.join(c for c in clause['text']
                                 if valid_xml_char_ordinal(c))
        # NOTE(review): the result of this call was never used; the call is
        # retained only in case the helper validates or raises - confirm and
        # remove if it is pure.
        remove_clause_number(cleaned_string)

        t['keep'] = 0
        try:
            clause_no = _resolve_clause_number(reader, clause, cleaned_string)
        except Exception as e:
            # Best-effort fallback: search the document headings for the
            # clause name and use the first hit as the clause number.
            print(e)
            print(complete_path)
            result = reader.find(path=complete_path,
                                 to_find=to_find,
                                 headers_only=True)
            print(result)
            try:
                clause_no = result[0]
            except Exception:
                # Empty or non-indexable result: no clause number available.
                clause_no = ""
            print(clause_no)

        if clause_no is not None and clause_no != "":
            t['clause_no'] = clause_no
            row_cells[1].text = t['clause_no']
            extracted.append(t)

    document.add_page_break()

    document.save(settings.BASE_DIR + '/leasingai/ai/temp/' + name + '.docx')
    return extracted