from fileReader import FileReader

# Example lookup: print the headings in a document that match a search phrase.
reader = FileReader()
to_find = ["Trademarks and Trade Names."]  # words for searching
# complete path to file location, can be pdf or word
complete_path = "v5_signage_bad4.docx"
# headers_only=True means it will return only headings
result = reader.find(path=complete_path, to_find=to_find, headers_only=True)
for line in result:
    # Single-argument call form prints identically on Python 2 and also
    # parses on Python 3, unlike the bare `print line` statement.
    print(line)
def create_table(filename, clauses):
    """Write the 'Clause Extraction' Word table and return per-clause records.

    A new document is created with a six-column table. One row is added for
    every entry in ``clauses``. For entries whose ``'text'`` is truthy, the
    clause name is looked up among the headings of ``filename`` via
    ``FileReader.find`` and the first hit is written into the 'Clause No.'
    column. The document is saved as
    ``settings.BASE_DIR + '/leasingai/ai/temp/' + <name> + '.docx'``, where
    ``<name>`` is ``filename`` without its extension.

    Args:
        filename: Path of the source document; passed to ``FileReader.find``
            and also used (minus extension) to name the output file.
        clauses: Iterable of mappings providing the keys ``'action'``,
            ``'clause_name'``, ``'reason'``, ``'text'`` and ``'key_text'``.

    Returns:
        A list of dicts, one per clause with truthy ``'text'``, holding the
        keys ``index``, ``action``, ``clausename``, ``reason``, ``agree``,
        ``keep`` (0 when a clause number was found, 1 otherwise) and
        ``clause_no``.

    Raises:
        KeyError: If a clause mapping lacks one of the required keys.
    """
    reader = FileReader()
    complete_path = filename
    records = []
    name = splitext(filename)[0]

    document = Document()
    document.add_heading('Clause Extraction', 0)
    table = document.add_table(rows=1, cols=6)
    table.style = 'TableGrid'

    header_titles = (
        'Amended No.',
        'Clause No.',
        'ClauseName',
        'Lease required amendment',
        'Required action',
        'Lessor response to amendment Agreed/Not agreed',
    )
    hdr_cells = table.rows[0].cells
    for column, title in enumerate(header_titles):
        hdr_cells[column].text = title

    # 4 Clauses to Extract
    for total_index, clause in enumerate(clauses, 1):
        t = {
            'index': total_index,
            'action': clause['action'],
            'clausename': clause['clause_name'],
            'reason': clause['reason'],
            'agree': "",
        }
        to_find = [clause['clause_name']]

        # Every clause gets a table row, even when it has no text.
        row_cells = table.add_row().cells
        row_cells[0].text = str(t['index'])
        row_cells[1].text = ""
        row_cells[2].text = t['clausename']
        row_cells[3].text = t['reason']
        row_cells[5].text = t['agree']
        row_cells[4].text = t['action']

        # NOTE(review): the original file's indentation was lost. Everything
        # below is reconstructed as belonging to the `if clause['text']`
        # branch (that is where 'keep' is initialised) -- confirm.
        if not clause['text']:
            continue

        cleaned_string = ''.join(
            c for c in clause['text'] if valid_xml_char_ordinal(c))
        # Return value is not used; the call is kept in case the helper has
        # side effects.
        remove_clause_number(cleaned_string)
        t['keep'] = 0
        clause_no = ""

        # NOTE(review): the clause number derived in this try-block is always
        # replaced by the heading lookup further down; the block is retained
        # only so that `reader.find_with` is still invoked and any failure is
        # still printed.
        try:
            temp_no = 0
            match = re.match(
                r"([0-9]+\.[0-9]+\.[0-9]+|[0-9]+\.[0-9]+|[0-9]+\.)",
                cleaned_string)
            if match is not None:
                clause_no = match.group()
                temp_no = int(clause_no.replace('.', ''))
            else:
                clause_no = ""

            lst = reader.find_with(
                clause['text'], [clause['key_text']], headers_only=False)
            if len(lst) > 0 and len(lst[0]) > 0:
                stripped = lst[0].replace(
                    "(", "").replace(')', '').replace('.', '')
                joined = ','.join(lst)
                try:
                    if string.ascii_letters.find(stripped[:-1]) == 0:
                        clause_no = clause_no + joined
                    elif int(stripped) != temp_no:
                        clause_no = clause_no + joined
                    else:
                        clause_no = joined
                except ValueError:
                    # Marker is not purely numeric: append it to the prefix.
                    clause_no = clause_no + joined
        except Exception as e:
            # Best effort: report the problem and carry on with the lookup.
            print(e)

        print(complete_path)
        result = reader.find(
            path=complete_path, to_find=to_find, headers_only=True)
        print(result)
        try:
            clause_no = result[0]
        except (IndexError, KeyError, TypeError):
            # No heading matched (empty or non-indexable result).
            clause_no = ""
        print(clause_no)

        if clause_no is not None and clause_no != "":
            t['clause_no'] = clause_no
            row_cells[1].text = t['clause_no']
        else:
            t['clause_no'] = ""
            t['keep'] = 1
        records.append(t)

    document.add_page_break()
    document.save(settings.BASE_DIR + '/leasingai/ai/temp/' + name + '.docx')
    return records