def get_entity_table(self, entity_id):
    """Fetch the table whose span contains the given entity.

    Looks up the stored document content for the table that overlaps the
    entity's offset, re-parses it into paragraphs (cached on
    self.paragraphs), and returns the first identified table.  Returns an
    empty ttt.Table() when no matching row or no parsable table is found.

    :param entity_id: primary key of the entity in the ``entities`` table
    :raises: re-raises any database/parse error after rolling back
    """
    conn = psycopg2.connect(CONN_STRING)
    try:
        cur = conn.cursor()
        try:
            sql = """SELECT content FROM tables t JOIN entities e ON t.document_id = e.document_id WHERE e.id = %s AND e.entity_offset > t.offset AND e.entity_offset < t.offset + t.length"""
            cur.execute(sql, [entity_id])
            row = cur.fetchone()
        finally:
            # Close the cursor explicitly (original leaked it).
            cur.close()
        if row:
            self.paragraphs = ttt.get_paragraphs_from_string(row[0])
            tables = ttt.identify_tables(self.paragraphs)
            # Guard against content that yields no tables — the original
            # indexed [0] unconditionally and could raise IndexError.
            if tables:
                return tables[0]
        return ttt.Table()
    except Exception as exp:
        conn.rollback()
        print(exp)
        # Bare raise preserves the original traceback (``raise exp`` does not).
        raise
    finally:
        conn.close()
def label_doc(self, doc_path, congress, chamber, document_type, number):
    """Label every row of every table found in the document at doc_path.

    Parses the document into paragraphs, identifies its tables, then for
    each table computes candidate column boundaries and sponsor column
    indices and delegates per-row labeling to self.label_row.

    :param doc_path: filesystem path of the document to label
    :param congress, chamber, document_type, number: document identifiers
        passed through to label_row / the sponsor coder
    """
    print(doc_path)
    # Use a context manager so the file handle is closed (original leaked it).
    with open(doc_path, 'r') as doc_file:
        paragraphs_list = text_table_tools.get_paragraphs(doc_file)
    tables = text_table_tools.identify_tables(paragraphs_list)
    for table in tables:
        table_offset = table.offset
        column_indices = sorted(text_table_tools.get_candidate_columns(table))
        sponsor_indices = self.sponsor_coder.find_sponsor_index(table, congress)
        for row in table.rows:
            self.label_row(row, column_indices, table_offset, congress,
                           chamber, document_type, number, sponsor_indices)
def extract_tables(document_paths):
    """Parse tables out of each document and bulk-insert them into ``tables``.

    :param document_paths: iterable of (file_path, document_id) pairs
    Relies on a module-level psycopg2 connection ``conn``.  A failure on one
    document is logged and does not stop processing of the remaining ones.
    """
    print("begin table extraction")
    for path in document_paths:
        # Context manager closes the file handle (original leaked it).
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
        tables = ttt.identify_tables(paragraphs_list)
        try:
            params = [(t.offset, t.length, ','.join(t.header),
                       ' '.join(t.title), ' '.join(t.body),
                       ' '.join(t.content), path[1]) for t in tables]
            cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
            cur = conn.cursor()
            cur.executemany(cmd, params)
            conn.commit()
        except Exception as ex:
            # Roll back so a failed insert doesn't leave the shared connection
            # in an aborted-transaction state that would fail every later doc.
            conn.rollback()
            print("Failed to import doc %s: %s" % (path[0], ex))
def label_doc(self, doc_path, congress, chamber, document_type, number):
    """Label every row of every table found in the document at doc_path.

    Parses the document into paragraphs, identifies its tables, then for
    each table computes candidate column boundaries and sponsor column
    indices and delegates per-row labeling to self.label_row.

    :param doc_path: filesystem path of the document to label
    :param congress, chamber, document_type, number: document identifiers
        passed through to label_row / the sponsor coder
    """
    print(doc_path)
    # Use a context manager so the file handle is closed (original leaked it).
    with open(doc_path, 'r') as doc_file:
        paragraphs_list = text_table_tools.get_paragraphs(doc_file)
    tables = text_table_tools.identify_tables(paragraphs_list)
    for table in tables:
        table_offset = table.offset
        column_indices = sorted(text_table_tools.get_candidate_columns(table))
        sponsor_indices = self.sponsor_coder.find_sponsor_index(table, congress)
        for row in table.rows:
            self.label_row(row, column_indices, table_offset, congress,
                           chamber, document_type, number, sponsor_indices)
def extract_tables(document_paths):
    """Parse tables out of each document and bulk-insert them into ``tables``.

    :param document_paths: iterable of (file_path, document_id) pairs
    Relies on a module-level psycopg2 connection ``conn``.  A failure on one
    document is logged and does not stop processing of the remaining ones.
    """
    print("begin table extraction")
    for path in document_paths:
        # Context manager closes the file handle (original leaked it).
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
        tables = ttt.identify_tables(paragraphs_list)
        try:
            params = [(t.offset, t.length, ','.join(t.header),
                       ' '.join(t.title), ' '.join(t.body),
                       ' '.join(t.content), path[1]) for t in tables]
            cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
            cur = conn.cursor()
            cur.executemany(cmd, params)
            conn.commit()
        except Exception as ex:
            # Roll back so a failed insert doesn't leave the shared connection
            # in an aborted-transaction state that would fail every later doc.
            conn.rollback()
            print("Failed to import doc %s: %s" % (path[0], ex))