def get_entity_table(self, entity_id):
     """Return the table that encloses the entity with id ``entity_id``.

     Queries ``tables`` joined to ``entities`` on ``document_id`` for a row
     whose span strictly contains the entity's ``entity_offset``
     (``offset < entity_offset < offset + length``).  Only the first matching
     row is used.

     When a row is found, its ``content`` is split into paragraphs (kept on
     ``self.paragraphs``) and the first table identified in them is
     returned.  When no row matches, an empty ``ttt.Table()`` is returned.

     Any exception is printed, the transaction is rolled back and the
     exception is re-raised to the caller.  The connection opened here is
     always closed before returning.
     """
     conn = psycopg2.connect(CONN_STRING)
     try:
         cur = conn.cursor()
         sql = """SELECT content
             FROM tables t
             JOIN entities e
             ON t.document_id = e.document_id
             WHERE e.id = %s
             AND e.entity_offset > t.offset
             AND e.entity_offset < t.offset + t.length"""
         cur.execute(sql, [entity_id])
         row = cur.fetchone()
         if not row:
             # No table spans this entity: hand back an empty table.
             return ttt.Table()
         self.paragraphs = ttt.get_paragraphs_from_string(row[0])
         # NOTE(review): indexing with [0] assumes identify_tables finds at
         # least one table in the stored content; otherwise IndexError is
         # raised and handled below like any other failure.
         return ttt.identify_tables(self.paragraphs)[0]
     except Exception as exp:
         conn.rollback()
         print(exp)
         # Bare ``raise`` re-raises with the original traceback intact;
         # ``raise exp`` would restart the traceback here under Python 2.
         raise
     finally:
         conn.close()
 def get_entity_table(self, entity_id):
     """Look up and parse the table in which entity ``entity_id`` appears.

     A fresh database connection is opened, and the ``content`` of the
     first ``tables`` row that shares a ``document_id`` with the entity and
     whose range (``offset`` to ``offset + length``, both ends exclusive)
     contains the entity's ``entity_offset`` is fetched.

     Returns the first table that ``ttt.identify_tables`` finds in that
     content (the intermediate paragraphs are stored on
     ``self.paragraphs``), or an empty ``ttt.Table()`` when the query
     yields no row.

     On any exception the transaction is rolled back, the exception is
     printed and then re-raised.  The connection is closed in all cases.
     """
     conn = psycopg2.connect(CONN_STRING)
     try:
         cur = conn.cursor()
         params = [entity_id]
         sql = """SELECT content
             FROM tables t
             JOIN entities e
             ON t.document_id = e.document_id
             WHERE e.id = %s
             AND e.entity_offset > t.offset
             AND e.entity_offset < t.offset + t.length"""
         cur.execute(sql, params)
         content = cur.fetchone()
         if content:
             self.paragraphs = ttt.get_paragraphs_from_string(content[0])
             # NOTE(review): assumes at least one table is identified in
             # the stored content; an empty result raises IndexError here.
             return ttt.identify_tables(self.paragraphs)[0]
         return ttt.Table()
     except Exception as exp:
         conn.rollback()
         print(exp)
         # Re-raise with a bare ``raise`` so the original traceback is
         # preserved (``raise exp`` discards it under Python 2).
         raise
     finally:
         conn.close()
 def label_doc(self, doc_path, congress, chamber, document_type, number):
     """Label every row of every table found in the file at ``doc_path``.

     The path is printed, the file is parsed into paragraphs, tables are
     identified in them, and ``self.label_row`` is called once per table
     row.  Each call receives the table's sorted candidate column indices,
     the table offset, the sponsor indices found by ``self.sponsor_coder``
     for this ``congress``, and the ``congress``, ``chamber``,
     ``document_type`` and ``number`` arguments unchanged.
     """
     print(doc_path)
     # Keep the file open for the whole pass, since the text_table_tools
     # helpers may read from it lazily; ``with`` guarantees the handle is
     # closed afterwards, even if labelling raises.
     with open(doc_path, 'r') as doc_file:
         paragraphs_list = text_table_tools.get_paragraphs(doc_file)
         tables = text_table_tools.identify_tables(paragraphs_list)
         for table in tables:
             table_offset = table.offset
             column_indices = sorted(text_table_tools.get_candidate_columns(table))
             sponsor_indices = self.sponsor_coder.find_sponsor_index(table, congress)
             for row in table.rows:
                 self.label_row(row, column_indices, table_offset, congress, chamber, document_type, number, sponsor_indices)
def extract_tables(document_paths):
    """Extract the tables of each document and insert them into ``tables``.

    Args:
        document_paths: iterable of indexable items; item ``[0]`` is the
            path of a file opened as UTF-8 text and item ``[1]`` is the
            value stored in the ``document_id`` column.

    For every document one row per identified table is inserted (offset,
    length, comma-joined header, and space-joined title, body and content)
    and committed, using the ``conn`` connection from the enclosing scope.
    A failure while building or inserting a document's rows is printed and
    the loop continues with the next document.
    """
    print("begin table extraction")
    for path in document_paths:
        # ``with`` closes the file when the document is done; it stays open
        # until then because the ttt helpers may read from it lazily.
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [(t.offset, t.length, ','.join(t.header), ' '.join(t.title), ' '.join(t.body), ' '.join(t.content), path[1]) for t in tables]
                cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                cur = conn.cursor()
                cur.executemany(cmd, params)
                conn.commit()
            except Exception as ex:
                # Discard the failed transaction so the shared connection
                # is usable again for the next document.
                conn.rollback()
                print("Failed to import doc %s: %s" % (path[0], ex))
 def label_doc(self, doc_path, congress, chamber, document_type, number):
     """Run row labelling over all tables of the document at ``doc_path``.

     Prints ``doc_path``, reads the file through
     ``text_table_tools.get_paragraphs`` and
     ``text_table_tools.identify_tables``, and for each table computes its
     sorted candidate column indices and its sponsor indices (via
     ``self.sponsor_coder`` and ``congress``).  ``self.label_row`` is then
     called for each row with those values, the table offset and the
     remaining arguments passed through as given.
     """
     print(doc_path)
     # The context manager closes the file once processing ends (or
     # fails).  All work happens inside it because the parsing helpers may
     # consume the file lazily.
     with open(doc_path, 'r') as doc_file:
         paragraphs_list = text_table_tools.get_paragraphs(doc_file)
         tables = text_table_tools.identify_tables(paragraphs_list)
         for table in tables:
             table_offset = table.offset
             column_indices = sorted(
                 text_table_tools.get_candidate_columns(table))
             sponsor_indices = self.sponsor_coder.find_sponsor_index(
                 table, congress)
             for row in table.rows:
                 self.label_row(row, column_indices, table_offset,
                                congress, chamber, document_type, number,
                                sponsor_indices)
# Example no. 6
# 0
def extract_tables(document_paths):
    """Store the tables found in each document in the ``tables`` relation.

    Args:
        document_paths: iterable of indexable items where ``[0]`` is a file
            path (read as UTF-8) and ``[1]`` is the value written to the
            ``document_id`` column.

    Each document is parsed into paragraphs, its tables are identified,
    and one row per table is inserted and committed through the ``conn``
    connection from the enclosing scope.  If building or inserting the
    rows for a document fails, the error is printed, the transaction is
    rolled back and processing moves on to the next document.
    """
    print("begin table extraction")
    for path in document_paths:
        # Hold the file open until this document is fully handled (the ttt
        # helpers may be lazy); ``with`` then closes it deterministically.
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [
                    (t.offset, t.length, ','.join(t.header), ' '.join(t.title),
                     ' '.join(t.body), ' '.join(t.content), path[1])
                    for t in tables
                ]
                cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'
                cur = conn.cursor()
                cur.executemany(cmd, params)
                conn.commit()
            except Exception as ex:
                # Without a rollback the connection would stay in a failed
                # transaction and reject the inserts of later documents.
                conn.rollback()
                print("Failed to import doc %s: %s" % (path[0], ex))