def label_doc(self, doc_path, congress, chamber, document_type, number):
    """Label every row of every table found in the document at ``doc_path``.

    The path is echoed to stdout, the file is split into paragraphs with
    ``text_table_tools.get_paragraphs`` and tables are located with
    ``text_table_tools.identify_tables``.  For each table the sorted
    candidate column indices and the sponsor indices are computed once and
    then passed, together with the table offset, to ``self.label_row`` for
    every row of that table.

    Args:
        doc_path: Path of the document; opened in text read mode.
        congress: Passed to ``self.sponsor_coder.find_sponsor_index`` and
            forwarded to ``self.label_row``.
        chamber: Forwarded unchanged to ``self.label_row``.
        document_type: Forwarded unchanged to ``self.label_row``.
        number: Forwarded unchanged to ``self.label_row``.

    Returns:
        None.  Any result is produced by ``self.label_row``.
    """
    # Parenthesised so the statement parses on both Python 2 and Python 3;
    # the printed output is the same.
    print(doc_path)

    # Close the file deterministically instead of leaving the handle to the
    # garbage collector.  All processing stays inside the ``with`` block in
    # case the helpers read from the file lazily (NOTE(review): confirm).
    with open(doc_path, 'r') as doc_file:
        paragraphs_list = text_table_tools.get_paragraphs(doc_file)
        tables = text_table_tools.identify_tables(paragraphs_list)
        for table in tables:
            table_offset = table.offset
            column_indices = sorted(
                text_table_tools.get_candidate_columns(table))
            sponsor_indices = self.sponsor_coder.find_sponsor_index(
                table, congress)
            for row in table.rows:
                self.label_row(row, column_indices, table_offset, congress,
                               chamber, document_type, number,
                               sponsor_indices)
 def label_doc(self, doc_path, congress, chamber, document_type, number):
     """Run row labelling over all tables detected in one document.

     Prints ``doc_path``, reads the document's paragraphs through
     ``text_table_tools.get_paragraphs``, detects tables with
     ``text_table_tools.identify_tables`` and calls ``self.label_row`` once
     per table row.  The per-table inputs (table offset, sorted candidate
     column indices, sponsor indices) are computed before the row loop.

     Args:
         doc_path: Path of the document; opened in text read mode.
         congress: Given to ``self.sponsor_coder.find_sponsor_index`` and
             to ``self.label_row``.
         chamber: Handed through to ``self.label_row``.
         document_type: Handed through to ``self.label_row``.
         number: Handed through to ``self.label_row``.

     Returns:
         None.
     """
     # Single-argument parenthesised print behaves identically on Python 2
     # and also parses on Python 3.
     print(doc_path)

     # ``with`` guarantees the handle is released even if labelling raises.
     # The loops stay inside the block so a lazily-reading helper never
     # sees a closed file (NOTE(review): laziness of helpers not verified).
     with open(doc_path, 'r') as source:
         paragraphs_list = text_table_tools.get_paragraphs(source)
         tables = text_table_tools.identify_tables(paragraphs_list)
         for table in tables:
             table_offset = table.offset
             column_indices = sorted(
                 text_table_tools.get_candidate_columns(table))
             sponsor_indices = self.sponsor_coder.find_sponsor_index(
                 table, congress)
             for row in table.rows:
                 self.label_row(row, column_indices, table_offset,
                                congress, chamber, document_type, number,
                                sponsor_indices)
def extract_tables(document_paths):
    """Extract the tables of each document and insert them into ``tables``.

    Each document is opened as UTF-8 text, split into paragraphs with
    ``ttt.get_paragraphs`` and scanned with ``ttt.identify_tables``.  One
    row per detected table is inserted through the module-level ``conn``
    and committed per document.

    Args:
        document_paths: Iterable of indexable items.  Item ``[0]`` is the
            file path to open; item ``[1]`` is stored in the
            ``document_id`` column.

    Returns:
        None.

    Errors raised while opening or parsing a document propagate to the
    caller.  Errors raised while building or inserting the rows are
    reported on stdout and processing continues with the next document.
    """
    # Parenthesised prints parse on Python 2 and 3 with identical output.
    print("begin table extraction")

    # Loop-invariant statement text, built once.
    cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'

    for path in document_paths:
        # Close the document deterministically.  The insert stays inside
        # the ``with`` block in case ``tables`` is produced lazily from the
        # open file (NOTE(review): confirm against ttt).
        with codecs.open(path[0], 'r', 'utf8') as doc_file:
            paragraphs_list = ttt.get_paragraphs(doc_file)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [
                    (t.offset, t.length, ','.join(t.header),
                     ' '.join(t.title), ' '.join(t.body),
                     ' '.join(t.content), path[1])
                    for t in tables
                ]
                cur = conn.cursor()
                try:
                    cur.executemany(cmd, params)
                    conn.commit()
                finally:
                    # Release the cursor whether or not the insert worked.
                    cur.close()
            except Exception as ex:
                print("Failed to import doc %s: %s" % (path[0], ex))
                # Discard the partial transaction so that a failure here
                # cannot poison the inserts of the following documents.
                # NOTE(review): assumes ``conn`` is a DB-API 2.0 connection
                # that provides rollback().
                conn.rollback()
Example #4
0
def extract_tables(document_paths):
    """Store every table detected in the given documents in the database.

    For each entry the file at ``entry[0]`` is read as UTF-8, its
    paragraphs are obtained from ``ttt.get_paragraphs`` and its tables from
    ``ttt.identify_tables``.  Every table becomes one row of the ``tables``
    relation, tagged with ``entry[1]`` as ``document_id``; the rows of one
    document are inserted with a single ``executemany`` and committed
    together on the module-level ``conn``.

    Args:
        document_paths: Iterable of indexable ``(path, document_id)``-like
            items; only indices ``0`` and ``1`` are read.

    Returns:
        None.

    A failure while preparing or inserting a document's rows is printed
    and the loop continues; a failure while opening or parsing the file is
    not caught here.
    """
    # Written with parentheses so it is valid on Python 2 and Python 3.
    print("begin table extraction")

    # The statement text never changes, so it is defined outside the loop.
    cmd = 'INSERT INTO tables ("offset", "length", headers, title, body, content, document_id) VALUES (%s,%s,%s,%s,%s,%s,%s)'

    for path in document_paths:
        # The context manager closes the file on every exit path.  Work on
        # ``tables`` is kept inside it because the helpers may still be
        # reading from the handle (NOTE(review): not verified).
        with codecs.open(path[0], 'r', 'utf8') as handle:
            paragraphs_list = ttt.get_paragraphs(handle)
            tables = ttt.identify_tables(paragraphs_list)
            try:
                params = [
                    (t.offset, t.length, ','.join(t.header),
                     ' '.join(t.title), ' '.join(t.body),
                     ' '.join(t.content), path[1])
                    for t in tables
                ]
                cur = conn.cursor()
                try:
                    cur.executemany(cmd, params)
                    conn.commit()
                finally:
                    # Cursors are closed explicitly rather than leaked.
                    cur.close()
            except Exception as ex:
                print("Failed to import doc %s: %s" % (path[0], ex))
                # Without a rollback a failed statement can leave the
                # transaction aborted, making every later insert fail too.
                # NOTE(review): assumes ``conn`` follows DB-API 2.0 and
                # exposes rollback().
                conn.rollback()