Beispiel #1
0
def index_all_table(company_id):
    """Rebuild the ``Table_Report`` table of a company database from scratch.

    All existing rows are deleted and one row is inserted for every entry
    returned by ``sObj.slt_normresids(project_id, url_id)``, each with an
    empty classification, normalization ``'Y'``, error_accepted ``'N'`` and
    db_status ``'N'``.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is also
            the key used to look up ``(company_name, machine_id)`` in the
            mapping returned by ``getCN_MID()``.

    Returns:
        The string ``'done'``.
    """
    from getCompanyName_machineId import getCN_MID
    company_name, _machine_id = getCN_MID()[company_id]
    # NOTE(review): the model directory is hard-coded to '1' here, while
    # insert_update_table_report uses the project id -- confirm which is meant.
    model_number = '1'
    project_id, url_id = company_id.split('_')
    norm_res_list = sObj.slt_normresids(project_id, url_id)
    db_file = os.path.join('/mnt/eMB_db/', company_name, model_number,
                           'company_report.db')
    table_name = 'Table_Report'
    column_list = [('row_id', 'INTEGER PRIMARY KEY AUTOINCREMENT'),
                   ('table_id', 'VARCHAR(20)'), ('doc_id', 'VARCHAR(20)'),
                   ('classification', 'VARCHAR(256)'),
                   ('normalization', 'VARCHAR(1)'),
                   ('error_accepted', 'VARCHAR(1)'),
                   ('db_status', 'VARCHAR(1)')]
    # row_id is auto-generated, so it is left out of the insert column list.
    column_tup = tuple(name for name, _ctype in column_list[1:])

    conn = qObj.create_connection(db_file)
    cur = conn.cursor()
    try:
        qObj.createLiteTable(conn, cur, '', table_name, column_list)
        data = []
        for doc_tup in norm_res_list:
            # Each entry is (doc_id, page_number, norm_table_id); the page
            # number is not stored in Table_Report.
            doc_id, _page_number, norm_table_id = map(str, doc_tup)
            data.append((norm_table_id, doc_id, '', 'Y', 'N', 'N'))
        cur.execute('delete from %s' % (table_name))
        qObj.insertIntoLite(conn, cur, '', table_name, column_tup, data)
        conn.commit()
    finally:
        # Always release the connection, even when a step above fails.
        conn.close()
    return 'done'
def insert_update_table_report(company_id, table_ids):
    """Replace the ``Table_Report`` rows for the given tables.

    For every entry of *table_ids* the existing row with the same table id is
    deleted and a new row is inserted. The classification previously stored
    for that table id is carried over (empty string when there was none);
    normalization, error_accepted and db_status are always reset to
    ``'Y'``, ``'N'`` and ``'N'``.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is also
            the key used to look up ``(company_name, machine_id)`` in the
            mapping returned by ``getCN_MID()``.
        table_ids: Iterable of 6-tuples
            ``(doc_id, table_id, page_no, g_id, lng, g_u)``. Only ``doc_id``
            and ``table_id`` are used.

    Returns:
        The string ``'done'``.
    """
    from getCompanyName_machineId import getCN_MID
    company_name, _machine_id = getCN_MID()[company_id]
    project_id, url_id = company_id.split('_')
    # The database directory is named after the project id.
    model_number = project_id
    # NOTE(review): the result of this call is never used; the call is kept
    # only in case it has side effects -- confirm and remove if it has none.
    sObj.slt_normresids(project_id, url_id)
    db_file = os.path.join('/mnt/eMB_db/', company_name, model_number,
                           'company_report.db')
    # Single-argument form prints the same text as ``print 'db_file', db_file``.
    print('db_file %s' % db_file)
    table_name = 'Table_Report'
    column_list = [('row_id', 'INTEGER PRIMARY KEY AUTOINCREMENT'),
                   ('table_id', 'VARCHAR(20)'), ('doc_id', 'VARCHAR(20)'),
                   ('classification', 'VARCHAR(256)'),
                   ('normalization', 'VARCHAR(1)'),
                   ('error_accepted', 'VARCHAR(1)'),
                   ('db_status', 'VARCHAR(1)')]
    # row_id is auto-generated, so it is left out of the insert column list.
    column_tup = tuple(name for name, _ctype in column_list[1:])

    conn = qObj.create_connection(db_file)
    cur = conn.cursor()
    try:
        qObj.createLiteTable(conn, cur, '', table_name, column_list)

        # Only the classification of an existing row survives the rewrite.
        cur.execute('select table_id, classification from Table_Report')
        classification_by_table = {}
        for stored_table_id, stored_classification in cur.fetchall():
            classification_by_table[str(stored_table_id)] = str(
                stored_classification)

        data = []
        delete_params = []
        for doc_id, table_id, _page_no, _g_id, _lng, _g_u in table_ids:
            classification = classification_by_table.get(table_id, '')
            data.append((table_id, doc_id, classification, 'Y', 'N', 'N'))
            delete_params.append((table_id,))

        # Bind the ids as parameters instead of splicing them into the SQL
        # text: quoting cannot break, and one statement per id avoids the
        # limit on the number of bound variables in a single statement.
        cur.executemany('delete from %s where table_id = ?' % (table_name),
                        delete_params)
        qObj.insertIntoLite(conn, cur, '', table_name, column_tup, data)
        conn.commit()
    finally:
        # Always release the connection, even when a step above fails.
        conn.close()
    return 'done'
Beispiel #3
0
def get_comp_model(company_id):
    """Return the company name registered for *company_id*.

    The name is the first element of the ``(company_name, machine_id)`` pair
    stored under *company_id* in the mapping returned by ``getCN_MID()``.
    """
    from getCompanyName_machineId import getCN_MID
    name_and_machine = getCN_MID()[company_id]
    # Unpack (rather than index) so a malformed entry still raises.
    name, _machine = name_and_machine
    return name
def generate(company_id):
    """Write a tab-separated bounding-box report for a company's tables.

    Every ``(doc_id, table_id)`` pair returned by ``sObj.slt_normresids`` is
    processed through ``generate_map_ds`` (in parallel via ``pprocess.pmap``).
    For each cell whose page number, parsed from its xml id, matches the page
    reported with its boxes, one de-duplicated row is collected per
    ``(document, page)`` and section type. The rows are written to
    ``/var/www/html/company_bbox/<company_name>.txt``.

    If a cell has no page coordinates in the result of
    ``cobj.get_adjustment_coordinates1``, the offending values are printed
    and the process exits.

    Args:
        company_id: String of the form ``'<project_id>_<url_id>'``; it is also
            the key used to look up ``(company_name, machine_id)`` in the
            mapping returned by ``getCN_MID()``.

    Raises:
        SystemExit: When page coordinates are missing for a cell.
        OSError: When the output directory cannot be created.
    """
    doc_page_cord_dict = cobj.get_adjustment_coordinates1(company_id)
    from getCompanyName_machineId import getCN_MID
    company_name, _machine_id = getCN_MID()[company_id]
    project_id, url_id = company_id.split('_')

    all_doc_table_to_process = []
    for doc_tup in sObj.slt_normresids(project_id, url_id):
        doc_id, _page_number, norm_table_id = map(str, doc_tup)
        all_doc_table_to_process.append((doc_id, norm_table_id))

    res = pprocess.pmap(
        lambda ktup: generate_map_ds(company_id, ktup),
        all_doc_table_to_process, 8)

    # {(pdf_name, page_number): {sec_type: [(table_id, r, c, txt, bb, pc)]}}
    doc_id_page_number_bbox_dict = {}
    # Mirrors the rows stored above so duplicates are detected without
    # scanning the lists.
    seen_rows = set()
    total = len(all_doc_table_to_process)
    for cnt, (ktup, rdict, celldata) in enumerate(res, 1):
        doc_id, table_id = ktup
        xml_sec_type_dict = get_cell_mdict(celldata)
        print([ktup, cnt, '/', total])
        for xml_id, c_ar in rdict.items():
            if not xml_id.strip():
                continue
            # Page number is the last '_' piece of the part before '#'.
            page_number = xml_id.split('#')[0].split('_')[-1].strip()
            r, c, txt, sec_type = xml_sec_type_dict[xml_id]
            b_ar, page_n = c_ar
            if str(page_n) != page_number:
                continue
            bb = '$'.join('_'.join(map(str, ar)) for ar in b_ar)
            pc = doc_page_cord_dict.get(doc_id, {}).get(page_number, '')
            if not pc:
                print([doc_id, table_id, page_number, xml_id, txt, pc])
                print('page cord error')
                # NOTE(review): exits with status 0 although this is an error
                # path; kept as-is for callers -- consider a non-zero status.
                sys.exit()
            dk = (doc_id + '.pdf', page_number)
            dd = (table_id, r, c, txt, bb, pc)
            row_key = (dk, sec_type, dd)
            if row_key in seen_rows:
                continue
            seen_rows.add(row_key)
            doc_id_page_number_bbox_dict.setdefault(dk, {}).setdefault(
                sec_type, []).append(dd)

    ff = '/var/www/html/company_bbox/'
    if not os.path.exists(ff):
        try:
            os.makedirs(ff)
        except OSError:
            # Another process may have created it in the meantime.
            if not os.path.isdir(ff):
                raise
    fname = os.path.join(ff, company_name + '.txt')
    header = '\t'.join([
        'DOC_PDF', 'TABLE_ID', 'PAGE_NUMBER', 'SECTION_TYPE', 'ROW', 'COL',
        'TXT', 'BBOX(split by $ then split by _ )', 'PAGE_CORDS'
    ])
    # The context manager closes the file even if a write fails.
    with open(fname, 'w') as fout:
        fout.write(header + '\n')
        for dk, sec_dict in doc_id_page_number_bbox_dict.items():
            for sec_type, bbox_ar in sec_dict.items():
                for (table_id, r, c, txt, bb, pc) in bbox_ar:
                    st = '\t'.join(
                        [dk[0], table_id, dk[1], sec_type, r, c, txt, bb, pc])
                    fout.write(st + '\n')