Example no. 1
0
def test():
    """Smoke-test Query/Table text rendering in three stages:

    1. Render a query dict with no table context (column refs are wrong).
    2. Render against a table loaded from the DB but without headers supplied
       (columns still wrong).
    3. Render against a Table constructed with explicit headers (correct).
    """
    # convert query dict to text (without correct column references)
    details = {"sel": 5, "conds": [[3, 0, "SOUTH AUSTRALIA"]], "agg": 0}
    test_str = Query(details["sel"], details["agg"], details["conds"])
    print(test_str)

    db = records.Database('sqlite:///data/train.db')
    conn = db.get_connection()
    try:
        # convert query dict to text with table reference (still does not give
        # the correct columns because header is not supplied)
        table = Table.from_db(conn, "1-1000181-1")
        print(table.query_str(test_str))
    finally:
        # BUG FIX: the connection was previously never released (leak).
        conn.close()

    # convert query dict to text with table reference after supplying headers
    table_data = {
        "id":
        "1-1000181-1",
        "header": [
            "State/territory", "Text/background colour", "Format",
            "Current slogan", "Current series", "Notes"
        ],
        "types": [],
        "rows": []
    }
    t = Table(table_data["id"], table_data["header"], table_data["types"],
              table_data["rows"])
    print(t.query_str(test_str))
Example no. 2
0
def toQueryStr(file_name, table_arr, type=0, test_batch_size=1000):
    """Build <CODESPLIT>-joined (label, _, _, question, query_text) examples
    from a WikiSQL-style .jsonl file and write them to train.txt / valid.txt.

    Args:
        file_name: basename (no extension) of the .jsonl file under DATA_DIR.
        table_arr: iterable of Table objects used to render SQL dicts to text.
        type: 0 writes train.txt, any other value writes valid.txt.
            (NOTE(review): shadows the builtin `type`; kept for backward
            compatibility with existing callers.)
        test_batch_size: batch size; a trailing batch smaller than this is
            dropped entirely.
    """
    path = os.path.join(DATA_DIR, '{}.jsonl'.format(file_name))
    print(path)
    with open(path, 'r') as pf:
        data = pf.readlines()

    idxs = np.arange(len(data))
    # BUG FIX: np.object was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` is the equivalent dtype.
    data = np.array(data, dtype=object)

    np.random.seed(0)   # set random seed so that random things are reproducible
    np.random.shuffle(idxs)
    data = data[idxs]
    batched_data = chunked(data, test_batch_size)

    print("start processing")
    examples = []
    for batch_idx, batch_data in enumerate(batched_data):
        if len(batch_data) < test_batch_size:
            break  # the last batch is smaller than the others, exclude.
        for d_idx, d in enumerate(batch_data):
            # BUG FIX: json.loads() lost its `encoding` kwarg in Python 3.9
            # (it was ignored since 3.1); `d` is already a str, so no decoding
            # argument is needed.
            line = json.loads(str(d))
            doc_token = line['question']
            code_arr = line['sql']
            query = Query(code_arr['sel'], code_arr['agg'], code_arr['conds'])

            # Renamed from `id` (shadowed the builtin); also dropped a dead
            # dummy Table("table_id", ...) instantiation and a `table = table`
            # no-op that were overwritten by the loop below anyway.
            table_id = line['table_id']
            code_str = ''
            for table in table_arr:
                if table.table_id == table_id:
                    code_str = table.query_str(query)
                    break

            # With probability 1/2 make a negative example: keep the rendered
            # query but pair it with a random (likely mismatched) question.
            # Label "0" marks negatives, "1" positives.
            isNegative = np.random.randint(2)
            if isNegative == 0:
                random_line_num = np.random.randint(len(data))
                doc_token = json.loads(str(data[random_line_num]))['question']
            code_token = code_str  # identical in both branches originally
            example = '<CODESPLIT>'.join(
                (str(isNegative), "nothing", "nothing", doc_token, code_token))
            examples.append(example)

    data_path = os.path.join(DATA_DIR, 'train_valid/wiki_sql')
    if not os.path.exists(data_path):
        os.makedirs(data_path)

    output_file_name = 'train.txt' if type == 0 else 'valid.txt'
    file_path = os.path.join(data_path, output_file_name)
    print(file_path)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(examples))