def test_get_number_of_sections_per_neurite(self):
    _equal(_nrt.number_of_sections_per_neurite(self.sec_nrn),
           get('number_of_sections_per_neurite', self.ref_nrn))
    for t in NeuriteType:
        _equal(_nrt.number_of_sections_per_neurite(self.sec_nrn, neurite_type=t),
               get('number_of_sections_per_neurite', self.ref_nrn, neurite_type=t))
def test_get_trunk_origin_radii(self):
    _equal(fst._nrn.trunk_origin_radii(self.sec_nrn),
           get('trunk_origin_radii', self.ref_nrn))
    for t in NeuriteType:
        _equal(_nrn.trunk_origin_radii(self.sec_nrn, neurite_type=t),
               get('trunk_origin_radii', self.ref_nrn, neurite_type=t))
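# The two tests above compare feature values computed on the section-based
# neuron (self.sec_nrn) against reference values obtained via get(...) on
# self.ref_nrn, once for all neurites and once per NeuriteType. The comparison
# helper _equal is defined elsewhere in the test module; the function below is
# only a sketch of the behavior it is assumed to have (element-wise equality of
# two feature sequences), not the project's actual helper.
def _equal_sketch(a, b):
    """Assert that two feature sequences have the same length and values."""
    import numpy as np
    a, b = np.asarray(a), np.asarray(b)
    assert a.shape == b.shape, 'feature vectors differ in length'
    assert np.allclose(a, b), 'feature vectors differ in value'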
def main():
    parser = ArgumentParser()
    parser.add_argument('--din', default=data_path, help='data directory')
    parser.add_argument('--dout', default='annotated', help='output directory')
    args = parser.parse_args()

    if not os.path.isdir(args.dout):
        os.makedirs(args.dout)

    # for split in ['train', 'test', 'dev']:
    for split in ['dev']:
        # Note: the annotated files are written under the module-level
        # save_path; args.dout is only created above.
        with open(save_path + '%s.qu' % split, 'w') as qu_file, \
                open(save_path + '%s.lon' % split, 'w') as lon_file, \
                open(save_path + '%s.out' % split, 'w') as out, \
                open(save_path + '%s_sym_pairs.txt' % split, 'w') as sym_file, \
                open(save_path + '%s_ground_truth.txt' % split, 'w') as S_file:

            fsplit = os.path.join(args.din, split) + '.jsonl'
            ftable = os.path.join(args.din, split) + '.tables.jsonl'

            with open(fsplit) as fs, open(ftable) as ft:
                print('loading tables')
                tables = {}
                for line in tqdm(ft, total=count_lines(ftable)):
                    d = json.loads(line)
                    tables[d['id']] = d
                print('loading tables done.')

                print('loading examples')
                n, acc, acc_pair, acc_all, error = 0, 0, 0, 0, 0
                target = -1
                ADD_FIELDS = False
                step2 = True

                for line in tqdm(fs, total=count_lines(fsplit)):
                    ADD_TO_FILE = True
                    d = json.loads(line)
                    Q = d['question']

                    rows = tables[d['table_id']]['rows']
                    rows = np.asarray(rows)
                    # NOTE: this rebinds the name fs (the open split file) to the
                    # table header; iteration still works because tqdm already
                    # holds the file iterator.
                    fs = tables[d['table_id']]['header']

                    # all fields are sorted by length in descending order
                    # for string-matching purposes
                    all_fields = [_preclean(f) for f in fs]
                    all_fields = sorted(all_fields, key=len, reverse=True)

                    smap = defaultdict(list)         # field -> values (f2v)
                    reverse_map = defaultdict(list)  # value -> fields (v2f)
                    for row in rows:
                        for i in range(len(fs)):
                            cur_f = _preclean(str(fs[i]))
                            cur_row = _preclean(str(row[i]))
                            smap[cur_f].append(cur_row)
                            if cur_f not in reverse_map[cur_row]:
                                reverse_map[cur_row].append(cur_f)

                    # ----------------------------------------------------------
                    # all values are sorted by length in descending order
                    # for string-matching purposes
                    keys = sorted(reverse_map.keys(), key=len, reverse=True)

                    Q = _preclean(Q)
                    Q_ori = Q

                    #####################################
                    ########## Annotate question ########
                    #####################################
                    candidates, cond_fields = _match_pairs(Q, Q_ori, keys, reverse_map)

                    Q_head, head2partial = _match_head(Q, Q_ori, smap, all_fields,
                                                       cond_fields)

                    Q, Qpairs = annotate_Q(Q, Q_ori, Q_head, candidates, all_fields,
                                           head2partial, n, target)
                    qu_file.write(Q + '\n')

                    validation_pairs = copy.copy(Qpairs)
                    validation_pairs.append((Q_head, '<f0>', 'head'))
                    for i, f in enumerate(all_fields):
                        validation_pairs.append((f, '<c' + str(i) + '>', 'c'))

                    #####################################
                    ########## Annotate SQL #############
                    #####################################
                    q_sent = Query.from_dict(d['sql'])
                    S, col_names, val_names = q_sent.to_sentence(
                        tables[d['table_id']]['header'], rows,
                        tables[d['table_id']]['types'])
                    S = _preclean(S)
                    S_ori = S

                    S_noparen = q_sent.to_sentence_noparenthesis(
                        tables[d['table_id']]['header'], rows,
                        tables[d['table_id']]['types'])
                    S_noparen = _preclean(S_noparen)

                    col_names = [_preclean(col_name) for col_name in col_names]
                    val_names = [_preclean(val_name) for val_name in val_names]

                    HEAD = col_names[-1]
                    S_head = _preclean(HEAD)

                    # annotate for SQL
                    name_pairs = []
                    for col_name, val_name in zip(col_names, val_names):
                        if col_name == val_name:
                            name_pairs.append([_preclean(col_name), 'true'])
                        else:
                            name_pairs.append([_preclean(col_name), _preclean(val_name)])

                    # sort to compare with candidates
                    name_pairs.sort(key=lambda x: x[1])
                    new_name_pairs = [['<f' + str(i + 1) + '>', '<v' + str(i + 1) + '>']
                                      for i, (field, value) in enumerate(name_pairs)]
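                    # Placeholder scheme built above and applied to S below
                    # (a hypothetical illustration, not an example from the data):
                    #   <f0>            the selected (HEAD) column of the query
                    #   <f1>/<v1>, ...  the i-th condition column/value pair,
                    #                   after sorting the pairs by value
                    #   <c0>, <c1>, ... the table's columns, sorted by name length,
                    #                   used for validation and for unseen fields
                    # e.g. for a made-up table with columns (country, capital) and the
                    # question "what is the capital of france", Q becomes roughly
                    # "what is the <f0> of <v1>", and S is rewritten with the same
                    # symbols in the step below when the detected pairs match the SQL.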
                    # only annotate S while the identified (f, v) pairs are right
                    if _equal(name_pairs, candidates):
                        pairs = []
                        for (f, v), (new_f, new_v) in zip(name_pairs, new_name_pairs):
                            pairs.append((f, new_f, 'f'))
                            pairs.append((v, new_v, 'v'))
                        # sort (word, symbol) pairs by word length in descending order
                        pairs.sort(key=lambda x: len(x[0]), reverse=True)
                        for p, new_p, t in pairs:
                            cp = _backslash(p)
                            if new_p in Q:
                                if t == 'v':
                                    S = S.replace(p + ' )', new_p + ' )')
                                if t == 'f':
                                    S = re.sub(r'\( ' + cp + r' (equal|less|greater)',
                                               '( ' + new_p + r' \1', S)

                    # only annotate S while the identified HEAD is right
                    if S_head == Q_head and '<f0>' in Q:
                        S = S.replace(S_head, '<f0>')

                    # annotate unseen fields
                    if ADD_FIELDS:
                        for i, f in enumerate(all_fields):
                            cf = _backslash(f)
                            S = re.sub(r'(\s|^)' + cf + r'(\s|$|s)',
                                       ' <c' + str(i) + '> ', S)

                    S = _clean(S)
                    lon_file.write(S + '\n')

                    ###############################
                    ######### VALIDATION ##########
                    ###############################
                    recover_S = S
                    for word, sym, t in validation_pairs:
                        recover_S = recover_S.replace(sym, word)
                        sym_file.write(sym + '=>' + word + '<>')
                    sym_file.write('\n')

                    S_file.write(S_noparen + '\n')

                    # ------------------------------------------------------------------------
                    if _equal(name_pairs, candidates):
                        acc_pair += 1
                    if Q_head == S_head:
                        acc += 1
                    if _equal(name_pairs, candidates) and Q_head == S_head:
                        acc_all += 1

                    full_anno = True
                    for s in S.split():
                        if s[0] != '<' and s not in [
                                '(', ')', 'where', 'less', 'greater', 'equal',
                                'max', 'min', 'count', 'sum', 'avg', 'and', 'true']:
                            error += 1
                            full_anno = False
                            break

                    # Debugging output, disabled by the leading False.
                    if False and not (_equal(name_pairs, candidates)
                                      and Q_head == S_head):
                        print('--------' + str(n) + '-----------')
                        print(Q_ori)
                        print(Q)
                        print(S_ori)
                        print(S)
                        print('head:')
                        print(Q_head)
                        print(head2partial)
                        print('true HEAD')
                        print(S_head)
                        print('fields:')
                        print(candidates)
                        print('true fields')
                        print(name_pairs)

                    n += 1

                print('total number of examples:' + str(n))
                print('fully annotated:' + str(1 - error * 1.0 / n))
                print('accurate all percent:' + str(acc_all * 1.0 / n))
                print('accurate HEAD match percent:' + str(acc * 1.0 / n))
                print('accurate fields pair match percent:' + str(acc_pair * 1.0 / n))
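# Entry-point guard so the annotation can be run as a script. The guard itself is
# an assumption about how the module is meant to be invoked (it is not shown in
# the original source), and the file name in the usage example is hypothetical:
#
#     python annotate.py --din <data_dir> --dout annotated
#
if __name__ == '__main__':
    main()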