import generator_utils


def test_encoding():
    original = 'SELECT ?city WHERE { ?m skos:broader dbc:Cities_in_Germany . ?city dct:subject ?m . ?city dbo:areaTotal ?area . ?b dbo:artist dbr:John_Halsey_(musician) } order by asc (?area)'
    expected_encoding = 'SELECT var_city WHERE brack_open var_m skos_broader dbc_Cities_in_Germany sep_dot var_city dct_subject var_m sep_dot var_city dbo_areaTotal var_area sep_dot var_b dbo_artist dbr_John_Halsey_ attr_open musician attr_close brack_close _oba_ var_area '
    result = generator_utils.encode(original)
    assert result == expected_encoding
    # decode() must round-trip back to the original query (modulo trailing whitespace).
    assert generator_utils.decode(result).strip() == original
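# For orientation, a minimal sketch of the encoding direction only, reconstructed
# from the expected string in the test above (the project's actual
# generator_utils.encode may differ): SPARQL punctuation is rewritten into plain
# word tokens so a sequence-to-sequence model can treat queries as ordinary text.
# decode() would apply the inverse table plus whitespace fix-ups.
import re

def encode_sketch(sparql):
    # ORDER BY ASC(?x) collapses to '_oba_ var_x' with the parentheses dropped,
    # as the expected encoding above shows.
    s = re.sub(r'order by asc\s*\((\?\w+)\)', r'_oba_ \1', sparql)
    for old, new in [('{', ' brack_open '), ('}', ' brack_close '),
                     (' . ', ' sep_dot '), ('(', ' attr_open '),
                     (')', ' attr_close '), ('?', 'var_'), (':', '_')]:
        s = s.replace(old, new)
    # Normalise whitespace; the expected encoding keeps one trailing space.
    return ' '.join(s.split()) + ' '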
def build_dataset_pair(binding, template):
    """Instantiate one template into an (english, sparql) training pair."""
    english = getattr(template, 'question')
    sparql = getattr(template, 'query')
    for variable in binding:
        uri = binding[variable]['uri']
        label = binding[variable]['label']
        placeholder = '<{}>'.format(variable.upper())
        # Substitute the human-readable label into the question and the URI
        # into the query.
        if placeholder in english and label is not None:
            english = english.replace(placeholder, strip_brackets(label))
        if placeholder in sparql and uri is not None:
            sparql = sparql.replace(placeholder, uri)
    sparql = encode(sparql)
    dataset_pair = {'english': english, 'sparql': sparql}
    return dataset_pair
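# Usage sketch. The Template type and the binding shape are assumptions inferred
# from the attribute and key accesses above, not the project's actual data
# classes; strip_brackets and encode come from the surrounding module.
from collections import namedtuple

Template = namedtuple('Template', ['question', 'query'])

template = Template(question='What is the total area of <CITY>?',
                    query='SELECT ?a WHERE { <CITY> dbo:areaTotal ?a }')
binding = {'city': {'uri': 'dbr:Berlin', 'label': 'Berlin'}}
pair = build_dataset_pair(binding, template)
# pair['english'] == 'What is the total area of Berlin?'
# pair['sparql'] is the encoded form of the instantiated query.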
import ast

import generator_utils

# fp (the TSV path) and the helpers get_simile, preprocess_sentence,
# glove_embedding and sort_data come from the surrounding script.
# queriset collects (simile, line_index) pairs; dataset collects
# (entity_vector, line_index) pairs.
queriset = []
dataset = []

with open(fp, "r", encoding="UTF-8") as text:
    for i, line in enumerate(text.readlines()):
        three = line.split("\t")
        query = three[1]
        if not (query.startswith("ASK") or query.startswith("ask")):
            simile = get_simile(query)
            if simile is not None:
                lis = ast.literal_eval(three[-1])
                # Keep this query only if at least one answer entity is a URI.
                flag = any(ent.startswith("http:") for ent in lis)
                if flag:
                    queriset.append((simile, i))
                    for ent in lis:
                        if ent.startswith("http:"):
                            ent = preprocess_sentence(generator_utils.encode(ent))
                            vec = glove_embedding(ent)
                            dataset.append((vec, i))


def calcul_rank(queriset, dataset):
    mrrs = []
    hits10 = []
    hits100 = []
    for simile, i in queriset:
        # sort_data ranks the dataset by cosine distance between the query
        # embedding and each entity vector (replacing the inline sorted() call).
        sorted_dataset = sort_data(simile, dataset)

        def add(num):
            ...
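# calcul_rank is truncated above (the body of add() is missing in the source).
# A sketch of the MRR / Hits@k bookkeeping its mrrs/hits10/hits100 lists point
# to, assuming sort_data returns (vector, line_index) pairs ordered from most
# to least similar and that a correct hit is a pair whose index equals the
# query's own line index i (an assumption, not the project's actual code):
def calcul_rank_sketch(queriset, dataset):
    mrrs, hits10, hits100 = [], [], []
    for simile, i in queriset:
        sorted_dataset = sort_data(simile, dataset)
        # 1-based ranks of the correct entries for this query.
        ranks = [r for r, (_, j) in enumerate(sorted_dataset, start=1) if j == i]
        if not ranks:
            continue
        best = ranks[0]
        mrrs.append(1.0 / best)
        hits10.append(1 if best <= 10 else 0)
        hits100.append(1 if best <= 100 else 0)
    if not mrrs:
        return 0.0, 0.0, 0.0
    n = len(mrrs)
    return sum(mrrs) / n, sum(hits10) / n, sum(hits100) / n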
MINIMUM = int(args.minimum)
COMP = any if args.comp == 'any' else all

dataset_root, _ = os.path.splitext(dataset_file)
used_resources_root, _ = os.path.splitext(used_resources_file)
filtered_sparql_file = '{}_filtered_{:d}_{}.sparql'.format(dataset_root, MINIMUM, COMP.__name__)
filtered_en_file = '{}_filtered_{:d}_{}.en'.format(dataset_root, MINIMUM, COMP.__name__)

used_resources = collections.Counter(json.loads(open(used_resources_file).read()))
# Keep only resources that occur at least MINIMUM times.
filtered_resources = [(elem, cnt) for elem, cnt in used_resources.items() if cnt >= MINIMUM]
save_cache('{}_filter_{:d}.json'.format(used_resources_root, MINIMUM),
           collections.Counter(dict(filtered_resources)))
# A set gives O(1) membership tests in check() below.
valid_encoded_resources = {encode(elem) for elem, _ in filtered_resources}
check = lambda encoded_entity: encoded_entity in valid_encoded_resources

valid_lines = []
filtered_queries = []
with open(dataset_root + '.sparql', 'r') as sparql_file:
    for linenumber, line in enumerate(sparql_file):
        entities = extract_encoded_entities(line)
        # COMP is any or all: keep the query if any/all of its entities
        # survived the frequency filter.
        valid = COMP(map(check, entities))
        if valid:
            filtered_queries.append(line)
            valid_lines.append(linenumber)

# Keep the English questions on the same line numbers as the surviving queries.
filtered_questions = []
with open(dataset_root + '.en', 'r') as en_file:
    for linenumber, line in enumerate(en_file):
        if linenumber in valid_lines:
            filtered_questions.append(line)
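# The fragment above begins after argument parsing; a sketch of the preamble it
# implies. The flag names are inferred from args.minimum / args.comp and the two
# *_file variables, and may differ from the actual script.
import argparse
import collections
import json
import os

parser = argparse.ArgumentParser(description='Filter a dataset, keeping queries '
                                             'whose resources occur often enough.')
parser.add_argument('dataset_file')
parser.add_argument('used_resources_file')
parser.add_argument('--minimum', default=1)
parser.add_argument('--comp', choices=['any', 'all'], default='all')
args = parser.parse_args()
dataset_file = args.dataset_file
used_resources_file = args.used_resources_file

# Likewise, the filtered_* filenames computed above suggest a write-out step
# like this (again an assumption):
with open(filtered_sparql_file, 'w') as out:
    out.writelines(filtered_queries)
with open(filtered_en_file, 'w') as out:
    out.writelines(filtered_questions)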
import argparse

from generator_utils import decode, encode

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('mode', nargs='?', choices=['encode', 'decode'], default='decode')
    parser.add_argument('input_path')
    args = parser.parse_args()
    with open(args.input_path, 'r') as input_file:
        for line in input_file:
            if args.mode == 'decode':
                print(decode(line.strip()))
            elif args.mode == 'encode':
                print(encode(line.strip()))
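# Usage sketch (the filename codec_cli.py is hypothetical):
#
#   python codec_cli.py encode data/queries.sparql   # one encoded query per line
#   python codec_cli.py data/encoded.txt             # mode may be omitted; it defaults to decode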