def GetQueryFeatures(mask):
    """Collect the features used by every unique query of one dataset.

    Args:
        mask: int, index into the hard-coded ``data_source`` list selecting
            which dataset to process.

    Returns:
        dict mapping query id -> list of features used by that query.

    Side effects:
        Writes the mapping to ``docs/feature/<name>_feature.pkl``.
    """
    # Full candidate list kept for reference:
    # ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork',
    #  'sgd', 'sider', 'swdf', 'affymetrix', 'dbsnp', 'gendr', 'goa',
    #  'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf',
        'affymetrix', 'goa', 'linkedgeodata'
    ]
    name = data_source[mask]
    feature_file = 'docs/feature/%s_feature.pkl' % name
    conn = GetConn(name)
    valuedSession = read_valuedSession(name, filter_users=False)

    # Gather every query id appearing in any session, de-duplicated.
    queries = []
    for sessi in valuedSession:
        queries.extend(sessi['session']['query'].tolist())
    queries = list(set(queries))

    # ``queries`` is already unique, so the original per-key existence
    # check was dead code: each key is assigned exactly once.
    query2feature = {}
    for queryi in queries:
        query2feature[queryi] = list(GetQueryUsedFeature(conn, queryi))

    write_pkl(feature_file, query2feature)
    return query2feature
def GetCompInfo_struc_once(name, dbpedia=False):
    """Count structural node statistics over all queries of one dataset.

    Args:
        name: dataset name used to locate session / error files.
        dbpedia: when True the session 'query' column already holds query
            text; when False it holds ids resolved through the database.

    Returns:
        (change_count, count_struc):
            change_count: dict topo-type -> number of nodes of that type
                (includes the 'no' bucket).
            count_struc: dict topo-type -> {in/out node/edge -> summed
                collection sizes}, only for types other than 'no'.
    """
    def init():
        return {'out_node': 0, 'in_node': 0, 'out_edge': 0, 'in_edge': 0}

    def all_init():
        return {keyi: init() for keyi in ['star', 'sink', 'path', 'hybrid']}

    # NOTE(review): the error file is truncated on open but nothing is
    # ever written to it; it is now closed explicitly instead of leaked.
    error_fo = open('docs/compare/%s_error.txt' % name, 'w')
    try:
        if not dbpedia:
            conn = GetConn(name)
        valuedSession = read_valuedSession(name, filter_users=False,
                                           dbpedia=dbpedia)
        count_struc = all_init()
        change_count = {'star': 0, 'sink': 0, 'hybrid': 0, 'path': 0, 'no': 0}

        # Gather query identifiers (or texts, for dbpedia) from all sessions.
        queries = []
        for sessi in valuedSession:
            sess = sessi['session']
            queries.extend(sess['query'].tolist())
        queries_unique = copy.deepcopy(list(set(queries)))

        if not dbpedia:
            # Resolve query ids to their SPARQL text in one batch.
            texts = GetTextFromQueryID(conn, queries_unique)
            query2text = {}
            for i, query in enumerate(queries_unique):
                query2text[query] = texts[i][0]

        for queryi in queries:
            texti = queryi if dbpedia else query2text[queryi]
            try:
                infoi = GetInfo(texti)
            except Exception:
                # Unparsable query: skip it.  Narrowed from a bare
                # ``except:`` so KeyboardInterrupt/SystemExit still escape.
                continue
            nodes = get_structure_info(infoi[0])
            for blocki, nodesi in nodes.items():
                for nodei, topo in nodesi.items():
                    change_count[topo['type']] += 1
                    if topo['type'] != 'no':
                        for keyi in ['in_node', 'in_edge',
                                     'out_node', 'out_edge']:
                            count_struc[topo['type']][keyi] += len(topo[keyi])
        return change_count, count_struc
    finally:
        error_fo.close()
def readSessionFromDirectory(input_dir, output_dir):
    """Re-read every valuedSession file under *input_dir* and re-write it.

    Args:
        input_dir: directory scanned for files whose name contains
            'valuedSession'.
        output_dir: directory the sessions are written back to.
    """
    files = os.listdir(input_dir)
    for filei in tqdm(files):
        if 'valuedSession' not in filei:
            continue
        print(f'read {filei} ...')
        # dataset name is everything before '_valuedSession'
        namei = filei.split('valuedSession')[0][:-1]
        dbpedia = 'access.log' in namei or 'dbpedia' in namei
        # NOTE(review): ``data`` is only produced for non-dbpedia datasets,
        # so the write must live under the same guard — otherwise a
        # dbpedia file would raise NameError on an undefined ``data``.
        if not dbpedia:
            data = read_valuedSession(namei, dbpedia=dbpedia)
            writeSessionToDirectory(output_dir, namei, data, dbpedia)
def countKeys(name, fromFile=True):
    """From per-query info, count how often each operator key appears.

    For every pair of consecutive queries in every session, the union of
    operator keys of the two queries is counted once per pair.

    Args:
        name: str, used to locate the files which contain the info:
            docs/info/<name>_info.pkl
            <name>_valuedSession (via read_valuedSession)
            docs/info/<name>_info_error.txt
        fromFile: unused; kept for backward-compatible interface.

    Returns:
        keys_dict, a list of three dicts [triple, filter, other], each
        mapping 'operator key' -> number of query pairs it appears in.
    """
    valuedSession = read_valuedSession(name)
    query_info = read_pkl(f'docs/info/{name}_info.pkl')
    error_file_name = 'docs/info/%s_info_error.txt' % name
    triple_keys = {}
    filter_keys = {}
    others_keys = {}
    keys_dict = [triple_keys, filter_keys, others_keys]
    # indices into a query's info record: triples, filters, others
    ith_key = [0, 1, -1]
    # set for O(1) membership tests in the pair loop (was a list scan)
    error_set = set(read_list(error_file_name, sep='<sep>').keys())
    for sess in valuedSession:
        # DataFrame -> 'query', 'time'
        session = sess['session']
        session_len = len(session)
        for ith in range(session_len - 1):
            query1 = session.iloc[ith]['query']
            query2 = session.iloc[ith + 1]['query']
            # skip pairs involving queries whose info extraction failed
            if query1 in error_set or query2 in error_set:
                continue
            query_info1 = query_info[query1]
            query_info2 = query_info[query2]
            for i, key_idx in enumerate(ith_key):
                # union of operator keys appearing in either query
                keys = set(query_info1[key_idx].keys())
                keys.update(query_info2[key_idx].keys())
                for ki in keys:
                    keys_dict[i][ki] = keys_dict[i].get(ki, 0) + 1
    return keys_dict
def CollectNumOfPairs(dbpedia=False):
    """Compare used features between consecutive queries for all datasets.

    Conceptually builds a table: columns -> datasets, rows -> operator
    groups, where a cell counts the query pairs whose feature set changed
    in that group (at most one count per group per pair).

    Args:
        dbpedia: unused; kept for backward-compatible interface.  The flag
            is derived per dataset from its position in the source lists
            (everything in ``data_source2`` is dbpedia-style).

    Returns:
        count_all: list of 5 ints -- per-group change counts over all sets.
        count_single: list[n_datasets] of list[5] -- same counts per set.
        pair_count: total number of comparable consecutive query pairs.
    """
    # Full candidate list kept for reference:
    # ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork',
    #  'sgd', 'sider', 'swdf', 'affymetrix', 'dbsnp', 'gendr', 'goa',
    #  'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source1 = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf',
        'affymetrix', 'goa', 'linkedgeodata'
    ]
    data_source2 = [
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411'
    ]
    data_source = data_source1 + data_source2

    queryForms = ['ask', 'select', 'describe', 'construct']
    Patterns = [
        'path', 'group', 'optional', 'union', 'graph', 'bind', 'having',
        'minus', 'filter', 'agg'
    ]
    Modifiers = [
        'orderby', 'projection', 'distinct', 'reduced', 'offset', 'limit'
    ]
    fn = ['fn', 'builtin']
    other = ['values', 'service']
    op_list = [queryForms, Patterns, Modifiers, fn, other]

    count_all = [0, 0, 0, 0, 0]
    count_single = [list(np.zeros(5)) for _ in range(len(data_source))]
    pair_count = 0

    # NOTE(review): the error file is truncated on open but nothing is
    # ever written to it; it is now closed explicitly instead of leaked.
    error_fo = open('docs/feature/comp_feature_error.txt', 'w')
    try:
        for idx, datai in enumerate(data_source):
            # Derive the flag from the list boundary instead of the
            # hard-coded ``idx <= 8`` magic number.
            is_dbpedia = idx >= len(data_source1)
            valuedSession = read_valuedSession(datai, filter_users=False,
                                               dbpedia=is_dbpedia)
            query2feature = read_pkl('docs/feature/%s_feature.pkl' % datai)
            # hoisted out of the pair loop — constant per dataset
            query_key = 'idxInFile' if is_dbpedia else 'query'
            for sess in valuedSession:
                # DataFrame -> 'query', 'time'
                session = sess['session']
                for ith in range(len(session) - 1):
                    query1 = session.iloc[ith][query_key]
                    query2 = session.iloc[ith + 1][query_key]
                    try:
                        feature1 = query2feature[query1]
                        feature2 = query2feature[query2]
                    except KeyError:
                        # query has no extracted features; skip this pair
                        # (narrowed from a bare ``except:``)
                        continue
                    pair_count += 1
                    # features present in exactly one of the two queries
                    changed = ([x for x in feature2 if x not in feature1]
                               + [x for x in feature1 if x not in feature2])
                    for op_listi, ops in enumerate(op_list):
                        matched = False
                        for oneop in ops:
                            for changeOp in changed:
                                if oneop.lower() in str(changeOp).lower():
                                    count_all[op_listi] += 1
                                    count_single[idx][op_listi] += 1
                                    matched = True
                                    break
                            if matched:
                                # count each group at most once per pair
                                break
        return count_all, count_single, pair_count
    finally:
        error_fo.close()