def CountInit(mask, data_dir='docs/exportSession', debug=False):
    """Count how the result size changes between the first two queries of
    each session, across all data sources.

    Returns a dict mapping the sign of the change (-1: shrank, 0: unchanged,
    1: grew) to its frequency, plus the total number of counted pairs under
    'all'. Note that `mask` is ignored: every data source is processed.
    """
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata',
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411']
    res = {-1: 0, 0: 0, 1: 0, 'all': 0}
    for idx in tqdm(range(len(data_source)), total=len(data_source)):
        dbpedia = idx > 8  # the first 9 entries are non-DBpedia datasets
        data = readExportedData(data_dir, data_source[idx])
        sessions = data['sessions']
        query2text = sessions2Query2Text(sessions)
        query2factor = read_csv(f'docs/factors/{data_source[idx]}_factor.csv')
        for index, sess in enumerate(sessions):  # DataFrame -> 'query', 'time'
            session = sess['queries']
            if_first, first_result = GetResultSize(session, query2factor, 0, dbpedia=dbpedia)
            if_second, second_result = GetResultSize(session, query2factor, 1, dbpedia=dbpedia)
            if if_first and if_second:
                delta1 = sign(first_result, second_result)
                res[delta1] += 1
                res['all'] += 1
    return res

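# The functions in this file rely on a three-way `sign` helper defined
# elsewhere in the project, mapping a pair of result sizes to {-1, 0, 1}
# (the values used to index `res`). A minimal sketch of what it is assumed
# to do (an illustration, not the project's own definition):
def _sign_sketch(first_result, second_result):
    """Return 1 if the result size grew, -1 if it shrank, 0 if unchanged."""
    if second_result > first_result:
        return 1
    if second_result < first_result:
        return -1
    return 0
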
def hiddenState_matrix_fromZero(mask, data_dir='docs/exportSession', debug=False):
    """Count transitions between consecutive result-size changes.

    Every query pair in a session yields a delta in {-1, 0, 1} (result size
    shrank, stayed equal, grew); the first query of a run is compared against
    a virtual empty result. Returns the marginal delta counts `res` and a
    3x3 matrix `count` where count[d1+1][d2+1] is how often delta d1 is
    immediately followed by delta d2. Note that `mask` is ignored: every
    data source is processed.
    """
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata',
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411']
    res = {-1: 0, 0: 0, 1: 0}
    count = np.zeros((3, 3))
    for idx in tqdm(range(len(data_source)), total=len(data_source)):
        dbpedia = idx > 8  # the first 9 entries are non-DBpedia datasets
        data = readExportedData(data_dir, data_source[idx])
        sessions = data['sessions']
        query2text = sessions2Query2Text(sessions)
        query2factor = read_csv(f'docs/factors/{data_source[idx]}_factor.csv')
        for index, sess in enumerate(sessions):  # DataFrame -> 'query', 'time'
            session = sess['queries']
            session_len = sess['session_length']
            first = True
            store_first = True
            for ith in range(session_len - 1):
                if first:
                    # Treat the session start as coming from an empty result.
                    first = False
                    if_first = True
                    first_result = 0
                else:
                    if_first, first_result = GetResultSize(session, query2factor, ith - 1, dbpedia=dbpedia)
                if_second, second_result = GetResultSize(session, query2factor, ith, dbpedia=dbpedia)
                if_third, third_result = GetResultSize(session, query2factor, ith + 1, dbpedia=dbpedia)
                if if_first and if_second and if_third:
                    delta1 = sign(first_result, second_result)
                    delta2 = sign(second_result, third_result)
                    if store_first:
                        # Count the leading delta only once per unbroken run.
                        res[delta1] += 1
                        store_first = False
                    res[delta2] += 1
                    count[delta1 + 1][delta2 + 1] += 1
                else:
                    store_first = True
    return res, count

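# A small usage sketch: the 3x3 `count` matrix returned above can be
# row-normalized into an empirical transition matrix over the deltas
# (-1, 0, 1). This helper is illustrative and not part of the original code.
def transition_probabilities(count):
    """Row-normalize delta-transition counts; rows with no data stay zero."""
    count = np.asarray(count, dtype=float)
    row_sums = count.sum(axis=1, keepdims=True)
    return np.divide(count, row_sums, out=np.zeros_like(count), where=row_sums > 0)

# e.g. res, count = hiddenState_matrix_fromZero(0)
#      print(transition_probabilities(count))
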
def hiddenState_seq(mask, data_dir='docs/exportSession', debug=False):
    """Collect how result sizes evolve along a session.

    For each consecutive query pair, record the pair's position in the
    session and the relative change in result size. Pairs whose first result
    is empty are collected separately with only the sign of the change, to
    avoid dividing by zero. Note that `mask` is ignored: every data source
    is processed.
    """
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata',
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411']
    seq_x, seq_y = [], []            # position, relative change (first result non-empty)
    seq_x_zero, seq_y_zero = [], []  # position, sign of change (first result empty)
    for idx in tqdm(range(len(data_source)), total=len(data_source)):
        dbpedia = idx > 8  # the first 9 entries are non-DBpedia datasets
        data = readExportedData(data_dir, data_source[idx])
        sessions = data['sessions']
        query2text = sessions2Query2Text(sessions)
        query2factor = read_csv(f'docs/factors/{data_source[idx]}_factor.csv')
        for index, sess in enumerate(sessions):  # DataFrame -> 'query', 'time'
            session = sess['queries']
            session_len = sess['session_length']
            for ith in range(session_len - 1):
                if_first, first_result = GetResultSize(session, query2factor, ith, dbpedia=dbpedia)
                if_second, second_result = GetResultSize(session, query2factor, ith + 1, dbpedia=dbpedia)
                if if_first and if_second:
                    delta1 = sign(first_result, second_result)
                    if first_result != 0:
                        seq_x.append(ith)
                        seq_y.append((second_result - first_result) / first_result)
                    else:
                        seq_x_zero.append(ith)
                        seq_y_zero.append(delta1)
    return seq_x, seq_y, seq_x_zero, seq_y_zero

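# A plotting sketch for the sequences returned by hiddenState_seq: position
# of the query pair in the session on the x-axis, relative result-size change
# on the y-axis. matplotlib is an assumption here; it is not imported
# elsewhere in this file.
def plot_result_size_drift(seq_x, seq_y, out_path='results/result_size_drift.png'):
    import matplotlib
    matplotlib.use('Agg')  # headless backend
    import matplotlib.pyplot as plt
    plt.scatter(seq_x, seq_y, s=4, alpha=0.3)
    plt.xlabel('position of query pair in session')
    plt.ylabel('relative change in result size')
    plt.savefig(out_path)
    plt.close()
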
def GetCompInfo(name, structure=False, dbpedia=False, data_dir='docs/exportSession', out_dir='results/triples'):
    """
    name -> data source name
    output: only changes on triples and filters are considered
    res -> [
        {
            session_id -> index in valuedSession
            query1 -> lsq query id for query1
            query2 -> lsq query id for query2
            time_span -> period between query1 and query2
            comp_info -> {
                Triple -> comparison of the two BGPs
                Filter -> comparison of the two filter sets
            }
        }, ...
    ]
    keys_dict -> [
        { Triple keys count: key -> occurrences },
        { Filter keys count: key -> occurrences }
    ]
    """
    error_fo = open(os.path.join(out_dir, '%s_error.txt' % name), 'w')
    valuedSession = readExportedData(data_dir, name)
    valuedSession = valuedSession['sessions']
    # --------update 20200304----------
    # count the number of keys appearing
    triple_keys = {}
    filter_keys = {}
    keys_dict = [triple_keys, filter_keys]
    ith_key = [0, -1]  # info[0] -> triples, info[-1] -> filters
    # ------------end------------------
    # --------update 20200304----------
    # count where the changes happen
    star = [0, 0, 0]
    sink = [0, 0, 0]
    path = [0, 0]
    hybrid = [0, 0, 0, 0, 0]
    count_struc = {'star': star, 'sink': sink, 'hybrid': hybrid, 'path': path}
    change_count = 0
    # -------------end-----------------
    # get query texts
    query2text = sessions2Query2Text(valuedSession)
    pair_count = 0
    error_count = 0
    res = []
    error_sess = 0
    for index, sess in enumerate(valuedSession):  # DataFrame -> 'query', 'time'
        session = sess['queries']
        session_len = sess['session_length']
        flag = 0
        for ith in range(session_len - 1):
            pair_count += 1
            query1 = session[ith]['index_in_file']
            query2 = session[ith + 1]['index_in_file']
            temp = {}
            temp['session_id'] = index
            temp['query1'] = query1
            temp['query2'] = query2
            text1 = query2text[query1]
            text2 = query2text[query2]
            temp['time_span'] = session[ith + 1]['time_stamp'] - session[ith]['time_stamp']
            if temp['time_span'] < timedelta(seconds=0):
                print('%d time span < 0' % index)
                from ipdb import set_trace; set_trace()
            try:
                info1 = GetInfo(text1)
                info2 = GetInfo(text2)
            except Exception:
                flag = 1
                error_count += 1
                error_fo.write(f'{query1}<sep>{query2}\n')
                continue
            # ------------------update 20200304---------------------
            # after getting info, count the number of keys appearing
            for i in range(len(ith_key)):
                keys = list(info1[ith_key[i]].keys())
                keys.extend(list(info2[ith_key[i]].keys()))
                keys = list(set(keys))
                for ki in keys:
                    if ki not in keys_dict[i]:
                        keys_dict[i][ki] = 0
                    keys_dict[i][ki] += 1
            # ---------------------end------------------------------
            comp_info = {}
            comp_info['Triple'] = comp_BGP(info1[0], info2[0])
            comp_info['Filter'] = comp_Fil(info1[-1], info2[-1])
            temp['comp_info'] = comp_info
            res.append(temp)
            # --------------update 20200304----------------------
            # count structure change
            if structure:
                count_struc, change_count = structure_analysis(
                    info1[0], comp_info['Triple'],
                    res=count_struc, change_count=change_count)
            # --------------------end----------------------------
        if flag:
            # was `error_count += 1`, which double-counted parse errors
            error_sess += 1
    error_fo.close()
    if structure:
        return count_struc, change_count
    return res, keys_dict

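# Usage sketch for GetCompInfo, assuming the 'swdf' export exists under
# docs/exportSession and results/triples is writable (the dataset choice is
# purely illustrative):
#
#   res, keys_dict = GetCompInfo('swdf')
#   print(len(res), 'query pairs compared')
#   triple_keys, filter_keys = keys_dict
#   print(sorted(triple_keys.items(), key=lambda kv: -kv[1])[:5])
#
# With structure=True the function instead returns the per-shape change
# counters and the total number of structural changes:
#
#   count_struc, change_count = GetCompInfo('swdf', structure=True)
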
            queryi.fill(query2factor, dbpedia)
            if queryi.projectionNum == -1:
                from ipdb import set_trace; set_trace()
            query2vector[queryidx] = queryi.featureVec
            if debug:
                queryi.print_query()
                from ipdb import set_trace; set_trace()
        except Exception:
            continue
    return query2vector


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mask", '-m', type=int, help="choose which file to run")
    parser.add_argument("--data_dir", '-d', type=str, default='docs/exportSession',
                        help="the directory of data")
    parser.add_argument("--output_dir", '-o', type=str, default='ISWC-extension/results',
                        help="output directory")
    args = parser.parse_args()
    mask = args.mask
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata',
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411']
    data = readExportedData(args.data_dir, data_source[mask])
    dbpedia = mask > 8  # the first 9 entries are non-DBpedia datasets
    query2vector = GetFeatureVectors(data['sessions'], data_source[mask], dbpedia=dbpedia)
    # NOTE: --output_dir is parsed but not used below; vectors are always
    # written under results/hyper_featureVector/.
    write_pkl(f'results/hyper_featureVector/{data_source[mask]}_Vector.pkl', query2vector)

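# Example invocation (the script's file name is not shown in this excerpt;
# --mask 9 selects 'dbpedia.3.5.1.log' from data_source):
#
#   python <this_script>.py --mask 9 --data_dir docs/exportSession
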
                        default=1,
                        help="True when computing feature vector, false when computing IRI vector")
    parser.add_argument("--data_dir", '-d', type=str, default='docs/exportSession',
                        help="the directory of data")
    parser.add_argument("--output_dir", '-o', type=str, default='results',
                        help="output directory")
    parser.add_argument("--sub_dir", '-s', type=str, default='hyper_featureVector',
                        help="output sub directory")
    args = parser.parse_args()
    mask = args.mask
    norma = bool(args.normalize)
    print(norma)
    for i in range(len(data_source)):
        data = readExportedData(args.data_dir, data_source[i])
        vector_ana(data['sessions'], i,
                   dir_=os.path.join(args.output_dir, args.sub_dir),
                   normalize=norma, debug=False)

def GetFeatureFromText(text):
    """Parse a SPARQL query text and extract its features.

    Returns (True, features) on success, (False, 0) if parsing fails.
    """
    try:
        pq = parse_spl(text)
    except Exception:
        return False, 0
    res = GetFeatureDBpedia(pq)
    return True, res


if __name__ == "__main__":
    data_source = [
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411'
    ]
    data_source += [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf',
        'affymetrix', 'goa', 'linkedgeodata'
    ]
    for i, name in tqdm(enumerate(data_source), leave=False, total=len(data_source)):
        # Here the DBpedia logs come first, so the first 14 entries are DBpedia.
        dbpedia = i <= 13
        data = readExportedData('docs/exportSession', name)
        getQueryFeature(data['sessions'], name, dbpedia=dbpedia)

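# A quick usage sketch for GetFeatureFromText; the query text is illustrative.
# The (ok, res) pair lets callers skip unparseable queries without their own
# try/except:
#
#   ok, features = GetFeatureFromText('SELECT ?s WHERE { ?s ?p ?o } LIMIT 10')
#   if ok:
#       print(features)
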
def compUsedOperator(dbpedia=False, data_dir='docs/exportSession', operator_dir='results/operator'):
    """
    Compare the operators used by two consecutive queries, for all datasets.

    Generates a table:
        columns -> the different data sets, plus an 'all' total
        index   -> the different operators, as <operator>_add / <operator>_delete
    """
    data_source1 = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf',
        'affymetrix', 'goa', 'linkedgeodata'
    ]
    data_source2 = [
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411'
    ]
    data_source = data_source1 + data_source2
    # currently nothing is written to this file; kept for parity with the
    # other comparison routines
    error_fo = open(os.path.join(operator_dir, 'comp_operator_error.txt'), 'w')
    res = {}
    features_all = read_list(os.path.join(operator_dir, 'operator_list_10_new.txt'))
    for featurei in features_all:
        res[f'{featurei}_add'] = list(np.zeros(len(data_source) + 1))
        res[f'{featurei}_delete'] = list(np.zeros(len(data_source) + 1))
    for idx, datai in enumerate(data_source):
        dbpedia = idx > 8  # the first 9 entries are non-DBpedia datasets
        valuedSession = readExportedData(data_dir, datai)
        valuedSession = valuedSession['sessions']
        query2feature = read_pkl(os.path.join(operator_dir, '%s_feature.pkl' % datai))
        for index, sess in enumerate(valuedSession):
            session = sess['queries']
            session_len = sess['session_length']
            for ith in range(session_len - 1):
                query_key = 'index_in_file'
                query1 = session[ith][query_key]
                query2 = session[ith + 1][query_key]
                try:
                    feature1 = query2feature[query1]
                    feature2 = query2feature[query2]
                except KeyError:
                    continue
                added = [x for x in feature2 if x not in feature1]
                deleted = [x for x in feature1 if x not in feature2]
                for fi in added:
                    res[f'{fi}_add'][idx] += 1
                    res[f'{fi}_add'][-1] += 1
                for fi in deleted:
                    res[f'{fi}_delete'][idx] += 1
                    res[f'{fi}_delete'][-1] += 1
    error_fo.close()
    if 'all' not in data_source:
        data_source.append('all')
    df = DataFrame.from_dict(res, orient='index', columns=data_source)
    df = df.sort_values(by='all', ascending=False)
    df.to_csv(os.path.join(operator_dir, 'compUsedOperator_10_new.csv'))
    return df

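# Usage sketch for compUsedOperator, assuming results/operator already holds
# the per-dataset <name>_feature.pkl files and operator_list_10_new.txt that
# it reads:
#
#   df = compUsedOperator()
#   print(df['all'].head(10))  # ten most frequent operator additions/deletions
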