Example #1
def GetQueryFeatures(mask):
    """
    get query features for one dataset.
    """

    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata'
    ]

    feature_file = 'docs/feature/%s_feature.pkl' % data_source[mask]

    conn = GetConn(data_source[mask])
    valuedSession = read_valuedSession(data_source[mask], filter_users=False)

    # get query texts
    queries = []
    for sessi in valuedSession:
        sess = sessi['session']
        queries.extend(sess['query'].tolist())
    queries = list(set(queries))

    # queries are already de-duplicated above, so each query maps directly
    # to the list of features it uses
    query2feature = {}
    for queryi in queries:
        query2feature[queryi] = list(GetQueryUsedFeature(conn, queryi))

    write_pkl(feature_file, query2feature)

    return query2feature
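# The pickle helpers write_pkl and read_pkl used throughout these examples are
# not shown. A minimal sketch, assuming they are thin wrappers around the
# standard pickle module (the directory-creation step is an assumption as well):
import os
import pickle

def write_pkl(path, obj):
    # assumed helper: pickle obj to path, creating parent directories if needed
    os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def read_pkl(path):
    # assumed helper: load a previously pickled object
    with open(path, 'rb') as f:
        return pickle.load(f)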
Example #2
def writeSessionToDirectory(output_dir, name, data, dbpedia):
    """
    output format: json 
    {
        dataset: 'swdf'
        sessions: [
                {
                    session_id: 0
                    session_length: 10
                    user: xxxx
                    queries: [
                        {
                            query_id: 0
                            query_content: SPARQL query
                            time_stamp: 
                            index_in_file: for dbpedia, the original index in file; 
                                           for others, the original IRI
                        }
                    ]
                }
            ]
        }
    }

    """
    if not dbpedia:
        conn, repo = GetConn(name, if_return_repo=True)
    else:
        conn = None

    output = {'dataset': name, 'sessions': []}
    index_in_file_key = 'idxInFile' if dbpedia else 'query'

    for session_idx, sessi in tqdm(enumerate(data), total=len(data)):
        sessioni = {'session_id': session_idx, 'session_length': len(sessi['session']),
                    'user': sessi['agent'], 'queries': []}
        texts = getTexts(sessi['session'], dbpedia, conn)
        for i in range(len(sessi['session'])):
            query_temp = {'query_id': i, 'query_content': texts[i], 
                          'time_stamp': sessi['session'].iloc[i]['time'],
                          'index_in_file': sessi['session'].iloc[i][index_in_file_key]}
            sessioni['queries'].append(query_temp)
        output['sessions'].append(sessioni)
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    file_name = os.path.join(output_dir, f'{name}_session_data.json')
    write_pkl(file_name, output)
    if not dbpedia:
        conn.close()
        repo.shutDown()
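# For reference, a minimal instance of the exported structure described in the
# docstring above. All values are illustrative placeholders, not taken from a
# real log:
example_output = {
    'dataset': 'swdf',
    'sessions': [
        {
            'session_id': 0,
            'session_length': 2,
            'user': 'agent-hash-xxxx',  # anonymised user/agent identifier
            'queries': [
                {'query_id': 0,
                 'query_content': 'SELECT ?s WHERE { ?s ?p ?o } LIMIT 10',
                 'time_stamp': '2015-10-25 12:00:00',
                 'index_in_file': 'http://example.org/query/1'},
                {'query_id': 1,
                 'query_content': 'SELECT ?p WHERE { ?s ?p ?o } LIMIT 10',
                 'time_stamp': '2015-10-25 12:00:30',
                 'index_in_file': 'http://example.org/query/2'},
            ],
        }
    ],
}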
Example #3
def getQueryFeature(data, name, dbpedia=False, output_dir='results/operator'):
    """
    Parse every distinct query in the exported session data and cache the
    extracted features, keyed by index_in_file, as a pickle file.
    """

    feature_file = os.path.join(output_dir, f'{name}_feature.pkl')

    query2text = sessions2Query2Text(data)

    idx2query = {}
    for sessi in data:
        sess = sessi['queries']
        for idxi in range(len(sess)):
            index = sess[idxi]['index_in_file']
            if index not in idx2query.keys():
                idx2query[index] = query2text[index]

    query2feature = {}
    for idx, query in idx2query.items():
        try:
            pq = parse_spl(query)
        except Exception:
            # skip queries that cannot be parsed
            continue
        res = GetFeatureDBpedia(pq)
        query2feature[idx] = res

    write_pkl(feature_file, query2feature)
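# A possible invocation, assuming the session export from Example #2 has been
# produced and is loaded with readExportedData as in the later examples
# (paths and the dataset name are illustrative):
data = readExportedData('docs/exportSession', 'swdf')
getQueryFeature(data['sessions'], 'swdf', dbpedia=False,
                output_dir='results/operator')  # writes results/operator/swdf_feature.pkl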
Example #4
            temp = res[i][key]
            if key not in res_all.keys():
                res_all[key] = {
                    'add_count': 0,
                    'delete_count': 0,
                    'change_count': 0
                }

            ite = ['add_count', 'delete_count', 'change_count']
            for itei in ite:
                res_all[key][itei] += temp[itei]
    # print(res_all)
    res['all'] = res_all
    write_pkl('docs/compare_count.pkl', res)


def count_changes_block(name,
                        dict_key=['Triple', 'Filter'],
                        list_key=[],
                        pkl=None):
    """
    name -> data source name
    count_changes about some operator, is this operator new? or old? 
                                    inside this block, has some triples added? deleted?
                                    in the mappings of these triple, how did these change?
                                    where change and how to change?
    Triple/Filter/Other: {
        block_name: {new_count: xx, old_count: xx, add_count:xx, delete_count:xx, change_count:xx, 
                    change_type_count: {'type1': xx ...}}
Example #5
            if queryi.projectionNum == -1:
                from ipdb import set_trace; set_trace()
            query2vector[queryidx] = queryi.featureVec
            if debug:
                queryi.print_query()
                from ipdb import set_trace; set_trace()
        except Exception:
            continue
    return query2vector

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--mask", '-m', type=int, help="choose which file to run")
    parser.add_argument("--data_dir", '-d', type=str, default='docs/exportSession', help="the directory of data")
    parser.add_argument("--output_dir", '-o', type=str, default='ISWC-extension/results', help="output directory")
    args = parser.parse_args()
    mask = args.mask

    data_source = ['ncbigene','ndc','orphanet','sgd','sider','swdf','affymetrix','goa','linkedgeodata',
            'dbpedia.3.5.1.log','access.log-20151025', 'access.log-20151124','access.log-20151126',
            'access.log-20151213','access.log-20151230','access.log-20160117','access.log-20160212',
            'access.log-20160222','access.log-20160301','access.log-20160303','access.log-20160304',
            'access.log-20160314','access.log-20160411']
    
    data = readExportedData(args.data_dir, data_source[args.mask])
    # indices 0-8 are the non-DBpedia datasets; the remaining entries are DBpedia access logs
    dbpedia = args.mask > 8

    query2vector = GetFeatureVectors(data['sessions'], data_source[args.mask], dbpedia=dbpedia)
    write_pkl(f'results/hyper_featureVector/{data_source[mask]}_Vector.pkl', query2vector)
Example #6
    parser.add_argument("--data_dir",
                        '-d',
                        type=str,
                        default='exportSession/',
                        help="the directory of data")
    parser.add_argument("--output_dir",
                        '-o',
                        type=str,
                        default='results/',
                        help="output directory")
    args = parser.parse_args()
    i = args.mask
    subdir = 'hypergraph' if args.hypergraph else 'normal_graph'

    output_directory = os.path.join(args.output_dir, subdir)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    data = readExportedData(args.data_dir, data_source[args.mask])

    simi_conti = compGraphSimilarity_Session(data['sessions'],
                                             begin=args.begin,
                                             end=args.end,
                                             hyper=args.hypergraph)
    write_pkl(f'{output_directory}/{data_source[i]}_simi_conti.pkl',
              simi_conti)
    simi_first = compGraphSimilarity_Session_First(data['sessions'],
                                                   begin=args.begin,
                                                   end=args.end,
                                                   hyper=args.hypergraph)
    write_pkl(f'{output_directory}/{data_source[i]}_simi_first.pkl',
              simi_first)
Example #7
    parser.add_argument("--output_dir",
                        '-o',
                        type=str,
                        default='results',
                        help="output directory")
    args = parser.parse_args()

    # repo_names = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata', 'dbpedia.3.5.1.log', 'access.log-20151025',
        'access.log-20151124', 'access.log-20151126', 'access.log-20151213',
        'access.log-20151230', 'access.log-20160117', 'access.log-20160212',
        'access.log-20160222', 'access.log-20160301', 'access.log-20160303',
        'access.log-20160304', 'access.log-20160314', 'access.log-20160411'
    ]

    # mask = args.mask
    for mask in range(len(data_source)):
        dbpedia = mask > 8
        data = readExportedData(args.data_dir, data_source[mask])
        query2vector, IRI_table = getAllVector(data['sessions'],
                                               data_source[mask],
                                               dbpedia=dbpedia)
        write_pkl(f'results/IRI_vector/{data_source[mask]}_IRI_table.pkl',
                  IRI_table)
        write_pkl(f'results/IRI_vector/{data_source[mask]}_Vector.pkl',
                  query2vector)
Example #8
def vector_ana(data, mask, dir_='vector', normalize=False, debug=False):
    """
    For every session, build pairwise KL-divergence and cosine-distance
    matrices over the feature vectors of its queries and cache the result.
    """
    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata', 'dbpedia.3.5.1.log', 'access.log-20151025',
        'access.log-20151124', 'access.log-20151126', 'access.log-20151213',
        'access.log-20151230', 'access.log-20160117', 'access.log-20160212',
        'access.log-20160222', 'access.log-20160301', 'access.log-20160303',
        'access.log-20160304', 'access.log-20160314', 'access.log-20160411'
    ]
    dbpedia = mask > 8
    query2text = sessions2Query2Text(data)

    query2vector = read_pkl(
        os.path.join(dir_, f'{data_source[mask]}_Vector.pkl'))
    confusionMatrix_dataset = []

    for index, sess in tqdm(enumerate(data), total=len(data), leave=True):

        session = sess['queries']
        session_len = sess['session_length']
        # skip the whole session if any of its queries cannot be parsed
        flag = 0
        infos = []
        for ith in range(session_len):
            texti = session[ith]['query_content']
            try:
                infoi = GetInfo(texti)
                infos.append(infoi)
            except Exception:
                flag = 1
                break
        if flag:
            continue

        if normalize:
            # per-dimension maximum over the session, used to scale vectors
            maximum = np.zeros(10)
            for ith1 in range(session_len):
                query1 = session[ith1]['index_in_file']
                vector1 = query2vector[query1]
                for i, num in enumerate(vector1):
                    if num > maximum[i]:
                        maximum[i] = num
                if debug:
                    print(vector1)
            # avoid division by zero for dimensions that never occur
            maximum = np.where(maximum == 0, 1, maximum)
            if debug:
                print(maximum)
                from ipdb import set_trace
                set_trace()

        mat_kl = np.zeros((session_len, session_len))
        mat_cos = np.zeros((session_len, session_len))
        for ith1 in range(session_len):
            for ith2 in range(session_len):
                key = 'index_in_file'
                query1 = session[ith1][key]
                query2 = session[ith2][key]
                vector1 = query2vector[query1]
                vector2 = query2vector[query2]
                if debug:
                    print('before normalize')
                    print(vector1)
                    print(vector2)
                if normalize:
                    vector1 = vector1 / maximum
                    vector2 = vector2 / maximum
                if debug:
                    print('after')
                    print(vector1)
                    print(vector2)
                    from ipdb import set_trace
                    set_trace()
                mat_kl[ith1][ith2] = kl_divergence(vector1, vector2)
                mat_cos[ith1][ith2] = cosine_distance(vector1, vector2)
        confusionMatrix_dataset.append({
            'index': index,
            'mat_kl': mat_kl,
            'mat_cos': mat_cos
        })

    marker = '_normalized' if normalize else ''
    write_pkl(
        os.path.join(dir_, f'{data_source[mask]}_confusionMat{marker}.pkl'),
        confusionMatrix_dataset)
    return confusionMatrix_dataset
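# kl_divergence and cosine_distance are called above but not shown in this
# example. A minimal sketch of what they might look like (an assumption, not
# the original implementation): the feature vectors are normalised to
# distributions for KL, and a small epsilon guards against division by zero.
import numpy as np

def kl_divergence(p, q, eps=1e-10):
    # treat the feature vectors as (unnormalised) distributions
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p = p / p.sum()
    q = q / q.sum()
    return float(np.sum(p * np.log(p / q)))

def cosine_distance(u, v, eps=1e-10):
    # 1 - cosine similarity of the two vectors
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v) + eps
    return float(1.0 - np.dot(u, v) / denom)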