Example #1
def GetQueryFeatures(mask):
    """
    get query features for one dataset.
    """

    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata'
    ]

    feature_file = 'docs/feature/%s_feature.pkl' % data_source[mask]

    conn = GetConn(data_source[mask])
    valuedSession = read_valuedSession(data_source[mask], filter_users=False)

    # collect the unique queries that appear in any session
    queries = []
    for sessi in valuedSession:
        sess = sessi['session']
        queries.extend(sess['query'].tolist())
    queries = list(set(queries))

    # queries is already deduplicated, so each query gets exactly one entry
    query2feature = {}
    for queryi in queries:
        query2feature[queryi] = list(GetQueryUsedFeature(conn, queryi))

    write_pkl(feature_file, query2feature)

    return query2feature
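
A minimal usage sketch, assuming the project's helpers (GetConn, read_valuedSession, GetQueryUsedFeature, write_pkl) are importable and docs/feature/ exists:

# Hypothetical driver: featurize every dataset in the hard-coded list.
if __name__ == '__main__':
    for mask in range(9):  # data_source has nine entries
        query2feature = GetQueryFeatures(mask)
        print('dataset #%d: %d queries featurized' % (mask, len(query2feature)))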
Example #2
def GetCompInfo_struc_once(name, dbpedia=False):

    def init():
        # per-topology counters for in/out nodes and edges
        return {'out_node': 0, 'in_node': 0,
                'out_edge': 0, 'in_edge': 0}

    def all_init():
        # one counter dict per topology type
        return {keyi: init() for keyi in ['star', 'sink', 'path', 'hybrid']}

    # error log for unparseable queries (closed before returning)
    error_fo = open('docs/compare/%s_error.txt' % name, 'w')

    if not dbpedia:
        conn = GetConn(name)
    valuedSession = read_valuedSession(name, filter_users=False, dbpedia=dbpedia)

    count_struc = all_init()
    change_count = {'star': 0, 'sink': 0, 'hybrid': 0, 'path': 0, 'no': 0}

    # collect all queries from the sessions (duplicates kept for counting)
    queries = []
    for sessi in valuedSession:
        sess = sessi['session']
        queries.extend(sess['query'].tolist())
    queries_unique = list(set(queries))

    if not dbpedia:
        # map query IDs to their SPARQL text (DBpedia logs store text directly)
        texts = GetTextFromQueryID(conn, queries_unique)
        query2text = {}
        for i, query in enumerate(queries_unique):
            query2text[query] = texts[i][0]

    for queryi in queries:
        if not dbpedia:
            texti = query2text[queryi]
        else:
            texti = queryi

        try:
            infoi = GetInfo(texti)
        except Exception:
            # log queries whose text cannot be parsed and skip them
            error_fo.write(texti + '\n')
            continue
        
        nodes = get_structure_info(infoi[0])

        # tally each node's topology type and its in/out node/edge sizes
        for blocki, nodesi in nodes.items():
            for nodei, topo in nodesi.items():
                change_count[topo['type']] += 1
                if topo['type'] != 'no':
                    for keyi in ['in_node', 'in_edge', 'out_node', 'out_edge']:
                        count_struc[topo['type']][keyi] += len(topo[keyi])

    
    error_fo.close()
    return change_count, count_struc
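
A possible call, assuming a preprocessed dataset named 'swdf' (the name is only an example):

# Hypothetical call: count structural changes for one dataset.
change_count, count_struc = GetCompInfo_struc_once('swdf', dbpedia=False)
print(change_count)         # how often each topology type occurred
print(count_struc['star'])  # summed in/out node and edge sizes for 'star' nodes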
Example #3
import os

from tqdm import tqdm


def readSessionFromDirectory(input_dir, output_dir):
    files = os.listdir(input_dir)
    for filei in tqdm(files):
        if 'valuedSession' in filei:
            print(f'read {filei} ...')
            # dataset name is everything before '_valuedSession'
            namei = filei.split('valuedSession')[0][:-1]
            dbpedia = 'access.log' in namei or 'dbpedia' in namei
            # only non-DBpedia datasets are converted here
            if not dbpedia:
                data = read_valuedSession(namei, dbpedia=dbpedia)
                writeSessionToDirectory(output_dir, namei, data, dbpedia)
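
A possible invocation; both directory paths below are illustrative assumptions:

# Hypothetical: convert every *_valuedSession file found in the input
# directory into the output directory's per-session format.
readSessionFromDirectory('docs/session', 'docs/session_dir')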
Example #4
def countKeys(name, fromFile=True):
    """
    from info get operator list.
    
    input:
        name: str, used to locate files which contains some info.
              files we used: <file_name>_info.pkl
                             <file_name>_valuedSession.txt
                             <file_name>_info_error.txt
    
    output:
        keys_dict, a list of dict.
                 [  triple keys count
                     {
                         'operator in triple': int -> times this operator appears
                     }
                     filter ...
                     other ...
                 ]
    """
    valuedSession = read_valuedSession(name)
    query_info = read_pkl(f'docs/info/{name}_info.pkl')
    error_file_name = 'docs/info/%s_info_error.txt' % name
    
    triple_keys = {}
    filter_keys = {}
    others_keys = {}
    keys_dict = [triple_keys, filter_keys, others_keys]
    # positions of the triple / filter / other key dicts inside each query's info
    ith_key = [0, 1, -1]
    
    error_list = list(read_list(error_file_name, sep='<sep>').keys())

    for index, sess in enumerate(valuedSession):
        # DataFrame -> 'query', 'time'
        session = sess['session']
        session_len = len(session)
        for ith in range(session_len-1):
            query1 = session.iloc[ith]['query']
            query2 = session.iloc[ith+1]['query']
            if query1 in error_list or query2 in error_list:
                continue
            query_info1 = query_info[query1]
            query_info2 = query_info[query2]
            for i in range(len(ith_key)):
                # union of the operator keys used by either query in the pair
                keys = set(query_info1[ith_key[i]].keys())
                keys.update(query_info2[ith_key[i]].keys())
                for ki in keys:
                    if ki not in keys_dict[i]:
                        keys_dict[i][ki] = 0
                    keys_dict[i][ki] += 1
    return keys_dict
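
A short usage sketch, assuming the <name>_info.pkl, <name>_valuedSession.txt and <name>_info_error.txt files already exist for the dataset:

# Hypothetical: print the five most frequent triple-level operators for 'swdf'.
triple_keys, filter_keys, others_keys = countKeys('swdf')
for op, n in sorted(triple_keys.items(), key=lambda kv: -kv[1])[:5]:
    print(op, n)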
Example #5
def CollectNumOfPairs(dbpedia=False):
    """
    comp used features between two continous queries for all datasets.
    generate a table, columns -> different data set
                      index   -> different feature, <feature>_add, <feature>_delete ...
    for all the dataset.
    """
    # data_source = ['lsr', 'mesh', 'ncbigene', 'ndc', 'omim', 'orphanet', 'sabiork', 'sgd', 'sider', 'swdf',
    #            'affymetrix', 'dbsnp', 'gendr', 'goa', 'linkedgeodata', 'linkedspl', 'dbpedia']
    data_source1 = [
        'ncbigene', 'ndc', 'orphanet', 'sgd', 'sider', 'swdf', 'affymetrix',
        'goa', 'linkedgeodata'
    ]
    data_source2 = [
        'dbpedia.3.5.1.log', 'access.log-20151025', 'access.log-20151124',
        'access.log-20151126', 'access.log-20151213', 'access.log-20151230',
        'access.log-20160117', 'access.log-20160212', 'access.log-20160222',
        'access.log-20160301', 'access.log-20160303', 'access.log-20160304',
        'access.log-20160314', 'access.log-20160411'
    ]
    data_source = data_source1 + data_source2

    # error log (closed before returning)
    error_fo = open('docs/feature/comp_feature_error.txt', 'w')

    queryForms = ['ask', 'select', 'describe', 'construct']
    Patterns = [
        'path', 'group', 'optional', 'union', 'graph', 'bind', 'having',
        'minus', 'filter', 'agg'
    ]
    Modifiers = [
        'orderby', 'projection', 'distinct', 'reduced', 'offset', 'limit'
    ]
    fn = ['fn', 'builtin']
    other = ['values', 'service']

    op_list = [queryForms, Patterns, Modifiers, fn, other]
    count_all = [0] * len(op_list)
    # one per-category counter list for every dataset
    count_single = [[0] * len(op_list) for _ in data_source]
    pair_count = 0

    for idx, datai in enumerate(data_source):
        # datasets after data_source1 are DBpedia-style logs, so this
        # overrides the dbpedia argument on every iteration
        dbpedia = idx >= len(data_source1)
        valuedSession = read_valuedSession(datai,
                                           filter_users=False,
                                           dbpedia=dbpedia)
        query2feature = read_pkl('docs/feature/%s_feature.pkl' % datai)

        for index, sess in enumerate(valuedSession):
            # DataFrame -> 'query', 'time'
            session = sess['session']
            session_len = len(session)
            query_key = 'idxInFile' if dbpedia else 'query'
            for ith in range(session_len - 1):
                query1 = session.iloc[ith][query_key]
                query2 = session.iloc[ith + 1][query_key]
                try:
                    feature1 = query2feature[query1]
                    feature2 = query2feature[query2]
                except KeyError:
                    # skip pairs whose features were never extracted
                    continue
                pair_count += 1
                # features added by query2 and features dropped from query1
                not1 = [x for x in feature2 if x not in feature1]
                not2 = [x for x in feature1 if x not in feature2]

                # count each operator category at most once per query pair
                for op_listi, ops in enumerate(op_list):
                    for opi, oneop in enumerate(ops):
                        if_next = False
                        for changeOp in (not1 + not2):
                            if oneop.lower() in str(changeOp).lower():
                                count_all[op_listi] += 1
                                count_single[idx][op_listi] += 1
                                if_next = True
                                break
                        if if_next:
                            break
    error_fo.close()
    return count_all, count_single, pair_count
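
A minimal driver sketch, assuming the per-dataset *_feature.pkl files were produced beforehand (e.g. by GetQueryFeatures in Example #1):

# Hypothetical: aggregate pair statistics over every dataset.
count_all, count_single, pair_count = CollectNumOfPairs()
print('query pairs compared:', pair_count)
print('per-category change counts:', count_all)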