Example #1
import random


def __candidate_construction(query, row, query_pool, match_list,
                             candidate_rate):
    # Build the document text by concatenating the normalized match fields.
    # alphnum() and getElement() are helpers from the surrounding module.
    document = ''
    for term in match_list:
        document += alphnum(getElement(term, row).lower()) + ' '
    # Select the set of queries matching X: keep only the pool queries
    # whose every sub-term occurs in the document text.
    match_query = {}
    for q, v in query_pool.items():
        if all(subq in document for subq in q):
            match_query[q] = v
    # The issuing query always matches its own result.
    match_query[query] = query_pool[query]
    # Uniformly sample a candidate_rate fraction of the matching queries.
    candidate_query = random.sample(list(match_query.keys()),
                                    int(len(match_query) * candidate_rate))
    return match_query, candidate_query
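
A minimal usage sketch. alphnum() and getElement() are not shown in the snippet, so the stand-ins below are assumptions about their behavior (normalize to lowercase alphanumerics; follow a field path into a nested row), and the pool and row are made up:

import re

def alphnum(s):
    # Assumed stand-in for the module's alphnum(): keep alphanumerics and spaces.
    return re.sub(r'[^a-z0-9 ]', '', s)

def getElement(path, row):
    # Assumed stand-in for getElement(): follow a field path into a nested row.
    for key in path:
        row = row[key]
    return row

query_pool = {frozenset(['yong', 'jun']): 5,
              frozenset(['data', 'sampling']): 3}
row = {'title': 'Data Sampling Methods', 'author': 'Yong Jun'}
matched, candidates = __candidate_construction(
    frozenset(['yong', 'jun']), row, query_pool,
    match_list=[['title'], ['author']], candidate_rate=0.5)

Note that pool keys must be hashable, hence frozensets rather than plain sets.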
Example #2
import random

import data_process


def __candidate_construction(query, row, query_pool, match_term, candidate_rate):
    # Build the document text from the match fields. Here each term is a
    # Python expression over the local name `row`, resolved with eval().
    document = ''
    for term in match_term:
        try:
            document += data_process.alphnum(eval(term).lower()) + ' '
        except KeyError:
            continue
    # Select the set of queries matching X: keep only the pool queries
    # whose every sub-term occurs in the document text.
    match_query = {}
    for q, v in query_pool.items():
        if all(subq in document for subq in q):
            match_query[q] = v
    # The issuing query always matches its own result.
    match_query[query] = query_pool[query]
    candidate_query = random.sample(list(match_query.keys()),
                                    int(len(match_query) * candidate_rate))
    return match_query, candidate_query
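
This variant resolves each match field with eval(), so match_term carries Python expressions over the local name `row` rather than field paths. A minimal illustration of that convention (the field names are made up):

row = {'user': {'name': 'Yong Jun'}, 'title': 'Sampling'}
match_term = ["row['title']", "row['user']['name']"]
for term in match_term:
    print(eval(term).lower())  # -> 'sampling', then 'yong jun'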
Example #3
import copy
import pickle
import random
import sys


def sota_sampler(query_pool, api, match_term, top_k, adjustment=1, samplenum=500):
    """
    A method to sample each document from a search engine's corpus with equal probability
    ------**Random sampling from a search engine's index**

    :param query_pool: A dict mapping queries to their benefits, e.g. {frozenset(['yong', 'jun']): 5}.
    :param api: An implementation of simapi for a specific api.
    :param match_term: The fields used to match queries against a returned document.
    :param top_k: Only the top_k documents are returned by the api.
    :param adjustment: A parameter used to boost the probability of accepting a document.
    :param samplenum: The size of the sample.
    :return: A list of sample documents returned by the api.
    """
    sample = []
    query_cost = 0
    params = api.getKwargs()
    query_pool_copy = copy.deepcopy(query_pool)
    # Each match term is a dotted field path, e.g. 'user.name' -> ['user', 'name'].
    matchlist = [m.split('.') for m in match_term]

    while len(sample) < samplenum:
        query_cost += 1
        curQuery = random.choice(list(query_pool.items()))
        params[api.getSearchTerm()] = '+'.join(curQuery[0])
        result = api.callAPI(params=params)
        # A query is valid only if it underflows: 0 < |result| < top_k.
        if 0 < len(result) < top_k:
            # Accept the query with probability q/k, where q = |result|;
            # otherwise continue with probability 1 - q/k.
            if random.uniform(0, 1) <= len(result) / (top_k * 1.0):
                # Choose one (query, document) edge uniformly.
                row = random.choice(result)
                # alphnum() and getElement() are helpers from the surrounding module.
                document = ''
                for term in matchlist:
                    document += alphnum(getElement(term, row).lower()) + ' '
                # M(X): the number of pool queries that match this document.
                Mx = sum(1 for q in query_pool_copy
                         if all(subq in document for subq in q))
                # Count the issuing query as well if its terms do not appear
                # in the matched fields of the document.
                if any(subq not in document for subq in curQuery[0]):
                    Mx += 1
                # Accept the document with probability adjustment/M(X);
                # otherwise continue sampling.
                if random.uniform(0, 1) < 1.0 * adjustment / Mx:
                    sample.append(document)
                    print('sample num:', len(sample), ' query cost:', query_cost)
        else:
            # Drop overflowing or empty queries so they are not drawn again.
            query_pool.pop(curQuery[0])
    print(query_cost, 'used for sampling.', file=sys.stderr)

    with open('sample_' + str(query_cost), 'wb') as f:
        pickle.dump(sample, f)
    return sample
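
For a dry run, the sampler only needs the three simapi calls it makes: getKwargs(), getSearchTerm(), and callAPI(). The stub below is hypothetical (the class, its two-document corpus, and the substring matching are all assumptions), and the alphnum()/getElement() stand-ins sketched after Example #1 are assumed to be in scope:

class StubApi:
    # Hypothetical simapi implementation over an in-memory corpus.
    corpus = [{'title': 'yong jun on sampling'},
              {'title': 'efficient search measurements'}]

    def getKwargs(self):
        return {}

    def getSearchTerm(self):
        return 'q'

    def callAPI(self, params):
        terms = params['q'].split('+')
        return [doc for doc in self.corpus
                if all(t in doc['title'] for t in terms)]

pool = {frozenset(['sampling']): 1, frozenset(['yong']): 1}
sample = sota_sampler(pool, StubApi(), ['title'], top_k=10, samplenum=2)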
Example #4
import random

import data_process


def sota_estimator(query_pool, api, match_term, uniqueid, query_num):
    """
    A method to efficiently estimate an aggregate over a search engine's corpus
    ------**Efficient search engine measurements**

    :param query_pool: A dict mapping queries to their benefits, e.g. {frozenset(['yong', 'jun']): 5}.
    :param api: An implementation of simapi for a specific api.
    :param match_term: The fields used to match queries against a returned document.
    :param uniqueid: An expression giving the unique id of a returned row, e.g. "row['id']".
    :param query_num: The number of queries to sample for the estimate.
    :return: An estimate of count(*) over the search engine.
    """
    count = 0
    query_cost = 0
    params = api.getKwargs()

    for i in range(query_num):
        # Choose one query uniformly at random.
        curQuery = random.choice(list(query_pool.items()))
        params[api.getSearchTerm()] = '+'.join(curQuery[0])
        result = api.callAPI(params=params)
        query_cost += 1
        if len(result) == 0:
            continue

        # Estimate the weight contributed by each returned document.
        for row in result:
            try:
                # uniqueid is a Python expression over `row`, e.g. "row['id']".
                r_id = eval(uniqueid)
            except KeyError:
                continue
            document = ''
            for term in match_term:
                try:
                    # Each term is likewise a Python expression over `row`.
                    document += data_process.alphnum(eval(term).lower()) + ' '
                except KeyError:
                    continue

            # Collect the set of pool queries that match this document.
            match_query = [q for q in query_pool
                           if all(subq in document for subq in q)]
            if curQuery[0] not in match_query:
                match_query.append(curQuery[0])

            # Estimate the edge weight: redraw matching queries until the
            # issuing query, or a query whose results contain the same row,
            # is hit on trial t; the document then contributes t/|M(X)|.
            t = 0
            while True:
                t += 1
                query = random.choice(match_query)
                if query == curQuery[0]:
                    count += 1.0 * t / len(match_query)
                    print('count: ', count, ' query cost: ', query_cost)
                    break

                params[api.getSearchTerm()] = '+'.join(query)
                mresult = api.callAPI(params=params)
                query_cost += 1

                if len(mresult) == 0:
                    continue
                for mrow in mresult:
                    try:
                        # 'm' + uniqueid evaluates the same id expression over mrow.
                        if r_id == eval('m' + uniqueid):
                            count += 1.0 * t / len(match_query)
                            print('count: ', count, ' query cost: ', query_cost)
                            break
                    except KeyError:
                        continue
                else:
                    # No row in mresult matched r_id; draw another query.
                    continue
                break

    # Scale by pool size over sampled queries to get the final estimate.
    count = 1.0 * count * len(query_pool) / query_num
    print('query cost: ', query_cost, ' count: ', count)
    return count
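
The inner while-loop is a geometric trial: drawing uniformly from match_query until the issuing query is hit takes |M(X)| draws in expectation, so each contribution t/|M(X)| has expectation 1 per document. A toy check of that identity in isolation (the real loop also short-circuits when a drawn query's results contain the same row, which this sketch omits):

import random

match_query = [frozenset(['a']), frozenset(['b']), frozenset(['c'])]
cur = frozenset(['a'])
total, trials = 0.0, 10000
for _ in range(trials):
    t = 0
    while True:
        t += 1
        if random.choice(match_query) == cur:
            total += t / len(match_query)
            break
print(total / trials)  # close to 1.0: one unit per document in expectation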