Esempio n. 1
0
def sep_stratum(first_stratum, fname):
    """
    Separate the second stratum out of a two-strata sampled qrels file.

    For every query id of the first-stratum qrels, each judgment of the
    sampled qrels whose document is absent from the first stratum's
    judgments for that query is written to ``fname + "-2s"`` as a
    ``<qid> 0 <doc> <rel>`` line.

    :param first_stratum: name of the qrels file holding the first stratum
    :param fname: name of the qrels file holding the two-strata sample;
        also the prefix of the output file name
    :return: None
    """
    first_qrel = Qrel(first_stratum)
    sampled_qrel = Qrel(fname)
    qrel_str = []
    for qid in first_qrel.get_qid():
        curr_qrel = first_qrel.get_rel_by_qid(qid)
        curr_sampled_qrel = sampled_qrel.get_rel_by_qid(qid)
        # items() exists on both Python 2 and 3; iteritems() is Python 2 only.
        for doc, rel in curr_sampled_qrel.items():
            if doc not in curr_qrel:
                qrel_str.append(str(qid) + " 0 " + doc + " " + str(rel))
    with open(fname + "-2s", "w") as fout:
        fout.writelines(line + "\n" for line in qrel_str)
Esempio n. 2
0
def main(argv):
    """
    Run the hybrid optimisation for every query of a qrels file.

    Recognised options:

    * ``-j`` / ``--jfile <qrelfile>``: qrels file to load (default ``""``)
    * ``-d`` / ``--depth <depth>``: integer depth (default ``10``)
    * ``-c`` / ``--collection <name>``: collection name used to build the
      ``testcase/<collection>/...`` directories (default ``"robust"``)
    * ``-h`` / ``--help``: print the usage line and exit
    * ``--runf``: still accepted for backward compatibility, but ignored

    For each query four ``HybridOpt`` workers (one per index 0..3) are
    started and joined, and their ``res`` attributes are handed to
    ``get_doc_prob``.

    :param argv: command-line arguments without the program name
    :return: None; exits with status 2 on an invalid command line
    """
    usage = '-j <qrelfile> -d <depth> -c <collection> -h help'
    worker_num = 4
    qrelfile = ""
    depth = 10
    collection = "robust"

    try:
        # Long options that carry a value need the trailing "=".
        opts, _ = getopt.getopt(
            argv, "j:d:hc:",
            ["jfile=", "depth=", "collection=", "help", "runf"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(usage)
            sys.exit()
        elif opt in ("-j", "--jfile"):
            qrelfile = arg
        elif opt in ("-d", "--depth"):
            depth = int(arg)
        elif opt in ("-c", "--collection"):
            collection = arg

    prefix_dir = "testcase/"
    rank_dir = prefix_dir + collection + "/doc_rank/"
    out_dir = (prefix_dir + collection +
               "/background_gain/sample_rbp/hybrid/" + str(depth) + "/")

    curr_qrel = Qrel(qrelfile)
    for qid in curr_qrel.get_qid():
        # A fresh list per query: only this query's workers are joined.
        workers = []
        for i in range(worker_num):
            worker = HybridOpt(0.95, 1000, qid,
                               curr_qrel.get_rel_by_qid(qid),
                               out_dir, rank_dir, (depth, i))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
        w_param = [worker.res for worker in workers]
        get_doc_prob(out_dir, w_param, qid, depth)
Esempio n. 3
0
class NaiveEstimator:
    """
    Compute run-overlap statistics and sample coverage per query, and
    estimate the total number of relevant documents using Chao92.

    Rank files are read from ``<rank_dir><qid>-rank.txt``. Every line is
    comma separated: the first field is a document id, each further field
    is an integer rank in one run. A rank counts as "retrieved" when it is
    ``>= 0`` and below the considered depth; anything else (negative, or
    at/after the depth) counts as not retrieved.
    """

    def __init__(self, qrelname, d, gname, gidx):
        """
        init Chao92 estimator
        :param qrelname: qrel name
        :param d: considered pooling depth
        :param gname: group name, used as output file prefix when ``gidx``
            is not empty
        :param gidx: indices of the run columns that ``avg_sim_all`` drops
            from the rank matrix; empty to keep every run
        """
        self._d = d
        self._qrel = Qrel(qrelname)
        self._qid = self._qrel.get_qid()
        self._isout = False
        if len(gidx) > 0:
            self._gname = gname
            self._gidx = np.array(gidx)
            self._isout = True

    @staticmethod
    def _load_rank_mat(path, col_num):
        """Load the run columns (all but the doc id) as a 2-D int matrix."""
        # ndmin=2 keeps the matrix 2-D for single-row / single-run files.
        return np.loadtxt(path,
                          usecols=range(1, col_num),
                          delimiter=",",
                          dtype=int,
                          ndmin=2)

    @staticmethod
    def _binarize(rank_mat, depth):
        """Return a 0/1 int matrix: 1 where ``0 <= rank < depth``."""
        return ((rank_mat >= 0) & (rank_mat < depth)).astype(int)

    @staticmethod
    def _pairwise_jaccard_sum(flag_mat):
        """Sum the Jaccard overlap over all unordered pairs of columns.

        A pair whose union is empty contributes 0.
        """
        total = 0.0
        run_num = flag_mat.shape[1]
        for j in range(run_num - 1):
            sys_i = flag_mat[:, j]
            for r in range(j + 1, run_num):
                sys_j = flag_mat[:, r]
                union = np.sum(sys_i | sys_j)
                if union > 0:
                    # float() forces true division on Python 2 as well.
                    total += float(np.sum(sys_i & sys_j)) / union
        return total

    @staticmethod
    def _coverage(flag_mat):
        """Return ``(hat_c, sum_ff, tot_sample)`` for a 0/1 matrix.

        ``tot_sample`` is the number of ones, ``sum_ff`` is the sum of
        ``f * (f - 1)`` over the row sums ``f``, and ``hat_c`` is
        ``1 - (#rows with f == 1) / tot_sample`` (0.0 for an empty sample).
        """
        f_stat = np.sum(flag_mat, axis=1)
        tot_sample = int(np.sum(f_stat))
        sum_ff = float(np.sum(f_stat * (f_stat - 1)))
        if tot_sample == 0:
            # Nothing was sampled: treat coverage as zero instead of 0/0.
            return 0.0, sum_ff, tot_sample
        singletons = int(np.sum(f_stat == 1))
        return 1 - float(singletons) / tot_sample, sum_ff, tot_sample

    @staticmethod
    def _gamma_sq(num_items, hat_c, sum_ff, tot_sample):
        """Squared coefficient of variation, clipped at 0 (needs hat_c != 0)."""
        # hat_c != 0 implies tot_sample >= 2, so the denominator is non-zero.
        tmp_val = (num_items / hat_c) / ((tot_sample - 1) * tot_sample)
        return max(sum_ff * tmp_val - 1, 0)

    def naive_estimator(self, rank_dir, out_dir):
        """
        Estimate the number of relevant documents per query and write the
        result to ``<out_dir>[<gname>-]<d>-rsim.txt``.

        Only documents with a positive relevance in the qrels are used.
        Output columns: qid, mean pairwise Jaccard of the runs, sample
        coverage divided by the number of runs, ``sqrt(gamma^2)`` divided
        by the number of runs, estimated total and number of relevant
        documents found in the rank file. Queries without any relevant
        document keep zeros in every column but qid.

        NOTE(review): unlike ``avg_sim_all`` the ``gidx`` columns are not
        removed here even though ``gname`` is used in the file name --
        confirm this is intended.

        :param rank_dir: directory prefix of the ``<qid>-rank.txt`` files
        :param out_dir: directory prefix of the output file
        :return: None
        """
        jaccard_mat = np.zeros((len(self._qid), 6))
        jaccard_mat[:, 0] = self._qid
        for q, qid in enumerate(self._qid):
            curr_rel = self._qrel.get_rel_by_qid(qid)
            rank_path = rank_dir + str(qid) + "-rank.txt"
            doc_idx = []
            col_num = 0
            # Text mode so that split(",") works on Python 3 as well.
            with open(rank_path, "r") as fin:
                for i, line in enumerate(fin):
                    curr_line = line.strip().split(",")
                    if col_num == 0:
                        col_num = len(curr_line)
                    doc = curr_line[0]
                    if doc in curr_rel and curr_rel[doc] > 0:
                        doc_idx.append(i)
            if not doc_idx:
                continue  # row stays all-zero apart from the qid
            rank_mat = self._load_rank_mat(rank_path, col_num)
            flags = self._binarize(rank_mat[np.array(doc_idx), :], self._d)
            run_num = flags.shape[1]
            if run_num == 0:
                continue
            k = len(doc_idx)
            hat_c, sum_ff, tot_sample = self._coverage(flags)
            if hat_c == 0:
                gamma = 0.0
                est_n = k
            else:
                gamma_sq = self._gamma_sq(k, hat_c, sum_ff, tot_sample)
                est_n = (k / hat_c) + \
                    gamma_sq * (tot_sample * (1 - hat_c)) / hat_c
                gamma = np.sqrt(gamma_sq) / run_num
            # Normalise with this query's own run count; the value left
            # over from the last query is wrong when that query had no
            # relevant documents.
            pair_num = (run_num - 1) * run_num * 0.5
            if pair_num > 0:
                jaccard_mat[q, 1] = \
                    self._pairwise_jaccard_sum(flags) / pair_num
            jaccard_mat[q, 2] = hat_c / run_num  # average to per run
            jaccard_mat[q, 3] = gamma
            jaccard_mat[q, 4] = est_n
            jaccard_mat[q, 5] = k
        fname = out_dir + str(self._d) + "-rsim.txt"
        if self._isout:
            fname = out_dir + self._gname + "-" + str(self._d) + "-rsim.txt"
        np.savetxt(fname=fname,
                   X=jaccard_mat,
                   fmt="%.4f",
                   delimiter=",",
                   header="qid, jaccard, hat_c, gamma, est_N, tot_N")

    def avg_sim_all(self, rank_dir, out_dir, k=1000):
        """
        Calculate different similarity measurement over all documents of
        the rank files and write them to
        ``<out_dir>[<gname>-]<d>-asim.txt``.

        Output columns: qid, mean pairwise Jaccard of the runs, sample
        coverage divided by the number of runs, and gamma.

        :param rank_dir: directory prefix of the ``<qid>-rank.txt`` files
        :param out_dir: directory prefix of the output file
        :param k: considered depth
        :return: None
        """
        jaccard_mat = np.zeros((len(self._qid), 4))
        jaccard_mat[:, 0] = self._qid
        for i, qid in enumerate(self._qid):
            rank_path = rank_dir + str(qid) + "-rank.txt"
            # Text mode so that split(",") works on Python 3 as well.
            with open(rank_path, "r") as fin:
                col_num = len(fin.readline().strip().split(","))
            rank_mat = self._load_rank_mat(rank_path, col_num)
            if self._isout:
                rank_mat = np.delete(rank_mat, self._gidx, axis=1)
            flags = self._binarize(rank_mat, k)
            run_num = flags.shape[1]
            if run_num == 0:
                continue
            hat_c, sum_ff, tot_sample = self._coverage(flags)
            gamma = 0.0
            if hat_c != 0:
                # Same zero-coverage guard as naive_estimator.
                gamma_sq = self._gamma_sq(flags.shape[0], hat_c, sum_ff,
                                          tot_sample)
                gamma = np.sqrt(gamma_sq) / run_num
            pair_num = (run_num - 1) * run_num * 0.5
            if pair_num > 0:
                jaccard_mat[i, 1] = \
                    self._pairwise_jaccard_sum(flags) / pair_num
            jaccard_mat[i, 2] = hat_c / run_num  # average to per run
            # NOTE(review): gamma is divided by the run count a second
            # time here, as the original code did; naive_estimator divides
            # only once -- confirm which is intended.
            jaccard_mat[i, 3] = gamma / run_num
        fname = out_dir + str(self._d) + "-asim.txt"
        if self._isout:
            fname = out_dir + self._gname + "-" + str(self._d) + "-asim.txt"
        print(fname)
        np.savetxt(fname=fname,
                   X=jaccard_mat,
                   fmt="%.4f",
                   delimiter=",",
                   header="qid, jaccard, hat_c, gamma")