Example #1
def pickle_payload(slave_id):
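    # Resolve each sampled (query id, doc id) pair to its text and dump the instances in fixed-size pickled blocks.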
    query_path = "/mnt/nfs/work3/youngwookim/code/adhoc/robus/queries.train.tsv"
    docs = load_robust("/mnt/nfs/work3/youngwookim/data/robust04")
    query = load_marco_query(query_path)

    print("{}] Load sampled".format(slave_id))

    sample_filename = "id_pair_{}.pickle".format(slave_id)
    sampled = pickle.load(open(sample_filename, "rb"))

    inst = []
    for q_id_1, doc_id_1, q_id_2, doc_id_2 in sampled:
        q1 = query[q_id_1]
        d1 = docs[doc_id_1]

        q2 = query[q_id_2]
        d2 = docs[doc_id_2]
        inst.append((q1, d1, q2, d2))

    print(len(inst))
    step = 1000
    n_block = int(len(inst) / step)
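    # NOTE: only full blocks of size step are written; a trailing partial block is dropped.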
    for i in range(n_block):
        st = i * step
        ed = (i + 1) * step
        name = str(slave_id) + "_" + str(i)
        pickle.dump(inst[st:ed],
                    open("../output/plain_pair_{}.pickle".format(name), "wb"))
Example #2
def split_docs():
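    # Split each Robust04 document into fixed-size character windows and write them out in TREC <DOC> format.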
    print("loading...")
    collection = trec.load_robust(trec.robust_path)
    window_size = 200 * 3

    def sample_shift():
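        # Random shift amount for overlapping windows (currently unused; see the commented-out call at the end of the loop below).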
        return random.randrange(0, window_size * 2)

    fout = open("rob04.split.txt", "w")
    def write(new_id, text_span):
        fout.write("<DOC>\n")
        fout.write("<DOCNO>{}</DOCNO>\n".format(new_id))
        fout.write("<TEXT>\n")
        fout.write(text_span)
        fout.write("</TEXT>\n")
        fout.write("</DOC>\n")

    print("writing...")
    for doc_id in collection:
        content = collection[doc_id]
        loc_ptr = 0
        while loc_ptr < len(content):
            text_span = content[loc_ptr:loc_ptr + window_size]
            new_id = doc_id + "_{}".format(loc_ptr)
            write(new_id, text_span)
            loc_ptr += window_size  # sample_shift()
    fout.close()
Example #3
def main():
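    # Summarize document-length statistics over the full Robust04 collection.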
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    data = trec.load_robust(robust_path)

    def iter_docs():
        for doc_id, text in data.items():
            yield text

    summarize_doc_length(iter_docs())
Example #4
def sanity_check():
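    # Sanity check: write the top-ranked document of each query to its own file.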
    ranked_list = load_2k_rank()
    collection = trec.load_robust(trec.robust_path)

    def process(doc):
        return doc.lower().split()

    for q_id, listings in ranked_list.items():
        for doc_id, rank, score in listings[:1]:
            docs_path = os.path.join(cpath.data_path, "robust", "docs", doc_id)
            content = process(collection[doc_id])
            # process() returns a token list, so join it before writing.
            with open(docs_path, "w") as f:
                f.write(" ".join(content))
Example #5
def add_rob_sents():
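    # Sentence-tokenize every Robust04 document, truncating overlong sentences, and register them via add_sents().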
    print("loading...")
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    for doc_id in collection:
        content = collection[doc_id]
        sents = sent_tokenize(content)

        for i, s in enumerate(sents):
            if len(s) > 500:
                sents[i] = s[:500]

        add_sents(doc_id, sents)
Example #6
    def __init__(self, out_dir):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        tprint("Loading doc ids")
        self.doc_ids = all_doc_ids_of_interest()
        tprint("Loading robust docs")
        self.docs: Dict[str, str] = trec.load_robust(robust_path)
        tprint("Start processing")

        n_docs = len(self.doc_ids)
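        # n_jobs is assumed to be defined elsewhere in the module; the doc ids are partitioned into fixed-size chunks for parallel processing.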
        docs_per_job = int((n_docs + n_jobs) / 5)
        self.docs_per_job = docs_per_job
        self.tokenizer = PCTokenizer()
        self.out_dir = out_dir
Example #7
def save_doc_len():
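    # Tokenize every Robust04 document and pickle a doc_id -> token-count mapping.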
    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    ticker = TimeEstimator(len(collection))

    doc_len = dict()
    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        doc_len[doc_id] = len(tokens)
        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc", "doc_len.pickle")
    pickle.dump(doc_len, open(save_path, "wb"))
Example #8
def build_krovetz_index():
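    # Build a positional inverted index over Robust04 using Krovetz stemming, skipping stopwords.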
    stemmer = Stemmer()
    stopwords = load_stopwords()

    stem_dict = dict()

    def stem(token):
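        # Return the cached stem if this token was seen before; otherwise stem it and cache the result.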
        if token in stem_dict:
            return stem_dict[token]
        else:
            r = stemmer.stem(token)
            stem_dict[token] = r
            return r

    collection = trec.load_robust(trec.robust_path)
    print("writing...")
    inv_index = dict()
    ticker = TimeEstimator(len(collection))

    for doc_id in collection:
        content = collection[doc_id]
        tokens = nltk.tokenize.wordpunct_tokenize(content)
        terms = dict()
        for idx, t in enumerate(tokens):
            if t in stopwords:
                continue

            t_s = stem(t)

            if t_s not in terms:
                terms[t_s] = list()

            terms[t_s].append(idx)

        for t_s in terms:
            if t_s not in inv_index:
                inv_index[t_s] = list()

            posting = (doc_id, terms[t_s])
            inv_index[t_s].append(posting)

        ticker.tick()

    save_path = os.path.join(cpath.data_path, "adhoc",
                             "robust_inv_index.pickle")
    pickle.dump(inv_index, open(save_path, "wb"))
Example #9
def main():
    top_k = 1000
    galago_rank = load_bm25_best()

    doc_id_set = set()
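    # Gather the doc ids that appear in each query's top-k BM25 results.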
    for query_id, ranked_list in galago_rank.items():
        ranked_list.sort(key=lambda x: x[1])
        doc_id_set.update([x[0] for x in ranked_list[:top_k]])
    doc_id_list = list(doc_id_set)
    robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
    data = load_robust(robust_path)

    save_d = {}
    for doc_id in doc_id_list:
        try:
            save_d[doc_id] = data[doc_id]
        except KeyError:
            print(doc_id, "not found")

    save_to_pickle(save_d, "robust04_docs_predict")
Example #10
    def __init__(self):
        robust_path = "/mnt/nfs/work3/youngwookim/data/robust04"
        self.data = trec.load_robust(robust_path)