Example #1
def remove_frequent_terms(frequency=100):
    # move terms whose doc_frequency exceeds the threshold out of the
    # per-letter indexes and into their own single-term index files
    for l in tqdm(os.listdir(INDEX_PATH),
                  ascii=True,
                  desc="Giving most frequent terms their own index"):
        index = utils.load_index("indexes/" + l.split(".")[0])
        for term in list(index.keys()):
            if index[term]["doc_frequency"] > frequency:
                new = {term: index[term]}
                utils.save_index(new, "indexes/inverted_index_" + term)
                index.pop(term)
        utils.save_index(index, filename="indexes/" + l.split(".")[0])
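For reference, each per-letter index maps a term to its document frequency and a postings dictionary. The shape below is inferred from how the snippets access the entries; the concrete term and paper ids are invented for illustration:

index = {
    "entanglement": {
        "doc_frequency": 2,   # number of papers containing the term
        "doc_ids": {          # paper id -> in-document term count
            "1801.00001": 5,
            "1801.00002": 1,
        },
    },
}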
Example #2
def remove_single_docs():
    # drop postings where a term occurs at most once in a document, then
    # drop terms that no longer appear in any document
    for l in tqdm(os.listdir(INDEX_PATH),
                  ascii=True,
                  desc="Removing terms with single doc_frequency"):
        index = utils.load_index("indexes/" + l.split(".")[0])
        for term in list(index.keys()):
            for doc_id in list(index[term]["doc_ids"].keys()):
                if index[term]["doc_ids"][doc_id] <= 1:
                    index[term]["doc_ids"].pop(doc_id)
                    index[term]["doc_frequency"] -= 1

            if index[term]["doc_frequency"] < 1:
                index.pop(term)
        utils.save_index(index, filename="indexes/" + l.split(".")[0])
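A tiny worked example of the pruning, with invented data: the posting with count 1 is popped and doc_frequency is decremented; a term whose doc_frequency falls below 1 would be removed entirely.

before = {"quark": {"doc_frequency": 2, "doc_ids": {"p1": 3, "p2": 1}}}
# "p2" holds the term only once, so remove_single_docs pops it and
# decrements doc_frequency, leaving:
after = {"quark": {"doc_frequency": 1, "doc_ids": {"p1": 3}}}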
Example #3
def build_papers_index(filename, save=False):
    paper_index = dict()
    with open(filename, "rb") as f:
        papers_as_json = f.readlines()  # one JSON-encoded paper per line

        for p in papers_as_json:
            paper, paperID = get_paper_from_json(p)
            paper_index[paperID] = paper
            CITATION_COUNTS[paperID] = paper["citations"]

        # save papers index to compressed file
        if save:
            utils.save_index(paper_index, filename="papers_index")

    return paper_index
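get_paper_from_json is not shown on this page. Below is a minimal sketch of what it might look like, assuming one JSON object per line; only the "citations" field is confirmed by build_papers_index, the other field names are guesses:

import json

def get_paper_from_json(line):
    record = json.loads(line)
    # "citations" is read by build_papers_index; "id", "title" and
    # "abstract" are assumed field names, not confirmed by the snippets
    paper = {
        "title": record.get("title", ""),
        "abstract": record.get("abstract", ""),
        "citations": record.get("citations", 0),
    }
    return paper, record["id"]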
Example #4
def main():
    # create the output directory if it does not already exist
    os.makedirs("indexes", exist_ok=True)
    for filename in os.listdir(ARXIV_PATH):
        papers_index = build_papers_index(ARXIV_PATH + filename)
        inverted_index = build_inverted_index(papers_index,
                                              debug=False,
                                              desc=filename)
        split_and_save(inverted_index)
    sort_indexes()
    remove_single_docs()  # 2.25GB before, 1.19GB after
    remove_frequent_terms()
    utils.save_index(CITATION_COUNTS, filename="citations")
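The snippets share module-level setup that is not shown on this page. A plausible reconstruction, with names taken from the code above and the concrete values as assumptions:

import os
from tqdm import tqdm
import utils  # project module providing load_index/save_index

ARXIV_PATH = "arxiv/"    # metadata files, one JSON paper per line
INDEX_PATH = "indexes/"  # per-letter inverted index files
ALPHABET = "abcdefghijklmnopqrstuvwxyz"
CITATION_COUNTS = {}     # paper id -> citation count, filled while indexing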
Example #5
def split_and_save(index):
    for l in tqdm(ALPHABET, ascii=True, desc="Processing letters"):
        terms = [i for i in index.keys() if i.startswith(l)]
        try:
            tmp = utils.load_index(filename="indexes/inverted_index_" + l)
        except FileNotFoundError:
            tmp = dict()
        for term in terms:
            if term not in tmp:
                # new term: copy its entry into the per-letter index
                tmp[term] = index[term]
            else:
                # existing term: merge document frequency and postings
                tmp[term]["doc_frequency"] += index[term]["doc_frequency"]
                for pid in list(index[term]["doc_ids"].keys()):
                    tmp[term]["doc_ids"][pid] = index[term]["doc_ids"][pid]
        utils.save_index(tmp, filename="indexes/inverted_index_" + l)
Example #6
def sort_indexes():
    for l in tqdm(os.listdir(INDEX_PATH), ascii=True, desc="Sorting indexes"):
        index = utils.load_index("indexes/" + l.split(".")[0])
        # order terms by descending document frequency
        index = dict(
            sorted(index.items(),
                   key=lambda item: item[1]["doc_frequency"],
                   reverse=True))

        # within each term, order postings by descending in-document count
        for term in index.keys():
            index[term]["doc_ids"] = dict(
                sorted(index[term]["doc_ids"].items(),
                       key=lambda item: item[1],
                       reverse=True))

        utils.save_index(index, filename="indexes/" + l.split(".")[0])
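The utils helpers used throughout are also not shown. A minimal sketch, assuming pickled, gzip-compressed files (Example #3's comment mentions a compressed file, but the actual serialization format is an assumption):

import gzip
import pickle

def save_index(index, filename):
    # the ".pkl.gz" suffix is an assumption, consistent with the snippets
    # stripping extensions from directory listings via split(".")[0]
    with gzip.open(filename + ".pkl.gz", "wb") as f:
        pickle.dump(index, f)

def load_index(filename):
    with gzip.open(filename + ".pkl.gz", "rb") as f:
        return pickle.load(f)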