Code example #1
import os
import heapq
import shelve
from collections import Counter
from typing import Union, List, Tuple
from tqdm import tqdm
from utils import timer,load_wapo
from text_processing import TextProcessing

# Customized text processing wrapper around NLTK (see text_processing.py)
text_processor = TextProcessing.from_nltk()


@timer
def build_inverted_index(
    wapo_jl_path: Union[str, os.PathLike], index_shelve_path: str
) -> None:
    """
    load wapo_pa3.jl to build the inverted index and store the index as a shelf in the provided path
    :param wapo_jl_path:
    :param index_shelve_path:
    :return:
    """


    # Note: generating the whole inverted index in memory and assigning it to
    # the shelf at the end would be a big speed improvement, but doing so
    # ignores the whole point of using a shelf for the index.
    # The current iteration takes about 15-25 minutes to run.
    with shelve.open(index_shelve_path, flag='n', writeback=True) as index:
        index["___count"] = Counter()  # used for analysis in custom processing
        for doc in load_wapo(wapo_jl_path):
            normal_tokens, stops = text_processor.get_normalized_tokens(
                doc['title'], doc['content_str']
            )
            # The listing is truncated here; a plausible continuation (an
            # assumption, not the original code) tallies the removed stop
            # words and appends the document id (assumed to live under
            # doc['id']) to each token's postings list.
            index["___count"].update(stops)
            for token in normal_tokens:
                index.setdefault(token, []).append(doc['id'])
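
For comparison, here is a minimal sketch of the in-memory alternative mentioned in the note above: accumulate postings in a plain dict and write them to the shelf in one pass at the end. It reuses the imports and text_processor from the listing and assumes the same doc['id'] field; it illustrates the speed/persistence trade-off and is not the original assignment code.

@timer
def build_inverted_index_in_memory(
    wapo_jl_path: Union[str, os.PathLike], index_shelve_path: str
) -> None:
    """Faster variant: build postings in memory, then dump to the shelf once."""
    postings = {"___count": Counter()}
    for doc in load_wapo(wapo_jl_path):
        normal_tokens, stops = text_processor.get_normalized_tokens(
            doc['title'], doc['content_str']
        )
        postings["___count"].update(stops)
        for token in normal_tokens:
            postings.setdefault(token, []).append(doc['id'])  # 'id' is assumed
    with shelve.open(index_shelve_path, flag='n') as index:
        index.update(postings)  # single bulk write instead of per-token I/O


# Example call (paths follow the pa3_data layout used in example #2):
# build_inverted_index_in_memory("pa3_data/wapo_pa3.jl", "pa3_data/index_shelf")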
Code example #2
"""
    I made this simple script to test the effect of three popular stemming
    algorithms on the number of distinct tokens returned. In increasing
    aggressiveness:
    - Porter
    - Snowball
    - Lancaster
"""
from itertools import islice
from pathlib import Path

from nltk.corpus import stopwords  # type: ignore
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

from text_processing import TextProcessing
from utils import load_wapo

snow = TextProcessing(stemmer=SnowballStemmer('english').stem,
                      stop_words=stopwords.words("english"))
port = TextProcessing.from_nltk()  # Porter stemmer
lan = TextProcessing(stemmer=LancasterStemmer().stem,
                     stop_words=stopwords.words("english"))
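
To see the difference in aggressiveness directly, here is a quick side-by-side of the three raw NLTK stemmers on a few sample words (the word list is just an illustration, not part of the original script):

def compare_stemmers():
    words = ["running", "generously", "organization", "maximum"]
    for name, stem in [("porter", PorterStemmer().stem),
                       ("snowball", SnowballStemmer("english").stem),
                       ("lancaster", LancasterStemmer().stem)]:
        # Print each stemmer's output for the same word list side by side
        print(f"{name:>9}:", [stem(w) for w in words])

This needs `from nltk.stem import PorterStemmer` in addition to the imports above.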

data_dir = Path("pa3_data")
wapo_path = data_dir.joinpath("wapo_pa3.jl")
ss = set()  # distinct Snowball stems
ps = set()  # distinct Porter stems
ls = set()  # distinct Lancaster stems

for doc in islice(load_wapo(wapo_path), 200):  # sample the first 200 docs
    ss.update(snow.get_normalized_tokens("", doc['content_str'])[0])
    ps.update(port.get_normalized_tokens("", doc['content_str'])[0])
    ls.update(lan.get_normalized_tokens("", doc['content_str'])[0])
print("Snow", len(ss), "port", len(ps), "lan", len(ls))