import os,heapq import shelve from collections import Counter from typing import Union, List, Tuple from tqdm import tqdm from utils import timer,load_wapo from text_processing import TextProcessing text_processor = TextProcessing.from_nltk() # include your customized text processing class @timer def build_inverted_index( wapo_jl_path: Union[str, os.PathLike], index_shelve_path: str ) -> None: """ load wapo_pa3.jl to build the inverted index and store the index as a shelf in the provided path :param wapo_jl_path: :param index_shelve_path: :return: """ # Note: Generating inverted index and then assigning it to shelf --> big speed improvement #---> but doing so ignores the whole point of using shelf for the index # Current iteration takes about 15-25 minutes to run with shelve.open(index_shelve_path,flag='n',writeback=True) as index: index["___count"] = Counter() #this is used for analysis in custom processing for doc in load_wapo(wapo_jl_path): normal_tokens, stops= text_processor.get_normalized_tokens(doc['title'],doc['content_str'])
"""
Compare how many distinct tokens three popular stemmers leave behind.

A sample of documents is run through three ``TextProcessing`` pipelines and
the size of each resulting vocabulary is printed. The stemmers, in
increasing aggressiveness:
    - Porter (presumably what ``TextProcessing.from_nltk()`` uses -- confirm)
    - Snowball
    - Lancaster
"""
from itertools import islice
from pathlib import Path

from nltk.corpus import stopwords  # type: ignore
from nltk.stem import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

from text_processing import TextProcessing
from utils import load_wapo

# Number of documents sampled from the start of the corpus for the comparison.
SAMPLE_SIZE = 200

snow = TextProcessing(
    stemmer=SnowballStemmer('english').stem,
    stop_words=stopwords.words("english"),
)
port = TextProcessing.from_nltk()
lan = TextProcessing(
    stemmer=LancasterStemmer().stem,
    stop_words=stopwords.words("english"),
)

data_dir = Path("pa3_data")
wapo_path = data_dir.joinpath("wapo_pa3.jl")

# One vocabulary (set of distinct normalized tokens) per stemmer.
ss = set()
ps = set()
ls = set()

# islice stops after SAMPLE_SIZE documents, so the whole corpus is never
# materialized in memory just to keep its first few entries.
for doc in islice(load_wapo(wapo_path), SAMPLE_SIZE):
    content = doc['content_str']
    # get_normalized_tokens returns a tuple; element 0 is taken as the
    # normalized tokens. The title argument is left empty so only the
    # document body contributes to the vocabulary.
    ss.update(snow.get_normalized_tokens("", content)[0])
    ps.update(port.get_normalized_tokens("", content)[0])
    ls.update(lan.get_normalized_tokens("", content)[0])

print("Snow", len(ss), "port", len(ps), "lan", len(ls))