def main():
    """REPL that runs n-gram queries against the MeSH corpus with a simple search engine."""
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)
    corpus = InMemoryCorpus(os.path.join(data_path, "mesh.txt"))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
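# ShingleGenerator(3) above tokenizes queries and documents into overlapping
# 3-grams, so a "match_threshold" of 0.5 means at least half of a query's
# shingles must occur in a document for it to match. A minimal illustrative
# sketch of character-level shingling, assuming that is what the course
# tokenizer produces (it is not the course class itself):
def character_shingles(text, width=3):
    """Return the overlapping character n-grams of a string."""
    return [text[i:i + width] for i in range(len(text) - width + 1)]

# Example: character_shingles("banana") -> ["ban", "ana", "nan", "ana"]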
def main():
    """REPL that locates words and phrases that are MeSH terms using an Aho-Corasick trie."""
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder
    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, "mesh.txt"))
    dictionary = Trie()
    for document in corpus:
        dictionary.add(normalizer.normalize(normalizer.canonicalize(document["body"])), tokenizer)
    engine = StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")

    def evaluator(text):
        matches = []
        engine.scan(normalizer.normalize(normalizer.canonicalize(text)), lambda m: matches.append(m))
        return matches

    simple_repl("text", evaluator)
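# The Trie/StringFinder pair above implements dictionary-based phrase spotting:
# every normalized MeSH term is inserted into a trie, and the finder walks the
# trie while scanning the input. A minimal illustrative trie for plain strings
# (not the course Trie, whose add() also takes a tokenizer):
class TinyTrie:
    """A bare-bones character trie supporting insertion and exact membership tests."""

    def __init__(self):
        self.children = {}
        self.final = False

    def add(self, phrase):
        node = self
        for ch in phrase:
            node = node.children.setdefault(ch, TinyTrie())
        node.final = True

    def __contains__(self, phrase):
        node = self
        for ch in phrase:
            node = node.children.get(ch)
            if node is None:
                return False
        return node.final

# Example: t = TinyTrie(); t.add("aspirin"); "aspirin" in t -> True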
def main():
    """REPL that classifies text by language with a naive Bayes classifier."""
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {language: InMemoryCorpus(os.path.join(data_path, f"{language}.txt"))
                    for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print("Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results

    simple_repl("text", evaluator)
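# The REPL reports log-probabilities because naive Bayes multiplies many small
# per-term probabilities, and summing their logarithms avoids numeric
# underflow. An illustrative sketch of the per-class score such a classifier
# computes (not the course API; add-one Laplace smoothing is an assumption):
import math

def naive_bayes_log_score(prior, term_counts, total_terms, vocabulary_size, terms):
    """Compute log P(class) plus the sum of log P(term | class) with add-one smoothing."""
    score = math.log(prior)
    for term in terms:
        score += math.log((term_counts.get(term, 0) + 1) / (total_terms + vocabulary_size))
    return score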
def main():
    """REPL that lets the user inspect posting lists in an inverted index over the Cranfield corpus."""
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    print("Building inverted index from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, "cran.xml"))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    print("Enter one or more index terms and inspect their posting lists.")

    def evaluator(query):
        terms = index.get_terms(query)
        return {term: list(index.get_postings_iterator(term)) for term in terms}

    simple_repl("terms", evaluator)
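# Posting lists are kept sorted by document ID, which is what makes Boolean
# AND queries cheap: two sorted lists can be intersected in linear time with
# two cursors. An illustrative sketch over plain document IDs (the course
# posting objects presumably carry more fields, such as term frequencies):
def intersect(postings_a, postings_b):
    """Linear-time intersection of two posting lists given as sorted lists of document IDs."""
    i = j = 0
    result = []
    while i < len(postings_a) and j < len(postings_b):
        if postings_a[i] == postings_b[j]:
            result.append(postings_a[i])
            i += 1
            j += 1
        elif postings_a[i] < postings_b[j]:
            i += 1
        else:
            j += 1
    return result

# Example: intersect([1, 3, 5, 9], [3, 4, 5]) -> [3, 5]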
def main():
    """REPL that runs prefix phrase queries against a suffix array built from the Cranfield corpus."""
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, "cran.xml"))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
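# A suffix array supports prefix queries because all suffixes sharing a prefix
# form one contiguous block in the sorted order, which binary search can
# locate. An illustrative single-string sketch (not the course SuffixArray,
# which is built over a whole corpus; the naive sort here is fine for small
# inputs only, and bisect's key parameter requires Python 3.10+):
from bisect import bisect_left, bisect_right

def build_suffix_array(text):
    """Return suffix start offsets sorted by the lexicographic order of the suffixes."""
    return sorted(range(len(text)), key=lambda i: text[i:])

def find_prefix(text, suffix_array, prefix):
    """Return the offsets of all suffixes of text that start with the given prefix."""
    key = lambda i: text[i:i + len(prefix)]
    lo = bisect_left(suffix_array, prefix, key=key)
    hi = bisect_right(suffix_array, prefix, key=key)
    return suffix_array[lo:hi]

# Example: find_prefix("banana", build_suffix_array("banana"), "an") -> [3, 1],
# the start offsets of the two occurrences of "an".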