Example 1
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import ShingleGenerator
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex
    from ranking import BrainDeadRanker
    from searchengine import SimpleSearchEngine
    print("Indexing MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = ShingleGenerator(3)  # 3-shingles instead of ordinary word tokens
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    ranker = BrainDeadRanker()
    engine = SimpleSearchEngine(corpus, index)
    options = {"debug": False, "hit_count": 5, "match_threshold": 0.5}
    print("Enter a query and find matching documents.")
    print(f"Lookup options are {options}.")
    print(f"Tokenizer is {tokenizer.__class__.__name__}.")
    print(f"Ranker is {ranker.__class__.__name__}.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, ranker, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)
Example 2
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from ahocorasick import Trie, StringFinder
    print("Building trie from MeSH corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'mesh.txt'))
    dictionary = Trie()
    # Normalize each MeSH term the same way the scanned text is normalized
    # below, so that dictionary entries and input tokens line up.
    for document in corpus:
        dictionary.add(
            normalizer.normalize(normalizer.canonicalize(document["body"])),
            tokenizer)
    engine = StringFinder(dictionary, tokenizer)
    print("Enter some text and locate words and phrases that are MeSH terms.")

    def evaluator(text):
        matches = []
        engine.scan(normalizer.normalize(normalizer.canonicalize(text)),
                    lambda m: matches.append(m))
        return matches

    simple_repl("text", evaluator)
Example 3
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from naivebayesclassifier import NaiveBayesClassifier
    print("Initializing naive Bayes classifier from news corpora...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    languages = ["en", "no", "da", "de"]
    training_set = {language: InMemoryCorpus(os.path.join(data_path,f"{language}.txt")) for language in languages}
    classifier = NaiveBayesClassifier(training_set, ["body"], normalizer, tokenizer)
    print(f"Enter some text and classify it into {languages}.")
    print(f"Returned scores are log-probabilities.")

    def evaluator(text):
        results = []
        classifier.classify(text, lambda m: results.append(m))
        return results
    simple_repl("text", evaluator)
Example 4
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from invertedindex import InMemoryInvertedIndex

    print("Building inverted index from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    index = InMemoryInvertedIndex(corpus, ["body"], normalizer, tokenizer)
    print("Enter one or more index terms and inspect their posting lists.")

    def evaluator(terms):
        # get_terms normalizes and tokenizes the raw string into index terms.
        terms = index.get_terms(terms)
        return {
            term: list(index.get_postings_iterator(term))
            for term in terms
        }

    simple_repl("terms", evaluator)
Example 5
def main():
    import os.path
    from normalization import BrainDeadNormalizer
    from tokenization import BrainDeadTokenizer
    from corpus import InMemoryCorpus
    from suffixarray import SuffixArray
    print("Building suffix array from Cranfield corpus...")
    normalizer = BrainDeadNormalizer()
    tokenizer = BrainDeadTokenizer()
    corpus = InMemoryCorpus(os.path.join(data_path, 'cran.xml'))
    engine = SuffixArray(corpus, ["body"], normalizer, tokenizer)
    options = {"debug": False, "hit_count": 5}
    print("Enter a prefix phrase query and find matching documents.")
    print(f"Lookup options are {options}.")
    print("Returned scores are occurrence counts.")

    def evaluator(query):
        matches = []
        engine.evaluate(query, options, lambda m: matches.append(m))
        return matches

    simple_repl("query", evaluator)