def getTokenizedCorpus(self, tokenizer, corpus='', language='automatic', languages=None):
    """Parse a raw corpus to plain text, resolve its language, and tokenize it.

    Parameters:
        tokenizer: tokenizer object providing processWhitespaces(); forwarded
            to stripStopWords().
        corpus: raw corpus text (possibly containing markup) to be parsed.
        language: language of the corpus; the special value 'automatic'
            triggers a Bayesian guess over `languages`.
        languages: candidate languages for automatic detection; defaults to
            an empty list. (Was a mutable default `[]`, which is shared
            across calls — replaced with the None sentinel idiom.)

    Returns:
        dict with keys 'corpus' (plain text), 'tokenizedCorpus' (token list
        from stripStopWords) and 'language' (resolved language).
    """
    # Project-local imports kept at call time, matching the file's style.
    # (`analyser` was imported here originally but never used — dropped.)
    import helper
    import retriever
    import classifier

    # None-sentinel instead of a shared mutable default argument.
    if languages is None:
        languages = []

    # Normalise the raw corpus to plain text.
    parserHelper = helper.ParserHelper()
    corpus = parserHelper.getPlainText(corpus)

    # Stop-word list, shared by the classifier and the tokenization step.
    stopWords = retriever.StopWords()

    # Guess the language by Bayesian classification only when requested;
    # the classifier is now built lazily (the original constructed it
    # unconditionally) and no longer shadows the `classifier` module.
    if language == 'automatic':
        bayesClassifier = classifier.BayesClassifier()
        language = bayesClassifier.guessLanguage(stopWords, corpus, languages)

    # Strip stop words / tokenize for the resolved language.
    tokenizedCorpus = self.stripStopWords(tokenizer, stopWords, language, corpus)

    # Return the corpus information bundle.
    return dict(corpus=corpus, tokenizedCorpus=tokenizedCorpus, language=language)
def stripStopWords(self, tokenizer, stopWords='', language='', corpus=''):
    """Tokenize `corpus`, removing stop words for the given language.

    Parameters:
        tokenizer: object providing processWhitespaces(corpus, stopWordList,
            flag); the flag is 1 for German (paragraph handling — see the
            original call sites) and 0 otherwise.
        stopWords: stop-word provider with getStopWordList(); the empty
            string means "create a default retriever.StopWords()".
        language: language key used to look up the stop-word list.
        corpus: plain-text corpus to tokenize.

    Returns:
        list of tokens produced by the tokenizer.
    """
    # Build a default stop-word provider only when the caller did not
    # supply one; the project import is deferred into this branch so it
    # is paid (and required) only on the fallback path.
    if stopWords == '':
        import retriever
        stopWords = retriever.StopWords()

    # Stop-word list appropriate for the requested language.
    stopWordList = stopWords.getStopWordList(language)

    # Hoist the only difference between the two original calls into a flag:
    # German corpora are tokenized with paragraph handling enabled (1).
    paragraphFlag = 1 if language == 'german' else 0
    return list(tokenizer.processWhitespaces(corpus, stopWordList, paragraphFlag))
# get corpus # corpus = urlRetriever.retrieveURL('http://www.krohne-mar.com/Schwebekoerper-Durchflussmessgeraete_nass_kalibriert.11121.0.html') corpusSet = urlRetriever.retrieveURL('http://linguistik-fachschaft.de/info.html') corpus = corpusSet['corpus'] charset = corpusSet['charset'] # ML tag stripper mlStripper = retriever.MLStripper() # remove ML tags mlStripper.feed(corpus) corpus = mlStripper.getStripped() # stop word list stopWords = retriever.StopWords() stopWordList = stopWords.getStopWordList('german') # tokenizer tokenizer = analyser.Tokenizer() # tokenize tokens = tokenizer.processWhitespaces(corpus, stopWordList, 1) tokenCount = len(tokens) # analyse text structure textStructure = analyser.TextStructure() # get N-grams ngrams = textStructure.getNGrams(tokens, tokenCount)