import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer, SnowballStemmer


def process_text(text, stemmer=None, regexstr=None, lowercase=True,
                 removestop=False, verbose=True):
    """Helper function to pre-process text.

    Combines several preprocessing steps: lowercasing, stop-word removal,
    regex text cleaning, and stemming.
    """
    # resolve a stemmer passed by name
    if isinstance(stemmer, str):
        if stemmer.lower() == 'porter':
            stemmer = PorterStemmer()
        elif stemmer.lower() == 'snowball':
            stemmer = SnowballStemmer(language='english')
        else:
            stemmer = None

    # convert a text list or array to a pandas Series
    if isinstance(text, (list, np.ndarray)):
        processed = pd.Series(text)
    else:
        processed = text

    # make text lowercase
    if lowercase:
        if verbose:
            print('make text lowercase')
        processed = processed.str.lower()

    # remove stop words
    # NOTE: stop words with capitals are not removed!
    if removestop:
        if verbose:
            print('remove stop words')
        stop_words = sw.words('english')
        processed = processed.map(lambda t: ' '.join(
            [word for word in t.split() if word not in stop_words]))

    # apply regex expression
    if regexstr is not None:
        if verbose:
            print('apply regex expression')
        regex = re.compile(regexstr)
        processed = processed.str.replace(regex, ' ', regex=True)

    # stemming
    # NOTE: stemming makes all tokens lowercase
    if stemmer is not None:
        if verbose:
            print('stemming')
        processed = processed.map(
            lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

    if verbose:
        print('done')
    return processed
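# Example usage (a minimal sketch): requires the NLTK stopword corpus
# (nltk.download('stopwords')); runs lowercasing, stop-word removal, a regex
# clean-up, and Porter stemming over a small list of strings.
docs = ['The Cats are Running!', 'Dogs ran <b>quickly</b> today.']
cleaned = process_text(docs, stemmer='porter', regexstr=r'<[^>]+>|[^\w\s]',
                       removestop=True, verbose=False)
print(cleaned.tolist())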
import re

from nltk.stem import PorterStemmer


def getToken(self, line):
    # split the line into word tokens
    regex_word = re.compile(r'\w+')
    words = regex_word.findall(line)
    if self.isReversed:
        words.reverse()
    for token in words:
        # sentence-boundary markers are yielded unchanged
        if token in ['START', 'END']:
            yield token
            continue
        if token not in self._stops:
            if self._isStem:
                token = PorterStemmer().stem(token).lower()
                yield token
            else:
                token = token.lower()
                yield token
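# Illustrative usage: getToken is written as a method, so this sketch attaches
# it to a minimal stand-in class carrying the attributes it expects
# (isReversed, _stops, _isStem); the real host class is not shown above.
class _TokenizerSketch:
    def __init__(self):
        self.isReversed = False
        self._stops = {'the', 'a', 'of'}
        self._isStem = True

_TokenizerSketch.getToken = getToken  # attach the generator defined above
print(list(_TokenizerSketch().getToken('START the running dogs END')))
# -> ['START', 'run', 'dog', 'END']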
def boolean_clean(self):
    # normalise the raw query, keeping parentheses for boolean grouping
    search_edit = str(self.query_search)
    search_edit = re.sub('[^a-zA-Z0-9()]', ' ', search_edit)
    search_edit = re.split(r'[^a-zA-Z0-9()]', search_edit)
    search_edit = list(filter(None, search_edit))

    # stem and lowercase every token except the boolean operators
    temp_list = []
    for r in search_edit:
        if r not in ('AND', 'OR'):
            r = PorterStemmer().stem(r.lower())
        temp_list.append(r)

    # rebuild the cleaned query and parse it into a boolean AST
    # (BP.bool_expr_ast is an external parser helper)
    query_again = " ".join(str(x) for x in temp_list)
    search = BP.bool_expr_ast(query_again)
    return search
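# Illustrative sketch of the token-cleaning step above (BP.bool_expr_ast is an
# external boolean-expression parser and is not reproduced here): the query is
# reduced to alphanumerics and parentheses, and every token except the AND/OR
# operators is lowercased and stemmed.
query = '(Running AND dogs) OR cats'
parts = list(filter(None, re.split(r'[^a-zA-Z0-9()]',
                                   re.sub('[^a-zA-Z0-9()]', ' ', query))))
cleaned = ' '.join(p if p in ('AND', 'OR')
                   else PorterStemmer().stem(p.lower()) for p in parts)
print(cleaned)  # note: parentheses stay attached to the adjacent tokens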
import re

import nltk
from nltk.stem import PorterStemmer


def tokens(string):
    # tokenise and split snake_case words on underscores
    words = nltk.word_tokenize(string)
    tokenized = []
    for w in words:
        tokenized += w.split("_")

    # split camelCase property names; hard-coded for DBpedia suffixes
    un_cameled = []
    camel = re.compile("(.*)(Date|Name|Place)")
    for t in tokenized:
        out = camel.match(t)
        if out is None:
            un_cameled.append(t)
        else:
            un_cameled.append(out.group(1))
            un_cameled.append(out.group(2))

    # stem and lowercase every token
    stemmed = []
    for u in un_cameled:
        s = PorterStemmer().stem(u)
        stemmed.append(s.lower())

    # drop punctuation and possessive markers
    filtered = filter(lambda s: s not in ('?', ',', '.', "'s"), stemmed)
    return filtered
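# Example usage (illustrative): requires the NLTK tokenizer data
# (e.g. nltk.download('punkt')). Splits underscores and the hard-coded
# camelCase suffixes, then stems and lowercases.
print(list(tokens("Where is the birthPlace of Barack_Obama?")))
# roughly: ['where', 'is', 'the', 'birth', 'place', 'of', 'barack', 'obama']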
import re

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer


def text_cleaner(text, deep_clean=False, stem=True, stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': '>'},                        # remove spaces after a tag opens or closes
        {r'\s+': ' '},                         # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': '\n'},            # newline after a <br>
        {r'</(div)\s*>\s*': '\n'},             # newline after </div>
        {r'</(p|h\d)\s*>\s*': '\n\n'},         # blank line after </p> and headings
        {r'<head>.*<\s*(/head|body)[^>]*>': ''},      # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of anchor text
        {r'[ \t]*<[^<]*?/?>': ''},             # remove remaining tags
        {r'^\s+': ''},                         # remove spaces at the beginning
    ]

    if deep_clean:
        # strip punctuation and brackets before applying the HTML rules
        text = text.replace(".", "").replace("\"", "")
        for ch in ["[", ",", "]", "(", ")", "-", "=", "?", "!"]:
            text = text.replace(ch, " ")

        # apply the HTML-cleaning rules
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()

        # drop remaining separators and standalone numbers
        text = text.replace('+', ' ').replace('.', ' ').replace(',', ' ').replace(':', ' ')
        text = re.sub(r"(^|\W)\d+($|\W)", " ", text)

        if translite_rate:
            # transliterate() is an external, project-specific helper (not defined here)
            text = transliterate(text)
        if stem:
            # stem and lemmatize (applied to the whole string, not per word)
            text = PorterStemmer().stem(text)
            text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_word_set = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if w not in stop_word_set]
            text = ' '.join(str(e) for e in text)
    else:
        # shallow clean: only apply the HTML-cleaning rules
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
        text = text.rstrip()
        text = text.strip()

    return text.lower()
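# Example usage (a minimal sketch): translite_rate is set to False here because
# transliterate() is project-specific; the deep_clean branch needs the NLTK
# 'stopwords', 'punkt', and 'wordnet' data.
html = '<div><p>Hello,   <b>World</b>!</p><a href="http://example.com">link</a></div>'
print(text_cleaner(html, deep_clean=False))
print(text_cleaner(html, deep_clean=True, translite_rate=False))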
import copy

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer


def __init__(self, corpus, tf='raw', idf='base', stopword='standard',
             stemmer=PorterStemmer(), ignorecase='yes'):
    # allow the stemmer to be passed by name
    if isinstance(stemmer, str):
        if stemmer.lower() == 'porter':
            stemmer = PorterStemmer()
        elif stemmer.lower() == 'snowball':
            stemmer = SnowballStemmer("english")
        else:
            raise ValueError("Unknown stemmer option: " + stemmer)

    wordStorage = dict()
    termFrequency = dict()
    fileFrequencyStorage = dict()
    dictClone = dict()

    self.corpus = corpus
    self.tf = tf
    self.idf = idf
    self.stopword = stopword
    self.actualStopWords = set()
    self.stemmer = stemmer
    self.ignorecase = ignorecase
    self.vectors = dict()
    self.dimensions = list()
    self.wordDocumentFrequency = dict()

    # depending on preferences read in the stopwords;
    # if 'none', do not remove any stopwords
    if self.stopword != 'none':
        if self.stopword == 'standard':
            self.actualStopWords = set(stopwords.words('english'))
        else:
            # read in the stopwords from a file
            with open(stopword) as file:
                fileContents = file.read()
            self.actualStopWords = set(nltk.word_tokenize(fileContents))
        self.actualStopWords = self.lowerCaseCheckConversion(
            self.actualStopWords)

    # where all words in any document will be stored
    self.allWords = list()

    # calculate the vectors for the documents
    for fileId in corpus.fileids():
        tempWords = corpus.words(fileId)
        filteredWords = self.lowerCaseCheckConversion(tempWords)
        # now filter out the stopwords and stem
        filteredWords = self.filterWords(filteredWords)
        filteredWords = self.stemWords(filteredWords)
        self.allWords.extend(filteredWords)
        # store the processed words so the vector can be calculated later
        wordStorage[fileId] = filteredWords

    self.allWords = set(self.allWords)

    # initially each term is present in no documents
    for word in self.allWords:
        self.wordDocumentFrequency[word] = 0
        termFrequency[word] = 0
        self.dimensions.append(word)
    dictClone = copy.deepcopy(termFrequency)

    # iterate through every file and its contents
    for file, wordList in wordStorage.items():
        # count up how many times each term occurs in this file
        for word in wordList:
            termFrequency[word] = termFrequency[word] + 1
        for word, wordFrequency in termFrequency.items():
            if wordFrequency > 0:
                self.wordDocumentFrequency[word] = \
                    self.wordDocumentFrequency[word] + 1
        fileFrequencyStorage[file] = termFrequency
        termFrequency = copy.deepcopy(dictClone)

    # now actually calculate the tf-idf values
    for fileId in corpus.fileids():
        vector = list()
        frequencyMap = fileFrequencyStorage[fileId]
        for word in self.allWords:
            # combine the term frequency with the number of documents
            # the term appears in
            vector.append(
                self.calculateTfIdfValue(frequencyMap[word],
                                         self.wordDocumentFrequency[word]))
        del fileFrequencyStorage[fileId]
        self.vectors[fileId] = vector
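# Illustrative usage (the enclosing class is not shown above; 'TfIdfModel' is a
# hypothetical name for it, and lowerCaseCheckConversion / filterWords /
# stemWords / calculateTfIdfValue are its other methods). Builds tf-idf vectors
# for every file in an NLTK plaintext corpus.
from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader('docs/', r'.*\.txt')   # directory of .txt files
model = TfIdfModel(corpus, tf='raw', idf='base', stemmer='porter')
for file_id, vector in model.vectors.items():
    print(file_id, vector[:5])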