import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords as sw
from nltk.stem import PorterStemmer, SnowballStemmer


def process_text(text,
                 stemmer=None,
                 regexstr=None,
                 lowercase=True,
                 removestop=False,
                 verbose=True):
    """Helper function to pre-process text.
        Combines several preprocessing steps: lowercase, 
            remove stop, regex text cleaning, stemming.
        If savedpath is passed, then try to load saved processed text data
            and return that instead of processing."""

    if isinstance(stemmer, str):
        if stemmer.lower() == 'porter':
            stemmer = PorterStemmer()
        elif stemmer.lower() == 'snowball':
            stemmer = SnowballStemmer(language='english')
        else:
            stemmer = None

    # convert a list or numpy array of strings to a pandas Series
    if isinstance(text, (list, np.ndarray)):
        processed = pd.Series(text)
    else:
        processed = text

    # make text lowercase
    if lowercase:
        if verbose: print('make text lowercase')
        processed = processed.str.lower()

    # remove stop words
    # NOTE: stop words containing capitals are not removed, so lowercase first!
    if removestop:
        if verbose: print('remove stop words')
        stopwords = set(sw.words("english"))
        processed = processed.map(lambda text: ' '.join(
            [word for word in text.split() if word not in stopwords]))

    # apply regex expression
    if regexstr is not None:
        if verbose: print('apply regex expression')
        regex = re.compile(regexstr)
        processed = processed.str.replace(regex, ' ', regex=True)

    # stemming
    # NOTE: stemming makes all lowercase
    if stemmer is not None:
        if verbose: print('stemming')
        processed = processed.map(
            lambda x: ' '.join([stemmer.stem(y) for y in x.split(' ')]))

    if verbose: print('done')

    return processed
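A minimal usage sketch for process_text (the sample strings are made up; assumes the NLTK stopwords corpus has been downloaded):

sample = ["The Cats are RUNNING!", "A dog ran home."]
cleaned = process_text(sample,
                       stemmer='porter',
                       regexstr=r'[^a-z\s]',
                       removestop=True,
                       verbose=False)
print(cleaned.tolist())  # lowercased, stemmed strings with stop words and punctuation removed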
Example #2
    def getToken(self, line):
        regex_word = re.compile(r'\w+')
        words = regex_word.findall(line)
        if self.isReversed:
            words.reverse()
        for token in words:
            # pass boundary markers through unchanged (do not stem or lowercase them)
            if token in ('START', 'END'):
                yield token
                continue
            if token not in self._stops:
                if self._isStem:
                    yield PorterStemmer().stem(token).lower()
                else:
                    yield token.lower()
Example #3
    def boolean_clean(self):
        search_edit = str(self.query_search)
        search_edit = re.sub('[^a-zA-Z0-9()]', ' ', search_edit)
        search_edit = re.split(r'[^a-zA-Z0-9()]', search_edit)
        search_edit = list(filter(None, search_edit))
        # stem every token except the boolean operators AND / OR
        temp_list = []
        for r in search_edit:
            if r not in ('AND', 'OR'):
                r = PorterStemmer().stem(r.lower())
            temp_list.append(r)
        # rebuild the cleaned query and parse it into a boolean expression AST
        query_again = " ".join(str(x) for x in temp_list)
        search = BP.bool_expr_ast(query_again)
        return search
Example #4
import re

import nltk
from nltk.stem import PorterStemmer


def tokens(string):
    words = nltk.word_tokenize(string)
    tokenized = []
    for w in words:
        tokenized += w.split("_")

    # split camelCase property names such as "birthDate" into "birth" + "Date"
    un_cameled = []
    camel = re.compile("(.*)(Date|Name|Place)")  # hard-coded for DBpedia
    for t in tokenized:
        out = camel.match(t)
        if out is None:
            un_cameled.append(t)
        else:
            un_cameled.append(out.group(1))
            un_cameled.append(out.group(2))

    stemmed = []
    for u in un_cameled:
        s = PorterStemmer().stem(u)
        stemmed.append(s.lower())

    # drop punctuation and possessive tokens; materialise as a list
    # (filter() is lazy in Python 3)
    filtered = [s for s in stemmed if s not in ("?", ",", ".", "'s")]
    return filtered
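A quick illustrative call (the question string is made up; assumes NLTK's punkt tokenizer data is available):

print(tokens("Where is the birthPlace of Ada_Lovelace?"))
# roughly: ['where', 'is', 'the', 'birth', 'place', 'of', 'ada', 'lovelac']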
Example #5
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NOTE: transliterate() is assumed to be defined elsewhere in the original module.


def text_cleaner(text,
                 deep_clean=False,
                 stem=True,
                 stop_words=True,
                 translite_rate=True):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </div>
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # blank line after </p> and headings
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]

    if deep_clean:
        text = text.replace(".", "")
        text = text.replace("[", " ")
        text = text.replace(",", " ")
        text = text.replace("]", " ")
        text = text.replace("(", " ")
        text = text.replace(")", " ")
        text = text.replace("\"", "")
        text = text.replace("-", " ")
        text = text.replace("=", " ")
        text = text.replace("?", " ")
        text = text.replace("!", " ")

        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.strip()
        text = text.replace('+', ' ').replace('.', ' ')
        text = text.replace(',', ' ').replace(':', ' ')
        text = re.sub(r"(^|\W)\d+($|\W)", " ", text)
        if translite_rate:
            text = transliterate(text)
        if stem:
            # NOTE: stem() treats the whole string as a single token,
            # so only the end of the string is actually modified
            text = PorterStemmer().stem(text)
        text = WordNetLemmatizer().lemmatize(text)
        if stop_words:
            stop_words = set(stopwords.words('english'))
            word_tokens = word_tokenize(text)
            text = [w for w in word_tokens if w not in stop_words]
            text = ' '.join(str(e) for e in text)
    else:
        for rule in rules:
            for (k, v) in rule.items():
                regex = re.compile(k)
                text = regex.sub(v, text)
            text = text.strip()
    return text.lower()
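An illustrative call (the HTML snippet is made up; deep_clean=True additionally needs NLTK's stopwords, wordnet, and punkt data plus the external transliterate helper):

html = '<div><p>The 3 cats were <a href="http://example.com">running</a>!</p></div>'
print(text_cleaner(html, deep_clean=False))
# light clean: strips the tags and shows the link URL in place of the anchor text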
Example #6
    def __init__(self,
                 corpus,
                 tf='raw',
                 idf='base',
                 stopword='standard',
                 stemmer=PorterStemmer(),
                 ignorecase='yes'):
        if isinstance(stemmer, str):
            if stemmer.lower() == 'porter':
                stemmer = PorterStemmer()
            elif stemmer.lower() == 'snowball':
                stemmer = SnowballStemmer("english")
            else:
                raise ValueError("Unknown stemmer option: %s" % stemmer)
        wordStorage = dict()
        termFrequency = dict()
        fileFrequencyStorage = dict()
        dictClone = dict()

        self.corpus = corpus
        self.tf = tf
        self.idf = idf
        self.stopword = stopword
        self.actualStopWords = set()
        self.stemmer = stemmer
        self.ignorecase = ignorecase
        self.vectors = dict()
        self.dimensions = list()
        self.wordDocumentFrequency = dict()

        # depending on preferences, read in the stopwords;
        # if 'none', do not remove any stopwords
        if self.stopword != 'none':
            if self.stopword == 'standard':
                self.actualStopWords = set(stopwords.words('english'))
            else:
                # read in the stopwords from a file
                with open(stopword) as file:
                    fileContents = file.read()
                    self.actualStopWords = set(
                        nltk.word_tokenize(fileContents))

        self.actualStopWords = self.lowerCaseCheckConversion(
            self.actualStopWords)

        # where all words in any document will be stored
        self.allWords = list()

        #calculate the vectors for the documents
        for fileId in corpus.fileids():
            tempWords = corpus.words(fileId)

            filteredWords = self.lowerCaseCheckConversion(tempWords)
            #now filter out the stopwords
            filteredWords = self.filterWords(filteredWords)
            filteredWords = self.stemWords(filteredWords)

            self.allWords.extend(filteredWords)

            #now need to store the read words so that the vector can be calculated later
            wordStorage[fileId] = filteredWords

        self.allWords = set(self.allWords)

        # initially each term is present in no documents
        for word in self.allWords:
            self.wordDocumentFrequency[word] = 0
            termFrequency[word] = 0
            self.dimensions.append(word)

        dictClone = copy.deepcopy(termFrequency)

        # iterate through every file and its contents
        for file, wordList in wordStorage.items():
            # count up how many times each term occurs in each file
            for word in wordList:
                termFrequency[word] = termFrequency[word] + 1
            for word, wordFrequency in termFrequency.items():
                if wordFrequency > 0:
                    self.wordDocumentFrequency[
                        word] = self.wordDocumentFrequency[word] + 1
            fileFrequencyStorage[file] = termFrequency
            termFrequency = copy.deepcopy(dictClone)

        # now actually calculate the tf-idf values
        for fileId in corpus.fileids():
            vector = list()
            frequencyMap = fileFrequencyStorage[fileId]
            for word in self.allWords:
                # calculate the TFIDF value  using both the term frequency of the term
                # and the number of docs it appears
                vector.append(
                    self.calculateTfIdfValue(frequencyMap[word],
                                             self.wordDocumentFrequency[word]))
            del fileFrequencyStorage[fileId]
            self.vectors[fileId] = vector
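The class this __init__ belongs to is not shown above; assuming it were called, say, TfIdfVectorSpace (a made-up name) and the NLTK data is installed, a usage sketch could look like:

from nltk.corpus import PlaintextCorpusReader

corpus = PlaintextCorpusReader('docs/', r'.*\.txt')  # hypothetical folder of .txt files
model = TfIdfVectorSpace(corpus, tf='raw', idf='base',
                         stopword='standard', stemmer='porter')
print(len(model.dimensions), 'terms,', len(model.vectors), 'document vectors')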