Example #1
    def testStopWords(self):

        try:
            result = StandardTokenizer()
            result.setReader(self.reader)
            result = StopFilter(result, self.stop_set)
        except Exception as e:
            self.fail(str(e))
Example #2
    def createComponents(self, fieldName):
        source = StandardTokenizer()
        filter1 = LowerCaseFilter(source)
        filter1 = PorterStemFilter(filter1)
        filter1 = StopFilter(filter1, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)

        return self.TokenStreamComponents(source, filter1)
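
For context, a minimal, hedged sketch of how a createComponents method like the one above is typically wrapped in a PyLucene PythonAnalyzer subclass and then consumed; the class name and field name are illustrative, and the import locations assume a recent Lucene/PyLucene release.

import lucene
from java.io import StringReader
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.lucene.analysis import LowerCaseFilter, StopFilter
from org.apache.lucene.analysis.en import EnglishAnalyzer, PorterStemFilter
from org.apache.lucene.analysis.standard import StandardTokenizer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

lucene.initVM()

class PorterStopAnalyzer(PythonAnalyzer):  # illustrative name

    def createComponents(self, fieldName):
        source = StandardTokenizer()
        filter1 = LowerCaseFilter(source)
        filter1 = PorterStemFilter(filter1)
        filter1 = StopFilter(filter1, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)
        return self.TokenStreamComponents(source, filter1)

    def initReader(self, fieldName, reader):
        # pass the reader through unchanged; some PyLucene versions expect
        # subclasses to provide this method explicitly
        return reader

analyzer = PorterStopAnalyzer()
stream = analyzer.tokenStream("contents", StringReader("The quick brown foxes jumped"))
term = stream.addAttribute(CharTermAttribute.class_)
stream.reset()
tokens = []
while stream.incrementToken():
    tokens.append(term.toString())
stream.end()
stream.close()
print(tokens)  # lowercased, stemmed tokens with English stop words removed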
Example #3
    def testStopWords(self):

        try:
            result = StandardTokenizer(Version.LUCENE_CURRENT, self.reader)
            result = StopFilter(Version.LUCENE_CURRENT, result, self.stop_set)
        except Exception as e:
            self.fail(str(e))
Example #4
	def createComponents(self, fieldName):
		source = StandardTokenizer()
		stream = LowerCaseFilter(source)
		stream = StopFilter(stream, PortugueseAnalyzer.getDefaultStopSet())
		stream = PortugueseLightStemFilter(stream)

		return self.TokenStreamComponents(source, stream)
Example #5
 def preprocess(text):
     """Tokenize and stop the input text."""
     ts = StandardTokenizer()
     ts.setReader(StringReader(text.lower()))
     ts = StopFilter(ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
     string_builder = StringBuilder()
     ts.reset()
     char_term_attr = ts.addAttribute(CharTermAttribute.class_)
     while ts.incrementToken():
         if string_builder.length() > 0:
             string_builder.append(" ")
         string_builder.append(char_term_attr.toString())
     return string_builder.toString()
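
A quick, hedged usage sketch for the preprocess helper above; it assumes lucene.initVM() has been called and the Lucene classes used in the function are imported.

print(preprocess("The Standard Tokenizer and the Stop Filter"))
# expected output (roughly): "standard tokenizer stop filter"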
Example #6
 def createComponents(self, fieldName, reader):
     source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
     filter = StandardFilter(Version.LUCENE_CURRENT, source)
     filter = LowerCaseFilter(Version.LUCENE_CURRENT, filter)
     filter = PorterStemFilter(filter)
     filter = StopFilter(Version.LUCENE_CURRENT, filter,
                         StopAnalyzer.ENGLISH_STOP_WORDS_SET)
     return self.TokenStreamComponents(source, filter)
Example #7
def clean_sentence_apache(sentence, remove_numbers = True, remove_acronyms = True, remove_propernouns = True, unhyphenate = False, return_string = False):
    
    #sentence = ' '.join( [word.lower() for word in sentence.split() if len(word)>1] ) #removing ANY single letter words
    # Replace unicode spaces, tabs, and underscores with spaces, and remove whitespaces from start/end of sentence:
    sentence = sentence.replace(u"\xa0", u" ").replace(u"\\t", u" ").replace(u"_", u" ").strip(" ")

    if unhyphenate:              
        ls = re.findall(r"\w+-\s\w+", sentence)
        if len(ls) > 0:
            ls_new = [re.sub(r"- ", "", word) for word in ls]
            for i in range(len(ls)):
                sentence = sentence.replace(ls[i], ls_new[i])
    sentence = re.sub(r"\b[a-zA-Z]\b", "", sentence) #removing any single letter alphabets
    if remove_acronyms:
        sentence = re.sub(r"\b[A-Z][A-Z]+\b\s+", "", sentence)
    if remove_numbers:
        sentence = re.sub(r"\b[0-9]+\b\s*", "", sentence)
#         ls = re.findall(r"\b[0-9]+\b\s*", sentence)
#         if len(ls) > 0:
#             ls_new = np.repeat("", len(ls))
#             for i in range(len(ls)):
#                 sentence = sentence.replace(ls[i], ls_new[i])

    sent_list = []
    
    #tokenizing
    tokenizer = StandardTokenizer()
    tokenizer.setReader(StringReader(sentence))
    charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
    tokenizer.reset()
    while tokenizer.incrementToken():
        sent_list.append(charTermAttrib.toString().lower()) #lowercasing
        
    jstor_list_words = set(["a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]) #jstor removal words
    sent_list = [word for word in sent_list if word not in jstor_list_words] #removing jstor stopwords (tokens were already lowercased above)
        
        
    # If True, remove proper nouns from the token list:
    if remove_propernouns:              
#         doc = nlp(sentence) # Create a document object in spacy
#         proper_nouns = gather_propernouns(doc) # Creates a wordbank of proper nouns we should exclude
        #trying to gather proper nouns by passing in pure sentence in gather_propernouns
        proper_nouns = gather_propernouns(sentence)
        print(proper_nouns)
        # Remove each proper noun from sentence:
        sent_list = [word for word in sent_list if word not in proper_nouns]
        #for term in proper_nouns: # Loop over wordbank
        #    sentence = re.sub(term, "", sentence) # Less effective because removes characters from within terms
    
    if return_string:
        return ' '.join(sent_list) # Return clean, tokenized sentence (string)
    
    return sent_list
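
A hedged usage sketch for clean_sentence_apache; remove_propernouns is left False because gather_propernouns is defined elsewhere in that project, and the shown result is approximate.

clean_sentence_apache("A 2019 NLP review covers tokeni- zation and stop words.",
                      unhyphenate = True, remove_propernouns = False)
# expected result (roughly): ['review', 'covers', 'tokenization', 'stop', 'words']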
 def createComponents(self, fieldName):
     source = StandardTokenizer()
     result = EnglishPossessiveFilter(source)
     result = LowerCaseFilter(result)
     result = DiacriticFilter(result)
     result = StopFilter(result, self.stopwords)
     if not self.stemExclusionSet.isEmpty():
         result = SetKeywordMarkerFilter(result, self.stemExclusionSet)
     result = PorterStemFilter(result)
     return Analyzer.TokenStreamComponents(source, result)
    def createComponents(self, _):
        tokenizer = StandardTokenizer()
        stream = StandardFilter(tokenizer)

        # Order of filtering is important
        stream = LowerCaseFilter(stream)  # case independent
        stream = ASCIIFoldingFilter(stream)  # convert diacritics
        stream = self.filter_stopwords(stream)  # ignore stopwords
        stream = SnowballFilter(stream, RomanianStemmer())  # stemming

        return self.TokenStreamComponents(tokenizer, stream)
    def getSynonyms(self, query, tokenizer=StandardTokenizer()):
        '''
        :param query: The query for which to get synonyms
        :param tokenizer: The tokenizer used for the SynonymGraphFilter
        :return: A tokenStream with the synonyms
        '''
        tokenizer.reset()
        # Add query to tokenizer
        tokenizer.setReader(StringReader(query))

        # Use synonymfilter to generate synonyms & flatten to get words from graph
        synGraph = SynonymGraphFilter(tokenizer, self.map, True)
        return FlattenGraphFilter(synGraph)
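
The self.map consumed above is a SynonymMap. Below is a minimal, hedged sketch of building one and driving the same SynonymGraphFilter/FlattenGraphFilter chain directly; the synonym pair and query are illustrative, lucene.initVM() is assumed to have been called, and the import locations assume a recent Lucene release.

from java.io import StringReader
from org.apache.lucene.analysis.core import FlattenGraphFilter
from org.apache.lucene.analysis.standard import StandardTokenizer
from org.apache.lucene.analysis.synonym import SynonymGraphFilter, SynonymMap
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.util import CharsRef

builder = SynonymMap.Builder(True)                       # True = deduplicate entries
builder.add(CharsRef("fast"), CharsRef("quick"), True)   # "fast" -> "quick", keep the original term
synonym_map = builder.build()                            # the kind of object self.map holds

tokenizer = StandardTokenizer()
tokenizer.setReader(StringReader("a fast query"))
stream = FlattenGraphFilter(SynonymGraphFilter(tokenizer, synonym_map, True))
term = stream.addAttribute(CharTermAttribute.class_)
stream.reset()
while stream.incrementToken():
    print(term.toString())   # original terms plus injected synonyms
stream.end()
stream.close()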
 def preprocess(text):
     """Tokenize and stop the input text."""
     ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower()))
     ts = StopFilter(Lucene.get_version(), ts, StopAnalyzer.ENGLISH_STOP_WORDS_SET)
     string_builder = StringBuilder()
     ts.reset()
     char_term_attr = ts.addAttribute(CharTermAttribute.class_)
     while ts.incrementToken():
         if string_builder.length() > 0:
             string_builder.append(" ")
         string_builder.append(char_term_attr.toString())
     return string_builder.toString()
def apache_tokenize(sentence, 
                    lowercase = True):
    '''
    Tokenizes sentences into words using the Apache Lucene Standard Tokenizer (same as JSTOR).
    
    Args:
        sentence: str
        lowercase: binary indicator: whether to lowercase each word
    Returns:
        list of str: each element of list is a word
        
    Requires these packages: 
    lucene
    org.apache.lucene.analysis.standard.StandardAnalyzer
    org.apache.lucene.analysis.standard.StandardTokenizer
    java.io.StringReader
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute
    '''
    
    sent_list = [] # initialize empty list to add words to
    
    tokenizer = StandardTokenizer() # start Tokenizer
    tokenizer.setReader(StringReader(sentence))
    charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
    tokenizer.reset()
    
    if lowercase:
        while tokenizer.incrementToken():
            sent_list.append(charTermAttrib.toString().lower()) #lowercasing
            
        return sent_list
    
    # if not lower-casing:
    while tokenizer.incrementToken():
        sent_list.append(charTermAttrib.toString())
        
    return sent_list
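
A short, hedged usage sketch for apache_tokenize (assumes lucene.initVM() has already been called):

apache_tokenize("The Quick Brown Fox jumps!")
# -> ['the', 'quick', 'brown', 'fox', 'jumps']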
import lucene
import sys

from java.io import StringReader, File
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.analysis.pt import PortugueseAnalyzer
from org.apache.lucene.analysis.standard import StandardAnalyzer, StandardTokenizer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
lucene.initVM()

INDEX_DIR = "/index_test"

# Tokenizer example.
test = "Isso é um teste."
tokenizer = StandardTokenizer()
tokenizer.setReader(StringReader(test))
charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
tokenizer.reset()
tokens = []

while tokenizer.incrementToken():
    tokens.append(charTermAttrib.toString())

print(tokens)

# PortugueseAnalyzer example.
analyzer = PortugueseAnalyzer()
stream = analyzer.tokenStream("", StringReader(test))
stream.reset()
tokens = []