def stemming(words_l, type="PorterStemmer", lang="english", encoding="utf8"):
    supported_stemmers = [
        "PorterStemmer", "SnowballStemmer", "LancasterStemmer", "WordNetLemmatizer"]
    if type is False or type not in supported_stemmers:
        return words_l
    else:
        l = []
        if type == "PorterStemmer":
            stemmer = PorterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "SnowballStemmer":
            stemmer = SnowballStemmer(lang)
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "LancasterStemmer":
            stemmer = LancasterStemmer()
            for word in words_l:
                l.append(stemmer.stem(word).encode(encoding))
        elif type == "WordNetLemmatizer":  # TODO: context
            wnl = WordNetLemmatizer()
            for word in words_l:
                l.append(wnl.lemmatize(word).encode(encoding))
        return l
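# A minimal, hypothetical usage of stemming() above; it assumes the NLTK
# stemmer classes are imported (e.g. from nltk.stem import PorterStemmer,
# SnowballStemmer, LancasterStemmer, WordNetLemmatizer) and a Python 2
# runtime, where the .encode() calls return byte strings.
print(stemming(["running", "flies"], type="SnowballStemmer"))
# -> ['run', 'fli']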
def _log_likelihood(answer_text, stemmed_vocabulary, distrib_matrix):
    LL = 0
    if answer_text != '':
        tokens = word_tokenize(str(answer_text), language='english')
        porter_stemmer = PorterStemmer()
        unique_wordcount = len(stemmed_vocabulary)
        """
        for each unique w in words:
            Cw = count of w in answer_text
            PwM = self.distrib_matrix[stemmer(w)]
        unique_wordcount = len(tokenize(answer_text))
        """
        for w in tokens:
            _w = w.strip().lower()
            Cw = 0
            for _ in answer_text.split():
                if _w == _.strip().lower():
                    Cw += 1
            try:
                w_stem = porter_stemmer.stem(_w.decode('utf-8', 'replace').encode('ascii', 'replace'))
            except AttributeError:
                w_stem = porter_stemmer.stem(_w)
            try:
                PwM = distrib_matrix[w_stem]
            except KeyError:
                # A KeyError means the frequency is equal to the cutoff point, 1
                PwM = 1
            LL += (Cw * log(float(PwM)))
        try:
            LL = "{0:.2f}".format(LL / float(unique_wordcount))
        except ZeroDivisionError:
            LL = 0
    return LL
def get_ngram_features(self):
    stemmer = PorterStemmer()
    top_features = [(stemmer.stem(token) + "__TOP__", True) for token in self.top_text]
    bottom_features = [(stemmer.stem(token) + "__BOTTOM__", True) for token in self.bottom_text]
    all_features = [(stemmer.stem(token) + "__ALL__", True) for token in self.all_text]
    self.ngram_features = dict(top_features + bottom_features + all_features)
def stem(input):
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    stemmed_training_input = []
    stemmed_testing_input = []
    for training_example in input['training']:
        word_list = training_example.split()
        stemmed_training_input.append(' '.join([stemmer.stem(word) for word in word_list]))
    for testing_example in input['testing']:
        word_list = testing_example.split()
        stemmed_testing_input.append(' '.join([stemmer.stem(word) for word in word_list]))
    result = {'training': stemmed_training_input,
              'training_labels': input['training_labels'],
              'testing': stemmed_testing_input,
              'testing_labels': input['testing_labels']}
    return result
def openAndProcessingFiles(path, resultDict):  # Main function
    for filename in os.listdir(os.getcwd() + path):
        # Open the file and process each file
        thisFile = open(os.getcwd() + path + '/' + filename, 'r')
        # Store the file as a string for removing HTML tags
        currentTextString = " ".join(thisFile.read().split())
        # Remove HTML tags (string)
        textAfterHtmlRemovingString = re.sub('<[^>]*>', '', currentTextString)
        # Convert the string to a list so the text contains only words
        textAfterHtmlRemovingList = textAfterHtmlRemovingString.split()
        textRemovingUnnecessaryCharactersList = [removeUnnecessaryCharacters(word)
                                                 for word in textAfterHtmlRemovingList]
        textRemovingUnnecessaryCharactersList = [word for word in textRemovingUnnecessaryCharactersList
                                                 if word is not None]
        stop_words = set(stopwords.words('english'))
        # By analyzing the previous result set, continually add new stopwords
        stop_words.update(['texthtml', 'html', 'server', 'email', 'date', 'gmt', 'www'])
        # Remove stopwords
        textAfterStopwordsRemovingList = [word for word in textRemovingUnnecessaryCharactersList
                                          if word not in stop_words]
        # Stemming
        stemmer = PorterStemmer()
        for eachWord in textAfterStopwordsRemovingList:
            eachWord = stemmer.stem(eachWord)
            storeToResultDict(eachWord, resultDict)
        thisFile.close()
def stemm(cls, tokens):
    stemmer = PorterStemmer()
    for i, t in enumerate(tokens):
        tokens[i] = stemmer.stem(t)
    return tokens
def process_email(filename):
    f = open(filename, 'r')
    text = f.read()
    f.close()
    text = text.lower()
    # Replace HTML tags with a space
    text = re.sub(r'<[^<>]+>', ' ', text)
    # Replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # Replace URLs with the word "httpaddr"
    text = re.sub(r'(http|https)://[^\s]*', 'httpaddr', text)
    # Replace email addresses with the word "emailaddr"
    text = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', text)
    # Replace dollar signs with the word "dollar"
    text = re.sub(r'[$]+', 'dollar', text)
    # Remove punctuation and non-words and separate the words
    words = re.split('[^a-z0-9]| ', text)
    # Remove empty strings left over from the split
    words = filter(lambda x: x != '', words)
    # Reduce words to their stems
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return words
class Model(FileIO):
    def __init__(self, *args, **kwargs):
        FileIO.__init__(self, *args, **kwargs)
        self.data_list = []
        self.stemmer = PorterStemmer()
        # The original left these two assignments empty; empty containers are
        # assumed here so the class can at least be instantiated.
        self.score_map = {}
        self.ranges = []

    def isInt(self, val):
        try:
            val = int(val)
            return True
        except ValueError:
            return False

    def cleanString(self, word):
        if (word not in stopwords) and (word != " ") and (self.isInt(word) is False):
            word = word.lower()
            return self.stemmer.stem(word)
        else:
            return None

    def makeScoreList(self):
        '''Initialize a new array of 0s for each range'''
        s_list = [0] * len(self.ranges)
        return s_list
def main():
    with open("sentiment.txt", 'r') as _file:
        stemmer = PorterStemmer()
        features = []
        for words in _file:
            feature = []
            is_sentence = True
            # Skip the polarity label (the first token)
            for word in words.split()[1:]:
                try:
                    word = word.decode("utf-8")
                    if word not in [".", ",", ":", "?", "!"] \
                            and not has_stop_list(word):
                        feature.append(stemmer.stem(word))
                except UnicodeDecodeError:
                    # Ignore garbled (mis-encoded) lines
                    is_sentence = False
                    break
            if is_sentence:
                features.append(feature)
    return features
def review_to_words(raw_review, remove_stopwords=False):
    # BeautifulSoup pulls data out of an HTML file;
    # here it removes HTML tags and markup
    text = BeautifulSoup(raw_review).get_text()
    # Replace numbers with the word "number"
    text = re.sub(r'[0-9]+', 'number', text)
    # Remove punctuation (it could be analyzed for better results)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    # Make a list of words
    words_list = text.split()
    # Download the nltk text data sets, including stop words
    # nltk.download()
    if remove_stopwords:
        # Get stopwords; searching a set is faster than searching a list
        stops = set(stopwords.words('english'))
        # Remove stopwords
        words_list = [word for word in words_list if word not in stops]
    # Reduce words to their stems
    stemmer = PorterStemmer()
    words_list = [stemmer.stem(word) for word in words_list]
    # Return the list of words
    return words_list
def normalize(word):
    '''
    normalize the word for query or indexing
    :param word: unicode string
    :return: unicode string of the normalized term
    '''
    porter = PorterStemmer()
    return porter.stem(word) if word[0].isalpha() else ''
def processContent(self, content):
    stemmer = PorterStemmer()
    tokens = word_tokenize(content)
    tokens = filter(lambda x: len(x) < 20 and x.isalnum(), tokens)
    tokens = [stemmer.stem(token.lower()) for token in tokens]
    tokens = filter(lambda x: x not in stopwords.words('english'), tokens)
    tokens = [str(token) for token in tokens]
    bow = FreqDist(tokens)
    return bow
def main():
    # Use the file defined by BIOC_IN as default if no other is provided
    bioc_in = BIOC_IN
    if len(sys.argv) >= 2:
        bioc_in = sys.argv[1]

    # A BioCReader object is put in place to hold the example BioC XML
    # document
    bioc_reader = BioCReader(bioc_in, dtd_valid_file=DTD_FILE)

    # A BioCWriter object is prepared to write out the annotated data
    bioc_writer = BioCWriter(BIOC_OUT)

    # The NLTK Porter stemmer is used for stemming
    stemmer = PorterStemmer()

    # The example input file given above (by BIOC_IN) is fed into
    # a BioCReader object; validation is done by the BioC DTD
    bioc_reader.read()

    # Pass over basic data
    bioc_writer.collection = bioc_reader.collection

    # Get documents to manipulate
    documents = bioc_writer.collection.documents

    # Go through each document
    annotation_id = 0
    for document in documents:
        # Go through each passage of the document
        for passage in document:
            # Stem all the tokens found
            stems = [stemmer.stem(token)
                     for token in wordpunct_tokenize(passage.text)]
            # Add an annotation showing the stemmed version, in the
            # given order
            for stem in stems:
                annotation_id += 1
                # For each token an annotation is created, providing
                # the surface form of a 'stemmed token'.
                # (The annotations are collectively added following
                # a document passage with a <text> tag.)
                bioc_annotation = BioCAnnotation()
                bioc_annotation.text = stem
                bioc_annotation.id = str(annotation_id)
                bioc_annotation.put_infon('surface form', 'stemmed token')
                passage.add_annotation(bioc_annotation)

    # Print the file to screen w/o trailing newline
    # (can be redirected into a file, e.g. output_bioc.xml)
    sys.stdout.write(str(bioc_writer))
    # Write to disk
    bioc_writer.write()
def stemmingword(word_list, stemtype='porter'):
    if stemtype == 'porter':
        stemengine = PorterStemmer()
    else:
        stemengine = LancasterStemmer()
    try:
        filtered_words = [stemengine.stem(token).encode('latin-1', errors='ignore')
                          for token in word_list]
    except UnicodeDecodeError:
        # Translated from Spanish: character-type error, discarding text
        print('Character-type error, discarding text "{}"'.format(' '.join(word_list)))
        return []
    return filtered_words
class PorterStemmerTokenizer(object):
    """A tokenizer that also stems tokens using a porter stemmer"""

    def __init__(self):
        self.non_alphanum_regex = re.compile('[^ 0-9a-zA-Z]')
        self.porter = PorterStemmer()

    def __call__(self, doc):
        doc = self.non_alphanum_regex.sub(' ', doc)
        tokens_alpha = word_tokenize(doc.lower())
        return [self.porter.stem(t) for t in tokens_alpha]
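# Hypothetical usage sketch (not from the original source): a callable
# tokenizer like PorterStemmerTokenizer can be passed to a scikit-learn
# vectorizer via its `tokenizer` parameter; the toy corpus is made up.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=PorterStemmerTokenizer())
X = vectorizer.fit_transform(["the runner was running", "run, runners run"])
# Both documents now share the stemmed features "run" and "runner".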
class Tokenizer(object):
    def __init__(self):
        self.stem = PorterStemmer()
        self.punct = set(string.punctuation) | set(['·™', '..', '...', '....', '.....', '......'])
        self.punct = self.punct | set(["``", "·", "–", "--", "”", "—", "•"])

    def __call__(self, doc):
        return [t.lower() for t in word_tokenize(doc) if t not in self.punct]

    def stem_toke(self, doc):
        return [self.stem.stem(t.lower()) for t in word_tokenize(doc) if t not in self.punct]
def getPosWords():
    stemmer = PorterStemmer()
    stemmedPosTokens = []
    pos = open(r'pos.txt').read()
    pos = re.sub(r"\d", "", pos)
    posWords = nltk.word_tokenize(pos)
    for posWord in posWords:
        stemmedPosWord = stemmer.stem(posWord)
        stemmedPosTokens.append(stemmedPosWord.lower())
    return stemmedPosTokens
def stemmer(self, raw):
    """
    Use the Porter stemmer from the nltk library to stem tokens in raw text.
    """
    tokens = word_tokenize(raw)
    porter = PorterStemmer()
    # lancaster = LancasterStemmer()
    # stem_lancaster = [lancaster.stem(t) for t in tokens]
    stem_porter = [porter.stem(t) for t in tokens]
    return stem_porter
def getUncertainWords():
    stemmer = PorterStemmer()
    stemmedUnTokens = []
    un = open(r'uncertain.txt').read()
    un = re.sub(r"\d", "", un)
    unWords = nltk.word_tokenize(un)
    for unWord in unWords:
        stemmedUnWord = stemmer.stem(unWord)
        stemmedUnTokens.append(stemmedUnWord.lower())
    return stemmedUnTokens
def update_Porter_stemming():  # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor.execute("""SELECT word FROM words WHERE wordid <= 750000 and stem is null;""")
    words = cursor.fetchall()
    for local in words:
        word = ''.join(local)
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def tokenize(self, sentence, do_stopwords, do_stemming, use_bigrams):
    words = word_tokenize(sentence)
    words = [w.lower() for w in words if len(w) > 2]
    if do_stopwords:
        words = [w for w in words if w not in stop_set]
    if do_stemming:
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]
    if use_bigrams:
        words = bigrams(words)
    return words
def getNegWords():
    stemmer = PorterStemmer()
    stemmedNegTokens = []
    neg = open(r'neg.txt').read()
    neg = re.sub(r"\d", "", neg)
    negWords = nltk.word_tokenize(neg)
    for negWord in negWords:
        stemmedNegWord = stemmer.stem(negWord)
        stemmedNegTokens.append(stemmedNegWord.lower())
    return stemmedNegTokens
def normalize_data(lines):
    norm_words = []
    punctuation = ['!', '.', ';', ':', '\'', '"', '`', '?']
    exceptions = ['\n', '\'s', '\'t', " "]
    stemmer = PorterStemmer()
    stop = stopwords.words('english')
    mega_stop_list = list(itertools.chain(punctuation, exceptions))
    print(" Now Normalizing.......")
    for sentence in lines:
        # The original condition `word not in [stop, "not"]` never filtered
        # anything; dropping stopwords while keeping "not" for the negation
        # step below is the assumed intent.
        words = [stemmer.stem(word.lower())
                 for word in word_tokenize(sentence.rstrip("\n"))
                 if word not in stop or word == "not"]
        norm_words.extend([word for word in negate_Ngram(words)
                           if not re.match("[0-9]+", word)
                           if word.lower() not in mega_stop_list])
    return norm_words
def buildTrainTokensBigram(self):
    self.trainTokens = []
    with open(self.trainingData, 'r') as reviews:
        for review in reviews:
            data = json.loads(review)
            words = word_tokenize(data['text'])
            words = [norm(word) for word in words if norm(word)]
            words = [word for word in words if word not in stwords]
            stemmer = PorterStemmer()
            words = [stemmer.stem(word) for word in words]
            featureSet = self.buildWordFeatureSetBigram(words)
            self.trainTokens.append((featureSet, data['stars']))
def clean_data_to_feed_classifier(tweets):
    st = PorterStemmer()
    stop = stopwords.words('english')
    parsed_tweets = []
    for x in tweets:
        y = x[0]
        # Strip @mentions, non-alphanumerics, and URLs
        y = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", y).split())
        # Collapse repeated characters and drop stopwords
        y = ' '.join(re.sub(r'(.)\1+', r'\1\1', i.lower()) for i in y.split() if i not in stop)
        # Keep stems of real words longer than three characters
        y = ' '.join(st.stem(i) for i in y.split()
                     if len(i) > 3 and i.isalpha() and wordnet.synsets(i))
        # y = punctuations_repl(y)
        parsed_tweets.append(y)
    return parsed_tweets
class EnglishStemmer:
    """
    Stemmer wrapper on Sumy's Stemmer for compatibility reasons with the
    summarizer, but uses nltk's Porter stemmer to do the actual stemming.
    """

    def __init__(self):
        self.__stemmer = PorterStemmer()

    def __call__(self, word):
        return self.__stemmer.stem(word)
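# A hypothetical usage sketch for the wrapper above; the Sumy import is an
# assumption about how the class is meant to be consumed (Sumy summarizers
# accept a stemmer callable in their constructor).
from sumy.summarizers.lsa import LsaSummarizer

summarizer = LsaSummarizer(EnglishStemmer())
# The wrapper is also usable standalone:
print(EnglishStemmer()("running"))  # -> "run"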
def tokenize_normalize(raw):
    '''
    tokenize raw texts
    :param raw: unicode string
    :return: list[unicode]: a list of tokenized unicode

    Example:
        words = tokenize_normalize(line)
    '''
    # Don't use any token that is too long (like a genetic sequence)
    tokens = [t for t in word_tokenize(raw) if len(t) < 20]
    porter = PorterStemmer()
    # Only interested in words
    tokens_n = [porter.stem(t) for t in tokens if t[0].isalpha()]
    # Combine all numbers into one token
    tokens_n = ['NUMBER' if all(a.isdigit() for a in t) else t for t in tokens_n]
    return tokens_n
def update_Porter_stemming(self):  # We use stems occasionally.
    print("Updating stems from Porter algorithm...")
    from nltk import PorterStemmer
    stemmer = PorterStemmer()
    cursor = db.query("""SELECT word FROM words""")
    words = cursor.fetchall()
    for local in words:
        # Could probably take the first element of the tuple as well
        word = ''.join(local)
        # Apostrophes have the same stem as the word, if they're included
        word = word.replace("'s", "")
        if re.match("^[A-Za-z]+$", word):
            query = """UPDATE words SET stem='""" + stemmer.stem(''.join(local)) + \
                    """' WHERE word='""" + ''.join(local) + """';"""
            z = cursor.execute(query)
def review_mapper(self, _, data):
    review = data['text']
    rating = data['stars']
    business_id = data['business_id']
    category = data['category']
    words = word_tokenize(review)
    words = [norm(word) for word in words if norm(word)]
    words = [word for word in words if word not in stwords]
    tagged_words = tagger.tag(words)
    stemmer = PorterStemmer()
    tagged_words = [(stemmer.stem(tagged_word[0]), tagged_word[1])
                    for tagged_word in tagged_words]
    for tagged_word in tagged_words:
        yield (category, tagged_word), (business_id, rating, 1)
def text_preprocessing(text):
    # Lowercase everything
    text = text.lower()
    # Remove punctuation
    regex = re.compile('[%s]' % re.escape(string.punctuation))
    text = regex.sub(" ", text)
    # Remove stopwords
    no_stopwords = [word for word in text.split() if word.lower() not in ext_stopwords]
    text = " ".join(no_stopwords)
    # Stem the words
    stemmer = PorterStemmer()
    text = " ".join([stemmer.stem(w) for w in text.split()])
    return text
def get_non_zero_count(list_of_wiki_files):
    non_zero_count = 0
    porter_stemmer = PorterStemmer()
    for fn in list_of_wiki_files:
        print(fn, non_zero_count)
        sys.stdout.flush()
        start_time = time.time()
        with open(wiki_files_path + fn, 'r') as openfile:
            for line in file_reader_generator(openfile):
                json_dict = json.loads(line)
                file_key = json_dict['id']
                if file_key:
                    text_data = json_dict['text']
                    # Tokenise
                    tokens = word_tokenize(text_data)
                    if tokens:
                        # Lowercase
                        tokens = list(map(lambda x: x.lower(), tokens))
                        # Remove stopwords
                        tokens = list(
                            filter(lambda l_ph: l_ph not in stop_words, tokens))
                        # Remove punctuation and stem
                        tokens = [
                            porter_stemmer.stem((val.translate(translator)))
                            for val in tokens
                        ]
                        # tokens = list(map(lambda val: PorterStemmer().stem(val), tokens))
                        tokens = set(tokens)
                        if '' in tokens:
                            tokens.remove('')
                        non_zero_count += len(tokens)
        end_time = time.time()
        print('time taken', end_time - start_time)
        sys.stdout.flush()
    return non_zero_count
def tokenize_and_normalize(file_name):
    """
    this function takes in a path to a song, reads the song file,
    tokenizes it into words, then stems and lowercases these words.

    INPUT:
        file_name - a path to a file as a string
    OUTPUT:
        normalized_song - a song represented as a list of stems.
    """
    ps = PorterStemmer()
    with open(file_name, 'r') as f:
        song = word_tokenize(f.read().lower())
    normalized_song = [None] * len(song)
    for i, word in enumerate(song):
        normalized_song[i] = ps.stem(word)
    return normalized_song
def frequencyMatrix(sentences):
    frequency_matrix = {}
    stop_words = stopwords.words('english')
    ps = PorterStemmer()
    for sentence in sentences:
        frequency_table = {}
        tokenized_words = word_tokenize(sentence)
        for word in tokenized_words:
            word = word.lower()
            word = ps.stem(word)
            if word in stop_words:
                continue
            if word in frequency_table:
                frequency_table[word] += 1
            else:
                frequency_table[word] = 1
        frequency_matrix[sentence[:15]] = frequency_table
    return frequency_matrix
def twitter_sentiment_analyze(test):
    """
    Pre-processes the data from the input dataset, going through
    tokenization, stemming, and other steps.
    """
    global bow
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    def remove_pattern(text, pattern):
        # re.findall() finds the pattern, i.e. @user, and puts it in a list
        r = re.findall(pattern, text)
        # re.sub() removes @user from the sentences in the dataset
        for i in r:
            text = re.sub(i, "", text)
        return text

    test['Tidy_Tweets'] = np.vectorize(remove_pattern)(test['tweet'], r"@[\w]*")
    test['Tidy_Tweets'] = test['Tidy_Tweets'].str.replace(r"[^a-zA-Z#]", " ")
    test['Tidy_Tweets'] = test['Tidy_Tweets'].apply(
        lambda x: ' '.join([w for w in x.split() if len(w) > 3]))
    tokenized_tweet = test['Tidy_Tweets'].apply(lambda x: x.split())

    from nltk import PorterStemmer
    ps = PorterStemmer()
    tokenized_tweet = tokenized_tweet.apply(lambda x: [ps.stem(i) for i in x])
    for i in range(len(tokenized_tweet)):
        tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
    test['Tidy_Tweets'] = tokenized_tweet
    return test['Tidy_Tweets']
class Stemmer:
    def __init__(self):
        self.ps = PorterStemmer()

    def stem_term(self, token):
        """
        Stem a token.
        :param token: a (term, value) tuple whose first element is the term string
        :return: stemmed term
        """
        return self.ps.stem(token[0])

    def porter_stemmer(self, terms_list):
        for index, w in enumerate(terms_list):
            new_stem = self.stem_term(w)
            if new_stem != w[0]:
                terms_list[index] = (new_stem, terms_list[index][1])
        return terms_list
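# A hypothetical usage sketch for Stemmer above; the (term, position) tuples
# are made up to illustrate the expected input shape.
terms = [("running", 0), ("cats", 1), ("is", 2)]
print(Stemmer().porter_stemmer(terms))
# -> [('run', 0), ('cat', 1), ('is', 2)]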
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))
    ps = PorterStemmer()
    # Here i denotes the index of the sentence
    for i, sent in enumerate(sentences):
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = ps.stem(word)
            if word in stopWords:
                continue
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
        frequency_matrix[sent[:15] + str(i)] = freq_table
    return frequency_matrix
def process_sentence(tokens, preprocessing_params):
    if preprocessing_params[1]:
        stopwordlist = set(stopwords.words("english"))
    else:
        stopwordlist = []
    # Create lemmatizer and stemmer.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    processed_sentence = []
    partofspeech = []
    for word, tag in pos_tag(tokens):
        if len(word) > 1:
            if word not in stopwordlist:
                if tag.startswith('NN'):
                    pos = 'n'  # noun
                elif tag.startswith('VB'):
                    pos = 'v'  # verb
                elif tag.startswith('JJ'):
                    pos = 'a'  # adjective
                elif tag.startswith('RB'):
                    pos = 'r'  # adverb
                else:
                    pos = 'o'  # other
                if pos in ['n', 'v', 'a', 'r']:
                    word = lemmatizer.lemmatize(word, pos)
                else:
                    word = lemmatizer.lemmatize(word)
                # Now stem
                if preprocessing_params[0]:
                    word = stemmer.stem(word)
                processed_sentence.append(word)
                partofspeech.append(pos)
    final_text = ' '.join(processed_sentence)
    final_pos = ' '.join(partofspeech)
    return final_text, final_pos