Code Example #1
    def __init__(self, tagger_backup):
        t0 = time()
        print('Initialisation ...')
        self.extracter = Extracter()
        self.tokenizer = Tokenizer()
        self.tagger = Tagger(name=tagger_backup)
        self.replacer = AntonymReplacer()
        self.lemmatizer = Lemmatizer()
        print('Initialisation done in %0.3fs.' % (time() - t0))
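
The Extracter, Tokenizer, Tagger, AntonymReplacer and Lemmatizer instantiated here are project-specific helpers whose source is not included in these examples. Purely as an illustration, a Lemmatizer exposing the lemmatize_sentences() interface used in Code Example #3 could be sketched on top of NLTK's WordNetLemmatizer; the class below and its tag-to-WordNet mapping are assumptions, not the project's actual code.

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

class Lemmatizer(object):
    """Sketch only: POS-aware lemmatization with NLTK's WordNetLemmatizer."""

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def _wordnet_pos(self, tag):
        # Map a Penn Treebank-style tag onto the WordNet POS constants (assumed tag set).
        if tag.startswith('V'):
            return wordnet.VERB
        if tag.startswith('J'):
            return wordnet.ADJ
        if tag.startswith('R'):
            return wordnet.ADV
        return wordnet.NOUN

    def lemmatize_sentences(self, tagged_sentences):
        # tagged_sentences: list of sentences, each a list of (word, tag) pairs.
        lemmatized = []
        for sentence in tagged_sentences:
            lemmatized.append([self.lemmatizer.lemmatize(word, self._wordnet_pos(tag))
                               for word, tag in sentence])
        return lemmatized
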
Code Example #2
# Tutorial 17 : https://www.youtube.com/watch?v=88yOCX4ZyoA&list=PLcTXcpndN-Sl9eYrKM6jtcOTgC52EJnqH&index=17

from nltk.tokenize import word_tokenize

from replacer import AntonymReplacer

rep = AntonymReplacer()

# Rebuild a tokenized sentence, swapping in an antonym wherever one is found.
sw = word_tokenize('Sup! Awesome bday? brb!')

sw2 = ""

for word in sw:
    result = rep.replace(word)
    # replace() may return None when no antonym is found; keep the original token in that case.
    sw2 += (result if result else word) + " "

print(sw2)

# Replace single words with their antonyms.
antn = rep.replace('cowardice')
print(antn)

antn = rep.replace('heavy')
print(antn)

antn = rep.replace('weak')
print(antn)

antn = rep.replace('blue')
print(antn)

# Replace a negated word ("not salty") with the antonym of the word it negates.
sent = rep.negreplace('this man is not salty')
print(sent)
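
The replacer module imported above comes from the tutorial and is not shown here. Purely as an illustration, a minimal WordNet-based AntonymReplacer exposing the replace()/replace_negations() interface used in Code Example #3 might look like the sketch below; it assumes replace() takes a single word and replace_negations() a list of plain tokens, and it does not reproduce the negreplace() call used above, so treat it as an approximation rather than the tutorial's actual class.

from nltk.corpus import wordnet

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        # Collect every antonym WordNet lists for this word.
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        # Only replace when the antonym is unambiguous; otherwise signal "no replacement".
        if len(antonyms) == 1:
            return antonyms.pop()
        return None

    def replace_negations(self, sent):
        # Collapse "not X" into the antonym of X whenever WordNet offers a single one.
        i, length = 0, len(sent)
        words = []
        while i < length:
            word = sent[i]
            if word == 'not' and i + 1 < length:
                antonym = self.replace(sent[i + 1])
                if antonym:
                    words.append(antonym)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words
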
Code Example #3
from time import time

# Extracter, Tokenizer, Tagger, AntonymReplacer and Lemmatizer are project-specific
# classes; their imports are not shown in the original snippet.

class Preprocesser(object):

    def __init__(self, tagger_backup):
        t0 = time()
        print('Initialisation ...')
        self.extracter = Extracter()
        self.tokenizer = Tokenizer()
        self.tagger = Tagger(name=tagger_backup)
        self.replacer = AntonymReplacer()
        self.lemmatizer = Lemmatizer()
        print('Initialisation done in %0.3fs.' % (time() - t0))

    def preprocess(self, text):
        t0 = time()
        preprocessed_jokes = []
        for joke in text:
            # Jokes tokenization...
            tokenized_joke = self.tokenizer.tokenize(joke, remove_stopwords=False)
            # Jokes tokenization done!
            # Jokes POS Tagging...
            tagged_sentences = self.tagger.tag_sentences(tokenized_joke)
            tagged_joke = self.tagger.map_tag(tagged_sentences)
            # Jokes POS Tagging done!
            # Jokes negation replacement...
            tagged_joke2 = []
            for sentence in tagged_joke:
                tagged_joke2.append(self.replacer.replace_negations(sentence))
            # Jokes negation replacement done!
            # Jokes lemmatization...
            lemmatized_joke = self.lemmatizer.lemmatize_sentences(tagged_joke2)
            # Jokes lemmatization done!
            # Jokes stopword and punctuation removal...
            normalized_joke = self.extracter.remove_stopwords(lemmatized_joke)
            # Jokes stopword and punctuation removal done!
            # Jokes transformation from lists of tokens into a single string...
            preprocessed_joke = ''
            for normalized_sentence in normalized_joke:
                preprocessed_joke = preprocessed_joke + ' ' + ' '.join(normalized_sentence)
            preprocessed_jokes.append(preprocessed_joke.strip())
            # Jokes transformation from lists of tokens into a single string done!
        print('Preprocessing done in %0.3fs.' % (time() - t0))
        return preprocessed_jokes
#    def preprocess(self, path='data/jokes/'):
#        #path = 'data/jokes/'
#        html_jokes = self.retriever.read_from_directory(path)
#        # Jokes extraction from HTML documents
#        jokes = []
#        for html_joke in html_jokes:
#            jokes.append(extracter.extract_joke(html_joke))
#            #jokes.append(extracter.replace(extracter.extract_text(html_joke)))
#        # Jokes tokenization
#        tokenized_jokes = []
#        for joke in jokes:
#            #without_punc = extracter.remove_punctuation(joke)
#            tokenized_jokes.append(tokenizer.tokenize(joke, remove_stopwords=False))
#        # Jokes POS Tagging
#        tagged_jokes = []
#        for sentences in tokenized_jokes:
#            tagged_sentences = tagger.tag_sentences(sentences)
#            tagged_jokes.append(tagger.map_tag(tagged_sentences))
#        # Jokes negation replacement
#        tagged_jokes_2 = []
#        for sentences in tagged_jokes:
#            tg_sentences = []
#            for sentence in sentences:
#                tg_sentences.append(replacer.replace_negations(sentence))
#            tagged_jokes_2.append(tg_sentences)
#        # Jokes lemmatization
#        lemmatized_jokes = []
#        for sentences in tagged_jokes_2:
#            lemmatized_jokes.append(lemmatizer.lemmatize_sentences(sentences))
#        # Jokes removing stopwords and punctuation
#        normalized_jokes = []
#        for lemmatized_joke in lemmatized_jokes:
#            normalized_jokes.append(extracter.remove_stopwords(lemmatized_joke))
#        # Jokes transformation from list of tokens into concatenation of tokens
#        preprocessed_jokes = []
#        for normalized_joke in normalized_jokes:
#            preprocessed_joke = ''
#            for normalized_sentence in normalized_joke:
#                preprocessed_joke = preprocessed_joke +' '+' '.join(normalized_sentence)
#            preprocessed_jokes.append(preprocessed_joke.strip()) 
#
#        return preprocessed_jokes
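
Finally, a hypothetical usage sketch for the Preprocesser class above: the module name preprocesser, the tagger_backup file name and the sample jokes are illustrative assumptions, not part of the original project.

from preprocesser import Preprocesser  # assumed module name

jokes = [
    "Why did the chicken cross the road? To get to the other side.",
    "I would tell you a UDP joke, but you might not get it.",
]

# 'tagger.pickle' stands in for whatever saved tagger backup the project actually uses.
preprocesser = Preprocesser(tagger_backup='tagger.pickle')
for joke in preprocesser.preprocess(jokes):
    print(joke)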