Example #1
    def __init__(self, _text):
        self.text = _text
        # sentence splitter
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # pattern for stripping HTML tags
        self.rc = re.compile(r"\<.*?\>")
        # WordNet lemmatizer
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # word tokenizer: keeps numbers, acronyms, words and single punctuation marks
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        # contraction/abbreviation replacer
        self.replacer = RegexReplacer()
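
All of these snippets import RegexReplacer from a local replacers module that the examples never show. As a rough sketch, assuming it follows the well-known NLTK Cookbook contraction-replacement recipe (the pattern list and class below are an assumption, not the project's actual replacers.py), it could look like this:

import re

# assumed contraction patterns; the real replacers.py may differ
replacement_patterns = [
    (r"won't", "will not"),
    (r"can't", "cannot"),
    (r"i'm", "i am"),
    (r"ain't", "is not"),
    (r"(\w+)'ll", r"\g<1> will"),
    (r"(\w+)n't", r"\g<1> not"),
    (r"(\w+)'ve", r"\g<1> have"),
    (r"(\w+)'s", r"\g<1> is"),
    (r"(\w+)'re", r"\g<1> are"),
    (r"(\w+)'d", r"\g<1> would"),
]

class RegexReplacer(object):
    """Replace contractions in a string using a list of (regex, replacement) pairs."""
    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        for pattern, repl in self.patterns:
            text = pattern.sub(repl, text)
        return text

With these patterns, RegexReplacer().replace("This method doesn't work") would return "This method does not work", which is the kind of normalization the preprocessing constructors above rely on.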
Example #2
    def __init__(self, FilePath, OutPath1, OutPath2=None):
        self.FilePath = FilePath
        self.OutPath1 = OutPath1
        self.OutPath2 = OutPath2
        self.DataFrame = pd.read_csv(FilePath, sep='\t', quoting=3)

        # sentence splitter: splits a paragraph into sentences
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # remove HTML tag pattern
        self.rc = re.compile(r"\<.*?\>")
        # Replacer class
        self.replacer = RegexReplacer()
        # split a sentence into words
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        # Lemmatizer
        self.wordnet_lemmatizer = WordNetLemmatizer()
Example #3
    def __init__( self ):
        self.load_classifier()
        self.tokenizer = RegexpTokenizer( r"[\w']+" )
        self.lemmatizer = WordNetLemmatizer()
        self.neg_replacer = RegexReplacer()
        self.replacer = AntonymReplacer()
        self.max_key = 300
        self.customstopwords = stopwords.words( 'english' )
        self.customstopwords.remove( "up" )
        self.customstopwords.remove( "down" )
        # domain-specific financial stopwords
        self.customstopwords += ['s&p500', 'federal', 'united', 'states', 'investors', 'reserve', 'average', 'nikkei', 'end',
                           'index', 'market', 'cent', 'wall', 'street', 'year', 'years', 'industrial', 'bank_of_america', 'york', 'today',
                           'dow', 'jones', 'it', 'closing', 'closed', 'saw', 'months', 'nasdaq', 'trading', 'us', 'day', 'chase', 'mortgage',
                           'apple', 'say', 'goldman', 'p500', 'microsoft', 'jpmorgan', 'google', 'bank', 'company', 'facebook', 'mr', 'wells_fargo',
                           'share', 'quarter', 'week', 'sachs', 'executive', 'yesterday', 'investor', 'earnings', 'time', 'service', 'month', 'business']
Example #4
class classifier( object ):
    
    def __init__( self ):
        self.load_classifier()
        self.tokenizer = RegexpTokenizer( r"[\w']+" )
        self.lemmatizer = WordNetLemmatizer()
        self.neg_replacer = RegexReplacer()
        self.replacer = AntonymReplacer()
        self.max_key = 300
        self.customstopwords = stopwords.words( 'english' )
        self.customstopwords.remove( "up" )
        self.customstopwords.remove( "down" )
        # domain-specific financial stopwords
        self.customstopwords += ['s&p500', 'federal', 'united', 'states', 'investors', 'reserve', 'average', 'nikkei', 'end',
                           'index', 'market', 'cent', 'wall', 'street', 'year', 'years', 'industrial', 'bank_of_america', 'york', 'today',
                           'dow', 'jones', 'it', 'closing', 'closed', 'saw', 'months', 'nasdaq', 'trading', 'us', 'day', 'chase', 'mortgage',
                           'apple', 'say', 'goldman', 'p500', 'microsoft', 'jpmorgan', 'google', 'bank', 'company', 'facebook', 'mr', 'wells_fargo',
                           'share', 'quarter', 'week', 'sachs', 'executive', 'yesterday', 'investor', 'earnings', 'time', 'service', 'month', 'business']
        
    def set_Wordlist( self, tweets ):
        # Calls the functions below - gives us a list of the words in the tweets, ordered by frequency.
        wordlist = self.getwordfeatures( self.getwords( tweets ) )
        wordlist = [ i for i in wordlist if i not in self.customstopwords ]
        wordlist = wordlist[ :self.max_key ]
        # pickle requires a binary file handle
        with open( join( settings.KEYWORDS_DIR, "WordList.txt" ), 'wb' ) as f:
            pickle.dump( wordlist, f )
        return wordlist
    
    # Pull out all of the words in a list of tagged tweets, formatted in tuples.
    def getwords( self, tweets ):
        allwords = []
        for ( words, _ ) in tweets:
            allwords.extend( words )
        return allwords
    
    # Order a list of words by their frequency.
    def getwordfeatures( self, listofwords ):
        # Print out wordfreq if you want to have a look at the individual counts of words.
        wordfreq = nltk.FreqDist( listofwords )
        # FreqDist.keys() is not frequency-ordered in recent NLTK versions; use most_common()
        words = [ word for word, _ in wordfreq.most_common() ]
        return words
    
    def feature_extractor( self, doc ):
        docwords = set( doc )
        features = {}
        for i in self.wordlist:
            features['contains(%s)' % i] = ( i in docwords )
        return features
    
    def sent_prob( self, sentence ):
        temp = self.lemma_Sent( sentence )
        return self.classifier.prob_classify( self.feature_extractor( temp ) ).prob( 'positive' )
    
    def lemma_Sent( self, initialDoc ):
        doc = self.neg_replacer.replace( initialDoc )
        words = self.tokenizer.tokenize( doc )
        word_pos = nltk.pos_tag( words )
    #    replacer.replace_negations_pos(word_pos)

        word_lemma = []

        for word, tag in word_pos:
            if tag.startswith( "V" ):
                word_lemma.append( self.lemmatizer.lemmatize( word, "v" ).lower() )
            elif tag.startswith( ( "N", "J", "R" ) ):
                # Penn Treebank tags: N* nouns, J* adjectives, R* adverbs
                word_lemma.append( self.lemmatizer.lemmatize( word ).lower() )
        return word_lemma
    
    def load_classifier( self ):
        # pickled objects must be read in binary mode
        with open( join( settings.CACHE_DIR, "Classifier.dump" ), 'rb' ) as f:
            self.classifier = pickle.load( f )
        with open( join( settings.KEYWORDS_DIR, "WordList.txt" ), 'rb' ) as f:
            self.wordlist = pickle.load( f )
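
A hypothetical way to use the classifier above; the pickled Classifier.dump and WordList.txt files under settings.CACHE_DIR and settings.KEYWORDS_DIR are assumed to already exist:

clf = classifier()
# probability that the lemmatized, feature-extracted sentence is classified as 'positive'
print(clf.sent_prob("Stocks didn't rally despite strong earnings"))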
Example #5
class TextPreProcess(object):
    def __init__(self, _text):
        self.text = _text
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        self.rc = re.compile(r"\<.*?\>")
        self.wordnet_lemmatizer = WordNetLemmatizer()
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        self.replacer = RegexReplacer()

    def RemoveHTML(self):
        return [
            BeautifulSoup(sentence, "lxml").get_text()
            for sentence in self.text
        ]

    def SplitPhase(self):
        return self.PunktTokenizer.tokenize(self.text)

    def ReplaceAbbre(self):
        return [self.replacer.replace(sentence) for sentence in self.text]

    def SplitSent(self):
        return [self.tokenizer.tokenize(sentence) for sentence in self.text]

    def lemma(self, tags):
        WORD = []
        for word, tag in tags:
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v', 's'] else None
            if not wntag:
                lemma = word
            else:
                lemma = self.wordnet_lemmatizer.lemmatize(word, wntag)

            WORD.append(lemma)
        return WORD

    def Lemmatizer(self):
        return [self.lemma(nltk.pos_tag(sentence)) for sentence in self.text]

    def CleanWords(self, sentence):
        # cachedStopWords is assumed to be a module-level, precomputed stopword collection
        stops = cachedStopWords
        return [
            word.lower() for word in sentence
            if len(word) >= 3 and word.isalpha() and word not in stops
        ]

    def CleanSentences(self):
        return [self.CleanWords(sentence) for sentence in self.text]

    def ToStr(self):
        # join every word of every sentence into one space-separated string
        return " ".join(word for sentence in self.text for word in sentence)

    def process(self):
        self.text = self.SplitPhase()
        self.text = self.ReplaceAbbre()
        self.text = self.SplitSent()
        self.text = self.Lemmatizer()
        self.text = self.CleanSentences()
        self.text = self.ToStr()
        return self.text

    def Print(self):
        print(self.text)
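
A hypothetical way to drive the in-memory variant above; the replacers module, cachedStopWords and the required NLTK data (punkt, wordnet, POS tagger) are assumed to be available:

raw = "The movie wasn't great. I couldn't recommend it to anyone."
pre = TextPreProcess(raw)
print(pre.process())   # one space-separated string of cleaned, lemmatized words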
Example #6
class TextPreProcess(object):
    """ Token/Lemmatizer/Clean text\n
        OutPath1 is positive data path\n
        OutPath2 is negative data path\n
    """
    def __init__(self, FilePath, OutPath1, OutPath2=None):
        self.FilePath = FilePath
        self.OutPath1 = OutPath1
        self.OutPath2 = OutPath2
        self.DataFrame = pd.read_csv(FilePath, sep='\t', quoting=3)

        # sentence splitter: splits a paragraph into sentences
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # remove HTML tag pattern
        self.rc = re.compile(r"\<.*?\>")
        # Replacer class
        self.replacer = RegexReplacer()
        # split a sentence into words
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        # Lemmatizer
        self.wordnet_lemmatizer = WordNetLemmatizer()

    def SplitPhase(self, row):
        """ split paragraph to sentence """
        return self.PunktTokenizer.tokenize(row['review'])

    def RemoveHTML(self, row):
        """ remove HTML tags """
        return [
            BeautifulSoup(sentence, "lxml").get_text()
            for sentence in row['review']
        ]

    def ReplaceAbbre(self, row):
        """ Replace abbreviation """
        return [self.replacer.replace(sentence) for sentence in row['review']]

    def SplitSent(self, row):
        """ split sentence to words """
        return [
            self.tokenizer.tokenize(sentence) for sentence in row['review']
        ]

    def lemma(self, tags):
        """ lemmatizer for tagged words """
        WORD = []
        for word, tag in tags:
            wntag = tag[0].lower()
            wntag = wntag if wntag in ['a', 'r', 'n', 'v', 's'] else None
            if not wntag:
                lemma = word
            else:
                lemma = self.wordnet_lemmatizer.lemmatize(word, wntag)
            WORD.append(lemma)
        return WORD

    def Lemmatizer(self, row):
        """ Lemmatizer words use WordNet """
        return [
            self.lemma(nltk.pos_tag(sentence)) for sentence in row['review']
        ]

    def CleanWords(self, sentence):
        """ lowercase words and drop short or non-alphabetic tokens;
            in word2vector mode keep digits and 2+ letter words, otherwise
            also remove English stopwords and words shorter than 3 letters
        """
        if self.word2vector:
            return [
                word.lower() for word in sentence
                if (word.isalpha() and len(word) >= 2) or word.isdigit()
            ]
        else:
            stops = set(stopwords.words("english"))
            return [
                word.lower() for word in sentence
                if len(word) >= 3 and word.isalpha() and word not in stops
            ]

    def CleanSentences(self, row):
        """ clean sentences """
        return [self.CleanWords(sentence) for sentence in row['review']]

    def ToStr(self, row):
        # join every word of every sentence of the review into one space-separated string
        return " ".join(word for sentence in row['review'] for word in sentence)

    def process(self, word2vector=True):
        """ Split into sentences, remove HTML tags, replace abbreviations,
            split into words, lemmatize and clean.
            When word2vector is True, stopwords are kept (word2vec models benefit from them).
        """
        self.word2vector = word2vector
        # split phase
        self.DataFrame['review'] = self.DataFrame.apply(self.SplitPhase,
                                                        axis=1)
        # remove HTML tags
        self.DataFrame['review'] = self.DataFrame.apply(self.RemoveHTML,
                                                        axis=1)
        # replace abbre
        self.DataFrame['review'] = self.DataFrame.apply(self.ReplaceAbbre,
                                                        axis=1)
        # split sentences
        self.DataFrame['review'] = self.DataFrame.apply(self.SplitSent, axis=1)
        # lemmatizer
        self.DataFrame['review'] = self.DataFrame.apply(self.Lemmatizer,
                                                        axis=1)
        # clean sentences
        self.DataFrame['review'] = self.DataFrame.apply(self.CleanSentences,
                                                        axis=1)
        # convert list to str
        self.DataFrame['review'] = self.DataFrame.apply(self.ToStr, axis=1)

    def save(self, Label=False):
        if Label:
            a = self.DataFrame['review'][self.DataFrame.sentiment == 1]
            a.to_csv(self.OutPath1, index=False)
            b = self.DataFrame['review'][self.DataFrame.sentiment == 0]
            b.to_csv(self.OutPath2, index=False)
            print("save data success to " + self.OutPath1 + " and " +
                  self.OutPath2)
        else:
            # drop column and save
            self.DataFrame.drop(columns=['id']).to_csv(self.OutPath1,
                                                       index=False,
                                                       header=False)
            print("save to" + self.OutPath1)
Example #7
output_seq_len = 20
# padding value for empty positions
PAD_ID = 0
# start-of-output-sequence marker
GO_ID = 1
# end-of-sequence marker
EOS_ID = 2
# LSTM cell size (number of hidden units)
size = 8
# initial learning rate
init_learning_rate = 1
# a word must occur more often than this in the samples to enter the vocabulary
min_freq = 1

wordToken = word_token.WordToken()
replacer = RegexReplacer()

# kept at module level so that num_encoder_symbols and num_decoder_symbols can be computed dynamically
max_token_id = wordToken.load_file_list(['./samples/questioncorpus', './samples/answercorpus'], min_freq)
num_encoder_symbols = max_token_id + 5
num_decoder_symbols = max_token_id + 5


def get_id_list_from(sentence):
    # map a space-separated sentence to a list of word ids, skipping unknown words
    sentence_id_list = []
    for word in sentence.split(' '):
        word_id = wordToken.word2id(word)
        if word_id:
            sentence_id_list.append(word_id)
    return sentence_id_list
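
A hypothetical use of get_id_list_from together with the PAD_ID/GO_ID/EOS_ID markers defined above; the sample corpus files and the WordToken vocabulary are assumed to exist, so the concrete ids depend on them:

question = replacer.replace("don't you like it")   # expand contractions before splitting on spaces
question_ids = get_id_list_from(question)
answer_ids = get_id_list_from("yes i do")
# one common seq2seq framing: the decoder sequence starts with GO_ID, ends with EOS_ID,
# and is padded with PAD_ID up to output_seq_len
decoder_ids = [GO_ID] + answer_ids + [EOS_ID]
decoder_ids += [PAD_ID] * (output_seq_len - len(decoder_ids))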
Example #8
import nltk
import pickle
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"[\w']+")

#Importing Chunkers
patterns = r"""
 NP: {<DT|PP\$>?<JJ>*<NN>}
 {<NNP>+}
 {<NN>+}
"""
#chunker=nltk.RegexpParser(patterns)
import chunkers
#from nltk.corpus import treebank_chunk
#chunker=chunkers.TagChunker(treebank_chunk.chunked_sents())
# the pickled chunker must be opened in binary mode
f = open("chunker.dump", 'rb')
chunker = pickle.load(f)
f.close()

# training the chunker, ChunkParser is a class defined in the next slide
#NPChunker = ChunkParser(train_sents)
TxT = "This method doesn't work well, because xxx."
from replacers import RegexReplacer
neg_replacer = RegexReplacer()
TxT = neg_replacer.replace(TxT)
sent = nltk.pos_tag(nltk.word_tokenize(TxT))
#tree=chunker.parse(sent)
#print("SubTree")
#subtree=replacer.FindSubTree(tree, 'not', 'work')
#print(subtree)
print("After Negation")
# `replacer` is assumed to be an AntonymReplacer-style object defined elsewhere in the original script
replacer.replace_negations_pos(sent)
print(sent)
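
Examples #3, #4 and #8 also rely on an AntonymReplacer from the same replacers module, and #8 calls a replace_negations_pos method that none of the snippets define. A minimal sketch of a WordNet-based antonym replacer working on plain token lists, modeled on the NLTK Cookbook recipe rather than the project's actual code, could be:

from nltk.corpus import wordnet

class AntonymReplacer(object):
    def replace(self, word, pos=None):
        # return the antonym of `word` only if WordNet gives exactly one candidate
        antonyms = set()
        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())
        return antonyms.pop() if len(antonyms) == 1 else None

    def replace_negations(self, sent):
        # rewrite "not X" as the antonym of X when one exists,
        # e.g. ['do', 'not', 'uglify', 'your', 'code'] -> ['do', 'beautify', 'your', 'code']
        i, words = 0, []
        while i < len(sent):
            if sent[i] == 'not' and i + 1 < len(sent):
                antonym = self.replace(sent[i + 1])
                if antonym:
                    words.append(antonym)
                    i += 2
                    continue
            words.append(sent[i])
            i += 1
        return words

The replace_negations_pos method used in the examples presumably applies the same idea to (word, tag) tuples produced by nltk.pos_tag.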