# Required imports for this class ("settings" is the project's own configuration
# module providing CACHE_DIR and KEYWORDS_DIR).
import pickle
from os.path import join

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from replacers import RegexReplacer, AntonymReplacer
import settings


class classifier(object):

    def __init__(self):
        self.load_classifier()
        self.tokenizer = RegexpTokenizer(r"[\w']+")
        self.lemmatizer = WordNetLemmatizer()
        self.neg_replacer = RegexReplacer()
        self.replacer = AntonymReplacer()
        self.max_key = 300
        self.customstopwords = stopwords.words('english')
        self.customstopwords.remove("up")
        self.customstopwords.remove("down")
        self.customstopwords += [
            's&p500', 'federal', 'united', 'states', 'investors', 'reserve',
            'average', 'nikkei', 'end', 'index', 'market', 'cent', 'wall',
            'street', 'year', 'years', 'industrial', 'bank_of_america', 'york',
            'today', 'dow', 'jones', 'it', 'closing', 'closed', 'saw', 'months',
            'nasdaq', 'trading', 'us', 'day', 'chase', 'mortgage', 'apple',
            'say', 'goldman', 'p500', 'microsoft', 'jpmorgan', 'google', 'bank',
            'company', 'facebook', 'mr', 'wells_fargo', 'share', 'quarter',
            'week', 'sachs', 'executive', 'yesterday', 'investor', 'earnings',
            'time', 'service', 'month', 'business']

    def set_Wordlist(self, tweets):
        # Build the feature word list: every word in the tagged tweets, ordered
        # by frequency, minus custom stop words, truncated to max_key entries.
        wordlist = self.getwordfeatures(self.getwords(tweets))
        wordlist = [i for i in wordlist if i not in self.customstopwords]
        wordlist = wordlist[:self.max_key]
        with open(join(settings.KEYWORDS_DIR, "WordList.txt"), 'wb') as f:
            pickle.dump(wordlist, f)
        return wordlist

    # Pull out all of the words from a list of (words, label) tuples.
    def getwords(self, tweets):
        allwords = []
        for (words, _) in tweets:
            allwords.extend(words)
        return allwords

    # Order the words by frequency, most frequent first.
    def getwordfeatures(self, words):
        wordfreq = nltk.FreqDist(words)
        return [word for word, _ in wordfreq.most_common()]

    def feature_extractor(self, doc):
        docwords = set(doc)
        features = {}
        for i in self.wordlist:
            features['contains(%s)' % i] = (i in docwords)
        return features

    def sent_prob(self, sentence):
        temp = self.lemma_Sent(sentence)
        return self.classifier.prob_classify(self.feature_extractor(temp)).prob('positive')

    def lemma_Sent(self, initialDoc):
        doc = self.neg_replacer.replace(initialDoc)
        word = self.tokenizer.tokenize(doc)
        word_pos = nltk.pos_tag(word)
        # self.replacer.replace_negations_pos(word_pos)
        dic = dict(word_pos)
        word_lemma = []
        for i, _ in word_pos:
            tag = dic[i]
            if tag is None:
                continue
            # Penn Treebank tags: VB* = verb, NN* = noun, JJ* = adjective, RB* = adverb
            if tag[0] == "V":
                word_lemma.append(self.lemmatizer.lemmatize(i, "v").lower())
            elif tag[0] in ("N", "J", "R"):
                word_lemma.append(self.lemmatizer.lemmatize(i).lower())
        return word_lemma

    def load_classifier(self):
        with open(join(settings.CACHE_DIR, "Classifier.dump"), 'rb') as f:
            self.classifier = pickle.load(f)
        with open(join(settings.KEYWORDS_DIR, "WordList.txt"), 'rb') as f:
            self.wordlist = pickle.load(f)
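
# ---------------------------------------------------------------------------
# Minimal usage sketch for the classifier above (not part of the class).
# It assumes Classifier.dump and WordList.txt already exist on disk from a
# previous training run; the headline string is invented for illustration.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    clf = classifier()
    headline = "Stocks rally after the Federal Reserve holds rates steady"
    # probability that the sentence is classified as 'positive'
    print(clf.sent_prob(headline))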
# Required imports for this class; cachedStopWords is the module-level stop
# word set used by CleanWords.
import re

import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from replacers import RegexReplacer

cachedStopWords = set(stopwords.words("english"))


class TextPreProcess(object):

    def __init__(self, _text):
        self.text = _text
        # split a paragraph into sentences
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # HTML tag pattern
        self.rc = re.compile(r"<.*?>")
        # lemmatizer
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # split a sentence into words
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        # abbreviation replacer
        self.replacer = RegexReplacer()

    def RemoveHTML(self):
        return [BeautifulSoup(sentence, "lxml").get_text()
                for sentence in self.text]

    def SplitPhase(self):
        return self.PunktTokenizer.tokenize(self.text)

    def ReplaceAbbre(self):
        return [self.replacer.replace(sentence) for sentence in self.text]

    def SplitSent(self):
        return [self.tokenizer.tokenize(sentence) for sentence in self.text]

    def lemma(self, tags):
        WORD = []
        for word, tag in tags:
            wntag = tag[0].lower()
            # map the Penn Treebank adjective tag (JJ*) to WordNet's 'a'
            wntag = 'a' if wntag == 'j' else wntag
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            if not wntag:
                lemma = word
            else:
                lemma = self.wordnet_lemmatizer.lemmatize(word, wntag)
            WORD.append(lemma)
        return WORD

    def Lemmatizer(self):
        return [self.lemma(nltk.pos_tag(sentence)) for sentence in self.text]

    def CleanWords(self, sentence):
        stops = cachedStopWords
        return [word.lower() for word in sentence
                if len(word) >= 3 and word.isalpha() and word not in stops]

    def CleanSentences(self):
        return [self.CleanWords(sentence) for sentence in self.text]

    def ToStr(self):
        words = []
        for sentence in self.text:
            words.extend(sentence)
        return " ".join(words)

    def process(self):
        self.text = self.SplitPhase()
        self.text = self.ReplaceAbbre()
        self.text = self.SplitSent()
        self.text = self.Lemmatizer()
        self.text = self.CleanSentences()
        self.text = self.ToStr()
        return self.text

    def Print(self):
        print(self.text)
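
# ---------------------------------------------------------------------------
# Minimal usage sketch for the string-based TextPreProcess above.  The sample
# review is invented; process() returns the cleaned review as a single string.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    raw_review = "This movie wasn't bad at all. I'd definitely watch it again."
    preprocessor = TextPreProcess(raw_review)
    print(preprocessor.process())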
# Required imports for this class.
import re

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

from replacers import RegexReplacer


class TextPreProcess(object):
    """Tokenize, lemmatize and clean the reviews in a tab-separated file.

    OutPath1 is the positive data path.
    OutPath2 is the negative data path.
    """

    def __init__(self, FilePath, OutPath1, OutPath2=None):
        self.FilePath = FilePath
        self.OutPath1 = OutPath1
        self.OutPath2 = OutPath2
        self.DataFrame = pd.read_csv(FilePath, sep='\t', quoting=3)
        # split a paragraph into sentences
        self.PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
        # HTML tag pattern
        self.rc = re.compile(r"<.*?>")
        # abbreviation replacer
        self.replacer = RegexReplacer()
        # split a sentence into words
        pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
        self.tokenizer = RegexpTokenizer(pattern)
        # lemmatizer
        self.wordnet_lemmatizer = WordNetLemmatizer()

    def SplitPhase(self, row):
        """Split a paragraph into sentences."""
        return self.PunktTokenizer.tokenize(row['review'])

    def RemoveHTML(self, row):
        """Remove HTML tags."""
        return [BeautifulSoup(sentence, "lxml").get_text()
                for sentence in row['review']]

    def ReplaceAbbre(self, row):
        """Replace abbreviations."""
        return [self.replacer.replace(sentence) for sentence in row['review']]

    def SplitSent(self, row):
        """Split each sentence into words."""
        return [self.tokenizer.tokenize(sentence) for sentence in row['review']]

    def lemma(self, tags):
        """Lemmatize tagged words."""
        WORD = []
        for word, tag in tags:
            wntag = tag[0].lower()
            # map the Penn Treebank adjective tag (JJ*) to WordNet's 'a'
            wntag = 'a' if wntag == 'j' else wntag
            wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
            if not wntag:
                lemma = word
            else:
                lemma = self.wordnet_lemmatizer.lemmatize(word, wntag)
            WORD.append(lemma)
        return WORD

    def Lemmatizer(self, row):
        """Lemmatize words using WordNet."""
        return [self.lemma(nltk.pos_tag(sentence)) for sentence in row['review']]

    def CleanWords(self, sentence):
        """Lowercase and drop short / non-alphabetic words (and stop words
        when the output is not meant for word2vec)."""
        if self.word2vector:
            return [word.lower() for word in sentence
                    if (word.isalpha() and len(word) >= 2) or word.isdigit()]
        else:
            stops = set(stopwords.words("english"))
            return [word.lower() for word in sentence
                    if len(word) >= 3 and word.isalpha() and word not in stops]

    def CleanSentences(self, row):
        """Clean every sentence of a review."""
        return [self.CleanWords(sentence) for sentence in row['review']]

    def ToStr(self, row):
        """Join the cleaned sentences back into a single string."""
        words = []
        for sentence in row['review']:
            words.extend(sentence)
        return " ".join(words)

    def process(self, word2vector=True):
        """Remove HTML tags, replace abbreviations, split into words,
        lemmatize and clean.  When the output is for word2vec training,
        stop words are kept."""
        self.word2vector = word2vector
        # split paragraphs into sentences
        self.DataFrame['review'] = self.DataFrame.apply(self.SplitPhase, axis=1)
        # remove HTML tags
        self.DataFrame['review'] = self.DataFrame.apply(self.RemoveHTML, axis=1)
        # replace abbreviations
        self.DataFrame['review'] = self.DataFrame.apply(self.ReplaceAbbre, axis=1)
        # split sentences into words
        self.DataFrame['review'] = self.DataFrame.apply(self.SplitSent, axis=1)
        # lemmatize
        self.DataFrame['review'] = self.DataFrame.apply(self.Lemmatizer, axis=1)
        # clean sentences
        self.DataFrame['review'] = self.DataFrame.apply(self.CleanSentences, axis=1)
        # convert the word lists back to strings
        self.DataFrame['review'] = self.DataFrame.apply(self.ToStr, axis=1)

    def save(self, Label=False):
        if Label:
            a = self.DataFrame['review'][self.DataFrame.sentiment == 1]
            a.to_csv(self.OutPath1, index=False)
            b = self.DataFrame['review'][self.DataFrame.sentiment == 0]
            b.to_csv(self.OutPath2, index=False)
            print("saved data to " + self.OutPath1 + " and " + self.OutPath2)
        else:
            # drop the id column and save
            self.DataFrame.drop(columns=['id']).to_csv(self.OutPath1, index=False, header=False)
            print("saved to " + self.OutPath1)
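
# ---------------------------------------------------------------------------
# Usage sketch for the DataFrame-based TextPreProcess above.  File names are
# placeholders; the input TSV is assumed to have 'id', 'sentiment' and
# 'review' columns.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    pre = TextPreProcess("labeledData.tsv", "pos_reviews.csv", "neg_reviews.csv")
    pre.process(word2vector=True)   # keep stop words for word2vec training
    pre.save(Label=True)            # write positive and negative reviews separately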
import word_token
from replacers import RegexReplacer

output_seq_len = 20
# padding value
PAD_ID = 0
# start-of-output-sequence marker
GO_ID = 1
# end-of-sequence marker
EOS_ID = 2
# LSTM cell size
size = 8
# initial learning rate
init_learning_rate = 1
# a word enters the vocabulary only if it appears more often than this in the samples
min_freq = 1

wordToken = word_token.WordToken()
replacer = RegexReplacer()

# Kept at module level so that num_encoder_symbols and num_decoder_symbols
# can be computed dynamically from the corpus.
max_token_id = wordToken.load_file_list(['./samples/questioncorpus', './samples/answercorpus'], min_freq)
num_encoder_symbols = max_token_id + 5
num_decoder_symbols = max_token_id + 5


def get_id_list_from(sentence):
    sentence_id_list = []
    seg_list = sentence.split(' ')
    for word in seg_list:
        word_id = wordToken.word2id(word)
        if word_id:
            sentence_id_list.append(word_id)
    return sentence_id_list
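
# Usage sketch for get_id_list_from(): expand contractions with the replacer,
# then map each space-separated word to its vocabulary id.  The sentence is
# invented; words missing from the vocabulary are silently skipped.
sample_question = replacer.replace("what's the market doing today")
print(get_id_list_from(sample_question))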
import pickle

import nltk
from nltk.tokenize import RegexpTokenizer
from replacers import RegexReplacer, AntonymReplacer

# Importing chunkers
import chunkers

tokenizer = RegexpTokenizer(r"[\w']+")

# Chunk grammar for simple noun phrases.
patterns = r"""
    NP: {<DT|PP\$>?<JJ>*<NN>}
        {<NNP>+}
        {<NN>+}
"""
# chunker = nltk.RegexpParser(patterns)

# from nltk.corpus import treebank_chunk
# chunker = chunkers.TagChunker(treebank_chunk.chunked_sents())

# Load a previously trained chunker instead of training it here.
f = open("chunker.dump", 'rb')
chunker = pickle.load(f)
f.close()

# training the chunker, ChunkParser is a class defined in the next slide
# NPChunker = ChunkParser(train_sents)

TxT = "This method doesn't work well, because xxx."

neg_replacer = RegexReplacer()
replacer = AntonymReplacer()  # assumption: replace_negations_pos() and FindSubTree() live on AntonymReplacer

TxT = neg_replacer.replace(TxT)
sent = nltk.pos_tag(nltk.word_tokenize(TxT))

# tree = chunker.parse(sent)
# print("SubTree")
# subtree = replacer.FindSubTree(tree, 'not', 'work')
# print(subtree)

print("After Negation")
replacer.replace_negations_pos(sent)
print(sent)