def corpus_statistics():
    # train_corpus_path = "/userstore/jieg/credbank/corpus/credbank_train_corpus.txt"
    train_corpus_path = "C:\\Data\\credbank\\tweets_corpus\\shuffled_credbank_held_corpus.txt"
    with open(train_corpus_path, mode='r', encoding='utf-8') as file:
        train_corpus = file.readlines()

    from nltk.tokenize.regexp import WhitespaceTokenizer
    whitespace_tokenize = WhitespaceTokenizer().tokenize
    corpus_size = 0
    for tweet in train_corpus:
        tokens = whitespace_tokenize(tweet)
        corpus_size += len(tokens)
    print("all words (corpus size): ", corpus_size)

    from sklearn.feature_extraction.text import CountVectorizer
    # extract tokens
    text_vectorizer = CountVectorizer(analyzer='word', tokenizer=WhitespaceTokenizer().tokenize,
                                      ngram_range=(1, 1), min_df=1)
    X = text_vectorizer.fit_transform(train_corpus)
    # Vocabulary
    vocab = list(text_vectorizer.get_feature_names())
    print("vocabulary size: ", len(vocab))  # 913611
    counts = X.sum(axis=0).A1
    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))
    print("top N frequent words: ", freq_distribution.most_common(10))
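# A minimal alternative sketch (assumption: `train_corpus` is the same list of tweet
# strings as above). The same statistics can be computed with a plain Counter and no
# document-term matrix, which is lighter on memory for very large corpora; note that,
# unlike CountVectorizer's default, this does not lowercase the tokens.
from collections import Counter
from nltk.tokenize.regexp import WhitespaceTokenizer

def corpus_statistics_counter(train_corpus):
    tokenize = WhitespaceTokenizer().tokenize
    freq_distribution = Counter(token for tweet in train_corpus for token in tokenize(tweet))
    print("all words (corpus size): ", sum(freq_distribution.values()))
    print("vocabulary size: ", len(freq_distribution))
    print("top N frequent words: ", freq_distribution.most_common(10))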
def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.pt_stemmer = nltk.stem.RSLPStemmer()
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('portuguese')
    self.more_stopwords = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
                           'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta',
                           'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha',
                           'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim',
                           'oi', 'ola', 'ate']
    self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'),
                          ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                          ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'),
                          ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                          ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
                          ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]
    self.link_patterns = [('http'), ('www'), ('w3c')]
    self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                   (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                   (r'\s*saudad\s*', ' saudade ')]
    self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo')]
def main():
    text = read_doc()
    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]
    text = [[token.lower() for token in sent] for sent in text]
    text = [[''.join(ch for ch in token if ch.isalpha() or ch == '\'') for token in sent]
            for sent in text]
    text = [[token for token in sent if len(token) >= 2 and len(token) <= 35] for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if token not in stopwords] for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)
    # print(X.toarray())
    feature_names = vect.get_feature_names()
    # print(feature_names)

    from collections import Counter
    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter({key: value for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(chain.from_iterable(
        [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup) for tup in bfd]
    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])
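# The snippet above relies on a `mutinf` scorer defined elsewhere; a minimal
# pointwise-mutual-information sketch that matches the call signature
# mutinf(pair_count, count_x, count_y, n_segments) — an assumption, not the original code:
from math import log

def mutinf(n_xy, n_x, n_y, n_total):
    # PMI: log2 of p(x, y) / (p(x) * p(y)), with probabilities estimated from segment counts
    return log(float(n_xy) * n_total / (n_x * n_y), 2)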
def __init__(self, use_unicode=True):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.pt_stemmer = nltk.stem.RSLPStemmer()
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('portuguese')
    self.symbols = [
        u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-", u"[", u"]", u"{", u"}",
        u"/", u"\\", u"^", u"~", u"´", u"`", u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$",
        u"%", u"&", u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b", u"\u2019",
        u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b", u"\u00b7", u"\u2014", u"\u00bb",
        u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a",
        u"\u00bf", u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
    ]
    self.more_stopwords = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
                           'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta',
                           'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha',
                           'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita', 'muitos', 'muitas',
                           'onde', 'mim', 'oi', 'ola', 'ate']
    if use_unicode:
        self.accents = unicode_replace
    else:
        self.accents = ascii_replace
    self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
    self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                   (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                   (r'\s*saudad\s*', ' saudade ')]
    self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo')]
def __init__(self, use_unicode):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('english')
    self.symbols = [
        u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-", u"[", u"]", u"{", u"}",
        u"/", u"\\", u"^", u"~", u"´", u"`", u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$",
        u"%", u"&", u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b", u"\u2019",
        u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7", u"\u2014", u"\u00bb", u"\u221a",
        u"\u00aa", u"\ufe0f", u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
        u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
    ]
    if use_unicode:
        self.accents = unicode_replace
    else:
        self.accents = ascii_replace
    self.link_patterns = [('http'), ('www'), ('w3c')]
    self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'), (r'eqe', 'ee'),
                    (r'oqo', 'oo'), (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'), (r'dqd', 'dd'),
                    (r'mqm', 'mm'), (r'nqn', 'nn'), (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                    (r'fqf', 'ff'), (r'lql', 'll')]
def processPost(self, post):
    tokenizer = WhitespaceTokenizer()
    if post.text is not None and post.text != "":
        curtext = post.text.encode('utf-8')
        tokens = [word for sent in nltk.sent_tokenize(curtext)
                  for word in tokenizer.tokenize(sent)]
        tokens = self.normalizeTokens(tokens)
        text = nltk.Text(tokens)
        self.processText(post, text)
def __chunk_sentence(self, sentence):
    """Tokenize the sentence into words using a whitespace parser to avoid
    parsing couldn't into two tokens (could and n't). Then chunk the tokens
    according to GRAMMAR.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokens)
    return self.parser.parse(pos_tagged)
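# __chunk_sentence assumes that `self.parser` was built from a chunk GRAMMAR elsewhere
# in the class; a minimal sketch of that setup with a hypothetical noun-phrase rule
# (the original GRAMMAR is not shown here):
import nltk

GRAMMAR = r"NP: {<DT>?<JJ>*<NN.*>+}"
parser = nltk.RegexpParser(GRAMMAR)
# e.g. parser.parse(nltk.pos_tag(["the", "whitespace", "tokenizer", "couldn't", "split", "this"]))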
def evaluateclassifier(self, featureselection):
    positivecount = 0
    negativecount = 0
    negativetweets = []
    positivetweets = []
    #print 'Evaluating Classifier'
    print featureselection
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
        #print 'Opening corpus file'
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if(row[0] == '4' and positivecount < self.corpuslength):
                positivetweets.append(row[5])
                positivecount += 1
            # Negative sentiment tweets
            if(row[0] == '0' and negativecount < self.corpuslength):
                negativetweets.append(row[5])
                negativecount += 1

    #print 'Generating Features'
    self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos')
                             for tweet in positivetweets]
    self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg')
                             for tweet in negativetweets]
    poscutoff = len(self.positivefeatures)
    negcutoff = len(self.negativefeatures)
    print "Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff)

    trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
    testfeats = self.test(featureselection)
    #testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
    print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
    #classifier.show_most_informative_features(20)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        #print label, observed
        testsets[observed].add(i)

    print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
    print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
    print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
    print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
    print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
    print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
def _fit(self):
    '''Tokenize the documents, make backwards and forwards lists,
    call the make_dictionary method'''
    tokenizer = WhitespaceTokenizer()
    # Get the sentences from the corpus
    sent_list_of_str = sent_tokenize(self.corpus_txt.lower())
    # Capitalize and save the punctuation from the end
    sent_cap = [(sent.capitalize()[:-1], sent[-1]) for sent in sent_list_of_str]
    # Word tokenize to keep contractions, add back on punc
    self.f_sent = [tokenizer.tokenize(word_tuple[0]) + [word_tuple[1]]
                   for word_tuple in sent_cap]
    # Reverse those sentences
    self.b_sent = [list(reversed(word_list)) for word_list in self.f_sent]
    self.f_dict = self._make_dictionary(self.f_sent)
    self.b_dict = self._make_dictionary(self.b_sent)
def process(self, text):
    """
    Preprocessing: sentence tokenization and duplicate removal.
    Returns a list of sentences (for the vector method, for future use).

    Args:
        text ([type]): [description]
    """
    # text = text.lower()
    # remove numbers, e-mail addresses and hyperlinks
    # text = text.encode('utf-8')
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    text = sentence_tokenizer.tokenize(text)

    cleaned_text = []
    stop_words = set(stopwords.words('russian'))
    # split into words, strip the remaining punctuation and stopwords
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')
    for sentence in text:
        punct_cleaned_sent = clear_endings(sentence)  # strip sentence-final punctuation
        tokenized_sent = tokenizer.tokenize(punct_cleaned_sent)  # word split, for cleaning only
        stpw_clean_sentence = [word for word in tokenized_sent if word not in stop_words]
        stemmed_sentence = [stemmer.stem(word) for word in stpw_clean_sentence]  # reduce each word to its stem
        clean_sentence = ' '.join(stemmed_sentence)  # re-join into a sentence string for hashing
        cleaned_text.append(clean_sentence)
    return cleaned_text
def get_ngram_counts(comment_iter, n, tokenizer=None, sample_pct=100):
    """
    Compute ngram counts from comments.

    Parameters:
    -----------
    comment_iter : generator
    n : int
    tokenizer : nltk.tokenize.Tokenizer
    sample_pct : float
        Optional percentage from which to subsample the data.

    Returns:
    --------
    counts : pandas.DataFrame
        Rows = ngrams, col = counts.
    """
    if (tokenizer is None):
        tokenizer = WhitespaceTokenizer()
    counts = Counter()
    for i, c in enumerate(comment_iter):
        if (sample_pct == 100 or random.random() * 100 < sample_pct):
            ngrams = ngram_split(c, n, tokenizer)
            for ngram in ngrams:
                ngram = [' '.join(ngram)]
                counts.update(ngram)
        if (i % 1000000 == 0):
            print('got %d unique ngrams' % (len(counts)))
    # convert to dataframe
    counts = pd.DataFrame(pd.Series(counts))
    return counts
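# get_ngram_counts calls an ngram_split helper that is not shown in this snippet;
# a minimal sketch consistent with how it is used above (an assumption, not the original):
def ngram_split(comment, n, tokenizer):
    tokens = tokenizer.tokenize(comment)
    # successive n-token windows, e.g. n=2 yields bigrams as lists of tokens
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]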
def __init__(self, data_iterator, tokenizer=WhitespaceTokenizer(), char_map=None,
             word_len=30, sent_len=200):
    '''
    DESCRIPTIONS:
        This class converts text to numbers for the standard unicode vocabulary size.

    PARAMS:
        data_iterator (iterator): iterator over the text strings
        word_len (int): maximum length of a word; any word shorter than that is
            padded with zeros, any word longer than that is cut at the max word length.
        sent_len (int): maximum number of words in a sentence; any sentence with fewer
            words is padded with zeros, any sentence with more words is cut at the
            max sentence length.
        char_map (dict): a dictionary for mapping characters to numbers.
    '''
    self.data_iterator = data_iterator
    self.word_len = word_len
    self.sent_len = sent_len
    self.char_map = char_map
    self.tokenizer = tokenizer
    self.char_zero = ' '  # character to be assigned the zero index
def tokenize(s):
    """
    Tokenize string.

    Function to tokenize text into words (tokens). Downloads the default NLTK
    tokenizer if it is not on the machine.

    Args:
        - s: string with sentence to tokenize.

    Returns:
        - tokens: list of tuples (token, start-index, end-index)
    """
    text = sub(r"[,.:;'\"]", " ", s)
    tokenizer = Tokenizer()
    spans = tokenizer.span_tokenize(text)
    tokens = tokenizer.tokenize(text)
    # pair each token with its (inclusive) start and end character offsets
    tokens = [(tok, span[0], span[1] - 1) for tok, span in zip(tokens, spans)]
    return tokens
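# Usage sketch, assuming `Tokenizer` above is nltk's WhitespaceTokenizer (e.g. imported
# as `from nltk.tokenize import WhitespaceTokenizer as Tokenizer`) and `sub` is re.sub:
# tokenize("Hello, world.")  ->  [('Hello', 0, 4), ('world', 7, 11)]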
def __init__(self, fname):
    with open(fname, 'r') as f:
        self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ')
    self.tokenizer = WhitespaceTokenizer()
    self.word_list = self.tokenizer.tokenize(self.corpus_txt)
    self.lower_word_list = [w.lower() for w in self.word_list]
    self.word_dict_count = Counter(self.word_list)
def __init__(self):
    self.portugues_stemmer = RSLPStemmer()
    self.tokenizar = WhitespaceTokenizer()
    self.stopwords = stopwords.words('portuguese')
    self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
                            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta',
                            'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha',
                            'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim',
                            'oi', 'ola', 'ate']
    self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'),
                          ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                          ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'),
                          ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                          ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
                          ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]
def build_topn_best_words(self):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    positivecount = 0
    negativecount = 0
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if(row[0] == '4' and positivecount < self.corpuslength):
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                #print tweet
                for token in tokens:
                    word_fd.inc(token.lower())
                    label_word_fd['pos'].inc(token.lower())
                positivecount += 1
            # Negative sentiment tweets
            if(row[0] == '0' and negativecount < self.corpuslength):
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                #print tweet
                for token in tokens:
                    word_fd.inc(token.lower())
                    label_word_fd['neg'].inc(token.lower())
                negativecount += 1
    #print word_fd
    #print label_word_fd

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print "Positive Word Count:", pos_word_count, "Negative Word Count:", neg_word_count, "Total Word count:", total_word_count

    word_scores = {}
    for word, freq in word_fd.iteritems():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    print 'Best Words Count:', len(self.bestwords)  #, 'Best Words Set:', self.bestwords
class LimparTexto(object):

    def __init__(self):
        self.portugues_stemmer = RSLPStemmer()
        self.tokenizar = WhitespaceTokenizer()
        self.stopwords = stopwords.words('portuguese')
        self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
                                'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
                                'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
                                'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
                                'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
        self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
                              ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
                              ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
                              ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
                              ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
                              ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
                              ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]

    # Remove accents from the texts.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stop words, i.e. words that carry no meaning for our model.
    def removerStopWords(self, texto):
        # The decode is needed if latin-1 is used during mining.
        texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.stopwords])
        texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.mais_utilizadas])
        # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords])
        # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas])
        return texto

    # Tokenize the words on whitespace.
    def tokenizarPalavras(self, texto):
        texto = self.tokenizar.tokenize(texto)
        return texto

    # Removing punctuation is necessary because a word followed by punctuation would
    # differ from the same word without it.
    def removerPontuacao(self, texto):
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        texto = regex.sub('', texto)
        return texto

    # Remove word suffixes (stemming).
    def removerSufixo(self, para):
        text = ''
        for w in para:
            # text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' '
            text = text + self.portugues_stemmer.stem(w) + ' '
        return text

    def removerAcentos(self, texto):
        texto = unicode(texto, 'latin-1')
        para = unidecode.unidecode(texto)
        return para

    def removerCaracteresRepetidos(self, texto):
        texto = re.sub(r'([a-z])\1+', r'\1', texto)
        return texto
def analyize(self, text):
    try:
        unitext = any2unicode(text, encoding='utf8', errors='strict')
    except:
        print("Not utf-8")
        return []
    # convert to lower
    lowerText = unitext.lower()
    # Regex way: gives some text 'qwe (x)' as 'qwe' '(x)'
    # very aggressive regex... removes puncs and digits, keeps only alphabetic words
    tokenizer = WhitespaceTokenizer()
    regexTokens = tokenizer.tokenize(lowerText)
    p_stemmer = PorterStemmer()
    stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]
    stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w) > 1]
    return stemmedRemSingleLetterTokens
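# Usage sketch: lowercase, whitespace-tokenize, Porter-stem, and drop single-character
# tokens (assumes the containing class instance is available as `analyzer`):
# analyzer.analyize("The Cats were running quickly")
# -> roughly ['the', 'cat', 'were', 'run', 'quickli']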
def getToken(self, post):
    self.tokenizer = WhitespaceTokenizer()
    if post.text is not None and post.text != "":
        curtext = post.text.encode('utf-8')
        tokens = self.tokenize(curtext)
        tokens = self.normalizeTokens(tokens)
        tokens = self.stripSpecialChars(tokens)
        tokens = self.filterInvalid(tokens)
        tokens = self.calculateTf(tokens)
        return tokens
    return []
def process(self, text, plain_text=False):
    """
    Preprocessing: word tokenization and duplicate removal.
    Returns either plain text (for the shingling method) or the list of the
    text's tokens.

    Args:
        text ([type]): [description]
    """
    # text = text.encode('utf-8')
    # remove numbers, e-mail addresses and hyperlinks
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into words, strip the remaining punctuation and stopwords
    stop_words = set(stopwords.words('russian'))
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')
    punct_cleaned_text = clear_endings(text)  # strip sentence-final punctuation
    tokenized_text = tokenizer.tokenize(punct_cleaned_text)  # word split, for cleaning only
    stpw_clean_text = [word for word in tokenized_text if word not in stop_words]
    stemmed_text = [stemmer.stem(word) for word in stpw_clean_text]  # reduce each word to its stem

    if plain_text:
        clean_text = ' '.join(stemmed_text)  # re-join into a single string for hashing
    else:
        clean_text = stemmed_text  # otherwise return the list of tokens
    return clean_text
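# Usage sketch for near-duplicate detection (the clear_* helpers and the containing
# class come from this module; `preprocessor` is an illustrative instance name):
# tokens = preprocessor.process(raw_document)           # list of stemmed tokens
# shingles = set(zip(tokens, tokens[1:], tokens[2:]))   # word 3-shingles to hash and compare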
def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('english')
    self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'),
                          ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                          ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'),
                          ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                          ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
                          ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]
    self.link_patterns = [('http'), ('www'), ('w3c')]
    self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'), (r'eqe', 'ee'),
                    (r'oqo', 'oo'), (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'), (r'dqd', 'dd'),
                    (r'mqm', 'mm'), (r'nqn', 'nn'), (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                    (r'fqf', 'ff'), (r'lql', 'll')]
def test(self, featureselection):
    positiveTweets = []
    negativeTweets = []
    with open(r'..\polarityData\TweetCorpus\testdata.manual.2009.06.14.csv', 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if(row[0] == '4'):
                positiveTweets.append(utils.common.processTweetBlank(row[5]))
            # Negative sentiment tweets
            if(row[0] == '0'):
                negativeTweets.append(utils.common.processTweetBlank(row[5]))

    positiveTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos')
                            for tweet in positiveTweets]
    negativeTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg')
                            for tweet in negativeTweets]
    poscutoff = len(positiveTestFeatures)
    negcutoff = len(negativeTestFeatures)
    print "Test Pos Cutoff: " + str(poscutoff) + " Test Neg Cutoff: " + str(negcutoff)
    testfeatures = positiveTestFeatures[:poscutoff] + negativeTestFeatures[:negcutoff]
    #print testfeatures
    return (testfeatures)
def __init__(self, tokenizer=WhitespaceTokenizer(), sent_len=200):
    self.sent_len = sent_len
    self.tokenizer = tokenizer
    self.w2v_dim = 300
    this_dir = os.path.dirname(os.path.realpath(__file__))
    model_dir = this_dir + '/model'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    pretrained_path = model_dir + '/GoogleNews-vectors-negative300.bin.gz'
    if not os.path.exists(pretrained_path):
        raise Exception('pretrained vector file not exists: {}'.format(pretrained_path))
    print('..loading model')
    self.model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
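# Usage sketch: once the GoogleNews vectors are loaded, the gensim KeyedVectors object
# can be queried directly (`embedder` is an illustrative instance name):
# vec = embedder.model['king']                            # 300-dimensional numpy vector
# similar = embedder.model.most_similar('king', topn=5)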
def get_sentences_for_text(corpus_root, filename, lang='english'):
    """Segments the given text into sentences.

    Args:
        corpus_root: Directory in which the text file is residing.
        filename: Name of the text file.
        lang: Tokenizer language. For possible values, look at:
            ${NLTK_DATA}/tokenizers/punkt

    Returns:
        Sentences in the given text.
    """
    tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle'
    text = PlaintextCorpusReader(
        corpus_root, [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path))
    return text.sents()
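# Usage sketch with hypothetical paths: each returned sentence is a list of
# whitespace-separated tokens.
# sents = get_sentences_for_text('/data/corpus', 'article.txt', lang='english')
# for sent in sents[:3]:
#     print(sent)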
def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
    self.markov_dict = markov_dict
    self.gtype = self.markov_dict['gtype']
    self.stop_words = set(stopwords.words('english'))
    self.neighbor_dict = neighbor_dict
    self.tokenizer = WhitespaceTokenizer()
    self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
    self.lower_word_list = [w.lower() for w in self.word_list]
    # Count of word freq, maintaining case
    self.word_dict_count = Counter(self.word_list)
    self.truecaser = TrueCase(self.markov_dict['fname'])
    # Create priority and not_found_list if none were entered
    if priority_list:
        self.priority_list = priority_list
    else:
        self._make_priority()
    if not_found_list:
        self.not_found_list = not_found_list
    else:
        self._make_not_found()
class Command(BaseCommand): args = '<page_id> <method>' help = 'Computes graph data for the given page' def __init__(self, *args, **kwargs): super(Command, self).__init__(*args, **kwargs) self._log = logging.getLogger('cmd') def handle(self, *args, **options): if args is None or len(args) < 1: pages = Page.objects.all() for page in pages: self._log.info("Page #%s: %s" % (page.id, page.fb_page_name)) raise CommandError('Invalid arguments. Expected: <page_id>') page_id = args[0] self._log.info('GraphCommand initializing.') self._log.info('Page-Id: %s' % page_id) page = Page.objects.get(id=page_id) self.allTextGraph(page) #self.kpGraph(page) #self.buildGraph(page) self._log.info("All done for now.") def getNextIndex(self): self.nextFreeIndex = self.nextFreeIndex + 1 return self.nextFreeIndex - 1 def allTextGraph(self, page): pageowner = page.owner pageposts = Post.objects.filter(page__exact=page) self.stop_words = None self.idfCache = {} userterms = {} pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() ) pageusers_count = len(pageusers) print "Calculating vectors for %s users" % pageusers_count self.nextFreeIndex = 0 curuseridx = 0 for currentuser in pageusers: curuseridx = curuseridx + 1 print "tok+tf %s/%s" % (curuseridx, pageusers_count) terms = self.getUserTfVector(page, currentuser, pageposts) if not terms is None: userterms[currentuser.id] = terms print "Maximal index: %s" % self.nextFreeIndex self.postcount = len(pageposts) print "Calculating IDF, posts: %s, terms: %s" % (self.postcount, len(self.idfCache)) curuseridx = 0 terms_with_idf = {} for user_id in userterms: curuseridx = curuseridx + 1 print "idf %s/%s" % (curuseridx, pageusers_count) tokens = self.calculateIdf(userterms[user_id]) terms_with_idf[user_id] = tokens print "tfidf" curuseridx = 0 for user_id in terms_with_idf: curuseridx = curuseridx + 1 print "tfidf %s/%s" % (curuseridx, pageusers_count) tokens = self.calculateTfIdf(terms_with_idf[user_id]) userterms[user_id] = tokens del terms_with_idf print "Terms: %s" % len(self.idfCache) print "Calculating term IDs" termIds = self.calculateTermIds(userterms) uservectors = self.getUserVectors(userterms, termIds, len(self.idfCache), pageusers_count) userswithindex, usermatrix = self.getUserMatrix(uservectors) print "Creating graph" graph = nx.Graph() graph.add_nodes_from(pageusers) for i1 in range(usermatrix.shape[0]-1): max_edge = None max_edge_val = 0.0 for i2 in range(usermatrix.shape[0]-1): if i1 == i2: continue u1 = userswithindex[i1] u2 = userswithindex[i2] u1u2val = usermatrix[i1][i2] if u1u2val > max_edge_val: max_edge = u2 max_edge_val = u1u2val if max_edge_val > 0.0 and not max_edge is None: self.add_edge(graph, u1, max_edge) components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) ) self.removeSingletons(graph) print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) ) components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) self.deleteClusters(page) print "storing" cpage = page for compidx in range(len(components)-1): component = components[compidx] newcluster = UserCluster.objects.create(page=cpage) newcluster.save() tags = {} tagcounts = {} for user_id in component: adduser = pageusers.filter(id__exact=user_id)[0] newassoc = UserClusterAssoc.objects.create(cluster = newcluster, clusteruser = adduser) print user_id 
newassoc.save() for t, tfidf in userterms[user_id]: if not t in tagcounts: tagcounts[t] = 1.0 else: tagcounts[t] = tagcounts[t] + 1.0 if not t in tags: tags[t] = tfidf else: tags[t] = tags[t] + tfidf for t in tags.keys(): tweight = tags[t] / tagcounts[t] print t newterm = UserClusterTerm.objects.create(cluster = newcluster, clusterterm = t, termweight = tweight) newterm.save() print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(tags.keys()), ",".join(tags.keys())) def deleteClusters(self, page): print "cleaning" delclusters = 0 for currentcluster in UserCluster.objects.filter(page__exact=page): uca = UserClusterAssoc.objects.filter(cluster__exact=currentcluster) uca.delete() uct = UserClusterTerm.objects.filter(cluster__exact=currentcluster) uct.delete() currentcluster.delete() delclusters = delclusters + 1 print "Deleted %s clusters" % delclusters def getUserMatrix(self, uservectors): userswithindex = uservectors.keys() usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1]) u1idx = 0 for u1 in userswithindex: u2idx = 0 for u2 in userswithindex: u2idx = u2idx + 1 if u1 == u2: continue u1_vec = uservectors[u1][0] u2_vec = uservectors[u2][0] u1u2dot = np.dot(u1_vec, u2_vec) usermatrix[u1idx][u2idx] = u1u2dot u1idx = u1idx + 1 print "matrix %s/%s" % (u1idx, len(userswithindex)) return (userswithindex, usermatrix) def getUserVectors(self, userterms, termIds, vectorlen, pageusers_count): uservectors = {} curuseridx = 0 for user_id in userterms.keys(): curuseridx = curuseridx + 1 print "vec %s/%s" % (curuseridx, pageusers_count) currentvector = [0.0] * vectorlen terms = [] for w, tfidf in userterms[user_id]: terms.append(w) currentvector[ termIds[w] ] = tfidf uservectors[user_id] = (np.array(currentvector), terms) #print ", ".join(map(str, currentvector)) #print ", ".join(terms) return uservectors def calculateTermIds(self, userterms): next_id = 0 ids = {} for user_id in userterms: for w, tfidf in userterms[user_id]: if not w in ids: ids[w] = next_id next_id = next_id + 1 return ids def getIdf(self, term): if term in self.idfCache: return float(self.postcount) / self.idfCache[term] print "Missing IDF: %s " % term exit() def getUserTfVector(self, page, currentuser, pageposts): tok = {} for post in pageposts.filter(createuser__exact=currentuser): usertokens = self.getToken(post) for w, tf in usertokens: if not w in tok: tok[w] = tf else: tok[w] = tok[w] + tf return [(w, tok[w]) for w in tok] def getToken(self, post): self.tokenizer = WhitespaceTokenizer() if post.text is not None and post.text != "": curtext = post.text.encode('utf-8') tokens = self.tokenize(curtext) tokens = self.normalizeTokens(tokens) tokens = self.stripSpecialChars(tokens) tokens = self.filterInvalid(tokens) tokens = self.calculateTf(tokens) return tokens return [] def getTfIdf(self, w, tf, idf, tokens): return (tf * idf) / len(tokens) def calculateTfIdf(self, tokens): return [ (w, self.getTfIdf(w, tf, idf, tokens) ) for w, tf, idf in tokens ] # maximum normalized tf def calculateTf(self, tokens): if len(tokens) == 0: return [] seen = {} max_tf = 1.0 for w in tokens: if not w in seen: seen[w] = 1.0 if not w in self.idfCache: self.idfCache[w] = 1.0 else: self.idfCache[w] = self.idfCache[w] + 1.0 else: seen[w] = seen[w] + 1.0 if seen[w] > max_tf: max_tf = seen[w] res = [] for w in tokens: res.append( (w, seen[w] / max_tf) ) return res def calculateIdf(self, tokens): return [(w, tf, self.getIdf(w)) for w, tf in tokens] def filterInvalid(self, tokens): vt = [w for w in tokens if 
self.isValidTerm(w)] if vt is None: vt = [] return vt def tokenize(self, curtext): return [word for sent in nltk.sent_tokenize(curtext) for word in self.tokenizer.tokenize(sent)] def is_number(self, s): try: float(s) return True except ValueError: return False def is_stop_word(self, term): self.read_stop_words() return term in self.stop_words def read_stop_words(self): if not self.stop_words is None: return res = {} for word in open(os.path.join(settings.STATIC_ROOT, 'stop_words'), 'rt').read().split('\r\n'): if not word is None and word != '' and not word in res: res[word] = True self.stop_words = res def isValidTerm(self, term): if len(term) < 2: return False for t in [".", ",", "-", "+", "%", "?", "!", "$", "&", "/", "\"", "'", "`", "`", "|", ":", ";", ")", "(", "[", "]", "{", "}"]: if t in term: return False if self.is_number(term): return False if self.is_stop_word(term): return False try: term = term.decode('ascii') except: return False if term.find('.') > -1: # or term.find('/') > -1 or term.find("?"): # url parts return False return True def normalizeTokens(self, tokens): return [w.lower() for w in tokens] def stripSpecialChars(self, tokens): return [w.strip("\r\n.,-+%?!$&/\\'`|:;)([]{}\t\" ") for w in tokens] def kpGraph(self, page): # initialization self.nextFreeIndex = 0 self.tokenIndices = {} self.allTerms = [] pageowner = page.owner pageposts = Post.objects.filter(page__exact=page) pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() ) pageusers_count = len(pageusers) print "Calculating vectors for %s users" % pageusers_count kp_term_method = KeyphraseMethod.objects.get(name='pos_sequence') userterms = {} curuseridx = 0 for currentuser in pageusers: curuseridx = curuseridx + 1 print "%s/%s" % (curuseridx, pageusers_count) (terms, ids) = self.getUserVector(page, currentuser, kp_term_method) if not terms is None: userterms[currentuser.id] = (terms, ids) print "Maximal index: %s" % self.nextFreeIndex uservectors = {} vectorlen = self.nextFreeIndex for currentuser in userterms.keys(): terms, ids = userterms[currentuser] currentvector = [0.0] * vectorlen for i in range(len(ids)-1): currentvector[ids[i]] = 1.0 uservectors[currentuser] = (np.array(currentvector), terms) #print ", ".join(map(str, currentvector)) #print ", ".join(self.allTerms) userswithindex = uservectors.keys() usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1]) u1idx = 0 for u1 in userswithindex: u2idx = 0 for u2 in userswithindex: u2idx = u2idx + 1 if u1 == u2: continue u1_vec = uservectors[u1][0] u2_vec = uservectors[u2][0] u1u2dot = np.dot(u1_vec, u2_vec) usermatrix[u1idx][u2idx] = u1u2dot u1idx = u1idx + 1 print "%s/%s" % (u1idx, len(userswithindex)) print "Creating graph" graph = nx.Graph() graph.add_nodes_from(pageusers) for i1 in range(usermatrix.shape[0]-1): max_edge = None max_edge_val = 0.0 for i2 in range(usermatrix.shape[0]-1): if i1 == i2: continue u1 = userswithindex[i1] u2 = userswithindex[i2] u1u2val = usermatrix[i1][i2] if u1u2val > max_edge_val: max_edge = u2 max_edge_val = u1u2val if max_edge_val > 0.0 and not max_edge is None: self.add_edge(graph, u1, max_edge) components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) ) self.removeSingletons(graph) print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) ) components = nx.connected_components(graph) print "Number of connected 
components: %s" % len(components) for compidx in range(len(components)-1): component = components[compidx] taglist = [] for user_id in component: ut = userterms[user_id][0] for t in ut: if not t in taglist: taglist.append(t) print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(taglist), ",".join(taglist)) return def getIndex(self, token): if not token in self.tokenIndices: self.allTerms.append(token) self.tokenIndices[token] = self.getNextIndex() return self.tokenIndices[token] def getUserVector(self, page, currentuser, kp_term_method): user_posts = Post.objects.filter(page__exact=page, createuser__exact=currentuser) user_post_parents = Post.objects.filter(id__in=user_posts.values('parent').distinct()) user_kps = PostKeyphraseAssoc.objects.filter(post__in = user_posts, keyphrase__method__exact=kp_term_method) user_kp_count = len(user_kps) terms_all = [] terms_split = [] terms_n = user_kps.values('keyphrase__normalized').distinct() terms_t = user_kps.values('keyphrase__term').distinct() for term in terms_n: t = term['keyphrase__normalized'] if not t in terms_all: terms_all.append(t) for term in terms_t: t = term['keyphrase__term'] if not t in terms_all: terms_all.append(t) for term in terms_all: for term_part in term.split(" "): if not term_part in terms_split: terms_split.append(term_part) terms_all = terms_split #if (len(terms_all) > 0): # for thread_post in user_post_parents: # terms_all.append("POST%s" % (thread_post.id)) print "User: %s Posts: %s Keyphrases: %s" % ( currentuser, len(user_posts), user_kp_count ) print "Terms: %s" % ", ".join(terms_all) if user_kp_count == 0: return (None, None) res_terms = [] res_ids = [] for term in terms_all: term_idx = self.getIndex(term) res_terms.append(term) res_ids.append(term_idx) return (res_terms, res_ids) def add_edge(self, graph, obj_from, obj_to, add_weight=1.0): if not graph.has_edge(obj_from, obj_to): graph.add_edge(obj_from, obj_to, weight=add_weight) else: graph[obj_from][obj_to]['weight'] = graph[obj_from][obj_to]['weight'] + add_weight def addPostUser(self, graph, post, added_users): if not post.createuser in graph: graph.add_node(post.createuser) added_users.append(post.createuser) # edge: post -> createuser self.add_edge(graph, post, post.createuser) def addPostParent(self, graph, post): if not post.parent is None: if not post.parent in graph: graph.add_node(post.parent) self.add_edge(graph, post, post.parent) def addPostKeyPhrases(self, graph, post): # keyphrases in this post for pk in PostKeyphraseAssoc.objects.filter(post__exact=post): graph.add_node(pk.keyphrase) self.add_edge(graph, post, pk.keyphrase) def addUserMetaCategory(self, graph, user): metaentries = UserMeta.objects.filter(user__exact=user) for metaentry in metaentries: if metaentry is None: continue if metaentry.fb_category is None or metaentry.fb_category == '': continue nodeval = u'CAT_' + unicode(metaentry.fb_category) graph.add_node(nodeval) self.add_edge(graph, user, nodeval) def addUserMeta(self, graph, user): metaentries = UserMeta.objects.filter(user__exact=user) for metaentry in metaentries: if metaentry is None: continue nodeval = unicode(metaentry) graph.add_node(nodeval) self.add_edge(graph, user, nodeval) def removeNonConnectedUsers(self, graph, dist_threshold): components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) print "Removing non-connected user nodes" remove_nodes = [] for component in components: usernodes = [] userdists = {} for node in component: if type(node) == 
User: usernodes.append(node) u1idx = 0 ulen = len(usernodes) for u1 in usernodes: u1idx = u1idx + 1 print "%s/%s" % (u1idx, ulen) if not u1.id in userdists: userdists[u1.id] = 1000 for u2 in usernodes: if u1 == u2: continue pathres = nx.dijkstra_path_length(graph,u1,u2) if pathres < userdists[u1.id]: userdists[pathres] = pathres if userdists[u1.id] < dist_threshold: break # condition satisfied for user in usernodes: if userdists[user.id] > dist_threshold: # shortest path to another user is > 5 -> remove print "Removing user %s. Dist value: %s" % (user.id, userdists[user.id]) remove_nodes.append(user) print "Removing %s user nodes" % len(remove_nodes) graph.remove_nodes_from(remove_nodes) del remove_nodes def removeSingletons(self, graph): print "Removing singletons" singleton_nodes = [ n for n,d in graph.degree_iter() if d==0 ] graph.remove_nodes_from(singleton_nodes) del singleton_nodes def buildGraph(self, page): print "Building graph" pageowner = page.owner pageposts = Post.objects.filter(page__exact=page) graph = nx.Graph() #pageposts = pageposts[500:700] ########################################## print "nodes: posts" graph.add_nodes_from(pageposts) print "edges: user -> post" added_users = [] for post in pageposts: # post.createuser self.addPostUser(graph, post, added_users) # post->parent post self.addPostParent(graph, post) # post->postkeyphraseassoc->keyphrase self.addPostKeyPhrases(graph, post) # post.createuser->usermeta #self.addUserMeta(graph, post.createuser) #self.addUserMetaCategory(graph, post.createuser) print "Graph nodes: %s" % len(graph.nodes()) print "Graph edges: %s" % len(graph.edges()) print "Removing page owner" graph.remove_node(pageowner) print "Graph nodes: %s" % len(graph.nodes()) print "Graph edges: %s" % len(graph.edges()) self.removeSingletons(graph) components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) print "Removing components with only 0/1 user nodes" remove_components = [] for component in components: usercount = 0 for node in component: if type(node) == User: usercount = usercount + 1 if usercount <= 1: remove_components.append(component) else: print "Found %s user nodes" % usercount print "Removing %s components" % len(remove_components) for component in remove_components: graph.remove_nodes_from(component) del remove_components components = nx.connected_components(graph) print "Number of connected components: %s" % len(components) print "Edges: %s" % len(graph.edges()) remove_edges = [] weight_threshold = 2.0 for node_a, node_b, attr in sorted(graph.edges(data = True), key = lambda (a, b, attr): attr['weight']): if type(node_a) == Post or type(node_b) == Post: # exclude post connections continue if 'weight' in attr and attr['weight'] > weight_threshold: break remove_edges.append((node_a, node_b)) #print('{a} {b} {w}'.format(a = node_a, b = node_b, w = attr['weight'])) for node_a, node_b in remove_edges: graph.remove_edge(node_a, node_b) print "Edges: %s" % len(graph.edges()) self.removeSingletons(graph) print "Graph dotfile" nx.write_dot(graph, '/home/double/graph_viz.dot') tmp = [] for user in added_users: if user in graph: tmp.append(user) added_users = tmp print "Unique users in graph: %s" % len(added_users) usergraph = nx.Graph() usergraph.add_nodes_from(added_users) for user_a, user_b in combinations(added_users, 2): try: userpath = nx.shortest_path_length(graph, user_a, user_b, weight='weight') usergraph.add_edge(user_a, user_b, weight=userpath) print user_a, user_b, userpath except 
nx.NetworkXNoPath, e: #print e continue self.removeSingletons(usergraph) #print "Drawing graph" plt.ioff() #nx.draw(graph, node_size=10, font_size=8) #plt.savefig('/home/double/graph.png', dpi=1000) print "UserGraph nodes: %s" % len(usergraph.nodes()) print "UserGraph edges: %s" % len(usergraph.edges()) return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='../../data/frequency')
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--n', type=int, default=2)
    parser.add_argument('--file_suffix', default=None)
    parser.add_argument('--sample_pct', type=float, default=100)
    args = parser.parse_args()
    out_dir = args.out_dir
    comment_files = args.comment_files
    n = args.n
    file_suffix = args.file_suffix
    sample_pct = args.sample_pct
    if (comment_files is None):
        comment_files = get_all_comment_files()
    # replace with clean normalized (smaller vocab)
    comment_files = [f.replace('.bz2', '_clean_normalized.bz2') for f in comment_files]
    # start small
    # comment_files = comment_files[:1]
    # min_df = 5
    # min_tf = 10
    min_tf = 1
    stopwords = []
    tokenizer = WhitespaceTokenizer()
    # breaking memory
    # ngram_range = (1,3)
    # ngram_range = (2,3)
    # ngram_range = (2,2)
    # ngram_range = (1,1)
    # no CountVectorizer because memory and we don't need cooccurrence anyway
    # cv = CountVectorizer(min_df=min_df, tokenizer=tokenizer.tokenize,
    #                      stop_words=stopwords, ngram_range=ngram_range)
    date_format = '201[0-9]-[0-9]+'
    for f in comment_files:
        print('processing file %s' % (f))
        date_str = re.findall(date_format, f)[0]
        # for each level of ngram, recompute counts
        # for n in range(ngram_range[0], ngram_range[1]+1):
        print('computing ngram = %d' % (n))
        with BZ2File(f, 'r') as comment_file:
            # takes too long to generate full DTM...what do??
            # just compute counts
            comment_iter = make_iter(comment_file)
            counts = get_ngram_counts(comment_iter, n, tokenizer=tokenizer, sample_pct=sample_pct)
            # limit min_frequency?
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            # write to file
            # TOO MUCH SPACE => compress?
            if (file_suffix is not None):
                out_fname = os.path.join(out_dir, '%s_%dgram_tf_%s.tsv' % (date_str, n, file_suffix))
            else:
                out_fname = os.path.join(out_dir, '%s_%dgram_tf.tsv' % (date_str, n))
            counts.to_csv(out_fname, sep='\t')
from argparse import ArgumentParser
from collections import OrderedDict
from textblob import TextBlob
from nltk.util import bigrams
from multiprocessing import Pool
from traceback import format_exc
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.corpus import stopwords
from boto import connect_s3
import requests
import codecs
import traceback

stemmer = EnglishStemmer()
tokenizer = WhitespaceTokenizer()
stops = stopwords.words(u'english')


def get_args():
    ap = ArgumentParser()
    ap.add_argument(u'--num-processes', dest=u"num_processes", default=8, type=int)
    ap.add_argument(u'--solr-host', dest=u"solr_host", default=u"http://search-s10:8983")
    ap.add_argument(u'--outfile', dest=u'outfile', default=u'wiki_data.csv')
    ap.add_argument(u'--s3dest', dest=u's3dest')
    return ap.parse_args()
class MarkovChain(object): '''Create a MarkovChain from the given dictionary and parameters, run() returns a sentence given a seed markov_dict should be a MarkovDict().api dictionary''' def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None): self.markov_dict = markov_dict self.gtype = self.markov_dict['gtype'] self.stop_words = set(stopwords.words('english')) self.neighbor_dict = neighbor_dict self.tokenizer = WhitespaceTokenizer() self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt']) self.lower_word_list = [w.lower() for w in self.word_list] # Count of word freq, maintaining case self.word_dict_count = Counter(self.word_list) self.truecaser = TrueCase(self.markov_dict['fname']) # Create priority and not_found_list if none were entered if priority_list: self.priority_list = priority_list else: self._make_priority() if not_found_list: self.not_found_list = not_found_list else: self._make_not_found() def _make_priority(self, n=10): '''Return the n most common words in the corpus''' # Remove stop_words content = [w for w in self.lower_word_list if w not in self.stop_words] # Remove words that are only punctuation content_no_punc = [] for word in content: tmp = False for char in word: if char not in punctuation: tmp = True else: continue if tmp: content_no_punc.append(word) priority_dict = Counter(content_no_punc) self.priority_list = [key for key, val in priority_dict.most_common(n)] def _make_not_found(self, n=15): '''Return the n most common sentences in the corpus''' not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt'])) common_sent = [key for key, val in not_found_dict.most_common(n)] self.not_found_list = [] # Might fill with small stuff, don't let that happen for sent in common_sent: if len(sent) > 5: self.not_found_list.append(sent) def _get_input(self, input_phrase): '''Take in the raw input from the user''' # Lowercase and remove common punc input_phrase = input_phrase.lower() input_phrase = re.sub('\?', '', input_phrase) input_phrase = re.sub('\.', '', input_phrase) input_phrase = re.sub(',', '', input_phrase) input_phrase = re.sub('!', '', input_phrase) # List of words from a potential input phrase word_list = input_phrase.split() # Make a list of words that are in priority_list priority_words = [w for w in word_list if w in self.priority_list] # If no priority words, look for non stop words content = [w for w in word_list if w not in self.stop_words] # Look for priority words first, content second, and finally random if priority_words: seed = np.random.choice(priority_words) elif content: seed = np.random.choice(content) else: # Final option is a random word seed = np.random.choice(word_list) # if the words is not in text, find neighbors if not self._in_text(seed): seed = self._get_neighbor(seed) return seed def _in_text(self, word): '''Return true if word is in the corpus''' return word.lower() in set(self.lower_word_list) def _get_neighbor(self, seed): '''Return the nearest neighbor to seed from a database''' if not self.neighbor_dict: return None neighbors = self.neighbor_dict[seed] good_neighbors = [] for word in neighbors: if self._in_text(word): # Only pick a neighbor if in text good_neighbors.append(word) if good_neighbors: return np.random.choice(good_neighbors) else: return None def _generate_key(self, seed, dir_dict): '''Return key from a chosen seed''' key_list = [] for key in dir_dict: # Look at the last key_gram_size words in the key # First word in that key_gram_size len phrase must match seed 
if seed in key[-self.key_gram_size]: key_list.append(key) return key_list[np.random.choice(len(key_list))] def _run_chain(self, seed, dir_dict): '''Return a list of words generated from seed Iterate through dictionary until a period or capital is reached''' key = self._generate_key(seed, dir_dict) text = list(key[-self.key_gram_size:]) # If not end/begin of sent, run while True: # Values is a list of lists values = dir_dict[key] # Choose a value with probability equal to distribution in corpus value = values[np.random.choice(len(values))] if (() in value) | (value == ()): # End condition break # Add a value_gram_size phrase to the text words_from_value = value[:self.value_gram_size] text += words_from_value # Create new lookup key key = tuple(text[-self.markov_dict['gram_size']:]) return text def _get_sentence(self, seed): '''Return a sentence given a seed''' f_text = self._run_chain(seed, self.markov_dict['f_dict']) b_text = self._run_chain(seed, self.markov_dict['b_dict']) # b_text is backwards obviously, so turn it around b_text = list(reversed(b_text)) # Only include seed once sent = b_text[:-1] + f_text return sent def _get_sentence_str(self, sent): '''Return a string representation of a list''' if self.gtype != 'naive': sent = [w[0] for w in sent] text = ' '.join(sent) punc_w_space = [' ' + x for x in punctuation] for i in xrange(len(text)-1): if text[i:i+2] in punc_w_space: text = text[:i] + text[i+1:] return text def run(self, input_text, key_gram_size=2, value_gram_size=1): '''Return a sentence based on gram_size Larger gram_size is more deterministic phrases gram_size cannot be larger than gram_size''' self.key_gram_size = min(key_gram_size, self.markov_dict['gram_size']) self.value_gram_size = min(value_gram_size, self.markov_dict['gram_size']) while self.key_gram_size + self.value_gram_size < self.markov_dict['gram_size']: self.value_gram_size += 1 seed = self._get_input(input_text) # If seed not in corpus and no neighbor found, return random sent if not seed: return np.random.choice(self.not_found_list) sent = self._get_sentence(seed) # Turn into string for output sent_str = self._get_sentence_str(sent) # Fix space before punc output = self.truecaser.truecase(sent_str) return output
class TextCleaner(object): def __init__(self, use_unicode): self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)') self.repl = r'\1\2\3' self.tokenizer = WhitespaceTokenizer() self.cached_stopwords = stopwords.words('english') self.symbols = [ u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-", u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`", u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&", u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b", u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab" ] if use_unicode: self.accents = unicode_replace else: self.accents = ascii_replace self.link_patterns = [('http'), ('www'), ('w3c')] self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'), (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'), (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'), (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'), (r'fqf', 'ff'), (r'lql', 'll')] # Remover caracteres repetidos seguidamente, para que o modelo não seja prejudicado # por falta de padrão na escrita. def removeRepChar(self, word): repl_word = self.repeat_regexp.sub(self.repl, word) if repl_word != word: return self.removeRepChar(repl_word) else: return repl_word # Remover caracteres especiais (Ex: ?, /, " ...). def removeSymbols(self, text): for symbol in self.symbols: text = text.replace(symbol, ' ') return text # Substituir caracateres acentuados por caracteres sem acentos. def removeAccent(self, text): para = text for (lat, asc) in self.accents: para = para.replace(lat, asc) return para # Remover stopwords dos textos. def removeStopwords(self, text): text = ' '.join([ word for word in text.split() if word not in self.cached_stopwords ]) return text # Remover links dos textos. def removeLinks(self, text): for l in self.link_patterns: text = text.split(l, 1)[0] return text # Reescrever os digrafos na sua forma original. Exemplo: rxr -> rr def normalizeDigraph(self, text): for a, d in self.digraph: text = re.sub(a, d, text) return text # Reescrever algumas palavras para dar melhor semântica e legibilidade aos resultados do modelo. def normalizeText(self, text): for a, b in self.normal: text = re.sub(a, b, text) return text def removeOneCharacter(self, text): text = self.tokenizeWords(text) for i in range(len(text)): if len(text[i]) <= 2: text[i] = '' return ' '.join(text) def tokenizeWords(self, text): text = self.tokenizer.tokenize(text) return text
def classify(self, text):
    return (self.classifier.classify(WhitespaceTokenizer().tokenize(text)))
class TrueCase(object): '''True case from a corpus''' def __init__(self, fname): with open(fname, 'r') as f: self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ') self.tokenizer = WhitespaceTokenizer() self.word_list = self.tokenizer.tokenize(self.corpus_txt) self.lower_word_list = [w.lower() for w in self.word_list] self.word_dict_count = Counter(self.word_list) def truecase(self, sent): '''Return a true_cased sentence to look well formatted''' if isinstance(sent, basestring): sent = self.tokenizer.tokenize(sent) output = [] # If it appears capital more often, use that case for word in sent: capital = 0 lower = 0 all_caps = 0 try: lower += self.word_dict_count[word.lower()] except: lower += 0 try: capital += self.word_dict_count[word.capitalize()] except: capital += 0 try: all_caps += self.word_dict_count[word.upper()] except: all_caps += 0 # find max of those three options idx = np.argsort([all_caps, capital, lower])[-1] # If not found in dictionary, find original case if (all_caps + capital + lower) == 0: try: i = self.lower_word_list.index(word.lower()) output.append(self.word_list[i]) except: try: i = self.lower_word_list.index(word.lower().strip(punctuation)) output.append(self.word_list[i]) except: output.append(word) elif idx == 0: output.append(word.upper()) elif idx == 1: output.append(word.capitalize()) else: output.append(word) # sometimes sentence delimiters get picked up in the middle of words # they should only go at the end sent_str = ' '.join([x.strip('!?.') for x in output[:-1]]) + ' ' + output[-1] sent_str = sent_str[0].upper() + sent_str[1:] return sent_str def bulk_truecase(self, list_sent): '''Return a list of true_cased strings from an iterable''' output = [] for sent in list_sent: output.append(self.truecase(sent)) return output
class TextCleaner(object):

    def __init__(self, use_unicode=True):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
            u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
            u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b",
            u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f",
            u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
            u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad",
            u"\u00ab"
        ]
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito',
            'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        # unicode_replace / ascii_replace are accent tables defined at module level.
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = ['http', 'www', 'w3c', 'https']
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                       (r'bom demal', ' bomdemais '),
                       (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Collapse consecutively repeated characters so the model is not hurt by
    # inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, !, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Strip suffixes from Portuguese words (RSLP stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords (standard Portuguese list plus Twitter slang) from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split()
            if word not in self.cached_stopwords
        ])
        text = ' '.join(
            [word for word in text.split() if word not in self.more_stopwords])
        return text

    # Remove links from the text (everything from the link pattern onwards is dropped).
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite encoded digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give better semantics and readability to the model's output.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    # Despite the name, this drops every token of length <= 2.
    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
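# Hedged sketch (illustration, not from the original project): shows why the
# digraph encoding exists. Legitimate double letters are first encoded by the
# RegexpReplacer elsewhere in this repo (e.g. 'ss' -> 'sxs'), so removeRepChar
# can collapse noisy repetitions without destroying them; normalizeDigraph then
# restores the real double letters. Requires the NLTK 'stopwords' and 'rslp'
# resources; `unicode_replace` below is an empty stand-in table.
unicode_replace = []

cleaner = TextCleaner()
encoded = "pasxsa muuuuito tempo"      # 'passa' arrives pre-encoded as 'pasxsa'
collapsed = ' '.join(cleaner.removeRepChar(w) for w in cleaner.tokenizeWords(encoded))
print(cleaner.normalizeDigraph(collapsed))   # -> "passa muito tempo"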
def __init__(self):
    NltkTokenizer.__init__(self)
    _WhitespaceTokenizer.__init__(self)
def get_social_word_counts(social_var, vocab, comment_file, meta_file,
                           comment_thresh=10):
    """
    Compute, for each word in vocab, the number of unique social-variable
    values (users, subreddits, or threads) it occurs under, over all comments.

    Parameters:
    -----------
    social_var : str
    vocab : [str]
        Vocabulary to count.
    comment_file : str
    meta_file : str
        Tab-separated metadata file containing comment date, author,
        thread ID, and subreddit.
    comment_thresh : int
        Minimum number of comments for a social-variable value to be counted.

    Returns:
    --------
    social_word_counts : numpy.array
    """
    # Requires: numpy as np, Counter/defaultdict from collections, BZ2File from
    # bz2, CountVectorizer from sklearn, WhitespaceTokenizer from nltk, and the
    # project helper get_default_stopwords().
    # indices in the meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    # Only needed by the legacy DTM code kept (commented out) at the end.
    cv = CountVectorizer(encoding='utf-8',
                         lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         vocabulary=vocab,
                         binary=True)
    # keep it simple and store {word : {social value : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file, 'r') as metas:
        for i, (comment, meta) in enumerate(zip(comments, metas)):
            comment = comment.decode('utf-8')
            meta = meta.decode('utf-8').split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if i % 100000 == 0:
                print('processed %d comments' % i)
            # if(i == 500000):
            #     break
    # restrict to vocabulary words, drop social values below the comment
    # threshold, then count the remaining unique values per word
    social_word_counts = {
        w: d for w, d in social_word_counts.items() if w in vocab
    }
    social_word_counts = {
        w: {k: v for k, v in d.items() if v >= comment_thresh}
        for w, d in social_word_counts.items()
    }
    social_word_counts = {w: len(d) for w, d in social_word_counts.items()}
    social_word_counts = np.array([
        social_word_counts[v] if v in social_word_counts else 0. for v in vocab
    ])
    # old code for constructing a word/social DTM
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word
    # # combine all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
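# Hedged usage sketch (made-up file names and contents, not from the original
# project). Writes two tiny parallel bz2 files -- one comment per line, and a
# tab-separated metadata line (date, author, thread, subreddit) -- then counts
# how many distinct subreddits each vocabulary word appears in. Assumes the
# project helper get_default_stopwords() is importable.
import bz2

with bz2.open('comments.bz2', 'wt', encoding='utf-8') as f:
    f.write('the cat sat\nthe dog ran\n')
with bz2.open('meta.bz2', 'wt', encoding='utf-8') as f:
    f.write('2015-01\tuserA\tt1\taskscience\n2015-01\tuserB\tt2\tpics\n')

vocab = ['the', 'cat', 'dog']
counts = get_social_word_counts('subreddit', vocab, 'comments.bz2', 'meta.bz2',
                                comment_thresh=1)
for word, count in zip(vocab, counts):
    print(word, int(count))   # roughly: the 2, cat 1, dog 1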
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer  # only used by the commented-out Linux path; not available in recent NLTK releases
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt', word_tokenizer=PunktWordTokenizer(), sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # Holds the sentences annotated with POS tags.
tagger = UnigramTagger(name_tags)  # The UnigramTagger is trained on these tagged sentences.


class RegexpReplacer(object):

    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'), (r'no', 'no_'),
                                     (r'not', 'not_'), (r'RT ', ''),
                                     (r'rs[rs]+', 'rs'), (r'ha[ha]+', 'haha'),
                                     (r's[s]+', 'sxs'), (r'r[r]+', 'rxr'),
                                     (r'a[a]+', 'aqa'),
import nltk
from nltk import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer


def getUniqueWords(allWords):
    uniqueWords = []
    for i in allWords:
        if i not in uniqueWords:
            uniqueWords.append(i)
    return uniqueWords


text_str = open('corpus.txt').read()
tokens = WhitespaceTokenizer().tokenize(text_str)

print("\nInitial Statistics of the Corpus.")
print("#tokens: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
print(freq.most_common(10))

tokens = [token.lower() for token in tokens]

print("\nAfter Case Folding.")
print("#tokens: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
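# Hedged aside (not part of the original script): getUniqueWords above is
# O(n^2) because of the membership test on a list; for large corpora the same
# type count can be obtained in O(n) with a set.
def countTypes(tokens):
    return len(set(tokens))

# e.g. countTypes(['a', 'b', 'a']) == 2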
class NamedEntity(object):

    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    # Remove two or more consecutive capitalized words (likely proper names)
    # from the text. A word that is neither all upper-case nor all lower-case
    # is treated as capitalized.
    def removeName(self, text):
        words = text.split()
        i = 0
        j = 1
        lim = len(words) - 1
        while j <= lim:
            if not words[i].isupper() and not words[i].islower():
                if not words[j].isupper() and not words[j].islower():
                    words[i] = ''
                    words[j] = ''
            i += 1
            j += 1
        return ' '.join(words)

    # Remove proper names from the text. The text is tokenized and POS-tagged;
    # every word whose tag is the proper-noun tag 'NPROP' is dropped, and the
    # remaining words are returned as a single string.
    def removePersonName(self, text):
        final_text = ''
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        for w, t in tagged_text:
            if t != "NPROP":
                final_text = final_text + ''.join(w) + ' '
        return final_text

    # Remove user mentions from tweets, identified by the '@' character: the
    # text is split into words and every word starting with '@' is dropped.
    def removeTwitterUsername(self, text):
        words = [w for w in text.split() if not w.startswith('@')]
        return ' '.join(words)

    # Tag the words of a tokenized sentence with POS tags using the module-level
    # UnigramTagger. Returns a list of (word, tag) pairs.
    def tagWords(self, tokenized_text):
        tagged_words = tagger.tag(tokenized_text)
        return tagged_words

    # Draw a tree highlighting a given grammatical pattern in the text.
    def drawNamedEntityTree(self, text):
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        grammar = "ENT: {<PESSOA>*}"
        cp = RegexpParser(grammar)
        res = cp.parse(tagged_text)
        res.draw()

    # Tokenize sentences into words. Returns the list of words that make up the text.
    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
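# Hedged usage sketch (made-up training data, not the project's tagged corpus):
# trains a tiny UnigramTagger so the NamedEntity helpers can be exercised in
# isolation; `tagger` is the module-level name that tagWords() looks up.
from nltk.tag import UnigramTagger
from nltk.tokenize.regexp import WhitespaceTokenizer

tagger = UnigramTagger([[('Maria', 'NPROP'), ('foi', 'V'), ('ao', 'PREP'),
                         ('mercado', 'N')]])

ne = NamedEntity()
print(ne.removePersonName('Maria foi ao mercado'))      # -> 'foi ao mercado '
print(ne.removeTwitterUsername('oi @fulano tudo bem'))  # -> 'oi tudo bem'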
def __init__(self):
    self.tokenizer = WhitespaceTokenizer()
class TextCleaner(object):

    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.ascii_replace = [
            ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
            ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
            ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
            ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
            ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
            ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
            ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
        ]
        self.link_patterns = ['http', 'www', 'w3c']
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'tqt', 'tt'), (r'lql', 'll')]

    # Collapse consecutively repeated characters so the model is not hurt by
    # inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split()
            if word not in self.cached_stopwords
        ])
        return text

    # Remove links from the text (everything from the link pattern onwards is dropped).
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite encoded digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give better semantics and readability to the model's
    # output. Note: relies on a self.normal list of (pattern, replacement) pairs,
    # which this variant of the class does not define in __init__.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    # Despite the name, this drops every token of length <= 2.
    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text