def main():
    text = read_doc()
    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]
    text = [[token.lower() for token in sent] for sent in text]
    text = [[''.join(ch for ch in token if ch.isalpha() or ch == '\'')
             for token in sent] for sent in text]
    text = [[token for token in sent if 2 <= len(token) <= 35] for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if token not in stopwords] for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)
    # print(X.toarray())
    feature_names = vect.get_feature_names()
    # print(feature_names)

    from collections import Counter
    try:  # Python 2
        from itertools import izip
    except ImportError:  # Python 3
        izip = zip
    wfd = Counter({key: value
                   for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(chain.from_iterable(
        [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup)
              for tup in bfd]
    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])
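The snippet above scores word pairs with a mutinf helper that is not shown here. A minimal sketch of a segment-level PMI-style scorer with that call signature could look like the following; the exact formula used by the original code is an assumption.

from math import log

def mutinf(joint_count, count_x, count_y, n_segments):
    # Hypothetical pointwise-mutual-information score over segment counts
    # (assumed; the original mutinf is not included in the snippet above).
    # joint_count: segments containing both tokens,
    # count_x / count_y: segments containing each token,
    # n_segments: total number of segments.
    return log(joint_count * n_segments / float(count_x * count_y), 2)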
def __chunk_sentence(self, sentence):
    """Tokenize the sentence into words using a whitespace parser to avoid
    parsing couldn't into two tokens (could and n't).
    Then chunk the tokens according to GRAMMAR.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokens)
    return self.parser.parse(pos_tagged)
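As the docstring notes, a whitespace tokenizer keeps contractions such as "couldn't" in one piece, whereas NLTK's default Penn Treebank-style tokenizer splits them. A quick comparison (word_tokenize needs the punkt data downloaded):

from nltk import word_tokenize
from nltk.tokenize import WhitespaceTokenizer

sentence = "She couldn't have said it better."
print(WhitespaceTokenizer().tokenize(sentence))
# ['She', "couldn't", 'have', 'said', 'it', 'better.']
print(word_tokenize(sentence))
# ['She', 'could', "n't", 'have', 'said', 'it', 'better', '.']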
def processPost(self, post):
    tokenizer = WhitespaceTokenizer()
    if post.text is not None and post.text != "":
        curtext = post.text.encode('utf-8')
        tokens = [word for sent in nltk.sent_tokenize(curtext)
                  for word in tokenizer.tokenize(sent)]
        tokens = self.normalizeTokens(tokens)
        text = nltk.Text(tokens)
        self.processText(post, text)
class LimparTexto(object):

    def __init__(self):
        self.portugues_stemmer = RSLPStemmer()
        self.tokenizar = WhitespaceTokenizer()
        self.stopwords = stopwords.words('portuguese')
        self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso',
                                'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso',
                                'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq',
                                'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai',
                                'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
                                'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
        self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'),
                              ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'),
                              ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                              ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'),
                              ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
                              ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                              ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'),
                              ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'),
                              ('Ç', 'c')]

    # Remove accents from the text.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stop words, i.e. words that carry no meaning for our model.
    def removerStopWords(self, texto):
        # The decode is needed if latin-1 was used during mining.
        texto = ' '.join([word for word in texto.split()
                          if word.decode('latin-1') not in self.stopwords])
        texto = ' '.join([word for word in texto.split()
                          if word.decode('latin-1') not in self.mais_utilizadas])
        # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords])
        # texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas])
        return texto

    # Tokenize words on whitespace.
    def tokenizarPalavras(self, texto):
        texto = self.tokenizar.tokenize(texto)
        return texto

    # Punctuation must be removed because a word followed by punctuation
    # differs from the same word without it.
    def removerPontuacao(self, texto):
        regex = re.compile('[%s]' % re.escape(string.punctuation))
        texto = regex.sub('', texto)
        return texto

    # Remove word suffixes (stemming).
    def removerSufixo(self, para):
        text = ''
        for w in para:
            # text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' '
            text = text + self.portugues_stemmer.stem(w) + ' '
        return text

    def removerAcentos(self, texto):
        texto = unicode(texto, 'latin-1')
        para = unidecode.unidecode(texto)
        return para

    def removerCaracteresRepetidos(self, texto):
        texto = re.sub(r'([a-z])\1+', r'\1', texto)
        return texto
def _fit(self):
    '''Tokenize the documents, make backwards and forwards lists,
    call the make_dictionary method'''
    tokenizer = WhitespaceTokenizer()
    # Get the sentences from the corpus
    sent_list_of_str = sent_tokenize(self.corpus_txt.lower())
    # Capitalize and save the punctuation from the end
    sent_cap = [(sent.capitalize()[:-1], sent[-1]) for sent in sent_list_of_str]
    # Word tokenize to keep contractions, add back on punc
    self.f_sent = [tokenizer.tokenize(word_tuple[0]) + [word_tuple[1]]
                   for word_tuple in sent_cap]
    # Reverse those sentences
    self.b_sent = [list(reversed(word_list)) for word_list in self.f_sent]
    self.f_dict = self._make_dictionary(self.f_sent)
    self.b_dict = self._make_dictionary(self.b_sent)
def process(self, text):
    """Preprocessing: sentence tokenization and de-duplication.
    Returns a list of sentences (for the vector method, for future use).

    Args:
        text ([type]): [description]
    """
    # text = text.lower()
    # remove numbers, e-mail addresses and hyperlinks
    # text = text.encode('utf-8')
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    text = sentence_tokenizer.tokenize(text)

    cleaned_text = []
    stop_words = set(stopwords.words('russian'))

    # split into words, strip the remaining punctuation and stopwords
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')
    for sentence in text:
        punct_cleaned_sent = clear_endings(sentence)  # sentence-final punctuation
        tokenized_sent = tokenizer.tokenize(punct_cleaned_sent)  # split into words, for cleaning only
        stpw_clean_sentence = [word for word in tokenized_sent
                               if word not in stop_words]
        stemmed_sentence = [stemmer.stem(word)
                            for word in stpw_clean_sentence]  # reduce each word to its stem
        clean_sentence = ' '.join(stemmed_sentence)  # join back into a sentence string for hashing
        cleaned_text.append(clean_sentence)
    return cleaned_text
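The clear_emails, clear_url, clear_digits, clear_symb and clear_endings helpers come from elsewhere in that project and are not shown. A rough sketch of what such regex-based helpers could look like (these exact patterns are assumptions, not the original implementations):

import re

def clear_emails(text):
    # drop e-mail addresses (assumed pattern)
    return re.sub(r'\S+@\S+', ' ', text)

def clear_url(text):
    # drop http(s):// and www links (assumed pattern)
    return re.sub(r'(https?://\S+|www\.\S+)', ' ', text)

def clear_digits(text):
    # drop digit runs (assumed pattern)
    return re.sub(r'\d+', ' ', text)

def clear_symb(text):
    # keep only word characters and basic sentence punctuation (assumed pattern)
    return re.sub(r'[^\w\s.!?]', ' ', text)

def clear_endings(text):
    # strip sentence-final punctuation (assumed behaviour)
    return text.strip(' .!?')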
def tokenize(s):
    """
    Tokenize string.

    Function to tokenize text into words (tokens). Downloads the default
    NLTK tokenizer if it is not already on the machine.

    Args:
        - s: string with sentence to tokenize.

    Returns:
        - tokens: list of tuples (token, start-index, end-index)
    """
    text = sub(r"[,.:;'\"]", " ", s)
    tokenizer = Tokenizer()
    spans = tokenizer.span_tokenize(text)
    tokens = tokenizer.tokenize(text)
    tokens = [(t, s[0], s[1] - 1) for t, s in zip(tokens, spans)]
    return tokens
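For reference, span_tokenize yields (start, end) offsets with an exclusive end index, which is why the snippet above subtracts 1 to get an inclusive end position:

from nltk.tokenize import WhitespaceTokenizer

tok = WhitespaceTokenizer()
text = "Good muffins cost $3.88"
print(list(tok.span_tokenize(text)))   # [(0, 4), (5, 12), (13, 17), (18, 23)]
print(tok.tokenize(text))              # ['Good', 'muffins', 'cost', '$3.88']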
def analyize(self, text):
    try:
        unitext = any2unicode(text, encoding='utf8', errors='strict')
    except:
        print("Not utf-8")
        return []

    # convert to lower case
    lowerText = unitext.lower()

    # Regex way: gives some text 'qwe (x)' as 'qwe' '(x)'
    # very aggressive regex... removes punctuation and digits, keeps only alphabetic words
    tokenizer = WhitespaceTokenizer()
    regexTokens = tokenizer.tokenize(lowerText)

    p_stemmer = PorterStemmer()
    stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]
    stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w) > 1]
    return stemmedRemSingleLetterTokens
def process(self, text, plain_text=False):
    """Preprocessing: word tokenization and de-duplication.
    Returns plain text (for the shingle method) or the text's list of tokens.

    Args:
        text ([type]): [description]
    """
    # text = text.encode('utf-8')
    # remove numbers, e-mail addresses and hyperlinks
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into words, strip the remaining punctuation and stopwords
    stop_words = set(stopwords.words('russian'))
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')
    punct_cleaned_text = clear_endings(text)  # sentence-final punctuation
    tokenized_text = tokenizer.tokenize(punct_cleaned_text)  # split into words, for cleaning only
    stpw_clean_text = [word for word in tokenized_text if word not in stop_words]
    stemmed_text = [stemmer.stem(word) for word in stpw_clean_text]  # reduce each word to its stem

    clean_text = None
    if plain_text:
        clean_text = ' '.join(stemmed_text)  # join back into a single string for hashing
    else:
        clean_text = stemmed_text  # otherwise return the list of tokens
    return clean_text
class TextCleaner(object):

    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'),
                              ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'),
                              ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                              ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'),
                              ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
                              ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                              ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'),
                              ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'),
                              ('Ç', 'c')]
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Remove consecutively repeated characters so that inconsistent spelling
    # does not hurt the model.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([word for word in text.split()
                         if word not in self.cached_stopwords])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs in their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to improve the semantics and readability of the
    # model's results.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
class NamedEntity(object):

    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    # Remove two or more consecutive proper nouns from the text.
    def removeName(self, text):
        i = 0
        j = 1
        words = text.split()
        lim = len(words) - 1
        while j <= lim:
            if not words[i].isupper() and not words[i].islower():
                if not words[j].isupper() and not words[j].islower():
                    words[i] = words[i].replace(words[i], "")
                    words[j] = words[j].replace(words[j], "")
            i += 1
            j += 1
        words = ' '.join(words)
        return words

    # Remove proper names from the text. The text is split into words, which are
    # then POS-tagged. Each word/tag pair is checked against the proper-noun tag
    # 'NPROP'; the method returns the text with all 'NPROP' words removed.
    def removePersonName(self, text):
        final_text = ''
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        for w, t in tagged_text:
            if t != "NPROP":
                final_text = final_text + ''.join(w) + ' '
        return final_text

    # Remove Twitter user mentions, identified by the '@' character. The text is
    # split into words; every word that starts with '@' is removed, and the text
    # is returned without the usernames.
    def removeTwitterUsername(self, text):
        text = text.split()
        for w in text:
            if w[0] == '@':
                text.remove(w)
        return ' '.join(text)

    # Tag the words of a tokenized sentence with POS tags via the tag method of
    # the UnigramTagger. Returns a list of word/tag pairs.
    def tagWords(self, tokenized_text):
        tagged_words = tagger.tag(tokenized_text)
        return tagged_words

    # Draw a tree that highlights a given grammatical pattern in the text.
    def drawNamedEntityTree(self, text):
        tokenized_text = tokenizer.tokenize(text)
        tagged_text = self.tagWords(tokenized_text)
        grammar = "ENT: {<PESSOA>*}"
        cp = RegexpParser(grammar)
        res = cp.parse(tagged_text)
        res.draw()

    # Tokenize sentences into words. Returns a list of the words in the text.
    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
def get_social_word_counts(social_var, vocab, comment_file, meta_file,
                           comment_thresh=10):
    """
    Compute unique number of social vars per word in vocab over all comments.

    Parameters:
    -----------
    social_var : str
    vocab : [str]
        Vocabulary to count.
    comment_file : str
    meta_file : str
        Tab-separated metadata file containing comment date, author,
        thread ID, and subreddit.
    comment_thresh : int
        Minimum number of comments for a social var to be counted.

    Returns:
    --------
    social_var_counts : numpy.array
    """
    # indices in meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8', lowercase=True,
                         tokenizer=tokenizer.tokenize, stop_words=stopwords,
                         ngram_range=ngram_range, min_df=min_df,
                         vocabulary=vocab, binary=True)
    # keep it simple and store {vocab : {sub : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file, 'r') as metas:
        for i, (comment, meta) in enumerate(izip(comments, metas)):
            meta = meta.split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if (i % 100000 == 0):
                print('processed %d comments' % (i))
            # if(i == 500000):
            #     break
    social_word_counts = {w: d for w, d in social_word_counts.iteritems()
                          if w in vocab}
    social_word_counts = {w: {k: v for k, v in d.iteritems() if v >= comment_thresh}
                          for w, d in social_word_counts.iteritems()}
    social_word_counts = {w: len(d) for w, d in social_word_counts.iteritems()}
    social_word_counts = np.array([social_word_counts[v] if v in social_word_counts else 0.
                                   for v in vocab])
    # old code for constructing word/social dtm
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word
    # # combine all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
class TrueCase(object):
    '''True case from a corpus'''

    def __init__(self, fname):
        with open(fname, 'r') as f:
            self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ')
        self.tokenizer = WhitespaceTokenizer()
        self.word_list = self.tokenizer.tokenize(self.corpus_txt)
        self.lower_word_list = [w.lower() for w in self.word_list]
        self.word_dict_count = Counter(self.word_list)

    def truecase(self, sent):
        '''Return a true_cased sentence to look well formatted'''
        if isinstance(sent, basestring):
            sent = self.tokenizer.tokenize(sent)
        output = []
        # If it appears capital more often, use that case
        for word in sent:
            capital = 0
            lower = 0
            all_caps = 0
            try:
                lower += self.word_dict_count[word.lower()]
            except:
                lower += 0
            try:
                capital += self.word_dict_count[word.capitalize()]
            except:
                capital += 0
            try:
                all_caps += self.word_dict_count[word.upper()]
            except:
                all_caps += 0
            # find max of those three options
            idx = np.argsort([all_caps, capital, lower])[-1]
            # If not found in dictionary, find original case
            if (all_caps + capital + lower) == 0:
                try:
                    i = self.lower_word_list.index(word.lower())
                    output.append(self.word_list[i])
                except:
                    try:
                        i = self.lower_word_list.index(word.lower().strip(punctuation))
                        output.append(self.word_list[i])
                    except:
                        output.append(word)
            elif idx == 0:
                output.append(word.upper())
            elif idx == 1:
                output.append(word.capitalize())
            else:
                output.append(word)
        # sometimes sentence delimiters get picked up in the middle of words
        # they should only go at the end
        sent_str = ' '.join([x.strip('!?.') for x in output[:-1]]) + ' ' + output[-1]
        sent_str = sent_str[0].upper() + sent_str[1:]
        return sent_str

    def bulk_truecase(self, list_sent):
        '''Return a list of true_cased strings from an iterable'''
        output = []
        for sent in list_sent:
            output.append(self.truecase(sent))
        return output
class TextCleaner(object):

    def __init__(self, use_unicode=True):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`", u"``",
            u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&", u"*",
            u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b", u"\u2019",
            u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b", u"\u00b7",
            u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
            u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
            u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
        ]
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
            'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                       (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Remove consecutively repeated characters so that inconsistent spelling
    # does not hurt the model.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, !, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Remove suffixes from Portuguese words (stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([word for word in text.split()
                         if word not in self.cached_stopwords])
        text = ' '.join([word for word in text.split()
                         if word not in self.more_stopwords])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs in their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to improve the semantics and readability of the
    # model's results.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
class TextCleaner(object):

    def __init__(self, use_unicode):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`", u"``",
            u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&", u"*",
            u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
            u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
            u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
            u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Remove consecutively repeated characters so that inconsistent spelling
    # does not hurt the model.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, /, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([word for word in text.split()
                         if word not in self.cached_stopwords])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs in their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to improve the semantics and readability of the
    # model's results.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
class MarkovChain(object):
    '''Create a MarkovChain from the given dictionary and parameters,
    run() returns a sentence given a seed.
    markov_dict should be a MarkovDict().api dictionary'''

    def __init__(self, markov_dict, priority_list=None, not_found_list=None,
                 neighbor_dict=None):
        self.markov_dict = markov_dict
        self.gtype = self.markov_dict['gtype']
        self.stop_words = set(stopwords.words('english'))
        self.neighbor_dict = neighbor_dict
        self.tokenizer = WhitespaceTokenizer()
        self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
        self.lower_word_list = [w.lower() for w in self.word_list]
        # Count of word freq, maintaining case
        self.word_dict_count = Counter(self.word_list)
        self.truecaser = TrueCase(self.markov_dict['fname'])
        # Create priority and not_found_list if none were entered
        if priority_list:
            self.priority_list = priority_list
        else:
            self._make_priority()
        if not_found_list:
            self.not_found_list = not_found_list
        else:
            self._make_not_found()

    def _make_priority(self, n=10):
        '''Return the n most common words in the corpus'''
        # Remove stop_words
        content = [w for w in self.lower_word_list if w not in self.stop_words]
        # Remove words that are only punctuation
        content_no_punc = []
        for word in content:
            tmp = False
            for char in word:
                if char not in punctuation:
                    tmp = True
                else:
                    continue
            if tmp:
                content_no_punc.append(word)
        priority_dict = Counter(content_no_punc)
        self.priority_list = [key for key, val in priority_dict.most_common(n)]

    def _make_not_found(self, n=15):
        '''Return the n most common sentences in the corpus'''
        not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt']))
        common_sent = [key for key, val in not_found_dict.most_common(n)]
        self.not_found_list = []
        # Might fill with small stuff, don't let that happen
        for sent in common_sent:
            if len(sent) > 5:
                self.not_found_list.append(sent)

    def _get_input(self, input_phrase):
        '''Take in the raw input from the user'''
        # Lowercase and remove common punc
        input_phrase = input_phrase.lower()
        input_phrase = re.sub('\?', '', input_phrase)
        input_phrase = re.sub('\.', '', input_phrase)
        input_phrase = re.sub(',', '', input_phrase)
        input_phrase = re.sub('!', '', input_phrase)
        # List of words from a potential input phrase
        word_list = input_phrase.split()
        # Make a list of words that are in priority_list
        priority_words = [w for w in word_list if w in self.priority_list]
        # If no priority words, look for non stop words
        content = [w for w in word_list if w not in self.stop_words]
        # Look for priority words first, content second, and finally random
        if priority_words:
            seed = np.random.choice(priority_words)
        elif content:
            seed = np.random.choice(content)
        else:
            # Final option is a random word
            seed = np.random.choice(word_list)
        # if the word is not in the text, find neighbors
        if not self._in_text(seed):
            seed = self._get_neighbor(seed)
        return seed

    def _in_text(self, word):
        '''Return true if word is in the corpus'''
        return word.lower() in set(self.lower_word_list)

    def _get_neighbor(self, seed):
        '''Return the nearest neighbor to seed from a database'''
        if not self.neighbor_dict:
            return None
        neighbors = self.neighbor_dict[seed]
        good_neighbors = []
        for word in neighbors:
            if self._in_text(word):
                # Only pick a neighbor if in text
                good_neighbors.append(word)
        if good_neighbors:
            return np.random.choice(good_neighbors)
        else:
            return None

    def _generate_key(self, seed, dir_dict):
        '''Return key from a chosen seed'''
        key_list = []
        for key in dir_dict:
            # Look at the last key_gram_size words in the key
            # First word in that key_gram_size len phrase must match seed
            if seed in key[-self.key_gram_size]:
                key_list.append(key)
        return key_list[np.random.choice(len(key_list))]

    def _run_chain(self, seed, dir_dict):
        '''Return a list of words generated from seed.
        Iterate through dictionary until a period or capital is reached'''
        key = self._generate_key(seed, dir_dict)
        text = list(key[-self.key_gram_size:])
        # If not end/begin of sent, run
        while True:
            # Values is a list of lists
            values = dir_dict[key]
            # Choose a value with probability equal to distribution in corpus
            value = values[np.random.choice(len(values))]
            if (() in value) | (value == ()):
                # End condition
                break
            # Add a value_gram_size phrase to the text
            words_from_value = value[:self.value_gram_size]
            text += words_from_value
            # Create new lookup key
            key = tuple(text[-self.markov_dict['gram_size']:])
        return text

    def _get_sentence(self, seed):
        '''Return a sentence given a seed'''
        f_text = self._run_chain(seed, self.markov_dict['f_dict'])
        b_text = self._run_chain(seed, self.markov_dict['b_dict'])
        # b_text is backwards obviously, so turn it around
        b_text = list(reversed(b_text))
        # Only include seed once
        sent = b_text[:-1] + f_text
        return sent

    def _get_sentence_str(self, sent):
        '''Return a string representation of a list'''
        if self.gtype != 'naive':
            sent = [w[0] for w in sent]
        text = ' '.join(sent)
        punc_w_space = [' ' + x for x in punctuation]
        for i in xrange(len(text) - 1):
            if text[i:i + 2] in punc_w_space:
                text = text[:i] + text[i + 1:]
        return text

    def run(self, input_text, key_gram_size=2, value_gram_size=1):
        '''Return a sentence based on gram_size.
        Larger gram_size gives more deterministic phrases;
        key_gram_size cannot be larger than gram_size'''
        self.key_gram_size = min(key_gram_size, self.markov_dict['gram_size'])
        self.value_gram_size = min(value_gram_size, self.markov_dict['gram_size'])
        while self.key_gram_size + self.value_gram_size < self.markov_dict['gram_size']:
            self.value_gram_size += 1
        seed = self._get_input(input_text)
        # If seed not in corpus and no neighbor found, return random sent
        if not seed:
            return np.random.choice(self.not_found_list)
        sent = self._get_sentence(seed)
        # Turn into string for output
        sent_str = self._get_sentence_str(sent)
        # Fix space before punc
        output = self.truecaser.truecase(sent_str)
        return output
class Command(BaseCommand):
    args = '<page_id> <method>'
    help = 'Computes graph data for the given page'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self._log = logging.getLogger('cmd')

    def handle(self, *args, **options):
        if args is None or len(args) < 1:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id>')
        page_id = args[0]
        self._log.info('GraphCommand initializing.')
        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)
        self.allTextGraph(page)
        #self.kpGraph(page)
        #self.buildGraph(page)
        self._log.info("All done for now.")

    def getNextIndex(self):
        self.nextFreeIndex = self.nextFreeIndex + 1
        return self.nextFreeIndex - 1

    def allTextGraph(self, page):
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)
        self.stop_words = None
        self.idfCache = {}
        userterms = {}
        pageusers = User.objects.filter(id__in=pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct())
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count
        self.nextFreeIndex = 0
        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "tok+tf %s/%s" % (curuseridx, pageusers_count)
            terms = self.getUserTfVector(page, currentuser, pageposts)
            if not terms is None:
                userterms[currentuser.id] = terms
        print "Maximal index: %s" % self.nextFreeIndex
        self.postcount = len(pageposts)
        print "Calculating IDF, posts: %s, terms: %s" % (self.postcount, len(self.idfCache))
        curuseridx = 0
        terms_with_idf = {}
        for user_id in userterms:
            curuseridx = curuseridx + 1
            print "idf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateIdf(userterms[user_id])
            terms_with_idf[user_id] = tokens
        print "tfidf"
        curuseridx = 0
        for user_id in terms_with_idf:
            curuseridx = curuseridx + 1
            print "tfidf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateTfIdf(terms_with_idf[user_id])
            userterms[user_id] = tokens
        del terms_with_idf
        print "Terms: %s" % len(self.idfCache)
        print "Calculating term IDs"
        termIds = self.calculateTermIds(userterms)
        uservectors = self.getUserVectors(userterms, termIds, len(self.idfCache), pageusers_count)
        userswithindex, usermatrix = self.getUserMatrix(uservectors)
        print "Creating graph"
        graph = nx.Graph()
        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0] - 1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0] - 1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val
            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % (len(graph.nodes()), len(graph.edges()))
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % (len(graph.nodes()), len(graph.edges()))
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        self.deleteClusters(page)
        print "storing"
        cpage = page
        for compidx in range(len(components) - 1):
            component = components[compidx]
            newcluster = UserCluster.objects.create(page=cpage)
            newcluster.save()
            tags = {}
            tagcounts = {}
            for user_id in component:
                adduser = pageusers.filter(id__exact=user_id)[0]
                newassoc = UserClusterAssoc.objects.create(cluster=newcluster, clusteruser=adduser)
                print user_id
                newassoc.save()
                for t, tfidf in userterms[user_id]:
                    if not t in tagcounts:
                        tagcounts[t] = 1.0
                    else:
                        tagcounts[t] = tagcounts[t] + 1.0
                    if not t in tags:
                        tags[t] = tfidf
                    else:
                        tags[t] = tags[t] + tfidf
            for t in tags.keys():
                tweight = tags[t] / tagcounts[t]
                print t
                newterm = UserClusterTerm.objects.create(cluster=newcluster, clusterterm=t, termweight=tweight)
                newterm.save()
            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(tags.keys()), ",".join(tags.keys()))

    def deleteClusters(self, page):
        print "cleaning"
        delclusters = 0
        for currentcluster in UserCluster.objects.filter(page__exact=page):
            uca = UserClusterAssoc.objects.filter(cluster__exact=currentcluster)
            uca.delete()
            uct = UserClusterTerm.objects.filter(cluster__exact=currentcluster)
            uct.delete()
            currentcluster.delete()
            delclusters = delclusters + 1
        print "Deleted %s clusters" % delclusters

    def getUserMatrix(self, uservectors):
        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex) + 1, len(userswithindex) + 1])
        u1idx = 0
        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue
                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot
            u1idx = u1idx + 1
            print "matrix %s/%s" % (u1idx, len(userswithindex))
        return (userswithindex, usermatrix)

    def getUserVectors(self, userterms, termIds, vectorlen, pageusers_count):
        uservectors = {}
        curuseridx = 0
        for user_id in userterms.keys():
            curuseridx = curuseridx + 1
            print "vec %s/%s" % (curuseridx, pageusers_count)
            currentvector = [0.0] * vectorlen
            terms = []
            for w, tfidf in userterms[user_id]:
                terms.append(w)
                currentvector[termIds[w]] = tfidf
            uservectors[user_id] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(terms)
        return uservectors

    def calculateTermIds(self, userterms):
        next_id = 0
        ids = {}
        for user_id in userterms:
            for w, tfidf in userterms[user_id]:
                if not w in ids:
                    ids[w] = next_id
                    next_id = next_id + 1
        return ids

    def getIdf(self, term):
        if term in self.idfCache:
            return float(self.postcount) / self.idfCache[term]
        print "Missing IDF: %s " % term
        exit()

    def getUserTfVector(self, page, currentuser, pageposts):
        tok = {}
        for post in pageposts.filter(createuser__exact=currentuser):
            usertokens = self.getToken(post)
            for w, tf in usertokens:
                if not w in tok:
                    tok[w] = tf
                else:
                    tok[w] = tok[w] + tf
        return [(w, tok[w]) for w in tok]

    def getToken(self, post):
        self.tokenizer = WhitespaceTokenizer()
        if post.text is not None and post.text != "":
            curtext = post.text.encode('utf-8')
            tokens = self.tokenize(curtext)
            tokens = self.normalizeTokens(tokens)
            tokens = self.stripSpecialChars(tokens)
            tokens = self.filterInvalid(tokens)
            tokens = self.calculateTf(tokens)
            return tokens
        return []

    def getTfIdf(self, w, tf, idf, tokens):
        return (tf * idf) / len(tokens)

    def calculateTfIdf(self, tokens):
        return [(w, self.getTfIdf(w, tf, idf, tokens)) for w, tf, idf in tokens]

    # maximum normalized tf
    def calculateTf(self, tokens):
        if len(tokens) == 0:
            return []
        seen = {}
        max_tf = 1.0
        for w in tokens:
            if not w in seen:
                seen[w] = 1.0
                if not w in self.idfCache:
                    self.idfCache[w] = 1.0
                else:
                    self.idfCache[w] = self.idfCache[w] + 1.0
            else:
                seen[w] = seen[w] + 1.0
                if seen[w] > max_tf:
                    max_tf = seen[w]
        res = []
        for w in tokens:
            res.append((w, seen[w] / max_tf))
        return res

    def calculateIdf(self, tokens):
        return [(w, tf, self.getIdf(w)) for w, tf in tokens]

    def filterInvalid(self, tokens):
        vt = [w for w in tokens if self.isValidTerm(w)]
        if vt is None:
            vt = []
        return vt

    def tokenize(self, curtext):
        return [word for sent in nltk.sent_tokenize(curtext)
                for word in self.tokenizer.tokenize(sent)]

    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def is_stop_word(self, term):
        self.read_stop_words()
        return term in self.stop_words

    def read_stop_words(self):
        if not self.stop_words is None:
            return
        res = {}
        for word in open(os.path.join(settings.STATIC_ROOT, 'stop_words'), 'rt').read().split('\r\n'):
            if not word is None and word != '' and not word in res:
                res[word] = True
        self.stop_words = res

    def isValidTerm(self, term):
        if len(term) < 2:
            return False
        for t in [".", ",", "-", "+", "%", "?", "!", "$", "&", "/", "\"", "'",
                  "`", "`", "|", ":", ";", ")", "(", "[", "]", "{", "}"]:
            if t in term:
                return False
        if self.is_number(term):
            return False
        if self.is_stop_word(term):
            return False
        try:
            term = term.decode('ascii')
        except:
            return False
        if term.find('.') > -1:  # or term.find('/') > -1 or term.find("?"):  # url parts
            return False
        return True

    def normalizeTokens(self, tokens):
        return [w.lower() for w in tokens]

    def stripSpecialChars(self, tokens):
        return [w.strip("\r\n.,-+%?!$&/\\'`|:;)([]{}\t\" ") for w in tokens]

    def kpGraph(self, page):
        # initialization
        self.nextFreeIndex = 0
        self.tokenIndices = {}
        self.allTerms = []
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)
        pageusers = User.objects.filter(id__in=pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct())
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count
        kp_term_method = KeyphraseMethod.objects.get(name='pos_sequence')
        userterms = {}
        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "%s/%s" % (curuseridx, pageusers_count)
            (terms, ids) = self.getUserVector(page, currentuser, kp_term_method)
            if not terms is None:
                userterms[currentuser.id] = (terms, ids)
        print "Maximal index: %s" % self.nextFreeIndex
        uservectors = {}
        vectorlen = self.nextFreeIndex
        for currentuser in userterms.keys():
            terms, ids = userterms[currentuser]
            currentvector = [0.0] * vectorlen
            for i in range(len(ids) - 1):
                currentvector[ids[i]] = 1.0
            uservectors[currentuser] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(self.allTerms)
        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex) + 1, len(userswithindex) + 1])
        u1idx = 0
        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue
                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot
            u1idx = u1idx + 1
            print "%s/%s" % (u1idx, len(userswithindex))
        print "Creating graph"
        graph = nx.Graph()
        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0] - 1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0] - 1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val
            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % (len(graph.nodes()), len(graph.edges()))
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % (len(graph.nodes()), len(graph.edges()))
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        for compidx in range(len(components) - 1):
            component = components[compidx]
            taglist = []
            for user_id in component:
                ut = userterms[user_id][0]
                for t in ut:
                    if not t in taglist:
                        taglist.append(t)
            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(taglist), ",".join(taglist))
        return

    def getIndex(self, token):
        if not token in self.tokenIndices:
            self.allTerms.append(token)
            self.tokenIndices[token] = self.getNextIndex()
        return self.tokenIndices[token]

    def getUserVector(self, page, currentuser, kp_term_method):
        user_posts = Post.objects.filter(page__exact=page, createuser__exact=currentuser)
        user_post_parents = Post.objects.filter(id__in=user_posts.values('parent').distinct())
        user_kps = PostKeyphraseAssoc.objects.filter(post__in=user_posts, keyphrase__method__exact=kp_term_method)
        user_kp_count = len(user_kps)
        terms_all = []
        terms_split = []
        terms_n = user_kps.values('keyphrase__normalized').distinct()
        terms_t = user_kps.values('keyphrase__term').distinct()
        for term in terms_n:
            t = term['keyphrase__normalized']
            if not t in terms_all:
                terms_all.append(t)
        for term in terms_t:
            t = term['keyphrase__term']
            if not t in terms_all:
                terms_all.append(t)
        for term in terms_all:
            for term_part in term.split(" "):
                if not term_part in terms_split:
                    terms_split.append(term_part)
        terms_all = terms_split
        #if (len(terms_all) > 0):
        #    for thread_post in user_post_parents:
        #        terms_all.append("POST%s" % (thread_post.id))
        print "User: %s Posts: %s Keyphrases: %s" % (currentuser, len(user_posts), user_kp_count)
        print "Terms: %s" % ", ".join(terms_all)
        if user_kp_count == 0:
            return (None, None)
        res_terms = []
        res_ids = []
        for term in terms_all:
            term_idx = self.getIndex(term)
            res_terms.append(term)
            res_ids.append(term_idx)
        return (res_terms, res_ids)

    def add_edge(self, graph, obj_from, obj_to, add_weight=1.0):
        if not graph.has_edge(obj_from, obj_to):
            graph.add_edge(obj_from, obj_to, weight=add_weight)
        else:
            graph[obj_from][obj_to]['weight'] = graph[obj_from][obj_to]['weight'] + add_weight

    def addPostUser(self, graph, post, added_users):
        if not post.createuser in graph:
            graph.add_node(post.createuser)
            added_users.append(post.createuser)
        # edge: post -> createuser
        self.add_edge(graph, post, post.createuser)

    def addPostParent(self, graph, post):
        if not post.parent is None:
            if not post.parent in graph:
                graph.add_node(post.parent)
            self.add_edge(graph, post, post.parent)

    def addPostKeyPhrases(self, graph, post):
        # keyphrases in this post
        for pk in PostKeyphraseAssoc.objects.filter(post__exact=post):
            graph.add_node(pk.keyphrase)
            self.add_edge(graph, post, pk.keyphrase)

    def addUserMetaCategory(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            if metaentry.fb_category is None or metaentry.fb_category == '':
                continue
            nodeval = u'CAT_' + unicode(metaentry.fb_category)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def addUserMeta(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            nodeval = unicode(metaentry)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def removeNonConnectedUsers(self, graph, dist_threshold):
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Removing non-connected user nodes"
        remove_nodes = []
        for component in components:
            usernodes = []
            userdists = {}
            for node in component:
                if type(node) == User:
                    usernodes.append(node)
            u1idx = 0
            ulen = len(usernodes)
            for u1 in usernodes:
                u1idx = u1idx + 1
                print "%s/%s" % (u1idx, ulen)
                if not u1.id in userdists:
                    userdists[u1.id] = 1000
                for u2 in usernodes:
                    if u1 == u2:
                        continue
                    pathres = nx.dijkstra_path_length(graph, u1, u2)
                    if pathres < userdists[u1.id]:
                        userdists[pathres] = pathres
                    if userdists[u1.id] < dist_threshold:
                        break  # condition satisfied
            for user in usernodes:
                if userdists[user.id] > dist_threshold:
                    # shortest path to another user is > 5 -> remove
                    print "Removing user %s. Dist value: %s" % (user.id, userdists[user.id])
                    remove_nodes.append(user)
        print "Removing %s user nodes" % len(remove_nodes)
        graph.remove_nodes_from(remove_nodes)
        del remove_nodes

    def removeSingletons(self, graph):
        print "Removing singletons"
        singleton_nodes = [n for n, d in graph.degree_iter() if d == 0]
        graph.remove_nodes_from(singleton_nodes)
        del singleton_nodes

    def buildGraph(self, page):
        print "Building graph"
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)
        graph = nx.Graph()
        #pageposts = pageposts[500:700] ##########################################
        print "nodes: posts"
        graph.add_nodes_from(pageposts)
        print "edges: user -> post"
        added_users = []
        for post in pageposts:
            # post.createuser
            self.addPostUser(graph, post, added_users)
            # post -> parent post
            self.addPostParent(graph, post)
            # post -> postkeyphraseassoc -> keyphrase
            self.addPostKeyPhrases(graph, post)
            # post.createuser -> usermeta
            #self.addUserMeta(graph, post.createuser)
            #self.addUserMetaCategory(graph, post.createuser)
        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())
        print "Removing page owner"
        graph.remove_node(pageowner)
        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())
        self.removeSingletons(graph)
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Removing components with only 0/1 user nodes"
        remove_components = []
        for component in components:
            usercount = 0
            for node in component:
                if type(node) == User:
                    usercount = usercount + 1
            if usercount <= 1:
                remove_components.append(component)
            else:
                print "Found %s user nodes" % usercount
        print "Removing %s components" % len(remove_components)
        for component in remove_components:
            graph.remove_nodes_from(component)
        del remove_components
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Edges: %s" % len(graph.edges())
        remove_edges = []
        weight_threshold = 2.0
        for node_a, node_b, attr in sorted(graph.edges(data=True),
                                           key=lambda (a, b, attr): attr['weight']):
            if type(node_a) == Post or type(node_b) == Post:
                # exclude post connections
                continue
            if 'weight' in attr and attr['weight'] > weight_threshold:
                break
            remove_edges.append((node_a, node_b))
            #print('{a} {b} {w}'.format(a = node_a, b = node_b, w = attr['weight']))
        for node_a, node_b in remove_edges:
            graph.remove_edge(node_a, node_b)
        print "Edges: %s" % len(graph.edges())
        self.removeSingletons(graph)
        print "Graph dotfile"
        nx.write_dot(graph, '/home/double/graph_viz.dot')
        tmp = []
        for user in added_users:
            if user in graph:
                tmp.append(user)
        added_users = tmp
        print "Unique users in graph: %s" % len(added_users)
        usergraph = nx.Graph()
        usergraph.add_nodes_from(added_users)
        for user_a, user_b in combinations(added_users, 2):
            try:
                userpath = nx.shortest_path_length(graph, user_a, user_b, weight='weight')
                usergraph.add_edge(user_a, user_b, weight=userpath)
                print user_a, user_b, userpath
            except nx.NetworkXNoPath, e:
                #print e
                continue
        self.removeSingletons(usergraph)
        #print "Drawing graph"
        plt.ioff()
        #nx.draw(graph, node_size=10, font_size=8)
        #plt.savefig('/home/double/graph.png', dpi=1000)
        print "UserGraph nodes: %s" % len(usergraph.nodes())
        print "UserGraph edges: %s" % len(usergraph.edges())
        return
class TextCleaner(object):

    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
            'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'),
                              ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'),
                              ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'),
                              ('ú', 'u'), ('ç', 'c'), ('ä', 'a'), ('ë', 'e'),
                              ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
                              ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'),
                              ('È', 'e'), ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'),
                              ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'),
                              ('Ç', 'c')]
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                       (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Remove consecutively repeated characters so that inconsistent spelling
    # does not hurt the model.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove suffixes from Portuguese words (stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the text.
    def removeStopwords(self, text):
        text = ' '.join([word for word in text.split()
                         if word not in self.cached_stopwords])
        text = ' '.join([word for word in text.split()
                         if word not in self.more_stopwords])
        return text

    # Remove links from the text.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite digraphs in their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to improve the semantics and readability of the
    # model's results.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text