def corpus_statistics():
    #train_corpus_path = "/userstore/jieg/credbank/corpus/credbank_train_corpus.txt"
    train_corpus_path = "C:\\Data\\credbank\\tweets_corpus\\shuffled_credbank_held_corpus.txt"
    with open(train_corpus_path, mode='r', encoding='utf-8') as file:
        train_corpus = file.readlines()

    from nltk.tokenize.regexp import WhitespaceTokenizer
    whitespace_tokenize = WhitespaceTokenizer().tokenize
    corpus_size = 0
    for tweet in train_corpus:
        tokens = whitespace_tokenize(tweet)
        corpus_size += len(tokens)

    print("all words (corpus size): ", corpus_size)

    from sklearn.feature_extraction.text import CountVectorizer

    #extract tokens
    text_vectorizer = CountVectorizer(analyzer='word',
                                      tokenizer=WhitespaceTokenizer().tokenize,
                                      ngram_range=(1, 1),
                                      min_df=1)
    X = text_vectorizer.fit_transform(train_corpus)
    # Vocabulary
    vocab = list(text_vectorizer.get_feature_names())
    print("vocabulary size: ", len(vocab))  # 913611
    counts = X.sum(axis=0).A1

    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))

    print("top N frequent words: ", freq_distribution.most_common(10))
Example #2
 def __init__(self):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.pt_stemmer = nltk.stem.RSLPStemmer()
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('portuguese')
     self.more_stopwords = [
         'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
         'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
         'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
         'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
         'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
     ]
     self.ascii_replace = [
         ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
         ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
         ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
         ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
         ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
         ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
         ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
     ]
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                    (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                    (r'\s*insan\s*', ' insano '),
                    (r'\s*saudad\s*', ' saudade ')]
     self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                     (r'eqe', 'ee'), (r'oqo', 'oo')]
def main():
    text = read_doc()

    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]

    text = [[token.lower() for token in sent] for sent in text]

    text = [[
        ''.join(ch for ch in token if ch.isalpha() or ch == '\'')
        for token in sent
    ] for sent in text]

    text = [[token for token in sent if len(token) >= 2 and len(token) <= 35]
            for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if not token in stopwords]
            for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)

    #print(X.toarray())
    feature_names = vect.get_feature_names()
    #print(feature_names)

    from collections import Counter
    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter(
        {key: value
         for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(
        chain.from_iterable(
            [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup)
              for tup in bfd]

    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])

    pass
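mutinf is not defined in the snippet above; given the call mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), a plausible (hypothetical) implementation is a pointwise mutual information score over segment counts:

import math

def mutinf(pair_count, count_x, count_y, n_segments):
    # PMI over segment frequencies: log2( P(x, y) / (P(x) * P(y)) ).
    # Returns 0.0 instead of -inf when any count is zero.
    if pair_count == 0 or count_x == 0 or count_y == 0:
        return 0.0
    return math.log2((pair_count * n_segments) / (count_x * count_y))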
 def __init__(self, use_unicode=True):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.pt_stemmer = nltk.stem.RSLPStemmer()
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('portuguese')
     self.symbols = [
         u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
         u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
         u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
         u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b",
         u"\u2019", u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b",
         u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f",
         u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
         u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad",
         u"\u00ab"
     ]
     self.more_stopwords = [
         'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
         'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
         'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
         'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
         'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
     ]
     if use_unicode:
         self.accents = unicode_replace
     else:
         self.accents = ascii_replace
     self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
     self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                    (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                    (r'\s*insan\s*', ' insano '),
                    (r'\s*saudad\s*', ' saudade ')]
     self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                     (r'eqe', 'ee'), (r'oqo', 'oo')]
Example #5
 def __init__(self, use_unicode):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('english')
     self.symbols = [
         u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
         u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
         u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
         u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
         u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
         u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
         u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
         u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
     ]
     if use_unicode:
         self.accents = unicode_replace
     else:
         self.accents = ascii_replace
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                     (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                     (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                     (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                     (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                     (r'fqf', 'ff'), (r'lql', 'll')]
Example #6
 def processPost(self, post):
     tokenizer = WhitespaceTokenizer()
     if post.text is not None and post.text != "":
         curtext = post.text.encode('utf-8')
         tokens = [word for sent in nltk.sent_tokenize(curtext) for word in tokenizer.tokenize(sent)]
         tokens = self.normalizeTokens(tokens)
         text = nltk.Text(tokens)
         self.processText(post, text)
Example #7
 def __chunk_sentence(self, sentence):
     """Tokenize the sentence into words using a whitespace parser to avoid parsing couldn't into two tokens (could and n't).
    Then chunk the tokens according to GRAMMAR.
 """
     tokenizer = WhitespaceTokenizer()
     tokens = tokenizer.tokenize(sentence)
     pos_tagged = nltk.pos_tag(tokens)
     return self.parser.parse(pos_tagged)
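GRAMMAR and self.parser are not shown above; a hypothetical setup that matches the call self.parser.parse(pos_tagged) is an nltk.RegexpParser over a simple noun-phrase grammar:

import nltk
from nltk.tokenize.regexp import WhitespaceTokenizer

GRAMMAR = r"NP: {<DT>?<JJ>*<NN.*>+}"   # assumed noun-phrase chunk grammar
parser = nltk.RegexpParser(GRAMMAR)

tokens = WhitespaceTokenizer().tokenize("The quick brown fox couldn't jump")
tree = parser.parse(nltk.pos_tag(tokens))   # requires the NLTK POS tagger data
print(tree)   # an nltk.Tree whose NP subtrees are the chunks; "couldn't" stays one token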
Example #10
    def evaluateclassifier(self, featureselection):
        positivecount=0
        negativecount=0
        negativetweets = []
        positivetweets = []
        #print 'Evaluating Classifier'
        print featureselection
        with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
            #print 'Opening corpus file'
            reader = csv.reader(f)
            for row in reader:
                #Positive sentiment tweets
                if(row[0] == '4' and positivecount < self.corpuslength):
                    positivetweets.append(row[5])        
                    positivecount+=1        
                #Negative sentiment tweets
                if(row[0] == '0' and negativecount < self.corpuslength):
                    negativetweets.append(row[5])
                    negativecount+=1
        
        #print 'Generating Features' 
        self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positivetweets]
        self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativetweets]
        
        poscutoff = len(self.positivefeatures)
        negcutoff = len(self.negativefeatures)
        print "Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff)
        trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
        
        testfeats = self.test(featureselection) 
        #testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]       
        print 'Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
        classifier = NaiveBayesClassifier.train(trainfeats)        
        print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
        
        #classifier.show_most_informative_features(20)
        
        refsets = collections.defaultdict(set)
        testsets = collections.defaultdict(set) 
        
        for i, (feats, label) in enumerate(testfeats):    
            refsets[label].add(i)    
            observed = classifier.classify(feats)  
            #print label, observed  
            testsets[observed].add(i)

        print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
        print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
        print 'pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos'])
        print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
        print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
        print 'neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg'])
    def _fit(self):
        '''Tokenize the documents, make backwards and forwards lists
        call the make_dictionary method'''

        tokenizer = WhitespaceTokenizer()
        # Get the sentences from the corpus
        sent_list_of_str = sent_tokenize(self.corpus_txt.lower())
        # Capitalize and save the punctuation from the end
        sent_cap = [(sent.capitalize()[:-1], sent[-1]) for sent in sent_list_of_str]
        # Word tokenize to keep contractions, add back on punc
        self.f_sent = [tokenizer.tokenize(word_tuple[0]) + [word_tuple[1]] for word_tuple in sent_cap]
        # Reverse those sentences
        self.b_sent = [list(reversed(word_list)) for word_list in self.f_sent]
        self.f_dict = self._make_dictionary(self.f_sent)
        self.b_dict = self._make_dictionary(self.b_sent)
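_make_dictionary is not shown; a minimal sketch of what _fit appears to expect, a Markov-style mapping from each token to the tokens that can follow it, might look like this (an assumption, not the original code):

from collections import defaultdict

def _make_dictionary(sentences):
    # sentences: list of token lists (forwards or backwards)
    follow = defaultdict(list)
    for sent in sentences:
        for current_word, next_word in zip(sent, sent[1:]):
            follow[current_word].append(next_word)
    return dict(follow)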
Example #12
    def process(self, text):
        """
            предобработка, токенизация по предложениям, удаление дублей.
            выдает список предложений (для векторного метода, на будущее)
            Args:
                text ([type]): [description]
            """

        #text = text.lower()

        # remove digits, emails, and hyperlinks

        #text = text.encode('utf-8')

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split the text into sentences
        sentence_tokenizer = PunktSentenceTokenizer()
        text = sentence_tokenizer.tokenize(text)

        cleaned_text = []
        stop_words = set(stopwords.words('russian'))

        # split into words, clean up remaining punctuation and stopwords
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        for sentence in text:
            punct_cleaned_sent = clear_endings(
                sentence)  # strip sentence-final punctuation markers
            tokenized_sent = tokenizer.tokenize(
                punct_cleaned_sent)  # split into words, only for cleanup
            stpw_clean_sentence = [
                word for word in tokenized_sent if not word in stop_words
            ]
            stemmed_sentence = [
                stemmer.stem(word) for word in stpw_clean_sentence
            ]  # reduce each word to its stem
            clean_sentence = ' '.join(
                stemmed_sentence
            )  # joined back into a single sentence string for hashing

            cleaned_text.append(clean_sentence)

        return cleaned_text
Example #13
def get_ngram_counts(comment_iter, n, tokenizer=None, sample_pct=100):
    """
    Compute ngram counts from comments.
    
    Parameters:
    -----------
    comment_iter : generator
    n : int
    tokenizer : nltk.tokenize.Tokenizer
    sample_pct : float
    Optional percentage from which to subsample the data.
    
    Returns:
    --------
    counts : pandas.DataFrame
    Rows = ngrams, col = counts.
    """
    if (tokenizer is None):
        tokenizer = WhitespaceTokenizer()
    counts = Counter()
    for i, c in enumerate(comment_iter):
        if (sample_pct == 100 or random.random() * 100 < sample_pct):
            ngrams = ngram_split(c, n, tokenizer)
            for ngram in ngrams:
                ngram = [' '.join(ngram)]
                counts.update(ngram)
        if (i % 1000000 == 0):
            print('got %d unique ngrams' % (len(counts)))
    # convert to dataframe
    counts = pd.DataFrame(pd.Series(counts))
    return counts
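ngram_split is not shown above; hypothetical usage with a small in-memory comment iterator, assuming ngram_split yields per-comment token n-grams (e.g. via nltk.util.ngrams):

from nltk.util import ngrams
from nltk.tokenize.regexp import WhitespaceTokenizer

def ngram_split(comment, n, tokenizer):
    # assumed helper: whitespace-tokenize, then emit n-gram tuples
    return ngrams(tokenizer.tokenize(comment), n)

comments = iter(["the cat sat", "the cat slept", "a dog barked"])
counts = get_ngram_counts(comments, n=2, tokenizer=WhitespaceTokenizer())
print(counts.head())   # one row per ngram, a single count column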
Example #14
 def __init__(self,
              data_iterator,
              tokenizer=WhitespaceTokenizer(),
              char_map=None,
              word_len=30,
              sent_len=200):
     '''
     DESCRIPTIONS:
         This class converts text to numbers for the standard unicode vocabulary
         size.
     PARAMS:
         data_iterator (iterator): iterator to iterates the text strings
         word_len (int): maximum length of the word, any word of length less
             than that will be padded with zeros, any word of length more than
             that will be cut at max word length.
         sent_len (int): maximum number of words in a sentence, any sentence
             with less number of words than that will be padded with zeros,
             any sentence with more words than the max number will be cut at
             the max sentence length.
         char_map (dict): a dictionary for mapping characters to numbers.
     '''
     self.data_iterator = data_iterator
     self.word_len = word_len
     self.sent_len = sent_len
     self.char_map = char_map
     self.tokenizer = tokenizer
     self.char_zero = ' '  # character to be assigned the zero index
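The conversion routine itself is not shown; a minimal sketch of what such a class plausibly does with char_map, word_len and sent_len, padding or truncating words and sentences to fixed sizes, could look like this (an assumption, not the original code):

import numpy as np
from nltk.tokenize.regexp import WhitespaceTokenizer

def sentence_to_char_ids(sentence, char_map, tokenizer=WhitespaceTokenizer(),
                         word_len=30, sent_len=200):
    out = np.zeros((sent_len, word_len), dtype=int)    # zero index means padding
    for wi, word in enumerate(tokenizer.tokenize(sentence)[:sent_len]):
        for ci, ch in enumerate(word[:word_len]):
            out[wi, ci] = char_map.get(ch, 0)          # unknown chars fall back to 0
    return out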
Example #15
def tokenize(s):
    """
    Tokenize string.
    Function to tokenize text into words (tokens). Downloads default NLTK
    tokenizer if not in machine.
    Args:
        - s: string with sentence to tokenize.
    Returns:
        - tokens: list of tuples (token, start-index, end-index)
    """
    text = sub(r"[,.:;'\"]", " ", s)
    tokenizer = Tokenizer()
    spans = tokenizer.span_tokenize(text)
    tokens = tokenizer.tokenize(text)
    tokens = [(t, s[0], s[1]-1) for t, s in zip(tokens, spans)]
    return tokens
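The imports are not shown above; assuming sub is re.sub and Tokenizer is nltk's WhitespaceTokenizer bound under an alias, the function returns (token, start-index, end-index) triples:

from re import sub
from nltk.tokenize.regexp import WhitespaceTokenizer as Tokenizer

print(tokenize("Hello, world."))   # [('Hello', 0, 4), ('world', 7, 11)]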
 def __init__(self, fname):
     with open(fname, 'r') as f:
         self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ')
     self.tokenizer = WhitespaceTokenizer()
     self.word_list = self.tokenizer.tokenize(self.corpus_txt)
     self.lower_word_list = [w.lower() for w in self.word_list]
     self.word_dict_count = Counter(self.word_list)
Example #18
 def build_topn_best_words(self):
     word_fd = FreqDist()
     label_word_fd = ConditionalFreqDist()
     positivecount = 0;
     negativecount = 0
     with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv', 'rb') as f:
         reader = csv.reader(f)
         for row in reader:
                 #Positive sentiment tweets
                 if(row[0] == '4' and positivecount < self.corpuslength):
                     tweet = row[5]
                     tokens = WhitespaceTokenizer().tokenize(tweet)
                     #print tweet
                     for token in tokens:                        
                         word_fd.inc(token.lower())    
                         label_word_fd['pos'].inc(token.lower()) 
                     positivecount+=1
                 #Negative sentiment tweets
                 if(row[0] == '0' and negativecount < self.corpuslength):
                     tweet = row[5]
                     tokens = WhitespaceTokenizer().tokenize(tweet)
                     #print tweet
                     for token in tokens:     
                         word_fd.inc(token.lower())    
                         label_word_fd['neg'].inc(token.lower())
                     negativecount+=1
                     
     #print word_fd
     #print label_word_fd
     
     pos_word_count = label_word_fd['pos'].N()
     neg_word_count = label_word_fd['neg'].N()
     total_word_count = pos_word_count + neg_word_count
     print "Positive Word Count:", pos_word_count, "Negative Word Count:", neg_word_count, "Total Word count:", total_word_count
     
     word_scores = {}
     for word, freq in word_fd.iteritems():    
         pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)    
         neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)    
         word_scores[word] = pos_score + neg_score
         
     best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
     self.bestwords = set([w for w, s in best])        
     print 'Best Words Count:', len(self.bestwords)#, 'Best Words Set:', self.bestwords
class LimparTexto(object):
	def __init__(self):
		self.portugues_stemmer = RSLPStemmer()
		self.tokenizar = WhitespaceTokenizer()
		self.stopwords = stopwords.words('portuguese')
		self.mais_utilizadas = ['ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa', 'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne', 'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra', 'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate']
		self.ascii_replace = [('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'), ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'), ('ô', 'o'), ('õ', 'o'), ('ú', 'u'),
                 ('ç', 'c'), ('ä', 'a'), ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'), ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'), ('Ê', 'e'),
                 ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'), ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')]

	# Remove accents from the text
	def removeAccent(self, text):
		para = text
		for (lat, asc) in self.ascii_replace:
			para = para.replace(lat, asc)
		return para

	# Remove stop words, i.e. words that carry no meaning for our model.
	def removerStopWords(self, texto):		
# The decode is needed if latin-1 was used during mining
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.stopwords])
		texto = ' '.join([word for word in texto.split() if word.decode('latin-1') not in self.mais_utilizadas])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.stopwords])
#		texto = ' '.join([word for word in texto.split() if word.decode('utf-8') not in self.mais_utilizadas])
		return texto

	# Tokenize words on whitespace
	def tokenizarPalavras(self, texto):
		texto = self.tokenizar.tokenize(texto)
		return texto

	# Punctuation removal is necessary because a word followed by punctuation differs from the same word without it.
	def removerPontuacao(self, texto):
		regex = re.compile('[%s]' % re.escape(string.punctuation))
		texto = regex.sub('',texto)
		return texto
		
		
	# Remove word suffixes (stemming)
	def removerSufixo(self, para):
		text = ''
		for w in para:
#			text = text + self.portugues_stemmer.stem(w.decode('latin-1')) + ' '
			text = text + self.portugues_stemmer.stem(w) + ' '
		return text
	
	def removerAcentos(self, texto):
		texto = unicode(texto, 'latin-1')
		para = unidecode.unidecode(texto)
		return para

	def removerCaracteresRepetidos(self, texto):
		texto = re.sub(r'([a-z])\1+', r'\1', texto)
		return texto
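LimparTexto does not define an orchestration method; a hypothetical cleaning pipeline chaining the methods above (Python 2, like the class itself, with an assumed order) could look like:

limpador = LimparTexto()
texto = "Não gostei muito desse serviço, péssimo atendimento!!!"
texto = limpador.removerPontuacao(texto)
texto = limpador.removerStopWords(texto)     # expects latin-1 input, per the decode above
texto = limpador.removeAccent(texto)
tokens = limpador.tokenizarPalavras(texto)
print(limpador.removerSufixo(tokens))        # stemmed, accent-free, stop-word-free text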
Example #20
    def analyize(self,text):
        try:
            unitext = any2unicode(text, encoding='utf8', errors='strict')
        except:
            print ("Not utf-8")
            return []
        pass

        #convert to lower
        lowerText = unitext.lower()

        # Regex way: gives some text 'qwe (x)' as 'qwe' '(x)'
        # very aggresive regex...removes puncs and digits..keeps only alphabetic words
        tokenizer = WhitespaceTokenizer()
        regexTokens = tokenizer.tokenize(lowerText)
        p_stemmer = PorterStemmer()
        stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]

        stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w)>1]
        return stemmedRemSingleLetterTokens
Example #21
 def getToken(self, post):
     self.tokenizer = WhitespaceTokenizer()
     if post.text is not None and post.text != "":
         curtext = post.text.encode('utf-8')
         tokens = self.tokenize(curtext)
         tokens = self.normalizeTokens(tokens)
         tokens = self.stripSpecialChars(tokens)
         tokens = self.filterInvalid(tokens)
         tokens = self.calculateTf(tokens)
         return tokens
     return []
Example #22
    def process(self, text, plain_text=False):
        """
        предобработка, токенизация по словам,  удаление дублей.
        выдает сплошной (plain) текст, для метода шиндлов или список токенов текста

        Args:
            text ([type]): [description]
        """
        #text = text.encode('utf-8')

        # remove digits, emails, and hyperlinks

        text = clear_emails(text)
        text = clear_url(text)
        text = clear_digits(text)
        text = clear_symb(text)

        # split into words, clean up remaining punctuation and stopwords

        stop_words = set(stopwords.words('russian'))
        tokenizer = WhitespaceTokenizer()
        stemmer = SnowballStemmer('russian')

        punct_cleaned_text = clear_endings(
            text)  # strip sentence-final punctuation markers
        tokenized_text = tokenizer.tokenize(
            punct_cleaned_text)  # split into words, only for cleanup
        stpw_clean_text = [
            word for word in tokenized_text if not word in stop_words
        ]
        stemmed_text = [stemmer.stem(word) for word in stpw_clean_text
                        ]  # reduce each word to its stem
        clean_text = None
        if plain_text:
            clean_text = ' '.join(
                stemmed_text
            )  # joined back into a single string for hashing
        else:
            clean_text = stemmed_text  # otherwise return the list of tokens

        return clean_text
Example #23
 def __init__(self):
     self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
     self.repl = r'\1\2\3'
     self.tokenizer = WhitespaceTokenizer()
     self.cached_stopwords = stopwords.words('english')
     self.ascii_replace = [
         ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
         ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
         ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
         ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
         ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
         ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
         ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
     ]
     self.link_patterns = [('http'), ('www'), ('w3c')]
     self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                     (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                     (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                     (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                     (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                     (r'fqf', 'ff'), (r'lql', 'll')]
Example #24
 def test(self, featureselection):
     positiveTweets = [] 
     negativeTweets = []
     with open(r'..\polarityData\TweetCorpus\testdata.manual.2009.06.14.csv', 'rb') as f:
         reader = csv.reader(f)
         for row in reader:
             #Positive sentiment tweets
             if(row[0] == '4'):
                 positiveTweets.append(utils.common.processTweetBlank(row[5]))          
             #Negative sentiment tweets
             if(row[0] == '0'):
                 negativeTweets.append(utils.common.processTweetBlank(row[5]))
         
     positiveTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos') for tweet in positiveTweets]
     negativeTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg') for tweet in negativeTweets]
     
     poscutoff = len(positiveTestFeatures)
     negcutoff = len(negativeTestFeatures)
     print "Test Pos Cutoff: " + str(poscutoff) + " Test Neg Cutoff: " + str(negcutoff)
     testfeatures = positiveTestFeatures[:poscutoff] + negativeTestFeatures[:negcutoff]
     #print testfeatures
     return (testfeatures)
Example #25
 def __init__(self, tokenizer=WhitespaceTokenizer(), sent_len=200):
     self.sent_len = sent_len
     self.tokenizer = tokenizer
     self.w2v_dim = 300
     this_dir = os.path.dirname(os.path.realpath(__file__))
     model_dir = this_dir + '/model'
     if not os.path.exists(model_dir):
         os.makedirs(model_dir)
     pretrained_path = model_dir + '/GoogleNews-vectors-negative300.bin.gz'
     if not os.path.exists(pretrained_path):
         raise Exception('pretrained vector file not exists: {}'.format(pretrained_path))
     print('..loading model')
     self.model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
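The embedding step is not shown; a plausible sketch of how this class would turn a sentence into a fixed-size (sent_len, 300) matrix with the loaded KeyedVectors model (an assumption, not the original code):

import numpy as np

def embed_sentence(self, sentence):
    vecs = np.zeros((self.sent_len, self.w2v_dim))
    for i, token in enumerate(self.tokenizer.tokenize(sentence)[:self.sent_len]):
        if token in self.model:            # out-of-vocabulary tokens stay all-zero
            vecs[i] = self.model[token]
    return vecs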
Example #26
def get_sentences_for_text(corpus_root, filename, lang='english'):
    """Segments the given text into sentences.

  Args:
    corpus_root: Directory in which the text file is residing.
    filename: Name of the text file.
    lang: Tokenizer language. For possible values, look at:
    ${NLTK_DATA}/tokenizers/punkt

  Returns:
    Sentences in the given text. 

  """
    tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle'
    text = PlaintextCorpusReader(
        corpus_root, [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path))
    return text.sents()
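Hypothetical usage, with the corpus directory and filename standing in for real paths:

sentences = get_sentences_for_text('/data/corpus', 'article.txt', lang='english')
for sent in sentences[:3]:
    print(sent)   # each sentence is a list of whitespace-delimited tokens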
	def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
		self.markov_dict = markov_dict
		self.gtype = self.markov_dict['gtype']
		self.stop_words = set(stopwords.words('english'))
		self.neighbor_dict = neighbor_dict
		self.tokenizer = WhitespaceTokenizer()
		self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
		self.lower_word_list = [w.lower() for w in self.word_list]
		# Count of word freq, maintaining case
		self.word_dict_count = Counter(self.word_list)
		self.truecaser = TrueCase(self.markov_dict['fname'])

		# Create priority and not_found_list if none were entered
		if priority_list:
			self.priority_list = priority_list
		else:
			self._make_priority()
		if not_found_list:
			self.not_found_list = not_found_list
		else:
			self._make_not_found()
Example #28
class Command(BaseCommand):
    args = '<page_id> <method>'
    help = 'Computes graph data for the given page'

    def __init__(self, *args, **kwargs):
        super(Command, self).__init__(*args, **kwargs)
        self._log = logging.getLogger('cmd')

    def handle(self, *args, **options):
        if args is None or len(args) < 1:
            pages = Page.objects.all()
            for page in pages:
                self._log.info("Page #%s: %s" % (page.id, page.fb_page_name))
            raise CommandError('Invalid arguments. Expected: <page_id>')

        page_id = args[0]

        self._log.info('GraphCommand initializing.')

        self._log.info('Page-Id: %s' % page_id)
        page = Page.objects.get(id=page_id)

        self.allTextGraph(page)
        #self.kpGraph(page)
        #self.buildGraph(page)

        self._log.info("All done for now.")

    def getNextIndex(self):
        self.nextFreeIndex = self.nextFreeIndex + 1
        return self.nextFreeIndex - 1

    def allTextGraph(self, page):
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        self.stop_words = None
        self.idfCache = {}

        userterms = {}

        pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() )
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count

        self.nextFreeIndex = 0
        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "tok+tf %s/%s" % (curuseridx, pageusers_count)
            terms = self.getUserTfVector(page, currentuser, pageposts)
            if not terms is None:
                userterms[currentuser.id] = terms
        print "Maximal index: %s" % self.nextFreeIndex

        self.postcount = len(pageposts)
        print "Calculating IDF, posts: %s, terms: %s" % (self.postcount, len(self.idfCache))
        curuseridx = 0
        terms_with_idf = {}
        for user_id in userterms:
            curuseridx = curuseridx + 1
            print "idf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateIdf(userterms[user_id])
            terms_with_idf[user_id] = tokens

        print "tfidf"
        curuseridx = 0
        for user_id in terms_with_idf:
            curuseridx = curuseridx + 1
            print "tfidf %s/%s" % (curuseridx, pageusers_count)
            tokens = self.calculateTfIdf(terms_with_idf[user_id])
            userterms[user_id] = tokens

        del terms_with_idf

        print "Terms: %s" % len(self.idfCache)
        print "Calculating term IDs"
        termIds = self.calculateTermIds(userterms)

        uservectors = self.getUserVectors(userterms, termIds, len(self.idfCache), pageusers_count)
        userswithindex, usermatrix = self.getUserMatrix(uservectors)

        print "Creating graph"
        graph = nx.Graph()

        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0]-1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0]-1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val

            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        self.deleteClusters(page)

        print "storing"
        cpage = page
        for compidx in range(len(components)-1):
            component = components[compidx]
            newcluster = UserCluster.objects.create(page=cpage)
            newcluster.save()
            tags = {}
            tagcounts = {}
            for user_id in component:
                adduser = pageusers.filter(id__exact=user_id)[0]
                newassoc = UserClusterAssoc.objects.create(cluster = newcluster, clusteruser = adduser)
                print user_id
                newassoc.save()

                for t, tfidf in userterms[user_id]:
                    if not t in tagcounts:
                        tagcounts[t] = 1.0
                    else:
                        tagcounts[t] = tagcounts[t] + 1.0
                    if not t in tags:
                        tags[t] = tfidf
                    else:
                        tags[t] = tags[t] + tfidf
            for t in tags.keys():
                tweight = tags[t] / tagcounts[t]
                print t
                newterm = UserClusterTerm.objects.create(cluster = newcluster, clusterterm = t, termweight = tweight)
                newterm.save()

            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(tags.keys()), ",".join(tags.keys()))

    def deleteClusters(self, page):
        print "cleaning"
        delclusters = 0
        for currentcluster in UserCluster.objects.filter(page__exact=page):
            uca = UserClusterAssoc.objects.filter(cluster__exact=currentcluster)
            uca.delete()
            uct = UserClusterTerm.objects.filter(cluster__exact=currentcluster)
            uct.delete()
            currentcluster.delete()
            delclusters = delclusters + 1
        print "Deleted %s clusters" % delclusters

    def getUserMatrix(self, uservectors):
        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1])

        u1idx = 0

        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue

                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot

            u1idx = u1idx + 1
            print "matrix %s/%s" % (u1idx, len(userswithindex))
        return (userswithindex, usermatrix)

    def getUserVectors(self, userterms, termIds, vectorlen, pageusers_count):
        uservectors = {}

        curuseridx = 0
        for user_id in userterms.keys():
            curuseridx = curuseridx + 1
            print "vec %s/%s" % (curuseridx, pageusers_count)

            currentvector = [0.0] * vectorlen

            terms = []
            for w, tfidf in userterms[user_id]:
                terms.append(w)
                currentvector[ termIds[w] ] = tfidf

            uservectors[user_id] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(terms)

        return uservectors

    def calculateTermIds(self, userterms):
        next_id = 0
        ids = {}
        for user_id in userterms:
            for w, tfidf in userterms[user_id]:
                if not w in ids:
                    ids[w] = next_id
                    next_id = next_id + 1
        return ids

    def getIdf(self, term):
        if term in self.idfCache:
            return float(self.postcount) / self.idfCache[term]

        print "Missing IDF: %s " % term
        exit()

    def getUserTfVector(self, page, currentuser, pageposts):
        tok = {}

        for post in pageposts.filter(createuser__exact=currentuser):
            usertokens = self.getToken(post)
            for w, tf in usertokens:
                if not w in tok:
                    tok[w] = tf
                else:
                    tok[w] = tok[w] + tf

        return [(w, tok[w]) for w in tok]

    def getToken(self, post):
        self.tokenizer = WhitespaceTokenizer()
        if post.text is not None and post.text != "":
            curtext = post.text.encode('utf-8')
            tokens = self.tokenize(curtext)
            tokens = self.normalizeTokens(tokens)
            tokens = self.stripSpecialChars(tokens)
            tokens = self.filterInvalid(tokens)
            tokens = self.calculateTf(tokens)
            return tokens
        return []

    def getTfIdf(self, w, tf, idf, tokens):
        return (tf * idf) / len(tokens)

    def calculateTfIdf(self, tokens):
        return [ (w, self.getTfIdf(w, tf, idf, tokens) ) for w, tf, idf in tokens ]

    # maximum normalized tf
    def calculateTf(self, tokens):
        if len(tokens) == 0:
            return []

        seen = {}
        max_tf = 1.0

        for w in tokens:
            if not w in seen:
                seen[w] = 1.0
                if not w in self.idfCache:
                    self.idfCache[w] = 1.0
                else:
                    self.idfCache[w] = self.idfCache[w] + 1.0
            else:
                seen[w] = seen[w] + 1.0
            if seen[w] > max_tf:
                max_tf = seen[w]

        res = []
        for w in tokens:
            res.append( (w, seen[w] / max_tf) )
        return res

    def calculateIdf(self, tokens):
        return [(w, tf, self.getIdf(w)) for w, tf in tokens]

    def filterInvalid(self, tokens):
        vt = [w for w in tokens if self.isValidTerm(w)]
        if vt is None:
            vt = []
        return vt

    def tokenize(self, curtext):
        return [word for sent in nltk.sent_tokenize(curtext) for word in self.tokenizer.tokenize(sent)]

    def is_number(self, s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def is_stop_word(self, term):
        self.read_stop_words()
        return term in self.stop_words

    def read_stop_words(self):
        if not self.stop_words is None:
            return
        res = {}
        for word in open(os.path.join(settings.STATIC_ROOT, 'stop_words'), 'rt').read().split('\r\n'):
            if not word is None and word != '' and not word in res:
                res[word] = True
        self.stop_words = res

    def isValidTerm(self, term):
        if len(term) < 2:
            return False
        for t in [".", ",", "-", "+", "%", "?", "!", "$", "&", "/", "\"", "'", "`", "`", "|", ":", ";", ")", "(", "[", "]", "{", "}"]:
            if t in term:
                return False
        if self.is_number(term):
            return False
        if self.is_stop_word(term):
            return False

        try:
            term = term.decode('ascii')
        except:
            return False

        if term.find('.') > -1: # or term.find('/') > -1 or term.find("?"): # url parts
            return False
        return True

    def normalizeTokens(self, tokens):
        return [w.lower() for w in tokens]

    def stripSpecialChars(self, tokens):
        return [w.strip("\r\n.,-+%?!$&/\\'`|:;)([]{}\t\" ") for w in tokens]

    def kpGraph(self, page):
        # initialization
        self.nextFreeIndex = 0
        self.tokenIndices = {}
        self.allTerms = []

        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        pageusers = User.objects.filter(id__in = pageposts.exclude(createuser__exact=pageowner).values('createuser').distinct() )
        pageusers_count = len(pageusers)
        print "Calculating vectors for %s users" % pageusers_count

        kp_term_method = KeyphraseMethod.objects.get(name='pos_sequence')

        userterms = {}

        curuseridx = 0
        for currentuser in pageusers:
            curuseridx = curuseridx + 1
            print "%s/%s" % (curuseridx, pageusers_count)
            (terms, ids) = self.getUserVector(page, currentuser, kp_term_method)
            if not terms is None:
                userterms[currentuser.id] = (terms, ids)

        print "Maximal index: %s" % self.nextFreeIndex

        uservectors = {}
        vectorlen = self.nextFreeIndex
        for currentuser in userterms.keys():
            terms, ids = userterms[currentuser]
            currentvector = [0.0] * vectorlen

            for i in range(len(ids)-1):
                currentvector[ids[i]] = 1.0

            uservectors[currentuser] = (np.array(currentvector), terms)
            #print ", ".join(map(str, currentvector))
            #print ", ".join(self.allTerms)

        userswithindex = uservectors.keys()
        usermatrix = np.zeros([len(userswithindex)+1, len(userswithindex)+1])

        u1idx = 0

        for u1 in userswithindex:
            u2idx = 0
            for u2 in userswithindex:
                u2idx = u2idx + 1
                if u1 == u2:
                    continue

                u1_vec = uservectors[u1][0]
                u2_vec = uservectors[u2][0]
                u1u2dot = np.dot(u1_vec, u2_vec)
                usermatrix[u1idx][u2idx] = u1u2dot

            u1idx = u1idx + 1
            print "%s/%s" % (u1idx, len(userswithindex))

        print "Creating graph"
        graph = nx.Graph()

        graph.add_nodes_from(pageusers)
        for i1 in range(usermatrix.shape[0]-1):
            max_edge = None
            max_edge_val = 0.0
            for i2 in range(usermatrix.shape[0]-1):
                if i1 == i2:
                    continue
                u1 = userswithindex[i1]
                u2 = userswithindex[i2]
                u1u2val = usermatrix[i1][i2]
                if u1u2val > max_edge_val:
                    max_edge = u2
                    max_edge_val = u1u2val

            if max_edge_val > 0.0 and not max_edge is None:
                self.add_edge(graph, u1, max_edge)

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )
        self.removeSingletons(graph)
        print "Nodes: %s Edges: %s" % ( len(graph.nodes()), len(graph.edges()) )

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        for compidx in range(len(components)-1):
            component = components[compidx]
            taglist = []
            for user_id in component:
                ut = userterms[user_id][0]
                for t in ut:
                    if not t in taglist:
                        taglist.append(t)

            print "Component #%s Users: %s Tags (%s): \"%s\"" % (compidx, len(component), len(taglist), ",".join(taglist))




        return

    def getIndex(self, token):
        if not token in self.tokenIndices:
            self.allTerms.append(token)
            self.tokenIndices[token] = self.getNextIndex()
        return self.tokenIndices[token]

    def getUserVector(self, page, currentuser, kp_term_method):
        user_posts = Post.objects.filter(page__exact=page, createuser__exact=currentuser)
        user_post_parents = Post.objects.filter(id__in=user_posts.values('parent').distinct())

        user_kps = PostKeyphraseAssoc.objects.filter(post__in = user_posts, keyphrase__method__exact=kp_term_method)
        user_kp_count = len(user_kps)

        terms_all = []
        terms_split = []
        terms_n = user_kps.values('keyphrase__normalized').distinct()
        terms_t = user_kps.values('keyphrase__term').distinct()

        for term in terms_n:
            t = term['keyphrase__normalized']
            if not t in terms_all:
                terms_all.append(t)

        for term in terms_t:
            t = term['keyphrase__term']
            if not t in terms_all:
                terms_all.append(t)

        for term in terms_all:
            for term_part in term.split(" "):
                if not term_part in terms_split:
                   terms_split.append(term_part)

        terms_all = terms_split

        #if (len(terms_all) > 0):
        #    for thread_post in user_post_parents:
        #        terms_all.append("POST%s" % (thread_post.id))

        print "User: %s Posts: %s Keyphrases: %s" % ( currentuser, len(user_posts), user_kp_count )
        print "Terms: %s" % ", ".join(terms_all)

        if user_kp_count == 0:
            return (None, None)

        res_terms = []
        res_ids = []
        for term in terms_all:
            term_idx = self.getIndex(term)
            res_terms.append(term)
            res_ids.append(term_idx)

        return (res_terms, res_ids)

    def add_edge(self, graph, obj_from, obj_to, add_weight=1.0):
        if not graph.has_edge(obj_from, obj_to):
            graph.add_edge(obj_from, obj_to, weight=add_weight)
        else:
            graph[obj_from][obj_to]['weight'] = graph[obj_from][obj_to]['weight'] + add_weight

    def addPostUser(self, graph, post, added_users):
        if not post.createuser in graph:
            graph.add_node(post.createuser)
            added_users.append(post.createuser)
        # edge: post -> createuser
        self.add_edge(graph, post, post.createuser)

    def addPostParent(self, graph, post):
        if not post.parent is None:
            if not post.parent in graph:
                graph.add_node(post.parent)
                self.add_edge(graph, post, post.parent)

    def addPostKeyPhrases(self, graph, post):
        # keyphrases in this post
        for pk in PostKeyphraseAssoc.objects.filter(post__exact=post):
            graph.add_node(pk.keyphrase)
            self.add_edge(graph, post, pk.keyphrase)

    def addUserMetaCategory(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            if metaentry.fb_category is None or metaentry.fb_category == '':
                continue
            nodeval = u'CAT_' + unicode(metaentry.fb_category)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def addUserMeta(self, graph, user):
        metaentries = UserMeta.objects.filter(user__exact=user)
        for metaentry in metaentries:
            if metaentry is None:
                continue
            nodeval = unicode(metaentry)
            graph.add_node(nodeval)
            self.add_edge(graph, user, nodeval)

    def removeNonConnectedUsers(self, graph, dist_threshold):
        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        print "Removing non-connected user nodes"
        remove_nodes = []
        for component in components:
            usernodes = []
            userdists = {}
            for node in component:
                if type(node) == User:
                    usernodes.append(node)
            u1idx = 0
            ulen = len(usernodes)
            for u1 in usernodes:
                u1idx = u1idx + 1
                print "%s/%s" % (u1idx, ulen)
                if not u1.id in userdists:
                    userdists[u1.id] = 1000
                for u2 in usernodes:
                    if u1 == u2:
                        continue
                    pathres = nx.dijkstra_path_length(graph,u1,u2)
                    if pathres < userdists[u1.id]:
                        userdists[pathres] = pathres
                    if userdists[u1.id] < dist_threshold:
                        break # condition satisfied
            for user in usernodes:
                if userdists[user.id] > dist_threshold: # shortest path to another user is > 5 -> remove
                    print "Removing user %s. Dist value: %s" % (user.id, userdists[user.id])
                    remove_nodes.append(user)
        print "Removing %s user nodes" % len(remove_nodes)
        graph.remove_nodes_from(remove_nodes)
        del remove_nodes

    def removeSingletons(self, graph):
        print "Removing singletons"
        singleton_nodes = [ n for n,d in graph.degree_iter() if d==0 ]
        graph.remove_nodes_from(singleton_nodes)
        del singleton_nodes


    def buildGraph(self, page):
        print "Building graph"
        pageowner = page.owner
        pageposts = Post.objects.filter(page__exact=page)

        graph = nx.Graph()

        #pageposts = pageposts[500:700] ##########################################

        print "nodes: posts"
        graph.add_nodes_from(pageposts)

        print "edges: user -> post"
        added_users = []

        for post in pageposts:
            # post.createuser
            self.addPostUser(graph, post, added_users)

            # post->parent post
            self.addPostParent(graph, post)

            # post->postkeyphraseassoc->keyphrase
            self.addPostKeyPhrases(graph, post)

            # post.createuser->usermeta
            #self.addUserMeta(graph, post.createuser)
            #self.addUserMetaCategory(graph, post.createuser)

        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())

        print "Removing page owner"
        graph.remove_node(pageowner)

        print "Graph nodes: %s" % len(graph.nodes())
        print "Graph edges: %s" % len(graph.edges())


        self.removeSingletons(graph)

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        print "Removing components with only 0/1 user nodes"
        remove_components = []
        for component in components:
            usercount = 0
            for node in component:
                if type(node) == User:
                    usercount = usercount + 1
            if usercount <= 1:
                remove_components.append(component)
            else:
                print "Found %s user nodes" % usercount
        print "Removing %s components" % len(remove_components)
        for component in remove_components:
            graph.remove_nodes_from(component)
        del remove_components

        components = nx.connected_components(graph)
        print "Number of connected components: %s" % len(components)

        print "Edges: %s" % len(graph.edges())
        remove_edges = []
        weight_threshold = 2.0
        for node_a, node_b, attr in sorted(graph.edges(data = True), key = lambda (a, b, attr): attr['weight']):
            if type(node_a) == Post or type(node_b) == Post: # exclude post connections
                continue
            if 'weight' in attr and attr['weight'] > weight_threshold:
                break
            remove_edges.append((node_a, node_b))
            #print('{a} {b} {w}'.format(a = node_a, b = node_b, w = attr['weight']))
        for node_a, node_b in remove_edges:
            graph.remove_edge(node_a, node_b)
        print "Edges: %s" % len(graph.edges())

        self.removeSingletons(graph)

        print "Graph dotfile"
        nx.write_dot(graph, '/home/double/graph_viz.dot')


        tmp = []
        for user in added_users:
            if user in graph:
                tmp.append(user)
        added_users = tmp
        print "Unique users in graph: %s" % len(added_users)

        usergraph = nx.Graph()
        usergraph.add_nodes_from(added_users)
        for user_a, user_b in combinations(added_users, 2):
            try:
                userpath = nx.shortest_path_length(graph, user_a, user_b, weight='weight')
                usergraph.add_edge(user_a, user_b, weight=userpath)
                print user_a, user_b, userpath
            except nx.NetworkXNoPath, e:
                #print e
                continue

        self.removeSingletons(usergraph)

        #print "Drawing graph"
        plt.ioff()

        #nx.draw(graph, node_size=10, font_size=8)
        #plt.savefig('/home/double/graph.png', dpi=1000)

        print "UserGraph nodes: %s" % len(usergraph.nodes())
        print "UserGraph edges: %s" % len(usergraph.edges())


        return
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='../../data/frequency')
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--n', type=int, default=2)
    parser.add_argument('--file_suffix', default=None)
    parser.add_argument('--sample_pct', type=float, default=100)
    args = parser.parse_args()
    out_dir = args.out_dir
    comment_files = args.comment_files
    n = args.n
    file_suffix = args.file_suffix
    sample_pct = args.sample_pct
    if (comment_files is None):
        comment_files = get_all_comment_files()
        # replace with clean normalized (smaller vocab)
        comment_files = [
            f.replace('.bz2', '_clean_normalized.bz2') for f in comment_files
        ]
    # start small
    # comment_files = comment_files[:1]
    # min_df = 5
    # min_tf = 10
    min_tf = 1
    stopwords = []
    tokenizer = WhitespaceTokenizer()
    # larger n-gram ranges exhaust memory
    # ngram_range = (1,3)
    # ngram_range = (2,3)
    # ngram_range = (2,2)
    # ngram_range = (1,1)
    # skip CountVectorizer: it is memory-hungry here and we do not
    # need co-occurrence counts anyway
    # cv = CountVectorizer(min_df=min_df, tokenizer=tokenizer.tokenize,
    #                      stop_words=stopwords, ngram_range=ngram_range)
    date_format = '201[0-9]-[0-9]+'
    for f in comment_files:
        print('processing file %s' % (f))
        date_str = re.findall(date_format, f)[0]
        # for each level of ngram, recompute counts
        # for n in range(ngram_range[0], ngram_range[1]+1):
        print('computing ngram = %d' % (n))
        with BZ2File(f, 'r') as comment_file:
            # building the full document-term matrix is too slow,
            # so just compute raw n-gram counts
            comment_iter = make_iter(comment_file)
            counts = get_ngram_counts(comment_iter,
                                      n,
                                      tokenizer=tokenizer,
                                      sample_pct=sample_pct)

            # enforce the minimum term frequency
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            # write to file
            # output files are large; consider compressing them
            if (file_suffix is not None):
                out_fname = os.path.join(
                    out_dir,
                    '%s_%dgram_tf_%s.tsv' % (date_str, n, file_suffix))
            else:
                out_fname = os.path.join(out_dir,
                                         '%s_%dgram_tf.tsv' % (date_str, n))
            counts.to_csv(out_fname, sep='\t')
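
# The helpers make_iter and get_ngram_counts are referenced above but not shown in this
# listing. The following is one plausible sketch, not the original implementation: it
# assumes get_ngram_counts returns a pandas Series of n-gram term frequencies, so that
# counts[counts >= min_tf] and counts.to_csv behave as used above.
from collections import Counter
from random import random

import pandas as pd
from nltk.util import ngrams


def make_iter(comment_file):
    # Yield one comment per line, decoded from the BZ2 byte stream.
    for line in comment_file:
        yield line.decode('utf-8', errors='ignore').strip()


def get_ngram_counts(comment_iter, n, tokenizer, sample_pct=100):
    counts = Counter()
    for comment in comment_iter:
        if sample_pct < 100 and random() * 100 > sample_pct:
            continue  # crude percentage-based sampling
        tokens = tokenizer.tokenize(comment)
        counts.update(' '.join(gram) for gram in ngrams(tokens, n))
    return pd.Series(counts)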
from argparse import ArgumentParser
from collections import OrderedDict
from textblob import TextBlob
from nltk.util import bigrams
from multiprocessing import Pool
from traceback import format_exc
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.corpus import stopwords
from boto import connect_s3
import requests
import codecs
import traceback

stemmer = EnglishStemmer()
tokenizer = WhitespaceTokenizer()
stops = stopwords.words(u'english')


def get_args():
    ap = ArgumentParser()
    ap.add_argument(u'--num-processes',
                    dest=u"num_processes",
                    default=8,
                    type=int)
    ap.add_argument(u'--solr-host',
                    dest=u"solr_host",
                    default=u"http://search-s10:8983")
    ap.add_argument(u'--outfile', dest=u'outfile', default=u'wiki_data.csv')
    ap.add_argument(u'--s3dest', dest=u's3dest')
    return ap.parse_args()
class MarkovChain(object):
	'''Create a MarkovChain from the given dictionary and parameters,
	run() returns a sentence given a seed

	markov_dict should be a MarkovDict().api dictionary'''

	def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
		self.markov_dict = markov_dict
		self.gtype = self.markov_dict['gtype']
		self.stop_words = set(stopwords.words('english'))
		self.neighbor_dict = neighbor_dict
		self.tokenizer = WhitespaceTokenizer()
		self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
		self.lower_word_list = [w.lower() for w in self.word_list]
		# Count of word freq, maintaining case
		self.word_dict_count = Counter(self.word_list)
		self.truecaser = TrueCase(self.markov_dict['fname'])

		# Create priority and not_found_list if none were entered
		if priority_list:
			self.priority_list = priority_list
		else:
			self._make_priority()
		if not_found_list:
			self.not_found_list = not_found_list
		else:
			self._make_not_found()

	def _make_priority(self, n=10):
		'''Return the n most common words in the corpus'''
		# Remove stop_words
		content = [w for w in self.lower_word_list if w not in self.stop_words]
		# Remove words that are only punctuation
		content_no_punc = []
		for word in content:
			tmp = False
			for char in word:
				if char not in punctuation:
					tmp = True
				else:
					continue
			if tmp:
				content_no_punc.append(word)

		priority_dict = Counter(content_no_punc)
		self.priority_list = [key for key, val in priority_dict.most_common(n)]

	def _make_not_found(self, n=15):
		'''Return the n most common sentences in the corpus'''
		not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt']))
		common_sent = [key for key, val in not_found_dict.most_common(n)]
		self.not_found_list = []
		# Skip very short sentences so the fallback list is not filled with fragments
		for sent in common_sent:
			if len(sent) > 5:
				self.not_found_list.append(sent)

	def _get_input(self, input_phrase):
		'''Take in the raw input from the user'''
		# Lowercase and remove common punc
		input_phrase = input_phrase.lower()
		input_phrase = re.sub('\?', '', input_phrase)
		input_phrase = re.sub('\.', '', input_phrase)
		input_phrase = re.sub(',', '', input_phrase)
		input_phrase = re.sub('!', '', input_phrase)

		# List of words from a potential input phrase
		word_list = input_phrase.split()

		# Make a list of words that are in priority_list
		priority_words = [w for w in word_list if w in self.priority_list]

		# If no priority words, look for non stop words
		content = [w for w in word_list if w not in self.stop_words]

		# Look for priority words first, content second, and finally random
		if priority_words:
			seed = np.random.choice(priority_words)
		elif content:
			seed = np.random.choice(content)
		else:  # Final option is a random word
		    seed = np.random.choice(word_list)

		# If the word is not in the corpus, fall back to a nearest neighbor
		if not self._in_text(seed):
			seed = self._get_neighbor(seed)

		return seed


	def _in_text(self, word):
		'''Return true if word is in the corpus'''
		return word.lower() in set(self.lower_word_list)

	def _get_neighbor(self, seed):
		'''Return the nearest neighbor to seed from a database'''
		if not self.neighbor_dict:
			return None

		neighbors = self.neighbor_dict[seed]

		good_neighbors = []
		for word in neighbors:
			if self._in_text(word):  # Only pick a neighbor if in text
				good_neighbors.append(word)
		if good_neighbors:
			return np.random.choice(good_neighbors)
		else:
			return None

	def _generate_key(self, seed, dir_dict):
		'''Return key from a chosen seed'''
		key_list = []
		for key in dir_dict:
			# Look at the last key_gram_size words in the key
			# First word in that key_gram_size len phrase must match seed
			if seed in key[-self.key_gram_size]:
				key_list.append(key)
		return key_list[np.random.choice(len(key_list))]

	def _run_chain(self, seed, dir_dict):
		'''Return a list of words generated from seed
		Iterate through dictionary until a period or capital is reached'''
		key = self._generate_key(seed, dir_dict)
		text = list(key[-self.key_gram_size:])

		# Walk the chain until an end-of-sentence marker is reached
		while True:
			# Values is a list of lists
			values = dir_dict[key]

			# Choose a value with probability equal to distribution in corpus
			value = values[np.random.choice(len(values))]
			if (() in value) | (value == ()): # End condition
				break

			# Add a value_gram_size phrase to the text
			words_from_value = value[:self.value_gram_size]
			text += words_from_value

			# Create new lookup key
			key = tuple(text[-self.markov_dict['gram_size']:])
		return text

	def _get_sentence(self, seed):
		'''Return a sentence given a seed'''
		f_text = self._run_chain(seed, self.markov_dict['f_dict'])
		b_text = self._run_chain(seed, self.markov_dict['b_dict'])

		# b_text is generated backwards, so reverse it
		b_text = list(reversed(b_text))

		# Only include seed once
		sent = b_text[:-1] + f_text

		return sent

	def _get_sentence_str(self, sent):
		'''Return a string representation of a list'''
		if self.gtype != 'naive':
			sent = [w[0] for w in sent]
		text = ' '.join(sent)

		punc_w_space = [' ' + x for x in punctuation]
		for i in xrange(len(text)-1):
			if text[i:i+2] in punc_w_space:
				text = text[:i] + text[i+1:]
		return text

	def run(self, input_text, key_gram_size=2, value_gram_size=1):
		'''Return a sentence based on gram_size
		Larger gram sizes yield more deterministic phrases;
		key_gram_size and value_gram_size cannot exceed the dictionary's gram_size.'''
		self.key_gram_size = min(key_gram_size, self.markov_dict['gram_size'])
		self.value_gram_size = min(value_gram_size, self.markov_dict['gram_size'])
		while self.key_gram_size + self.value_gram_size < self.markov_dict['gram_size']:
			self.value_gram_size += 1

		seed = self._get_input(input_text)
		# If seed not in corpus and no neighbor found, return random sent
		if not seed:
			return np.random.choice(self.not_found_list)
		sent = self._get_sentence(seed)

		# Turn into string for output
		sent_str = self._get_sentence_str(sent)

		# Fix space before punc
		output = self.truecaser.truecase(sent_str)
		return output
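
# A compact, self-contained illustration of the forward-chain walk that MarkovChain
# performs with its f_dict: keys are word n-grams, values are lists of possible next
# words, and generation continues until an end marker is reached. This is a simplified
# stand-in, not the MarkovDict().api format the class above expects.
import random
from collections import defaultdict

from nltk.tokenize.regexp import WhitespaceTokenizer

corpus = "the cat sat . the cat ran . the dog sat ."
tokens = WhitespaceTokenizer().tokenize(corpus)

gram_size = 2
f_dict = defaultdict(list)
for i in range(len(tokens) - gram_size):
    key = tuple(tokens[i:i + gram_size])
    f_dict[key].append(tokens[i + gram_size])

key = random.choice([k for k in f_dict if k[0] == "the"])  # seed on "the"
text = list(key)
while text[-1] != ".":
    values = f_dict.get(tuple(text[-gram_size:]))
    if not values:
        break
    text.append(random.choice(values))  # sample proportionally to corpus frequency

print(' '.join(text))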
Beispiel #32
0
class TextCleaner(object):
    def __init__(self, use_unicode):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
            u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
            u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
            u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
            u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
            u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Collapse consecutively repeated characters so the model is not hurt
    # by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, /, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the texts.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        return text

    # Remove links from the texts.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite protected digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give the model's output better semantics and readability
    # (note: expects self.normal, a list of (pattern, replacement) pairs, which this class's __init__ does not define).
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
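
# removeRepChar above works by repeatedly applying the (\w*)(\w)\2(\w*) -> \1\2\3
# substitution, which deletes one character of a doubled pair per pass; iterated to a
# fixed point, it collapses any run of repeated characters to a single character. A
# standalone sketch of just that step (legitimate double letters are collapsed too,
# which is why the digraph protection such as 'rxr' -> 'rr' exists in this class):
import re

repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
repl = r'\1\2\3'


def remove_rep_char(word):
    collapsed = repeat_regexp.sub(repl, word)
    return word if collapsed == word else remove_rep_char(collapsed)


print(remove_rep_char("loooool"))   # -> lol
print(remove_rep_char("hellooo"))   # -> helo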
Beispiel #33
0
 def classify(self, text):
     return (self.classifier.classify(WhitespaceTokenizer().tokenize(text)))
class TrueCase(object):
    '''True case from a corpus'''

    def __init__(self, fname):
        with open(fname, 'r') as f:
            self.corpus_txt = f.read().decode('utf-8').replace('\n', ' ')
        self.tokenizer = WhitespaceTokenizer()
        self.word_list = self.tokenizer.tokenize(self.corpus_txt)
        self.lower_word_list = [w.lower() for w in self.word_list]
        self.word_dict_count = Counter(self.word_list)

    def truecase(self, sent):
        '''Return a true-cased version of the sentence so it looks well formatted'''
        if isinstance(sent, basestring):
            sent = self.tokenizer.tokenize(sent)
        output = []
        # Use whichever casing (all caps, capitalized, lower) appears most often in the corpus
        for word in sent:
            capital = 0
            lower = 0
            all_caps = 0
            try:
                lower += self.word_dict_count[word.lower()]
            except:
                lower += 0
            try:
                capital += self.word_dict_count[word.capitalize()]
            except:
                capital += 0
            try:
                all_caps += self.word_dict_count[word.upper()]
            except:
                all_caps += 0

            # find max of those three options
            idx = np.argsort([all_caps, capital, lower])[-1]

            # If not found in dictionary, find original case
            if (all_caps + capital + lower) == 0:
                try:
                    i = self.lower_word_list.index(word.lower())
                    output.append(self.word_list[i])
                except:
                    try:
                        i = self.lower_word_list.index(word.lower().strip(punctuation))
                        output.append(self.word_list[i])
                    except:
                        output.append(word)
            elif idx == 0:
                output.append(word.upper())
            elif idx == 1:
                output.append(word.capitalize())
            else:
                output.append(word)

        # sometimes sentence delimiters get picked up in the middle of words
        # they should only go at the end
        sent_str = ' '.join([x.strip('!?.') for x in output[:-1]]) + ' ' + output[-1]
        sent_str = sent_str[0].upper() + sent_str[1:]

        return sent_str

    def bulk_truecase(self, list_sent):
        '''Return a list of true_cased strings from an iterable'''
        output = []
        for sent in list_sent:
            output.append(self.truecase(sent))
        return output
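
# TrueCase above picks, for each token, whichever casing (UPPER, Capitalized, lower)
# occurs most often in a reference corpus. A minimal standalone sketch of that frequency
# lookup, using a toy in-memory corpus rather than the file-backed corpus the class expects:
from collections import Counter

from nltk.tokenize.regexp import WhitespaceTokenizer

corpus = "NASA launched a rocket . Alice watched the NASA launch . Alice cheered ."
counts = Counter(WhitespaceTokenizer().tokenize(corpus))


def truecase_token(word):
    variants = [word.upper(), word.capitalize(), word.lower()]
    best = max(variants, key=lambda w: counts.get(w, 0))
    # Fall back to the input casing when the word is unseen in the corpus.
    return best if counts.get(best, 0) > 0 else word


print(' '.join(truecase_token(w) for w in "nasa praised alice".split()))
# -> NASA praised Alice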
class TextCleaner(object):
    def __init__(self, use_unicode=True):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.pt_stemmer = nltk.stem.RSLPStemmer()
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('portuguese')
        self.symbols = [
            u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
            u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`",
            u"``", u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&",
            u"*", u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b",
            u"\u2019", u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b",
            u"\u00b7", u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f",
            u"\u2794", u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf",
            u"\u25a0", u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad",
            u"\u00ab"
        ]
        self.more_stopwords = [
            'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
            'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
            'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
            'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
            'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
        ]
        if use_unicode:
            self.accents = unicode_replace
        else:
            self.accents = ascii_replace
        self.link_patterns = [('http'), ('www'), ('w3c'), ('https')]
        self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'),
                       (r' ir ', '_ir '), (r'bom demal', ' bomdemais '),
                       (r'\s*insan\s*', ' insano '),
                       (r'\s*saudad\s*', ' saudade ')]
        self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                        (r'eqe', 'ee'), (r'oqo', 'oo')]

    # Collapse consecutively repeated characters so the model is not hurt
    # by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Remove special characters (e.g. ?, !, " ...).
    def removeSymbols(self, text):
        for symbol in self.symbols:
            text = text.replace(symbol, ' ')
        return text

    # Strip suffixes from Portuguese words (stemming).
    def removeSufPort(self, para):
        para = para.split()
        text = ''
        for w in para:
            text = text + self.pt_stemmer.stem(w) + ' '
        return text

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.accents:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the texts.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        text = ' '.join(
            [word for word in text.split() if word not in self.more_stopwords])
        return text

    # Remove links from the texts.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite protected digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give the model's output better semantics and readability.
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
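
# The 'sxs' / 'rxr' digraph codes above exist to protect legitimate Portuguese double
# letters from removeRepChar, which collapses any repeated character. A small sketch of
# the intended round trip; the encoding patterns mirror the RegexpReplacer example further
# down in this listing, and the exact pipeline order is an assumption:
import re

protect = [(r's[s]+', 'sxs'), (r'r[r]+', 'rxr')]
restore = [(r'sxs', 'ss'), (r'rxr', 'rr')]

repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')


def collapse(word):
    out = repeat_regexp.sub(r'\1\2\3', word)
    return word if out == word else collapse(out)


text = "carrooo pessoal"
for pat, code in protect:
    text = re.sub(pat, code, text)                  # 'carxrooo pesxsoal'
text = ' '.join(collapse(w) for w in text.split())  # 'carxro pesxsoal'
for code, orig in restore:
    text = re.sub(code, orig, text)                 # 'carro pessoal'
print(text)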
Beispiel #36
0
    def __init__(self):

        NltkTokenizer.__init__(self)
        _WhitespaceTokenizer.__init__(self)
def get_social_word_counts(social_var,
                           vocab,
                           comment_file,
                           meta_file,
                           comment_thresh=10):
    """
    Compute unique number of social vars 
    per word in vocab over all comments.
    Parameters:
    -----------
    social_var : str
    vocab : [str]
    Vocabulary to count.
    comment_file : str
    meta_file : str
    Tab-separated metadata file containing comment date, 
    author, thread ID, and subreddit.
    comment_thresh : int
    Minimum number of comments for a social var to be counted.
    Returns:
    --------
    social_var_counts : numpy.array
    """
    # indices in meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8',
                         lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         vocabulary=vocab,
                         binary=True)
    # keep it simple and store {word : {social_id : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file,
                                                         'r') as metas:
        for i, (comment, meta) in enumerate(izip(comments, metas)):
            meta = meta.split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if (i % 100000 == 0):
                print('processed %d comments' % (i))
            # if(i == 500000):
            #     break
    social_word_counts = {
        w: d
        for w, d in social_word_counts.iteritems() if w in vocab
    }
    social_word_counts = {
        w: {k: v
            for k, v in d.iteritems() if v >= comment_thresh}
        for w, d in social_word_counts.iteritems()
    }
    social_word_counts = {w: len(d) for w, d in social_word_counts.iteritems()}
    social_word_counts = np.array([
        social_word_counts[v] if v in social_word_counts else 0. for v in vocab
    ])

    # old code for constructing word/social dtm
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word
    # # combine all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
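
# The heart of get_social_word_counts is counting, for each vocabulary word, how many
# distinct social contexts (users / subreddits / threads) used it at least comment_thresh
# times. A toy, self-contained sketch of that reduction on hard-coded data:
from collections import Counter, defaultdict

import numpy as np

comments = ["the cat sat", "the cat ran", "a dog ran"]
social_ids = ["sub_a", "sub_b", "sub_a"]   # one social id per comment
vocab = ["cat", "dog", "fish"]
comment_thresh = 1

social_word_counts = defaultdict(Counter)
for comment, social_id in zip(comments, social_ids):
    for w in comment.split():
        social_word_counts[w][social_id] += 1

unique_counts = np.array([
    sum(1 for c in social_word_counts[w].values() if c >= comment_thresh)
    if w in social_word_counts else 0.
    for w in vocab
])
print(unique_counts)   # -> [2. 1. 0.]: 'cat' seen in two subreddits, 'dog' in one, 'fish' in none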
Beispiel #38
0
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import PunktWordTokenizer
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer
global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names',r'.*\.txt',word_tokenizer=PunktWordTokenizer(),sep="_")  # Linux path
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names',
    r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(),
    sep="_")
name_tags = corpus.tagged_sents()  # The sentences annotated with POS tags.
tagger = UnigramTagger(name_tags)  # A UnigramTagger is trained on these tagged sentences.


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'),
                                     (r'no', 'no_'), (r'not', 'not_'),
                                     (r'RT ', ''), (r'rs[rs]+', 'rs'),
                                     (r'ha[ha]+', 'haha'), (r's[s]+', 'sxs'),
                                     (r'r[r]+', 'rxr'), (r'a[a]+', 'aqa'),
from nltk import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


def getUniqueWords(allWords):
    uniqueWords = []
    for i in allWords:
        if not i in uniqueWords:
            uniqueWords.append(i)
    return uniqueWords


text_str = open('corpus.txt').read()
tokens = WhitespaceTokenizer().tokenize(text_str)
print("\nInitial Statistics of the Corpus.")
print("#token: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
print(freq.most_common(10))

tokens = [token.lower() for token in tokens]
print("\nAfter Case Folding.")
print("#token: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
Beispiel #40
0
class NamedEntity(object):
    def __init__(self):
        self.tokenizer = WhitespaceTokenizer()

    # Remove two or more consecutive proper-noun words from the text.
    def removeName(self, text):
        i = 0
        j = 1
        words = text.split()
        lim = len(words) - 1
        while j <= lim:
            if not words[i].isupper() and not words[i].islower():
                if not words[j].isupper() and not words[j].islower():
                    words[i] = words[i].replace(words[i], "")
                    words[j] = words[j].replace(words[j], "")
            i += 1
            j += 1
        words = ' '.join(words)
        return words

    # Remove proper names from the text. The text is split into words, which are then tagged with POS tags.
    # For each word/tag pair, we check that the tag is not the proper-noun tag 'NPROP'. The result is a text
    # without 'NPROP'-tagged words, which the method returns.
    def removePersonName(self, text):
        final_text = ''
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        for w, t in tagged_text:
            if t != "NPROP":
                final_text = final_text + ''.join(w) + ' '
        return final_text

    # Remove user mentions from tweets; mentions are identified by the '@' character. The original text is
    # split into words, every word that starts with '@' is dropped, and the text is returned without the
    # usernames.
    def removeTwitterUsername(self, text):
        # Build a new list instead of removing items while iterating,
        # which would skip consecutive mentions.
        words = [w for w in text.split() if not w.startswith('@')]
        return ' '.join(words)

    # Tag the words of a tokenized sentence with POS tags. The text is passed to the UnigramTagger's tag
    # method, which marks the words with POS tags. Returns a list of word/tag pairs.
    def tagWords(self, tokenized_text):
        tagged_words = tagger.tag(tokenized_text)
        return tagged_words

    # Draw a tree highlighting a given grammatical pattern in the text.
    def drawNamedEntityTree(self, text):
        tokenized_text = self.tokenizeWords(text)
        tagged_text = self.tagWords(tokenized_text)
        grammar = "ENT: {<PESSOA>*}"
        cp = RegexpParser(grammar)
        res = cp.parse(tagged_text)
        res.draw()

    # Tokenize sentences into words. Returns a list of the words that make up the text.
    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
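
# The NamedEntity class depends on a UnigramTagger trained on a name-tagged corpus (the
# module-level `tagger` from the TaggedCorpusReader example earlier). A toy, self-contained
# sketch of the 'drop NPROP-tagged words' idea, trained on two hand-tagged sentences
# instead of the real names corpus:
from nltk.tag import UnigramTagger

train = [
    [("Maria", "NPROP"), ("viajou", "V"), ("ontem", "ADV")],
    [("Jose", "NPROP"), ("chegou", "V"), ("hoje", "ADV")],
]
toy_tagger = UnigramTagger(train)

text = "Maria chegou ontem"
kept = [w for w, t in toy_tagger.tag(text.split()) if t != "NPROP"]
print(' '.join(kept))   # -> "chegou ontem"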
Beispiel #41
0
 def __init__(self):
     self.tokenizer = WhitespaceTokenizer()
Beispiel #42
0
class TextCleaner(object):
    def __init__(self):
        self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
        self.repl = r'\1\2\3'
        self.tokenizer = WhitespaceTokenizer()
        self.cached_stopwords = stopwords.words('english')
        self.ascii_replace = [
            ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
            ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
            ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
            ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
            ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
            ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
            ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
        ]
        self.link_patterns = [('http'), ('www'), ('w3c')]
        self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                        (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                        (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                        (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                        (r'pqp', 'pp'), (r'dqd', 'dd'), (r'tqt', 'tt'),
                        (r'fqf', 'ff'), (r'lql', 'll')]

    # Collapse consecutively repeated characters so the model is not hurt by inconsistent spelling.
    def removeRepChar(self, word):
        repl_word = self.repeat_regexp.sub(self.repl, word)
        if repl_word != word:
            return self.removeRepChar(repl_word)
        else:
            return repl_word

    # Replace accented characters with their unaccented equivalents.
    def removeAccent(self, text):
        para = text
        for (lat, asc) in self.ascii_replace:
            para = para.replace(lat, asc)
        return para

    # Remove stopwords from the texts.
    def removeStopwords(self, text):
        text = ' '.join([
            word for word in text.split() if word not in self.cached_stopwords
        ])
        return text

    # Remove links from the texts.
    def removeLinks(self, text):
        for l in self.link_patterns:
            text = text.split(l, 1)[0]
        return text

    # Rewrite protected digraphs back to their original form. Example: rxr -> rr
    def normalizeDigraph(self, text):
        for a, d in self.digraph:
            text = re.sub(a, d, text)
        return text

    # Rewrite some words to give the model's output better semantics and readability
    # (note: expects self.normal, a list of (pattern, replacement) pairs, which this class's __init__ does not define).
    def normalizeText(self, text):
        for a, b in self.normal:
            text = re.sub(a, b, text)
        return text

    def removeOneCharacter(self, text):
        text = self.tokenizeWords(text)
        for i in range(len(text)):
            if len(text[i]) <= 2:
                text[i] = ''
        return ' '.join(text)

    def tokenizeWords(self, text):
        text = self.tokenizer.tokenize(text)
        return text
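
# A short usage sketch for the class above, assuming its imports (re, WhitespaceTokenizer,
# nltk stopwords) are in scope and the NLTK 'stopwords' corpus has been downloaded.
# normalizeText() also expects a self.normal list of (pattern, replacement) pairs that this
# variant does not define, so it is not called here.
cleaner = TextCleaner()

raw = "Niiice vídeo with soooo much fun http://example.com thanks"
text = cleaner.removeLinks(raw)                                   # drop the URL and everything after it
text = ' '.join(cleaner.removeRepChar(w) for w in text.split())   # 'Niiice' -> 'Nice', 'soooo' -> 'so'
text = cleaner.removeAccent(text)                                 # 'vídeo' -> 'video'
text = cleaner.removeStopwords(text)                              # drops English stopwords such as 'with', 'so'
print(text)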