def corpus_statistics():
    # train_corpus_path = "/userstore/jieg/credbank/corpus/credbank_train_corpus.txt"
    train_corpus_path = "C:\\Data\\credbank\\tweets_corpus\\shuffled_credbank_held_corpus.txt"
    with open(train_corpus_path, mode='r', encoding='utf-8') as file:
        train_corpus = file.readlines()

    from nltk.tokenize.regexp import WhitespaceTokenizer
    whitespace_tokenize = WhitespaceTokenizer().tokenize

    corpus_size = 0
    for tweet in train_corpus:
        tokens = whitespace_tokenize(tweet)
        corpus_size += len(tokens)
    print("all words (corpus size): ", corpus_size)

    from sklearn.feature_extraction.text import CountVectorizer
    # extract tokens
    text_vectorizer = CountVectorizer(analyzer='word',
                                      tokenizer=WhitespaceTokenizer().tokenize,
                                      ngram_range=(1, 1), min_df=1)
    X = text_vectorizer.fit_transform(train_corpus)

    # Vocabulary
    vocab = list(text_vectorizer.get_feature_names())
    print("vocabulary size: ", len(vocab))  # 913611

    counts = X.sum(axis=0).A1
    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))
    print("top N frequent words: ", freq_distribution.most_common(10))
def evaluateclassifier(self, featureselection):
    positivecount = 0
    negativecount = 0
    negativetweets = []
    positivetweets = []
    # print('Evaluating Classifier')
    print(featureselection)
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv',
              newline='', encoding='latin-1') as f:
        # print('Opening corpus file')
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if row[0] == '4' and positivecount < self.corpuslength:
                positivetweets.append(row[5])
                positivecount += 1
            # Negative sentiment tweets
            if row[0] == '0' and negativecount < self.corpuslength:
                negativetweets.append(row[5])
                negativecount += 1

    # print('Generating Features')
    self.positivefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos')
                             for tweet in positivetweets]
    self.negativefeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg')
                             for tweet in negativetweets]

    poscutoff = len(self.positivefeatures)
    negcutoff = len(self.negativefeatures)
    print("Train Pos Cutoff: " + str(poscutoff) + " Train Neg Cutoff: " + str(negcutoff))
    trainfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
    testfeats = self.test(featureselection)
    # testfeats = self.positivefeatures[:poscutoff] + self.negativefeatures[:negcutoff]
    print('Train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats))
    # classifier.show_most_informative_features(20)

    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        # print(label, observed)
        testsets[observed].add(i)

    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos']))
    print('pos F-measure:', nltk.metrics.f_measure(refsets['pos'], testsets['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg']))
    print('neg F-measure:', nltk.metrics.f_measure(refsets['neg'], testsets['neg']))
def main():
    text = read_doc()
    text = [unescape(sent) for sent in text]

    from nltk.tokenize.regexp import WhitespaceTokenizer
    ws_tokenizer = WhitespaceTokenizer()
    text = [ws_tokenizer.tokenize(sent) for sent in text if len(sent) > 0]
    text = [[token.lower() for token in sent] for sent in text]
    text = [[''.join(ch for ch in token if ch.isalpha() or ch == '\'') for token in sent]
            for sent in text]
    text = [[token for token in sent if 2 <= len(token) <= 35] for sent in text]

    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    text = [[token for token in sent if token not in stopwords] for sent in text]

    from nltk.stem.snowball import SnowballStemmer
    stemmer = SnowballStemmer("english")
    text = [[stemmer.stem(token) for token in sent] for sent in text]

    from sklearn.feature_extraction.text import CountVectorizer
    vect = CountVectorizer(min_df=20, analyzer=lambda x: x)
    X = vect.fit_transform(text)
    # print(X.toarray())
    feature_names = vect.get_feature_names()
    # print(feature_names)

    from collections import Counter
    try:
        # Python 2
        from itertools import izip
    except ImportError:
        # Python 3
        izip = zip
    wfd = Counter({key: value for (key, value) in izip(range(X.shape[1]), X.getnnz(0))})

    from itertools import combinations, chain
    bfd = Counter(chain.from_iterable(
        [combinations(sorted(segment.tocoo().col), 2) for segment in X]))

    N_seg = len(text)
    scores = [(mutinf(bfd[tup], wfd[tup[0]], wfd[tup[1]], N_seg), tup) for tup in bfd]
    print([(tup[0], feature_names[tup[1][0]], feature_names[tup[1][1]])
           for tup in sorted(scores, reverse=True)[:20]])
def __init__(self, data_iterator, tokenizer=WhitespaceTokenizer(), char_map=None,
             word_len=30, sent_len=200):
    '''
    DESCRIPTION:
        This class converts text to numbers over the standard Unicode vocabulary.

    PARAMS:
        data_iterator (iterator): iterator over the text strings.
        tokenizer: tokenizer used to split each string into words.
        word_len (int): maximum length of a word; any shorter word is padded
            with zeros, any longer word is truncated at word_len.
        sent_len (int): maximum number of words in a sentence; any shorter
            sentence is padded with zeros, any longer sentence is truncated
            at sent_len.
        char_map (dict): a dictionary for mapping characters to numbers.
    '''
    self.data_iterator = data_iterator
    self.word_len = word_len
    self.sent_len = sent_len
    self.char_map = char_map
    self.tokenizer = tokenizer
    self.char_zero = ' '  # character assigned the zero (padding) index
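# Editor's sketch (not part of the original class): one hedged way to build the
# char_map argument expected above, reserving index 0 for the padding character.
# The helper name build_char_map and the class name CharEncoder are hypothetical.
def build_char_map(lines, pad_char=' '):
    char_map = {pad_char: 0}
    for line in lines:
        for ch in line:
            if ch not in char_map:
                char_map[ch] = len(char_map)
    return char_map

# Hypothetical usage, assuming the class above is named CharEncoder:
# encoder = CharEncoder(iter(corpus_lines), char_map=build_char_map(corpus_lines),
#                       word_len=30, sent_len=200)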
def __init__(self, use_unicode=True):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.pt_stemmer = nltk.stem.RSLPStemmer()
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('portuguese')
    self.symbols = [
        u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
        u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`", u"``",
        u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&", u"*",
        u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b", u"\u2019",
        u"\u2018", u"\u00b0", u"\u30fb", u"\u00ba", u"\u200b", u"\u00b7",
        u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
        u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
        u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
    ]
    self.more_stopwords = [
        'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
        'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
        'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
        'vai', 'olha', 'pois', 'rt', 'retweeted', 'fica', 'muito', 'muita',
        'muitos', 'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
    ]
    if use_unicode:
        self.accents = unicode_replace
    else:
        self.accents = ascii_replace
    self.link_patterns = ['http', 'www', 'w3c', 'https']
    self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                   (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                   (r'\s*saudad\s*', ' saudade ')]
    self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                    (r'eqe', 'ee'), (r'oqo', 'oo')]
def __init__(self, use_unicode):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('english')
    self.symbols = [
        u"\"", u"'", u"!", u"?", u".", u",", u";", u">", u"_", u"<", u"-",
        u"[", u"]", u"{", u"}", u"/", u"\\", u"^", u"~", u"´", u"`", u"``",
        u"\u2026", u":", u"(", u")", u"|", u"#", u"$", u"%", u"&", u"*",
        u"=", u"+", u"\u2013", u"\u201c", u"\u201d", u"\u300b\u300b",
        u"\u2019", u"\u2018", u"\u00b0", u"\u00ba", u"\u200b", u"\u00b7",
        u"\u2014", u"\u00bb", u"\u221a", u"\u00aa", u"\ufe0f", u"\u2794",
        u"\u2192", u"\u00a8", u"\u2022", u"\u300a", u"\u00bf", u"\u25a0",
        u"\u00af", u"\u22b3", u"\u2060", u"\u261b", u"\u00ad", u"\u00ab"
    ]
    if use_unicode:
        self.accents = unicode_replace
    else:
        self.accents = ascii_replace
    self.link_patterns = ['http', 'www', 'w3c']
    self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                    (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                    (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                    (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                    (r'pqp', 'pp'), (r'tqt', 'tt'), (r'lql', 'll')]
def get_ngram_counts(comment_iter, n, tokenizer=None, sample_pct=100):
    """
    Compute ngram counts from comments.

    Parameters:
    -----------
    comment_iter : generator
    n : int
    tokenizer : nltk.tokenize.api.TokenizerI
    sample_pct : float
        Optional percentage of comments to subsample.

    Returns:
    --------
    counts : pandas.DataFrame
        Rows = ngrams, single column = counts.
    """
    if tokenizer is None:
        tokenizer = WhitespaceTokenizer()
    counts = Counter()
    for i, c in enumerate(comment_iter):
        if sample_pct == 100 or random.random() * 100 < sample_pct:
            ngrams = ngram_split(c, n, tokenizer)
            for ngram in ngrams:
                # join each ngram tuple into a single space-separated key
                counts.update([' '.join(ngram)])
        if i % 1000000 == 0:
            print('got %d unique ngrams' % (len(counts)))
    # convert to dataframe
    counts = pd.DataFrame(pd.Series(counts))
    return counts
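# Editor's sketch: a hedged example of calling get_ngram_counts on an in-memory
# list of comments. It assumes the module's ngram_split(comment, n, tokenizer)
# helper and its pandas/random imports are available alongside the function above.
comments = ["the cat sat on the mat", "the cat slept"]
bigram_counts = get_ngram_counts(iter(comments), n=2)
# the resulting DataFrame has one integer-labelled column of counts
print(bigram_counts.sort_values(by=0, ascending=False).head())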
def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.pt_stemmer = nltk.stem.RSLPStemmer()
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('portuguese')
    self.more_stopwords = [
        'ja', 'q', 'd', 'ai', 'desse', 'dessa', 'disso', 'nesse', 'nessa',
        'nisso', 'esse', 'essa', 'isso', 'so', 'mt', 'vc', 'voce', 'ne',
        'ta', 'to', 'pq', 'cade', 'kd', 'la', 'e', 'eh', 'dai', 'pra',
        'vai', 'olha', 'pois', 'fica', 'muito', 'muita', 'muitos',
        'muitas', 'onde', 'mim', 'oi', 'ola', 'ate'
    ]
    self.ascii_replace = [
        ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
        ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
        ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
        ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
        ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
        ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
        ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
    ]
    self.link_patterns = ['http', 'www', 'w3c']
    self.normal = [(r'kxkxk', 'kkk'), (r'nao ', ' nao_'), (r' ir ', '_ir '),
                   (r'bom demal', ' bomdemais '), (r'\s*insan\s*', ' insano '),
                   (r'\s*saudad\s*', ' saudade ')]
    self.digraph = [(r'rxr', 'rr'), (r'sxs', 'ss'), (r'aqa', 'aa'),
                    (r'eqe', 'ee'), (r'oqo', 'oo')]
def __chunk_sentence(self, sentence):
    """Tokenize the sentence into words with a whitespace tokenizer, so that
    contractions such as "couldn't" are not split into two tokens (could + n't).
    Then chunk the POS-tagged tokens according to GRAMMAR.
    """
    tokenizer = WhitespaceTokenizer()
    tokens = tokenizer.tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokens)
    return self.parser.parse(pos_tagged)
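# Editor's sketch: GRAMMAR and self.parser are defined elsewhere in the original
# class; the noun-phrase grammar below is a hypothetical stand-in showing how the
# pieces fit together (requires the punkt/averaged_perceptron_tagger NLTK data).
import nltk
from nltk import RegexpParser
from nltk.tokenize.regexp import WhitespaceTokenizer

GRAMMAR = r"NP: {<DT>?<JJ>*<NN.*>+}"  # hypothetical, not the original grammar
parser = RegexpParser(GRAMMAR)
tokens = WhitespaceTokenizer().tokenize("the old parser couldn't handle contractions")
print(parser.parse(nltk.pos_tag(tokens)))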
def build_topn_best_words(self):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    positivecount = 0
    negativecount = 0
    with open(r'..\polarityData\TweetCorpus\training.1600000.processed.noemoticon.csv',
              newline='', encoding='latin-1') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if row[0] == '4' and positivecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                # print(tweet)
                for token in tokens:
                    word_fd[token.lower()] += 1
                    label_word_fd['pos'][token.lower()] += 1
                positivecount += 1
            # Negative sentiment tweets
            if row[0] == '0' and negativecount < self.corpuslength:
                tweet = row[5]
                tokens = WhitespaceTokenizer().tokenize(tweet)
                # print(tweet)
                for token in tokens:
                    word_fd[token.lower()] += 1
                    label_word_fd['neg'][token.lower()] += 1
                negativecount += 1
    # print(word_fd)
    # print(label_word_fd)

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    print("Positive Word Count:", pos_word_count,
          "Negative Word Count:", neg_word_count,
          "Total Word count:", total_word_count)

    # chi_sq(n_ii, (n_ix, n_xi), n_xx) scores the association between a word and a
    # class: n_ii is the word's count in that class, n_ix its total count, n_xi the
    # class's word total, and n_xx the total word count.
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda ws: ws[1], reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    print('Best Words Count:', len(self.bestwords))  # , 'Best Words Set:', self.bestwords
def test(self, featureselection):
    positiveTweets = []
    negativeTweets = []
    with open(r'..\polarityData\TweetCorpus\testdata.manual.2009.06.14.csv',
              newline='', encoding='latin-1') as f:
        reader = csv.reader(f)
        for row in reader:
            # Positive sentiment tweets
            if row[0] == '4':
                positiveTweets.append(utils.common.processTweetBlank(row[5]))
            # Negative sentiment tweets
            if row[0] == '0':
                negativeTweets.append(utils.common.processTweetBlank(row[5]))

    positiveTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'pos')
                            for tweet in positiveTweets]
    negativeTestFeatures = [(featureselection(WhitespaceTokenizer().tokenize(tweet)), 'neg')
                            for tweet in negativeTweets]

    poscutoff = len(positiveTestFeatures)
    negcutoff = len(negativeTestFeatures)
    print("Test Pos Cutoff: " + str(poscutoff) + " Test Neg Cutoff: " + str(negcutoff))

    testfeatures = positiveTestFeatures[:poscutoff] + negativeTestFeatures[:negcutoff]
    # print(testfeatures)
    return testfeatures
def __init__(self, tokenizer=WhitespaceTokenizer(), sent_len=200):
    self.sent_len = sent_len
    self.tokenizer = tokenizer
    self.w2v_dim = 300
    this_dir = os.path.dirname(os.path.realpath(__file__))
    model_dir = this_dir + '/model'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    pretrained_path = model_dir + '/GoogleNews-vectors-negative300.bin.gz'
    if not os.path.exists(pretrained_path):
        raise Exception('pretrained vector file does not exist: {}'.format(pretrained_path))
    print('..loading model')
    self.model = gensim.models.KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
def process(self, text):
    """
    Preprocessing: sentence tokenization and duplicate removal.
    Returns a list of cleaned sentences (for the vector-based method, planned for later).

    Args:
        text ([type]): [description]
    """
    # text = text.lower()
    # remove numbers, e-mail addresses and hyperlinks
    # text = text.encode('utf-8')
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into sentences
    sentence_tokenizer = PunktSentenceTokenizer()
    text = sentence_tokenizer.tokenize(text)

    cleaned_text = []
    stop_words = set(stopwords.words('russian'))

    # split into words, strip remaining punctuation and stopwords
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')
    for sentence in text:
        punct_cleaned_sent = clear_endings(sentence)  # strip sentence-final punctuation
        tokenized_sent = tokenizer.tokenize(punct_cleaned_sent)  # word split, for cleaning only
        stpw_clean_sentence = [word for word in tokenized_sent if word not in stop_words]
        stemmed_sentence = [stemmer.stem(word) for word in stpw_clean_sentence]  # reduce to stems
        clean_sentence = ' '.join(stemmed_sentence)  # re-join into a sentence string for hashing
        cleaned_text.append(clean_sentence)
    return cleaned_text
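# Editor's sketch: the clear_* helpers used above are defined elsewhere in the
# original project; these regex-based versions are hypothetical stand-ins shown
# only to make the preprocessing pipeline legible. clear_symb and clear_endings
# (stray-symbol and sentence-final punctuation stripping) are assumed analogous.
import re

def clear_emails(text):
    return re.sub(r'\S+@\S+', ' ', text)

def clear_url(text):
    return re.sub(r'(https?://|www\.)\S+', ' ', text)

def clear_digits(text):
    return re.sub(r'\d+', ' ', text)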
def get_sentences_for_text(corpus_root, filename, lang='english'):
    """Segments the given text into sentences.

    Args:
        corpus_root: Directory in which the text file resides.
        filename: Name of the text file.
        lang: Tokenizer language. For possible values, look at:
            ${NLTK_DATA}/tokenizers/punkt

    Returns:
        Sentences in the given text.
    """
    tokenizer_path = 'tokenizers/punkt/' + lang + '.pickle'
    text = PlaintextCorpusReader(
        corpus_root, [filename],
        word_tokenizer=WhitespaceTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader(tokenizer_path))
    return text.sents()
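# Editor's sketch: hypothetical usage of get_sentences_for_text; the directory and
# file name below are placeholders, and the punkt data must already be downloaded
# (nltk.download('punkt')).
# sents = get_sentences_for_text('/path/to/corpus', 'article.txt', lang='english')
# for sent in sents[:3]:
#     print(sent)  # each sentence is a list of whitespace-delimited tokens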
def analyize(self, text):
    try:
        unitext = any2unicode(text, encoding='utf8', errors='strict')
    except Exception:
        print("Not utf-8")
        return []
    # convert to lower case
    lowerText = unitext.lower()
    # whitespace tokenization splits 'qwe (x)' into 'qwe' and '(x)';
    # it does not strip punctuation or digits
    tokenizer = WhitespaceTokenizer()
    regexTokens = tokenizer.tokenize(lowerText)
    p_stemmer = PorterStemmer()
    stemmedTokens = [p_stemmer.stem(i) for i in regexTokens]
    # drop single-letter tokens
    stemmedRemSingleLetterTokens = [w for w in stemmedTokens if len(w) > 1]
    return stemmedRemSingleLetterTokens
def process(self, text, plain_text=False):
    """
    Preprocessing: word tokenization and duplicate removal.
    Returns either plain text (for the shingling method) or a list of tokens.

    Args:
        text ([type]): [description]
    """
    # text = text.encode('utf-8')
    # remove numbers, e-mail addresses and hyperlinks
    text = clear_emails(text)
    text = clear_url(text)
    text = clear_digits(text)
    text = clear_symb(text)

    # split into words, strip remaining punctuation and stopwords
    stop_words = set(stopwords.words('russian'))
    tokenizer = WhitespaceTokenizer()
    stemmer = SnowballStemmer('russian')

    punct_cleaned_text = clear_endings(text)  # strip sentence-final punctuation
    tokenized_text = tokenizer.tokenize(punct_cleaned_text)  # word split, for cleaning only
    stpw_clean_text = [word for word in tokenized_text if word not in stop_words]
    stemmed_text = [stemmer.stem(word) for word in stpw_clean_text]  # reduce to stems

    if plain_text:
        clean_text = ' '.join(stemmed_text)  # re-join into a single string for hashing
    else:
        clean_text = stemmed_text  # otherwise return the token list
    return clean_text
def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'
    self.tokenizer = WhitespaceTokenizer()
    self.cached_stopwords = stopwords.words('english')
    self.ascii_replace = [
        ('á', 'a'), ('à', 'a'), ('ã', 'a'), ('â', 'a'), ('é', 'e'),
        ('è', 'e'), ('ê', 'e'), ('í', 'i'), ('ó', 'o'), ('ò', 'o'),
        ('ô', 'o'), ('õ', 'o'), ('ú', 'u'), ('ç', 'c'), ('ä', 'a'),
        ('ë', 'e'), ('ï', 'i'), ('ö', 'o'), ('ü', 'u'), ('Á', 'a'),
        ('À', 'a'), ('Ã', 'a'), ('Â', 'a'), ('É', 'e'), ('È', 'e'),
        ('Ê', 'e'), ('Í', 'i'), ('Ó', 'o'), ('Ò', 'o'), ('Ô', 'o'),
        ('Õ', 'o'), ('Ú', 'u'), ('Ç', 'c')
    ]
    self.link_patterns = ['http', 'www', 'w3c']
    self.digraph = [(r'hash', '#'), (r'rxr', 'rr'), (r'sxs', 'ss'),
                    (r'aqa', 'aa'), (r'eqe', 'ee'), (r'oqo', 'oo'),
                    (r'fqf', 'ff'), (r'gqg', 'gg'), (r'cqc', 'cc'),
                    (r'dqd', 'dd'), (r'mqm', 'mm'), (r'nqn', 'nn'),
                    (r'pqp', 'pp'), (r'tqt', 'tt'), (r'lql', 'll')]
from argparse import ArgumentParser
from collections import OrderedDict
from textblob import TextBlob
from nltk.util import bigrams
from multiprocessing import Pool
from traceback import format_exc
from nltk.stem.snowball import EnglishStemmer
from nltk.tokenize.regexp import WhitespaceTokenizer
from nltk.corpus import stopwords
from boto import connect_s3
import requests
import codecs
import traceback

stemmer = EnglishStemmer()
tokenizer = WhitespaceTokenizer()
stops = stopwords.words(u'english')


def get_args():
    ap = ArgumentParser()
    ap.add_argument(u'--num-processes', dest=u"num_processes", default=8, type=int)
    ap.add_argument(u'--solr-host', dest=u"solr_host", default=u"http://search-s10:8983")
    ap.add_argument(u'--outfile', dest=u'outfile', default=u'wiki_data.csv')
    ap.add_argument(u'--s3dest', dest=u's3dest')
    return ap.parse_args()
def get_social_word_counts(social_var, vocab, comment_file, meta_file, comment_thresh=10):
    """
    Compute the number of unique social-variable values per word in vocab
    over all comments.

    Parameters:
    -----------
    social_var : str
    vocab : [str]
        Vocabulary to count.
    comment_file : str
    meta_file : str
        Tab-separated metadata file containing comment date, author,
        thread ID, and subreddit.
    comment_thresh : int
        Minimum number of comments for a social-variable value to be counted.

    Returns:
    --------
    social_word_counts : numpy.array
    """
    # indices in the meta file corresponding to social vars
    social_var_indices = {'user': 1, 'subreddit': 3, 'thread': 2}
    social_txt = defaultdict(list)
    tokenizer = WhitespaceTokenizer()
    stopwords = get_default_stopwords()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8', lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         vocabulary=vocab,
                         binary=True)
    # keep it simple and store {word : {social id : count}}
    social_word_counts = defaultdict(Counter)
    with BZ2File(comment_file, 'r') as comments, BZ2File(meta_file, 'r') as metas:
        for i, (comment, meta) in enumerate(zip(comments, metas)):
            meta = meta.split('\t')
            social_id = meta[social_var_indices[social_var]]
            # print('got social id %s'%(social_id))
            # social_txt[social_id].append(comment)
            for w in tokenizer.tokenize(comment):
                social_word_counts[w][social_id] += 1
            if i % 100000 == 0:
                print('processed %d comments' % (i))
            # if(i == 500000):
            #     break
    social_word_counts = {w: d for w, d in social_word_counts.items() if w in vocab}
    social_word_counts = {w: {k: v for k, v in d.items() if v >= comment_thresh}
                          for w, d in social_word_counts.items()}
    social_word_counts = {w: len(d) for w, d in social_word_counts.items()}
    social_word_counts = np.array([social_word_counts[v] if v in social_word_counts else 0.
                                   for v in vocab])
    # old code for constructing the word/social dtm
    # restrict to consistent users??
    # social_txt = {k : v for k,v in social_txt.items()
    #               if len(v) >= comment_thresh}
    # # now convert to DTM
    # def get_txt_iter(social_txt):
    #     N = len(social_txt)
    #     for i, v in enumerate(social_txt.itervalues()):
    #         if(i % 1000 == 0):
    #             print('processed %d/%d social vars'%(i, N))
    #         yield ' '.join(v)
    # txt_iter = get_txt_iter(social_txt)
    # # txt_iter = (' '.join(v) for v in social_txt.values())
    # dtm = cv.fit_transform(txt_iter)
    # print('got %s dtm %s'%(social_var, dtm))
    # # save sparse matrix
    # # all_social_vals = social_txt.keys()
    # # vocab = sorted(cv.vocabulary_, key=lambda x: cv.vocabulary_[x])
    # # comment_date = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
    # # write_full_social_dtm(dtm, all_social_vals, vocab, comment_date, social_var)
    # # save unique social count for each word, combining all counts per word
    # social_word_counts = np.array(dtm.sum(axis=0)).flatten()
    return social_word_counts
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--out_dir', default='../../data/frequency')
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--n', type=int, default=2)
    parser.add_argument('--file_suffix', default=None)
    parser.add_argument('--sample_pct', type=float, default=100)
    args = parser.parse_args()
    out_dir = args.out_dir
    comment_files = args.comment_files
    n = args.n
    file_suffix = args.file_suffix
    sample_pct = args.sample_pct
    if comment_files is None:
        comment_files = get_all_comment_files()
    # replace with clean normalized files (smaller vocab)
    comment_files = [f.replace('.bz2', '_clean_normalized.bz2') for f in comment_files]
    # start small
    # comment_files = comment_files[:1]

    # min_df = 5
    # min_tf = 10
    min_tf = 1
    stopwords = []
    tokenizer = WhitespaceTokenizer()
    # breaking memory
    # ngram_range = (1,3)
    # ngram_range = (2,3)
    # ngram_range = (2,2)
    # ngram_range = (1,1)
    # no CountVectorizer because of memory, and we don't need cooccurrence anyway
    # cv = CountVectorizer(min_df=min_df, tokenizer=tokenizer.tokenize,
    #                      stop_words=stopwords, ngram_range=ngram_range)
    date_format = '201[0-9]-[0-9]+'
    for f in comment_files:
        print('processing file %s' % (f))
        date_str = re.findall(date_format, f)[0]
        # for each level of ngram, recompute counts
        # for n in range(ngram_range[0], ngram_range[1]+1):
        print('computing ngram = %d' % (n))
        with BZ2File(f, 'r') as comment_file:
            # generating the full DTM takes too long, so just compute counts
            comment_iter = make_iter(comment_file)
            counts = get_ngram_counts(comment_iter, n, tokenizer=tokenizer,
                                      sample_pct=sample_pct)
            # limit minimum frequency
            counts = counts[counts >= min_tf]
            counts.columns = [date_str]
            # write to file (takes a lot of space => compress?)
            if file_suffix is not None:
                out_fname = os.path.join(out_dir, '%s_%dgram_tf_%s.tsv' %
                                         (date_str, n, file_suffix))
            else:
                out_fname = os.path.join(out_dir, '%s_%dgram_tf.tsv' % (date_str, n))
            counts.to_csv(out_fname, sep='\t')
def classify(self, text):
    return self.classifier.classify(WhitespaceTokenizer().tokenize(text))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--comment_files', nargs='+', default=None)
    parser.add_argument('--out_dir', default='../../data/frequency/')
    parser.add_argument('--social_vars', nargs='+',
                        # default=['user', 'thread', 'subreddit'])
                        # default=['user'])
                        # default=['thread'])
                        default=['subreddit'])
    args = parser.parse_args()
    comment_files = args.comment_files
    out_dir = args.out_dir
    social_vars = args.social_vars
    if comment_files is None:
        data_dir = '/mnt/new_hg190/corpora/reddit_comment_data/monthly_submission/'
        years = ['2015', '2016']
        comment_files = get_all_comment_files(data_dir, years)
    print('comment files %s' % (str(comment_files)))
    # we actually want the clean normalized files
    comment_files = [f.replace('.bz2', '_normalized.bz2') for f in comment_files]
    meta_files = [f.replace('.bz2', '_meta.bz2') for f in comment_files]
    # print('got meta files %s'%(meta_files))

    # TODO: start small, eventually move to the rest of the files
    # comment_files = comment_files[3:]
    # comment_files = comment_files[1:]
    # for testing
    # social_vars = social_vars[:1]
    vocab = get_default_vocab()
    # chunk_size = 1000
    # chunk_size = 5000
    # chunk_size = len(vocab)
    # chunks = int(len(vocab) / chunk_size)
    # vocab_chunks = [vocab[i*chunk_size:i*chunk_size+chunk_size]
    #                 for i in xrange(chunks)]
    # start small
    # top_vocab = 1000
    top_vocab = 100000
    stopwords = get_default_stopwords()
    # comments are already whitespace separated, so a whitespace tokenizer suffices
    tokenizer = WhitespaceTokenizer()
    ngram_range = (1, 1)
    min_df = 1
    cv = CountVectorizer(encoding='utf-8', lowercase=True,
                         tokenizer=tokenizer.tokenize,
                         stop_words=stopwords,
                         ngram_range=ngram_range,
                         min_df=min_df,
                         # max_features=top_vocab,
                         vocabulary=vocab,
                         # binarize to save space because we only care about cooccurrence
                         binary=True)
    out_dir = args.out_dir
    # minimum number of comments within a social value for it to count
    # social_comment_thresh = 10
    social_comment_thresh = 1
    for comment_file, meta_file in zip(comment_files, meta_files):
        print('processing comment file %s and meta file %s' % (comment_file, meta_file))
        date_str = re.findall(r'201[0-9]-[0-9]+', comment_file)[0]
        for social_var in social_vars:
            # used for the full dtm
            # out_fname = os.path.join(out_dir, '%s_%s_dtm'%(date_str, social_var))
            out_fname = os.path.join(out_dir, '%s_%s_unique.tsv' % (date_str, social_var))
            # for each vocab chunk in the list, get unique social counts
            # for vocab in vocab_chunks:
            print('got vocab size %d' % (len(vocab)))
            social_word_counts = get_social_word_counts(social_var, vocab,
                                                        comment_file, meta_file,
                                                        comment_thresh=social_comment_thresh)
            # write to file
            social_word_counts = pd.DataFrame(social_word_counts, index=vocab)
            social_word_counts.to_csv(out_fname, sep='\t', header=False)
import nltk
from nltk import *
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer


def getUniqueWords(allWords):
    uniqueWords = []
    for i in allWords:
        if i not in uniqueWords:
            uniqueWords.append(i)
    return uniqueWords


text_str = open('corpus.txt').read()
tokens = WhitespaceTokenizer().tokenize(text_str)

print("\nInitial Statistics of the Corpus.")
print("#tokens: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
print(freq.most_common(10))

tokens = [token.lower() for token in tokens]
print("\nAfter Case Folding.")
print("#tokens: " + str(len(tokens)))
print("#types: " + str(len(getUniqueWords(tokens))))

print("\nThe Top-10 Frequent Tokens.")
freq = nltk.FreqDist(tokens)
# -*- coding: latin-1 -*-
import re
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus.reader import TaggedCorpusReader
# from nltk.tokenize import PunktWordTokenizer  # only needed for the commented-out reader below
from nltk import RegexpParser
from nltk.corpus import stopwords
from nltk.tokenize.regexp import WhitespaceTokenizer

global corpus, sent_tags, tagger

# corpus = TaggedCorpusReader('/root/adail/python/names', r'.*\.txt',
#                             word_tokenizer=PunktWordTokenizer(), sep="_")  # path on Linux
corpus = TaggedCorpusReader(
    'C:/Users/jose.adail/workspace/TextProcessor/names', r'.*\.txt',
    word_tokenizer=WhitespaceTokenizer(), sep="_")
name_tags = corpus.tagged_sents()  # sentences annotated with POS tags
tagger = UnigramTagger(name_tags)  # the UnigramTagger is trained on these tagged sentences


class RegexpReplacer(object):
    def __init__(self):
        self.replacement_patterns = [(r"'", ''), (r'#', 'hash'), (r'no', 'no_'),
                                     (r'not', 'not_'), (r'RT ', ''),
                                     (r'rs[rs]+', 'rs'), (r'ha[ha]+', 'haha'),
                                     (r's[s]+', 'sxs'), (r'r[r]+', 'rxr'),
                                     (r'a[a]+', 'aqa'),
def __init__(self):
    self.tokenizer = WhitespaceTokenizer()