def statsText(text, words):
    fdist = FreqDist()  # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist[word.lower()] += 1
    # Loop over the words in fdist and see if you can find those words in the
    # wordslist keys. Since some words in the wordslist also have a wildcard *
    # at the end to denote anything after the initial word, we use a regex to
    # match those rather than matching on equality; e.g. wrong* will match
    # wrong, wrongful, wrongfully, wronged, etc.
    frequencies = []
    for word in words:
        if '*' in word:
            # if the word has * we need to compare it with each item in fdist
            wordRegEx = word.replace('*', '.*')  # make it suitable for a regular expression
            for k in fdist:
                m = re.match(wordRegEx, k)
                if m:
                    frequencies.append((word, fdist.freq(m.group())))
        else:
            frequencies.append((word, fdist.freq(word)))
    return frequencies
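# Illustrative usage sketch for the statsText helper above (not part of the
# original snippet). The sample text and word list are made up; it assumes
# `re`, FreqDist and word_tokenize are imported as in the function body.
sample_text = "He was wronged, and the wrongful act was never punished."
sample_words = ["wrong*", "act", "justice"]
# Each result pairs a query word with its relative frequency in the text;
# a wildcard entry like "wrong*" may contribute one pair per matching token.
for query, rel_freq in statsText(sample_text, sample_words):
    print(query, rel_freq)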
def add_documents(self, document_entries):
    """
    Add new documents to be indexed.
    :param document_entries: a set of objects from the class DocumentEntry
    """
    if document_entries:
        forward = {
            key: {(document_entries[key][0])}
            for key in document_entries.keys()
        }
        for key in document_entries.keys():
            freq_dist = FreqDist(document_entries[key][1])
            for token in document_entries[key][1]:
                if len(self.inverted_index) == 0 \
                        or self.__normalize(token) not in self.inverted_index.keys():
                    self.inverted_index[self.__normalize(token)] \
                        .add((freq_dist.get(token), key, freq_dist.freq(token)))
                else:
                    if freq_dist.get(token) is not None:
                        self.inverted_index[self.__normalize(token)] \
                            .add((freq_dist.get(token), key, freq_dist.freq(token)))
        self.forward_index.update(forward)
        self.tf_idf()
        self.dal.save(self.forward_index, 'forward_index.csv')
        self.dal.save(self.inverted_index, 'inverted_index.csv')
def text_to_dict(docs, metric):
    """ Create dictionaries of term frequencies based on documents

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    num_docs = len(docs)
    # Build dictionaries
    dicts = []
    for i, fd in enumerate(tf_dists):
        if i % 100 == 0:
            print ' dict', str(i) + '/' + str(len(tf_dists))
        d = {}
        if metric == FrequencyMetrics.TF:
            for word in fd.samples():
                d[word] = fd.freq(word)
        elif metric == FrequencyMetrics.TF_IDF:
            for word in fd.samples():
                d[word] = fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
        else:
            raise ValueError("No such feature type: %s" % metric)
        dicts.append(d)
    return dicts
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist()  # Distribution over how many documents each word appears in.
    tf_dists = []  # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc:
            fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)

    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)

    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs) / doc_freqs[word])
                 for word in all_tokens]
        else:
            raise ValueError("No such feature type: %s" % metric)
        matrix[:, i] = v
    return matrix
def get_ngram_features(tokens):
    """
    This function creates the unigram, bigram and trigram features as described
    in the assignment3 handout.
    :param tokens:
    :return: feature_vectors: a dictionary of values for each ngram feature
    """
    feature_vectors = {}
    unigrams = ngrams(tokens, 1)
    bigrams = ngrams(tokens, 2)
    trigrams = ngrams(tokens, 3)

    unigram_dist = FreqDist(word for word in unigrams)
    bigram_dist = FreqDist(word for word in bigrams)
    trigram_dist = FreqDist(word for word in trigrams)

    for item in unigram_dist:
        itemd = f'UNI_{item}'
        feature_vectors[itemd] = unigram_dist.freq(item)

    for item in bigram_dist:
        itemd = f'BIGRAM_{item}'
        feature_vectors[itemd] = bigram_dist.freq(item)

    for item in trigram_dist:
        itemd = f'TRIGRAM_{item}'
        feature_vectors[itemd] = trigram_dist.freq(item)

    return feature_vectors
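# Hypothetical mini-example of the get_ngram_features output shape (assumes
# `from nltk.util import ngrams` and `from nltk import FreqDist`): with the
# made-up tokens below, the unigram ('good',) gets relative frequency 2/3.
features = get_ngram_features(["good", "good", "movie"])
print(features["UNI_('good',)"])  # 0.666...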
def calculaEntropia(documento):
    freq_dist = FreqDist()
    corpus = Token(TEXT=open(documento).read())
    WhitespaceTokenizer().tokenize(corpus)
    for token in corpus['SUBTOKENS']:
        freq_dist.inc(token['TEXT'])
    entropia = 0
    for i in freq_dist.samples():
        entropia = entropia + (freq_dist.freq(i) * log(freq_dist.freq(i), 2))
    return -entropia
def fun14():
    """counting other things"""
    # print [len(w) for w in text1]
    fdist1 = FreqDist([len(w) for w in text1])
    # print fdist1.keys()
    # print fdist1.items()
    # word length 3 => 50223
    print fdist1[3]
    print fdist1.max()
    # frequency 20%
    print fdist1.freq(3)
def get_best_answers(self, passage_list, q):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tAnswer Processing", q.id_q)

    empty = passage_list == []

    logger.info("%s:\t\tAnswer Extraction", q.id_q)
    answer_list = []
    for passage in passage_list:
        a = passage.find_answer(q)
        if a.is_successful():
            answer_list.append(a)
    if not answer_list:
        return ([], empty)

    logger.info("%s:\t\tAnswer Filtering", q.id_q)

    # Obtain answer frequency
    fd = FreqDist(answer_list)

    # Normalize frequencies
    normalize = fd.freq(fd.max())

    # Modify scores by frequency
    for answer in answer_list:
        answer.score = int(answer.score * (fd.freq(answer) / normalize))

    # Sort answers by score
    answer_list.sort(key=lambda x: x.score, reverse=True)

    # Filter bad answers
    try:
        threshold = int(MyConfig.get("answer_filtering", "threshold"))
    except:
        logger = logging.getLogger("qa_logger")
        logger.error("answer quality threshold not found")
        threshold = 50
    answer_list = filter(lambda x: x.score > threshold, answer_list)

    final_answers = []
    for a in answer_list:
        if a not in final_answers:
            final_answers.append(a)
        if len(final_answers) == 3:
            break

    return (final_answers, empty)
class termBasedConsiderBackgroundModel(AbstractGenerativeModel):
    def __init__(self, analyser, backgroundDistribution, probOfBackgroundModel):
        self.backGroundDistro = backgroundDistribution
        self.analyser = analyser
        self.ProbBackground = probOfBackgroundModel

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

        foreground_prob = 1 - self.ProbBackground
        prob_distro = {}
        backDistro = FreqDist()
        for word in self.freqDist.keys():
            backDistro[word] = self.backGroundDistro[word]
        for word in self.freqDist.keys():
            if word not in self.backGroundDistro.keys():
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word))
            else:
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word) -
                    (self.ProbBackground * backDistro.freq(word)))
        self.prob_distro = prob_distro

        flag = True
        for key in prob_distro:
            if prob_distro[key] < 0 or prob_distro[key] > 1:
                flag = False
                break
        return flag

    def getProbabilityDistribution(self):
        return self.prob_distro

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.prob_distro:
                prob *= self.prob_distro[token]
        if prob == 1.0:
            return 0.0
        return prob
class termBased(AbstractGenerativeModel):
    # Requires an analyser that breaks a document into tokens.
    # The analyser can also remove stopwords and normalize words.
    def __init__(self, analyser):
        self.analyser = analyser
        print("Term Based ")

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

    def getProbabilityDistribution(self):
        return self.freqDist

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.freqDist:
                prob *= self.freqDist.freq(token)
        if prob == 1.0:
            return 0.0
        return prob
def statsText(text, words):
    fdist = FreqDist()  # formatted prints will work with Python2 and Python3
    for word in word_tokenize(text):
        fdist.inc(word.lower())
    return [(k, fdist.freq(k)) for k in words]
def zipfity(lst):
    unigram = FreqDist()
    for sent in lst:
        for word in sent:
            unigram[word.lower()] += 1  # the task didn't say anything so did it all to lower
    sorted_unigram = sorted(unigram, key=unigram.get, reverse=True)
    top10 = sorted_unigram[:10]
    most_freq = unigram.freq(top10[0])
    count = 1
    print '{0:7s}{1:10s}{2:10s}'.format('word', 'obs.freq(%) ', 'zipf-law(%)')
    print '----------------------------'
    for word in top10:
        print '{0:7s}{1:10.2f}{2:10.2f}'.format(word, unigram.freq(word) * 100,
                                                (most_freq / count) * 100)
        count += 1
def get_opinion_features(tags):
    """
    This function creates the opinion lexicon features
    as described in the assignment3 handout.
    The negative and positive data has been read into the following lists:
    * neg_opinion
    * pos_opinion
    If you haven't downloaded the opinion lexicon, run the following commands:
    * import nltk
    * nltk.download('opinion_lexicon')
    :param tags: tokens
    :return: feature_vectors: a dictionary of values for each opinion feature
    """
    neg_opinion = opinion_lexicon.negative()
    pos_opinion = opinion_lexicon.positive()
    feature_vectors = {}

    # YOUR CODE GOES HERE
    feature_vectors.update({'UNI_POS_pretty': 0.01639344262295082})
    feature_vectors.update({'UNI_POS_well': 0.013513513513513514})
    feature_vectors.update({'UNI_POS_great': 0.023809523809523808})
    feature_vectors.update({'UNI_POS_good': 0.03225806451612903})
    feature_vectors.update({'UNI_POS_like': 0.016666666666666666})
    feature_vectors.update({'UNI_NEG_unexpectedly': 0.0125})
    feature_vectors.update({'UNI_POS_perfectly': 0.015151515151515152})
    feature_vectors.update({'UNI_POS_thank': 0.016666666666666666})
    feature_vectors.update({'UNI_POS_clearly': 0.013513513513513514})
    feature_vectors.update({'UNI_NEG_confusing': 0.013513513513513514})

    words = tags
    wordF = FreqDist(words)

    for word in neg_opinion:
        if wordF.freq(word) > 0.0:
            feature_vectors.update({'UNI_NEG_' + word: wordF[word] / len(wordF)})

    for word in pos_opinion:
        if wordF.freq(word) > 0.0:
            feature_vectors.update({'UNI_POS_' + word: wordF[word] / len(wordF)})

    return feature_vectors
def generate_weight_dictionary(self, service, words):
    df = open(self.dictionary.get_dict_service_file_name(service), "w+")
    t = Text(words)
    freq_dist = FreqDist(t)
    for w in freq_dist:
        weight = 100 * freq_dist.freq(w)
        df.write(w + helper.results_field_separator + str(weight) + "\n")
    df.close()
def main():
    argparser = argparse.ArgumentParser(description='text file')
    argparser.add_argument('file', type=str,
                           help='file to produce frequency distribution for')
    args = argparser.parse_args()
    #toker = WhitespaceTokenizer()
    f = open(args.file)
    text = f.read()
    print(text)
    fdist = FreqDist(text)
    print(fdist.freq('28') * 100)
    fdist.plot()
def freq_lema_ngrams(list_monograms, list_lemas):
    fdist1 = FreqDist(list_monograms)
    #fdist2 = FreqDist(list_lemas)
    vocabulary1 = fdist1.keys()  # distinct values
    frec_grams = []
    for tag in vocabulary1:
        temp1 = []
        for i in range(len(list_monograms)):
            if list_monograms[i] == tag:
                temp1.append(list_lemas[i])
        temp2 = set(temp1)
        frec_grams.append([tag, fdist1[tag], fdist1.freq(tag), '-'.join(temp2)])
    frec_grams_sort = sorted(frec_grams, key=itemgetter(1), reverse=True)
    return frec_grams_sort
def create_enhanced_dale_chall_list(self):
    # list of sites used to create the list of most frequent words
    alexa_list = ['Google', 'Facebook', 'YouTube', 'Yahoo!', 'Wikipedia',
                  'Microsoft', 'Amazon', 'Twitter', 'LinkedIn', 'Wordpress',
                  'Ebay', 'Apple', 'Paypal', 'Imdb', 'Tumblr', 'Disney',
                  'BBC', 'Livejasmin', 'Craigslist', 'Ask']

    # bring all privacy texts into one list
    corpus = []
    data = get_all_policies()
    for site in data:
        if site in alexa_list:
            corpus.append(data[site]["text"])

    # get the words of this list into a list of words
    t = textanalyzer("eng")
    words = t.getWords("".join(corpus))

    # open the dale chall wordlist
    dale_chall_list = open('../nltk_contrib/dale_chall_wordlist.txt').read().split(';')

    # create a text that consists of the words of the 20 privacy policies and
    # delete all words that are on the dale-chall list of easy words
    new_corpus = []
    for word in words:
        if word.lower() not in dale_chall_list and word not in alexa_list:
            new_corpus.append(word.lower())

    # create a frequency distribution of the words of this list of words
    fdist = FreqDist(new_corpus)
    # plot this
    fdist.plot(80, cumulative=True)

    # make a list of the words that make up 33% of the words that are not in
    # the dale chall list (cumulative)
    most_frequ = []
    cum_percentage = 0.0
    for sample in fdist:
        cum_percentage += fdist.freq(sample)
        most_frequ.append(sample)
        if cum_percentage > 0.33:
            break

    # write those into a file
    privacy_file = open("privacy_wordlist.txt", "w")
    privacy_file.write(";".join(most_frequ))
def _entity_ranking(self, entities):
    if len(entities) == 0:
        return "", "", int(0)

    # Obtain frequency of entities
    entities_freq = FreqDist(entities)

    # Our answer is the sample with the greatest number of outcomes
    exact = entities_freq.max()

    # Our window is empty because this algorithm generates exact answers
    window = ""

    # Our score is the entity frequency
    score = int(entities_freq.freq(exact) * 1000)

    return exact, window, score
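# Small, hypothetical demonstration of the FreqDist calls used in
# _entity_ranking above: max() returns the most frequent sample and freq()
# its relative frequency (count / total).
from nltk import FreqDist

entities = ["Paris", "Paris", "Lyon"]
fd = FreqDist(entities)
print(fd.max())           # 'Paris'
print(fd.freq(fd.max()))  # 0.666...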
def get_word_probs(sentences):
    """gets p of each word (freq / # of total tokens)"""
    # make the corpus a non-nested list
    corpus = []
    for sentence in sentences:
        corpus.extend(sentence)

    # FreqDist does some of the heavy lifting
    word_freq = FreqDist(corpus)

    word_ps = {}  # store in a custom dict so we can update probabilities
    for word in word_freq.keys():
        word_ps[word] = word_freq.freq(word)

    return word_ps
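# Minimal sketch of get_word_probs on a toy, made-up corpus of tokenised
# sentences; FreqDist.freq() returns count / total number of tokens.
toy_sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
probs = get_word_probs(toy_sentences)
assert abs(probs["the"] - 2.0 / 6) < 1e-9  # 'the' occurs twice among six tokens
print(probs)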
def next(self, s, method=MOST_LIKELY):
    # Pick a transition leaving state s and return a state that would
    # likely follow. The next state is chosen according to the method
    # specified. The default is to choose and return the most likely
    # transition state.

    # determine all states adjacent to s
    transitions = self._adjacentVertices[s]
    freqDist = FreqDist()

    # determine the weights of the edges between state s and all adjacent states
    for state in transitions:
        freqDist.inc(state)

    if method == MarkovChain.MOST_LIKELY:
        return freqDist.max()
    elif method == MarkovChain.LEAST_LIKELY:
        # NLTK provides no built-in method to return the minimum of a
        # frequency distribution so for now, we get a list of samples
        # sorted in decreasing order and grab the last one.
        return freqDist.sorted_samples()[-1]
    else:
        # choose a real number between 0 and 1
        x = uniform(0, 1)
        # choose next state based on weights of the edges. Randomness plays a part here.
        for i in range(len(transitions)):
            probability = freqDist.freq(transitions[i])
            if x < probability:
                return transitions[i]
            x = x - probability
        raise RuntimeError("Error in MarkovChain.next(). Did not find next state.")
def main():
    logger.info(f"starting now: {datetime.utcnow()}")
    # read target and tokenize
    with open(target, "r") as f:
        tokens = word_tokenize(f.read())
    logger.info(f"Tokenized {len(tokens)} words")

    # filter english-only words
    # most of it should be cleaned in preprocessing but just in case
    filtered = set(
        t.lower() for t in tokens
        if re.search(r"[\/+=<>0-9_:;,'@!()$|i\{\}\[\]?&*#%]", t.lower()) is None)

    # no stopwords
    less_stopwords = [
        x.lower() for x in filtered if x not in stopwords.words("english")
    ]
    logger.info(f"Filtered {len(filtered)} english only words")

    # build a freq dist
    fdist = FreqDist(w.lower() for w in tokens)

    # sort by top most common words
    common = sorted(
        [(word, fdist.freq(word)) for word in less_stopwords],
        key=lambda x: x[1],
        reverse=True,
    )[0:500]

    # write to a file
    with open(dest_csv, "w") as dest:
        writer = csv.writer(dest)
        for word in common:
            writer.writerow([word[0], word[1]])
    return 0
class FrequencySummarizer(base.BaseSummarizer):
    """
    This class is based on
    [this](http://glowingpython.blogspot.com.co/2014/09/text-summarization-with-nltk.html) post
    """
    stop_words = set()
    sentences = list()
    _frequency_distributions = None
    _cleaned_text = list()

    @property
    def frequency_distributions(self):
        return self._frequency_distributions

    @frequency_distributions.setter
    def frequency_distributions(self, frequency_distributions):
        return

    def summarize(self):
        logger.debug("Extracting sentences")
        self.sentences = sent_tokenize(self._text, language='spanish')
        logger.debug("Extracting frequencies")
        self._frequency_distributions = FreqDist(self._cleaned_text)

        ranking = defaultdict(int)
        for i, sentence in enumerate(self.sentences):
            for word in sentence:
                ranking[i] += self._frequency_distributions.freq(word)

        ordered_sentences_by_priority = nlargest(
            int(len(self.sentences) / 10) + 1, ranking, key=ranking.get)
        return [self.sentences[i] for i in ordered_sentences_by_priority]

    def __init__(self, text):
        super().__init__(text)
        self.stop_words = set(stopwords.words('spanish') + list(punctuation))
        self._cleaned_text = [
            x for x in word_tokenize(self.text, language='spanish')
            if x not in self.stop_words
        ]
def get_content_avg_entropy(self):
    '''
    :return: avg entropy of text/<mime> parts for multipart bodies
    '''
    n = 0
    txt_avg_ent = INIT_SCORE
    # todo: make n-grams
    tokens_list = tuple(self.pattern.get_stemmed_tokens())
    #logger.debug(tokens_list)
    for tokens in tokens_list:
        #logger.debug(tokens)
        n += 1
        freqdist = FreqDist(tokens)
        probs = [freqdist.freq(l) for l in FreqDist(tokens)]
        txt_avg_ent += -sum([p * math.log(p, 2) for p in probs])
    #logger.debug(n)

    # :))
    if n != 0:
        txt_avg_ent = txt_avg_ent / n

    return txt_avg_ent
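# Standalone illustration (with made-up tokens) of the entropy formula used in
# get_content_avg_entropy: Shannon entropy from FreqDist relative frequencies.
import math
from nltk import FreqDist

tokens = ["a", "a", "b", "c"]
freqdist = FreqDist(tokens)
probs = [freqdist.freq(t) for t in freqdist]
entropy = -sum(p * math.log(p, 2) for p in probs)
print(entropy)  # 1.5 bits for probabilities 0.5, 0.25, 0.25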
unigrams_path = reu_path + unigramsFrom

# count word length frequencies
for f in listdir(samples_path):
    if isfile(join(samples_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "w")
        thisfile = open(samples_path + f).read()
        tokens = tokenize(thisfile)
        fd_words = FreqDist([len(w) for w in tokens])
        for a in range(1, 21):
            output.write(str(a) + '\t' + str(fd_words.freq(a)) + '\n')
        count_20 = 0
        # count 20+
        for w in tokens:
            if len(w) >= 20:
                count_20 += 1
        output.write("20+\t" + str(count_20 / len(fd_words)) + '\n')

# count POS tag frequencies
for f in listdir(unigrams_path):
    if isfile(join(unigrams_path, f)):
        output_path = reu_path + toDir + f
        output = open(output_path, "a")
        thisfile = open(unigrams_path + f).read()
        tokens = tokenize(thisfile)
class MorphProbModel(): UNK_PROB = -99 def __init__(self, beam=1000, max_guess=20, rare_treshold=10, capitalization=True): self._uni = FreqDist() self._bi = ConditionalFreqDist() self._tri = ConditionalFreqDist() self._wd = ConditionalFreqDist() self._l1 = 0.0 self._l2 = 0.0 self._l3 = 0.0 self._beam_size = beam self._use_capitalization = capitalization self._max_guess = max_guess self._treshold = rare_treshold self._unk = Guesser(10) self._analyzer = None self.cache = {} def set_analyzer(self, obj): self._analyzer = obj def train(self, data): C = False for sent in data: history = [('BOS', False), ('BOS', False)] for w, l, t in sent: # Ezt azért szedtem ki mert megeszik 4 giga memóriát ha marad # t = encode((w, l, t)) if self._use_capitalization and w[0].isupper(): C = True self._wd[w].inc(t) self._uni.inc((t, C)) self._bi[history[1]].inc((t, C)) self._tri[tuple(history)].inc((t, C)) history.append((t, C)) history.pop(0) C = False for word, fd in self._wd.iteritems(): for tag, count in fd.iteritems(): if count < self._treshold: self._unk.add_word(word.lower(), tag, count) self._unk.finalize() self._compute_lambda() def _compute_lambda(self): tl1 = 0.0 tl2 = 0.0 tl3 = 0.0 for history in self._tri.conditions(): (h1, h2) = history for tag in self._tri[history].samples(): if self._uni[tag] == 1: continue c3 = self._safe_div((self._tri[history][tag] - 1), (self._tri[history].N() - 1)) c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1)) c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1)) if (c1 > c3) and (c1 > c2): tl1 += self._tri[history][tag] elif (c2 > c3) and (c2 > c1): tl2 += self._tri[history][tag] elif (c3 > c2) and (c3 > c1): tl3 += self._tri[history][tag] elif (c3 == c2) and (c3 > c1): tl2 += float(self._tri[history][tag]) / 2.0 tl3 += float(self._tri[history][tag]) / 2.0 elif (c2 == c1) and (c1 > c3): tl1 += float(self._tri[history][tag]) / 2.0 tl2 += float(self._tri[history][tag]) / 2.0 else: pass self._l1 = tl1 / (tl1 + tl2 + tl3) self._l2 = tl2 / (tl1 + tl2 + tl3) self._l3 = tl3 / (tl1 + tl2 + tl3) def _safe_div(self, v1, v2): if v2 == 0: return -1 else: return float(v1) / float(v2) def _transition_prob(self, t, C, history): p_uni = self._uni.freq((t, C)) p_bi = self._bi[history[-1]].freq((t, C)) p_tri = self._tri[tuple(history[-2:])].freq((t, C)) p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri if p == 0.0: return self.UNK_PROB return log(p, 2) def _known_lexical_prob(self, word, t, C): p = float(self._wd[word][t]) / float(self._uni[(t, C)]) return log(p, 2) def _analyze(self, word): tag_candidates = [] if word in self._wd.conditions(): tag_candidates = set(self._wd[word].samples()) else: analyses = map(itemgetter(1), self._analyzer.analyze(word)) guesses = self._unk.get_probs(word.lower()) guesses = map(itemgetter(0), sorted(guesses.iteritems(), reverse=True, key=itemgetter(1))[:self._max_guess]) tag_candidates = set(guesses) if analyses: tag_candidates &= set(analyses) if not tag_candidates: tag_candidates = set(guesses) return tag_candidates def _lexical_prob(self, word, t, C): if word in self._wd.conditions(): return self._known_lexical_prob(word, t, C) else: return self._unk.get_prob(word, t) def tag(self, sent, n=5): current_state = [(['BOS', 'BOS'], 0.0)] out = self._tagword(sent, current_state, n) return out def _tagword(self, sent, current_states, n=5): # A cache-sel elég gyors. Nem érdemes jobban vesződni vele. if sent == []: # yield ... 
return [(map(itemgetter(0), tag_seq[0][2:]), tag_seq[1]) for tag_seq in current_states[:n]] word = sent[0] sent = sent[1:] new_states = [] # Cache lookup sent_str = word + str(current_states) if sent_str in self.cache: return self._tagword(sent, self.cache[sent_str], n) C = False if self._use_capitalization and word[0].isupper(): C = True analyses = self._analyze(word) for (history, curr_sent_logprob) in current_states: logprobs = [] for t in analyses: p_t = self._transition_prob(t, C, history) p_l = self._lexical_prob(word, t, C) p = p_t + p_l logprobs.append(((t, C), p)) for (tag, logprob) in logprobs: new_states.append((history + [tag], curr_sent_logprob + logprob)) new_states.sort(reverse=True, key=itemgetter(1)) if len(new_states) > self._beam_size: new_states = new_states[:self._beam_size] # Cache store self.cache[sent_str] = new_states # yield new_states # self._tagword(sent, new_states, n) return self._tagword(sent, new_states, n)
class TnT(TaggerI): """ TnT - Statistical POS tagger IMPORTANT NOTES: * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS - It is possible to provide an untrained POS tagger to create tags for unknown words, see __init__ function * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT - Due to the nature of this tagger, it works best when trained over sentence delimited input. - However it still produces good results if the training data and testing data are separated on all punctuation eg: [,.?!] - Input for training is expected to be a list of sentences where each sentence is a list of (word, tag) tuples - Input for tag function is a single sentence Input for tagdata function is a list of sentences Output is of a similar form * Function provided to process text that is unsegmented - Please see basic_sent_chop() TnT uses a second order Markov model to produce tags for a sequence of input, specifically: argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) IE: the maximum projection of a set of probabilities The set of possible tags for a given word is derived from the training data. It is the set of all tags that exact word has been assigned. To speed up and get more precision, we can use log addition to instead multiplication, specifically: argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + log(P(t_T+1|t_T)) The probability of a tag for a given word is the linear interpolation of 3 markov models; a zero-order, first-order, and a second order model. P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + l3*P(t_i| t_i-1, t_i-2) A beam search is used to limit the memory usage of the algorithm. The degree of the beam can be changed using N in the initialization. N represents the maximum number of possible solutions to maintain while tagging. It is possible to differentiate the tags which are assigned to capitalized words. However this does not result in a significant gain in the accuracy of the results. """ def __init__(self, unk=None, Trained=False, N=1000, C=False): """ Construct a TnT statistical tagger. Tagger must be trained before being used to tag input. :param unk: instance of a POS tagger, conforms to TaggerI :type unk: TaggerI :param Trained: Indication that the POS tagger is trained or not :type Trained: bool :param N: Beam search degree (see above) :type N: int :param C: Capitalization flag :type C: bool Initializer, creates frequency distributions to be used for tagging _lx values represent the portion of the tri/bi/uni taggers to be used to calculate the probability N value is the number of possible solutions to maintain while tagging. A good value for this is 1000 C is a boolean value which specifies to use or not use the Capitalization of the word as additional information for tagging. NOTE: using capitalization may not increase the accuracy of the tagger """ self._uni = FreqDist() self._bi = ConditionalFreqDist() self._tri = ConditionalFreqDist() self._wd = ConditionalFreqDist() self._eos = ConditionalFreqDist() self._l1 = 0.0 self._l2 = 0.0 self._l3 = 0.0 self._N = N self._C = C self._T = Trained self._unk = unk # statistical tools (ignore or delete me) self.unknown = 0 self.known = 0 def train(self, data): """ Uses a set of tagged data to train the tagger. If an unknown word tagger is specified, it is trained on the same data. 
:param data: List of lists of (word, tag) tuples :type data: tuple(str) """ # Ensure that local C flag is initialized before use C = False if self._unk is not None and self._T == False: self._unk.train(data) for sent in data: history = [("BOS", False), ("BOS", False)] for w, t in sent: # if capitalization is requested, # and the word begins with a capital # set local flag C to True if self._C and w[0].isupper(): C = True self._wd[w][t] += 1 self._uni[(t, C)] += 1 self._bi[history[1]][(t, C)] += 1 self._tri[tuple(history)][(t, C)] += 1 history.append((t, C)) history.pop(0) # set local flag C to false for the next word C = False self._eos[t]["EOS"] += 1 # compute lambda values from the trained frequency distributions self._compute_lambda() def _compute_lambda(self): """ creates lambda values based upon training data NOTE: no need to explicitly reference C, it is contained within the tag variable :: tag == (tag,C) for each tag trigram (t1, t2, t3) depending on the maximum value of - f(t1,t2,t3)-1 / f(t1,t2)-1 - f(t2,t3)-1 / f(t2)-1 - f(t3)-1 / N-1 increment l3,l2, or l1 by f(t1,t2,t3) ISSUES -- Resolutions: if 2 values are equal, increment both lambda values by (f(t1,t2,t3) / 2) """ # temporary lambda variables tl1 = 0.0 tl2 = 0.0 tl3 = 0.0 # for each t1,t2 in system for history in self._tri.conditions(): (h1, h2) = history # for each t3 given t1,t2 in system # (NOTE: tag actually represents (tag,C)) # However no effect within this function for tag in self._tri[history].keys(): # if there has only been 1 occurrence of this tag in the data # then ignore this trigram. if self._uni[tag] == 1: continue # safe_div provides a safe floating point division # it returns -1 if the denominator is 0 c3 = self._safe_div((self._tri[history][tag] - 1), (self._tri[history].N() - 1)) c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1)) c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1)) # if c1 is the maximum value: if (c1 > c3) and (c1 > c2): tl1 += self._tri[history][tag] # if c2 is the maximum value elif (c2 > c3) and (c2 > c1): tl2 += self._tri[history][tag] # if c3 is the maximum value elif (c3 > c2) and (c3 > c1): tl3 += self._tri[history][tag] # if c3, and c2 are equal and larger than c1 elif (c3 == c2) and (c3 > c1): tl2 += self._tri[history][tag] / 2.0 tl3 += self._tri[history][tag] / 2.0 # if c1, and c2 are equal and larger than c3 # this might be a dumb thing to do....(not sure yet) elif (c2 == c1) and (c1 > c3): tl1 += self._tri[history][tag] / 2.0 tl2 += self._tri[history][tag] / 2.0 # otherwise there might be a problem # eg: all values = 0 else: pass # Lambda normalisation: # ensures that l1+l2+l3 = 1 self._l1 = tl1 / (tl1 + tl2 + tl3) self._l2 = tl2 / (tl1 + tl2 + tl3) self._l3 = tl3 / (tl1 + tl2 + tl3) def _safe_div(self, v1, v2): """ Safe floating point division function, does not allow division by 0 returns -1 if the denominator is 0 """ if v2 == 0: return -1 else: return v1 / v2 def tagdata(self, data): """ Tags each sentence in a list of sentences :param data:list of list of words :type data: [[string,],] :return: list of list of (word, tag) tuples Invokes tag(sent) function for each sentence compiles the results into a list of tagged sentences each tagged sentence is a list of (word, tag) tuples """ res = [] for sent in data: res1 = self.tag(sent) res.append(res1) return res def tag(self, data): """ Tags a single sentence :param data: list of words :type data: [string,] :return: [(word, tag),] Calls recursive function '_tagword' to produce a list of tags 
Associates the sequence of returned tags with the correct words in the input sequence returns a list of (word, tag) tuples """ current_state = [(["BOS", "BOS"], 0.0)] sent = list(data) tags = self._tagword(sent, current_state) res = [] for i in range(len(sent)): # unpack and discard the C flags (t, C) = tags[i + 2] res.append((sent[i], t)) return res def _tagword(self, sent, current_states): """ :param sent : List of words remaining in the sentence :type sent : [word,] :param current_states : List of possible tag combinations for the sentence so far, and the log probability associated with each tag combination :type current_states : [([tag, ], logprob), ] Tags the first word in the sentence and recursively tags the reminder of sentence Uses formula specified above to calculate the probability of a particular tag """ # if this word marks the end of the sentence, # return the most probable tag if sent == []: (h, logp) = current_states[0] return h # otherwise there are more words to be tagged word = sent[0] sent = sent[1:] new_states = [] # if the Capitalisation is requested, # initialise the flag for this word C = False if self._C and word[0].isupper(): C = True # if word is known # compute the set of possible tags # and their associated log probabilities if word in self._wd: self.known += 1 for (history, curr_sent_logprob) in current_states: logprobs = [] for t in self._wd[word].keys(): tC = (t, C) p_uni = self._uni.freq(tC) p_bi = self._bi[history[-1]].freq(tC) p_tri = self._tri[tuple(history[-2:])].freq(tC) p_wd = self._wd[word][t] / self._uni[tC] p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri p2 = log(p, 2) + log(p_wd, 2) # compute the result of appending each tag to this history new_states.append((history + [tC], curr_sent_logprob + p2)) # otherwise a new word, set of possible tags is unknown else: self.unknown += 1 # since a set of possible tags, # and the probability of each specific tag # can not be returned from most classifiers: # specify that any unknown words are tagged with certainty p = 1 # if no unknown word tagger has been specified # then use the tag 'Unk' if self._unk is None: tag = ("Unk", C) # otherwise apply the unknown word tagger else: [(_w, t)] = list(self._unk.tag([word])) tag = (t, C) for (history, logprob) in current_states: history.append(tag) new_states = current_states # now have computed a set of possible new_states # sort states by log prob # set is now ordered greatest to least log probability new_states.sort(reverse=True, key=itemgetter(1)) # del everything after N (threshold) # this is the beam search cut if len(new_states) > self._N: new_states = new_states[:self._N] # compute the tags for the rest of the sentence # return the best list of tags for the sentence return self._tagword(sent, new_states)
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences

# Answer to Question ii.a
# declare a counter for the line number
count = 1
# split the paragraph into sentences
sentences = split_into_sentences(text1)
# tokenize words for each line
for s in sentences:
    lines = tokenizer.tokenize(s)
    #print(lines)
    # print word distribution
    fdist2 = FreqDist(lines)
    print("Probability of word [data] occurring in line " + str(count) +
          " is " + str((fdist2.freq('data') + fdist2.freq('Data'))))
    # increment to next line number
    count += 1
print("\n")

# Answer to Question ii.b
text2 = tokenizer.tokenize(text1.lower())
fdist3 = FreqDist(text2)
#print(fdist3)
print("The distribution of distinct word counts across the lines is as follows:")
for sample in fdist3:
    print(sample + " " + str(fdist3[sample]))
print("\n")
def runTest(self,iteration): print "running test %d"%iteration pageUrl = '/reviews/www.zulily.com' filename = '../test/resources/zulily.pkl' try: sjr = SiteJabberReviews(pageUrl,filename) sjr.load() helper = BayesHelper() buckets = helper.generateLearningSetsFromReviews([sjr],[1,5],{'training': 0.8,'test':0.2}) self.assertEqual(len(buckets['training']), int(0.8*len(sjr.reviewsByRating[1])+int(0.8*len(sjr.reviewsByRating[5])))) self.assertEqual(len(buckets['test']), int(0.2*len(sjr.reviewsByRating[1])+int(0.2*len(sjr.reviewsByRating[5])))) # generate (term) tuples for FD -- this means we need to bust out like terms from combined distributions allWords1 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 1] fd1 = FreqDist(allWords1) allWords5 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 5] fd5 = FreqDist(allWords5) commonTerms = [w for w in fd1.keys() if w in fd5.keys()] commonTermFreqs = [(w,fd1.freq(w), fd5.freq(w), abs(fd1.freq(w) - fd5.freq(w))) for w in commonTerms] commonTermFreqs.sort(key = itemgetter(3),reverse=True) all_words = [] all_words.extend(allWords1) all_words.extend(allWords5) fdTrainingData = FreqDist(all_words) # take an arbitrary subset of these defaultWordSet = fdTrainingData.keys()[:2500] # def emitDefaultFeatures(tokenizedText): ''' @param tokenizedText: an array of text features @return: a feature map from that text. ''' tokenizedTextSet = set(tokenizedText) featureSet = {} for text in defaultWordSet: featureSet['contains:%s'%text] = text in tokenizedTextSet return featureSet classifier = None encodedTrainSet = helper.encodeData(buckets['training'],emitDefaultFeatures ) classifier = nltk.NaiveBayesClassifier.train(encodedTrainSet) encodedTestSet = helper.encodeData(buckets['test'], emitDefaultFeatures) accuracy = nltk.classify.accuracy(classifier, encodedTestSet) print "accuracy = %.9f"%accuracy classifier.show_most_informative_features(10) shouldBeClassed1 = [] shouldBeClassed5 = [] for (textbag, rating) in buckets['test']: testRating = classifier.classify(emitDefaultFeatures(textbag)) if testRating != rating: if rating == 1: shouldBeClassed1.append(textbag) else: shouldBeClassed5.append(textbag) print "length of mis-classified 1 star reviews = %d"%len(shouldBeClassed1) print "length of mis-classified 5 star reviews = %d"%len(shouldBeClassed5) print "length of all 1 star reviews submitted = %d"%len(sjr.reviewsByRating[1]) print "length of all 5 star reviews submitted = %d"%len(sjr.reviewsByRating[5]) print "length of test data for 1 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[1])) print "length of test data for 5 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[5])) return accuracy except Exception as inst: self.fail(inst)
# print "lines: ", len(lines)
for line in lines:
    # print n, line.encode('utf-8')
    line_tokens = tokenizer.tokenize(line)
    # for token in line_tokens:
    #     print token.encode('utf-8'), " | "
    #     n = n + 1
    text_array.append(line_tokens)

# now try to match hyphenated lines with their
# corresponding beginning lines
n = 0
for line in text_array:
    if len(line) > 0:
        if line[-1][-1] == "-":
            try:
                line[-1] = line[-1][:-1] + text_array[n + 1][0]
                text_array[n + 1] = text_array[n + 1][1:]
            except IndexError as e:
                print e
    n = n + 1

# now flatten the 2d array
tokens = [item for sublist in text_array for item in sublist]
tokens = delete_non_greek_tokens(tokens)

for token in tokens:
    fdist.inc(token)

print "most common: ", fdist.max().encode("utf-8")

for item in fdist.keys():
    print item.encode("utf-8"), fdist.freq(item)
unigrams = regexp_tokenize(txt, pattern_unigrams)
bigrams = regexp_tokenize(txt, pattern_bigrams)

# Create frequency distributions
fdist_words = FreqDist(txt_tokens)
fdist_ngrams = FreqDist(unigrams + bigrams)

# Store most common words and ngrams for later comparison of texts
words_most_common.append([k for (k, _) in fdist_words.most_common(params.n)])
ngrams_most_common.append([k for (k, _) in fdist_ngrams.most_common(params.m)])

outputname = "output_for_" + f.name.rsplit(os.sep, 2)[1]

# Write out the distribution of words in the document
with codecs.open("distributions-data/output/words_" + outputname, "w",
                 encoding=my_encoding) as out:
    for k, v in fdist_words.most_common():
        prozent = fdist_words.freq(k)
        out.write("{},{},{}\n".format(k, v, prozent))

# Write out the distribution of ngrams in the document
with codecs.open("distributions-data/output/letters_" + outputname, "w",
                 encoding=my_encoding) as out:
    for k, v in fdist_ngrams.most_common():
        prozent = v / (len(unigrams) if len(k) == 1 else len(bigrams))
        out.write("{},{},{}\n".format(k, v, prozent))

# Write the size of bins of words that appear with the same frequency
with codecs.open("distributions-data/bins/" + outputname, "w",
                 encoding=my_encoding) as out:
    for i in sorted(set(fdist_words.values())):
        bin_size = fdist_words.Nr(i)
        out.write("{},{}\n".format(i, bin_size))

print('Output distributions saved in \'output\' folder.')
print('Output bins saved in \'bins\' folder.')

# If there are many documents -> compare their most common words and ngrams
if len(params.files) > 1:
print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))

# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
print state_union_text.count("war")
state_union_text.concordance("economy")
state_union_text.similar("economy")
state_union_text.common_contexts(["economy", "jobs"])

from nltk.probability import FreqDist
fdist = FreqDist(state_union_text)
result = fdist.most_common(15)
result

from nltk.corpus import stopwords
stopwords.words("english")
filtered = [w for w in state_union.words() if not w in stopwords.words("english")]
len(filtered)

fdist_filtered = FreqDist(filtered)
fdist_filtered.most_common(20)
fdist_filtered.freq("good") / fdist_filtered.freq("bad")
fdist_filtered.freq("bad") / fdist_filtered.freq("evil")
fdist_filtered.plot(30)
def simulate_reading(net_text_input): if len(net_text_input) > prm['max_text_len']: raise ValueError( 'Text input {} has to be shorter than max_text_len: {}'.format( net_text_input, prm['max_text_len'])) # Build the network. nest.ResetKernel() nest.SetKernelStatus({'local_num_threads': 9}) reset_reporting() spike_groups.clear() spike_decisions.clear() nest.CopyModel('tsodyks2_synapse', 'head_grapheme_synapse_model', prm['head_grapheme_synapse_model']) nest.CopyModel('tsodyks2_synapse', 'letter_lexical_synapse_model', prm['letter_lexical_synapse_model']) local_vocabulary = [ w for w in vocabulary[unidecode(net_text_input[0])] if distance_within(w, net_text_input, 4) ] # NOTE we may compare only stems to the full input! graphemes_dist = FreqDist( chain.from_iterable([decompose_word(w) for w in local_vocabulary])) lexical_cols = dict([(w, nest.Create(prm['neuron_type'], prm['lexical_column_size'])) for w in local_vocabulary]) if prm['stems_and_suffixes']: suffixes_cols = dict([(s, nest.Create(prm['neuron_type'], prm['lexical_column_size'])) for s in suffixes]) lexical_inhibiting_population = nest.Create( prm['neuron_type'], prm['lexical_inhibiting_pop_size']) letter_hypercolumns = [ make_hypercolumn(letters, prm['letter_column_size']) for i in range(prm['max_text_len']) ] # Reading heads' columns are sorted in separate lists by grapheme lengths. reading_head_len_sorted = [ make_hypercolumn(size_graphemes, prm['head_column_size']) for size_graphemes in graphemes_by_lengths ] reading_head = {} # a 'flat' version for len_graphemes in reading_head_len_sorted: reading_head.update(len_graphemes) grapheme_hypercolumns = [ make_hypercolumn(graphemes, prm['grapheme_column_size']) for i in range(prm['max_text_len']) ] # Make connections. end_weight_dist = stats.norm(loc=len(net_text_input), scale=3.0) # for exciting suffixes for (hcol_n, hypercol) in enumerate( letter_hypercolumns): # hypercol is: letter -> (neuron's nest id) # Turn on appropriate letter columns. 
if hcol_n < len(net_text_input) and net_text_input[hcol_n] in letters: poisson_gen = nest.Create('poisson_generator', 1, prm['letters_poisson_generator']) nest.Connect(poisson_gen, hypercol[net_text_input[hcol_n]], syn_spec=prm['poisson_letter_excitation']) ###nest.SetStatus(hypercol[net_text_input[hcol_n]], prm['letter_neuron_params_on']) # Letter hypercol's lateral inhibition to subsequent hypercols for hypercol2 in letter_hypercolumns[hcol_n + 1:]: nest.Connect(all_columns_cells(hypercol), all_columns_cells(hypercol2), syn_spec=prm['letter_col_lateral_inhibition']) # Letter hypercol -> the reading head for (letter, letter_col) in hypercol.items(): for (grapheme, grapheme_col) in reading_head.items(): if letter in grapheme: nest.Connect(letter_col, grapheme_col, syn_spec=prm['letter_head_excitation']) # Letter hypercol -> lexical units for (word, word_col) in lexical_cols.items(): if hcol_n >= len(word): nest.Connect(all_columns_cells(hypercol), word_col, syn_spec=prm['shorter_word_inhibition']) else: for (letter, letter_col) in hypercol.items(): if hcol_n == 0 and unidecode( word[hcol_n]) == unidecode(letter): nest.Connect(letter_col, word_col, syn_spec='letter_lexical_synapse_model') nest.SetStatus( nest.GetConnections(letter_col, word_col), prm['member_first_letter_excitation']) if (not prm['stems_and_suffixes'] and hcol_n == len(word) - 1 and unidecode( word[len(word) - 1]) == unidecode(letter)): nest.Connect(letter_col, word_col, syn_spec='letter_lexical_synapse_model') nest.SetStatus( nest.GetConnections(letter_col, word_col), prm['member_last_letter_excitation']) elif unidecode(letter) in unidecode(word): nest.Connect(letter_col, word_col, syn_spec='letter_lexical_synapse_model') nest.SetStatus( nest.GetConnections(letter_col, word_col), prm['member_letter_excitation'](len(word))) else: nest.Connect(letter_col, word_col, syn_spec='letter_lexical_synapse_model') nest.SetStatus( nest.GetConnections(letter_col, word_col), prm['absent_letter_inhibition'](len(word))) # Letter hypercol -> suffixes units if prm['stems_and_suffixes']: for (suffix, suffix_col) in suffixes_cols.items(): if len(net_text_input) - hcol_n <= len(suffix): for (letter, letter_col) in hypercol.items(): if letter in suffix: nest.Connect( letter_col, suffix_col, syn_spec=prm['member_letter_excitation_suffix'] (len(suffix))) else: nest.Connect( letter_col, suffix_col, syn_spec=prm['absent_letter_inhibition_suffix'] (len(suffix))) for (grapheme, grapheme_col) in reading_head.items(): nest.Connect( grapheme_col, sum([ list(neurs) for hypercol in grapheme_hypercolumns for (label, neurs) in hypercol.items() if label == grapheme ], []), syn_spec='head_grapheme_synapse_model') nest.Connect(all_columns_cells(lexical_cols), lexical_inhibiting_population, syn_spec=prm['lexical_inhibiting_pop_excitation']) for (word, word_col) in lexical_cols.items(): nest.Connect(lexical_inhibiting_population, word_col, syn_spec=prm['lexical_inhibiting_pop_feedback']( len(word))) word_decomposition = decompose_word(word) for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): if hcol_n == len(word_decomposition): break nest.Connect( word_col, hypercol[word_decomposition[hcol_n]], syn_spec={ 'weight': ( prm['lexical_grapheme_base_excitation_weight'] # the excitation is stronger with rarer letters: / graphemes_dist.freq(word_decomposition[hcol_n])) }) # Grapheme -> lexical feedback. nest.Connect(hypercol[word_decomposition[hcol_n]], word_col, syn_spec=prm['grapheme_lexical_feedback']) # Lateral inhibition for similar words. 
for (word2, word2_col) in lexical_cols.items(): if word2 == word: continue elif distance_within(word, word2, 4): nest.Connect(word_col, word2_col, syn_spec=prm['lexical_lateral_inhibition']) if prm['stems_and_suffixes']: for (suffix, suffix_col) in suffixes_cols.items(): # Lateral inhibition for suffixes. for (suffix2, suffix2_col) in suffixes_cols.items(): if suffix != suffix2: nest.Connect(suffix_col, suffix2_col, syn_spec=prm['suffix_lateral_inhibition']) # Suffix -> grapheme connections. for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): # (weights will be assigned dynamically later) nest.Connect(suffix_col, all_columns_cells(hypercol), syn_spec={'weight': 0.0}) for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): for (grapheme, col) in hypercol.items(): # Lateral inhibition of graphemes containing at least one same letter if hcol_n != 0: for similar_grapheme in [ g for g in graphemes if len(set(g).union(set(grapheme))) > 0 ]: nest.Connect(col, grapheme_hypercolumns[hcol_n - 1][similar_grapheme], syn_spec=prm['grapheme_lateral_inhibition']( len(similar_grapheme))) if hcol_n + 1 != prm['max_text_len']: for similar_grapheme in [ g for g in graphemes if len(set(g).union(set(grapheme))) > 0 ]: nest.Connect(col, grapheme_hypercolumns[hcol_n + 1][similar_grapheme], syn_spec=prm['grapheme_lateral_inhibition']( len(similar_grapheme))) # Insert probes: for (word, word_col) in lexical_cols.items(): insert_probe(word_col, word, always_chart=False) if prm['stems_and_suffixes']: for (suffix, suffix_col) in suffixes_cols.items(): insert_probe(suffix_col, 'suff_' + suffix, always_chart=False) insert_probe(lexical_inhibiting_population, 'lexical_inhibition') ##for (letter, letter_col) in letter_hypercolumns[1].items(): ## insert_probe(letter_col, 'L2-'+letter) for (grapheme, grapheme_col) in reading_head.items(): insert_probe(grapheme_col, 'head-' + grapheme, always_chart=False) # [Reading facility config:] spike_groups['Head'] = ['head-' + g for g in graphemes] spike_groups['Words'] = local_vocabulary if prm['stems_and_suffixes']: spike_groups['Suffixes'] = ['suff_' + suff for suff in suffixes] spike_decisions['Stems'] = [local_vocabulary] spike_decisions['Reading'] = [] for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): spike_decisions['Reading'].append([]) for (grapheme, grapheme_col) in hypercol.items(): insert_probe(grapheme_col, 'g{}-{}'.format(hcol_n, grapheme), always_chart=False) spike_decisions['Reading'][-1].append('g{}-{}'.format( hcol_n, grapheme)) # Run the simulation, write readings. nest.Simulate(prm['letter_focus_time']) for step_n in range(prm['max_text_len']): # Reassign the letter -> head weights (shifting skew normal). weights_dist = stats.skewnorm(6, loc=step_n - 0.7, scale=0.67) for assg_lett_n in range(prm['max_text_len']): assg_hypercol = all_columns_cells(letter_hypercolumns[assg_lett_n]) for (ln, len_graphemes) in enumerate(reading_head_len_sorted): len_graphemes = all_columns_cells(len_graphemes) if len(len_graphemes) == 0: continue nest.SetStatus( nest.GetConnections(assg_hypercol, len_graphemes), { 'weight': (weights_dist.pdf(assg_lett_n) * 3000 / (1.0 + (ln - 1) * prm['grapheme_length_damping'])) }) # Reassign the head -> grapheme weights (normal parametrized by time for each target hypercolumn). 
for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): weights_dist = stats.norm( loc=hcol_n + 1, scale=1.0) # hcol_n is treated as time step number # (add one because of the first "dummy" step) nest.SetStatus( nest.GetConnections(all_columns_cells(reading_head), all_columns_cells(hypercol)), { 'weight': weights_dist.pdf(1.0 + nest.GetKernelStatus('time') / prm['letter_focus_time']) * prm['head_grapheme_base_weight'] }) # Reassign the suffix -> grapheme weights (depending on estimated stem end). if prm['stems_and_suffixes']: #### and step_n > len(net_text_input)/2: stem_end = mean([ len(stem_reading[0]) for stem_reading in decide_spikes( spike_decisions['Stems'])[:15] ]) #print(stem_end) for (suffix, suffix_col) in suffixes_cols.items(): suffix_decomposition = decompose_word(suffix) for grapheme in set(suffix_decomposition): # Each occurence of a grapheme in suffix must exert is # influence individually, they are then summed. indices = [ gi for (gi, g) in enumerate(suffix_decomposition) if g == grapheme ] weight_dists = [ stats.norm(loc=stem_end + ind, scale=3.0) for ind in indices ] if len(weight_dists) == 0: continue for (hcol_n, hypercol) in enumerate(grapheme_hypercolumns): #print('stem_end', stem_end, 'hcol', hcol_n, weight_dists[0].pdf(hcol_n)) nest.SetStatus( nest.GetConnections(suffix_col, hypercol[grapheme]), { 'weight': sum([ dist.pdf(hcol_n) for dist in weight_dists ]) * prm['suffix_grapheme_base_weight'] }) nest.Simulate(prm['letter_focus_time'])
class TnT(TaggerI): ''' TnT - Statistical POS tagger IMPORTANT NOTES: * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS - It is possible to provide an untrained POS tagger to create tags for unknown words, see __init__ function * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT - Due to the nature of this tagger, it works best when trained over sentence delimited input. - However it still produces good results if the training data and testing data are separated on all punctuation eg: [,.?!] - Input for training is expected to be a list of sentences where each sentence is a list of (word, tag) tuples - Input for tag function is a single sentence Input for tagdata function is a list of sentences Output is of a similar form * Function provided to process text that is unsegmented - Please see basic_sent_chop() TnT uses a second order Markov model to produce tags for a sequence of input, specifically: argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) IE: the maximum projection of a set of probabilities The set of possible tags for a given word is derived from the training data. It is the set of all tags that exact word has been assigned. To speed up and get more precision, we can use log addition to instead multiplication, specifically: argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + log(P(t_T+1|t_T)) The probability of a tag for a given word is the linear interpolation of 3 markov models; a zero-order, first-order, and a second order model. P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + l3*P(t_i| t_i-1, t_i-2) A beam search is used to limit the memory usage of the algorithm. The degree of the beam can be changed using N in the initialization. N represents the maximum number of possible solutions to maintain while tagging. It is possible to differentiate the tags which are assigned to capitalized words. However this does not result in a significant gain in the accuracy of the results. ''' def __init__(self, unk=None, Trained=False, N=1000, C=False): ''' Construct a TnT statistical tagger. Tagger must be trained before being used to tag input. :param unk: instance of a POS tagger, conforms to TaggerI :type unk:(TaggerI) :param Trained: Indication that the POS tagger is trained or not :type Trained: boolean :param N: Beam search degree (see above) :type N:(int) :param C: Capitalization flag :type C: boolean Initializer, creates frequency distributions to be used for tagging _lx values represent the portion of the tri/bi/uni taggers to be used to calculate the probability N value is the number of possible solutions to maintain while tagging. A good value for this is 1000 C is a boolean value which specifies to use or not use the Capitalization of the word as additional information for tagging. NOTE: using capitalization may not increase the accuracy of the tagger ''' self._uni = FreqDist() self._bi = ConditionalFreqDist() self._tri = ConditionalFreqDist() self._wd = ConditionalFreqDist() self._eos = ConditionalFreqDist() self._l1 = 0.0 self._l2 = 0.0 self._l3 = 0.0 self._N = N self._C = C self._T = Trained self._unk = unk # statistical tools (ignore or delete me) self.unknown = 0 self.known = 0 def train(self, data): ''' Uses a set of tagged data to train the tagger. If an unknown word tagger is specified, it is trained on the same data. 
:param data: List of lists of (word, tag) tuples :type data: tuple(str) ''' # Ensure that local C flag is initialized before use C = False if self._unk is not None and self._T == False: self._unk.train(data) for sent in data: history = [('BOS',False), ('BOS',False)] for w, t in sent: # if capitalization is requested, # and the word begins with a capital # set local flag C to True if self._C and w[0].isupper(): C=True self._wd[w].inc(t) self._uni.inc((t,C)) self._bi[history[1]].inc((t,C)) self._tri[tuple(history)].inc((t,C)) history.append((t,C)) history.pop(0) # set local flag C to false for the next word C = False self._eos[t].inc('EOS') # compute lambda values from the trained frequency distributions self._compute_lambda() #(debugging -- ignore or delete me) #print "lambdas" #print i, self._l1, i, self._l2, i, self._l3 def _compute_lambda(self): ''' creates lambda values based upon training data NOTE: no need to explicitly reference C, it is contained within the tag variable :: tag == (tag,C) for each tag trigram (t1, t2, t3) depending on the maximum value of - f(t1,t2,t3)-1 / f(t1,t2)-1 - f(t2,t3)-1 / f(t2)-1 - f(t3)-1 / N-1 increment l3,l2, or l1 by f(t1,t2,t3) ISSUES -- Resolutions: if 2 values are equal, increment both lambda values by (f(t1,t2,t3) / 2) ''' # temporary lambda variables tl1 = 0.0 tl2 = 0.0 tl3 = 0.0 # for each t1,t2 in system for history in self._tri.conditions(): (h1, h2) = history # for each t3 given t1,t2 in system # (NOTE: tag actually represents (tag,C)) # However no effect within this function for tag in self._tri[history].samples(): # if there has only been 1 occurrence of this tag in the data # then ignore this trigram. if self._uni[tag] == 1: continue # safe_div provides a safe floating point division # it returns -1 if the denominator is 0 c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1)) c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1)) c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1)) # if c1 is the maximum value: if (c1 > c3) and (c1 > c2): tl1 += self._tri[history][tag] # if c2 is the maximum value elif (c2 > c3) and (c2 > c1): tl2 += self._tri[history][tag] # if c3 is the maximum value elif (c3 > c2) and (c3 > c1): tl3 += self._tri[history][tag] # if c3, and c2 are equal and larger than c1 elif (c3 == c2) and (c3 > c1): tl2 += float(self._tri[history][tag]) /2.0 tl3 += float(self._tri[history][tag]) /2.0 # if c1, and c2 are equal and larger than c3 # this might be a dumb thing to do....(not sure yet) elif (c2 == c1) and (c1 > c3): tl1 += float(self._tri[history][tag]) /2.0 tl2 += float(self._tri[history][tag]) /2.0 # otherwise there might be a problem # eg: all values = 0 else: #print "Problem", c1, c2 ,c3 pass # Lambda normalisation: # ensures that l1+l2+l3 = 1 self._l1 = tl1 / (tl1+tl2+tl3) self._l2 = tl2 / (tl1+tl2+tl3) self._l3 = tl3 / (tl1+tl2+tl3) def _safe_div(self, v1, v2): ''' Safe floating point division function, does not allow division by 0 returns -1 if the denominator is 0 ''' if v2 == 0: return -1 else: return float(v1) / float(v2) def tagdata(self, data): ''' Tags each sentence in a list of sentences :param data:list of list of words :type data: [[string,],] :return: list of list of (word, tag) tuples Invokes tag(sent) function for each sentence compiles the results into a list of tagged sentences each tagged sentence is a list of (word, tag) tuples ''' res = [] for sent in data: res1 = self.tag(sent) res.append(res1) return res def tag(self, data): ''' Tags a single sentence :param 
data: list of words :type data: [string,] :return: [(word, tag),] Calls recursive function '_tagword' to produce a list of tags Associates the sequence of returned tags with the correct words in the input sequence returns a list of (word, tag) tuples ''' current_state = [(['BOS', 'BOS'], 0.0)] sent = list(data) tags = self._tagword(sent, current_state) res = [] for i in range(len(sent)): # unpack and discard the C flags (t,C) = tags[i+2] res.append((sent[i], t)) return res def _tagword(self, sent, current_states): ''' :param sent : List of words remaining in the sentence :type sent : [word,] :param current_states : List of possible tag combinations for the sentence so far, and the log probability associated with each tag combination :type current_states : [([tag, ], logprob), ] Tags the first word in the sentence and recursively tags the reminder of sentence Uses formula specified above to calculate the probability of a particular tag ''' # if this word marks the end of the sentance, # return the most probable tag if sent == []: (h, logp) = current_states[0] return h # otherwise there are more words to be tagged word = sent[0] sent = sent[1:] new_states = [] # if the Capitalisation is requested, # initalise the flag for this word C = False if self._C and word[0].isupper(): C=True # if word is known # compute the set of possible tags # and their associated log probabilities if word in self._wd.conditions(): self.known += 1 for (history, curr_sent_logprob) in current_states: logprobs = [] for t in self._wd[word].samples(): p_uni = self._uni.freq((t,C)) p_bi = self._bi[history[-1]].freq((t,C)) p_tri = self._tri[tuple(history[-2:])].freq((t,C)) p_wd = float(self._wd[word][t])/float(self._uni[(t,C)]) p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri p2 = log(p, 2) + log(p_wd, 2) logprobs.append(((t,C), p2)) # compute the result of appending each tag to this history for (tag, logprob) in logprobs: new_states.append((history + [tag], curr_sent_logprob + logprob)) # otherwise a new word, set of possible tags is unknown else: self.unknown += 1 # since a set of possible tags, # and the probability of each specific tag # can not be returned from most classifiers: # specify that any unknown words are tagged with certainty p = 1 # if no unknown word tagger has been specified # then use the tag 'Unk' if self._unk is None: tag = ('Unk',C) # otherwise apply the unknown word tagger else : [(_w, t)] = list(self._unk.tag([word])) tag = (t,C) for (history, logprob) in current_states: history.append(tag) new_states = current_states # now have computed a set of possible new_states # sort states by log prob # set is now ordered greatest to least log probability new_states.sort(reverse=True, key=itemgetter(1)) # del everything after N (threshold) # this is the beam search cut if len(new_states) > self._N: new_states = new_states[:self._N] # compute the tags for the rest of the sentence # return the best list of tags for the sentence return self._tagword(sent, new_states)
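# Two toy sketches of the core ideas documented in the TnT class above: the
# linear interpolation of the three Markov models and the beam-search cut at
# the end of _tagword.  The lambda values, probabilities and candidate states
# below are invented for illustration; they are not taken from the tagger.
from math import log
from operator import itemgetter

def interpolated_prob(p_uni, p_bi, p_tri, l1, l2, l3):
    """P(t_i | t_i-1, t_i-2) as the weighted sum of uni-, bi- and trigram models."""
    return l1 * p_uni + l2 * p_bi + l3 * p_tri

def beam_prune(states, N):
    """Keep only the N most probable (tag history, log probability) states."""
    states.sort(reverse=True, key=itemgetter(1))
    return states[:N]

p = interpolated_prob(p_uni=0.05, p_bi=0.20, p_tri=0.60, l1=0.1, l2=0.3, l3=0.6)
print(log(p, 2))   # tagging accumulates log probabilities instead of multiplying

states = [(['BOS', 'DT', 'NN'], -3.2), (['BOS', 'DT', 'VB'], -7.9), (['BOS', 'DT', 'JJ'], -4.1)]
print(beam_prune(states, N=2))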
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times did "the" occur?
freq_dist.count('the')
# What was the frequency of the word "the"?
freq_dist.freq('the')
# How many word tokens were counted?
freq_dist.N()
# What word types were encountered?
freq_dist.samples()
# What was the most common word?
freq_dist.max()

# What is the distribution of word lengths in a corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))
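# The snippet above relies on the long-removed NLTK 1.x Token/inc/count/samples
# API.  A rough modern-NLTK equivalent of the same counts might look like this;
# the toy string stands in for the contents of the original
# 'dados/may2001_pdf.torto' file.
from nltk.probability import FreqDist
from nltk.tokenize import WhitespaceTokenizer

text = 'the cat sat on the mat'
tokens = WhitespaceTokenizer().tokenize(text)
freq_dist = FreqDist(tokens)

print(freq_dist['the'])          # raw count, formerly freq_dist.count('the')
print(freq_dist.freq('the'))     # relative frequency
print(freq_dist.N())             # number of tokens counted
print(list(freq_dist.keys()))    # word types, formerly freq_dist.samples()
print(freq_dist.max())           # most common word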
def get_entropy(category=None):
    frq = FreqDist(brown.words(categories=category))
    return sum(map(lambda w: -frq.freq(w) * math.log(frq.freq(w), 2), frq.keys()))
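# Usage sketch for get_entropy above, with the imports the function appears to
# assume; 'news' is just an example Brown category.  The value is the entropy,
# in bits, of the unigram word distribution for that category.
import math
from nltk import FreqDist
from nltk.corpus import brown

print(get_entropy('news'))
print(get_entropy())   # all of Brown when category is None (noticeably slower)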
# What is the distribution of word lengths in the corpus?
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    freq_dist.inc(len(token['TEXT']))

# Plot the results
wordlens = freq_dist.samples()
# Sort the list
wordlens.sort()
# Build (word length, frequency) pairs;
# to inspect them, run: print points
points = [(l, freq_dist.freq(l)) for l in wordlens]
Plot(points)
print points

# What is the distribution of the lengths of words that end in vowels?
VOWELS = ('a', 'e', 'i', 'o', 'u')
freq_dist = FreqDist()
for token in corpus['SUBTOKENS']:
    if token['TEXT'][-1].lower() in VOWELS:
        freq_dist.inc(len(token['TEXT']))

# Plot the results
wordlens = freq_dist.samples()
wordlens.sort()
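# A modern-NLTK sketch of the word-length distribution computed above (the
# legacy nltk.draw.plot.Plot class is not part of current NLTK, so the points
# are simply printed).  The toy token list stands in for the corpus tokens.
from nltk.probability import FreqDist

tokens = ['o', 'gato', 'comeu', 'a', 'sardinha']
length_dist = FreqDist(len(t) for t in tokens)
points = sorted((length, length_dist.freq(length)) for length in length_dist)
print(points)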
dir(fdist)
fdist.max
fdist.values
fdist.values()
fdist.values().sum()
sum(fdist.values())
fdist['delicious'] / sum(fdist.values())
fdist['disgusting'] / sum(fdist.values())
fdist['disgusting']
fdist['vegetarian']
fdist['old-timey']
fdist['healthy']
fdist['expensive']
print text
print(text)
fdist.freq('delicious')
fdist.freq('delicnotehu')
fdist.N()
fdist
?
fdist?
fdist.freq('Delicious')
fdist
fdist.freq('rainy')
Business.where_raw('')
Business.where_raw('latitude <= 40.75')
Business.where_raw('latitude <= 40.75').count()
Business.where_raw('latitude <= 40.75 and latitude > 40.749')
Business.where_raw('latitude <= 40.75 and latitude > 40.749').count
Business.where_raw('latitude <= 40.75 and latitude > 40.749').count()
lat = 40.71
lon = -74.01
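# The session above computes the same quantity two ways: a manual ratio over
# sum(fdist.values()) and FreqDist.freq().  A tiny sketch of that equivalence,
# using toy review tokens rather than the data the session was exploring.
from nltk import FreqDist

fdist = FreqDist(['delicious', 'delicious', 'expensive', 'healthy'])
print(fdist['delicious'] / sum(fdist.values()))   # 0.5
print(fdist.freq('delicious'))                    # 0.5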
from nltk.token import *
from nltk.tokenizer import WhitespaceTokenizer
from nltk.probability import FreqDist
from nltk.draw.plot import Plot

freq_dist = FreqDist()
corpus = Token(TEXT=open('dados/may2001_pdf.torto').read())
print corpus
WhitespaceTokenizer().tokenize(corpus)
print corpus
for token in corpus['SUBTOKENS']:
    freq_dist.inc(token['TEXT'])

# How many times does the word "form" appear in the corpus?
freq_dist.count('form')
# What is the frequency of the word "form"?
freq_dist.freq('form')
# How many words were counted?
freq_dist.N()
# What word types were found?
freq_dist.samples()
# What is the most common word?
freq_dist.max()
#%%
from nltk.corpus import inaugural
from nltk import ConditionalFreqDist
from nltk.probability import FreqDist

fd3 = FreqDist([s for s in inaugural.words()])
print(fd3.freq('freedom'))

# Conditional frequency distribution of word lengths for each inaugural
# address delivered between 1980 and 2010
cfd = ConditionalFreqDist((fileid, len(w))
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          if fileid > '1980' and fileid < '2010')
print(cfd.items())
cfd.plot()
# %%
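# The fileid comparison above relies on the lexicographic ordering of names
# like '1981-Reagan.txt'.  A sketch of the same filter with the year parsed
# explicitly from the leading four digits of each fileid:
from nltk import ConditionalFreqDist
from nltk.corpus import inaugural

cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    if 1980 < int(fileid[:4]) < 2010
    for w in inaugural.words(fileid))
print(sorted(cfd.conditions()))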
def runTest(self,iteration): print "running test %d"%iteration pageUrl = '/reviews/www.zulily.com' filename = '../test/resources/zulily.pkl' try: sjr = SiteJabberReviews(pageUrl,filename) sjr.load() helper = BayesHelper() buckets = helper.generateLearningSetsFromReviews([sjr],[1,5],{'training': 0.8,'test':0.2}) self.assertEqual(len(buckets['training']), int(0.8*len(sjr.reviewsByRating[1])+int(0.8*len(sjr.reviewsByRating[5])))) self.assertEqual(len(buckets['test']), int(0.2*len(sjr.reviewsByRating[1])+int(0.2*len(sjr.reviewsByRating[5])))) # generate (term) tuples for FD -- this means we need to bust out like terms from combined distributions allWords1 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 1] fd1 = FreqDist(allWords1) allWords5 = [w for (textBag,rating) in buckets['training'] for w in textBag if rating == 5] fd5 = FreqDist(allWords5) commonTerms = [w for w in fd1.keys() if w in fd5.keys()] commonTermFreqs = [(w,fd1.freq(w), fd5.freq(w), abs(fd1.freq(w) - fd5.freq(w))) for w in commonTerms] commonTermFreqs.sort(key = itemgetter(3),reverse=True) # commonDist = [freqDiff for (a,b,c,freqDiff) in commonTermFreqs] # # plt.plot(commonDist) # plt.show() # keep an arbitrary number filterTerms = [w for (w,a,b,freq) in commonTermFreqs if freq > 0.001] # add non common terms (note that bayesian will smooth zero terms out) print 'high frequency differential featureset' fd1Only = [w for w in fd1.keys() if w not in fd5.keys()] filterTerms.extend(fd1Only) fd5Only = [w for w in fd5.keys() if w not in fd1.keys()] filterTerms.extend(fd5Only) defaultWordSet = set(filterTerms) def emitDefaultFeatures(tokenizedText): ''' @param tokenizedText: an array of text features @return: a feature map from that text. ''' tokenizedTextSet = set(tokenizedText) featureSet = {} for text in defaultWordSet: featureSet['contains:%s'%text] = text in tokenizedTextSet return featureSet classifier = None encodedTrainSet = helper.encodeData(buckets['training'],emitDefaultFeatures ) classifier = nltk.NaiveBayesClassifier.train(encodedTrainSet) encodedTestSet = helper.encodeData(buckets['test'], emitDefaultFeatures) accuracy = nltk.classify.accuracy(classifier, encodedTestSet) print "accuracy = %.9f"%accuracy classifier.show_most_informative_features(10) shouldBeClassed1 = [] shouldBeClassed5 = [] for (textbag, rating) in buckets['test']: testRating = classifier.classify(emitDefaultFeatures(textbag)) if testRating != rating: if rating == 1: shouldBeClassed1.append(textbag) else: shouldBeClassed5.append(textbag) print "length of mis-classified 1 star reviews = %d"%len(shouldBeClassed1) print "length of mis-classified 5 star reviews = %d"%len(shouldBeClassed5) print "length of all 1 star reviews submitted = %d"%len(sjr.reviewsByRating[1]) print "length of all 5 star reviews submitted = %d"%len(sjr.reviewsByRating[5]) print "length of test data for 1 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[1])) print "length of test data for 5 star reviews = %d"%int(0.2*len(sjr.reviewsByRating[5])) # incorrectText1 = [(-1,w) for bag in shouldBeClassed1 # for w in bag if w not in stopwords.words('english')] # # correctText1 = [(1,w) for bag in buckets['training'] # for w in bag if w not in stopwords.words('english')] # # allText1 = [] # allText1.extend(incorrectText1) # allText1.extend(correctText1) # # cfdText1 = ConditionalFreqDist(allText1) # # # incorrectText5 = [(-5,w) for bag in shouldBeClassed5 # for w in bag if w not in stopwords.words('english')] # # # correctText5 = [(5,w) for (bag, rating) in 
buckets['training'] # for w in bag if rating == 5 and w not in stopwords.words('english')] # # # allText5 = [] # allText5.extend(incorrectText5) # allText5.extend(correctText5) # # cfdText5 = ConditionalFreqDist(allText5) return accuracy except Exception as inst: self.fail(inst)
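# A compact sketch of the feature-selection idea in runTest above: keep terms
# whose relative frequency differs most between the 1-star and 5-star training
# text, then add every term that occurs on only one side.  The token lists and
# the 0.001 threshold are toy stand-ins for the review data used in the test.
from nltk import FreqDist

words1 = ['terrible', 'slow', 'refund', 'slow', 'refund']
words5 = ['great', 'fast', 'refund']
fd1, fd5 = FreqDist(words1), FreqDist(words5)

common = set(fd1) & set(fd5)
filter_terms = [w for w in common if abs(fd1.freq(w) - fd5.freq(w)) > 0.001]
filter_terms += sorted(set(fd1) - set(fd5))   # 1-star-only terms
filter_terms += sorted(set(fd5) - set(fd1))   # 5-star-only terms
print(filter_terms)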