Code example #1
 def __FreqFromCorpus (self):
     r"""
         This method extracts the frequencies from the corpus
     """
     print "Computing bigrams..."
     bi = FreqDist(bigrams(self.words))
     print "Computing FreqDist..."
     wfr = FreqDist(self.words)

     print "Processing queue..."
     print 
           
     tot = len(bi.keys())
     i = 0
     for eles in bi.keys():
         a = wfr[eles[0]]
         b = wfr[eles[1]]
         ab = bi[eles]
         N = wfr.N()
         try:
             self.__col_logl.append (nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood  (a, b, ab, N))
             print "element %d / %d \t -> \tlog-likelihood of %s %s \t\t ->  %f" % (i, tot, eles[0], eles[1], self.__col_logl[-1])
         except UnicodeEncodeError:
             # catch any encoding errors
             pass
         i += 1
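The call to PunktTrainer()._col_log_likelihood above reaches into an NLTK internal. A minimal sketch of the same bigram scoring through NLTK's public collocations API, assuming words is the same flat token list, could look like this:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

def loglikelihood_scores(words):
    # Count unigrams and bigrams, then rank every bigram by its
    # log-likelihood ratio score (highest first).
    finder = BigramCollocationFinder.from_words(words)
    return finder.score_ngrams(BigramAssocMeasures.likelihood_ratio)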
Code example #2
File: re.py Project: 52nlp/nlp
def pmi(features):
	'''
	Compute the PMI value for all features
	'''
	dic = FreqDist()
	dic_pos = FreqDist()
	pos = 0.0
	N = 0.0
	for i,feature in enumerate(features):
		N = N + 1
		for f in feature:
			if f[-1] == 1:
				pos = pos + 1
				for t in f[:-3]:
					dic_pos.inc(t)
					dic.inc(t)
			else:
				for t in f[:-3]:
					dic.inc(t)
	N = N + len(dic.keys())
	pos = pos + len(dic.keys())
	pmi_pos = {}
	for t in dic.keys():
		pmi_pos[t]=np.log(float((dic_pos[t]+1)*N)/float((dic[t]+1)*pos))
	pmi_pos = dict(sorted(pmi_pos.items(), key=itemgetter(1)))
	return pmi_pos
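FreqDist.inc() was removed in NLTK 3, where FreqDist behaves like collections.Counter. A sketch of the counting part of pmi() under the current API, assuming the same features layout as above (the pos and N bookkeeping stays unchanged):

from nltk import FreqDist

dic = FreqDist()
dic_pos = FreqDist()
for feature in features:          # `features` as passed to pmi()
    for f in feature:
        for t in f[:-3]:
            dic[t] += 1           # replaces dic.inc(t)
            if f[-1] == 1:
                dic_pos[t] += 1   # replaces dic_pos.inc(t)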
Code example #3
def wordprefixsuffixsubstringsprobdist():
	for w in englishdicttxt:
		wtok=w.split()
		if len(wtok) > 0:		
			computeprefixessuffixessubstrings(wtok[0])
			wordlist.append(wtok[0])
	#prefixf=open("WordPrefixesProbabilities.txt","w")
	#suffixf=open("WordSuffixesProbabilities.txt","w")
	prefixdict=FreqDist(prefixes)
	suffixdict=FreqDist(suffixes)
	substringsdict=FreqDist(substrings)
	totalprefixes=sum(prefixdict.values())
	totalsuffixes=sum(suffixdict.values())
	totalsubstrings=sum(substringsdict.values())
	for pk,pv in zip(prefixdict.keys(), prefixdict.values()):
		prefixprobdict[pk] = float(pv)/float(totalprefixes)
	for pk,pv in zip(suffixdict.keys(), suffixdict.values()):
		suffixprobdict[pk] = float(pv)/float(totalsuffixes)
	for pk,pv in zip(substringsdict.keys(), substringsdict.values()):
		substringsprobdict[pk] = float(pv)/float(totalsubstrings)
	#json.dump(prefixprobdict,prefixf)
	#json.dump(suffixprobdict,suffixf)
	#print "prefix probabilities:",prefixprobdict
	#print "suffix probabilities:",suffixprobdict
	return (prefixprobdict, suffixprobdict, substringsprobdict)
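FreqDist already exposes relative frequencies through freq(), which divides a sample's count by the total number of observations, so each manual normalization loop above can be collapsed. A sketch for the prefix table, assuming the same prefixes list:

from nltk import FreqDist

prefixdict = FreqDist(prefixes)
# freq(p) returns prefixdict[p] / prefixdict.N(), i.e. the relative frequency
prefixprobdict = {p: prefixdict.freq(p) for p in prefixdict}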
Code example #4
    def __FreqFromCorpus(self):
        r"""
            This method extracts the frequencies from the corpus
        """
        print "Computing bigrams..."
        bi = FreqDist(bigrams(self.words))
        print "Computing FreqDist..."
        wfr = FreqDist(self.words)

        print "Processing queue..."
        print

        tot = len(bi.keys())
        i = 0
        for eles in bi.keys():
            a = wfr[eles[0]]
            b = wfr[eles[1]]
            ab = bi[eles]
            N = wfr.N()
            try:
                self.__col_logl.append(
                    nltk.tokenize.punkt.PunktTrainer()._col_log_likelihood(
                        a, b, ab, N))
                print "element %d / %d \t -> \tlog-likelihood of %s %s \t\t ->  %f" % (
                    i, tot, eles[0], eles[1], self.__col_logl[-1])
            except UnicodeEncodeError:
                # catch any encoding errors
                pass
            i += 1
Code example #5
File: lda.py Project: wilmtang/StatisticalNLP
class VocabBuilder:
    """
    Creates a vocabulary after scanning a corpus.
    """

    def __init__(self, lang="english", min_length=3, cut_first=100):
        """
        Set the minimum length of words and which stopword list (by language) to
        use.
        """
        self._counts = FreqDist()
        self._stop = set(stopwords.words(lang))
        self._min_length = min_length
        self._cut_first = cut_first

        print("Using stopwords: %s ... " % " ".join(list(self._stop)[:10]))

    def scan(self, words):
        """
        Add a list of words as observed.
        """

        for ii in [x.lower() for x in words if x.lower() not in self._stop \
                       and len(x) >= self._min_length]:
            self._counts.inc(ii)

    def vocab(self, size=5000):
        """
        Return a list of the top words sorted by frequency.
        """
        if len(self._counts) > self._cut_first + size:
            return self._counts.keys()[self._cut_first:(size + self._cut_first)]
        else:
            return self._counts.keys()[:size]
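Under NLTK 3, FreqDist.keys() is a plain dict view that is neither frequency-ordered nor sliceable, and inc() is gone (scan would use self._counts[ii] += 1). A sketch of vocab() under those assumptions, with most_common() providing the frequency ranking:

    def vocab(self, size=5000):
        # most_common() replaces the frequency-ordered keys() of NLTK 2
        ranked = [w for w, _ in self._counts.most_common()]
        if len(ranked) > self._cut_first + size:
            return ranked[self._cut_first:self._cut_first + size]
        return ranked[:size]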
Code example #6
File: sentiment.py Project: srom/sentiment
    def get_most_common_ngrams(self, n, nb_ngrams=None):
        """
        Compute and return the set of the most common ngrams in the documents.
        This set is cached inside the object.

        Args:
            n: The number of grams. Must be a positive integer.
            nb_ngrams: The number of ngrams to return, i.e. quantifying the 'most'.

        Returns:
            A list of the most common ngrams.
        """
        try:
            # return cached value
            return self._most_common_ngrams[n]
        except KeyError:
            pass

        # compute all ngrams
        all_ngrams = []
        for document in self.training_set:
            all_ngrams.extend(self.compute_ngrams(document, n))

        # get the frequency or return all ngrams
        freq = FreqDist(ngram for ngram in all_ngrams)
        # store and return the nb_ngrams most common ngrams
        if nb_ngrams:
            self._most_common_ngrams[n] = freq.keys()[:nb_ngrams]
        else:
            self._most_common_ngrams[n] = freq.keys()
        return self._most_common_ngrams[n]
Code example #8
File: project.py Project: bayomim/CIS-530-Project
def get_bot_nouns_verbs(pos_tags, tagmap, n):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for (word, tag) in pos_tags:
        if tagmap[tag] == "VERB" and word not in funcwords and wn.synsets(word):
            fdVerb.inc(word) 
        elif tagmap[tag] == "NOUN" and word not in funcwords and wn.synsets(word):
            fdNoun.inc(word)
    return (fdNoun.keys()[::-1][:n], fdVerb.keys()[::-1][:n])
Code example #9
def get_all_nouns_verbs(tok_sents, tagmap):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for sent in tok_sents:
        for tup in sent:
            if tagmap[tup[2]] == "VERB" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdVerb.inc(tup[1]) 
            elif tagmap[tup[2]] == "NOUN" and tup[1] not in funcwords and wn.synsets(tup[0]):
                fdNoun.inc(tup[1])
    return (fdNoun.keys(), fdVerb.keys())
Code example #10
File: project.py Project: zyh961117/CIS-530-Project
def get_bot_nouns_verbs(pos_tags, tagmap, n):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for (word, tag) in pos_tags:
        if tagmap[tag] == "VERB" and word not in funcwords and wn.synsets(
                word):
            fdVerb.inc(word)
        elif tagmap[tag] == "NOUN" and word not in funcwords and wn.synsets(
                word):
            fdNoun.inc(word)
    return (fdNoun.keys()[::-1][:n], fdVerb.keys()[::-1][:n])
Code example #11
def get_all_nouns_verbs(tok_sents, tagmap):
    # get_func_words('/home1/c/cis530/hw4/funcwords.txt')
    funcwords = get_func_words('funcwords.txt')
    fdNoun = FreqDist()
    fdVerb = FreqDist()
    for sent in tok_sents:
        for tup in sent:
            if tagmap[tup[2]] == "VERB" and tup[
                    1] not in funcwords and wn.synsets(tup[0]):
                fdVerb.inc(tup[1])
            elif tagmap[tup[2]] == "NOUN" and tup[
                    1] not in funcwords and wn.synsets(tup[0]):
                fdNoun.inc(tup[1])
    return (fdNoun.keys(), fdVerb.keys())
Code example #12
File: ExtractorOfWords.py Project: Labzin/NLP
class ExtractorOfWords():
    
    def __init__(self, pos_words, neg_words, type_of_Feature_extractor = 0): 
        self.pos_words_training =  reduce(lambda words,review: words + review.words(), pos_words, [])
        self.neg_words_training =  reduce(lambda words,review: words + review.words(), neg_words, [])
        
        if type_of_Feature_extractor == 1:
            formated_pos_words_training = self.Feature_extractor1(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor1(self.neg_words_training)
        elif type_of_Feature_extractor == 2:
            formated_pos_words_training = self.Feature_extractor2(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor2(self.neg_words_training)
        elif type_of_Feature_extractor == 3:
            formated_pos_words_training = self.Feature_extractor3(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor3(self.neg_words_training)
        elif type_of_Feature_extractor == 4:
            formated_pos_words_training = self.Feature_extractor4(self.pos_words_training)
            formated_neg_words_training = self.Feature_extractor4(self.neg_words_training)
        else: 
            formated_pos_words_training = self.pos_words_training
            formated_neg_words_training = self.neg_words_training
        
        self.pos_words_freqdist = FreqDist(formated_pos_words_training)
        self.neg_words_freqdist = FreqDist(formated_neg_words_training)
               
    #Extract n most Freq. words    
    def Extraxt_n_most_Freq_Words (self, n):
        return  self.pos_words_freqdist.keys()[:n], self.neg_words_freqdist.keys()[:n]
    
    #list of all words with their number of occurrences over *number_count*   
    def Extraxt_words_above_count (self, number_count):
        return  [word for word,count in self.pos_words_freqdist.iteritems() if count > number_count], [word for word,count in self.neg_words_freqdist.iteritems() if count > number_count]
    
    #PorterStemmer
    def Feature_extractor1(self, in_list):             
        ps =  PorterStemmer()
        return [ps.stem(w) for w in in_list]
    
    #lowercase versions of all the words
    def Feature_extractor2(self, in_list):             
        return [w.lower() for w in in_list]
    
    #Replace all number tokens with "NUM"
    def Feature_extractor3(self, in_list):             
        return ["NUM" if w.isdigit() else w for w in in_list]
    
    #combination of filters 1 and 2
    def Feature_extractor4(self, in_list):             
        return [w.lower() for w in in_list if w.isalpha() and w.lower() not in stopwords.words('english')]
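A hypothetical usage of the class above, assuming pos_reviews and neg_reviews are lists of review objects exposing a words() method (as the reduce() calls in __init__ imply):

extractor = ExtractorOfWords(pos_reviews, neg_reviews, type_of_Feature_extractor=2)
top_pos, top_neg = extractor.Extraxt_n_most_Freq_Words(100)    # 100 most frequent words per class
over_pos, over_neg = extractor.Extraxt_words_above_count(5)    # words seen more than 5 times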
Code example #13
File: q2.py Project: atiassa/recommend-2011
def _train(self, tagged_corpus, cutoff=0, verbose=False): 
    token_count = hit_count = 0 
    useful_contexts = set() 
    fd = ConditionalFreqDist() 
    tag_prob = FreqDist()
    for sentence in tagged_corpus: 
        tokens, tags = zip(*sentence) 
        for index, (token, tag) in enumerate(sentence): 
            # Record the event. 
            token_count += 1 
            tag_prob.inc(tag)
            context = self.context(tokens, index, tags[:index])
            if context is None: continue 
            fd[context].inc(tag) 
            # If the backoff got it wrong, this context is useful: 
            if (self.backoff is None or 
                tag != self.backoff.tag_one(tokens, index, tags[:index])): 
                useful_contexts.add(context) 
    # Build the context_to_tag table -- for each context,  
    # calculate the entropy.  Only include contexts whose entropy is
    # lower than `cutoff`.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t,tag_prob[t]/total_tags) for t in tag_prob.keys()]
    useful_contexts_after_filter = useful_contexts.copy()
    most_high = FreqDist()
    for context in useful_contexts:
        dd = fd[context]
#        total_tags = float(sum(dd.values()))
#        tags_probs = [(t,dd[t]/total_tags) for t in dd.keys()]
        h = self.H(dd.keys(),tags_probs)
        if h > cutoff:
            useful_contexts_after_filter.remove(context)
            continue
        most_high[context] = h
    print most_high.keys()
    # Build the context_to_tag table -- for each context, figure
    # out what the most likely tag is.  
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits
    # Display some stats, if requested. 
    if verbose: 
        size = len(self._context_to_tag) 
        backoff = 100 - (hit_count * 100.0)/ token_count 
        pruning = 100 - (size * 100.0) / len(fd.conditions()) 
        print "[Trained Unigram tagger:", 
        print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning)
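The filter above keeps only the contexts whose tag entropy stays below cutoff. self.H is not shown here, but a standard Shannon-entropy helper over (tag, probability) pairs would look like the following sketch (not the original implementation):

import math

def entropy(tag_probs):
    # Shannon entropy, in bits, of a list of (tag, probability) pairs
    return -sum(p * math.log(p, 2) for _, p in tag_probs if p > 0)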
Code example #14
def find_stop_words(file, lower_bound, upper_bound):
    all_posts = []
    stop_words = []
    for raw_post in file:
        all_posts += raw_post[1]

    frequency = FreqDist(all_posts)
    total_count = frequency.B()
    lower_bound = total_count * lower_bound
    upper_bound = total_count * (1 - upper_bound)

    stop_words += frequency.keys()[-int(lower_bound):]
    stop_words += frequency.keys()[:int(upper_bound)]

    return stop_words
Code example #15
class termBasedConsiderBackgroundModel(AbstractGenerativeModel):
    def __init__(self, analyser, backgroundDistribution,
                 probOfBackgroundModel):
        self.backGroundDistro = backgroundDistribution
        self.analyser = analyser
        self.ProbBackground = probOfBackgroundModel

    def generateProbabilityDistribution(self, document_list):
        tokens = []
        for doc in document_list:
            tokens += self.analyser(doc)
        self.freqDist = FreqDist(tokens)

        foreground_prob = 1 - self.ProbBackground
        prob_distro = {}

        backDistro = FreqDist()
        for word in self.freqDist.keys():
            backDistro[word] = self.backGroundDistro[word]

        for word in self.freqDist.keys():
            if word not in self.backGroundDistro.keys():
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word))
            else:
                prob_distro[word] = (1.0 / foreground_prob) * (
                    self.freqDist.freq(word) -
                    (self.ProbBackground * backDistro.freq(word)))
        self.prob_distro = prob_distro
        flag = True
        for key in prob_distro:
            if prob_distro[key] < 0 or prob_distro[key] > 1:
                flag = False
                break
        return flag

    def getProbabilityDistribution(self):
        return self.prob_distro

    def probOfDocument(self, document):
        tokens = self.analyser(document)
        prob = 1.0
        for token in tokens:
            if token in self.prob_distro:
                prob *= self.prob_distro[token]
        if prob == 1.0:
            return 0.0
        return prob
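The arithmetic in generateProbabilityDistribution reads as solving a two-component mixture for the foreground model: if each observed word probability is assumed to be P(w) = lambda * P_bg(w) + (1 - lambda) * P_fg(w), with lambda = probOfBackgroundModel, then P_fg(w) = (P(w) - lambda * P_bg(w)) / (1 - lambda), which is exactly the per-word expression computed above; the returned flag reports whether every recovered P_fg(w) still lies in [0, 1].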
Code example #16
File: NLTK_tools.py Project: dreampocketit/bocard
def demo_similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.
        
        @param word: The word used to seed the similarity search
        @type word: C{str} 
        @param num: The number of words to generate (default=20)
        @type num: C{int}
        @seealso: L{ContextIndex.similar_words()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        while 1:
          word = raw_input('Enter a Chinese word such as "開心"(type 0 to exit):'); 
          print "word='"+ word + "'"
          if word == '0': break
          word = word.decode('utf-8')
          wci = self._word_context_index._word_to_contexts
          if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
          else:
            print "No matches"
Code example #17
File: OrigReader.py Project: wencanluo/Summarization
	def getsimilar(self, word, num =20):
		"""
		@param word: The word used to seed the similarity search 
		@type word: C{str}  
		@param num: The number of words to generate (default=20) 
		@type num: C{int} 
		@seealso: L{ContextIndex.similar_words()}
		"""	
		if '_word_context_index' not in self.__dict__: 
			print 'Building word-context index...' 
			self._word_context_index = ContextIndex(self.tokens, 
		                                        filter=lambda x:x.isalpha(), 
		                                        key=lambda s:s.lower()) 
		#words = self._word_context_index.similar_words(word, num) 
		word = word.lower() 
		wci = self._word_context_index._word_to_contexts 
		if word in wci.conditions(): 
			contexts = set(wci[word]) 
			fd = FreqDist(w for w in wci.conditions() for c in wci[w] 
		              if c in contexts and not w == word) 
			words = fd.keys()[:num] #lists of words
			#print tokenwrap(words) 
			return words
		else: 
			print "No matches"
			return None
Code example #18
File: model_classifier.py Project: jiwooming9/NID
def get_word_features(wordlist):
    """
        return the words with the highest frequency of occurrence
    """
    wordlist = FreqDist(wordlist)
    #print wordlist.keys(),"------->",wordlist.values()
    return wordlist.keys()
Code example #19
def getAllWords(lines, stop_words):
    all_words = {}
    try:
        for line in lines:
            words = line.split()
            for word in words:
                if word not in stop_words:
                    all_words[word] = True
        temp = all_words.keys()
        # removePunctuationFromList(temp)


        top_words = FreqDist(temp)
        print("All Words list length : ", len(top_words))
        # print(str(list(all_words1.keys())[:100]))

        # use top 20000 words
        return list(top_words.keys())[:20000]
        # word_features = list(all_words.keys())[:6000]
        # featuresets = [(find_features(rev, word_features), category)
        #        for (rev, category) in documents]
        # print("Feature sets list length : ", len(featuresets))
    except Exception as e:
        print("type error: " + str(e))
        exit()
Code example #20
    def run(self):
        # Preprocessing
        train_corpus = self.preprocess(self.trainData)
        self.test_corpus = self.preprocess(self.testData)

        # Generate dictionary
        wordFreq = FreqDist([
            word for phrase in train_corpus + self.test_corpus
            for word in phrase.split(" ")
        ])
        self.vocabulary = list(wordFreq.keys())[:2000]

        # Extracting features
        self.extractFeatures(train_corpus)

        # Polarize matrix
        self.modifyMatrix()
        # pickle.dump(self.X, open("dataMatrix", "wb"))
        # self.X = pickle.load(open("dataMatrix", "rb"))

        self.trainX, self.testX, self.trainY, self.testY = train_test_split(
            self.X, self.y, test_size=0.1, random_state=0)
        del self.X

        # Classify
        classifier_name = "DNN"
        self.classify(classifier_name)
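Note that list(wordFreq.keys())[:2000] takes the first 2000 distinct words in insertion order, not the 2000 most frequent ones. If a frequency-ranked vocabulary is what is intended, most_common() gives it directly (a sketch):

self.vocabulary = [word for word, _ in wordFreq.most_common(2000)]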
Code example #21
File: term_frequency.py Project: mkcor/caption-words
def tokenize_clean(text):
    """Return list of items from tokenized text."""
    tokens = word_tokenize(text.lower())
    fdist = FreqDist(tokens)
    words = [w.lower() for w in fdist.keys()
             if w not in stopwords.words('english') and w.isalpha()]
    return words
Code example #22
File: markov.py Project: christiaanw/NaNoGenMo-2014
class MyMarkovModel(MarkovModel):
    def __init__(self, order):
        self.order = order
        self.filename = NGRAM_FILES[self.order]
       
        if 3 >= self.order >= 2:
            self.backoff = MyMarkovModel(order - 1)
            self.cfd = ConditionalFreqDist()
            self.charset = self.backoff.charset
            for ngram, count in self.get_data():
                context, char = tuple(ngram[:-1]), ngram[-1]
                self.cfd[context][char] = count

        elif self.order == 1:
            self.backoff = None
            self.n = 0
            self.fd = FreqDist()
            for char, count in self.get_data():
                self.fd[char] = count
            self.charset = set(self.fd.keys())

        else:
            raise NotImplementedError

    def get_data(self):
        with open(self.filename) as fp:
            for line in fp.readlines():
                ngram, count = line.lower().split()
                count = int(count)
                yield ngram, count
Code example #23
def pkmpopana():
	df=pd.read_csv("pkm-19-clean.csv")
	for i in range(20,27):
		df=df.append(pd.read_csv(f'pkm-{str(i)}-clean.csv'),ignore_index=True)
	sen=''
	for j in df['text']:
		sen+=j
	sen=sen.lower()
	toker=RegexpTokenizer(r'\w+')
	words=toker.tokenize(sen)
	stop_words = set(stopwords.words('english'))
	filtered_sentence = [w for w in words if not w in stop_words]
	fdist=FreqDist(filtered_sentence)
	pk=pd.read_csv('pokemon.csv')
	pk=pk[pk['id']<152]
	pkmname=list(pk['pokemon'])
	re={}
	for n in pkmname:
		if n in fdist.keys():
			re[n]=fdist[n]
	so=sorted(re.items(),key=lambda item:item[1],reverse = True)
	l,p=[],[]
	tar=so[0:2]
	for i in tar:
		l.append(i[1])
		p.append(i[0])
	plt.barh(list(range(len(tar))),width=l[::-1],align='center')
	plt.xlabel('count')
	plt.ylabel('name')
	plt.yticks(list(range(len(tar))),p[::-1])
	plt.show()
Code example #24
File: summarizer.py Project: rusad/summarizer
 def getFreq(self, text, normalize=True):
     stop_words = stopwords.words(self.detectLanguage(text))
     words = self.getTokens(text)
     clean_words = filter(
         lambda word: not word in stop_words and not word in punctuation,
         words)
     fdist = FreqDist(clean_words)
     #==============================================================================
     #         # same result
     #         fdist = FreqDist()
     #         for word in word_tokenize(text):
     #             word = word.lower()
     #             if not word in stop_words and not word in punctuation:
     #                 fdist[word] += 1
     #==============================================================================
     # normalization by dividing by max frequency
     if normalize:
         norm = float(max(fdist.values()))
         for word in fdist.keys():
             fdist[word] = fdist[word] / norm
             # remove too frequent and too rare words
             if fdist[word] >= self._upper_bound or fdist[
                     word] <= self._lower_bound:
                 del fdist[word]
     return fdist
Code example #25
def unigramAll():
    to_save_folder = "./#Unigram[.]/"
    folder_list = os.listdir("./")
    for folder in folder_list:
        if folder.find(".") != -1:
            continue
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc"
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read()
        words = word_tokenize(text)
        fdist = FreqDist(w for w in words if len(w) > 1 and w != "``")

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0]) + " , " + str(key[1]) + "\n"

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[unigram].csv",
                      "w+",
                      encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Code example #26
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens,
                filter=lambda x: x.isalpha(),
                key=lambda s: s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")
Code example #27
 def run(self):
     # Preprocessing
     train_corpus = self.preprocess(self.trainData)
     test_corpus = self.preprocess(self.testData)
     
     # Generate dictionary
     wordFreq = FreqDist([word for phrase in train_corpus+test_corpus for word in phrase.split(" ")])
     self.vocabulary = list(wordFreq.keys())
     self.word2id = {word: i for i, word in enumerate(self.vocabulary)}
     
     # Extracting features
     self.X = self.extractFeatures(train_corpus)
     self.testData = self.extractFeatures(test_corpus)
     
     # Determine max sequence length
     lenStats = sorted([len(phrase) for phrase in self.X+self.testData])
     maxLength = lenStats[int(len(lenStats)*0.8)]
     
     # Pad sequences
     self.X = sequence.pad_sequences(np.array(self.X), maxlen=maxLength)
     self.testData = sequence.pad_sequences(np.array(self.testData), maxlen=maxLength)
     
     # Split validation set
     self.trainX, self.testX, self.trainY, self.testY = train_test_split(self.X, self.y, test_size=0.1, random_state=0)
     
     # Classify
     classifier_name = "XGBoost"
     self.classify(classifier_name)
Code example #28
def convert_real_data(df):
    docs = []
    for index, row in df.iterrows():
        uid = row['id']
        text = row['text']

        content = Content()
        content.raw = text
        content.tokens = nltk.word_tokenize(text)
        content.date = datetime.datetime.utcnow()

        fdist = FreqDist(content.tokens)
        freq_list = []
        for k in fdist.keys():
            freq_list.append((k, fdist[k]))
        content.construct_word_freq_list(freq_list)

        # content.construct_word_freq_list([('basketbal', 1), ('document', 1), (
        # 'footbal', 1), ('gener', 1), ('golf', 1), ('sport', 1), ('talk', 1), ('tenni', 1)])

        doc = TestDocument(uid, "test_name", "test_name",
                           datetime.datetime.utcnow(), content, "no_url", 0)
        docs.append(doc)

    return docs
Code example #29
def answer_four():
    wordfreq = FreqDist(text1)
    freqAnswerFour = [
        w for w in wordfreq.keys() if len(w) > 5 and wordfreq[w] > 150
    ]

    return sorted(freqAnswerFour)  # Your answer here
Code example #30
def get_term_freq_dict(data):
    # Change it to lower case
    lower_data = data.lower()
    
    # Tokenize it
    tokens = word_tokenize(lower_data)
    freq_dist = FreqDist(tokens)
    
    # Lemmatize it
    word_freq = {}
    
    for term in freq_dist.keys():
        lemmatize_term = wordnet.lemmatize(term)
        val = freq_dist.get(term)
        
        # If it exist in word_freq, add value
        if lemmatize_term in word_freq:
            freq = word_freq[lemmatize_term]
            word_freq[lemmatize_term] = freq + val
            
        # Else, assign value
        else:
            word_freq[lemmatize_term] = val
    
    
    return word_freq
Code example #31
def write(file, final_list, labels, tokens):
    num_tracker = []
    freq = FreqDist([word for sublist in final_list for word in sublist])
    #block of code to get freq of word in instance
    for iter in range(0, len(final_list)):
        row_list = []
        line_freq = FreqDist(final_list[iter])
        for word in freq:
            if word in line_freq:
                row_list.append(u'{}'.format(line_freq[word]))
            else:
                row_list.append(u'0')
        num_tracker.append(row_list)

    write_list = tokens
    tokens = []
    counter = 0
    #code to write to csv file
    with io.open(file, 'w', encoding='utf8') as outfile:
        headers = freq.keys()
        header = ','.join(headers)
        label_word = ',' + "label" + '\n'
        outfile.write(header + label_word)
        for num in num_tracker:
            num_occurances = ','.join(num)
            end_label = ',' + labels[counter] + '\n'
            counter += 1
            outfile.write(num_occurances + end_label)
Code example #32
    def mostCommWords(self, tag, pos_tag_pattern):
        """
            This is a help method for mostCommNouns and mostCommVerbs.
            Argument:   tag --  the hashtag whose most common co-occurring words we want to compute
                        pos_tag_pattern
                            --  the regular expression used to match the POS tags
            return:     a list of the top 20 nouns associated with the input hashtag
            """
        words = {}
        topTwenty = []
        j = 0
        for line in self.lines:
            hasTag = False
            for t in self.tokenizer(line, hashtag_pattern):
                if t == tag:
                    hasTag = True
                    break
            if hasTag:
                counts = FreqDist()
                tokens = self.tokenizer(line, word_pattern)
                pos = nltk.pos_tag(tokens)
                for p in pos:
                    if re.match(pos_tag_pattern, p[1]):
                        counts.inc(p[0])
                for n in counts.keys():
                    if words.has_key(n):
                        words[n] = words[n] + counts[n]
                    else:
                        words[n] = counts[n]
        words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
        for i in range(0, 20):
            topTwenty.append(words_sorted_by_counts[i][0])

        return topTwenty
Code example #33
File: text.py Project: damorelse/MachineTranslation
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = islice(fd.keys(), num)
            print(tokenwrap(words))
        else:
            print("No matches")
Code example #34
def get_top_words(directory, n, file):
	num_docs = 0.0
	flist = {}
	result = {}
	for f in os.listdir(directory):
		#stop = "/Users/oliverfengpet/Dropbox/TwitterAffect/stoplist.txt"
		
		num_docs+=1
		rawContents = load_file_tokens(directory+'/'+f)
		fdist = FreqDist( rawContents )
		normalF = max(fdist.values())
		
		for key in fdist.keys():
			fdist[key]=float(float(fdist[key])/normalF)
	
		flist[directory+'/'+f] = fdist
		
		
	for key in flist[file].keys():
		num_appear=0
		for key_file in flist.keys():
			if key in flist[key_file].keys():
				num_appear+=1
		
		result[key] = flist[file][key]*math.log(num_docs/(num_appear))
	
	sorted_x = sorted(result.iteritems(), key=operator.itemgetter(1),reverse=True)
	
	top_x = sorted_x[:n]
	result = []
	
	for item in top_x:
		result.append(item[0])
	
	return result
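This is a plain tf-idf weighting: each term's count is normalized by the most frequent term in its file (tf = count / max count), multiplied by idf = log(num_docs / num_appear), and the n highest-weighted terms of the requested file are returned.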
Code example #35
File: ch01.py Project: gree2/hobby
def fun10():
    """frequency distribution"""
    fdist1 = FreqDist(text1)
    # print fdist1
    vocabulary1 = fdist1.keys()
    # print vocabulary1[:50]
    fdist1.plot(50, cumulative=True)
Code example #36
File: tokenizer.py Project: 52nlp/nlp
 def mostCommWords(self, tag, pos_tag_pattern):
     """
         This is a help method for mostCommNouns and mostCommVerbs.
         Argument:   tag --  the hashtag whose most common co-occurring words we want to compute
                     pos_tag_pattern
                         --  the regular expression used to match the POS tags
         return:     a list of the top 20 nouns associated with the input hashtag
         """
     words={}
     topTwenty=[]
     j = 0
     for line in self.lines:
         hasTag = False
         for t in self.tokenizer(line, hashtag_pattern):
             if t == tag:
                 hasTag = True
                 break
         if hasTag:
             counts = FreqDist()
             tokens = self.tokenizer(line, word_pattern)
             pos = nltk.pos_tag(tokens)
             for p in pos:
                 if re.match(pos_tag_pattern,p[1]):
                     counts.inc(p[0])
             for n in counts.keys():
                 if words.has_key(n):
                     words[n] = words[n]+counts[n]
                 else:
                     words[n] = counts[n]
     words_sorted_by_counts = sorted(words.items(), key=lambda x: x[1], reverse=True)
     for i in range(0,20):
         topTwenty.append(words_sorted_by_counts[i][0])
     
     return topTwenty
Code example #37
File: ch01.py Project: akiniwa/hobby
def fun10():
    """frequency distribution"""
    fdist1 = FreqDist(text1)
    # print fdist1
    vocabulary1 = fdist1.keys()
    # print vocabulary1[:50]
    fdist1.plot(50, cumulative=True)
Code example #38
def ngram4All():
    to_save_folder = "./#Ngram_4[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8")
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"]
        nlist4 = []
        vlen = len(valid_word);
        for i in range(0,vlen-3):
            nlist4.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2] + " " +valid_word[i+3])

        fdist = FreqDist(w for w in nlist4)
        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = ""
        for key in keys:
            dataFreq += str(key[0])+ "," + str(key[1]) + "\n"
        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Ngram_4_Freq].csv", "w+", encoding="utf8")
        writer.write(dataFreq)
        fw.close()
        writer.close()
Code example #39
def get_frequent_pos_bigrams(all_tokens, mf_pos_bigrams):
    """Get frequent part-of-speech bigrams."""

    # Get all part-of-speech tags
    all_pos = [t.sim_pos_full for t in all_tokens]

    # Get bigrams and frequencies
    pos_bigrams = nltk.bigrams(all_pos)
    pos_fdist = FreqDist(pos_bigrams)

    # Set dict
    pos_bigram_freq_dict = {bigram: 0 for bigram in mf_pos_bigrams}

    # Fill dict
    for key in pos_bigram_freq_dict.keys():
        if key in pos_fdist.keys():
            pos_bigram_freq_dict[key] = pos_fdist[key]

    # Normalize frequencies
    pos_bigram_freq = [
        pos_bigram_freq_dict[k] / len(pos_fdist)
        for k in sorted(pos_bigram_freq_dict)
    ]

    return pos_bigram_freq
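FreqDist returns 0 for samples it has never seen (it behaves like collections.Counter), so the membership test in the fill loop is optional; the same step can be written as a single comprehension over the same names (a sketch):

pos_bigram_freq_dict = {bigram: pos_fdist[bigram] for bigram in mf_pos_bigrams}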
Code example #40
def BigramAll():
    to_save_folder = "./#Bigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1 :
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name+"data.doc";
        fw = open(data_path,"r",encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);

        big = list(bigrams(w for w in words if len(w) > 1 and w != "``"));
        myBig = []
        for bi in big:
            myBig.append(bi[0]+" "+bi[1]);

        fdist = FreqDist(str(w) for w in myBig);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq+= str(key[0]).strip()+","+str(key[1]).strip()+"\n";

        make_sure_path_exists(to_save_folder+folder)
        writer = open(to_save_folder+folder+"/"+folder+"[bigram_Freq].csv","w+",encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Code example #41
def trigramAll():
    to_save_folder = "./#Trigram[.]/"
    folder_list = os.listdir("./");
    for folder in folder_list:
        if folder.find(".") != -1:
            continue;
        folder_name = "./" + folder + "/"
        data_path = folder_name + "data.doc";
        fw = open(data_path, "r", encoding="utf8");
        text = fw.read();
        words = word_tokenize(text);
        valid_word = [w for w in words if len(w) > 1 and w != "``"];
        tri_list = [];
        vlen = len(valid_word);
        for i in range(0,vlen-2):
            tri_list.append(valid_word[i]+" "+valid_word[i+1]+" "+valid_word[i+2]);

        fdist = FreqDist(w for w in tri_list);

        keys = fdist.most_common(len(fdist.keys()))
        dataFreq = "";
        for key in keys:
            dataFreq += str(key[0]).strip()+ "," + str(key[1]).strip() + "\n";

        make_sure_path_exists(to_save_folder + folder)
        writer = open(to_save_folder + folder + "/" + folder + "[Triram_Freq].csv", "w+", encoding="utf8");
        writer.write(dataFreq);
        fw.close();
        writer.close();
Code example #42
File: 2-03 Assignment+2.py Project: dasmey/Portfolio
def answer_six():

    dist = FreqDist(moby_tokens)
    vocab = dist.keys()
    frequency = [(dist[w], w) for w in vocab if w.isalpha() and dist[w] > 2000]

    return sorted(frequency, reverse=True)
Code example #43
File: 2-03 Assignment+2.py Project: dasmey/Portfolio
def answer_four():

    dist = FreqDist(moby_tokens)
    vocab = dist.keys()
    freqwords = [w for w in vocab if len(w) > 5 and dist[w] > 150]

    return sorted(freqwords)
Code example #44
def text_to_vector(docs, metric):
    """ Create frequency based feature-vector from text

    Metric must be either :attr:`FrequencyMetrics.TF` or :attr:`FrequencyMetrics.TF_IDF`.
    """
    doc_freqs = FreqDist() # Distribution over how many documents each word appears in.
    tf_dists = [] # List of TF distributions per document

    # Create freq_dist for each document
    for doc in docs:
        doc = preprocess.preprocess_text(doc)
        fd = FreqDist()
        for word in doc: fd.inc(word)
        doc_freqs.update(fd.samples())
        tf_dists.append(fd)


    all_tokens = doc_freqs.keys()
    num_docs = len(docs)
    num_features = len(all_tokens)


    # Build feature x document matrix
    matrix = np.zeros((num_features, num_docs))
    for i, fd in enumerate(tf_dists):
        if metric == FrequencyMetrics.TF:
            v = [fd.freq(word) for word in all_tokens]
        elif metric == FrequencyMetrics.TF_IDF:
            v = [fd.freq(word) * math.log(float(num_docs)/doc_freqs[word]) for word in all_tokens]
        else:
            raise ValueError("No such metric: %s" % metric)
        matrix[:,i] = v

    return matrix
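FreqDist.inc() and FreqDist.samples() belong to the NLTK 2 API. Under NLTK 3 the per-document counting loop is usually written as below (a sketch, assuming preprocess.preprocess_text still returns a list of tokens):

for doc in docs:
    tokens = preprocess.preprocess_text(doc)
    fd = FreqDist(tokens)          # term frequencies for this document
    doc_freqs.update(fd.keys())    # +1 document frequency per distinct word
    tf_dists.append(fd)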
Code example #45
File: sa.py Project: corne12345/FoDS1
def get_word_features(wordlist):
    # print(wordlist)
    wordlist = FreqDist(wordlist)
    word_features = wordlist.keys()
    # print ("Word frequency list\n")
    # pprint(wordlist)
    return word_features
Code example #46
    def FREQ(self, threshold):
        tagged = []
        nouns = []
        noun_phrases = []
        sorted_fdist = []
        result = []
        for s in self.tokens:
            print(s)
            temp = nltk.pos_tag(s)
            print(temp)
            tagged.append(temp)
            nouns = nouns + list(
                filter(lambda x: x[1].__contains__("NN"), temp))
            noun_phrases = noun_phrases + self.get_noun_phrases(s)

            fdist = FreqDist(word.lower() for word in s)
            for x in fdist.keys():
                sorted_fdist.append((fdist.get(x), x))
        sorted_fdist.sort()

        nouns_r = set([x[0] for x in nouns])
        noun_phrases = set(noun_phrases)
        print("=================================")
        print("NOUNS:", nouns)
        print("NOUNSPHRA:", noun_phrases)
        print("FREQ:", sorted_fdist)
        t = list(
            filter(lambda x: x[0] >= threshold and x[1] in nouns_r,
                   sorted_fdist))
        print(t)
        t_r = [x[1] for x in t]
        print("T_R", t_r)
        result = t_r + list(noun_phrases)
        print("RESULT", set(result))
        return set(result)
Code example #47
def char_freq(lines):
    """ Return a DataFrame of characters sorted by frequency in descending order """
    corpus = nltk.Text(chain.from_iterable(lines))  # needs one long string, not a list of strings
    wc = FreqDist(corpus)
    df = pd.DataFrame({'word': wc.keys(), 'freq': wc.values()})
    df.sort('freq', ascending=False, inplace=True)
    df['idx'] = np.arange(len(wc.values()))
    return df
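DataFrame.sort() has been removed from current pandas releases, and FreqDist.most_common() already yields (sample, count) pairs in descending frequency order, so the frame can be built pre-sorted. A sketch under those assumptions:

import numpy as np
import pandas as pd
from itertools import chain
from nltk import FreqDist

def char_freq(lines):
    """Return a DataFrame of characters ordered by descending frequency."""
    counts = FreqDist(chain.from_iterable(lines))   # iterates the characters of each line
    df = pd.DataFrame(counts.most_common(), columns=['word', 'freq'])
    df['idx'] = np.arange(len(df))
    return df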
Code example #48
 def tabulate_categorized_words(self, reader, number_of_cat):
     wordtypes = reader.words()
     print '\n%s %5s %7s %14s\n' %('rank', 'fi', 'Fi*', 'wordtype')
     fd = FreqDist(wordtypes)
     cumulative = 0.0
     rank = 0
     for word in fd.keys()[:number_of_cat]:
         rank += 1
         cumulative += fd[word] * 100.0 / fd.N()
         print "%4d %6d %4d%% %15s" %(rank, fd[word], cumulative, word)
Code example #49
File: hw1_code_jmow.py Project: closen39/CIS-530-HW1
def get_top_words(path, n):
    files = get_all_files(path) # returns [] if path is a file
    fdist = FreqDist()
    if(len(files) == 0):
        for word in load_file_tokens(path):
            fdist.inc(word)
    else:
        for word in load_collection_tokens(path):
            fdist.inc(word)  
    li = fdist.keys()
    return li[:n]
Code example #50
    def report(self, words, top_words=5):
        """
        Return the most likely translations of English words
        """

        for ii in [lower(x).strip() for x in words]:
            probability = FreqDist()
            for jj in self._trans.vocab():
                probability.inc(jj, self._trans.score(ii, jj))

            for jj in probability.keys()[:top_words]:
                yield ii, jj, probability[jj]
Code example #51
def construct_model(copusPath, modelPath):
    mr = CategorizedPlaintextCorpusReader(copusPath, r'(?!\.).*\.txt',
                                           cat_pattern=r'*/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag in documents[:numtrain]]
    """test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag  in documents[numtrain:]]"""
    classifier = nbc.train(train_set)
    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt', cat_pattern=r'*/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i) if w.lower() not in stop and w.lower() 
                   not in string.punctuation],
                   i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i:(i in tokens) for i in word_features_test}, tag) for tokens, tag  in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
Code example #52
def collocations(words,defined_terms):

    
    # Count the words and bigrams
    wfd = FreqDist([w[2] for w in words])
    
    #tri = [tuple(x[1:] for x in words[i:i+3]) for i in range(len(words)-2) if 
    #       tuple(x[1:] for x in words[i:i+3])[1][1] in [token.split('\t')[2] for token in defined_terms]]
    tri = [tuple(x[1:] for x in words[i:i+7]) for i in range(len(words)-6) if 
           tuple(x[1:] for x in words[i:i+7])[3][1] == defined_terms ]#in [token.split('\t')[2] for token in defined_terms]  ]
           #tuple(x[1:] for x in words[i:i+2])[2][1] in [token.split('\t')[2] for token in defined_terms] ]
    #tri = nltk.bigrams([w[1:] for w in words])
    ite = [itertools.combinations(trigram,2) for trigram in tri]



   
    #for i in ite[:10]: print i

    
    bigrams_tag_fd = FreqDist([ff for it in ite for ff in it])

    filtre = nltk.bigrams([w[1:] for w in words])


    '''
    for ii in bigrams_tag_fd.keys():
        if ((ii[0][1] or ii[1][1]) in [token.split('\t')[2] for token in defined_terms]) and (bigrams_tag_fd[ii]>1):
            bigrams_tag_fd[ii] = bigrams_tag_fd[ii]- filtre.count(tuple(ii))
    '''      
   

    ADJ = r"JJ|VVG|VVN|VVD"
    NOM = r"NN|VV$"
    VER = r"VVP|VVZ"

    adj = "ADJ|VER:[(ppre|pper)]"
    nom = "NAM|NOM|VER:infi"
    ver = "VER:[^(ppre|pper|infi)]"
    
    pfd = { (a,b):bigrams_tag_fd[(a,b)] for (a, b) in sorted(bigrams_tag_fd.keys())
                                    if (b[1] == defined_terms and#in [token.split('\t')[2] for token in defined_terms] and
                                    re.match(ver,a[0]))
                                    or (a[1] == defined_terms and#in [token.split('\t')[2] for token in defined_terms] and
                                    re.match(ver,b[0])) }

    
    # score them
    scored = [((w1,w2), score(w1, w2, wfd, pfd)) for w1, w2 in pfd]
    scored.sort(key=itemgetter(1), reverse=True)
   
    
    return scored#map(itemgetter(0), scored)
Code example #53
def filter_words(words):
    new_words = FreqDist(words)
    stopwords = get_stop_words('ar')
    # materialize the keys so entries can be removed while iterating
    keys = list(new_words.keys())

    for word in keys:
        # drop stopwords and very short tokens
        if word in stopwords or len(word) <= 2:
            new_words.pop(word)
            
    return new_words