Code example #1
	def get_summarized(self, inputContent, num_sentences ):
	
		base_words = [word.lower()
			for word in nltk.word_tokenize(inputContent)]
		words = [word for word in base_words if word not in stopwords.words()]
		word_frequencies = FreqDist(words)
		
		most_frequent_words = [pair[0] for pair in
			word_frequencies.items()]
		
		
		sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
		actual_sentences = sent_detector.tokenize(inputContent)
		working_sentences = [sentence.lower()
			for sentence in actual_sentences]

		
		output_sentences = []

		for word in most_frequent_words:
			for i in range(0, len(working_sentences)):
				if (word in working_sentences[i]
				  and actual_sentences[i] not in output_sentences):
					output_sentences.append(actual_sentences[i])
					break
				if len(output_sentences) >= num_sentences: break
			if len(output_sentences) >= num_sentences: break
			
		
		return output_sentences
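Note that in NLTK 3, FreqDist.items() behaves like a plain dict view and is not ordered by frequency, so most_frequent_words above is not actually sorted by count. A minimal, self-contained sketch (not from the example's project) of getting a genuinely frequency-ordered list with most_common():

from nltk.probability import FreqDist

word_frequencies = FreqDist(["spam", "spam", "eggs", "ham", "spam", "eggs"])
# most_common() returns (word, count) pairs sorted by decreasing count
most_frequent_words = [word for word, count in word_frequencies.most_common(2)]
print(most_frequent_words)  # ['spam', 'eggs']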
Code example #2
    def summarize(self, input, num_sentences):

        punt_list = [".", ",", "!", "?"]
        summ_sentences = []

        sentences = sent_tokenize(input)
        lowercase_sentences = [sentence.lower() for sentence in sentences]
        # print lowercase_sentences

        s = list(input)
        ts = "".join([o for o in s if not o in punt_list]).split()
        lowercase_words = [word.lower() for word in ts]
        words = [word for word in lowercase_words if word not in stopwords.words()]
        word_frequencies = FreqDist(words)

        most_frequent_words = [pair[0] for pair in word_frequencies.items()[:100]]

        # add sentences with the most frequent words
        for word in most_frequent_words:
            for i in range(0, len(lowercase_sentences)):
                if len(summ_sentences) < num_sentences:
                    if lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]:
                        summ_sentences.append(sentences[i])
                        break

            # reorder the selected sentences
        summ_sentences.sort(lambda s1, s2: input.find(s1) - input.find(s2))
        return " ".join(summ_sentences)
Code example #3
def get_negative_grams(filePath,n):
	l = list()
	# Open the file and read its contents
	with codecs.open(filePath,'r') as myfile:
		sentence=myfile.read()
		sentence=sentence.replace('points forts', ' ')
		sentence=sentence.replace('points faibles', ' ')
		sentence=sentence.replace('commentaires', ' ')

	n_grams = ngrams(sentence.split(), n)
	s=''
	for grams in n_grams:
		if('est pas' in grams or 'ai pas' in grams or 'pas' in grams or 'cher' in grams):
			s+=str(grams)+'\n'
			l.append(grams)

	'''fe = open('negative-'+str(n)+'-gram.txt', 'w')
	fe.write(s)
	fe.close()'''

	Dict = FreqDist(l)
	Dict = sorted(Dict.items(), key=operator.itemgetter(1), reverse=True)

	t=''
	for x in Dict:
		t+= '(\''+str(x[0])+'\' , ' +str(x[1])+')\n'

	fe = open('stats/Freq_negative-'+str(n)+'-gram.txt', 'w')
	fe.write(t)
	fe.close()
Code example #4
def getBestWords(posWords, negWords):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word.lower()] += 1
        label_word_fd["pos"][word.lower()] += 1

    for word in negWords:
        word_fd[word.lower()] += 1
        label_word_fd["neg"][word.lower()] += 1

    pos_word_count = label_word_fd["pos"].N()
    neg_word_count = label_word_fd["neg"].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
    sorted_x = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)
    bestwords = set([w for w, s in sorted_x])

    return bestwords
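The commented-out line above shows the intended top-10000 cut; as written, bestwords keeps every word, so the sort has no effect. A hedged sketch of applying that cutoff, reusing word_scores and operator from the example:

# keep only the highest-scoring words (10000 is the cutoff from the commented-out line)
best = sorted(word_scores.items(), key=operator.itemgetter(1), reverse=True)[:10000]
bestwords = set(w for w, s in best)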
Code example #5
def bysegment(db):
    dist = FreqDist()
    
    total = 0

    while db.hasNext():
        fragments = db.nextPwd()
        pwd = fragments[0].password
        
        for f in fragments: # iterate through fragments
            total += 1
            if total % 100000 == 0:
                print "{} segments processed...".format(total)
                
            if f.is_gap(): 
                dist.inc("gap")
            else:
                raw_word = pwd[f.s_index:f.e_index]

                if     raw_word.isupper():  dist.inc('upper')
                elif   raw_word.istitle():  dist.inc('capitalized')
                elif   raw_word.islower():  dist.inc('lower')
                else:                       dist.inc('mangled')
            
    for k, v in dist.items():
        print "{}\t{}".format(k, v)
Code example #6
    def train_MLT(self, tagged_train_data, untagged_training_data):
        """
        Builds a most likely tag tagger from the given tagged training data as WORDS
        :param train_data:
        :return: model
        """
        # find the set of words
        words = set()
        for sent in untagged_training_data:
            for word in sent:
                words.add(word)
        # Define mlt_dict of format {word1:{(word1,tag1):count1, (word1, tag2):count2 ........},..........}
        mlt_dict = dict()
        # Initialize keys and values to it
        for word in words:
            mlt_dict[word] = dict()
        # Compute the freq dist of tagged words
        tagged_words_fdist = FreqDist(tagged_train_data)

        for tagged_word, count in tagged_words_fdist.items():
            (mlt_dict[tagged_word[0]])[tagged_word] = count

        # Update the dict to contain the most likely tag for each word
        #for word, inside_dict in mlt_dict.items():
        #   max_val = max(inside_dict.values())
        #    inside_dict =
        print("Training is done!")
        return mlt_dict
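The commented-out block hints at collapsing mlt_dict to a single most likely tag per word. A hedged sketch of that reduction, assuming mlt_dict has the format described in the comment above:

# pick, for each word, the (word, tag) key with the highest count and keep its tag
most_likely_tag = {
    word: max(inside_dict, key=inside_dict.get)[1]
    for word, inside_dict in mlt_dict.items()
    if inside_dict
}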
Code example #7
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    pos = 0
    neg = 0
    for review in posids:
        pos += 1
        if (pos != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['pos'].update(token_helpers.tokenize_simple(word))
 
    for review in negids:
        neg += 1
        if (neg != cutoff):
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd['neg'].update(token_helpers.tokenize_simple(word))
    
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return bestwords
    
    """
Code example #8
File: summaly.py Project: aigeano/Summaly
def make_summary( text):
	sent = []
	stemmed = []
	tokens = word_tokenize(text)
	sent = sent_tokenize(text)
	for token in tokens:
		if token in stopwords.words('english'):
			tokens.remove(token)
	stemmer = PorterStemmer()

	for token in tokens:
	 	stemmed.append(stemmer.stem(token))
#freq(stemmed)
	for word in stemmed:
		word.lower()
	word_freq = FreqDist(stemmed)

	most_freq_words = [pair[0] for pair in word_freq.items()[:60]]

	working_sent = [sentence.lower() for sentence in sent]

	out_sent = []

	for word in most_freq_words:
		for i in range(0,len(working_sent)):
			if (word in working_sent[i] and sent[i] not in out_sent):
				out_sent.append(sent[i])
				break
			if len(out_sent) >= 5:
			 	break
		
		if len(out_sent) >= 5:
			break

	return reorder(out_sent,text)
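Two details above are easy to trip over: calling tokens.remove() while iterating over tokens skips elements, and the bare word.lower() call discards its result. A hedged sketch of the same pre-processing without those pitfalls (text is the input string, as in the example):

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer

stop = set(stopwords.words('english'))
tokens = [t for t in word_tokenize(text) if t not in stop]
stemmer = PorterStemmer()
stemmed = [stemmer.stem(t).lower() for t in tokens]
word_freq = FreqDist(stemmed)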
Code example #9
def create_word_scores():
    posWords = pickle.load(open('pos_review.pkl', 'rb'))
    negWords = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posWords))  # flatten the nested list into a single list of words
    negWords = list(itertools.chain(*negWords))  # same for the negative words

    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the positive and negative texts
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of word tokens in the positive texts
    neg_word_count = cond_word_fd['neg'].N()  # number of word tokens in the negative texts
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count),
                                               total_word_count)  # chi-square statistic for the positive class; other measures such as mutual information could be used here
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count),
                                               total_word_count)  # same for the negative class
        word_scores[word] = pos_score + neg_score  # a word's information score is its positive plus its negative chi-square statistic

    return word_scores  # maps each word to its information score
Code example #10
def create_word_bigram_scores():
    posdata = pickle.load(open('pos_review.pkl', 'rb'))
    negdata = pickle.load(open('neg_review.pkl', 'rb'))

    posWords = list(itertools.chain(*posdata))
    negWords = list(itertools.chain(*negdata))

    bigram_finder_pos = BigramCollocationFinder.from_words(posWords)
    bigram_finder_neg = BigramCollocationFinder.from_words(negWords)
    posBigrams = bigram_finder_pos.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = bigram_finder_neg.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams  # single words plus bigram collocations
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code example #11
File: naivesumm.py Project: benjbigot/BNN_WIN
def summarize(self, input, num_sentences ):
                s=[]
                punt_list=['.',',','!','?']
                summ_sentences = []
                sentences=input
                #sentences = sent_tokenize(input)
                lowercase_sentences =[sentence.lower() 
                        for sentence in sentences]
                #print lowercase_sentences
                saito=' '.join(sentences)
                s=input
                ts=''.join([ o for o in s if not o in  punt_list ]).split()
                lowercase_words=[word.lower() for word in ts]
                words = [word for word in lowercase_words if word not in stopwords.words()]
                word_frequencies = FreqDist(words)
                
                most_frequent_words = [pair[0] for pair in 
                        word_frequencies.items()[:100]]

                # add sentences with the most frequent words
                if(len(s) < num_sentences):
                    num_sentences=len(s)
                for word in most_frequent_words:
                        for i in range(len(lowercase_sentences)):
                            if len(summ_sentences) < num_sentences:
                                        if (lowercase_sentences[i] not in summ_sentences and word in lowercase_sentences[i]):
                                                summ_sentences.append(lowercase_sentences[i])
                            else:
                                break
                        if len(summ_sentences) >= num_sentences:
                             break  
                        
                # reorder the selected sentences
                summ_sentences.sort( lambda s1, s2: saito.find(s1) - saito.find(s2) )
                return summ_sentences
Code example #12
def create_word_bigram_scores(posWords, negWords, n = 5000):
    # (posWords,negWords) = readwordarr()
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))
    bigramfinder = BigramCollocationFinder.from_words(posWords)
    posbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    bigramfinder = BigramCollocationFinder.from_words(negWords)
    negbigrams = bigramfinder.nbest(BigramAssocMeasures.chi_sq, n)
    posWords = posWords + posbigrams
    negWords = negWords + negbigrams
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    for word in posWords:
        wordfd[word]+=1
        conditionwordfd['pos'][word]+=1
        
    for word in negWords:
        wordfd[word]+=1
        conditionwordfd['neg'][word]+=1
    
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
Code example #13
def get_words_frequency(string, top_values):
    """
    Gets the words frequency in a corpus
    :param string: corpus
    :param top_values: maximum of sorted values to return
    :return: list of frequencies of the word in there synset form
    """

    # import stop words from nltk corpus
    stop_words_en_nltk = list(stopwords.words('english'))

    # create additional stop words for puntuations and others
    stop_words_en_custom = ['.', ',', '\'', '!', '(', ')', ':', ';', '?', '--', '*', '[', ']', '``', str("''"),
                            '&', '\'ll', '\'ve', '\'s', '\'re', 'a', 'b', 'c',
                            'i', '\'i', 'this', 'n\'t', 'a', 'could', 'should', 'would', 'can', 'will', 'shall',
                            'there', 'it', 'also', 'in', 'the', 'many', 'by', 'an',
                            '1990s', 'the', '+', '-', '...', '=', '%', '#', '[hide]', '[edit]', '.jpg', '/',
                            'be.v.01', 'have.v.01', 'use.v.01', 'besides.r.02', 'analysis.n.01', 'categorization.n.03',
                            'vitamin_e.n.01', 'vitamin_c.n.01', 'include.v.01', 'such.s.01', 'many.a.01', 'order.n.01',
                            'episode.n.01', 'show.n.01', 'not.r.01', 'standard.n.01', 'survey.n.01', 'factor.n.01',
                            'first.a.01']
    until_number = 300
    stop_words_en_custom_numbers = []
    for value in [lambda i=i: i for i in range(until_number+1)]:
        stop_words_en_custom_numbers.append(str(value()))

    # add them together
    stop_words_en = stop_words_en_nltk + stop_words_en_custom + stop_words_en_custom_numbers

    words_list_tmp = word_tokenize(string.lower())
    words_list = []

    lemmatizer = WordNetLemmatizer()
    for word in nltk.pos_tag(words_list_tmp):
        tag = get_word_tag(word[1])
        if tag != '':
            try:
                synset_word = wordnet.synsets(lemmatizer.lemmatize(word[0], pos=tag), pos=tag)[0]
                words_list.append(synset_word.name())
            except:
                pass

    processed_word_list = [word for word in words_list if word not in stop_words_en]

    text_obj = nltk.Text(processed_word_list)

    fd = FreqDist(text_obj)

    result = list(fd.items())

    if top_values != 0:
        result.sort(key=lambda x: x[1], reverse=True)
        result = result[:top_values]
        return result

    else:
        return result
Code example #14
def probDist():

    ### files pointers to reading files
    f1 = open(os.path.join('allfiles', 'document01-finance.txt'), "r")
    f2 = open(os.path.join('allfiles', 'document02-finance.txt'), "r")
    f3 = open(os.path.join('allfiles', 'document03-finance.txt'), "r")
    f4 = open(os.path.join('allfiles', 'document04-ee.txt'), "r")
    f5 = open(os.path.join('allfiles', 'document05-ee.txt'), "r")
     
    ### read the file content
    line1 = f1.read()
    line2 = f2.read()
    line3 = f3.read()
    line4 = f4.read()
    line5 = f5.read()
    
    
    ### document01-finance.txt is the writer document and the other files
    ### are reader files, so we get the word list from the writer document
    words = line1.split()
    X_words = []
    
    ### create a dictionary to store the frequency of each term
    dict_x1 = {}
    
    ### use nltk to calculate the frequency of each word
    unigramWordList = FreqDist(words)
    datalen = len(unigramWordList) ### number of distinct words in the document
    
    for k,v in unigramWordList.items():
        #print k,v
        X_words.append(k)
        dict_x1[k] = (v/float(datalen))
        pd_x1.append(v/float(datalen))
    #print X_words
    #print dict_x1
    #print pd_x1
    
    ### create probability distribution of all files
    for word in X_words:
        pd_x2.append( line2.count(word)/float(datalen) )
        pd_x3.append( line3.count(word)/float(datalen) )
        pd_x4.append( line4.count(word)/float(datalen) )
        pd_x5.append( line5.count(word)/float(datalen) )
        
    #print pd_x2
    #print pd_x3
        
    ### calculate the total probability distribution across all five files
    line_S = line1+line2+line3+line4+line5
    #print line_S
    
    for word in X_words:
        s.append( line_S.count(word)/float(datalen) )
    
    print s
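Note that datalen above is len(unigramWordList), i.e. the number of distinct words, so the values in dict_x1 do not sum to 1. If a true relative-frequency distribution is wanted, FreqDist already exposes it; a hedged sketch reusing unigramWordList from the example:

# N() is the total token count, so these relative frequencies sum to 1
total_tokens = unigramWordList.N()
dict_x1 = {word: count / float(total_tokens) for word, count in unigramWordList.items()}
# equivalently: unigramWordList.freq(word) gives count / N() directly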
Code example #15
def opinion_tokens_Fr(liste):
	# Create the results folder in case it doesn't exist
	result = "stats"
	if not os.path.exists(result):
		os.mkdir(result,0777)

	i=0
	comments = ''
	while(i<len(liste)):
		comments+=liste[i]+'\n'
		i+=1

	comments=comments.lower()

	# Open the file and write the comments to it
	f = open('opinions.txt', 'w')
	f.write(comments)
	f.close()

	w=['"','→','–','’','»','«',',','.','[',']','|','{','}',':',';','!','?','(',')','_','-','=','/',
	' qui ',' cette ',' mais ',' ou ',' où ',' et ',' donc ',' or ',' ni ',' car ',' la ',' là ',' le ',
	' les ',' de ',' des ',' du ',' tout ',' tous ',' toutes ',' que ',' comme ',' si ',' quand ',' je ',
	' tu ',' il ',' elle ',' nous ',' vous ',' ils ',' elles ',' un ',' une ',' au ',' aux ',' dans ',' ce '
	,' se ',' ces ',' ses ',' on ',' en ',' leur ',' leurs ',' a ',' à ',' pour ',' par ',' sous ',' sur ']

	# Open the file and read its contents
	with codecs.open('opinions.txt','r') as myfile:
    	
		content=myfile.read()
		content=content.replace('points forts', ' ')
		content=content.replace('points faibles', ' ')
		content=content.replace('commentaires', ' ')
		
		# remove numeric forms
		content = ''.join([i for i in content if not i.isdigit()])
		while w:
			# remove conjuction, connectors, ...			
			content=content.replace(w.pop(0), ' ')

	content = content.split()

	tokenDict = FreqDist(content)
	tokenDict = sorted(tokenDict.items(), key=operator.itemgetter(1), reverse=True)

	s=''
	for x in tokenDict:
		s+= '(\''+x[0].decode('utf-8', 'ignore').encode('utf-8')+'\' , ' +str(x[1])+')\n'
	fe = open('stats/freq_tokens.txt', 'w')
	fe.write(s)
	fe.close()

	return tokenDict
Code example #16
File: __init__.py Project: organisciak/field-exam
def main():
	# Number of words to display
	count = 40

	# Open files as strings
	obama = open("obama.txt", "r").read()
	bush = open("bush.txt", "r").read()

	#Tokenize texts into words, then count frequencies for all words
	top_obama = FreqDist(word.lower() for word in word_tokenize(obama))
	top_bush = FreqDist(word.lower() for word in word_tokenize(bush))
	
	#Return top {count} most occurring words
	print "No stoplist".upper()
	print "Obama/2009\t".upper(), " ".join(item[0] for item in top_obama.items()[:count])
	print "Bush/2001\t".upper(), " ".join(item[0] for item in top_bush.items()[:count])

	#Return most occurring words that are not in the NLTK English stoplist
	print
	print "Stoplisted".upper()
	print "Obama/2009\t".upper(), " ".join([item[0] for item in top_obama.items() if not item[0] in stopwords.words('english')][:count])
	print "Bush/2001\t".upper(), " ".join([item[0] for item in top_bush.items() if not item[0] in stopwords.words('english')][:count])
Code example #17
def CheckSSNStats(FileName):
    SSNList = []
    f = open(FileName, 'r')
    reader = csv.reader(f, delimiter ='|')
    for line in reader:
        SSNList.append(line[3].strip())
    SSNStatsDict['SSNCount'] = len(SSNList)-1
    fdist = FreqDist(SSNList)
    frequencies = OrderedDict(sorted(fdist.items(), key = lambda x:x[1], reverse = True))
    SSNStatsDict['DistinctSSNCount'] = len(frequencies)
    for k, v in frequencies.items()[:10]:
        SSNFDDict[k]=v
    SSNStatsDict['FreqDist'] = SSNFDDict
    print "Check SSN Stats: Complete"
Code example #18
def CheckRIDStats(FileName):
    RIDList = []
    f = open(FileName, 'r')
    reader = csv.reader(f, delimiter ='|')
    for line in reader:
        RIDList.append(line[0].strip())
    RIDStatsDict['RIDCount'] = len(RIDList)-1
    fdist = FreqDist(RIDList)
    frequencies = OrderedDict(sorted(fdist.items(), key = lambda x:x[1], reverse = True))
    RIDStatsDict['DistinctRIDCount'] = len(frequencies)
    for k, v in frequencies.items()[:10]:
        RIDFDDict[k]=v
    RIDStatsDict['FreqDist'] = RIDFDDict    
    print "Check RID Stats: Complete"
Code example #19
def create_word_scores():
    posFile = directory + 'posWords.txt'
    negFile = directory + 'negWords.txt'
    if os.path.exists(posFile):
        posSentences = codecs.open(posFile, 'r', 'utf-8')
    else:
        print("posFile doesn't exist")
    if os.path.exists(negFile):
        negSentences = codecs.open(negFile, 'r', 'utf-8')
    else:
        print("negFile doesn't exist")

    # each line is a single comment
    posSentences = re.split(r'\n', posSentences.read())
    # print posSentences
    negSentences = re.split(r'\n', negSentences.read())

    posWords = []
    negWords = []
    for i in posSentences:
        posWord = re.findall(r"[\w']+|[.,!?;]", i)
        posWords.append(posWord)
    for i in negSentences:
        negWord = re.findall(r"[\w']+|[.,!?;]", i)
        negWords.append(negWord)
    posWords = list(itertools.chain(*posWords))
    negWords = list(itertools.chain(*negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
Code example #20
def create_word_scores():

    posWord_score = []
    negWord_score = []
##    low_posWord_score = []
##    low_negWord_score = []

    for i in short_pos.split('\n'):
        posWords = word_tokenize(i)
        posWord_score.append(posWords)

    for i in short_neg.split('\n'):
        negWords = word_tokenize(i)
        negWord_score.append(negWords)


        
    word_scores = {}

    posWord_score = list(itertools.chain(*posWord_score))
    negWord_score = list(itertools.chain(*negWord_score))

    # build a frequency distribution of all words, then frequency distributions of words within the positive and negative labels
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    
    for word in posWord_score:
        word_fd[word.lower()] += 1
        cond_word_fd["pos"][word.lower()] += 1
        
    for word in negWord_score:
        word_fd[word.lower()] += 1
        cond_word_fd["neg"][word.lower()] += 1

        
    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd["pos"].N()
    neg_word_count = cond_word_fd["neg"].N()
    
    total_word_count = pos_word_count + neg_word_count

    #Chi-Squared Informative Gain
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd["pos"][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd["neg"][word], (freq, neg_word_count), total_word_count)
        
        word_scores[word] = pos_score + neg_score
        
    return word_scores
Code example #21
File: detector.py Project: Steven-Eardley/tts
def simhash(words):
    fdist = FreqDist(words)
    v = [0]*32
    for (token, freq) in fdist.items():
        token_hash = [int(val) for val in word_dict[token]]
        for i in range(freq):
            for index in range(len(v)):
                if token_hash[index] == 1:
                    v[index] += 1
                else:
                    v[index] -= 1
    simhash = ['0']*32
    for j in range(len(v)):
        if v[j] > 0:
            simhash[j] = '1'
    return ''.join(simhash)
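simhash() above assumes a module-level word_dict that maps each token to a 32-character bit string; that mapping is not shown in this listing. A hedged sketch of one way to build such a dict (hashlib-based, not necessarily what the original project does):

import hashlib

def make_word_dict(words):
    """Map each distinct token to a 32-bit hash rendered as a '0'/'1' string."""
    word_dict = {}
    for w in set(words):
        h = int(hashlib.md5(w.encode("utf-8")).hexdigest(), 16) & 0xFFFFFFFF
        word_dict[w] = format(h, "032b")
    return word_dict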
Code example #22
    def get_summarized(self, input, num_sentences,mustinclude):
        # TODO: allow the caller to specify the tokenizer they want
        # TODO: allow the user to specify the sentence tokenizer they want

        tokenizer = RegexpTokenizer('\w+')

        # get the frequency of each word in the input
        base_words = [word.lower()
            for word in tokenizer.tokenize(input)]
        words = [word for word in base_words if word not in stopwords.words()]
        word_frequencies = FreqDist(words)

        # now create a set of the most frequent words
        most_frequent_words = [pair[0] for pair in word_frequencies.items()[:100]]

        # break the input up into sentences.  working_sentences is used
        # for the analysis, but actual_sentences is used in the results
        # so capitalization will be correct.

        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        actual_sentences = sent_detector.tokenize(input)
        working_sentences = [sentence.lower()
            for sentence in actual_sentences]

        mustinclude = mustinclude.lower()

        # iterate over the most frequent words, and add the first sentence
        # that includes each word to the result.
        output_sentences = []

        for word in most_frequent_words:
            for i in range(0, len(working_sentences)):
                if (mustinclude in working_sentences[i] and word in working_sentences[i] and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
                if len(output_sentences) >= num_sentences: break
            if len(output_sentences) >= num_sentences: break

        # If we came up empty just find a sentence with our word that must be included
        if len(output_sentences) == 0:
          for i in range(0, len(working_sentences)):
            if mustinclude in working_sentences[i]:
              output_sentences.append(actual_sentences[i])
              break

        # sort the output sentences back to their original order
        return self.reorder_sentences(output_sentences, input)
Code example #23
File: word_frequency.py Project: rajaselvan/sherlock
def freq_words(string):


	print "\n\n\n\t\t\tReading from file"

	#tokenize on white spaces
	raw_word_list=word_tokenize(string)

	#remove stop words
	processed_word_list=[word for word in raw_word_list if word not in total_stop_words]

	#create an nltk text object
	text_obj=nltk.Text(processed_word_list)

	print "\n\n\n\t\t\tProcessing"

	#Call the frequency distribution method and store the words and  corresponding frequencies in a dictionary
	fd=FreqDist(text_obj)

	
	# convert the dictionary to a list of tuples containing key-value pairs
	result=fd.items()
	

	#select the 100 most frequent words. If number of words in the result is less than 100, adjust accordingly
	if len(result) < 100:
		result_length=len(result)
		chosen_words=result[: result_length/2]
	else:
		chosen_words=result[:100]
	
	print "\n\n\n\t\t\tDrawing cloud"


	#specify the canvas measurement
	elements = wordcloud.fit_words(chosen_words, width=500, height=500)


	#draw the cloud
	wordcloud.draw(elements, path.join(d, 'frequent_words.png'), width=500, height=500,
        scale=2)


	print "\n\n\n\t\t\tWord cloud generated in frequent_words.png file"
	

	return
Code example #24
File: processComments.py Project: gabelula/commentIQ
def ComputeVocabulary():
    # Get the data
    csvFile = open("data/comments_study.csv", "Ur")
    csvReader = csv.reader(csvFile, delimiter=",", quotechar='"')
    comments = {}
    for row in csvReader:
        # don't read 1st line
        if csvReader.line_num > 1:
            comments[row[0]] = row

            # Compute Vocabulary and output it for later
    tokens = []
    n = 0
    nDocuments = len(comments)
    for c in comments:
        n = n + 1
        if n % 100 == 0:
            print n
        ct = CleanAndTokenize(comments[c][2].decode("utf8"))
        ct = [w for w in ct if w not in stopword_list]
        stemmed_tokens = [porter.stem(t) for t in ct]
        tokens.extend(stemmed_tokens)
        for t in stemmed_tokens:
            if t not in doc_frequency:
                doc_frequency[t] = 1
            else:
                doc_frequency[t] = doc_frequency[t] + 1

                # print tokens
    fd = FreqDist(tokens)

    # find cutoff
    unigram_cutoff = 0
    for (i, (key, val)) in enumerate(fd.items()):
        # print str(i) + " " + str(key) + " " + str(fd[key])
        if fd[key] < 10:
            unigram_cutoff = i - 1
            break
    print "unigram cutoff: " + str(unigram_cutoff)
    word_features.extend(fd.keys()[:unigram_cutoff])

    fileWriter = csv.writer(open("data/vocab.csv", "w+"), delimiter=",")
    for w in word_features:
        row = [w.encode("utf8"), doc_frequency[w]]
        fileWriter.writerow(row)
Code example #25
def create_word_scores(posWords,negWords, presense = False):
    # (posWords,negWords) = readwordarr()
    wordscores = {}
    wordfd = FreqDist()
    conditionwordfd = ConditionalFreqDist()
    if not presense:
        posWords = list(itertools.chain(*posWords))
        negWords = list(itertools.chain(*negWords))
        
        
        for word in posWords:
            wordfd[word]+=1
            conditionwordfd['pos'][word]+=1
            
        for word in negWords:
            wordfd[word]+=1
            conditionwordfd['neg'][word]+=1
    
    else:
        for wordarr in posWords:
            flag = dict()
            for word in wordarr:
                if word in flag:
                    continue
                flag[word]=1
                wordfd[word]+=1
                conditionwordfd['pos'][word]+=1
        for wordarr in negWords:
            flag = dict()
            for word in wordarr:
                if word in flag:
                    continue
                flag[word]=1
                wordfd[word]+=1
                conditionwordfd['neg'][word]+=1
                
    pos_word_count = conditionwordfd['pos'].N()
    neg_word_count = conditionwordfd['neg'].N()
    totalcount = pos_word_count + neg_word_count
    for word,freq in wordfd.items():
        pos_score = BigramAssocMeasures.chi_sq(conditionwordfd['pos'][word], (freq, pos_word_count), totalcount)
        neg_score = BigramAssocMeasures.chi_sq(conditionwordfd['neg'][word], (freq, neg_word_count), totalcount)
        wordscores[word] = pos_score + neg_score
    return wordscores
Code example #26
def bypassword(db):
    
    dist = FreqDist()
    
    total = 0

#     regex_word_capitalized = r'^[A-Z][a-z]*'
    regex_pwd_capitalized = r'^[A-Z][^A-Z]*$'
    
    while db.hasNext():
        fragments = db.nextPwd()
        pwd = fragments[0].password
        
        total += 1
        
        if total % 100000 == 0:
            print "{} passwords processed...".format(total)
        
        pattern = None
        
        if all([f.is_gap() for f in fragments]): 
            pattern = 'gap'
        elif re.match(regex_pwd_capitalized, pwd):
            pattern = 'title'
        else:
            bag = set()
        
            for f in fragments:
                if f.is_gap(): continue
                
                raw_word = pwd[f.s_index:f.e_index]
                
                if     raw_word.isupper():  bag.add('upper')
                elif   raw_word.istitle():  bag.add('captlzd')
                elif   raw_word.islower():  bag.add('lower')
                else:                     bag.add('mangled')
                
            pattern = ', '.join(sorted(bag))
            if pattern == 'captlzd, upper': print pwd

        dist.inc(pattern)    
    
    for k, v in dist.items():
        print "{}\t{}".format(k, v)
Code example #27
File: Zummarizer.py Project: JDOUBLE-U/Zummarizer
def get_summarized(inputt, num_sentences):
    # A tokenizer splits a string using a regular expression, which
    # matches either the tokens or the separators between tokens.
    tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+')

    # get the frequency of each word in the input
    base_words = [word.lower()
                  for word in tokenizer.tokenize(inputt)]
    words = [word for word in base_words if
             word not in open("Stopwords/dutch")]
    word_frequencies = FreqDist(words)

    # now create a set of the most frequent words
    most_frequent_words = [pair[0] for pair in
                           word_frequencies.items()[:100]]

    # break the input up into sentences. working_sentences is used
    # for the analysis, but actual_sentences is used in the results
    # so capitalization will be correct.
    sent_detector = pickle.load(open("Pickles/dutch.pickle", "rb"))
    actual_sentences = sent_detector.tokenize(inputt)
    working_sentences = [sentence.lower()
                         for sentence in actual_sentences]

    # iterate over the most frequent words, and add the first sentence
    # that includes each word to the result.
    output_sentences = []

    for word in most_frequent_words:
        for i in range(0, len(working_sentences)):
            if (word in working_sentences[i] and actual_sentences[i] not in
                output_sentences):
                output_sentences.append(actual_sentences[i])
                break

            if len(output_sentences) >= num_sentences:
                break

        if len(output_sentences) >= num_sentences:
            break

    # sort the output sentences back to their original order
    return reorder_sentences(output_sentences, inputt)
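The filter word not in open("Stopwords/dutch") above compares each token against raw file lines (newline included) and reopens the file for every word, so it removes almost nothing. A hedged sketch of the intended stopword filter, assuming the file holds one stopword per line:

with open("Stopwords/dutch", encoding="utf-8") as handle:
    dutch_stopwords = set(line.strip() for line in handle)
words = [word for word in base_words if word not in dutch_stopwords]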
Code example #28
File: processComments.py Project: intuinno/commentIQ
def ComputeVocabulary(comment_filename, vocab_filename):
	# Get the data
	csvFile = open(comment_filename, 'Ur')
	csvReader = csv.reader(csvFile, delimiter=',', quotechar='"')
	comments  = {}
	for row in csvReader:
		# don't read 1st line
		if csvReader.line_num > 1:			
			comments[row[0]] = row

	# Compute Vocabulary and output it for later
	tokens = []
	n = 0
	nDocuments = len(comments)
	for c in comments:
		n = n + 1
		if n % 100 == 0 :
			print "vocabulary : " + str(n)
		ct = CleanAndTokenize(comments[c][2].decode("utf8"))
		ct = [w for w in ct if w not in stopword_list]
		stemmed_tokens = [porter.stem(t) for t in ct]
		tokens.extend(stemmed_tokens)
		for t in stemmed_tokens:
			if t not in doc_frequency:
				doc_frequency[t] = 1
			else:
				doc_frequency[t] = doc_frequency[t]+1

	#print tokens
	fd = FreqDist(tokens)

	# find cutoff

	for (i, (key, val)) in enumerate(fd.items()):
		# print str(i) + " " + str(key) + " " + str(fd[key])
		if fd[key] >= 10:
			word_features.append(key)

	fileWriter = csv.writer(open(vocab_filename, "w+"),delimiter=",")
	for w in word_features:
		row = [w.encode("utf8"), doc_frequency[w]]
		fileWriter.writerow(row)
Code example #29
File: lib.py Project: zacharydenton/autoblog
    def summarize(self, input, num_sentences ):
        # TODO: allow the caller to specify the tokenizer they want
        # TODO: allow the user to specify the sentence tokenizer they want

        tokenizer = RegexpTokenizer('\w+')
        
        # get the frequency of each word in the input
        base_words = [word.lower().encode('utf-8')
            for word in tokenizer.tokenize(input)]
        words = [word for word in base_words if word not in stopwords.words()]
        word_frequencies = FreqDist(words)
        
        # now create a set of the most frequent words
        most_frequent_words = [pair[0] for pair in 
            word_frequencies.items()[:100]]

        # break the input up into sentences.  working_sentences is used 
        # for the analysis, but actual_sentences is used in the results
        # so capitalization will be correct.
        
        actual_sentences = nltk.tokenize.sent_tokenize(input)
        working_sentences = [sentence.lower() 
            for sentence in actual_sentences]

        # iterate over the most frequent words, and add the first sentence
        # that includes each word to the result.
        output_sentences = []

        for word in most_frequent_words:
            for i in range(0, len(working_sentences)):
                if (word in working_sentences[i] 
                  and actual_sentences[i] not in output_sentences):
                    output_sentences.append(actual_sentences[i])
                    break
                if len(output_sentences) >= num_sentences: break
            if len(output_sentences) >= num_sentences: break
            
        # sort the output sentences back to their original order
        output_sentences = self.reorder_sentences(output_sentences, input)

        # concatenate the sentences into a single string
        return ' '.join(output_sentences)
Code example #30
    def load_data(self, data_set):
        """
        Loads the given data set. Makes the data set case insensitive.
        Remove words that appear less than 5 times.
        :return updated data set
        """
        print("Started Loading the Data")
        tagged_tokens = data_set.tagged_words()
        tokens = untag(tagged_tokens)

        # Get the list of words that appear less than 5 times in the corpus
        print("Get LT5's")
        tokens = [token.lower() for token in tokens] # Convert to lower case
        freq_dist = FreqDist(tokens) # Compute the freq dist
        tokens_lt_5 = [word for word, count in freq_dist.items() if count < 5]

        # Delete words less than 5 and make the corpus insensitive
        print("Making data case insensitive")
        token_range = range(len(tagged_tokens))
        indexed_tokens = OrderedDict(zip(token_range,tagged_tokens))
        updated_tagged_tokens = OrderedDict()
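        # iterate over a snapshot of the items, since entries are deleted from indexed_tokens inside the loop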
        for tagged_token_id, tagged_token in list(indexed_tokens.items()):
            if tagged_token[0].lower() in tokens_lt_5:
                del indexed_tokens[tagged_token_id]
            else:
                temp = list()
                temp.append(tagged_token[0].lower())
                temp.append(tagged_token[1])
                temp = tuple(temp)
                updated_tagged_tokens[tagged_token_id] = temp
        tagged_tokens = list(updated_tagged_tokens.values())

        # Pickle the data for future purpose
        print("Pickling the Updated Corpus")
        if data_set == brown:
            file_name = "q5_brown_updated.pkl"
        else:
            file_name = "q5_treebank_updated.pkl"
        pkl.dump((tagged_tokens, tokens_lt_5), open(file_name,'wb'))

        return tagged_tokens, tokens_lt_5
Code example #31
def update_word_freqeuncy(soup: BeautifulSoup, url, report):
    #tokenize the content
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z.\'\-_]+")  #define a tokenizer
    url_context = ""
    for string in soup.stripped_strings:
        url_context = url_context + string 
    word_list = tokenizer.tokenize(url_context)
    low_information_flag = False
    longest = len(word_list) # longest-page update by looking at len(word_list)
    if longest <= 10: #if word numbers <= 10, it is low information
        low_information_flag = True
    
    #move stop words out
    stop_words_list = ['a', 'able', 'about', 'above', 'abst', 'accordance', 'according', 'accordingly', 'across', 'act', 'actually', 'added', 'adj', 'affected', 'affecting', 'affects', 'after', 'afterwards', 'again', 'against', 'ah', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'announce', 'another', 'any', 'anybody', 'anyhow', 'anymore', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apparently', 'approximately', 'are', 'aren', 'arent', 'arise', 'around', 'as', 'aside', 'ask', 'asking', 'at', 'auth', 'available', 'away', 'awfully', 'b', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'begin', 'beginning', 'beginnings', 'begins', 'behind', 'being', 'believe', 'below', 'beside', 'besides', 'between', 'beyond', 'biol', 'both', 'brief', 'briefly', 'but', 'by', 'c', 'ca', 'came', 'can', 'cannot', "can't", 'cause', 'causes', 'certain', 'certainly', 'co', 'com', 'come', 'comes', 'contain', 'containing', 'contains', 'could', 'couldnt', 'd', 'date', 'did', "didn't", 'different', 'do', 'does', "doesn't", 'doing', 'done', "don't", 'down', 'downwards', 'due', 'during', 'e', 'each', 'ed', 'edu', 'effect', 'eg', 'eight', 'eighty', 'either', 'else', 'elsewhere', 'end', 'ending', 'enough', 'especially', 'et', 'et-al', 'etc', 'even', 'ever', 'every', 'everybody', 'everyone', 'everything', 'everywhere', 'ex', 'except', 'f', 'far', 'few', 'ff', 'fifth', 'first', 'five', 'fix', 'followed', 'following', 'follows', 'for', 'former', 'formerly', 'forth', 'found', 'four', 'from', 'further', 'furthermore', 'g', 'gave', 'get', 'gets', 'getting', 'give', 'given', 'gives', 'giving', 'go', 'goes', 'gone', 'got', 'gotten', 'h', 'had', 'happens', 'hardly', 'has', "hasn't", 'have', "haven't", 'having', 'he', 'hed', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'heres', 'hereupon', 'hers', 'herself', 'hes', 'hi', 'hid', 'him', 'himself', 'his', 'hither', 'home', 'how', 'howbeit', 'however', 'hundred', 'i', 'id', 'ie', 'if', "i'll", 'im', 'immediate', 'immediately', 'importance', 'important', 'in', 'inc', 'indeed', 'index', 'information', 'instead', 'into', 'invention', 'inward', 'is', "isn't", 'it', 'itd', "it'll", 'its', 'itself', "i've", 'j', 'just', 'k', 'keep\tkeeps', 'kept', 'kg', 'km', 'know', 'known', 'knows', 'l', 'largely', 'last', 'lately', 'later', 'latter', 'latterly', 'least', 'less', 'lest', 'let', 'lets', 'like', 'liked', 'likely', 'line', 'little', "'ll", 'look', 'looking', 'looks', 'ltd', 'm', 'made', 'mainly', 'make', 'makes', 'many', 'may', 'maybe', 'me', 'mean', 'means', 'meantime', 'meanwhile', 'merely', 'mg', 'might', 'million', 'miss', 'ml', 'more', 'moreover', 'most', 'mostly', 'mr', 'mrs', 'much', 'mug', 'must', 'my', 'myself', 'n', 'na', 'name', 'namely', 'nay', 'nd', 'near', 'nearly', 'necessarily', 'necessary', 'need', 'needs', 'neither', 'never', 'nevertheless', 'new', 'next', 'nine', 'ninety', 'no', 'nobody', 'non', 'none', 'nonetheless', 'noone', 'nor', 'normally', 'nos', 'not', 'noted', 'nothing', 'now', 'nowhere', 'o', 'obtain', 'obtained', 'obviously', 'of', 'off', 'often', 'oh', 'ok', 'okay', 'old', 'omitted', 'on', 'once', 'one', 'ones', 'only', 'onto', 'or', 'ord', 'other', 'others', 'otherwise', 'ought', 'our', 'ours', 'ourselves', 'out', 'outside', 'over', 'overall', 'owing', 'own', 'p', 'page', 'pages', 'part', 'particular', 'particularly', 'past', 'per', 'perhaps', 'placed', 'please', 'plus', 'poorly', 'possible', 'possibly', 'potentially', 
'pp', 'predominantly', 'present', 'previously', 'primarily', 'probably', 'promptly', 'proud', 'provides', 'put', 'q', 'que', 'quickly', 'quite', 'qv', 'r', 'ran', 'rather', 'rd', 're', 'readily', 'really', 'recent', 'recently', 'ref', 'refs', 'regarding', 'regardless', 'regards', 'related', 'relatively', 'research', 'respectively', 'resulted', 'resulting', 'results', 'right', 'run', 's', 'said', 'same', 'saw', 'say', 'saying', 'says', 'sec', 'section', 'see', 'seeing', 'seem', 'seemed', 'seeming', 'seems', 'seen', 'self', 'selves', 'sent', 'seven', 'several', 'shall', 'she', 'shed', "she'll", 'shes', 'should', "shouldn't", 'show', 'showed', 'shown', 'showns', 'shows', 'significant', 'significantly', 'similar', 'similarly', 'since', 'six', 'slightly', 'so', 'some', 'somebody', 'somehow', 'someone', 'somethan', 'something', 'sometime', 'sometimes', 'somewhat', 'somewhere', 'soon', 'sorry', 'specifically', 'specified', 'specify', 'specifying', 'still', 'stop', 'strongly', 'sub', 'substantially', 'successfully', 'such', 'sufficiently', 'suggest', 'sup', 'sure\tt', 'take', 'taken', 'taking', 'tell', 'tends', 'th', 'than', 'thank', 'thanks', 'thanx', 'that', "that'll", 'thats', "that've", 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'thered', 'therefore', 'therein', "there'll", 'thereof', 'therere', 'theres', 'thereto', 'thereupon', "there've", 'these', 'they', 'theyd', "they'll", 'theyre', "they've", 'think', 'this', 'those', 'thou', 'though', 'thoughh', 'thousand', 'throug', 'through', 'throughout', 'thru', 'thus', 'til', 'tip', 'to', 'together', 'too', 'took', 'toward', 'towards', 'tried', 'tries', 'truly', 'try', 'trying', 'ts', 'twice', 'two', 'u', 'un', 'under', 'unfortunately', 'unless', 'unlike', 'unlikely', 'until', 'unto', 'up', 'upon', 'ups', 'us', 'use', 'used', 'useful', 'usefully', 'usefulness', 'uses', 'using', 'usually', 'v', 'value', 'various', "'ve", 'very', 'via', 'viz', 'vol', 'vols', 'vs', 'w', 'want', 'wants', 'was', 'wasnt', 'way', 'we', 'wed', 'welcome', "we'll", 'went', 'were', 'werent', "we've", 'what', 'whatever', "what'll", 'whats', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'wheres', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whim', 'whither', 'who', 'whod', 'whoever', 'whole', "who'll", 'whom', 'whomever', 'whos', 'whose', 'why', 'widely', 'willing', 'wish', 'with', 'within', 'without', 'wont', 'words', 'world', 'would', 'wouldnt', 'www', 'x', 'y', 'yes', 'yet', 'you', 'youd', "you'll", 'your', 'youre', 'yours', 'yourself', 'yourselves', "you've", 'z', 'zero']
    fdic = FreqDist(word_list)  
    for stop_word in stop_words_list:
        if stop_word in fdic.keys():
            del fdic[stop_word]
    
    if len(fdic.keys()) <= 10: #if word in word frequency dict <= 10, it is low information
        low_information_flag = True
    
    #update longest page to json file
    if "----------Longest_Num----------" not in report:
        report["----------Longest_Num----------"] = 0
        report["----------Longest_Url----------"] = []
    if longest > report["----------Longest_Num----------"]:
        report["----------Longest_Num----------"] = longest
        report["----------Longest_Url----------"] = url
            
    if not low_information_flag: # if it has high information value, update word frequency to json file
        for key, value in fdic.items():
            if key in report.keys():
                report[key] += fdic[key]
            else:
                report[key] = fdic[key]
Code example #32
def create_word_scores(posWords,negWords,objWords):

    word_fd = FreqDist() # frequency distribution over all words
    print(type(word_fd))
    cond_word_fd = ConditionalFreqDist() # word frequencies conditioned on each sentiment class
    for word in posWords:
        #word_fd.inc(word)
        word_fd[word] += 1
        #cond_word_fd['pos'].inc(word)
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        #word_fd.inc(word)
        word_fd[word] += 1
        #cond_word_fd['neg'].inc(word)
        cond_word_fd['neg'][word] += 1
    for word in objWords:
        #word_fd.inc(word)
        word_fd[word] += 1
        #cond_word_fd['neg'].inc(word)
        cond_word_fd['obj'][word] += 1

    pos_word_count = cond_word_fd['pos'].N() # number of word tokens in the positive texts
    neg_word_count = cond_word_fd['neg'].N() # number of word tokens in the negative texts
    obj_word_count = cond_word_fd['obj'].N() # number of word tokens in the objective (neutral) texts
    total_word_count = pos_word_count + neg_word_count + obj_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        # chi-square statistic for each class; other measures such as mutual information could be used here
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count) 
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count) 
        obj_score = BigramAssocMeasures.chi_sq(cond_word_fd['obj'][word], (freq, obj_word_count), total_word_count) 
        # a word's information score is the sum of its per-class chi-square statistics
        word_scores[word] = pos_score + neg_score + obj_score

    return word_scores # maps each word to its information score
Code example #33
"""
Throwaway script that processes the olac classification results on June 29th
to see how many iso codes were identified for each record and for how many
records were that number of iso codes identified.
"""
from operator import itemgetter
from nltk.probability import FreqDist

fd = FreqDist()
results_file = open('olac_iso_identification_results').readlines()
# this took like 15 minutes to get.
# not too bad, considering it's like all of olac. 
num_records = len(results_file)+0.0
for line in results_file:
    record = line.strip().split('\t')
    iso_list = record[-1].split()
    fd.inc(len(iso_list))

print "num\tfreq\tpercentage of records"
for num, freq in sorted(fd.items(), key=itemgetter(1), reverse=True):
    print str(num)+'\t'+str(freq)+'\t'+str(freq/num_records)
print ''
print 'Number of records: '+str(num_records)
Code example #34
    "new", "called", "said", "come", "two", "city", "group", "state", "year",
    "case", "member", "even", "later", "month", "years", "much", "week",
    "county", "name", "example"
    "well", "members", "us", "say", "s"
}
stopwords.update(commonwords)

# tokenize and calculate the word frequencies
tokens = nltk.tokenize.word_tokenize(txt)
fDist = FreqDist(tokens)
# print(fDist.most_common(20))

# remove the stop words and common words
filtered_fDist = nltk.FreqDist(
    dict(
        (word, freq) for word, freq in fDist.items() if word not in stopwords))

# print(words)
# words.remove("example")
# words.remove("told")
# words.remove("become")
# words.remove("well")
# words.remove("may")
# words.remove("june")
# words.remove("homosexuals")

print('loading model...')
model = Word2Vec.load("assets/gay-seattle.w2v")
g = nx.DiGraph()
items = filtered_fDist.most_common(50)
for item in items:
Code example #35
 def save_codex_hist_info(self, codex_type, codex_id, constraint=None):
     """Сохранение частотности слов во всем корпусе"""
     raw_articles_info = self.parser.sorted_articles_info[codex_type]
     articles_tokens = list()
     for article_info in tqdm(raw_articles_info):
         text = self.parser.get_article_text_by_id(article_info.id)
         text = text.lower()
         text = self.remove_chars_from_text(text, self.spec_chars)
         article_tokens = word_tokenize(' '.join(
             self.mystem.lemmatize(text)))
         for stop_word in self.stop_words:
             while stop_word in article_tokens:
                 article_tokens.remove(stop_word)
         articles_tokens.extend(article_tokens)
     text = Text(articles_tokens)
     f_dist = FreqDist(text)
     if not constraint:
         if os.path.exists(
                 generate_file_name_with_postfix(
                     self.config['articles_frequency_info_file'],
                     str(codex_id))):
             os.remove(
                 generate_file_name_with_postfix(
                     self.config['articles_frequency_info_file'],
                     str(codex_id)))
         with open(generate_file_name_with_postfix(
                 self.config['articles_frequency_info_file'],
                 str(codex_id)),
                   mode='w') as articles_frequency_info_file:
             articles_frequency_info_writer = csv.writer(
                 articles_frequency_info_file,
                 delimiter=',',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)
             articles_frequency_info_writer.writerow(['word', 'frequency'])
             for frequency_info in f_dist.most_common(100):
                 articles_frequency_info_writer.writerow([
                     frequency_info[0],
                     frequency_info[1] / len(articles_tokens)
                 ])
     else:
         if os.path.exists(
                 generate_file_name_with_postfix(
                     self.
                     config['articles_frequency_info_file_with_constraint'],
                     str(codex_id))):
             os.remove(
                 generate_file_name_with_postfix(
                     self.
                     config['articles_frequency_info_file_with_constraint'],
                     str(codex_id)))
         with open(generate_file_name_with_postfix(
                 self.
                 config['articles_frequency_info_file_with_constraint'],
                 str(codex_id)),
                   mode='w') as articles_frequency_info_file:
             articles_frequency_info_writer = csv.writer(
                 articles_frequency_info_file,
                 delimiter=',',
                 quotechar='"',
                 quoting=csv.QUOTE_MINIMAL)
             articles_frequency_info_writer.writerow(['word', 'frequency'])
             f_dist = list(
                 filter(lambda item: item[1] > constraint, f_dist.items()))
             for frequency_info in f_dist:
                 articles_frequency_info_writer.writerow([
                     frequency_info[0],
                     frequency_info[1] / len(articles_tokens)
                 ])
Code example #36

print(lexical_diversity(text3))
print(lexical_diversity(text5))
print(percentage(4, 5))
print(percentage(text4.count('a'), len(text4)))
# %%
fdist1 = FreqDist(text1)
fdist1
vocabulary1 = fdist1.keys()
print(vocabulary1)
print(fdist1['whale'])

# %%
fdist1.plot(50, cumulative=True)

# %%
list(fdist1.items())[0:5]

# %%
fdist1.freq('monstrous')

# %%
# Total number of samples
fdist1.N()

# %%
fdist1

# %%
pos_dist = FreqDist(pos_list)
pos_dist.plot(title="Parts of Speech")
for pos, frequency in pos_dist.most_common(pos_dist.N()):
    print('{:<15s}:{:>4d}'.format(pos, frequency))

# Removing stop words
stop = stopwords.words('english') + list(string.punctuation)
stop_tokens = [word for word in tagged_tokens if word[0] not in stop]
# Removing single character words and simple punctuation
stop_tokens = [word for word in stop_tokens if len(word) > 1]
# Removing numbers and possessive "'s"
stop_tokens = [word for word in stop_tokens \
if (not word[0].replace('.','',1).isnumeric()) and \
word[0]!="'s" ]
token_dist = FreqDist(stop_tokens)
print("\nCorpus contains", len(token_dist.items()), \
" unique terms after removing stop words.\n")
for word, frequency in token_dist.most_common(20):
    print('{:<15s}:{:>4d}'.format(word[0], frequency))

# Lemmatization - Stemming with POS
# WordNet Lematization Stems using POS
stemmer = SnowballStemmer("english")
wn_tags = {'N': wn.NOUN, 'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV}
wnl = WordNetLemmatizer()
stemmed_tokens = []
for token in stop_tokens:
    term = token[0]
    pos = token[1]
    pos = pos[0]
    try:
Code example #38
File: sent.py Project: zxlmufc/chinese_nlp
def cal_word_count():
    global train_word_id
    global pos_info
    global neg_info
    pos_info = []
    neg_info = []
    train_word_id = []

    word_fd = FreqDist()  # frequency distribution over all words
    cond_word_fd = ConditionalFreqDist()  # word frequencies conditioned on the positive and negative texts

    print('Loading POS>>>')
    line_num = 0
    with open(pos_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000: print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['pos'][item_id] += 1
                tmp_col.append(item_id)
            pos_info.append(tmp_col)

    print('Loading NEG>>>')
    line_num = 0
    with open(neg_file, 'r') as fin:
        for line in fin:
            line_num += 1
            if not line_num % 10000: print('LINE:%d' % (line_num))
            items = line.split()
            tmp_col = []
            for item in items:
                item_id = term_to_id(item)
                word_fd[item_id] += 1
                cond_word_fd['neg'][item_id] += 1
                tmp_col.append(item_id)
            neg_info.append(tmp_col)

    print('Randomize>>>')
    shuffle(pos_info)
    shuffle(neg_info)

    pos_w_count = cond_word_fd['pos'].N()
    neg_w_count = cond_word_fd['neg'].N()
    total_w_count = pos_w_count + neg_w_count
    #print('pos_w_count=%d, neg_w_count=%d, total_w_count=%d'%(pos_w_count, neg_w_count, total_w_count))
    #print('word_fd_count=%d'%(word_fd.N()))

    # compute the chi-square statistic for each word
    global word_scores
    word_scores = {}

    print("CALC CHI-SQUARE...")
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_w_count),
            total_w_count)  # chi-square score for the positive class; other measures such as mutual information could also be used here
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word],
                                               (freq, neg_w_count),
                                               total_w_count)  # likewise for the negative class
        word_scores[word] = pos_score + neg_score  # a word's information score is the sum of its positive and negative chi-square scores

    del word_fd
    del cond_word_fd

    return
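The chi-square scores collected in word_scores are usually the input to a feature-selection step. A minimal follow-up sketch (not part of the original sent.py), assuming cal_word_count() has already been run so the global word_scores is populated; top_n is an arbitrary choice:

def best_word_ids(top_n=2000):
    # rank word ids by their combined chi-square score, highest first
    ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    # keep only the top_n most informative ids as the feature vocabulary
    return set(word_id for word_id, score in ranked[:top_n])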
コード例 #39
0
    def train_classifier(self,
                         dataset,
                         feature_fn_name='word',
                         train_ratio=0.8,
                         verbose=False,
                         token_column='text',
                         target_column='category',
                         best_ratio=0.8,
                         pos_target_val=1,
                         neg_target_val=-1):
        def word_feats(words):
            return dict([(word, True) for word in words])

        def best_word_feats(words):
            return dict([(word, True) for word in words if word in bestwords])

        def best_bigram_word_feats(words,
                                   score_fn=BigramAssocMeasures.chi_sq,
                                   n=200):
            bigram_finder = BigramCollocationFinder.from_words(words)
            bigrams = bigram_finder.nbest(score_fn, n)
            d = dict([(bigram, True) for bigram in bigrams])
            d.update(best_word_feats(words))
            return d

        def best_trigram_word_feats(words,
                                    score_fn=TrigramAssocMeasures.chi_sq,
                                    n=200):
            tcf = TrigramCollocationFinder.from_words(words)
            trigrams = tcf.nbest(score_fn, n)
            d = dict([(trigram, True) for trigram in trigrams])
            d.update(best_bigram_word_feats(words))
            d.update(best_word_feats(words))
            return d

        if verbose:
            print(
                '\nSelected feature function: {}, token column: {}, train ratio: {}'
                .format(feature_fn_name, token_column, train_ratio))
        df = dataset.sample(frac=1).reset_index(drop=True)
        negids = df[df[target_column] == neg_target_val].index
        posids = df[df[target_column] == pos_target_val].index
        feats = df[token_column]

        if feature_fn_name in ['best_word', 'best_bigram', 'best_trigram']:
            word_fd = FreqDist()
            label_word_fd = ConditionalFreqDist()
            for tokens in df[df[target_column] ==
                             pos_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._positive_label][word] += 1

            for tokens in df[df[target_column] ==
                             neg_target_val][token_column]:
                for word in tokens.split():
                    word_fd[word] += 1
                    label_word_fd[self._negative_label][word] += 1

            pos_word_count = label_word_fd[self._positive_label].N()
            neg_word_count = label_word_fd[self._negative_label].N()
            total_word_count = pos_word_count + neg_word_count
            word_scores = {}
            for word, freq in word_fd.items():
                pos_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._positive_label][word],
                    (freq, pos_word_count), total_word_count)
                neg_score = BigramAssocMeasures.chi_sq(
                    label_word_fd[self._negative_label][word],
                    (freq, neg_word_count), total_word_count)
                word_scores[word] = pos_score + neg_score

            best_cnt = int(len(word_scores) * best_ratio)
            best = sorted(word_scores.items(),
                          key=lambda item: item[1],
                          reverse=True)[:best_cnt]
            bestwords = set([w for w, s in best])
            if feature_fn_name == 'best_trigram':
                feat_fn = best_trigram_word_feats
            elif feature_fn_name == 'best_bigram':
                feat_fn = best_bigram_word_feats
            else:
                feat_fn = best_word_feats

        else:
            feat_fn = word_feats

        negfeats = [(feat_fn(feats[i].split()), self._negative_label)
                    for i in negids]
        posfeats = [(feat_fn(feats[i].split()), self._positive_label)
                    for i in posids]
        if verbose:
            print('No. of samples: {}, Pos: {}, Neg: {}'.format(
                len(feats), len(posfeats), len(negfeats)))

        negcutoff = int(len(negfeats) * train_ratio)
        poscutoff = int(len(posfeats) * train_ratio)

        trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
        testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

        classifier = NaiveBayesClassifier.train(trainfeats)
        refsets = defaultdict(set)
        testsets = defaultdict(set)

        for i, (feats, label) in enumerate(testfeats):
            refsets[label].add(i)
            observed = classifier.classify(feats)
            testsets[observed].add(i)

        metrics = {
            'Accuracy':
            nltk.classify.util.accuracy(classifier, testfeats),
            'Pos precision':
            precision(refsets[self._positive_label],
                      testsets[self._positive_label]),
            'Pos recall':
            recall(refsets[self._positive_label],
                   testsets[self._positive_label]),
            'Neg precision':
            precision(refsets[self._negative_label],
                      testsets[self._negative_label]),
            'Neg recall':
            recall(refsets[self._negative_label],
                   testsets[self._negative_label])
        }
        if verbose:
            print(metrics)

        return classifier, metrics
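A hypothetical call to this method, assuming sa is an instance of the enclosing class (not shown here) and df is a pandas DataFrame with enough rows per class, a 'text' column of space-joined tokens, and a 'category' column holding 1/-1 labels:

classifier, metrics = sa.train_classifier(
    df,
    feature_fn_name='best_bigram',  # chi-square-filtered words plus the top 200 bigrams
    train_ratio=0.8,
    verbose=True,
    token_column='text',
    target_column='category')
print('held-out accuracy:', metrics['Accuracy'])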
コード例 #40
0
    def setKeywords(self,method='tfidfNoPro',wordCount=10,startCount=0):
        '''
        function to automatically assign keywords if manual ones have not been assigned
        
        Inputs
        ======
        method: string
            Method used to pick automatically defined keywords. Choose from:
            adjAdv - picks the most common adjectives and adverbs in the text
            tfidf - ranks words by tf-idf weight
            tfidfNoPro - tf-idf ranking with pronouns dropped (default)
            manual - reads keywords for the group from a csv file
            judgement - under development
            Any other value prints an error.
        wordCount: int
            Number of keywords returned (default 10)
        startCount: int
            Index where keywords are extracted (default 0 i.e. start of list)
        
        Attributes
        ==========
        keywords: list
            List of keywords automatically generated (can also be assigned 
            outside of function manually)
        '''
        #Save input values
        self.keywordCount=wordCount
        self.keywordStar=startCount
        
  
        #adjAdv: most common adjectives and adverbs
        if method=='adjAdv':
            #Get total text string
            txtString=''.join([x for x in self.rawText.values()])
            
            #Get total tag list
            tagList=tagger.tag(nltk.word_tokenize(txtString))
            
            #Define target dict
            targetDict={}
            
            #Loop through each tag in list and get count of tag and word
            for tag in tagList:
                if tag[1] in tagFilterList:
                    word=str.lower(''.join([c for c in tag[0] if c not in string.punctuation]))
                    #Filter out codecerrors
                    if word != 'codecerror':
                        try:
                            targetDict[word]=targetDict[word]+1
                        except:
                            targetDict[word]=1
        
            #Create data frame with counts and sort
            targetDF=pd.DataFrame([[k,v] for k,v in targetDict.items()],columns=['word','count'])
            targetDF.sort_values('count', inplace=True, ascending=False)
            
            #Create keywords based on startCount and wordCount
            ##self.keywords=list(targetDF['word'])[startCount:wordCount+startCount]

                        ###
            keyRaw=list(targetDF['word'])[startCount:wordCount+startCount]
            #print(keyRaw)

            keyStem=[stemmer.stem(word) for word in keyRaw] 
            #print(keyStem)

            self.keywords = keyStem

        elif method=='tfidf':
            # get all tokens for the fileList
            all_words = []
            for toke in self.tokens.values():
                all_words = all_words + toke

            ## create FreqDF with word frequencies from fileList
            freq = FreqDist(all_words) 
            columns_obj = ["term", "freq"]
            freqDF = pd.DataFrame(freq.items(), columns=columns_obj) # convert it to a data frame
            freqDF = freqDF.set_index('term')
            
            ## merge freqDF with idf data frame
            freqit = freqDF.join(self.idf[['idf', 'logidf']])
            # replace null values with max (i.e. if word isn't found, give it the value of the most unique word in IDF)
            maxidf = max(freqit['idf'].dropna())
            maxlogidf = max(freqit['logidf'].dropna())
            freqit.loc[pd.isnull(freqit['idf']), 'idf'] = maxidf
            freqit.loc[pd.isnull(freqit['logidf']), 'logidf'] = maxlogidf

            ## create tfidf columns
            freqit['tfidf'] = freqit['freq'] * freqit['idf']
            freqit['logtfidf'] = freqit['freq'] * freqit['logidf']

            ## order by tfidf weight
            freqit = freqit.sort_values(by='tfidf', ascending=False) 

            #filter out codecerror
            keyslist = freqit.iloc[startCount:wordCount+startCount].index.tolist()
            keywords = []
            for word in keyslist:
                if (word != 'codecerror') & (word != ''):
                    keywords = keywords + [word]

            ##
            self.keywords = keywords

        elif method=='tfidfNoPro':
            # get all tokens for the fileList
            all_words = []
            for toke in self.tokens.values():
                all_words = all_words + toke

            ## create FreqDF with word frequencies from fileList
            freq = FreqDist(all_words) 
            columns_obj = ["term", "freq"]
            freqDF = pd.DataFrame(freq.items(), columns=columns_obj) # convert it to a data frame
            #freqDF = freqDF.set_index('term')

            ## drop the pronouns
            terms = freqDF['term'].values.tolist()
            #noPro = drop_pronouns(terms[1:])
            noPro = drop_pronouns(terms)
            freqDF = freqDF.set_index('term')
            freqDF = freqDF.loc[noPro]
            
            ## merge freqDF with idf data frame
            freqit = freqDF.join(self.idf[['idf', 'logidf']])
            # replace null values with max
            maxidf = max(freqit['idf'].dropna())
            maxlogidf = max(freqit['logidf'].dropna())
            freqit.loc[pd.isnull(freqit['idf']), 'idf'] = maxidf
            freqit.loc[pd.isnull(freqit['logidf']), 'logidf'] = maxlogidf

            ## create tfidf columns
            freqit['tfidf'] = freqit['freq'] * freqit['idf']
            freqit['logtfidf'] = freqit['freq'] * freqit['logidf']

            ## order by tfidf weight
            freqit = freqit.sort_values(by='tfidf', ascending=False) 

            #filter out codecerror
            #keyslist = freqit.iloc[startCount:wordCount+startCount].index.tolist()
            keyslist = freqit.index.tolist()
            keywords = []
            for word in keyslist:
                if (word != 'codecerror') & (word != '') & (len(word)>2):
                    keywords = keywords + [word]

            ##
            self.keywords = keywords[startCount:wordCount+startCount]


        elif method=='manual':
            # Pull data from the csv file
            #filepath = "/Users/samanthagarofalo/Documents/Data Science/Capstone/Keywords.csv"

            # Create dataframe with manually entered keywords
            targetDF = pd.read_csv(manualKeywordFilePath)

            # User input to select the group that we are looking at keywords for
            #group = input("Which group would you like to look at? ")
            try:
                keywords = list(targetDF.Keywords[targetDF['Group'] == self.group])
                #print(keywords)
            except:
                keywords = ['this didnt work']
                print(keywords)

            for element in keywords:
                keywords = element.split(' ')

            if len(keywords) == 0:
                print('%%%%\nNO KEYWORDS FOUND: using tfidf by default\n%%%%')
                self.setKeywords(method='tfidf',wordCount=wordCount,startCount=startCount)

            else:
                self.keywords = keywords


        #Judgement method
        elif method=='judgement':
            posList=nounList+tagFilterList
            #Define target dict
            targetDict={}
            for fileName in self.fileList:
                for judgementStr in self.judgements[fileName]:
                    tagList=tagger.tag(nltk.word_tokenize(judgementStr))
            
                    
                    #Loop through each tag in list and get count of tag and word
                    for tag in tagList:
                        if tag[1] in posList:
                            word=str.lower(''.join([c for c in tag[0] if c not in string.punctuation]))
                            #Stem words if useStem True
                            newStopWords=stopWords
                            if self.useStem:
                                word=stemmer.stem(word)
                                newStopWords=[stemmer.stem(x) for x in stopWords]
                             
                            #Remove stopwords if useStopwords ==False
                            if not self.useStopwords: 
                                newStopWords.append("")

                                
                            #Filter out codecerrors
                            if word not in ['codecerror']+[' ']+newStopWords:
                                try:
                                    targetDict[word]=targetDict[word]+1
                                except:
                                    targetDict[word]=1
            #Create data frame with counts and sort
            targetDF=pd.DataFrame([[k,v] for k,v in targetDict.items()],columns=['word','count'])
            targetDF.sort_values('count', inplace=True, ascending=False)
            
            #Create keywords based on startCount and wordCount
            #self.keywords=list(targetDF['word'])[startCount:wordCount+startCount]

                        ###
            keyRaw=list(targetDF['word'])[startCount:wordCount+startCount]
            #print(keyRaw)

            keyStem=[stemmer.stem(word) for word in keyRaw] 
            #print(keyStem)

            self.keywords = keyStem

        else:
            print('ERROR: Method not found')
コード例 #41
0
file_content = open("input_text.txt").read()
tokens = word_tokenize(file_content)
print('\nTokens List:\n')
print(tokens)

get_ngram(tokens, 3)

obwords = word_tokenize(inaugural.raw('2009-Obama.txt'))
waswords = word_tokenize(inaugural.raw('1789-Washington.txt'))
print('\n\nOBAMA')
ob = FreqDist(obwords)
print('No. of words:', len(obwords))
print('No. of distinct words:', len(ob.keys()))

sortob = sorted(ob.items(), key=lambda x: x[1])
print('\n\nOBAMA50-', sortob[-50:])

was = FreqDist(waswords)
sortwas = sorted(was.items(), key=lambda x: x[1])
print('\n\nWASHINGTON50-', sortwas[-50:], '\n\n')

obuni = FreqDist(list(ngrams(obwords, 1)))
obbi = FreqDist(list(ngrams(obwords, 2)))
obtri = FreqDist(list(ngrams(obwords, 3)))
sortobuni = sorted(obuni.items(), key=lambda x: x[1])
sortobbi = sorted(obbi.items(), key=lambda x: x[1])
sortobtri = sorted(obtri.items(), key=lambda x: x[1])

print("Unigrams: ", sortobuni[-10:])
print("Bigrams: ", sortobbi[-10:])
コード例 #42
0
    "When used in this sense, the term adopts a meaning reminiscent of receptive fields in actual biological nervous systems."
]

##for s in sentences:
##    local = s
##    local = re.sub('[^a-zA-Z0-9- ]', '', local)
##    local = local.split()
##    print(local)

tokens = word_tokenize(sentences[0])
frequencies = FreqDist(tokens)

#for key, value in frequencies.items():
#    print(key, '-> ', value)

tuples = list(frequencies.items())
print(tuples)
tuples.sort(key=lambda x: x[0])
print(tuples)

from nltk.stem import WordNetLemmatizer

lemma = WordNetLemmatizer()

print(lemma.lemmatize('apples'))
print(lemma.lemmatize('dies'))

from nltk.stem import PorterStemmer
# LancasterStemmer

stem = PorterStemmer()
コード例 #43
0
print('%d tags' % len(tag_counts))
print('%d IOBs\n' % len(iob_counts))

if args.sort == 'tag':
    sort_key = lambda item: item[0]
elif args.sort == 'count':
    sort_key = lambda item: item[1]
else:
    raise ValueError('%s is not a valid sort option' % args.sort)

line1 = '  Tag      Count  '
line2 = '=======  ========='

iobs = sorted(iob_counts.keys())

for iob in iobs:
    line1 += '    %s  ' % iob
    line2 += '  ==%s==' % ('=' * len(iob))

print(line1)
print(line2)

for tag, count in sorted(tag_counts.items(),
                         key=sort_key,
                         reverse=args.reverse):
    # use a distinct name so the outer iob_counts dict is not shadowed
    iob_cols = [
        str(tag_iob_counts[tag][iob]).rjust(4 + len(iob)) for iob in iobs
    ]
    print('  '.join([tag.ljust(7), str(count).rjust(9)] + iob_cols))

print(line2)
コード例 #44
0
from nltk.probability import FreqDist
from nltk.corpus import treebank

fd = FreqDist()
for word, tag in treebank.tagged_words():
    fd[tag] += 1
tags = list(fd.items())
tags.sort(key=lambda tag_freq: tag_freq[0])
for tag, freq in tags:
    print('{0}\t\t\t{1}'.format(tag, freq))
コード例 #45
0
        word_fd[word] += 1  # word_fd tallies how many times each word occurs across all sentences
        label_word_fd['pos'][word] += 1

for f in neg_sentence_list:
    for word in f:
        word_fd[word] += 1
        label_word_fd['neg'][word] += 1

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.items():
    pos_score = BigramAssocMeasures.chi_sq(
        label_word_fd['pos'][word], (freq, pos_word_count),
        total_word_count)  # use BigramAssocMeasures.chi_sq to compute the word's positive-class score
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                           (freq, neg_word_count),
                                           total_word_count)
    word_scores[word] = pos_score + neg_score

best = sorted(word_scores.items(), key=lambda s: s[1], reverse=True)[:1000]

bestwords = set([w for w, s in best])


def high_information_feats(words):
    return dict([(word, True) for word in words if word in bestwords])
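A short sketch of how high_information_feats would typically feed an NLTK classifier. The loop header for the positive sentences is cut off above, so the name pos_sentence_list below is an assumption mirroring neg_sentence_list, and the 100-sentence hold-out is arbitrary:

import nltk
from nltk.classify import NaiveBayesClassifier

pos_feats = [(high_information_feats(sent), 'pos') for sent in pos_sentence_list]
neg_feats = [(high_information_feats(sent), 'neg') for sent in neg_sentence_list]

# hold out the first 100 sentences of each class for testing (arbitrary split)
train_set = pos_feats[100:] + neg_feats[100:]
test_set = pos_feats[:100] + neg_feats[:100]

classifier = NaiveBayesClassifier.train(train_set)
print(nltk.classify.util.accuracy(classifier, test_set))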
コード例 #46
0
def create_word_scores():
    angerWords, disgustWords, fearWords, joyWords, surpriseWords = [], [], [], [], []
    with open(ANGER_FILE, 'r', errors="ignore",
              encoding="utf-8") as angerSentence:
        for i in angerSentence:
            angerWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            angerWords.append(angerWord)
    with open(DISGUST_FILE, 'r', errors="ignore",
              encoding="utf-8") as disgustSentence:
        for i in disgustSentence:
            disgustWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            disgustWords.append(disgustWord)
    with open(FEAR_FILE, 'r', errors="ignore",
              encoding="utf-8") as fearSentence:
        for i in fearSentence:
            fearWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            fearWords.append(fearWord)
    with open(JOY_FILE, 'r', errors="ignore", encoding="utf-8") as joySentence:
        for i in joySentence:
            joyWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            joyWords.append(joyWord)
    with open(SURPRISE_FILE, 'r', errors="ignore",
              encoding="utf-8") as surpriseSentence:
        for i in surpriseSentence:
            surpriseWord = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            surpriseWords.append(surpriseWord)
    angerWords = list(itertools.chain(*angerWords))
    disgustWords = list(itertools.chain(*disgustWords))
    fearWords = list(itertools.chain(*fearWords))
    joyWords = list(itertools.chain(*joyWords))
    surpriseWords = list(itertools.chain(*surpriseWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in angerWords:
        word_fd[word.lower()] += 1
        cond_word_fd['anger'][word.lower()] += 1
    for word in disgustWords:
        word_fd[word.lower()] += 1
        cond_word_fd['disgust'][word.lower()] += 1
    for word in fearWords:
        word_fd[word.lower()] += 1
        cond_word_fd['fear'][word.lower()] += 1
    for word in joyWords:
        word_fd[word.lower()] += 1
        cond_word_fd['joy'][word.lower()] += 1
    for word in surpriseWords:
        word_fd[word.lower()] += 1
        cond_word_fd['surprise'][word.lower()] += 1

    anger_word_count = cond_word_fd['anger'].N()
    disgust_word_count = cond_word_fd['disgust'].N()
    fear_word_count = cond_word_fd['fear'].N()
    joy_word_count = cond_word_fd['joy'].N()
    surprise_word_count = cond_word_fd['surprise'].N()
    total_word_count = anger_word_count + disgust_word_count + fear_word_count + joy_word_count + surprise_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        anger_score = BigramAssocMeasures.chi_sq(cond_word_fd['anger'][word],
                                                 (freq, anger_word_count),
                                                 total_word_count)
        disgust_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['disgust'][word], (freq, disgust_word_count),
            total_word_count)
        fear_score = BigramAssocMeasures.chi_sq(cond_word_fd['fear'][word],
                                                (freq, fear_word_count),
                                                total_word_count)
        joy_score = BigramAssocMeasures.chi_sq(cond_word_fd['joy'][word],
                                               (freq, joy_word_count),
                                               total_word_count)
        surprise_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['surprise'][word], (freq, surprise_word_count),
            total_word_count)
        word_scores[
            word] = anger_score + disgust_score + fear_score + joy_score + surprise_score

    return word_scores
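The returned word_scores dictionary is normally reduced to a best-word set before building classifier features. A minimal sketch of that step (the cut-off of 5000 words is an arbitrary assumption):

def find_best_words(word_scores, number):
    # keep the `number` words with the largest combined chi-square score
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number]
    return set(word for word, score in best)


best_words = find_best_words(create_word_scores(), 5000)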
コード例 #47
0
# 1
from nltk.corpus import cess_esp
# 2
print("2.", len(cess_esp.words()))
# 3
print("3.", len(cess_esp.sents()))
# 4
from nltk.probability import FreqDist

first_file = cess_esp.fileids()[0]
cess_freq0 = FreqDist(cess_esp.words(first_file))
print("4.", cess_freq0.most_common(20))
# 5
print("5.", [w for w, k in cess_freq0.most_common()])
# 6
print("6.", [w for w, k in cess_freq0.items() if len(w) > 7 and k > 2])
# 7
print("7.", [k for w, k in cess_freq0.most_common()])
print("7b. Freq de aparición de la preposición a", cess_freq0.get("a", 0))
# 8
print("8. No de palabras que aparecen una sola vez:",
      len([w for w, k in cess_freq0.items() if k == 1]))
# 9
print("9. La palabra más frecuente es", cess_freq0.max())
# 10
from nltk.corpus import PlaintextCorpusReader

mycorpus = PlaintextCorpusReader("../res/", ".*")
# 11
print("11.")
for doc in mycorpus.fileids():
コード例 #48
0
    def removeStopWords(self, corpus):
        '''
        Remove Stop Words, Punctuations, Special Characters, Rare Words
        Fill data structures for plotting the data distribution
        Stop words list is taken from NLTK
        '''
        from nltk.corpus import stopwords
        from nltk.tokenize import word_tokenize
        from nltk.tokenize.treebank import TreebankWordDetokenizer
        nltk.download('words')
        nltk.download('stopwords')
        nltk.download('punkt')
        stop_words = set(stopwords.words('english'))
        word_tokens = word_tokenize(corpus)
        filtered_sentence = []
        stop_word_count = 0
        punctuation_cnt = 0
        digit_cnt = 0
        punc = set(string.punctuation)
        words = set(nltk.corpus.words.words())
        nonEnglish = 0

        for w in word_tokens:
            # Calculate non english words
            if w.lower() not in words:
                nonEnglish += 1
            # Calculate Punctuation Count
            if w in punc:
                punctuation_cnt += 1
            # Calculate Numeric Digits Count
            elif w.isnumeric():
                digit_cnt += 1

            elif w not in stop_words:
                filtered_sentence.append(w)

            else:
                stop_word_count += 1
        # Remove rare words
        if REMOVE_RARE_WORDS:
            fdist = FreqDist(filtered_sentence)
            # Get the set of words with frequency of 5 or less
            rare_words = set(word for word, count in fdist.items() if count <= 5)
            if DEBUG_PRINT:
                print(rare_words)
            # Rebuild the list instead of removing items while iterating over it
            filtered_sentence = [
                word for word in filtered_sentence if word not in rare_words
            ]

        # Update internal data structure used in plotting
        corpus_attributes.update({"Stop_Words_Count": (stop_word_count)})
        corpus_attributes.update({"Punctuation_Count": (punctuation_cnt)})
        corpus_attributes.update({"No_Of_Words": (len(word_tokens))})
        corpus_attributes.update({"Number_Count": (digit_cnt)})
        corpus_attributes.update({"Non_english_word_Count": (nonEnglish)})
        if DEBUG_PRINT:
            print('stop_word_count', stop_word_count)
            print('punctuation_cnt', punctuation_cnt)
            print('No_Of_Words', (len(word_tokens)))
            print('Number_Count', digit_cnt)
            print('Non_english_word_Count', nonEnglish)
        return TreebankWordDetokenizer().detokenize(filtered_sentence)
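A hypothetical usage sketch, assuming prep is an instance of the enclosing class, raw_text holds the corpus as one string, and corpus_attributes is the module-level dict that the method updates:

cleaned = prep.removeStopWords(raw_text)
print(cleaned[:200])                              # detokenized text without stop words
print(corpus_attributes['Stop_Words_Count'])      # counters filled as a side effect
print(corpus_attributes['Non_english_word_Count'])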
コード例 #49
0
def topwords():
    """
        inspired by
        http://www.huffingtonpost.com/brian-honigman/the-100-most-popular-hash_b_2463195.html
        http://editd.com/features/monitor/

        used these resources for understanding nltk usage
        http://www.laurentluce.com/posts/twitter-sentiment-analysis-using-python-and-nltk/
        http://text-processing.com/demo/sentiment/
        http://ravikiranj.net/drupal/201205/code/machine-learning/how-build-twitter-sentiment-analyzer
        http://streamhacker.com/2010/05/24/text-classification-sentiment-analysis-stopwords-collocations/

        http://fashionweekdates.com/world-fashion-week-dates-schedule.html
    """

    ## split tweets into non-English and English bins
    ru = db.GqlQuery("SELECT * FROM Tweets where iso!=:1", 'en').fetch(limit=1000)
    en = db.GqlQuery("SELECT * FROM Tweets where iso=:1", 'en').fetch(limit=1000)

    #this is used because nltk.corpus.stopwords.words('english') doesn't work in GAE
    # from https://github.com/arc12/Text-Mining-Weak-Signals/wiki/Standard-set-of-english-stopwords
    stop = "a, about, above, across, after, again, against, all, almost, alone, along, already, also, although, always, am, among, an, and, another, any, anybody, anyone, anything, anywhere, are, area, areas, aren't, around, as, ask, asked, asking, asks, at, away, b, back, backed, backing, backs, be, became, because, become, becomes, been, before, began, behind, being, beings, below, best, better, between, big, both, but, by, c, came, can, cannot, can't, case, cases, certain, certainly, clear, clearly, come, could, couldn't, d, did, didn't, differ, different, differently, do, does, doesn't, doing, done, don't, down, downed, downing, downs, during, e, each, early, either, end, ended, ending, ends, enough, even, evenly, ever, every, everybody, everyone, everything, everywhere, f, face, faces, fact, facts, far, felt, few, find, finds, first, for, four, from, full, fully, further, furthered, furthering, furthers, g, gave, general, generally, get, gets, give, given, gives, go, going, good, goods, got, great, greater, greatest, group, grouped, grouping, groups, h, had, hadn't, has, hasn't, have, haven't, having, he, he'd, he'll, her, here, here's, hers, herself, he's, high, higher, highest, him, himself, his, how, however, how's, i, i'd, if, i'll, i'm, important, in, interest, interested, interesting, interests, into, is, isn't, it, its, it's, itself, i've, j, just, k, keep, keeps, kind, knew, know, known, knows, l, large, largely, last, later, latest, least, less, let, lets, let's, like, likely, long, longer, longest, m, made, make, making, man, many, may, me, member, members, men, might, more, most, mostly, mr, mrs, much, must, mustn't, my, myself, n, necessary, need, needed, needing, needs, never, new, newer, newest, next, no, nobody, non, noone, nor, not, nothing, now, nowhere, number, numbers, o, of, off, often, old, older, oldest, on, once, one, only, open, opened, opening, opens, or, order, ordered, ordering, orders, other, others, ought, our, ours, ourselves, out, over, own, p, part, parted, parting, parts, per, perhaps, place, places, point, pointed, pointing, points, possible, present, presented, presenting, presents, problem, problems, put, puts, q, quite, r, rather, really, right, room, rooms, s, said, same, saw, say, says, second, seconds, see, seem, seemed, seeming, seems, sees, several, shall, shan't, she, she'd, she'll, she's, should, shouldn't, show, showed, showing, shows, side, sides, since, small, smaller, smallest, so, some, somebody, someone, something, somewhere, state, states, still, such, sure, t, take, taken, than, that, that's, the, their, theirs, them, themselves, then, there, therefore, there's, these, they, they'd, they'll, they're, they've, thing, things, think, thinks, this, those, though, thought, thoughts, three, through, thus, to, today, together, too, took, toward, turn, turned, turning, turns, two, u, under, until, up, upon, us, use, used, uses, v, very, w, want, wanted, wanting, wants, was, wasn't, way, ways, we, we'd, well, we'll, wells, went, were, we're, weren't, we've, what, what's, when, when's, where, where's, whether, which, while, who, whole, whom, who's, whose, why, why's, will, with, within, without, won't, work, worked, working, works, would, wouldn't, x, y, year, years, yes, yet, you, you'd, you'll, young, younger, youngest, your, you're, yours, yourself, yourselves, you've, z"

    stopwordsenglish = re.findall(r'\w+', stop, flags=re.UNICODE)  # re.LOCALE cannot be combined with str patterns in Python 3

    stopwordstwitter = ['http', '#', '@', '!', ':', ';', '&', '\'', '-',
                        't', 'co', 'rt']


    stopwords_list = stopwordsenglish + stopwordstwitter
    freq1 = FreqDist()
    freq2 = FreqDist()

    for t in ru:
        #We only want to work with lowercase for the comparisons
        sentence = t.tweet.lower()

        #remove punctuation and split into separate words
        words = re.findall(r'\w+', sentence, flags=re.UNICODE)

        #corpus = nltk.word_tokenize(words)
        for a in words:
            if a not in stopwords_list:
                freq1[a] += 1  # FreqDist.inc() was removed in NLTK 3

    for t in en:
        #We only want to work with lowercase for the comparisons
        sentence = t.tweet.lower()

        #remove punctuation and split into separate words
        words = re.findall(r'\w+', sentence, flags=re.UNICODE)

        #corpus = nltk.word_tokenize(t.tweet)
        for a in words:
            if a not in stopwords_list:
                freq2[a] += 1  # FreqDist.inc() was removed in NLTK 3

    #display results
    #bins = freq1.B()  # Returns: The total number of sample bins with counts > 0
    f1 = freq1.most_common(90)  # list of (word, count) tuples for the 90 most frequent words
    f2 = freq2.most_common(90)

    context = {'one': f1, 'two': f2,
               'stop': stopwords_list
               }

    return render_template('topwords.html', **context)
コード例 #50
0
# ANALYSE SPEECH FROM TRANSCRIPTS
# This algorithm reads the speech text collected from the webscraping
# algorithms and uses NLTK functions to analyse the most frequent words used.

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
import csv

# Read in the speech data
with open("speechData.txt", "r") as words:
    text = words.read()
    text = str(text).replace("][", ", ")

# Retokenise words and create frequency distribution
words = word_tokenize(str(text))
fdist = FreqDist(words)

# Write the results into a csv file
with open("frequency.csv", "w") as fp:
    writer = csv.writer(fp, quoting=csv.QUOTE_ALL)
    writer.writerows(fdist.items())
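matplotlib.pyplot is imported above but never used; a small optional follow-up could plot the most frequent tokens from the same distribution, for example:

top_words, top_counts = zip(*fdist.most_common(20))
plt.bar(top_words, top_counts)
plt.xticks(rotation=90)
plt.title("20 most frequent tokens in the speeches")
plt.tight_layout()
plt.show()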
コード例 #51
0
 def save_unique_words_analysis(self, uniqueness_threshold):
     """Сохраняем информацию о количестве уникальных слов и количестве статей, в которых эти слова встречаются, а также информацию о заданном количестве уникальных слов"""
     articles_tokens = list()
     articles_words_info = dict()
     for (codex_type, _) in tqdm(self.parser.codex_urls):
         raw_articles_info = self.parser.sorted_articles_info[codex_type]
         for article_info in tqdm(raw_articles_info):
             text = self.parser.get_article_text_by_id(article_info.id)
             text = text.lower()
             text = self.remove_chars_from_text(text, self.spec_chars)
             article_tokens = word_tokenize(' '.join(
                 self.mystem.lemmatize(text)))
             for stop_word in self.stop_words:
                 while stop_word in article_tokens:
                     article_tokens.remove(stop_word)
             articles_words_info[self.get_unique_article_identifier(
                 codex_type, article_info.id)] = list(set(article_tokens))
             articles_tokens.extend(article_tokens)
     text = Text(articles_tokens)
     f_dist = FreqDist(text)
     f_dist = list(
         filter(lambda item: item[1] <= uniqueness_threshold,
                f_dist.items()))
     unique_words_info = dict()
     # Store the info as: 'unique word': [count across the whole corpus, number of articles containing that word]
     for word_info in f_dist:
         if word_info[0] not in unique_words_info:
             unique_words_info[word_info[0]] = [word_info[1], 0]
         for article_id in tqdm(articles_words_info):
             if word_info[0] in articles_words_info[article_id]:
                 unique_words_info[word_info[0]][1] += 1
     if os.path.exists(self.config['articles_unique_words_info_file']):
         os.remove(self.config['articles_unique_words_info_file'])
     with open(self.config['articles_unique_words_info_file'],
               mode='w') as articles_unique_words_info_file:
         articles_unique_words_info_writer = csv.writer(
             articles_unique_words_info_file,
             delimiter=',',
             quotechar='"',
             quoting=csv.QUOTE_MINIMAL)
         articles_unique_words_info_writer.writerow(
             ['word', 'word_count', 'articles_count'])
         for info in unique_words_info.items():
             articles_unique_words_info_writer.writerow(
                 [info[0], info[1][0], info[1][1]])
     unique_words_metrics = dict()
     # Store the info as: 'a given word count in the whole corpus': 'the number of such words in the whole corpus'
     for value in unique_words_info.values():
         if value[0] not in unique_words_metrics:
             unique_words_metrics[value[0]] = value[1]
         else:
             unique_words_metrics[value[0]] += value[1]
     if os.path.exists(self.config['articles_unique_words_analysis_file']):
         os.remove(self.config['articles_unique_words_analysis_file'])
     with open(self.config['articles_unique_words_analysis_file'],
               mode='w') as articles_unique_words_analysis_file:
         articles_unique_words_analysis_writer = csv.writer(
             articles_unique_words_analysis_file,
             delimiter=',',
             quotechar='"',
             quoting=csv.QUOTE_MINIMAL)
         articles_unique_words_analysis_writer.writerow([
             'count_unique_words_frequency', 'count_unique_words_in_corpus'
         ])
         for info in unique_words_metrics.items():
             articles_unique_words_analysis_writer.writerow(
                 [info[0], info[1]])
     if os.path.exists(
             self.
             config['articles_unique_words_analysis_file_with_frequency']):
         os.remove(
             self.
             config['articles_unique_words_analysis_file_with_frequency'])
     with open(self.
               config['articles_unique_words_analysis_file_with_frequency'],
               mode='w') as articles_unique_words_analysis_file:
         articles_unique_words_analysis_writer = csv.writer(
             articles_unique_words_analysis_file,
             delimiter=',',
             quotechar='"',
             quoting=csv.QUOTE_MINIMAL)
         articles_unique_words_analysis_writer.writerow([
             'count_unique_words_frequency',
             'count_unique_words_in_corpus_frequency'
         ])
         for info in unique_words_metrics.items():
             articles_unique_words_analysis_writer.writerow(
                 [info[0], info[1] / len(articles_tokens)])
コード例 #52
0
simplefilter(action='ignore', category=FutureWarning)

df = pd.read_csv("chatbot_classes.csv")

ques_words = []
list_sent = list(df['question'])

for sentence in list_sent:
    for words in sentence.split():
        ques_words.append(words)

fd = FreqDist()
for word in ques_words:
    fd[word.lower()] += 1

labels = []
keys = []

for T in fd.most_common(10):
    labels.append(T[0])
    keys.append(T[1])


#function removing punctuations and lower the words
def lower_punc(text):
    w = []
    for word in text.split():
        w.append(word.lower())

    wd = []
コード例 #53
0
ファイル: nimgenetics.py プロジェクト: jmiguel792/HPOMining
def subsetManual(c_uncleaned, c_global):
    """
    This function builds two dictionaries, in ascending and descending order of the global term counts,
    using the uncurated corpus_lookup and running the lookup technique between that corpus and the curated
    global corpus. The same strategy also yields a selection of the 500 least frequent terms, to be used
    for the manual mapping to HPO. The parameters are the paths to the uncurated corpus_lookup and to the
    global corpus generated by the mergeCorpus function.
    """

    # part 1: uncurated text -> texto
    with open(c_uncleaned, 'r') as f:
        texto = []
        lines = f.readlines()
        for line in lines:
            texto.append(line)
        del (lines)
    f.closed

    # part 2: curated global corpus -> texto_p
    with open(c_global, 'r') as f:
        texto_p = []
        lines = f.readlines()
        for line in lines:
            texto_p.append(line)
        del (lines)
    f.closed

    # part 3: lookup for the term counts -> mapped
    mapped = []
    for line in texto:
        found = []
        for item in texto_p:
            if item in line:
                found.append(item)
                #print('item found:', item)
        mapped.append(found)

    # flatten mapped -> mapped_flat
    mapped_flat = list(itertools.chain.from_iterable(mapped))

    # strip newlines -> mapped_final
    mapped_final = []
    for ele in mapped_flat:
        mapped_final.append(ele.replace('\n', ''))

    # part 4: term counts
    counter = collections.Counter(mapped_final)

    # dictionary items (term -> count), ordered by value below
    d = counter.items()

    # dictionary in ascending order of term counts
    d_ord_increasing = collections.OrderedDict(
        sorted(sorted(d), key=lambda t: t[1]))

    # dictionary in descending order of term counts
    d_ord_decreasing = collections.OrderedDict(
        sorted(sorted(d), key=lambda t: t[1], reverse=True))

    # DataFrame to write excel by ascending or descending order
    df = pd.DataFrame(data=d_ord_decreasing, index=['count']).T
    df1 = pd.DataFrame(data=d_ord_increasing, index=['count']).T
    df.to_excel(excel_writer='conteo_terminos_decreasing.xlsx')
    df1.to_excel(excel_writer='conteo_terminos_increasing.xlsx')

    # part 5: extract the 500 terms for manual validation
    fdist = FreqDist(mapped_final)
    all_items = fdist.items()
    terms = fdist.most_common()[-500:]

    terminos = []
    for i in terms:
        terminos.append(i[0])

    # part 6: save the selected terms for the manual mapping
    outfile = 'términos_mapeo_manual.txt'
    with open(outfile, 'w') as w:
        w.write('\n'.join(terminos))

    return all_items
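A hypothetical invocation with placeholder file names (the real paths depend on the HPOMining project layout):

all_items = subsetManual('corpus_lookup_uncleaned.txt', 'corpus_global_curated.txt')
# writes the two Excel count files and 'términos_mapeo_manual.txt',
# and returns every (term, count) pair found by the lookup
print(len(all_items), 'distinct mapped terms')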
コード例 #54
0
def lexicalDiversity(text):
    # shows how many times, on average, each word is used:
    # larger results mean less diversity, lower results mean higher diversity
    return len(text) / len(set(text))


# print(lexicalDiversity(text3))
'''
Frequency distributions show the tallies of each word used
'''
from nltk.probability import FreqDist
fdist = FreqDist(text3)  # frequency distribution of every word, i.e. tallies of each word used
vocab = fdist.items()  # (word, count) pairs from the frequency distribution
# print(vocab)
# hapaxes=fdist.hapaxes() #hapaxes are words that only appear once
# fdist.plot(25) #plot 25 most common tokens
# fdist.tabulate()

#now we can try to filter out and only get important words of a critical length and occurrence
uniqueWord = set(text3)
importantWords = [wd for wd in uniqueWord if len(wd) > 5 and fdist[wd] > 10]
# print(sorted(importantWords))

#Collocations - show words that appear together most often
text3.collocations()  # prints the collocations directly and returns None

#Extract num chars, words and sents
コード例 #55
0
import nltk
from nltk.probability import FreqDist

sense = nltk.corpus.gutenberg.words('austen-sense.txt')
fdist = FreqDist(sense)
rank = 0
for word, count in fdist.most_common():  # iterate from most to least frequent
    rank = rank + 1
    print(str(rank) + " " + str(count) + " " + str(word))
コード例 #56
0

# Read the files, tokenize them, and get the frequencies of the word lengths.
# I used two texts by Philip K. Dick: "La segunda variedad" and "El hombre variable"
# They can be swapped for whichever texts you have available :)

pkdsv = read_file('PKD_Segunda_variedad.txt')
text_sv = nltk.word_tokenize(pkdsv)
fdist_sv = FreqDist(len(palabra) for palabra in text_sv)

pkdhv = read_file('PKD_Hombre_variable.txt')
text_hv = nltk.word_tokenize(pkdhv)
fdist_hv = FreqDist(len(palabra) for palabra in text_hv)

# Sort the word lengths together with their frequencies.
sorted_by_first_hv = sorted(fdist_hv.items(), key=lambda tup: tup[0])
sorted_by_first_sv = sorted(fdist_sv.items(), key=lambda tup: tup[0])

# Convert the lists of tuples into dictionaries.
dict_hv = dict(sorted_by_first_hv)
dict_sv = dict(sorted_by_first_sv)

# Build a new dictionary with the keys common to both and the values from the first
dict_hv_2 = {}
for key in dict_hv:
    if key in dict_sv:
        dict_hv_2.update({key: dict_hv[key]})

# Build a new dictionary with the keys common to both and the values from the second
dict_sv_2 = {}
for key in dict_sv:
コード例 #57
0
def answer_three():

    wordfreq = FreqDist(text1)
    wordfreq_desc = sorted(wordfreq.items(), key=lambda x: x[1], reverse=True)

    return wordfreq_desc[:20]  # Your answer here
コード例 #58
0
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.probability import FreqDist
from heapq import nlargest
from collections import defaultdict
import pandas as pd

with open('shengine.txt', encoding='utf-8', errors='ignore') as f1:
    data = f1.read().replace("\n", " ").replace(u"\u2019", "'").replace(
        u"\u2018", "'")
sents = sent_tokenize(data)
words = word_tokenize(data.lower())
mystops = set(stopwords.words('english') + list(punctuation))
words = [word for word in words if word not in mystops]
freq = FreqDist(words)
print(pd.DataFrame(list(freq.items()), columns=["Word", "Frequency"]))
#print(nlargest(10,freq, key = freq.get))   #count for the word
ranking = defaultdict(int)
for i, sent in enumerate(sents):
    for w in word_tokenize(sent.lower()):
        if w in freq:
            ranking[i] += freq[w]

final_numbers = nlargest(3, ranking, key=ranking.get)
print(final_numbers)
print([sents[j] for j in sorted(final_numbers)])
コード例 #59
0
def TrainingSynonymCheck(new_words, all_words):
    realwords = []
    for i in all_words:
        if i.isalpha():
            realwords.append(i)

    lemmatized_words = []
    for i in realwords:
        n = lemmatizer.lemmatize(i)
        lemmatized_words.append(str(n))

    FreqDictionary = FreqDist(lemmatized_words)
    l = sorted(FreqDictionary.items(),
               key=operator.itemgetter(1),
               reverse=True)

    #print l

    unique_words = []
    for i in l:
        unique_words.append(i[0])

    # Taking first words from all synsets
    # synDict={}
    # flag=0
    # for word in unique_words:
    #     if word.isalpha():
    #         syns=wn.synsets(word)
    #         lst=[]
    #         for s in syns:
    #             a=s.lemmas()[0].name()
    #             if a!=word and (a not in lst):
    #                 lst.append(str(a))
    #         if lst!=[]:
    #             synDict[word]=(lst)

    # Taking the first synset
    synDict = {}
    flag = 0
    for word in unique_words:
        if word.isalpha():
            syns = wn.synsets(word)
            lst = [word]
            if syns != []:
                #for s in syns:
                s = syns[0]
                a = s.lemmas()
                for i in a:
                    f = i.name()
                    if f != word and (f not in lst):
                        lst.append(str(f))
            if lst != []:
                synDict[word] = (lst)

    synunique_words = copy.deepcopy(
        unique_words
    )  ## synunique_words : copy that contains all the unique words in the questions initially

    for word in unique_words:
        i = unique_words.index(word)
        if word in synDict.keys():
            for syn in synDict[word]:
                for j in unique_words[i + 1:]:
                    if syn == j:
                        idx = unique_words.index(j)
                        unique_words[idx] = word

    return [
        unique_words, synunique_words
    ]  # unique words : changed words , synunique_words : original words unique
コード例 #60
0
def train_and_test(reviews_pos, reviews_neg):
    """
    Train and test
    :param reviews_pos: list of positive reviews
    :param reviews_neg: list of negative reviews
    :return:
    """

    # Count the frequency of every word in the positive reviews and in the negative reviews
    tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
    tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for word in tot_poswords:
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in tot_negwords:
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1

    pos_words = len(tot_poswords)
    neg_words = len(tot_negwords)
    tot_words = pos_words + neg_words

    # Score for each word
    word_scores = {}
    for word, freq in iter(word_fd.items()):
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_words), tot_words)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_words), tot_words)
        word_scores[word] = pos_score + neg_score
    print('total: ', len(word_scores))

    # Keep only the 10,000 highest-scoring words
    best = sorted(iter(word_scores.items()),
                  key=lambda args: args[1],
                  reverse=True)[:10000]
    bestwords = set([w for w, s in best])

    negfeatures = [(best_words_features(r.words, bestwords), 'neg')
                   for r in reviews_neg]
    posfeatures = [(best_words_features(r.words, bestwords), 'pos')
                   for r in reviews_pos]

    # Split into an 80% training set and a 20% test set
    portionpos = int(len(posfeatures) * 0.8)
    portionneg = int(len(negfeatures) * 0.8)
    print(portionpos, '-', portionneg)
    trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
    print(len(trainfeatures))

    # Training
    classifier = NaiveBayesClassifier.train(trainfeatures)

    # Testing
    testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
    shuffle(testfeatures)
    err = 0
    print('test on: ', len(testfeatures))
    for r in testfeatures:
        sent = classifier.classify(r[0])
        # print(r[1], '-pred: ', sent)
        if sent != r[1]:
            err += 1.
    print('error rate: ', err / float(len(testfeatures)))
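A hypothetical driver for this function, assuming each review object exposes a .words token list and that best_words_features (referenced above but not shown) is defined elsewhere; load_reviews is a placeholder loader, not a real API:

reviews_pos = load_reviews('pos')   # placeholder: returns objects with a .words list
reviews_neg = load_reviews('neg')
train_and_test(reviews_pos, reviews_neg)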