def intersection_N_most_popular(text, text2, N, exclude):
    set_exclude = set(exclude)
    M = len(text)
    M2 = len(text2)
    #        tokens = [w.lower() for w in text]
    #        tokens2 = [w.lower() for w in text2]
    fd = FreqDist(text)
    new = []
    sort = sorted(fd.items(), key=itemgetter(1), reverse=True)
    j = 0
    while len(new) < N and j < len(sort):
        if not sort[j][0] in set_exclude:
            new.append(sort[j][0])
        j += 1
    fd2 = FreqDist(text2)
    new2 = []
    sort = sorted(fd2.items(), key=itemgetter(1), reverse=True)
    j = 0
    while len(new2) < N and j < len(sort):
        if not sort[j][0] in set_exclude:
            new2.append(sort[j][0])
        j += 1
    total = 0
    for word in new:
        if word in new2:
            print word, 1.0 * fd[word] / M, 1.0 * fd2[word] / M2
            total += 1
    print "%i words in the intersection" % total
Example 2
def make_cutOff(flatList, bottomCutOff, topCutOff):
    '''
    INPUT:
    flatList is a 1-d list of all tokens in a set of tweets, and bottomCutOff and
    topCutOff are integers
    OUTPUT:
    newVocab = a 1-d list of all tokens we want to keep
    thrownOut = a 1-d list of all tokens to throw out
    '''
    fd = FreqDist(flatList)
    newVocab = []
    thrownOut = []

    for item in fd.items()[:topCutOff]:
        # append most common words
        thrownOut.append(item)

    for item in fd.items()[topCutOff:]:
        if item[1] > bottomCutOff:
            # append good words
            newVocab.append(item[0])
        else:
            # append uncommon words
            thrownOut.append(item)

    print 'Cutoffs made...'
    return newVocab, thrownOut
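A small usage sketch with a hypothetical token list; note that slicing fd.items() relies on NLTK 2.x, where FreqDist.items() is already sorted by decreasing frequency:

tokens = ["rt", "rt", "rt", "python", "python", "nltk", "rare"]
newVocab, thrownOut = make_cutOff(tokens, bottomCutOff=1, topCutOff=1)
# under NLTK 2.x: newVocab == ['python']; "rt" (the single most frequent token)
# and the tokens seen only once end up in thrownOut as (token, count) pairs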
Example 4
def n_gram_nltk(terms):
    terms_bi_gram = bigrams(terms)
    terms_tri_gram = trigrams(terms)
    uni_gram_matrix = FreqDist(terms)
    bi_gram_matrix = FreqDist(terms_bi_gram)
    tri_gram_matrix = FreqDist(terms_tri_gram)
    return uni_gram_matrix.items(), bi_gram_matrix.items(), tri_gram_matrix.items()
 def get_most_frequent(self, rawText, number = None, cleaning_level = 3):
     cleaned_tokens_levels = TokensCleaner.clean(self, rawText, cleaning_level)
     freq_distributions_levels = dict()
     for level, cleand_tokens in cleaned_tokens_levels.items():
         all_words = FreqDist(cleand_tokens)
         if number == None:
             freq_distributions_levels[level] = all_words.items()
         else:
             freq_distributions_levels[level] = all_words.items()[:number]
     return freq_distributions_levels
Example 6
def main():
    fileName = '../data/deals.txt'
    words,lines = get_filter(fileName)
    word_dist = FreqDist(words)  # get distribution, descending order
    print("Most Popular Term: ",word_dist.items()[0])# question 1
    print("Least Popular Term: ", word_dist.items()[-1]) # question 2
#   solution 1 for question 3
#    print("Types of Guitars Found:  ",len(count_guitar_types.count(lines)))
#   Solution 2, better and more reasonable, but could still be improved
    print("Type of Guitars mentioned", count_guitar_types2.count(lines)) 
Example 7
    def __getTimelineFeatures(self, timeline):
        logger.info(u"Get timeline features")
        tweets = []
        self.__changePhase(PHASE["GET_TIMELINE_URLS"])
        for t in timeline:
            try:
                tweet = TweetText(t, self.__urlBuilder, self.__userBuilder)
            except:
                logger.exception(u"Error: \"" + unicode(t) + u"\"")
                raise ValueError(t)
            logger.debug(u"Tweet:" + unicode(tweet))
            tweets.append(tweet)

        urls = []
        ti = 0
        for tweet in tweets:
            for url in tweet.urls():
                self.__breakIfStopped()
                self.__urlResolver.addUrlToQueue(url)
                urls.append(url)
            logger.info(u"Tweet:" + unicode(tweet))
            ti += 1
            self.__proc = 100 * float(ti) / float(len(tweets))

        # Categories
        self.__changePhase(PHASE["GET_TIMELINE_FEATURES"])
        url2labels = {}
        ui = 0
        for url in urls:
            self.__breakIfStopped()
            if not url.isError():
                logger.debug(u"Classify " + unicode(url.getUrl()))
                url2labels[url.getExpandedUrl()] = self._classifier().classify(url.getText())
            ui += 1
            self.__proc = 100 * float(ui) / float(len(urls))

        labelsFreq = FreqDist()
        for labels in url2labels.values():
            for label in labels:
                labelsFreq.inc(label)
        self.__catFreq = labelsFreq.items()
        logger.info(u"Categories: "  + unicode(labelsFreq.items()))
        labelsFreqValues = [(item[0], item[1]) for item in labelsFreq.items() if item[0] not in ['short', 'medium', 'long']]
        # normalization
        labelsFreqValues = {label: float(freq) / float(max([f for l,f in labelsFreqValues])) for label, freq in labelsFreqValues}
        logger.info(u"Category factors: "  + unicode(labelsFreqValues))

        # Languages
        langFreq = FreqDist()
        for u in urls:
            langFreq.inc(u.lang())
        self.__langFreq = langFreq.items()
        logger.info(u"Languages: " + unicode(langFreq.items()))

        return labelsFreqValues
Example 8
def get_feats_counts(text):
    tokens = word_tokenize(text)
    t = Text(tokens)
    g1s = ngrams(t, 1)
    freq1 = FreqDist(g1s)
    g1s_list = [(g, count) for g, count in freq1.items()]
    g2s = ngrams(t, 2)
    freq2 = FreqDist(g2s)
    g2s_list = [(g, count) for g, count in freq2.items()]
    gs = g1s_list + g2s_list
    return dict(gs)
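A quick illustration of the returned feature dictionary, using a hypothetical sentence (assumes word_tokenize, Text, ngrams and FreqDist are imported from nltk):

feats = get_feats_counts("the cat sat on the mat")
# keys are 1- and 2-tuples of tokens, values their counts, e.g.
# feats[("the",)] == 2 and feats[("the", "cat")] == 1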
def prepResult(title, array):
    arq_base = open("base.txt","r")
    text = arq_base.read().lower().split()

    chrs = (78 - len(title)) / 2
    cont = 0
    enf = ""
    enf2 = "_"
    while cont < chrs:
        enf += "*"
        enf2 += "_"
        cont += 1
    result = ("\n//" + enf + " " + title + " " + enf + "\\\\\n\n"
        "|                   Palavra                    |   |          Frequência           |\n\n")
    frequencia = FreqDist(text)
    frequencia_ord = ordereddict(sorted(frequencia.items(), key = lambda e: (-e[1], e[0])))

    for freq in frequencia_ord:
        if(freq in array):
            lim = 84 / 2
            right = lim / 2 + len(freq)
            chrs = (78 - (len(freq)) + len(str(frequencia_ord[freq]))) / 4
            cont = 0
            enf = ""
            while cont < chrs:
                enf += " "
                cont += 1
            result += "|" + enf + freq + enf + " | " + enf + str(frequencia_ord[freq]) + enf + "|\n"
        

    result += "\n\\\\________________________________________________________________________________//\n\n"
    arq_base.close()
    return result
 def __extract_bigram_words(self, bigrams, values):
     bigrams_number_per_value = self.__configuration_map["most_frequent_bigrams_number_per_value"]
     most_frequent_bigrams = {}
     for value in values:
         fdist = FreqDist(bigrams[value])
         most_frequent_bigrams[value] = fdist.items()[:bigrams_number_per_value]
     return most_frequent_bigrams
def freq_dist(input, filtering_functions=[], plot = False, limit = None, return_counts = False):
    """Takes a list of words (hashtags, keywrods, anything) and plots a frequency distribution
       
       Filtering functions is an ORDERED set of functions to call on the raw input list that are executed before the freq dist
       That is, each item in input is run though f1,f2..,fn where filtering_functions = [f1,...fn]
       
       limit truncates the freq_dist to the limit most common items
       
       return_counts determines whether a list of tuples (word, count) are returned, 
          or whether a list of just the limit most used words is returned
    """
    for f in filtering_functions + [str.lower, str.strip]:
        input = map(f, input) 
    
    nltk_fdist = FreqDist(list(input))    
    
    if plot: #use nltks built in plotting function before destroying the data structure
        nltk_fdist.plot(limit) if limit else nltk_fdist.plot()      
    
    fdist = sorted(nltk_fdist.items(), key=lambda x: (-x[1], x[0]))  # sort by count, alphabetically for ties
    fdist = fdist[0:limit] if limit else fdist                       # apply limit
    fdist = [i[0] for i in fdist] if not return_counts else fdist    # remove counts if desired
        

    
    return fdist
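A minimal usage sketch with hypothetical hashtags, no plotting, counts kept (assumes FreqDist is imported from nltk):

tags = ["  NLP ", "nlp", "Python", "python", "python"]
freq_dist(tags, limit=2, return_counts=True)
# -> [('python', 3), ('nlp', 2)]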
Example 12
File: nc.py Project: TorchmanX/TARS
def preprocess(content):
	stopset = set(stopwords.words('english'))
	#replace punctuation and tag with space
	tokens = word_tokenize(re.sub(r'<p>|</p>|[^A-Za-z ]', ' ', content.lower())) 
	pos_list = pos_tag(tokens)
	s_tokens = list()

	#noun and verb only
	for pos in pos_list:
		#print pos[1]
		#if pos[1] in ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
		if pos[1] in ['NN', 'NNS']:
			s_tokens.append(pos[0])

	wordfreq = FreqDist(s_tokens)
	stemfreq = dict()
	st = LancasterStemmer()
	for word, freq in wordfreq.items():
		#stopwords
		if word in stopset:
			del wordfreq[word]
			continue
		#tiny words
		if len(word) <= 2:
			del wordfreq[word]
			continue
		#stemmer
		stem = st.stem(word)
		try:
			stemfreq[stem]+=freq
		except:
			stemfreq[stem]=freq
	return stemfreq
Example 13
def ngrams_cloud(text, output_filepath):
    tokenizer = RegexpTokenizer(r'\w+')
    text = ' '.join(text)
    sent_words = tokenizer.tokenize(text)

    # Calculate the frequency distance
    freq_dist = FreqDist(bigrams(sent_words))
    # Sort highest to lowest based on the score.
    scoredList = sorted(freq_dist.items(), key=itemgetter(1), reverse=True)
    # word_dict is the dictionary we'll use for the word cloud.
    # Load dictionary with the FOR loop below.
    word_dict = {}
    # Get the bigram and make a contiguous string for the dictionary key.
    # Set the key to the scored value.
    listLen = len(scoredList)
    for i in range(listLen):
        word_dict['_'.join(scoredList[i][0])] = scoredList[i][1]

    WC_max_words = 50
    wordcloud = WordCloud(
        max_words=WC_max_words,
        height=400,
        width=800,
        collocations=False,
        background_color='white',
        colormap='Set2').generate_from_frequencies(
            word_dict
        )  # height=WC_height, width=WC_width, background_color='white')
    wordcloud.to_file(os.path.join(output_filepath, "bigrams_wordcloud.png"))
Example 14
def get_word_bigram_scores(pos_words, neg_words):
    pos_words_plain = list(itertools.chain(*pos_words))
    neg_words_plain = list(itertools.chain(*neg_words))

    bigram_finder = BigramCollocationFinder.from_words(pos_words_plain)
    pos_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    bigram_finder = BigramCollocationFinder.from_words(neg_words_plain)
    neg_bigrams = bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words_plain + pos_bigrams  # words plus bigram collocations
    neg = neg_words_plain + neg_bigrams
    all_words = pos + neg

    pos_word_fd = FreqDist(pos)
    neg_word_fd = FreqDist(neg)
    word_fd = FreqDist(all_words)

    pos_word_count = pos_word_fd.N()  # number of positive words
    neg_word_count = neg_word_fd.N()  # number of negative words
    #total_word_count = pos_word_count + neg_word_count
    total_word_count = word_fd.N()

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(pos_word_fd[word],
                                               (freq, pos_word_count),
                                               total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(neg_word_fd[word],
                                               (freq, neg_word_count),
                                               total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
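A sketch of how the returned chi-square scores are typically consumed, keeping only the highest-scoring words and bigrams as features; pos_words and neg_words are assumed to be lists of token lists, one per document:

word_scores = get_word_bigram_scores(pos_words, neg_words)
best_features = set(w for w, s in
                    sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5000])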
Example 15
    def summarize(self, text):
        # get words from text
        words = word_tokenize(text)

        # filter out stop words and lower case
        words = [word.lower() for word in words if word not in self.stopwords]

        # filter non-alphameric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # stemming
        words = [self.pst.stem(word) for word in words]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:self.top_words_count]]

        # get sentences
        sentences = sent_tokenize(text)

        sentence_score = defaultdict(int)

        for i in range(len(sentences)):
            sentence = sentences[i]
            sentence_words = word_tokenize(sentence)
            sentence_words = [self.pst.stem(word).lower() for word in sentence_words if word not in self.stopwords]

            for sentence_word in sentence_words:
                if sentence_word in most_frequent:
                    sentence_score[i] += 1

        sorted_wordcounts = sorted(sentence_score.iteritems(), key=operator.itemgetter(1), reverse=True)[:self.number_of_sentences]
        summary = "\n".join([sentences[num] for num, count in sorted_wordcounts])

        return summary
def wordfreq(file):
    f = open(file, 'rU')
    raw = f.read()
    raw = raw.replace('\n', ' ')
    #raw = raw.decode('utf8')
    #tokenization
    tokens = nltk.word_tokenize(raw)
    #stopwords = stopwords.words('english') #use the NLTK stopwords
    #lower everything
    words = [w.lower() for w in tokens]
    #words_nostop = [w.lower() for w in tokens]
    #remove numbers
    words = [w for w in words if w.isalpha()]
    #words_nostop = [w for w in words_nostop if w.isalpha()]
    #encode
    words = [w.encode('utf8') for w in words]
    #words_nostop = [w.encode('utf8') for w in words if w not in stopwords]
    #remove punctuations
    words = [w.translate(None, string.punctuation) for w in words]
    #words_nostop = [w.translate(None, string.punctuation) for w in words_nostop]
    freq = FreqDist(words)
    #freq_nostop = FreqDist(words_nostop)
    sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)
    #sorted_freq_nostop = sorted(freq_nostop.items(),key = lambda k:k[1], reverse = True)
    return sorted_freq
Example 17
def termfreq(storytext, filename):
    '''
    This function takes a speech/text/article, preprocesses it into tokens, 
    removes stopwords, and outputs a csv of term counts and frequencies 
    relative to the size of the speech/text/article
    '''
    
    # Split into tokens, remove stopwords
    tokens = make.preprocess(storytext)
    stops = make.filter_stopwords(tokens)
    numstops = len(stops)    
    
    # Create a FreqDist and turn it into a list of tuples
    freq = FreqDist(stops)
    data = freq.items()[:numstops]
    
    # Build a pandas DataFrame of that list
    df = pd.DataFrame(data)
    df.columns = ['word', 'count']
    
    # Add a 'relative frequency' column to the DataFrame
    a = []
    for i in df['count']:
        a.append(float(i) / numstops)
    df['pct'] = a
    
    # Write the file to csv
    df.to_csv('%s.csv' % filename, sep=',')
    print df
    print 'Check your files for the csv!'    
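A hedged usage sketch: it assumes the helper module make exposes preprocess() and filter_stopwords() and that pandas is imported as pd, as the snippet implies; the file name is hypothetical.

termfreq("Four score and seven years ago our fathers brought forth...", "speech_term_freq")
# writes speech_term_freq.csv with columns: word, count, pct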
Example 18
def count_pos(input, language):
    if language == 'english-nltk':
        words = word_tokenize(input)
        pos = pos_tag(words)

    elif language == 'english':
        s = pattern.en.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'spanish':
        s = pattern.es.parsetree(input, relations=True, lemmata=True)
        words = []
        pos = []
        for sentence in s:
            for w in sentence.words:
                words.append(w.string)
                pos.append((w.string, clean_text.clean_pos(w.type)))

    elif language == 'dutch':
        words = word_tokenize(input, 'dutch')
        tagger = nltk.data.load('taggers/alpino_aubt.pickle')
        pos = tagger.tag(words)

    tags = FreqDist(tag for (word, tag) in pos)
    relative_frequency = []
    for item in tags.items():
        relative_frequency.append((item[0], float(item[1])/tags.N()))
    return relative_frequency
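A minimal call for the NLTK branch with a hypothetical sentence (pattern.en, pattern.es and the Dutch tagger pickle are only needed for the other branches):

rel_freqs = count_pos("The quick brown fox jumps over the lazy dog.", "english-nltk")
# -> a list of (POS tag, relative frequency) pairs, e.g. ('DT', 0.2)
#    since two of the ten tokens are determiners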
Example 20
def summary_corpus(data, column, language="english"):
    """
    Return summary info for the frequency of words in the corpus
    example: tokens, vocab, frequency_dist= summary_corpus(data= df, column= 'reviews', language="english")
    """
    tokens = [
        word for text in data[column]
        for word in word_tokenize(text, language=language)
    ]
    vocab = set(tokens)
    frequency_dist = FreqDist(tokens)

    keys, values = [], []
    for key, value in frequency_dist.items():
        keys.append(key)
        values.append(value)

    frequency_dist = {"word": keys, "frequency": values}
    frequency_dist = pd.DataFrame.from_dict(frequency_dist)
    frequency_dist.sort_values(by='frequency',
                               ascending=False,
                               inplace=True,
                               axis=0)

    print('Number of tokens in the corpus :', len(tokens))
    print('Vocabulary size                :', len(vocab))

    return tokens, vocab, frequency_dist
    def build_top_words(self):
        pos_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'pos']
        neg_reviews = [(review, c) for (review, c) in self.documents
                       if c == 'neg']

        pos_words = [token for (review, c) in pos_reviews for token in review]
        neg_words = [token for (review, c) in neg_reviews for token in review]

        fd_all = FreqDist(pos_words + neg_words)
        pos_class_words = [('pos', word) for word in pos_words]
        neg_class_words = [('neg', word) for word in neg_words]
        cfd_pos = ConditionalFreqDist(pos_class_words)
        cfd_neg = ConditionalFreqDist(neg_class_words)

        pos_word_count = len(pos_words)
        neg_word_count = len(neg_words)
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}

        for (word, freq) in fd_all.items():
            pos_score = BigramAssocMeasures.chi_sq(cfd_pos['pos'][word],
                                                   (freq, pos_word_count),
                                                   total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cfd_neg['neg'][word],
                                                   (freq, neg_word_count),
                                                   total_word_count)
            word_scores[word] = pos_score + neg_score

        best = sorted(word_scores.items(), reverse=True,
                      key=lambda x: x[1])[:1000]
        self.top_words = set([w for w, s in best])
Example 22
def findBestWords(wordsInCategories, scoreFunction=BigramAssocMeasures.chi_sq, max_words=1000):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for category, words in wordsInCategories:
        word_fd.update(words)
        label_word_fd[category].update(words)

    word_counts = {}
    for condition in label_word_fd.conditions():
        word_counts[condition] = label_word_fd[condition].N()

    total_word_count = 0
    for condition, count in word_counts.items():
        total_word_count += count

    word_scores = {}

    for word, freq in word_fd.items():
        score = 0
        for condition, count in word_counts.items():
            score += scoreFunction(label_word_fd[condition][word], (freq, word_counts[condition]), total_word_count)
        word_scores[word] = score

    best = sorted(word_scores.items(), key=lambda t: t[1], reverse=True)[:max_words]
    return set([w for w, s in best])
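A small usage sketch with hypothetical labelled token lists:

training = [("pos", ["good", "great", "good"]),
            ("neg", ["bad", "awful", "bad"])]
informative_words = findBestWords(training, max_words=3)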
Example 23
def buildcollocgraph(corpus,settings):
    u = nx.Graph()
    wordsall = []
    window = settings['window']
    stem = settings['stem']
    kblack = settings['kblack']
    cgcutoff = settings['cgcutoff']
    ncorpus = normalise_text(corpus,settings) #normalise corpus here
    for doc in ncorpus:
        words = [textacy.extract.ngrams(doc,1,filter_stops=kblack)]
        words = [t.text for word in words for t in word]

        if len(words) > cgcutoff:
            g = textacy.network.terms_to_semantic_network(words, normalize=stem, window_width=window, edge_weighting='cooc_freq')
            u.add_nodes_from(g.nodes(data=True))
            u.add_edges_from(g.edges(data=True))
            wordsall.append(words)
    wordsall = [w for wdlist in wordsall for w in wdlist]
    word_fd = FreqDist(wordsall)
    #test visualise
    #textacy.viz.network.draw_semantic_network(U, node_weights=word_fd, spread=3.0, draw_nodes=True, base_node_size=300, node_alpha=0.25, line_width=0.5, line_alpha=0.1, base_font_size=12, save=False)
    #convert networkx graph to json for d3
    for i,v in [k for k in word_fd.items()]:
        u.node[i]['freq'] = v
    graphdata = json_graph.node_link_data(u)
    graphdata['links'] = [
        {
            'source': graphdata['nodes'][link['source']]['id'],
            'target': graphdata['nodes'][link['target']]['id']
        }
        for link in graphdata['links']]
    return graphdata
Example 24
def generate_corpus(folder_name, top, n):
    '''corpus of words generated to be used as the vocabulary. The function takes top into
    account and will restrict the corpus to the top n most frequent tokens when top is True.'''

    lower = True  #activates lowercase tokens
    subfolders = [i for i in os.listdir(folder_name)]  # iterate through subfolders
    corpus_list = []
    for i in subfolders:
        for v in os.listdir(folder_name + "/" + i):
            text = open_text(i, folder_name, v, lower)
            corpus_list += [i for i in text]

    corpus_freqs = FreqDist(corpus_list)
    sorted_x = sorted(corpus_freqs.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
    if top == True:
        topn_words = {}
        for i in sorted_x[:n]:
            topn_words[i[0]] = 0
        vocabulary = list(sorted(topn_words.keys()))
        return topn_words, vocabulary  #empty  topn dictionary to be used to populate vectors and vocabulary for columns

    else:
        vocabulary = list(sorted(corpus_freqs.keys()))
        corpus = {str(i): 0 for i in sorted(vocabulary)}
        return corpus, vocabulary
Example 25
def run(rawTokenListList, numTokens, freqThreshold):
    freqDistMap = {}
    retVal = ""
    total = 0

    # Build freqDistMap [key: n-gram, value: frequency].
    for rawTokenList in rawTokenListList:
        # Group each sentence's token list into n-grams of numTokens tokens.
        ngramList = ngrams(rawTokenList, numTokens)

        # Count the frequency of each n-gram.
        freqDist = FreqDist(ngramList)

        for key, value in freqDist.items():
            # If the n-gram is already in the map, accumulate its count.
            if key in freqDistMap:
                freqDistMap[key] += value
            # Otherwise add the n-gram to the map on first occurrence.
            else:
                freqDistMap[key] = value

    # Extract n-grams whose frequency meets the threshold.
    for key in freqDistMap:
        freq = freqDistMap[key]

        if freq >= freqThreshold:
            for gram in key:
                retVal += (gram + " ")
            retVal += ("- %d\r\n" % freq)
            total += 1

    retVal = (("total: %d\r\nnumTokens: %d, freqThreshold: %d\r\n\r\n" % (total, numTokens, freqThreshold)) + retVal)

    return retVal
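A minimal usage sketch with hypothetical tokenised sentences, counting bigrams that occur at least twice across the input:

sentences = [["a", "b", "a", "b"], ["a", "b", "c"]]
report = run(sentences, numTokens=2, freqThreshold=2)
# report contains "a b - 3" plus a header line with the totals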
Example 26
    def get_notes_bigrams(data):

        # run after self.clean
        bigrams_list = list(bigrams(data))
        bigrams_fdist = FreqDist(bigrams_list)
        bigram_freqs = []
        for k,v in bigrams_fdist.items():
            bigram_freqs.append((k,v))

        sorted_bigram_freqs = sorted(bigram_freqs, key=lambda x: x[1], reverse=True)

        temp_dict = {}
        for bigram in sorted_bigram_freqs:
            if bigram[0] in temp_dict:
                temp_dict[bigram[0]] += int(bigram[1])
            else:
                temp_dict[bigram[0]] = int(bigram[1])

        dict_copy = {}
        for key in temp_dict:
            if key not in dict_copy:
                dict_copy[key] = temp_dict[key]

            for k in temp_dict:
                if (k[1],k[0]) == key:
                    dict_copy[key] += temp_dict[(k[0],k[1])]
                    del dict_copy[key]

        mod_bigram_freqs = []
        for k,v in dict_copy.items():
            mod_bigram_freqs.append((k,v))

        mod_sorted_bigram_freqs = sorted(mod_bigram_freqs, key=lambda x: x[1], reverse=True)
        # self.sorted_bigrams = mod_sorted_bigram_freqs
        return mod_sorted_bigram_freqs
Example 27
    def summarize(self,
                  article_text,
                  num_sentences=DEFAULT_SUMMARIZATION_NUMBER):

        # Get words from article
        words = word_tokenize(article_text)

        # Filter non-alphanumeric chars from words
        words = [filter(unicode.isalnum, word) for word in words]
        words = filter(lambda w: len(w) > 0, words)  # Remove empty words

        # Now lemmatize all words
        words = [
            self.lemmatizer.lemmatize(word).lower() for word in words
            if word.lower() not in self.stopwords
        ]
        word_frequencies = FreqDist(words)
        most_frequent = [word[0] for word in word_frequencies.items()[:100]]

        # Now get sentences
        sentences = self.sent_detector.tokenize(article_text)

        wordcountdict = defaultdict(int)

        for word in most_frequent:
            lem_word = self.lemmatizer.lemmatize(word).lower()
            for i in range(0, len(sentences)):
                if lem_word in sentences[i]:
                    wordcountdict[i] += 1

        sorted_wordcounts = sorted(wordcountdict.iteritems(),
                                   key=operator.itemgetter(1),
                                   reverse=True)[:num_sentences]
        return [sentences[num] for num, count in sorted_wordcounts]
Example 28
def analyze(search_path, config_file):
    """
    analyze PATH tags.json: update syntax to just take a path, ignoring tags
    """
    # load config file, if provided
    if config_file:
        cfg = load_cfg(config_file)
        if 'tags' not in cfg:
            cfg['tags'] = []
    else:
        cfg = {
            'tags': []
        }

    word_list = []
    search_glob = "{}/**".format(search_path)
    for filename in glob.iglob(search_glob, recursive=True):
        if os.path.isfile(filename):
            stem = pathlib.Path(filename).stem.lower()
            word_list += [token for token in re.split(r'\W', stem) if len(token) > 1]

    # remove stopwords and tags
    filtered_words = [word for word in word_list if word not in stopwords.words('english')]
    filtered_words = [word for word in filtered_words if word not in cfg['tags']]

    raw = " ".join(filtered_words)
    bag = nltk.word_tokenize(raw)
    freqdist = FreqDist(bag)

    words_sorted = sorted(freqdist.items(), key=lambda kv: (kv[1], kv[0]))
    top_words = words_sorted[-30:]
    top_words.reverse()
    for word in top_words:
        print("{1}: {0}".format(*word))
Example 29
    def palavrasChaves(self):
        # NLTK function that returns the English stopwords
        stopE = stopwords.words('english')

        # NLTK function that returns the Portuguese stopwords
        stop = stopwords.words('portuguese')  
              
        stopS = stopwords.words('spanish')
        
        palavrasChaves = [] 
        textoArtigo = []
        
        # strip punctuation from the text and split it into words
        for i in self.titulo.lower().replace(',','').replace('.','').replace('-','').replace('(','').replace(')','').split():
            # drop Portuguese stopwords from the article text being processed
            if i not in stop:
                # drop English stopwords from the article text being processed
                if i not in stopE:
                    # drop Spanish stopwords and ignore words shorter than 3 characters (e.g. short verbs such as "é")
                    if i not in stopS:
                            if len(i) > 2:
                                textoArtigo.append(i)
        
        # frequency of word repetitions in the body of the article
        freq = FreqDist(textoArtigo)
        
        # take the four most frequent words
        items = freq.items()[:4]
        
        # put the most frequent words of the text into palavrasChaves
        for i in range(0,len(items)):
            palavrasChaves.append(items[i][0])
            
        return palavrasChaves        
Example 30
 def top_words_from_corpus(self, num_words, test_name):
     corpus_tokens = []
     for i in self.corpus_vars["corpus_member_ids"]:
         title = 'document_' + str(i)
         doc_tokens = Library.document_instances[title].metadata["tokenized_doc"]
         corpus_tokens += doc_tokens
     top_words = []
     fdist_corpus = FreqDist(corpus_tokens)
     fdist_list = fdist_corpus.items()
     if test_name == "Function Word PCA":
         function_pos = ['IN', 'TO', 'CC', 'DT', 'PDT', 'WDT']
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 tagged_top = nltk.pos_tag(top_words)
                 for j,k in tagged_top:
                     if k not in function_pos:
                         top_words.remove(j)
                 if len(top_words) == num_words:
                     break
     elif test_name == "Burrows's Delta":
         for i in fdist_list:
             top_words.append(i[0])
             if len(top_words) == num_words:
                 break
     return top_words
Example 32
def read_all():
    f = open(filename, "r")
    raw = f.read()
    #generate tokens by jieba
    tokens = jieba.lcut(raw)

    #load chinese stop words
    stopwords = []
    cfp = open('stopwords.txt', 'r+')
    for line in cfp:
        for word in line.split():
            stopwords.append(word)
    cfp.close()

    # remove characters in chinese stop words
    wordlist_N = []
    for word in tokens:
        if word not in stopwords:
            if word not in ('\n', '―', ' ', '\u200b', '##'):
                wordlist_N.append(word)

    #generate a frequency dictionary for wordlist_N
    freq = FreqDist(wordlist_N)

    #sort the frequency list in descending order
    sorted_freq = sorted(freq.items(), key=lambda k: k[1], reverse=True)

    #write result into .txt file
    with open('withoutstopwords.txt', 'w') as f:
        for line in sorted_freq:
            if line[1] > 5:
                f.write(str(line[0]) + '\t' + str(line[1]) + '\n')
    f.close()
Example 33
 def create_vocab(self):
     All_Contents = []
     i=0
     for rest in self.corpus:
     #for hotel in self.corpus:
         print("loading file :" + str(i+1))
         for review in rest.get("Reviews"):
             #print review
             
             s= []
             try: 
                 for v in parse_to_sentence(review.get('Content'),self.stopwords):
                     s = v + s
                 All_Contents = All_Contents + s
             except:
                 print 'parsing error'
         i=i+1
     term_freq = FreqDist(All_Contents)
     Vocab = []
     Count = []
     VocabDict={}
     for k,v in term_freq.items():
         if v>5:
             Vocab.append(k)
             Count.append(v)
     self.Vocab = np.array(Vocab)[np.argsort(Vocab)].tolist()
     self.Count = np.array(Count)[np.argsort(Vocab)].tolist()
     self.VocabDict = dict(zip(self.Vocab,range(len(self.Vocab))))
Example 34
def posAnalysis(collection):

	reviews = collection.find(timeout=False)

	__reportProgress.counter = 0

	skip = 1

	for rev in reviews:
		if skip%200 == 0:
			print 'skip'+str(skip)
		__reportProgress()
		if rev.has_key('tags'):
			skip += 1
			if rev['tags'].has_key('NN'):				
				continue

		sents = sent_tokenize(rev['text'])
		tokens = [word for sent in sents for word in word_tokenize(sent)]
		pos = tagger.tag([tok for tok in tokens if tok not in ',.-$\" '])
		tag_fd = FreqDist(tag for (word, tag) in pos)
		tags = dict()
		for (key,value) in tag_fd.items():
			k = key.replace('$','S')
			out = key.translate(string.maketrans("",""), string.punctuation)
			if len(out)>0:
				tags[k] = value
		collection.update({'_id':rev['_id']},{"$set": {"tags": tags}})		
def N_keyword_evolution_by_date(corpus, D, N, exclude):
    set_exclude = set(exclude)
    files = sorted([(date_from_file_name(f), f) for f in corpus.fileids()])
    delta = datetime.timedelta(days=D)
    lower_bound = files[0][0]
    upper_bound = files[0][0] + delta
    keywords = []
    i = 0
    while lower_bound <= files[-1][0]:
        text = []
        while i < len(files) and files[i][0] < upper_bound:
            new_file = corpus.words(files[i][1])
            for j in new_file:
                text.append(j.lower())
            i += 1
        else:
            fd = FreqDist(text)
            new = []
            sort = sorted(fd.items(), key=itemgetter(1), reverse=True)
            j = 0
            while len(new) < N:
                if not sort[j][0] in set_exclude:
                    new.append(sort[j][0])
                j += 1
        keywords.append(new)
        lower_bound = upper_bound
        upper_bound = upper_bound + delta
    return keywords
Example 37
def extract_doc_feats_counts(refactorized_documents):
    from nltk import FreqDist
    from collections import defaultdict
    import itertools
    import math
    import pdb
    import numpy

    doc_num = len(refactorized_documents)

    ref_docs_flat = list(itertools.chain.from_iterable(refactorized_documents))
    glob_freqs = FreqDist(ref_docs_flat)

    tokens = glob_freqs.samples()

    glob_features = [None] * doc_num  # pre-allocate one feature row per document
    for i in range(0, doc_num):
        doc_features = [0] * len(tokens)
        doc_freqs = FreqDist(refactorized_documents[i])

        for (tok, freq) in doc_freqs.items():
            indx = tokens.index(tok)
            doc_features[indx] = freq * doc_freqs.N()

        f_tmp = numpy.asarray(doc_features)
        glob_features[i] = f_tmp.tolist()

    return (glob_features, tokens)
def N_most_freq_words(corpus, N):
    tokens = read_all_corpus(corpus)
    fdist = FreqDist([token.lower() for token in tokens])
    return [
        a
        for a, b in sorted(fdist.items(), key=itemgetter(1), reverse=True)[:N]
    ]
 def __init__(self, tokens):
     frequency = FreqDist(tuple(trigrams(tokens)))
     self.trigram_freq = defaultdict(dict)
     for head_n_tail, num in frequency.items():
         head1, head2, tail = head_n_tail
         head_tup = (head1, head2)
         del head1, head2
         self.trigram_freq[head_tup][tail]: int = num
Example 40
def trigramCalc(data):
	trigram=ngrams(data,3)
	freqDistTrigram=FreqDist(trigram)
	trigramCount={}        
	for k,v in freqDistTrigram.items():
		trigramCount[k[0] +" "+ k[1] +" "+ k[2]]=v

	return trigramCount	
Example 41
def count_bigrams(words):
    bigram_counts = []
    wordFreq = FreqDist(bigrams(words))
    for bigram, count in wordFreq.items():
        printable_bigram = (str(bigram[0]) + " " + str(bigram[1])).replace(
            ',', ' ')
        bigram_counts.append((printable_bigram, count))
    return bigram_counts
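A quick illustration with a hypothetical token list (assumes bigrams and FreqDist are imported from nltk):

count_bigrams(["to", "be", "or", "not", "to", "be"])
# -> [('to be', 2), ('be or', 1), ('or not', 1), ('not to', 1)] (ordering not guaranteed)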
Example 42
def bigramCalc(data):
	bigram=ngrams(data,2)
	freqDistBigram=FreqDist(bigram)
	bigramCount={}
	for k,v in freqDistBigram.items():     
		bigramCount[k[0] +" "+k[1]]=v
	
	return bigramCount
Example 43
def unigramCalc(data):
	unigram=ngrams(data,1)
	freqDistUnigram=FreqDist(unigram)
	unigramCount={}
	for k,v in freqDistUnigram.items():     
		unigramCount[k[0]]=v
	
	return unigramCount
Example 44
def get_features(document):
    document = re.sub('[%s]' % re.escape(string.punctuation), '', document) # removes punctuation
    document = document.lower() # make everything lowercase
    all_words = [w for w in word_tokenize(document) if len(w) > 3 and len(w) < 16]
    p = PorterStemmer()
    all_words = [p.stem(w) for w in all_words]
    all_words_freq = FreqDist(all_words)
    print sorted(all_words_freq.items(), key=lambda(w,c):(-c, w))
    return all_words_freq
Example 45
def get_probs(filename):
    """read the given text and calculate the probabilities for all symbols."""
    with open(filename) as file_in:
        text = file_in.read()
    probs = FreqDist(text)
    count_sum = sum(v for v in probs.values())
    for k,v in probs.items():
        probs[k] = v * 1.0 / count_sum
    return probs
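A hedged sanity check with a hypothetical file name: after the loop, the per-symbol values should sum to roughly 1.0.

probs = get_probs("some_text_file.txt")
assert abs(sum(probs.values()) - 1.0) < 1e-9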
    def __extract_level_words(self, levels_db, level, values):
        words_number_per_value = self.__configuration_map["most_frequent_words_number_per_value"]
        most_freq_words = {}
        for value in values:
            fdist = FreqDist()
            for word_dist in levels_db[level][value]:
                fdist.inc(word_dist[0], count = word_dist[1])

            most_freq_words[value] = fdist.items()[:words_number_per_value]
        return most_freq_words
Example 47
def return_freq_types(list_types, list_words):
    """
    returns the 10 most frequent non-stopword words and the 10 most frequent types
    """
    fd = FreqDist(list_types)
    agglomerated = defaultdict(int)
    for w in list_words:
        if not w.lower() in STOPWORDS:
            agglomerated[w] += 1
    sorted_dict = sorted(agglomerated.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_dict[:10], [t for t, freq in fd.items()[:10]]
def process_report():
    tokenizer = RegexpTokenizer(r'\w+')
    f = open('scraper.txt','r')
    textfile = unicode(f.read(),errors='ignore')
    words = tokenizer.tokenize(textfile)
    filtered_words = [w for w in words if not w in stopwords.words('english')]
    fdist = FreqDist(filtered_words)
    with open("report.csv", "wb") as fp:
        writer = csv.writer(fp, quoting=csv.QUOTE_ALL)
        writer.writerows(fdist.items())
    return "Wrote report"
Example 49
    def keywords(self, query):
        final_text = u' '.join(query)

        resultKE = self.key_extractor._fetch_text(final_text)
        resultFE = self.bg_extractor._fetch_text(final_text)
        keywordsFE = [u' '.join(w).lower() for w in resultFE for idx in range(self.bg_extractor.n_best)]
        keywordsFE += resultKE

        keywords = FreqDist(w.lower() for w in keywordsFE)
        return {'response': {'language': '',
                    'keywords': keywords.items(), 'text': query}}
Example 50
	def Bigrams(self,i,words):
		Frecs_BG=self.FrecArray_bigrams
		# < bigrams > #
		bgs = nltk.bigrams(words)
		word_frequencies = FreqDist(bgs)
		most_frequent = [word[0] for word in word_frequencies.items()]
		for w in most_frequent:
			W=" ".join(w)
			if not Frecs_BG.has_key(W):
				Frecs_BG[W]=[]
			idpub_occs=i+"*"+`word_frequencies[w]`
			Frecs_BG[W].append(idpub_occs)
Example 51
	def Monograms(self,i,words):
		lmtzr = WordNetLemmatizer()
		Frecs_MG=self.FrecArray_monograms
		# < monograms > #
		words=[lmtzr.lemmatize(word) for word in words]
		word_frequencies = FreqDist(words)
		most_frequent = [word[0] for word in word_frequencies.items()]#[:10]]
		for w in most_frequent:
			if not Frecs_MG.has_key(w):
				Frecs_MG[w]=[]
			idpub_occs=i+"*"+`word_frequencies[w]`
			Frecs_MG[w].append(idpub_occs)
def frequency(in_file, out_file):
    """Input: a text file
    Output: a table of word frequency with three columns for Word, Count and Percent frequency
    """
    text = unicode(open(in_file, 'r').read(), errors='ignore')
    words = nltk.word_tokenize(text)
    frequency = FreqDist(words)
    total = float(frequency.N())
    output = open(out_file, 'w')
    output.write("Word\tCount\tPercent\n")
    for pair in frequency.items():
        output.write("{pair[0]}\t{pair[1]}\t{pc:.2f}\n".format(pair=pair, pc=100*pair[1]/total))
    output.close()
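A minimal usage sketch with hypothetical file names; as written the function targets Python 2, since it calls unicode():

frequency("speech.txt", "speech_freq.tsv")
# speech_freq.tsv gets three tab-separated columns: Word, Count, Percent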
Example 53
def bestWords():
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    reviews = product_reviews_1.reviews()
    reviewlines = []
    for review in reviews:
        for line in review.review_lines:
            reviewlines.append(line)

    featlines = [line for line in reviewlines if len(line.features) > 0]
    pluswords = []
    minuswords = []
    for line in featlines:
        plus = False
        minus = False
        for feat in line.features:
            if feat[1][0] == "+":
                plus = True
            elif feat[1][0] == "-":
                minus = True
        if plus:
            for word in line.sent:
                pluswords.append(word)
        if minus:
            for word in line.sent:
                minuswords.append(word)

    for word in pluswords:
        word_fd[word.lower()] += 1
        label_word_fd['+'][word.lower()] += 1

    for word in minuswords:
        word_fd[word.lower()] += 1
        label_word_fd['-'][word.lower()] += 1

    pos_word_count = label_word_fd['+'].N()
    neg_word_count = label_word_fd['-'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}

    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['+'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['-'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=(lambda s: s[1]), reverse=True)[:515]
    return set([w for w, s in best])
def mapper(key,value):
    sentence = value.split()
    for (index, tagtuple) in enumerate(sentence):
        token, tag = get_token_tag(tagtuple)
        if we_like(token, tag):
            fd = FreqDist()
            token = token.lower()
            window = sentence[index+1:index+5]
            for windowtuple in window:
                wtoken, wtag = get_token_tag(windowtuple)
                if we_like(wtoken, wtag):
                    wtoken = wtoken.lower()
                    fd.inc(wtoken)
            yield token, tuple(fd.items())
Example 55
 def keywords(self, query):
     if query.startswith('www') or query.startswith('http'):
         text = self.html_extractor.extract(query)['response']['text']
         result = self._fetch_text(text)
         for r in result:
             print r
         keywords = result
         return {'response': {'language': '',
                 'keywords': keywords, 'text': query}}
     else:
         result = self._fetch_text(query)
         keywords = result
         keywords = FreqDist(w.lower() for w in keywords)
         return {'response': {'language': '',
                 'keywords': keywords.items(), 'text': query}}
Example 56
    def FreqDistHit(self):
        db = self.getHitSet()

        fdist = FreqDist(db)
        
        print "FreqDistVote. The number"
        print "========================"
        print len(fdist.items())
        print "\n\n"

        print "FreqDistVote. Top votes"
        print "========================"
        print repr(fdist.items()[:50])
        print "\n\n"

        print "FreqDistVote. Most votes"
        print "========================"
        print repr(sorted(fdist.keys()[-50:], reverse=True))
        print "\n\n"

        print "FreqDistVote. Most votes and frequence"
        print "======================================"
        print repr(sorted(fdist.items()[-50:], reverse=True))
        print "\n\n"
Example 57
def get_trigrams_freqdist(tokens):
    tri_grams = trigrams(tokens)
    print 'Returned trigrams'

    freq_dist_trigrams = FreqDist(tri_grams)
    print freq_dist_trigrams.most_common(10)

    freq_dist_trigrams_new = dict()
    for item in freq_dist_trigrams.items():
        temp_str = item[0]
        temp_key = temp_str[0] + ' ' + temp_str[1] + ' ' + temp_str[2]
        freq_dist_trigrams_new[temp_key] = item[1]
    freq_dist_trigrams_new = OrderedDict(sorted(freq_dist_trigrams_new.items(), key=lambda x: x[1], reverse=True))

    return freq_dist_trigrams_new
    def __call__(self, key, value):
        sent = value.split()
        for idx, tagged in enumerate(sent):
            token, tag = self.split_tagged(tagged)

            if self.valid(token, tag):
                dist   = FreqDist()
                window = sent[idx+1:idx+5]

                for wtagged in window:
                    wtoken, wtag = self.split_tagged(wtagged)

                    if self.valid(wtoken, wtag):
                        dist.inc(wtoken)

                yield token, tuple(dist.items())
Example 59
def freq(tokens, n=None, prints=None):
    '''
    This function takes a list of tokens and returns a list of the top n most 
    frequent tokens
    
    It also prints a frequency distribution of the top 50 tokens
    '''
    fdist2 = FreqDist(tokens)
    fdist2.plot(50, cumulative=True)
    [i[0] for i in fdist2.items()[:20]]
    if prints == 'yes':
        if n is None:    
            print fdist2.items()[:20]
            return [i[0] for i in fdist2.items()[:20]]
        else:
            print fdist2.items()[:n]
            return [i[0] for i in fdist2.items()[:n]]
    else:
        if n is None:
            return [i[0] for i in fdist2.items()[:20]]
        else:
            return [i[0] for i in fdist2.items()[:n]]
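A short usage sketch with a hypothetical token list: this pops up the cumulative plot and, under NLTK 2.x where FreqDist.items() is frequency-sorted, returns the n most frequent tokens.

top_tokens = freq("to be or not to be that is the question".split(), n=5, prints=None)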