Example #1
def get_list_tuples(read_file):

    stop = set(stopwords.words('english'))

    list_tuples = []
    with open(read_file, 'r') as r:
        reader = csv.reader(r, delimiter=',')
        x = 0
        for line in reader:
            #tabsep = line.strip().split('\r')
            #tabsep = line.strip().split('')
            msg = TextBlob(line[1])
            msg.ngrams(n=2)  # bi-grams (result is computed but not used here)

            try:
                words = msg.words
            except Exception:
                continue
            for word in words:
                #if word not in stopwords.words() and not word.isdigit():
                if word not in stop and not word.isdigit():
                    word = word.lower()
                    #list_tuples.append((word.lower(),line[0]))

                    list_tuples.append((stemmer.stem(word), line[0]))
                    #list_tuples.append((lmtzr.lemmatize(word),line[0]))
                x += 1

        return list_tuples
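As written, Example #1 relies on module-level names (csv, TextBlob, stopwords, stemmer, and optionally lmtzr) defined elsewhere in its source file. A minimal sketch of the setup it appears to assume:

# Hypothetical setup for Example #1 (not part of the original snippet)
import csv

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob

stemmer = PorterStemmer()      # used as stemmer.stem(word)
lmtzr = WordNetLemmatizer()    # only needed for the commented-out lemmatizer line

# Each CSV row is expected to look like (label, message)
# list_tuples = get_list_tuples("labeled_messages.csv")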
Example #2
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(
        TextFeature('Number of sentences',
                    textstat.sentence_count(no_code_text), group_by))
    results.append(
        TextFeature('Number of sentences (again)', len(tb.sentences),
                    group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(
        TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(
        TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity,
                    group_by))
    results.append(
        TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(
        TextFeature('Number of important phrases', len(tb.noun_phrases),
                    group_by))
    results.append(
        TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(
        TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(
        TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
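_get_detailed_stats depends on a TextFeature container and the textstat package, neither of which is shown here; note also that TextBlob's detect_language() has been deprecated in recent TextBlob releases. A hedged stand-in setup could look like this:

# Hypothetical dependencies for _get_detailed_stats (stand-ins, not the project's real classes)
from collections import namedtuple

import textstat
from textblob import TextBlob

TextFeature = namedtuple('TextFeature', ['name', 'value', 'group_by'])

# stats = _get_detailed_stats("TextBlob makes text processing simple. It is built on NLTK.")
# for feature in stats:
#     print(feature.name, feature.value)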
Example #3
def get_blob_messages(message):
    blob = TextBlob(message)

    if len(blob.words) > 1:
        messages = blob.ngrams(n=2) + blob.ngrams(n=3)
    else:
        messages = blob.ngrams(n=1)

    return messages
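A quick usage sketch for get_blob_messages (assuming from textblob import TextBlob is in scope): multi-word input yields bigrams plus trigrams, while a single word falls back to unigrams.

from textblob import TextBlob  # assumed import

print(get_blob_messages("free entry to win a prize"))  # bigrams followed by trigrams
print(get_blob_messages("hello"))                      # single word -> unigrams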
def topwords(renoted_id):


    default_stopwords = set(nltk.corpus.stopwords.words('english'))

# We're adding some on our own - could be done inline like this...
# ... but let's read them from a file instead (one stopword per line, UTF-8)
#stopwords_file = './stopwords.txt'
#custom_stopwords = set(codecs.open(stopwords_file, 'r', 'utf-8').read().splitlines())

#all_stopwords = default_stopwords | custom_stopwords
    all_stopwords = default_stopwords
    filepath="alldocs/processed/"+renoted_id+".txt"


    fp = codecs.open(filepath, 'r', 'utf-8')
    wiki=TextBlob(fp.read())
    print wiki.noun_phrases
    print wiki.ngrams(n=2)
    words = nltk.word_tokenize(fp.read())

# Remove single-character tokens (mostly punctuation)
    words = [word for word in words if len(word) > 1]

# Remove numbers
    words = [word for word in words if not word.isnumeric()]

# Lowercase all words (default_stopwords are lowercase too)
    words = [word.lower() for word in words]

# Stemming words seems to make matters worse, disabled
# stemmer = nltk.stem.snowball.SnowballStemmer('german')
# words = [stemmer.stem(word) for word in words]
# Remove stopwords
    words = [word for word in words if word not in all_stopwords]

    # Lemmatize words
    lmtzr = WordNetLemmatizer()
    # port = PorterStemmer()
    words = [lmtzr.lemmatize(word) for word in words]
    # words = [port.stem(word) for word in words]
# Calculate frequency distribution
    fdist = nltk.FreqDist(words)
    retdict = []
    topwords = {}

    # Output top 50 words
    for word, frequency in fdist.most_common(50):
        if len(word) > 3:
            value = {"word": word, "frequency": frequency}
            retdict.append(value)
    topwords["topwords"] = retdict
    return topwords
Example #5
def get_joke_location(caption, full=False):
    """
    Parameters
    ----------
    caption : str

    Returns
    -------
    stats : dict
        Dictionary of descriptive stats. Includes the key
        ``joke_quarter`` (an integer from 1 to 4) giving the quarter of the
        caption in which the joke appears.
    """
    blob = TextBlob(caption)
    ngrams = [ngram for ngram in blob.ngrams(n=4)]
    if len(ngrams) <= 2:
        ngrams = [ngram for ngram in blob.ngrams(n=2)]
    if len(ngrams) == 0:
        return {}
    perplexities = [perplexity(" ".join(ngram)) for ngram in ngrams]
    perplexities = np.array(perplexities)
    idx = np.argmin(perplexities)

    # Between 1 and 4
    phrase = " ".join(ngrams[idx])
    word = str(ngrams[idx][-1])
    joke_words = 1
    # idx += 3 # because 4-gram
    frac = idx / (len(ngrams))
    joke_quarter = int(frac * 4) + 1

    doc = nlp(caption,
              disable=["tagger", "ner", "entityrecognizer", "textcat"])
    noun_phrases = [str(x) for x in doc.noun_chunks]
    if any(word in bnp for bnp in noun_phrases):
        idx = [i for i, bnp in enumerate(noun_phrases) if word in bnp][0]
        joke_words = len(noun_phrases[idx].split(" "))
        phrase = noun_phrases[idx]

    kwargs = ({
        "word": "".join(word),
        "phrase": phrase,
        "noun_phrases": noun_phrases
    } if full else {})
    return {
        "joke_quarter": joke_quarter,
        "joke_words": joke_words,
        "min_perplexity": perplexities.min().item(),
        "max_perplexity": perplexities.max().item(),
        "mean_perplexity": perplexities.mean().item(),
        "median_perplexity": np.median(perplexities).item(),
        **kwargs,
    }
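get_joke_location relies on a perplexity scoring function, a loaded spaCy pipeline named nlp, and NumPy, none of which appear in the snippet. A hedged sketch of that setup, with a trivial stand-in for perplexity:

# Hypothetical setup for get_joke_location (stand-ins, not the original project's code)
import numpy as np
import spacy
from textblob import TextBlob

nlp = spacy.load("en_core_web_sm")  # assumed model name

def perplexity(phrase):
    # stand-in: the original project scores phrases with a language model
    return float(len(phrase.split()))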
Example #6
def headingsdata(tagtype, soup):
    for element in soup.select(tagtype):
        value = {}
        text = element.get_text()
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        wiki = TextBlob(text)
        value["partofspeech"] = wiki.tags
        value["bigrams"] = wiki.ngrams(n=2)
        value["trigrams"] = wiki.ngrams(n=3)
        value1 = {}
        value1[tagtype] = value
        return value1
def check_speech_patterns(text):
	PATTERNS={
		("PRP","DT"),
		("CC","VBD"),
		("VB","RB"),
		("VB","PRP$"),
		("NN","POS"),
		("NN","MD","VB"),
		("VB","PRP$","NN"),
		("MD","VB","VBN"),
		("NN","IN","PRP$"),
		("IN","PRP$","JJ"),
		("VB","PRP","DT","NN"),
		("VBD","RB","JJ","NNS"),
		("NNP","NNP","NNP","NNP"),
		("PRP$","NN","CC","PRP"),
		("NNP", "NNP", "NNP", "NNP", "NNP"), 
		("NN", "IN", "DT", "NNS", "IN"),
		("PRP$", "NN", "IN", "DT", "NN"),
		("IN", "DT", "NN", "WDT", "VBZ"),
		("NN", "IN", "PRP$", "JJ", "NN"),
		("DT", "NN", "IN", "NN", "NN")
	}
	blob= TextBlob(text)
	for i in range (2,6):
		ngrams=blob.ngrams(n=i)
		for gram in ngrams:
			str_gram=" ".join(gram)
			gram_blob=TextBlob(str_gram)
			tags=gram_blob.tags
			lst1, lst2 = zip(*tags)
			if lst2 in PATTERNS:
				return True
	return False
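A small usage sketch for check_speech_patterns (assumes the same from textblob import TextBlob import): it returns True as soon as any 2- to 5-gram of the text is tagged with one of the listed part-of-speech sequences.

# Usage sketch; the exact result depends on how the POS tagger labels each n-gram
sample = "My dog's toy can be found under the table"
print(check_speech_patterns(sample))   # True or False depending on tagger output
print(check_speech_patterns("Hi"))     # False: a single word yields no 2..5-grams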
Example #8
def main():
    """."""
    args = parse_arguments()
    # read the string from the playbook to get the actual value of the argument
    string = tcex.playbook.read(args.string)
    n_gram_number = int(tcex.playbook.read(args.n_gram))

    tcex.log.info('String value: {}'.format(string))
    tcex.log.info('n-gram number: {}'.format(n_gram_number))

    blob = TextBlob(string)

    tags = dict()
    for tag in blob.tags:
        tags[tag[0]] = tag[1]

    tcex.playbook.create_output('json', blob.json)
    tcex.playbook.create_output(
        'nGrams', [str(n_gram) for n_gram in blob.ngrams(n=n_gram_number)])
    tcex.playbook.create_output('nounPhrases', blob.noun_phrases)
    tcex.playbook.create_output('npCounts', blob.np_counts[1])
    tcex.playbook.create_output('polarity', blob.polarity)
    tcex.playbook.create_output('sentences',
                                [str(sentence) for sentence in blob.sentences])
    tcex.playbook.create_output('subjectivity', blob.subjectivity)
    tcex.playbook.create_output('tags', tags)
    tcex.playbook.create_output('tokens', blob.tokens)
    tcex.playbook.create_output('wordCounts', blob.word_counts[1])
    tcex.playbook.create_output('words', blob.words)

    tcex.exit(0)
def sentiment_pattern(text, gram_n=6):
	blob= TextBlob(text)
	ngrams=blob.ngrams(n=gram_n)
	sentiment_list=[]
	datalist = []
	for gram in ngrams:
		str_gram=" ".join(gram)
		print str_gram
		data = (0, 0, str_gram, None)
		datalist.append(Datapoint(*data))

		#gram_blob=TextBlob(str_gram)
		#sentiment=gram_blob.sentiment[0]
		#if sentiment>0:
		#	sentiment=1
		#elif sentiment<0:
		#	sentiment=-1
		#sentiment_list.append(sentiment)

	predictor = pickle.load(open("predictor.pickle", "rb" ) )
	prediction = predictor.predict(datalist)

	for sentiment in prediction:
		sentiment = int(sentiment)
		if sentiment < 2: sentiment_list.append(-1)
		if sentiment == 2: sentiment_list.append(0)
		if sentiment > 2: sentiment_list.append(1)

	print sentiment_list

	return sentiment_list
Example #10
def get_tupels(text):
    lower = text.lower()
    blob = TextBlob(lower)
    ngrams = blob.ngrams(n=2) # assumption: don't is two words (do n't), as in "do not"
                              # this can be easily changed by modifying the tokenizer
                              # http://stackoverflow.com/questions/30550411
    tuples = map(tuple,map(tuple, ngrams))
    return tuples
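Usage sketch for get_tupels; note that in Python 3 map is lazy, so wrap the result in list() before printing.

from textblob import TextBlob  # assumed import

pairs = list(get_tupels("Don't stop believing"))
print(pairs)  # e.g. [('do', "n't"), ("n't", 'stop'), ('stop', 'believing')]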
Example #11
def build_ngrams(text, language="en"):
    blob = TextBlob(lower(text, language))
    ngrams = [blob.ngrams(n=n) for n in (3, 2, 1)]
    wordlists = reduce(operator.add, ngrams)
    tokenized = (tokenize(wordlist, language, stem=True)
                 for wordlist in wordlists)
    pure = (tokenize(wordlist, language, stem=False) for wordlist in wordlists)
    return itertools.chain(tokenized, pure)
def get_ngrams(sent_annotations, is_test):
    # tokens = [w.token for w in sent_annotations]
    tokens = [w.lemma for w in sent_annotations]
    sentence = ' '.join(tokens).decode('utf-8', 'ignore')
    blob = TextBlob(sentence)
    unigrams = tokens
    bigrams = blob.ngrams(n=2)
    trigrams = blob.ngrams(n=3)
    unigram_dict = defaultdict(int)
    bigram_dict = defaultdict(int)
    trigram_dict = defaultdict(int)
    for unigram in unigrams:
        unigram_dict[unigram] = 1
    for bigram in bigrams:
        bigram_dict['_'.join(bigram)] = 1
    for trigram in trigrams:
        trigram_dict['_'.join(trigram)] = 1
    return unigram_dict, bigram_dict, trigram_dict
 def generate_ngram(self, text, max_ngram):
     result = []
     word = ''
     for i in range(1, max_ngram):
         blob = TextBlob(text)
         ngram_var = blob.ngrams(n=i)
         word = ' '.join(ngram_var[0])
         result.append(word)
     return result
Example #14
def Ngrams(strSeq, n=2):
    strSeq_blob = TextBlob(" ".join([word for word in strSeq]))
    seq_grams = strSeq_blob.ngrams(n)
    grammed_words = ["".join([w for w in sentence]) for sentence in seq_grams]

    # grammed_strSeq = [strSeq[i] + strSeq[i+1] for i in range(0, len(strSeq)-1)]
    # print(grammed_strSeq)

    return seq_grams, grammed_words
def sentiment_reviews(reviews, gram_n=5, predictor=None):
	datalist = []

	tag = []
	counttag = [0] * len(reviews)
	for (i, review) in enumerate(reviews):
		blob = TextBlob(review)
		ngrams=blob.ngrams(n=min(gram_n, len(blob.words)))
		
		for gram in ngrams:
			str_gram=" ".join(gram)
			data = (0, 0, str_gram, None)
			datalist.append(Datapoint(*data))
			tag.append(i)
			counttag[i] += 1

	print "start prediction"

	prediction = predictor.predict(datalist)

	cstm = [[0] * 5 for x in reviews]
	for (i, sentiment) in enumerate(prediction):
		sentiment = int(sentiment)
		cstm[tag[i]][sentiment] += 1.0 / counttag[tag[i]]

	trating = 0.0
	tcount = 0.0

	for i in range(len(reviews)):
		if counttag[i] == 0:
			continue
			
		cstm[i][2] = cstm[i][2] / math.pow(counttag[i], 0.44)
		cstm[i][0] = cstm[i][0] * math.pow(counttag[i], 0.22)
		cstm[i][3] = cstm[i][3] * math.pow(counttag[i], 0.22)
		rating = 0.0
		count = 0.0
		for j in range(5):
			rating += (j + 1) * cstm[i][j]
			count += cstm[i][j]

		print cstm[i], " ", counttag[i]

		t = 1 / (1 + math.exp(-(cstm[i][2] / count - 0.45) * 15))
		print cstm[i][2] / count
		trating += rating / count * (1 - t)
		tcount += 1 - t

	trating = trating / tcount
	if trating > 3:
		x = trating - 3
		x = math.pow(x, 0.4647) * 1.4492
		return x + 3
	else:
		x = 3 - trating
		x = math.pow(x, 0.4647) * 1.4492
		return 3 - x
 def concept_extr(temp):
     temp1 = TextBlob(temp)
     sample = []
     for i in range(5):
         for ngram in temp1.ngrams(i):
             sample.append(" ".join(ngram))
             sample.append(" ".join(ngram.lemmatize()))
         sample = list(dict.fromkeys(sample))
     return (sample)
def GetBigrams(text):
    blob = TextBlob(text)
    WordLists = blob.ngrams(n = 2)
    Bigrams = []
    for wordlist in WordLists:
        cstr = ''
        for word in wordlist:
            cstr = cstr + word + "_"
        Bigrams.append(cstr)
    return Bigrams
Example #18
    def get_ngrams(string, size=3):
        blob = TextBlob(string)

        sentences = []
        ngrams = blob.ngrams(n=size)
        for ngram in ngrams:
            sentences.append([x for x in ngram])

        return ngrams
def GetBigrams(text):
    blob = TextBlob(text)
    WordLists = blob.ngrams(n=2)
    Bigrams = []
    for wordlist in WordLists:
        cstr = ''
        for word in wordlist:
            cstr = cstr + word + "_"
        Bigrams.append(cstr)
    return Bigrams
Example #20
def build_ngrams(text, language='en'):
    blob = TextBlob(lower(text, language))
    ngrams = [blob.ngrams(n=n) for n in (3, 2, 1)]
    wordlists = reduce(operator.add, ngrams)
    tokenized = (
        tokenize(wordlist, language, stem=True)
        for wordlist in wordlists)
    pure = (
        tokenize(wordlist, language, stem=False)
        for wordlist in wordlists)
    return itertools.chain(tokenized, pure)
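Both copies of build_ngrams (Examples #11 and #20) assume helpers (lower, tokenize) plus reduce, operator, and itertools that are not shown. A minimal hedged stand-in that makes the function importable:

# Hypothetical helpers for build_ngrams (placeholders for the project's real ones)
import itertools
import operator
from functools import reduce

from textblob import TextBlob

def lower(text, language):
    # placeholder for a language-aware lowercasing helper
    return text.lower()

def tokenize(wordlist, language, stem=False):
    # placeholder for a stemming/plain tokenizer helper
    return " ".join(wordlist)

# tokens = list(build_ngrams("Simple is better than complex", language="en"))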
Example #21
def get_ngrams(sent_annotations, is_test):
    tokens = [w.token for w in sent_annotations]
    sentence = ' '.join(tokens).decode('utf-8', 'ignore')
    blob = TextBlob(sentence)
    unigrams = tokens
    bigrams = blob.ngrams(n=2)
    trigrams = blob.ngrams(n=3)
    unigram_dict = defaultdict(int)
    bigram_dict = defaultdict(int)
    trigram_dict = defaultdict(int)
    global UNIGRAM_DICT, BIGRAM_DICT, TRIGRAM_DICT
    for unigram in unigrams:
        unigram_dict[unigram] = 1
        UNIGRAM_DICT[unigram] += 1
    for bigram in bigrams:
        bigram_dict['_'.join(bigram)] = 1
        BIGRAM_DICT['_'.join(bigram)] += 1
    for trigram in trigrams:
        trigram_dict['_'.join(trigram)] = 1
        TRIGRAM_DICT['_'.join(trigram)] += 1
    return unigram_dict, bigram_dict, trigram_dict
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
def extract_trigrams(client):
	documents = client['cornell']['documents']

	for doc in documents.find():
		blob = TextBlob(doc['text'])
		valid_trigrams = []
		for s in blob.sentences:
			sentence = TextBlob(s.dict['raw'])
			sentence = TextBlob(sentence.parse())
			trigrams = sentence.ngrams(n=3)
			valid_trigrams = valid_trigrams + get_valid_trigrams(trigrams)
		documents.update({'name':doc['name']},{'$set':{'trigrams':valid_trigrams}})
def n_gram(tweets, n, stop_words):
    """
    Produces an list of highest frequency N-grams along with their count
    
    Arguments:
    tweets (DataFrame): tweets DataFrame for a certain candidate
    n (int): number of words for n gram    
    stop_words (set): set of words that are not used for making the N-gram
    
    Returns:
    max_grams (list): list of highest frequency N-grams
    """

    assert isinstance(tweets, pd.DataFrame)
    assert isinstance(n, int)
    assert isinstance(stop_words, set)

    tweet_list = ""
    for tweet in tweets.loc[:, 'text']:
        tweet_list += tweet
        tweet_list += " "

    tweet_list = remove_punctuation(tweet_list)
    tweets_analysis = TextBlob(tweet_list)

    grams = tweets_analysis.ngrams(n=n)

    gram_counter = collections.Counter()

    for words in grams:
        words_list = list(words)
        word = " ".join(words_list)
        gram_counter[word] += 1

    gram_dict = dict(gram_counter)

    num_grams = 100
    max_grams_list = []

    for i in range(num_grams):
        current_max = max(gram_dict.items(), key=operator.itemgetter(1))
        max_grams_list.append(current_max)
        del gram_dict[current_max[0]]

    max_grams = copy.deepcopy(max_grams_list)

    for gram in max_grams_list:
        words = gram[0].split(" ")
        for word in words:
            if word in stop_words and gram in max_grams:
                max_grams.remove(gram)

    return max_grams
Example #25
def getNGram(text, n, polarity):
    key_words = []
    text = TextBlob(text)
    for word in text.ngrams(n):
        word = ' '.join(word)
        wordBlob = TextBlob(word)
        if wordBlob.sentiment.polarity > 0.5 and polarity > 0.2:
            if str(wordBlob) not in key_words:
                key_words.append(str(wordBlob))
        if wordBlob.sentiment.polarity < -0.5 and polarity < -0.2:
            if str(wordBlob) not in key_words:
                key_words.append(str(wordBlob))
    return key_words
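Usage sketch for getNGram: it keeps n-grams whose own polarity agrees strongly with the overall polarity passed in (assumes from textblob import TextBlob).

from textblob import TextBlob  # assumed import

text = "I absolutely love this wonderful product, but the delivery was awful"
overall = TextBlob(text).sentiment.polarity
print(getNGram(text, 2, overall))  # bigrams with polarity > 0.5 if overall > 0.2, etc.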
Example #26
def analyze_sentiment(filename, verbose=False):
    """Performs sentiment analysis using textblob.
    filename -- txt file containing story transcript
    (first line is assumed to be author's name)
    """

    with open(filename, "r") as f:
        author = f.readline().rstrip()
        transcript = TextBlob(f.read())

    # Keep a list of polarity and subjectivity
    polx, poly = [], []
    suby = []

    num_words = len(transcript.words)  # Number of words in transcript
    num_windows = 1000  # Number of windows to analyze sentiment from
    seg_length = num_words - num_windows

    # Print out variables
    if verbose:
        print(f"""Number of Words: {num_words} \n
        Segment Length: {seg_length} \n
        Number of Windows: {num_windows}""")

    # Conduct the sentiment analysis!! We do this according to Reagan's method of gathering all the
    # words in a sliding window of the text. Each window is analyzed as a whole for sentiment.
    print(f"Conducting SC Analysis on {author}...")
    # Let's keep track of how long this takes; bigger files may become a problem
    startTime = time.time()

    # We use the TextBlob ngrams() function to retrieve all the possible windows of our
    # specified segLength in the transcript
    for index, window in enumerate(transcript.ngrams(seg_length)):
        poly.append(TextBlob(" ".join(window)).sentiment.polarity)
        polx.append(len(poly))

        suby.append(TextBlob(" ".join(window)).sentiment.subjectivity)

        # Report progress during analysis
        if verbose:
            if index % 250 == 0:
                print("Finished window {}/{} [{:.2f}%]".format(
                    index, num_windows, index / num_windows))

    # storing as np array is easier to write to csv
    results = np.stack([polx, poly, suby]).transpose()

    # Report how long the analysis took
    print("SC Analysis Runtime:", round(time.time() - startTime, 2), "seconds")

    return author, results
Example #27
def check_for_name(text):
    b = TextBlob(text)
    pairs = b.ngrams(n=2)
    for pair in pairs:
        composed_sentence = ''
        for word in pair:
            composed_sentence += word.lower()
            composed_sentence += ' '
        composed_sentence_trimmed = composed_sentence.strip()
        if composed_sentence_trimmed == 'my name':
            for word, part_of_speech in b.pos_tags:
                if part_of_speech == 'NNP':
                    return 0.91, 'Hello {0}'.format(word), 'giggling'
    return 0, None, None
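Usage sketch for check_for_name (assumes from textblob import TextBlob): it looks for the bigram "my name" and then answers with the first token tagged NNP.

print(check_for_name("Hi, my name is Alice"))  # may return (0.91, 'Hello Alice', 'giggling')
print(check_for_name("how are you"))           # (0, None, None)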
Example #28
def check_for_suicide(text):
    b = TextBlob(text.replace("'", ""))
    # iterating through n-grams of 2,3,4,5,6,7,8,9
    for (key, value) in NGRAM_DICT.items():
        ngrams = b.ngrams(n=value)
        for ngram in ngrams:
            composed_sentence = ''
            for word in ngram:
                composed_sentence += word.lower()
                composed_sentence += ' '
            composed_sentence_trimmed = composed_sentence.strip()
            if composed_sentence_trimmed in eval(key):
                return 1, EMERGENCY, 'afraid'
    return 0, None, None
Example #29
def getNGrams(text, n):
    blob = TextBlob(text)
    listofBlobs = blob.ngrams(n)
    listofBigrams = []
    for wordList in listofBlobs:
        flag = True
        for item in wordList:
            if flag:
                bigram = unicode(item)
                flag = False
            else:
                bigram = bigram + " "+ unicode(item)
        # print type(bigram)
        listofBigrams.append(bigram)
    return listofBigrams
Example #30
def ngram(text, gram=2):
    '''Convert text to n-grams

    Args:
        text(str): Text to convert into n-grams
        gram(int): n-gram size

    Returns:
        list(list): list of n-grams, depending on the n-gram size

    Defaults:
        gram = 2
    '''
    __analyzer = TextBlob(str(text))
    return __analyzer.ngrams(n=gram)
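Usage sketch for the ngram helper above (assumes from textblob import TextBlob):

print(ngram("the quick brown fox"))          # default bigrams
print(ngram("the quick brown fox", gram=3))  # trigrams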
Example #31
 def encode(self, text):
     blob = TextBlob(text)
     ngram = self.args.ngram
     out_parts = []
     for n in range(1, ngram + 1):
         ng_vec_all = []
         # collect all ngram vectors
         for ng in blob.ngrams(n=n):
             ng_vec = np.ones(self.model.vector_size)
             for tok in ng:
                 ng_vec *= self._get_vector(tok)
             ng_vec_all.append(self._normalize_vector(ng_vec))
         # normalize ngram vectors
         ng_vec_all = self._normalize_vector(np.sum(ng_vec_all, axis=0))
         out_parts.append(ng_vec_all)
     return np.concatenate(out_parts, axis=None)
Example #32
def check_problematic(text):
    blob = TextBlob(text)
    subjective = blob.sentiment.subjectivity
    polarity = blob.sentiment.polarity
    if subjective > 0.33 or polarity < 0:
        for word in blob.words:
            for word1 in problem_words:
                if word.lower() == word1:
                    if word1 == 'criminal' or word1 == 'felon' or word1 == 'criminals':
                        if check_noun(blob, word1):
                            return True, word1
                    else:
                        return True, word1
        for digram in blob.ngrams(2):
            for digram1 in problem_digrams:
                if digram[0].lower() == digram1[0] and digram[1].lower(
                ) == digram1[1]:
                    return True, digram1
    return False, None
def get_grams(comment, n=2, keep_emoji_words=False):
    '''
    Returns n-grams for a sentence, optionally cleaning the string
    in the process.

    Parameters:
    -----------
    comment:            str:    sentence for which n-grams will be 
                                    made
    n:                  int:    number of tokens to include in n-gram
    keep_emoji_words:   bool:   whether emoji will be removed or 
                                    substituted with text 
                                    descriptions
    Returns:
    --------
    list:   list of n-grams
    '''
    blob = TextBlob(clean_text(comment, keep_emoji_words=keep_emoji_words))
    return [' '.join(wordlist) for wordlist in blob.ngrams(n)]
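get_grams delegates cleaning to a clean_text helper defined elsewhere; with a trivial stand-in it can be exercised like this:

from textblob import TextBlob  # assumed import

def clean_text(comment, keep_emoji_words=False):
    # stand-in for the original cleaning/emoji-handling helper
    return comment

print(get_grams("great phone, terrible battery", n=2))
# e.g. ['great phone', 'phone terrible', 'terrible battery']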
def names_ext(sentence):
    ''' Extracts Names using first_name_search and last_name_search '''
    sentence = TextBlob(sentence)
    possible_names = sentence.noun_phrases
    print "NOUN PHRASES: ", possible_names
    sentence = sentence.ngrams(n=2)
    names = []
    female_first = open('./Names_db/Females_Firsts.txt').read().strip().split("\n")
    male_first = open('./Names_db/Males_Firsts.txt').read().strip().split("\n")
    all_last = open('./Names_db/Last_Namess.txt').read().strip().split("\n")

    for phrases in sentence:
        female_names = first_name_search(phrases[0],female_first)
        male_names = first_name_search(phrases[0],male_first)
        last_names = last_name_search(phrases[1],all_last)
        if female_names and male_names and last_names != "None":
            print female_names
            print male_names
            print last_names, "\n"
    return "None"
Example #35
def dict_unique_words(str_list, n):
    c = Counter()

    for i in str_list:
        blob = TextBlob(i)
        x = list(blob.words.lemmatize())
        phrases = list(blob.noun_phrases)
        for phr in phrases:
            if phr not in x:
                x.append(phr)  # noun phrases are already strings; joining would space out characters
        ngrams = blob.ngrams(n)
        for gr in ngrams:
            if gr not in x:
                ngram = ' '.join(gr)
                x.append(ngram)
        for w in x:
            c[w] += 1
    unique_words = dict(c)
    return unique_words
 def blogWords(self):
     regex1 = '[^a-zA-Z0-9-/]'
     regex2 = '[^a-zA-Z0-9-\'\"/]'
     filename = 'blogwords.txt'
     i = 0
     textblob = TextBlob(" ".join(self.listOfWords))
     #load blog words text file
     blogWords_file = open(filename, 'r')
     #line represents a blog word
     for line in blogWords_file:
         #Remove non-alphanumeric characters in sequence
         line = re.sub(regex2, ' ', line)
         #array of words in line
         lineArray = [x.lower() for x in line.split()]
         #entry represents an n-gram instance of the input text
         for entry in textblob.ngrams(n = len(lineArray)):
             entry = [re.sub(regex1, '', x).lower() for x in entry]
             if lineArray == entry:
                 i += 1
     return i    
def sentiment_pattern(text, gram_n=6, predictor=None):
	
	blob= TextBlob(text)
	ngrams=blob.ngrams(n=gram_n)
	sentiment_list=[]
	datalist = []
	for gram in ngrams:
		str_gram=" ".join(gram)
		data = (0, 0, str_gram, None)
		datalist.append(Datapoint(*data))

	prediction = predictor.predict(datalist)

	for sentiment in prediction:
		sentiment = int(sentiment)
		if sentiment < 2: sentiment_list.append(-1)
		if sentiment == 2: sentiment_list.append(0)
		if sentiment > 2: sentiment_list.append(1)

	return sentiment_list
	"""
Example #38
def update_reviews(attr, old, new):

    data_set = pd.read_csv('Outputs/data_set.csv')
    i = int(star_rating.value)

    #output_review_list = extract_ngrams(str(data_set[(data_set['Star_count'] == i)]['Review'].values),2,3)
    data_set_blob = data_set.copy()
    data_set_blob['Noun_sentences'] = data_set_blob['Review'].apply(lambda x:get_nouns(x))

    n_gram_blob = TextBlob(str(data_set_blob[(data_set_blob['Star_count'] == i)]['Noun_sentences'].values))    

    #Styling the paragraph element
    text1 = Paragraph(style={'font-variant': 'small-caps','font-family': "Tahoma"})
    text1.text=""    

    #review1 = text_cleaner(str(n_gram_blob.ngrams(1)[0]))
    #review2 = text_cleaner(str(n_gram_blob.ngrams(1)[1]))
                
    review1 = text_cleaner(n_gram_blob.ngrams(1)[1][0])
    review2 = text_cleaner(n_gram_blob.ngrams(1)[2][0])

    text1.text = "Top "+str(i)+" star reviews feel: "+review1+", followed by "+review2    
    curdoc().add_root(Row(text1))
Example #39
def index():
  response.content_type = 'text/text; charset=utf-8'
  
  ret =  'Hi there, I\'m process {0}!\n\n'.format(os.getpid())

  sentence = 'Now is better than never.'
  ret += 'Testing TextBlob ngram (n=3) with sentence: \n "{0}" \n'.format(sentence)
  blob = TextBlob(sentence)
  for word_list in blob.ngrams(n=3):
    ret += (' '.join(word_list) + '\n')
  
  data = pd.DataFrame({'A': np.random.randn(3), 'B': np.random.randn(3)})
  func = "pd.DataFrame({'A': np.random.randn(3), 'B': np.random.randn(3)})"
  ret += '\nTesting Numpy and Pandas with command: \n {0} \n{1} \n'.format(func, data.to_json())
    
  ret += '\nCode at: \n https://github.com/alyssaq/bottle-heroku-skeleton \n'
  ret += '\nEnvironment vars:\n'

  for k, v in env.iteritems():
    if 'bottle.' in k:
      continue
    ret += '%s=%s\n' % (k, v)

  return ret
Example #40
attack_text = """
A drone attack that failed to kill President Nicolás Maduro of Venezuela unfolded on live TV and in front of many witnesses
"""

attack_blob = TextBlob(attack_text)

print(attack_blob.noun_phrases)
print(attack_blob.words)

# toNote: pluralize & singularize!
print(attack_blob.words.singularize())
print(attack_blob.words.pluralize())

print(attack_blob.word_counts['of'])

print(attack_blob.ngrams(n=2))
print(attack_blob.ngrams(n=4))

from textblob import Word
for word in attack_blob.words:
    print(Word(word).correct() == word)

#%% Example from https://www.analyticsvidhya.com/blog/2018/02/natural-language-processing-for-beginners-using-textblob/
av_blob = TextBlob("Analytics Vidhya is a great platform to learn data science. \n It helps community through blogs, hackathons, discussions,etc.")
print(av_blob.tokenize())
print(av_blob.sentences, av_blob.sentences[0])

for phrase in av_blob.noun_phrases:
    print(phrase)  # analytics vidhya; great platform; data science

# toNote: part-of-speech tagging
print(av_blob.tags)

# word and noun-phrase frequencies (wiki blob as in the TextBlob quickstart)
wiki = TextBlob("Python is a high-level, general-purpose programming language.")
print(wiki.words.count('python', case_sensitive=True))  # specify case sensitivity
print(wiki.noun_phrases.count('python'))
# translation and language detection
# en_blob = TextBlob(u'Simple is better than complex.')
# print(en_blob.translate(to='es'))
# chinese_blob = TextBlob(u"美丽优于丑陋")
# print(chinese_blob.translate(from_lang="zh-CN", to='en'))
# b = TextBlob(u"بسيط هو أفضل من مجمع")
# print(b.detect_language())
# parsing
b = TextBlob("And now for something completely different.")
print(b.parse())
# textblobs are like python strings!
zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")
print(zen[0:19])
print(zen.upper())
print(zen.find("Simple"))
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print(apple_blob < banana_blob)
print(apple_blob == 'apples')
apple_blob + ' and ' + banana_blob
TextBlob("apples and bananas")
print("{0} and {1}".format(apple_blob, banana_blob))
# n-grams
blob = TextBlob("Now is better than never.")
print(blob.ngrams(n=3))
# getting start and end indices of sentences
for s in zen.sentences:
    print(s)
    print("---- Starts at index {}, Ends at index {}".format(s.start, s.end))
import pip

#!pip install textblob
#!python -m textblob.download_corpora

from textblob import TextBlob

import numpy as np
import pandas as pd

tx = df.loc[0, 'full_text']
blob = TextBlob(tx)
blob.tags
blob.sentences[0].words
blob.noun_phrases
blob.ngrams(3)
blob.correct()
blob.words[3].spellcheck()
blob.detect_language()
blob.translate(to='ar')


verbs = []
for word, tag in blob.tags:
    if tag == 'VB':
        verbs.append(word.lemmatize())
nouns = []
for word, tag in blob.tags:
    if tag == 'NN':
        nouns.append(word.lemmatize())
nounsp = []
Example #43
def tweet_content():
    """Generate tweet string (140 characters or less)
    """

#    with open('basho.txt', 'r') as content_file:
#        content = content_file.read()
    r = requests.get("http://novicevagabond.com/projects/haiku/basho.txt")
    content = r.content
     
    nltk.data.path.append("nltk_data/")
    nltk.data.path.append("nltk_data/punkt")
    nltk.data.path.append("fizzle_dizzle/")
#    nltk.download()

#print content

    tokenizer = BlanklineTokenizer()
    cleaned_content = content.lower()
    corpus = TextBlob(cleaned_content,  tokenizer=tokenizer)

    haiku = corpus.sentences
#print haiku

    bigrams = corpus.ngrams(n=2)
    trigrams = corpus.ngrams(n=3)

#print bigrams
    dict = {}
    for bigram in bigrams:
        k = bigram[0]
        v = bigram[1]
        if k in dict:
            if v in dict[k]:
                dict[k][v] = dict[k][v] + 1
            else:
                dict[k][v] = 1
        else:
            dict[k] = { v : 1}

#print dict

    def weighted_choice(map):
        choices = [] 
        for k in map:
            #print k 
            for n in range(1, map[k] + 1):
                choices.append(k)
        #print choices
        choice = random.choice(choices)
        #print choice
        return choice

    seed = random.choice(dict.keys())
    length = random.randint(11,15) 

    output = [seed]
#print output
    for i in range(length):
        output.append(weighted_choice(dict[output[i]]))

    whitespace = " "
    line1 = whitespace.join(output[0:4])
    line2 = whitespace.join(output[4:9])
    line3 = whitespace.join(output[9:])
    line4 = "-- #markov_basho_haiku"
    sep = "\n"
    tweet = sep.join([line1, line2, line3, line4]);
#    print tweet
    return tweet 
Example #44
def get_ngrams(doc, n):
    blob = TextBlob(doc)
    ngrams = blob.ngrams(n = n)
    return ngrams
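Usage sketch for Example #44's get_ngrams (assumes from textblob import TextBlob):

for gram in get_ngrams("to be or not to be", 3):
    print(" ".join(gram))   # "to be or", "be or not", ...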
Example #45
for i in main_list:
    xy = i[2].split("-")
    if len(xy) > 1 :
       #print xy[1]
       a[mystr[k]] = xy[1]
    else:
        a[mystr[k]] = xy[0]

    k = k + 1

myset  = Set()

total_words=0

n1 = b.ngrams(n=1)

for i in n1:
    total_words+=1
    if a[i[0]] == "VP" :
        myset.add(i[0])
        #print "this " , i[0]


n2 = b.ngrams(n=2)

for i in n2:
    if i[0]=="ADVP" and i[1]=="VP":
        myset.add(i[0])
        myset.add(i[1])
    elif i[0]=="VP" and i[1]=="ADVP":
# WordLists (A WordList is just a Python list with additional methods.)
animals = TextBlob("cat dog octopus")
print animals.words
print animals.words.pluralize()
# Spelling Correction (Use the correct() method to attempt spelling correction.)
b = TextBlob("I havv goood speling!")
print(b.correct())
w = Word('falibility')
print w.spellcheck()

# Get Word and Noun Phrase Frequencies
monty = TextBlob("We are no longer the Knights who say Ni. " "We are now the Knights who say Ekki ekki ekki PTANG.")
print monty.word_counts['ekki']
# The second way is to use the count() method.
print monty.words.count('ekki')
print monty.words.count('Ekki', case_sensitive=True)

# TextBlobs Are Like Python Strings
zen = TextBlob("Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex.")
print zen.upper()

# You can make comparisons between TextBlobs and strings.
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print apple_blob < banana_blob
# You can concatenate and interpolate TextBlobs and strings.
print apple_blob + ' and ' + banana_blob
print "{0} and {1}".format(apple_blob, banana_blob)
# n-grams ( The TextBlob.ngrams() method returns a list of tuples of n successive words. )
blob = TextBlob("Now is better than never.")
print blob.ngrams(n=3)
Example #47
    def find_component_match(self, title, body, template_data):
        '''Make a list of matching files for arbitrary text in an issue'''

        # DistributionNotFound: The 'jinja2<2.9' distribution was not found and
        #   is required by ansible
        # File
        # "/usr/lib/python2.7/site-packages/ansible/plugins/callback/foreman.py",
        #   line 30, in <module>

        STOPWORDS = [u'ansible', u'core', u'plugin']
        STOPCHARS = [u'"', u"'", u'(', u')', u'?', u'*', u'`', u',']
        matches = []

        if u'Traceback (most recent call last)' in body:
            lines = body.split(u'\n')
            for line in lines:
                line = line.strip()
                if line.startswith(u'DistributionNotFound'):
                    matches = [u'setup.py']
                    break
                elif line.startswith(u'File'):
                    fn = line.split()[1]
                    for SC in STOPCHARS:
                        fn = fn.replace(SC, u'')
                    if u'ansible_module_' in fn:
                        fn = os.path.basename(fn)
                        fn = fn.replace(u'ansible_module_', u'')
                        matches = [fn]
                    elif u'cli/playbook.py' in fn:
                        fn = u'lib/ansible/cli/playbook.py'
                    elif u'module_utils' in fn:
                        idx = fn.find(u'module_utils/')
                        fn = u'lib/ansible/' + fn[idx:]
                    elif u'ansible/' in fn:
                        idx = fn.find(u'ansible/')
                        fn1 = fn[idx:]

                        if u'bin/' in fn1:
                            if not fn1.startswith(u'bin'):

                                idx = fn1.find(u'bin/')
                                fn1 = fn1[idx:]

                                if fn1.endswith(u'.py'):
                                    fn1 = fn1.rstrip(u'.py')

                        elif u'cli/' in fn1:
                            idx = fn1.find(u'cli/')
                            fn1 = fn1[idx:]
                            fn1 = u'lib/ansible/' + fn1

                        elif u'lib' not in fn1:
                            fn1 = u'lib/' + fn1

                        if fn1 not in self.files:
                            if C.DEFAULT_BREAKPOINTS:
                                logging.error(u'breakpoint!')
                                import epdb; epdb.st()
            if matches:
                return matches

        craws = template_data.get(u'component_raw')
        if craws is None:
            return matches

        # compare to component mapping
        matches = self._string_to_cmap_key(craws)
        if matches:
            return matches

        # do not re-process the same strings over and over again
        if craws.lower() in self.match_cache:
            return self.match_cache[craws.lower()]

        # make ngrams from largest to smallest and recheck
        blob = TextBlob(craws.lower())
        wordcount = len(blob.tokens) + 1

        for ng_size in reversed(xrange(2, wordcount)):
            ngrams = [u' '.join(x) for x in blob.ngrams(ng_size)]
            for ng in ngrams:

                matches = self._string_to_cmap_key(ng)
                if matches:
                    self.match_cache[craws.lower()] = matches
                    return matches

        # https://pypi.python.org/pypi/fuzzywuzzy
        matches = []
        for cr in craws.lower().split(u'\n'):
            ratios = []
            for k in self.CMAP.keys():
                ratio = fw_fuzz.ratio(cr, k)
                ratios.append((ratio, k))
            ratios = sorted(ratios, key=lambda tup: tup[0])
            if ratios[-1][0] >= 90:
                cnames = self.CMAP[ratios[-1][1]]
                matches += cnames
        if matches:
            self.match_cache[craws.lower()] = matches
            return matches

        # try to match to repo files
        if craws:
            clines = craws.split(u'\n')
            for craw in clines:
                cparts = craw.replace(u'-', u' ')
                cparts = cparts.split()

                for idx, x in enumerate(cparts):
                    for SC in STOPCHARS:
                        if SC in x:
                            x = x.replace(SC, u'')
                    for SW in STOPWORDS:
                        if x == SW:
                            x = u''
                    if x and u'/' not in x:
                        x = u'/' + x
                    cparts[idx] = x

                cparts = [x.strip() for x in cparts if x.strip()]

                for x in cparts:
                    for f in self.files:
                        if u'/modules/' in f:
                            continue
                        if u'test/' in f and u'test' not in craw:
                            continue
                        if u'galaxy' in f and u'galaxy' not in body:
                            continue
                        if u'dynamic inv' in body.lower() and u'contrib' not in f:
                            continue
                        if u'inventory' in f and u'inventory' not in body.lower():
                            continue
                        if u'contrib' in f and u'inventory' not in body.lower():
                            continue

                        try:
                            f.endswith(x)
                        except UnicodeDecodeError:
                            continue

                        fname = os.path.basename(f).split(u'.')[0]

                        if f.endswith(x):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if f.endswith(x + u'.py'):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if f.endswith(x + u'.ps1'):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break
                        if os.path.dirname(f).endswith(x):
                            if fname.lower() in body.lower():
                                matches.append(f)
                                break

        logging.info(u'%s --> %s' % (craws, sorted(set(matches))))
        self.match_cache[craws.lower()] = matches
        return matches
def text_to_ngrams(text):
    blob = TextBlob(text)
    ngrams = blob.ngrams(NGRAM_SIZE)
    parse_ngrams(ngrams)
    return
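text_to_ngrams depends on a module-level NGRAM_SIZE constant and a parse_ngrams consumer that are not shown; hedged stand-ins:

# Hypothetical constants/helpers for text_to_ngrams (not in the original snippet)
from textblob import TextBlob

NGRAM_SIZE = 3

def parse_ngrams(ngrams):
    # stand-in: the original presumably stores or analyzes the n-grams
    for gram in ngrams:
        print(" ".join(gram))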
Example #49
# Section 12.2.14 snippets
from textblob import TextBlob

text = 'Today is a beautiful day. Tomorrow looks like bad weather.'

blob = TextBlob(text)

blob.ngrams()

blob.ngrams(n=5)

##########################################################################
# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #
# Pearson Education, Inc. All Rights Reserved.                           #
#                                                                        #
# DISCLAIMER: The authors and publisher of this book have used their     #
# best efforts in preparing the book. These efforts include the          #
# development, research, and testing of the theories and programs        #
# to determine their effectiveness. The authors and publisher make       #
# no warranty of any kind, expressed or implied, with regard to these    #
# programs or to the documentation contained in these books. The authors #
# and publisher shall not be liable in any event for incidental or       #
# consequential damages in connection with, or arising out of, the       #
# furnishing, performance, or use of these programs.                     #
##########################################################################
Example #50
def getBigramCount(inputFileName,state,cat,rating):

    #create the 5 different category hashmaps to store the bigram and the frequency from the csv file
    service = {}
    value = {}
    variety = {}
    ambience = {}
    taste = {}
    accessibility = {}
    cr = csvReader("bigramCombined.csv")
    for r in cr:
        if r[2] == "value":
            k = r[0]
            if k not in value:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                value[k] = score
        elif r[2] == "service":
            k = r[0]
            if k not in service:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                service[k] = score
        elif r[2] == "ambience":
            k = r[0]
            if k not in ambience:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                ambience[k] = score
        elif r[2] == "taste":
            k = r[0]
            if k not in taste:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                taste[k] = score
        elif r[2] == "variety":
            k = r[0]
            if k not in variety:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                variety[k] = score  # this branch fills the variety map, not taste
        else:
            k = r[0]
            if k not in accessibility:
                polarity = r[3]
                score = int(r[4])
                if polarity == "Neg":
                    score *= -1
                accessibility[k] = score

    cr = csvReader(inputFileName)
    inputStr = " "
    for r in cr:
        print r[0].split("|")[1]
        print r[0].split("|")[0]
        text = r[0].split("|")[14]
        try:
            text = unicode(text)
            inputStr += text + " "
        except UnicodeDecodeError:
            pass

    blob = TextBlob(inputStr)
    wordsArray = blob.words
    bigrams = blob.ngrams(2)
    bigramsList = []
    for i in bigrams:
        bigramsList.append(i[0]+" "+i[1])

    cw1 = csvWriter("tableau_db_3.csv")
    #headers1 = ["State","Business_Category","Business_Rating","Bigram","Bigram_Cat","Bigram_Freq","Bigram_Sentiment","Bigram_Importance"]
    #cw1.writerow(headers1)
    
    serviceList = {}
    for k in service:
        #check whether the word in the service category exists in the textreview
        if k in bigramsList:
            #store the frequency of the word in the serviceList hashmap
            if k not in serviceList:
                serviceList[k] = 1;
            else:
                serviceList[k] +=1

    #write into the csv output file used for tableau visualisation
    for k in serviceList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Service")
        freq = serviceList[k]
        d.append(freq)
        sentiment = service[k]
        d.append(sentiment)
        #importance of word is calculated by the number of times it appears in the textreview
        #and the sentiment score assigned to the word
        #for example, happy hour will have 25 importance when it appears 25 times in the
        #textreview and is assigned a sentiment score of 1
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    ambienceList = {}
    for k in ambience:
        if k in bigramsList:
            print k
            if k not in ambienceList:
                ambienceList[k] = 1;
            else:
                ambienceList[k] +=1

    for k in ambienceList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Ambience")
        freq = ambienceList[k]
        d.append(freq)
        sentiment = ambience[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    varietyList = {}
    for k in variety:
        if k in bigramsList:
            print k
            if k not in varietyList:
                varietyList[k] = 1;
            else:
                varietyList[k] +=1

    for k in varietyList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Variety")
        freq = varietyList[k]
        d.append(freq)
        sentiment = variety[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    tasteList = {}
    for k in taste:
        if k in bigramsList:
            print k
            if k not in tasteList:
                tasteList[k] = 1;
            else:
                tasteList[k] +=1

    for k in tasteList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Taste")
        freq = tasteList[k]
        d.append(freq)
        sentiment = taste[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)

    accessibilityList = {}
    for k in accessibility:
        if k in bigramsList:
            print k
            if k not in accessibilityList:
                accessibilityList[k] = 1;
            else:
                accessibilityList[k] +=1

    for k in accessibilityList:
        d = []
        d.append(state)
        d.append(cat)
        d.append(rating)
        d.append(k)
        d.append("Accessibility")
        freq = accessibilityList[k]
        d.append(freq)
        sentiment = accessibility[k]
        d.append(sentiment)
        importance = freq * sentiment
        d.append(importance)
        cw1.writerow(d)
Example #51
    def on_success(self, data):
        # Digest
        if 'text' in data:
            line = data['text']
            
            # Do n-grams
            blob = TextBlob(line)
            ngrams = list(blob.ngrams(n=2))
            for ng in ngrams:
                for word in list(ng):
                    word = word.lower()
                    if word in self.stemmer.stems.keys():
                        word = self.stemmer.stems[word]
                    match = re.search(r'\w+', word)
                    if match:
                        word = match.group()
                    if word in self.stop_words:
                        word = ''
                if ng[0] and ng[1]:
                    if ' '.join(ng) in self.bookshelf.keys():
                        self.bookshelf[' '.join(ng)] += 1
                    else:
                        self.bookshelf[' '.join(ng)] = 1

            '''
            # Do Unigrams
            for word in line.split(' '):
                word = word.lower()

                # Stemming
                if word in self.stemmer.stems.keys():
                    word = self.stemmer.stems[word]

                # Removing punctuation
                match = re.search('\w+',word)
                if match:
                    word = match.group()

                # Stop words
                if word not in self.stop_words:
                    if word in self.bookshelf.keys():
                        self.bookshelf[word] += 1
                    else:
                        self.bookshelf[word] = 1
            '''

            self.count += 1

            # How often to update? Framerate will depend on the number of relevant 
            # tweets, so one size does not necessarily fit all          
            it = 5
            
            # Move forward
            if self.count % it == 0:
                sorted_words = sorted(self.bookshelf.items(), key=operator.itemgetter(1), reverse=True)
                
                clear()
                print('=== Update #{} ==='.format(self.count))
                
                for i in range(0,20):
                    print('{}): {} [{}]'.format(str(i+1),sorted_words[i][0],sorted_words[i][1]))
Example #52
from nltk.tokenize import BlanklineTokenizer
from textblob import TextBlob
import random

with open('basho.txt', 'r') as content_file:
    content = content_file.read()

#print content

tokenizer = BlanklineTokenizer()
cleaned_content = content.lower()
corpus = TextBlob(cleaned_content,  tokenizer=tokenizer)

haiku = corpus.sentences
#print haiku

bigrams = corpus.ngrams(n=2)
trigrams = corpus.ngrams(n=3)

#print bigrams
dict = {}
for bigram in bigrams:
    k = bigram[0]
    v = bigram[1]
    if k in dict:
        if v in dict[k]:
            dict[k][v] = dict[k][v] + 1
        else:
            dict[k][v] = 1
    else:
        dict[k] = { v : 1}