def clean(doc):
    lemma = WordNetLemmatizer()
    exclude = set(string.punctuation)
    stoplist = stopwords.words('english')
    stoplist = stoplist + wordlist
    stop = set(stoplist)
    # stop= stop.append
    # print type(stop)
    # print(stop)
    # exit(0)

    # Remove URLs, "than..." tokens, and @mentions, then strip punctuation
    normalized = re.sub(r"http\S+", "", doc)
    normalized = re.sub(r"than\S+", "", normalized)
    normalized = re.sub(r"@\S+", "", normalized)
    normalized = re.sub(r'[^\w\s]', '', normalized).replace("  ", " ")
    # Standardize words (remove multiple letters):
    normalized = ''.join(''.join(s)[:2]
                         for _, s in itertools.groupby(normalized))
    normalized = TextBlob(normalized)
    normalized = ' '.join(normalized.noun_phrases)
    stop_free = " ".join(
        [i for i in normalized.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    normalized = normalized.lower().strip().replace("\n", " ").replace(
        ".", " ").replace("-", ' ')

    # with open('C:\TuDiabetes_Code - Final\TechTypes_Text_New\Physical_Activity_Clean.txt', 'a') as the_file:
    #     the_file.write(normalized)
    print normalized
    print "********************************************************"
    return normalized
Example #2
def extract(ngrams, dataset, doc_id):
    # extract keywords
    print 'Extracting keywords'
    for i, ngram in enumerate(ngrams):
        doc = doc_id[i]

        if field not in dataset[doc]:
            dataset[doc][field] = set()

            if doc > 0 and doc % 1000 == 0:
                print '\t', doc

        for kw in filter(lambda k: '_' in k, ngram):
            keyword = kw.replace('_', ' ')

            kw_tb = TextBlob(keyword)

            # filter out punctuation, etc (make sure that there are two non-punc words)
            if len(kw_tb.words) < 2:
                continue

            # add keywords which are all proper nouns
            distinct_tags = set(t[1] for t in kw_tb.tags)
            if distinct_tags - {'NNP', 'NNPS'} == {}:
                dataset[doc][field].add(kw_tb.lower())
                continue

            # add noun phrases
            for np in kw_tb.lower().noun_phrases:
                dataset[doc][field].add(np)

    return kw_set_to_list(dataset)
Example #3
 def get_user_timeline_tweets(self, num_tweets):
     tweets = []
     for tweet in Cursor(self.twitter_client.user_timeline, id="@realdonaldtrump").items(num_tweets):
         t=TextBlob(tweet._json['text'])
         print t.lower(), t.sentiment
         tweets.append(tweet)
     return tweets
def classify(review):
    # lower() and correct() return new TextBlob objects, so re-wrap the result
    # with the classifier before classifying each sentence
    blob = TextBlob(review)
    blob = TextBlob(str(blob.lower().correct()), classifier=cl)
    for sentence in blob.sentences:
        if sentence.classify() == "neg" and len(str(sentence)) > 3:
            negative_sen.append(str(sentence))
        elif sentence.classify() == "pos" and len(str(sentence)) > 3:
            positive_sen.append(str(sentence))
Example #6
class SexxiBot:
    """ Main ChatBot class to take in user input and return an appropriate response.

    Contains methods: fix_typos, to correct any user's typos;
    help_check, to check if the user has asked for 'help' (a list of possible commands);
    check_phrase_similarity, to compare user inputs to keywords to generate basic responses;
    create_response, to generate a new response based on the user's input.
    """

    def __init__(self):
        self.user_input = str()
        self.input_len = int()
        self.response = str()

    def fix_typos(self):
        self.user_input = TextBlob(self.user_input.lower()).tags
        # Fix lazy user typos, or slang
        words = list()
        for i in self.user_input:
            words.append(i[0])

        for part in range(len(words)):
            if words[part] in slang_typo_dict.keys():
                words[part] = slang_typo_dict[words[part]]
        self.user_input = ' '.join(words)
        return False  # Returns false to move on to help_check

    def help_check(self):
        if self.user_input.lower() == "help":
            self.response = responses.HELP
            return True
        return False  # User didn't ask for help, move on to check_phrase_similarity

    def check_phrase_similarity(self):
        self.user_input = TextBlob(self.user_input.lower()).tags
        self.input_len = len(self.user_input)
        for phrase_type in PHRASE_TYPES:
            for phrase in getattr(keywords, phrase_type):
                score = float()
                for word in self.user_input:
                    for n in phrase:
                        # skip filler words on both sides before scoring the match
                        if word[0] not in unimportant_words and n not in unimportant_words:
                            score += liquidmetal.score(n, word[0]) / self.input_len
                if score >= 0.7:  # Could be increased/ decreased through testing to find more optimal value
                    self.response = random.choice(getattr(responses, phrase_type))
                    return True
        return False

    def create_response(self):  # NOT WORKING YET!
        # Craft a response based on user's message
        noun, pronoun, verb, adj, prep, text_len = check_pos_tags.pos_tags(self.user_input)
        self.response = format_response.craft_response(noun, pronoun, verb, adj, prep, text_len)
        print self.response
        return False if self.response == ' ' else True
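A minimal driver sketch for the class above, kept hypothetical: the responses, keywords, slang_typo_dict, check_pos_tags, and format_response helpers it relies on are assumed to come from the original project.

# Hypothetical usage sketch (not part of the original project)
bot = SexxiBot()
bot.user_input = "helo, what can you do?"
if not bot.fix_typos():                        # normalise slang/typos first
    if not bot.help_check():                   # explicit "help" request?
        if not bot.check_phrase_similarity():  # keyword similarity next
            bot.create_response()              # fall back to a POS-based reply
print(bot.response)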
Example #7
def polarity_and_lang(message):  # blob has a limit on API calls
    try:
        if len(message) > 2:
            blob = TextBlob(message)

            lang = blob.detect_language()
            text = ''
            if lang == 'es':
                blob = blob.translate(to='en').lower()
                text = message
            else:
                blob = blob.lower()
                text = blob.translate(to='es').lower().raw

            pol = blob.sentiment[0]
        else:
            print('polarity_and_lang was passed a text shorter than 3 characters')
            pol = 0
            text = message

    except Exception as e:
        print('Exception in polarity_and_lang: {0}'.format(e))
        pol = 0
        text = None

    return (pol, text)
Example #8
def simplebot(user):
    """Rule base bot, takes an argument, user input in form of a string.
    In sequence will pre-process the string. Lower case, tokenize and remove
    stop words. iterates through CONVERSATION, if filtered_input intersects
    response_set is updated. if the set is empty, it returns a message,
    else it returns the longest string in the set"""

    user_input = user
    user_blob = TextBlob(user_input)

    lower_input = user_blob.lower()
    token_input = lower_input.words
    filtered_input = [w for w in token_input if w not in STOP_WORDS]

    response_set = set()
    for con_list in CONVERSATION:
        for sentence in con_list:
            sentence_split = sentence.split()
            if set(filtered_input).intersection(sentence_split):
                response_set.update(con_list)

    if not response_set:
        return "I am sorry, I don't have an answer, ask again"
    else:
        return max(response_set, key=len)
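A small usage sketch for simplebot; STOP_WORDS and CONVERSATION are module-level globals in the original project, so toy values are defined here purely for illustration.

# Toy stand-ins for the project's real STOP_WORDS / CONVERSATION globals
STOP_WORDS = {"the", "is", "a", "what", "how"}
CONVERSATION = [
    ["what is your name", "my name is simplebot and i answer simple questions"],
    ["how is the weather", "the weather is sunny today"],
]

print(simplebot("What is your NAME?"))
# -> "my name is simplebot and i answer simple questions"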
Example #9
def preprocess_sentence(sentence, output_type='TextBlob'):
    """
    Lowercase, filter out stopwords, and stem a single sentence.
    Input: String or TextBlob object
    """
    if type(sentence) == str:
        sentence = TextBlob(sentence)
    elif type(sentence) != textblob.blob.TextBlob:
        raise ValueError('Input is neither a string nor a TextBlob object')

    sentence = sentence.lower()
    word_list = sentence.words
    filtered_words = [
        word.stem() for word in word_list
        if word not in stopwords.words('english')
    ]

    if output_type == 'TextBlob':
        return TextBlob(' '.join(filtered_words))
    elif output_type == 'list':
        return filtered_words
    elif output_type == 'string':
        return ' '.join(filtered_words)
    else:
        raise ValueError('Output type not understood')
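A quick, hypothetical call to the function above (assumes the NLTK stopwords corpus is downloaded); the shown output is only indicative.

print(preprocess_sentence("The cats were running quickly", output_type='string'))
# -> roughly "cat run quickli" after stopword removal and Porter stemming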
Example #10
def keywords(reviews, numKeywords):
    """Gets the [numKeywords] most frequent words within the reviews for 
    a product/service excluding stop_words
    
    Args:
        reviews: a list of all the reviews for a product/service
        numKeywords: The number of keywords we want to have
        
    Returns:
        list; Contains each of the [numKeywords] keywords
    """
    alltext = ""
    for review in reviews:
        alltext = alltext + review

    blob = TextBlob(alltext)
    words = blob.lower().words
    num_words = len(words)

    #count the frequency of each word
    counts = Counter(word for word in words if word not in stop_words)
    keywords = {a: b for a, b in counts.most_common(numKeywords)}

    #quadratic scaling

    for i in keywords:
        keywords[i] = keywords[i] / float(num_words)

    return keywords
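A short usage sketch for the keywords helper; stop_words is a module-level collection in the original, so any iterable of common English words would work here.

# Hypothetical reviews; returned values are counts scaled by the total word count
sample_reviews = [
    "Great battery life and a great screen. ",
    "Battery died fast but the screen is great. ",
]
print(keywords(sample_reviews, 3))  # e.g. scores for 'great', 'battery', 'screen'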
    def do_in(self, sentence):

        #converts words to numbers(like two->2)

        sentence = convert_w2n(sentence)

        #TextBlob is used  preprocessing the input
        parsed = TextBlob(sentence)
        parsed = parsed.lower()

        #correct the spelling if present
        parsed = parsed.correct()

        #check main category menu,subcategory menu,and quantity

        resp = check_for_main_category(parsed)
        if resp == None:
            resp = check_for_menu(parsed)

        if resp == None:
            resp = check_for_greeting(parsed)
        if (resp == None):
            check_for_quantity(parsed)

        #prints the output
        if resp != None:
            print(resp)

        elif ((len(Chatbot.sub_cat_item) != 0)
              and (len(Chatbot.quantity_main) != 0)):

            print "Total cost of your order placed for  is", find_cost()

        else:
            print(construct_response(parsed))
Example #12
def preProcessingNeg(l):
    blob = TextBlob(l[0])
    blob = blob.lower()
    words = blob.words  # Tokenizing
    newOpinion = ''
    for word in words:

        if word in good:
            newOpinion += 'good' + ' '
        elif word in bad:
            newOpinion += 'bad' + ' '
        elif word not in junk:  # Stemming
            if word[-1] == 's':
                word = word[0:-1]
            if word[-3:] == 'ing':
                word = word[0:-3]
            if word[-2:] == 'ly':
                word = word[0:-2]
            if word == "n't":
                word = 'not'
            if word[-2:] == 'ed':
                word = word[0:-2]
            newOpinion += word + ' '

    threeWords = mostCommon(newOpinion)
    for t in threeWords:
        if t[0] in totalDictNeg:
            totalDictNeg[t[0]] += t[1]
        else:
            totalDictNeg[t[0]] = t[1]

    return newOpinion
Example #13
    def augment(self, data):
        """
        A method to paraphrase a sentence.
        
        :type data: str
        :param data: sentence used for data augmentation 
        :rtype:   str
        :return:  The augmented data
        """
        if type(data) is not str:
            raise TypeError("DataType must be a string")
        data = TextBlob(data.lower())
        try:
            data = data.translate(from_lang=self.src, to=self.to)
            data = data.translate(from_lang=self.to, to=self.src)
        except NotTranslated:
            try:  # Switch to googletrans to do translation.
                translator = Translator()
                data = translator.translate(data, dest=self.to,
                                            src=self.src).text
                data = translator.translate(data, dest=self.src,
                                            src=self.to).text
            except Exception:
                print("Error Not translated.\n")
                raise

        return str(data).lower()
Example #14
def passage2word(path=trainpath, type='train'):
    listofword = []
    filelist = []
    taglist = []
    traverse(path, filelist, taglist)

    if (type == 'train'):
        #       id_tag = open(r'E:\id_tag.txt','wb')

        with open(os.path.join(nowpath, 'id_tag.txt'), 'wb') as id_tag:
            pickle.dump(taglist, id_tag)
            id_tag.close()
    elif (type == 'test'):
        global taglist_test
        taglist_test = taglist

    global passageNum
    passageNum = len(filelist)
    for file in filelist:
        f = open(file, 'r', errors='ignore')
        passage = f.read()
        f.close()
        blob = TextBlob(passage.strip())  # avoid shadowing the built-in str

        wordlist = blob.lower().words
        k = len(wordlist)
        answer = []
        if k > 0:
            for x in range(k):
                answer.append(wordlist[x].lemmatize())
                answer[x] = Word(answer[x]).lemmatize("v")
        filtered_answer = [x for x in answer if x not in stop_words]
        #        print(filtered_answer)
        listofword.append(filtered_answer)
    return listofword
Example #15
def update(table, field):
    conn = boto.dynamodb.connect_to_region('us-west-2',
                                           aws_access_key_id='',
                                           aws_secret_access_key='')
    table = conn.get_table(table)
    for line in table.scan():
        newline = line[field]
        text = TextBlob(newline)
        text = text.lower()
        textwords = text.split()
        wordcount = 0
        wordlist = []
        for word in textwords:
            wordcount += 1
            if word not in wordlist:
                wordlist.append(word)
                #handles div0 errors
        if wordcount == 0:
            lexdiv = 0
        else:
            lexdiv = round((len(wordlist) * 1.0) / wordcount, 2)

        polarity = text.sentiment.polarity
        subjectivity = text.sentiment.subjectivity
        line.put_attribute('subjectivity', subjectivity)
        line.put_attribute('polarity', polarity)
        line.put_attribute('lexical diversity', lexdiv)
        line.save()
    def get_tweet_sentiment(self, tweet):
        # Create a TextBlob object for each tweet
        analysis = TextBlob(self.clean_tweet(tweet))

        # Initialise the sentiment counters and the tweet sentence
        pos = neg = neu = 0
        sentence = analysis.lower()
        words = sentence.split(' ')
        for word in words:
            # Classify the sentiment of each word of the tweet
            classResult = classifier.classify(word_feats(word))
            if classResult == 'pos':
                pos += 1
            elif classResult == 'neg':
                neg += 1
            elif classResult == 'neu':
                neu += 1
            #print(word, " ", classResult)
        #print(pos, " ",neg, " ",neu)

        # Return the sentiment classification result
        if pos > neg and pos >= neu:
            return 1
        elif neg > pos and neg >= neu:
            return -1
        else:
            return 0
Example #17
def find_keyword(sentences: list, keyword: str) -> list:
    '''keyword should be in lower case'''
    data = []
    for sentence in sentences:
        tb = TextBlob(sentence)
        if keyword in tb.lower(): data.append(sentence)
    
    return data
def get_ngram_counts(text, size):
    blob = TextBlob(text)
    # Extract n-grams as WordLists, then convert to a list of strings
    ngrams = [' '.join(ngram).lower() for ngram in blob.lower().ngrams(size)]
    # Convert to dataframe then count values and rename columns
    ngram_counts = pd.DataFrame(ngrams)[0].value_counts().rename_axis(
        'ngram').reset_index(name='count')
    return ngram_counts
def get_word_counts(text):
    blob = TextBlob(text)
    words = [[word, count] for word, count in blob.lower().word_counts.items()
             if word not in stopwords]
    word_counts = pd.DataFrame(words).rename({
        0: 'word',
        1: 'count'
    }, axis=1).sort_values(by='count', ascending=False)
    return word_counts
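A brief usage sketch for the two helpers above; pandas (as pd), TextBlob, and a stopwords collection are assumed to be in scope, as in the original module.

# Hypothetical text; both helpers return small pandas DataFrames
sample = "the quick brown fox jumps over the lazy dog and the quick fox"
print(get_ngram_counts(sample, 2).head())  # most frequent bigrams with counts
print(get_word_counts(sample).head())      # most frequent non-stopword words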
Example #20
 def on_status(self, status):
     text = status.text
     print(text)
     temp = TextBlob(text)
     temp = temp.lower()
     for word in temp.words:
         if word in collection:
             collection[word] += 1
         else:
             collection[word] = 1
Example #21
 def fit(self, X, y=None):
     words = []
     for x in X:
         x = TextBlob(x.lower())
         words += [word.lemmatize() for word in x.words]
     if self.num_words:
         words = Counter(words)
         self._vocab = [word for word, _ in words.most_common(self.num_words)]
     else:
         self._vocab = list(set(words))
     return self
Example #22
def analyse_titles(titles_file='all_titles.txt'):
    with open(os.path.join('csv', titles_file), 'rb') as text_file:
        text = text_file.read().decode('ascii', errors="replace")
    blob = TextBlob(text)
    word_counts = [[word, count]
                   for word, count in blob.lower().word_counts.items()
                   if word not in STOPWORDS and count > 1]
    bigrams = [
        ' '.join(bigram).lower() for bigram in blob.lower().ngrams(2)
        if stopwords_check(bigram)
    ]
    bigram_counts = [[word, count] for word, count in Counter(bigrams).items()
                     if count > 1]
    trigrams = [
        ' '.join(trigram).lower() for trigram in blob.lower().ngrams(3)
        if stopwords_check(trigram)
    ]
    trigram_counts = [[word, count]
                      for word, count in Counter(trigrams).items()
                      if count > 1]
    word_counts = sorted(word_counts, key=itemgetter(1), reverse=True)[:20]
    bigram_counts = sorted(bigram_counts, key=itemgetter(1), reverse=True)[:20]
    trigram_counts = sorted(trigram_counts, key=itemgetter(1),
                            reverse=True)[:20]
    np_counts = [[word, count]
                 for word, count in blob.lower().np_counts.items()
                 if count > 1]
    np_counts = sorted(np_counts, key=itemgetter(1), reverse=True)[:20]
    print '## Most frequent words\n'
    for term in word_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent bigrams\n'
    for term in bigram_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent trigrams\n'
    for term in trigram_counts:
        print '* {} ({})'.format(term[0], term[1])
    print '\n\n## Most frequent noun phrases\n'
    for term in np_counts:
        print '* {} ({})'.format(term[0], term[1])
Example #23
 def transform(self, X):
     vectors = []
     for x in X:
         x = TextBlob(x.lower())
         word_count = Counter(x.words)
         vector = [0] * len(self._vocab)
         for word, count in word_count.items():
             try:
                 idx = self._vocab.index(word)
                 vector[idx] = count
             except ValueError:
                 pass
         vectors.append(vector)
     return vectors
Example #24
def similarWords(faqWord, inputWords, printToWindow, useSynsets=True):
    # This method identifies whether two words are similar
    # Return True or False

    similarPath = False
    matchMethod = ''
    tbFaqWord = TextBlob(faqWord)
    lowerFAQWord = tbFaqWord.lower()
    lowerInputWord = inputWords[1]
    correctedInputWord = inputWords[2]
    if useSynsets:
        FAQWordSynset = lowerFAQWord.words[0].synsets

    # print("Words and Synset Lengths: " + str(lowerFAQWord) + "(" + str(len(lowerFAQWord.words[0].synsets)) + ") & " + str(
    #     lowerInputWord) + "(" + str(len(lowerInputWord.words[0].synsets)) + ") ")

    wordsAreSimilar = False
    if lowerFAQWord == lowerInputWord:
        wordsAreSimilar = True
        matchMethod = 'identical'
    elif lowerFAQWord == correctedInputWord:
        wordsAreSimilar = True
        matchMethod = 'corrected'
    elif lowerFAQWord.words.singularize(
    ) == correctedInputWord.words.singularize():
        wordsAreSimilar = True
        matchMethod = 'singularized'
    elif useSynsets:
        if len(FAQWordSynset) > 0:
            #correctInputWordSynsets = correctedInputWord.words[0].synsets
            correctInputWordSynsets = inputWords[3]
            if len(correctInputWordSynsets) > 0:
                # print("Compare Synsets: " + str(correctedFAQWord.words[0].synsets[0].path_similarity(correctedInputWord.words[0].synsets[0])))
                correctedWordSimilarity = FAQWordSynset[0].wup_similarity(
                    correctInputWordSynsets[0])
                if correctedWordSimilarity is not None:
                    if correctedWordSimilarity >= 0.65:
                        wordsAreSimilar = True
                        similarPath = True
                        matchMethod = 'similarity'

    # printToWindow = True
    if printToWindow and wordsAreSimilar:
        print("Words: " + str(lowerFAQWord) + " & " + str(lowerInputWord) +
              " (" + str(matchMethod) + ")")
        if similarPath:
            print("Synset path similarity: " + str(correctedWordSimilarity))

    return wordsAreSimilar, matchMethod
Example #25
    def _preprocess(text):
        ''' 
        Preprocess the text
            1. Converting numbers and variables to a dummy word "hey_num"
            2. Converting all the letters to lowercase
            3. Correcting any spelling mistakes
        '''
        processed_text = HeyAC._word_to_digit(text)
        processed_text, list_var = HeyAC._digit_to_dummy(processed_text)
        processed_text = TextBlob(processed_text)
        processed_text = processed_text.lower()
        #processed_text = processed_text.correct()
        processed_text = str(processed_text)

        return processed_text, list_var
Example #26
def simplebot(user_input):
    user_blob = TextBlob(user_input)
    lower_input = user_blob.lower()
    token_input = lower_input.words
    filtered_input = [w for w in token_input if w not in STOP_WORDS]
    response_set = set()
    for con_list in CONVERSATION:
        for sentence in con_list:
            sentence_split = sentence.split()
            if set(filtered_input).intersection(sentence_split):
                response_set.update(con_list)
    if not response_set:
        return "I'm sorry, ask again please!"
    else:
        return max(response_set, key=len)
Example #27
def similarWords(faq_word, input_words, print_to_window, use_synsets=True):
    # This method identifies whether two words are similar
    # Return True or False

    similar_path = False
    match_method = ''
    tb_faq_word = TextBlob(faq_word)
    lower_faq_word = tb_faq_word.lower()
    lower_input_word = input_words[1]
    corrected_input_word = input_words[2]
    if use_synsets:
        faq_word_synset = lower_faq_word.words[0].synsets

    # print("Words and Synset Lengths: " + str(lower_faq_word) + "(" + str(len(lower_faq_word.words[0].synsets)) + ") & " + str(
    #     lower_input_word) + "(" + str(len(lower_input_word.words[0].synsets)) + ") ")

    words_are_similar = False
    if lower_faq_word == lower_input_word:
        words_are_similar = True
        match_method = 'identical'
    elif lower_faq_word == corrected_input_word:
        words_are_similar = True
        match_method = 'corrected'
    elif lower_faq_word.words.singularize(
    ) == corrected_input_word.words.singularize():
        words_are_similar = True
        match_method = 'singularized'
    elif use_synsets:
        if len(faq_word_synset) > 0:
            #correct_input_word_synsets = corrected_input_word.words[0].synsets
            correct_input_word_synsets = input_words[3]
            if len(correct_input_word_synsets) > 0:
                corrected_word_similarity = faq_word_synset[0].wup_similarity(
                    correct_input_word_synsets[0])
                if corrected_word_similarity is not None:
                    if corrected_word_similarity >= 0.65:
                        words_are_similar = True
                        similar_path = True
                        match_method = 'similarity'

    # printToWindow = True
    if print_to_window and words_are_similar:
        print("Words: " + str(lower_faq_word) + " & " + str(lower_input_word) +
              " (" + str(match_method) + ")")
        if similar_path:
            print("Synset path similarity: " + str(corrected_word_similarity))

    return words_are_similar, match_method
Example #28
def createInputWordTuple(inputSentence, useSynsets=True):
    #This method creates a tuple for each word in the inputSentence
    correctedInputWordSynsets = ''

    inputMessageWords = []  # input Words
    # <Word, lowerWord, correctedWord, wordSynset>
    for inputWord in inputSentence.words:
        tbInputWord = TextBlob(inputWord)
        lowerInputWord = tbInputWord.lower()
        correctedInputWord = lowerInputWord.correct()
        if useSynsets:
            correctedInputWordSynsets = correctedInputWord.words[0].synsets
        inputMessageWords.append(
            (inputWord, lowerInputWord, correctedInputWord,
             correctedInputWordSynsets))

    return inputMessageWords
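The tuples built above feed the similarWords helper from Example #24. A small wiring sketch, assuming the NLTK WordNet data used for synsets is available; the sample words are arbitrary.

# each element is one <word, lowerWord, correctedWord, synsets> tuple
input_words = createInputWordTuple(TextBlob("opening hours"))
matched, method = similarWords("hour", input_words[1], False)
print(matched, method)  # expected to match via singularization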
Example #29
def textblob_ngrams(sentence,
                    n=3,
                    remove_stopwords=False,
                    all_lower_case=False):
    ''' Takes in a sentence and returns the n-grams of that sentence
        as boolean features.

        @Arguments:
            sentence -- Chosen sentence to tokenize, type(sentence) = String

            n (optional) -- size of the n-grams to extract. Defaults to 3

            remove_stopwords (optional) -- if True, stopwords in the sentence
                will not be included as features. Currently only for English
                text. Defaults to False

            all_lower_case (optional) -- if True, the sentence is lowercased
                before feature extraction. Defaults to False

        @Return:
            Dict of features of the following form:
                {ngram_1: True, ngram_2: True, ... , ngram_n: True}
    '''

    sentence = TextBlob(sentence)
    features = dict()
    clean_string = ''

    # Changes all word features to lower case if true
    if all_lower_case:
        sentence = sentence.lower()

    # Removes stopwords
    for word in sentence.words:
        # Removes word from features if in nltk.corpus.stopwords('english')
        if remove_stopwords:
            if word.string in stopwords:
                continue
        clean_string += ''.join([word, ' '])

    # ngrams(n) returns WordLists; join each one so it can be used as a dict key
    for ngram in TextBlob(clean_string).ngrams(n):
        features[' '.join(ngram)] = True
    return features
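A minimal call to the feature extractor above; the stopwords collection it checks against is assumed to be defined at module level, as in the original.

# Hypothetical sentence; with n=2 each bigram becomes a boolean feature
print(textblob_ngrams("The quick brown fox jumps over the lazy dog", n=2))
# -> {'The quick': True, 'quick brown': True, ...}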
Example #30
 def findmostusedword(excluded):
     wordlist = []
     nouns = ['NN', 'NNS', 'NNP', 'NNPS']
     prev = [0, 0]
     for headline in news:
         for word in headline.split():
             word = word.lower().replace('--', '')
             wordlist.append(word)
     for word in wordlist:
         if not word in nouse:
             word = TextBlob(word)
             if word.tags == []:
                 continue
             if word.tags[0][1] in nouns:
                 if wordlist.count(
                         word.lower()) > prev[1] and word.tags[0][0] != "-":
                     prev = [word.tags[0][0], wordlist.count(word)]
     return (prev)
Example #31
def analyze(liste_domaine, liste_tweet):
    # takes as input a list of tweet texts and
    # a list containing a domain's lexical field, IN LOWERCASE
    ### COLLECTS THE WORDS FROM THE MATCHING LEXICAL FIELD FOUND IN THE TWEETS
    mot_a_trouver = liste_domaine

    tweet_a_analyser = liste_tweet

    L = []
    # the domain words found in the tweets are appended to the list L
    for tweet in liste_tweet:
        current_tweet = TextBlob(tweet)
        current_tweet = current_tweet.lower()
        words = current_tweet.words
        for word in words:
            if word in mot_a_trouver:
                L.append(word)
    return (L)
Example #33
def bias(text_file):
    bias = 0
    tokenized = tokenize(text_file)
    red_flags = ["alien", 'evil', 'monster', 'good', 'aliens']
    for w in range(len(tokenized)):
        word = TextBlob(tokenized[w])
        if word.sentiment.polarity < -.5:
            bias = bias + .5
        if word.sentiment.polarity > .5:
            bias = bias + .5
        if word.sentiment.polarity > -.5 and word.sentiment.polarity < 0:
            bias = bias + .25
        if word.sentiment.polarity < .5 and word.sentiment.polarity > 0:
            bias = bias + .25
        if word.sentiment.polarity > .5:
            bias = bias + 1
        if word.sentiment.polarity < -.5:
            bias = bias + 1
        if word.lower() in red_flags:
            bias = bias + 2
            print(word)
    return (bias / len(tokenized)) * 100
Example #34
def hello_monkey():
    """Respond to incoming calls with a simple text message."""
    resp = twiml.Response()
    message=""
    name=""

    fromNumber = request.values.get('From',None)
    myNumber = request.values.get('To',None)
    
    body = request.values.get('Body')
    body = body.decode("ascii", errors="ignore")
    blob = TextBlob(body)


    NLPObject = NLPStuff(resp, blob, message)

    counter = storeCookies(blob)
    message+= salutationToCaller(message, fromNumber, myNumber, counter) 

    if "help" in blob.lower():
       message="This is an information HELP message please tell me what to do"

    return setMessage(message, name, myNumber, counter, body, blob, resp)
Example #35
def feature_extractor(text):
    if not isinstance(text, TextBlob):
        text = TextBlob(text.lower())

    return {
        'has_rumor': 'rumor' in text.words,
        'has_gosip': 'gosip' in text.words,
        'has_urbanesia': 'urbanesia' in text.words,
        'has_batista': 'batista' in text.words,
        'has_harahap': 'harahap' in text.words,
        'has_pemasaran': 'pemasaran' in text.words,
        'has_saham': 'saham' in text.words,
        'has_hackathon': 'hackathon' in text.words,
        'has_ipo': 'ipo' in text.words,
        'has_akuisisi': 'akuisisi' in text.words,
        'has_startup': 'startup' in text.words,
        'has_android': 'android' in text.words,
        'has_aplikasi': 'aplikasi' in text.words,
        'has_payment': 'payment' in text.words,
        'has_pembayaran': 'pembayaran' in text.words,
        'has_api': 'api' in text.words,
        'has_kompetisi': 'kompetisi' in text.words,
        'has_ide': 'ide' in text.words,
        'has_permainan': 'permainan' in text.words,
        'has_game': 'game' in text.words,
        'has_fundraising': 'fundraising' in text.words,
        'has_askds': '[Ask@DailySocial]' in text.words,
        'has_investasi': 'investasi' in text.words,
        'has_musik': 'musik' in text.words,
        'has_lagu': 'lagu' in text.words,
        'has_bhinneka': 'bhinneka' in text.words,
        'has_marketplace': 'marketplace' in text.words,
        'has_mobile': 'mobile' in text.words,
        'has_cto': 'cto' in text.words,
        'has_traffic': 'traffic' in text.words,
        'starts_with_[': text[0] == '['
    }
    def tweet_analyser(self, ch, method, properties, body):

        # Extract the json from the tweet
        json_body = json.loads(body.decode('utf-8'))
        text = json_body['text']
        hashtags = json_body['hashtags']

        # Use TextBlob for sentiment analysis on the tweet and extract the sentiment
        text_blob = TextBlob(text)
        json_body['polarity'] = text_blob.sentiment.polarity

        # Try to extract the tweeted team from the names used for both teams
        tweeted_team = self.get_team_by_names(text)

        # If we managed to extract a team, save the tweet along with the name of the team
        if tweeted_team is not None:
            json_body['team'] = tweeted_team.name
            self.collection.insert_one(json_body)
        # If not, try to extract the tweeted team based on the teams' hashtags
        else:
            tweeted_team = self.get_team_by_players(text_blob.lower().words)

            if tweeted_team is not None:
                json_body['team'] = tweeted_team.name
                self.collection.insert_one(json_body)
            else:

                if hashtags is not None:
                    tweeted_team = self.get_team_by_hashtags([x['text'].lower() for x in hashtags])

                if tweeted_team is not None:
                    json_body['team'] = tweeted_team.name
                    self.collection.insert_one(json_body)
                else:
                    json_body['team'] = 'None'
                    self.collection.insert_one(json_body)
                    print("IGNORED : {}".format(text))
Example #37
def getContent(tweet):
    txt = TextBlob(tweet['text'].split('https', 1)[0], np_extractor=extractor)
    txt = txt.lower()
    NP = txt.noun_phrases
    Subj = txt.pos_tags
    sentiment = txt.sentiment

    # Convert Noun Phrases from unicode to str before adding
    npToAdd = list()
    for np in NP:
        np = np.encode('ascii', 'ignore')
        npToAdd.append(np)

    # Filter words for greater importance (nouns, proper nouns, etc.)
    SubjToAdd = list()
    for word in Subj:
        # print word
        if word[1] in pos:  # Looking for nouns, or subject (i.e. movie, music, color)
            SubjToAdd.append(word)

    # Create new Tweet objects
    twt = Tweet()
    twt.addNew(tweet, npToAdd, SubjToAdd, sentiment)
    listOfTweets.append(twt)
Example #38
0
def tweet_to_feat(tweet, features):
    tb = TextBlob(tweet)
    #lang = tb.detect_language()
    words = [word.lemma for word in tb.lower().tokenize()]
    return [words.count(feature) for feature in features]
Example #39

from __future__ import print_function

from textblob import TextBlob
from nltk.stem.wordnet import WordNetLemmatizer
import sys

lmtzr = WordNetLemmatizer()

for line in sys.stdin.readlines():
    blob = TextBlob(line.strip())

    sys.stdout.write("Detected language: {}\n".format(blob.detect_language()))
    sys.stdout.write("This message had {} words.\n".format(len(blob.words)))
    sys.stdout.write("Corrected sentence\n{}\n".format(blob.lower().correct()))
    proper_nouns = [tag[0] for tag in blob.tags if tag[1] == 'NNP']
    verbs = [lmtzr.lemmatize(tag[0], 'v') for tag in blob.tags if 'V' in tag[1]]
    sys.stdout.write("I found these proper nouns: {}\n".format(proper_nouns))
    sys.stdout.write("I found these verbs: {}\n".format(verbs))

    sentiment = blob.sentiment
    sys.stdout.write("Sentiment for that message: {}\n".format(sentiment))
    if sentiment.polarity > 0 and sentiment.subjectivity > 0.7:
        sys.stdout.write("That sounds amazing!\n")
    elif sentiment.polarity < 0 and sentiment.subjectivity > 0.7:
        sys.stdout.write("It'll get better.\n")
    else:
        sys.stdout.write("Meh.\n")

    sys.stdout.flush()
Example #40
            if doc > 0 and doc % 1000 == 0:
                print '\t', doc

        for kw in filter(lambda k: '_' in k, ngram):
            keyword = kw.replace('_', ' ')

            kw_tb = TextBlob(keyword)

            # filter out punctuation, etc (make sure that there are two non-punc words)
            if len(kw_tb.words) < 2:
                continue

            # add keywords which are all proper nouns
            distinct_tags = set(t[1] for t in kw_tb.tags)
            if distinct_tags - {'NNP', 'NNPS'} == {}:
                dataset[doc][field].add(kw_tb.lower())
                continue

            # add noun phrases
            for np in kw_tb.lower().noun_phrases:
                dataset[doc][field].add(np)

    # convert set into list for json serialization
    for d in dataset:
        d[field] = list(d[field])

        # fix trailing " 's" (write the stripped value back into the list)
        for i, np in enumerate(d[field]):
            if np.endswith(" 's"):
                d[field][i] = np[:-3]

# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])


# list the words
review.words


# list the sentences
review.sentences


# some string methods are available
review.lower()


# ## Part 6: Stemming and Lemmatization

# **Stemming:**
# 
# - **What:** Reduce a word to its base/stem/root form
# - **Why:** Often makes sense to treat related words the same way
# - **Notes:**
#     - Uses a "simple" and fast rule-based approach
#     - Stemmed words are usually not shown to users (used for analysis/indexing)
#     - Some search engines treat words with the same stem as synonyms

# initialize stemmer
stemmer = SnowballStemmer('english')
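# quick comparison of the two approaches on the review loaded above
# (illustrative only; lemmatize() needs the NLTK WordNet corpus)
[stemmer.stem(word) for word in review.words[:10]]    # stemmed forms
[word.lemmatize() for word in review.words[:10]]      # lemmatized forms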
    now = datetime.datetime.now()

    normSource = normmd.tables.get('Source')
    sources = [dict(row) for row in normcon.execute(select([
            normSource.c.Id,
            normSource.c.Name,
            normSource.c.Content
        ]).where(
            normSource.c.Content.isnot(None)
        ))]
    lemmafrequency = {}
    for source in sources:
        if args.verbosity > 1:
            print "Reading source: " + source['Name']
        content = TextBlob(source['Content'])
        noun_phrases = content.lower().noun_phrases
        lemmas = noun_phrases.lemmatize()
        for lemma in lemmas:
            if lemma in lemmafrequency.keys():
                lemmafrequency[lemma] += 1
            else:
                lemmafrequency[lemma] = 1

        if args.limit > 0:
            args.limit -= 1
            if args.limit == 0:
                break

    normNode = normmd.tables.get('Node')
    normTagging = normmd.tables.get('Tagging')
    nounPhraseNode = normcon.execute(select([
Example #43
def find_tweet(tweet, place_list):
    t = TextBlob(unicode(tweet))
    tweet_loc = []
    for word in t.lower().tokenize():
        if word in place_list:
            tweet_loc = tweet_loc + [word]