Example #1
import re

from textblob import TextBlob


def find_unknown_speakers(line, known_speakers):
    '''
    Tests whether a line starts with a known speaker. Prints the line if it
    does not, so new unknown speakers can be spotted.
    '''
    speaker_is_known = False
    text = TextBlob(line)
    text = text.replace('\n', '')
    nws_text = text.replace(' ', '')
    for test_speaker in known_speakers:
        if text.startswith(test_speaker):
            speaker_is_known = True
    if not speaker_is_known and nws_text != '':
        text = str(text)
        print_text = re.sub("\[.*?\]", "[]", text)
        print_text = re.sub("\{.*?\}", "{}", print_text)
        if print_text != '[]' and print_text != '{}':
            print(print_text)  #,'|||',text
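# A minimal usage sketch (the sample lines and speaker list are made up for
# illustration; only the line from the unrecognised speaker should be printed):
sample_lines = ['ALICE: Good morning.', 'CAROL: Who let me in here?']
for raw_line in sample_lines:
    find_unknown_speakers(raw_line, ['ALICE', 'BOB'])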
from typing import Tuple

from textblob import TextBlob


def np_swap(aye: str, bee: str) -> Tuple[str, str]:
    """Swap every other noun phrase."""
    a, b = TextBlob(aye), TextBlob(bee)
    a_nps, b_nps = a.noun_phrases, b.noun_phrases

    i = 0
    while i < len(a_nps) and i < len(b_nps):
        if i % 2 != 0:
            anp, bnp = a_nps[i], b_nps[i]
            a = a.replace(anp, bnp)
            b = b.replace(bnp, anp)
        i += 1

    return (a.raw, b.raw)
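# Usage sketch for np_swap (the sentences are made up; the exact phrases
# swapped depend on TextBlob's noun-phrase extractor):
left, right = np_swap('the red car passed the old bridge',
                      'a tall tree shaded the small house')
print(left)
print(right)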
Example #3
def do_clean(line, known_speakers):
    speaker_is_known = False
    text = TextBlob(line)
    text = text.replace('\n', '')
    #nws_text = text.replace(' ','')
    for test_speaker in known_speakers:
        if text.startswith(test_speaker):
            speaker_is_known = True
            speaker = test_speaker
            clean_line = remove_prefix(text, speaker)
            return speaker, clean_line
    #if speaker_is_known == False and nws_text != '':
    #	print(text)
    return '-', '-'
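# do_clean relies on a remove_prefix() helper that is not shown here; the
# stand-in below and the sample call are illustrative assumptions only.
def remove_prefix(text, prefix):
    # drop the speaker prefix plus any trailing separator/whitespace
    return str(text)[len(prefix):].lstrip(': \t')

speaker, clean_line = do_clean('ALICE: Good morning.', ['ALICE', 'BOB'])
print(speaker, '->', clean_line)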
Example #4
def calcSentiment(input):
    ''' Calculate the sentiment of each tweet in a Twitter file and average it '''
    # Part of code based on master thesis of Guangxue Cao
    tweets_data = []
    sentiment_array = []
    total = 0
    OneTweetTime = ""
    average_sentiment = 0

    # load input
    for line in input:
        tweets_data.append(line)

    # iterate over all tweets
    for tweet_data in tweets_data:
        tweet = tweet_data["text"]
        # analyze tweet with TextBlob to gain sentiment
        tweet = TextBlob(tweet)

        OneTweetTime = tweet_data["created_at"]

        # normalise line breaks
        tweet = tweet.replace("\n", " ")
        tweet = tweet.replace("\r", " ")
        sentiment = tweet.sentiment.polarity
        sentiment_array.append(sentiment)

    for sentiment in sentiment_array:
        total += sentiment

    if len(sentiment_array) != 0:
        average_sentiment = total / len(sentiment_array)
        return [OneTweetTime, average_sentiment]
    # writer.writerow([OneTweetTime,"sentiment:", average_sentiment])


# tweet = TextBlob(tweet,analyzer=NaiveBayesAnalyzer())
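# Usage sketch: calcSentiment expects an iterable of tweet-like dicts with
# "text" and "created_at" keys (the two records below are fabricated):
sample_tweets = [
    {"text": "I love this!", "created_at": "Mon Jan 01 12:00:00 +0000 2024"},
    {"text": "This is terrible.", "created_at": "Mon Jan 01 12:01:00 +0000 2024"},
]
print(calcSentiment(sample_tweets))  # [last created_at, average polarity]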
def get_category(txt):
    category_counte = Counter()

    blob = TextBlob(str(txt))
    blob = blob.replace("-", " ")
    for item in list(blob.noun_phrases):
        bob = TextBlob(item)
        category_counte[item] = sigmoid(
            float(bob.polarity) / (float(bob.subjectivity) + 1))

    if category_counte:
        # return the highest-scoring noun phrase
        return str(category_counte.most_common(1)[0][0])
    else:
        return "others"
def pos_swap(aye: str, bee: str, pos: str) -> Tuple[int, str, str]:
    """Swap a given part of speech."""
    # TODO: Replace instances of subword in own definition.
    swaps = 0
    a, b = TextBlob(aye), TextBlob(bee)
    apos, bpos = a.pos_tags, b.pos_tags
    aps = [p for p in apos if p[1] == pos]
    bps = [p for p in bpos if p[1] == pos]

    for i in range(min([len(aps), len(bps)])):

        ap = aps[i][0]
        bp = bps[i][0]
        a = a.replace(ap, bp)
        b = b.replace(bp, ap)
        swaps += 1

    return (swaps, a.raw, b.raw)
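# Usage sketch for pos_swap, reusing the imports shown with np_swap above
# (sentences are made up; 'NN' asks it to swap singular common nouns):
swaps, left, right = pos_swap('the cat sat on the mat',
                              'a dog slept in the yard', 'NN')
print(swaps)
print(left)
print(right)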
Example #8
def inputNumber(message):
    while True:
        try:
            userInput = int(input(message))
        except ValueError:
            print("Invalid input. Please enter a number: 1, 2, 3, or 4.")
            continue
        if userInput not in [1, 2, 3, 4]:
            print("Invalid integer. Please enter 1, 2, 3, or 4.")
            continue
##############################################################################################################
#######--------CHOICE-#1:-DOCUMENT-FILE----------------------------------------------------------##############
##############################################################################################################
        if userInput == 1:
            docchoice = input("Please enter the name of the Text File.\n")
            with open(docchoice, 'r') as sourcedoc:
                readsource = sourcedoc.read()
            lowfile = readsource.lower()
            #            filesoup = BeautifulSoup(lowfile,'lxml')
            #            filetext = filesoup.get_text(strip = True)
            #            sent = TextBlob(filetext)
            sent = TextBlob(lowfile)
            slashsplice = sent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            sentblob = TextBlob(lowfile)
            filepunct = TextBlob(str(remove_punctuation(dashsplice2)))
            finaltext = str(remove_punctuation(dashsplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sentblob.sentiment.polarity
            subject = sentblob.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in filepunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            #            splitpunct = filepunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '2', '1', '0',
                '–', '’', '’', '“', '”'
            ]
            #            tokens = [w for w in splitpunct]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n-------30 MOST COMMON WORDS-------: \n")
            for key, value in count.most_common(30):
                print("   " + str(value) + " - " + key)
            print("\n-------FREQUENCY CHART-------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(15, cumulative=False)
            ##---------------PHRASE (1,2,3,4 WORDS) COUNTER----------------------------------------
            bitokens = nltk.word_tokenize(finaltext)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
####---------------------READABILITY INDEX--------------------###########
            flesh = int(textstat.flesch_reading_ease(readsource))
            print("--------FLESCH-KINCLAID TEST--------\n",
                  "\n    Readability Score: ", flesh)
            if flesh < 30:
                print(
                    "    Very difficult to read. Best understood by university graduates."
                )
            elif flesh < 50:
                print("    Difficult to read.")
            elif flesh < 60:
                print("    Fairly difficult to read.")
            elif flesh < 70:
                print(
                    "    Plain English. Easily understood by 13- to 15-year-old students."
                )
            elif flesh < 80:
                print("    Fairly easy to read.")
            elif flesh < 90:
                print("    Easy to read.")
            else:
                print(
                    "    Very easy to read. Easily understood by an average 11-year-old student."
                )
            print("-----------------------------------\n")

            ##################---END. LOOP---##########################################################################################################
            again = input(
                "\nThank you for using BTL 0.6. Run Again? [Y / N]\n")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                quit()
            while again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

##############################################################################################################
####----------CHOICE-#2:-URL/LINK-------------------------------------------------------------------------------
##############################################################################################################
        if userInput == 2:
            webchoice = input("Please enter the URL of the website.\n")
            webdoc = urllib.request.urlopen(webchoice)
            readweb = webdoc.read().decode('utf-8', errors='ignore')
            websoup = w3lib.html.remove_tags(readweb)
            #            websoup = BeautifulSoup(readweb,'html5lib')
            #  websoup2 = websoup.text
            print(websoup)
            lowweb = websoup.lower()
            websent = TextBlob(lowweb)
            slashsplice = websent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            dashsplice3 = (dashsplice2.replace(' – ', ' '))
            pagesplice = dashsplice3.replace(' p. ', ' ')
            pagesplice2 = pagesplice.replace(' pp.', ' ')
            webpunct = TextBlob(str(remove_punctuation(pagesplice2)))
            finalweb = str(remove_punctuation(pagesplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = websent.sentiment.polarity
            subject = websent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in webpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', " ",
                'mwparseroutput', 'wwww3org', 'xmlnshttp', 'also', '1', '0',
                'svg', '2', 'jw', '’', '“', '”', 'u'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n---------MOST COMMON WORDS---------: \n")
            for key, value in count.most_common(30):
                print("   " + key + " - " + str(value))
            print("\n---------FREQUENCY CHART---------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (1,2,3,4) COUNTER----------------------------------------
            ###################################################################################
            bitokens = nltk.word_tokenize(finalweb)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(20)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(20)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(20)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
    #################################################################################################
    ##---------------READABILITY INDEX----------------------------------------
    ###################################################################################
    ##########---------------END LOOP---------------------##############################
            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            while again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

########################################################################################################################
############--------CHOICE-#3:-MANUAL-INPUT----------########################################
############################################################################################################

        if userInput == 3:
            manchoice = input("Please enter your text here:\n")
            lowman = manchoice.lower()
            mansoup = BeautifulSoup(lowman, 'html5lib')
            mantext = mansoup.get_text(strip=True)
            mansent = TextBlob(mantext)
            sent = TextBlob(manchoice)
            manpunct = TextBlob(str(remove_punctuation(mansent)))
            finalman = str(remove_punctuation(mansent))
            splitpunct = manpunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '0', '–', '’',
                '“', '”', '’'
            ]
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sent.sentiment.polarity
            subject = sent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in manpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            #            tokens = [w.translate(punctuate) for w in lemmatized_list]
            tokens = [w for w in splitpunct]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '—'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n------35 MOST COMMON WORDS------: \n")
            for key, value in count.most_common(35):
                print("   " + key + " - " + str(value))
            print("\n------FREQUENCY CHART------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (1,2,3,4 WORDS) COUNTER----------------------------------------
            ##################################################################################
            bitokens = nltk.word_tokenize(finalman)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print(
                    "   ",
                    key,
                    "",
                    key2,
                    "",
                    key3,
                    "",
                    key4,
                    "-",
                    value,
                )
    ######---------------READABILITY INDEX#----------------####
            flesh = int(textstat.flesch_reading_ease(manchoice))
            print("\n----------FLESCH-KINCLAID TEST----------:\n",
                  "\n    Readability Score: ", flesh, "\n")
            if flesh < 30:
                print(
                    "    --Very difficult to read. Best understood by university graduates.--"
                )
            elif flesh < 50:
                print("    --Difficult to read.--")
            elif flesh < 60:
                print("    --Fairly difficult to read.--")
            elif flesh < 70:
                print(
                    "    --Plain English. Easily understood by 13- to 15-year-old students.--"
                )
            elif flesh < 80:
                print("    --Fairly easy to read.--")
            elif flesh < 90:
                print("    --Easy to read.--")
            else:
                print(
                    "    --Very easy to read. Easily understood by an average 11-year-old student.--"
                )
            print("\n------------------------------------------\n")

            again = input("\nThank you for using BTL 0.3. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            while again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break
###################################################################################################################
##########---------CHOICE 4: QUIT PROGRAM-------------------------------------------------------------------------------
######################################################################################################################
        if userInput == 4:
            print("Thank you for using BTL 0.5. Bye!")
            quit()
            break
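# Entry-point sketch (the prompt text below is illustrative, not from the
# original program):
if __name__ == '__main__':
    inputNumber("Choose an option: 1) text file  2) URL  3) manual text  4) quit\n")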
#Training the classifier on the body dataset
with open("dataset2.json", 'r', encoding="utf-8-sig") as fp2:
	cl2 = NaiveBayesClassifier(fp2, format="json")

#Taking the string values
str1 = str(headline)
headline = TextBlob(str1)
body = str(body)
tb_body = TextBlob(body)
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

#Finding the subjectivity
headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

#Finding the tags in the sentence
array = headline.tags
array1 = []

#Finding the hot words
for ii in array:
	name, tag = ii
	name = str(name)
	name = name.lower()
	if(tag.count('NN')>0):
		name = TextBlob(name)
		array1.append(name)
Example #10
    pickle.dump(cl2, f2)
    f2.close()
    print("Pickle created")

#Taking the string values
str1 = str(headlines)
headline = TextBlob(str1)
body = str(articles)
tb_body = TextBlob(body)
subjectivity = tb_body.sentiment.subjectivity
subjectivity = float(subjectivity) * 100
body_classify = str(cl2.classify(body))
body = body.lower()

#Finding the subjectivity
headline = headline.replace('Was', '')
headline = headline.replace('was', '')
headline = headline.replace('’', '')

#Finding the tags in the sentence
array = headline.tags
array1 = []

#Finding the hot words
for ii in array:
    name, tag = ii
    name = str(name)
    name = name.lower()
    if (tag.count('NN') > 0):
        name = TextBlob(name)
        array1.append(name)
Example #11
import pandas as pd
from textblob import TextBlob
import os

data = pd.read_csv('../../gen/data-preparation/temp/parsed-data.csv',
                   sep='\t',
                   encoding='utf-8')
data.head()

for i, j in data.iterrows():
    print(i)

    try:
        blob = TextBlob(j['text'])
        blob = blob.replace('.', ' ').replace(',', ' ').replace('#', ' ').replace('!', ' ').replace('?', ' ')\
            .replace(':', ' ').replace(';', ' ').replace('&', ' ').replace('/', ' ')
        data.loc[i, 'polarity'] = blob.sentiment.polarity
        data.loc[i, 'subjectivity'] = blob.sentiment.subjectivity
    except Exception:
        data.loc[i, 'polarity'] = ''
        data.loc[i, 'subjectivity'] = ''

data.head()

os.makedirs('../../gen/data-preparation/output/', exist_ok=True)

data.to_csv('../../gen/data-preparation/output/dataset.csv', index=False)

print('done.')
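# Note: parsed-data.csv is assumed to contain (at least) a 'text' column; the
# loop above appends per-row polarity and subjectivity before the enriched
# dataset is written back out.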
        continue
    else:
        wordList = wordList + word + " "


negString = ''
posString = ''
neutString = ''

for word in bigString.split():
    tb = TextBlob(word)

    if ('https') in tb or ('@') in tb or ("re") in tb or ("rt") in tb or (".") in tb:
        continue
    elif ('#') in tb:
        tb = tb.replace("#", "")
        if tb.polarity > 0 :
            str(tb)
            posString = posString + word + " "
        elif tb.polarity < 0 :
            str(tb)
            negString = negString + word + " "
        else:
            str(tb)
            neutString = neutString + word + " "
    elif len(tb) <= 3:
        continue
    else:
        tb = tb.replace("#", "")
        if tb.polarity > 0 :
            str(tb)
auth.set_access_token(config['access_token'], config['access_token_secret'])

api = tweepy.API(auth)

# Load text file
filename = open("./txt/marx2.txt", 'r', encoding='utf-8')
text = filename.readlines()
text = ' '.join(text)
filename.close()

blob = TextBlob(text)
tags = blob.tags

for blobs in blob.tags:
	if blobs[1] == 'NNP':
		wordchange = '#' + blobs[0]
		blob = blob.replace(blobs[0], wordchange)
		print("changing: " + wordchange)

for sentence in blob.sentences:
	sentence = re.sub(r'#+', '#', str(sentence))
	print(sentence)
	print("--")

	try:
		print("next tweet: " + str(sentence))
		api.update_status(sentence)
		time.sleep(120)  # tweet every 2 minutes
	except Exception:
		continue
#blob.translate(to="es")  # 'La amenaza titular de The Blob...'
Example #14
#if conn is not None:
#   conn.execute(sql_create_projects_table)
#  print("table should be created")
#else:
#   print("Error! cannot create the database connection.")

## Basic Functionality -- Inputs Text Generates Output
st.subheader(
    """ Basic Text Sentiment Functionality - text from any Social Media Platform """
)
opinionInput = st.text_input('Type in some text')
if (st.button('Generate Sentiment!') and opinionInput != ""):
    opinion = TextBlob(opinionInput, analyzer=NaiveBayesAnalyzer())
    st.write(opinion.sentiment)
    #Write insert statement here
    test = opinion.replace(" ", "") + " " + opinion.sentiment[0]
    strippedOpinion = str(opinion).replace(" ", "")
    resultingSentiment = str(opinion.sentiment[0])
    #qry = "INSERT INTO entries(entry,sentiment) VALUES( '"+strippedOpinion+ "' , '"+resultingSentiment+"');"
    #conn.execute(qry)
    #conn.commit()
else:
    st.write('Provide an Input!')
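# Note: this block is meant to run inside a Streamlit app (e.g. started with
# `streamlit run app.py`); TextBlob's NaiveBayesAnalyzer trains on the NLTK
# movie_reviews corpus, so the TextBlob corpora must be downloaded first.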

st.subheader(
    """ Twitter Sentiment Functionality - queries text from Twitter Social Media Platform """
)
twitterOpinionInput = st.text_input('Enter text')
if (st.button('Generate Sentiment') and twitterOpinionInput != ""):
    stro = ""
    for tweets in api.search(q=twitterOpinionInput, lang="en"):
Example #15
data = pd.read_csv('../../gen/data-preparation/temp/parsed-data.csv', sep = '\t')
data.head()

DetectorFactory.seed = 0
analyser = SentimentIntensityAnalyzer()
good_words = ['spectacular', 'good', 'great', 'best', 'goat', 'incredible', 'amazing', 'crazy', 'insane', 'fire']
delete_words = ['$', '%', '=', '»', '«', '@', '  ', '£', '§', '€', '*']

for i, j in data.iterrows():
    print(i)
    time=0
    date = str(j['created_at'])
    
    blob = TextBlob(str(j['text']).lower())
    for d in delete_words:
        blob = blob.replace(d, '')

    if 'RT' in str(j['text']):
        data.loc[i, 'retweet'] = True
    else:
        data.loc[i, 'retweet'] = False
            
    try:
        date = date.split(' ')
        hour = date[3].split(':')
        time += float(hour[0]) + float(hour[1])/60

        data.loc[i, 'hour'] = time
        data.loc[i, 'language'] = detect(str(j['text']))
        data.loc[i, 'polarity'] = blob.sentiment.polarity
        data.loc[i, 'subjectivity'] = blob.sentiment.subjectivity
Example #16
import os

from textblob import TextBlob
from textblob.en import Spelling

path = "spelling-model-weighted.txt"
assert os.path.isfile(path)
spelling = Spelling(path=path)
MOCKDATA = "hi i dont spel"
test = TextBlob(MOCKDATA)
test1 = test.replace('dont', "don't")
test1 = test1.replace('doesnt', "doesn't")
test1 = test1.replace('didnt', "didn't")
test1 = test1.replace('wont', "won't")
test1 = test1.replace('wouldve', "would've")
test1 = test1.replace('cant', "can't")
test1 = test1.replace('couldnt', "couldn't")
test1 = test1.replace('couldve', "could've")
test1 = test1.replace('shouldnt', "shouldn't")
test1 = test1.replace('shouldve', "should've")
test1 = test1.replace('mightve', "might've")
test1 = test1.replace('havent', "haven't")
test1 = test1.replace('lets', "let's")

print(test1)
for word in test1.words:
    print(spelling.suggest(word))
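# For comparison, TextBlob's built-in corrector (its default English spelling
# model, not the custom one loaded above) can be applied to the whole blob:
print(TextBlob(MOCKDATA).correct())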
            word = translator.translate(word, dest=detect).text  
            print("recognised " + word)
            word = translator.translate(word, dest='en').text   #translating the language to 'english' for search
        print("\nSearching...", word)

     #Working with browser
    driver = webdriver.Chrome()
    if(s_engine in ['Google', 'google']):
        driver.get('https://google.com')
        searchbox = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[1]/div/div[2]/input')
        searchbox.send_keys(word)
        searchButton = driver.find_element_by_xpath('//*[@id="tsf"]/div[2]/div[1]/div[3]/center/input[1]')
        searchButton.click()
    elif(s_engine in ['YouTube', 'Youtube', 'youtube']):
        if(" " in searchItem):
            searchItem = word.replace(" ", "+")
        searchItem = "https://www.youtube.com/results?search_query=" + word
        driver.get(searchItem)
    elif(s_engine in ['Wikipedia', 'Wiki', 'wikipedia', 'wiki']):
        driver.get('https://en.wikipedia.org/wiki/Main_Page')
        searchbox = driver.find_element_by_xpath('//*[@id="searchInput"]')
        searchbox.send_keys(word)
        searchButton = driver.find_element_by_xpath('//*[@id="searchButton"]')
        searchButton.click()
    else:
        print("We couldn't recognize the search engine")
    print("\nExecution successful.")
    
except Exception as e:
    print(e)
    print("Service time-out !")
# ***
# ### 9.0 Successful and unsuccessful projects histogram (by count) per category

# In[41]:

# Loop over all the keywords in all the data.keywords and increment the counts in the appropriate counter objects
from textblob import TextBlob
blob = TextBlob(data.keywords[6])

# In[42]:

data.keywords[6]

# In[43]:

blob = blob.replace("-", " ")

# In[44]:

blob.noun_phrases

# In[45]:

blob.sentiment

# In[46]:


def get_category(txt):
    category_counte = Counter()
#Append the json file
with open(input_file) as input_novartis:
    for line in input_novartis:
        tweets_novartis.append(json.loads(line))

#parsing the text and date data
with open(output_file, 'w', newline='') as output_novartis:
    writer = csv.writer(output_novartis)

    for tweet_novartis in tweets_novartis:

        tweet = tweet_novartis["full_text"]
        lan = tweet_novartis["created_at"]

        #Sentiment Analysis
        tweet = TextBlob(tweet)

        tweet = tweet.replace("\n", " ")
        tweet = tweet.replace("\r", " ")

        # note: set_date is defined but never called in this snippet
        def set_date(self, lan):
            date = time.strptime(lan, '%b %d %Y ')

            self.date = datetime.fromtimestamp(time.mktime(date))

        #Sentiment score
        sentiment = [tweet.sentiment.polarity]

        writer.writerows(zip(sentiment, [lan[4:10], lan[26:]]))
    # counter+=1
    #if counter>10: break 
    if (len(text)>1): 
      text = re.sub(r'@\w*',' ', text)
      text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
      text = re.sub(r'[\n]*', '', text)
      text = text.strip()
      text = text.replace('  ', ' ')
      text= text.replace('  ', ' ')   
      text_blob = TextBlob(text)

      
      #removing stop words
      text_blob = TextBlob(text).lower()
      for d in stopwords.words('english'):
          text_blob = text_blob.replace(d.lower() + ' ', ' ')
              
          #cleaning to remove extra spaces
          text_blob = text_blob.replace('  ', ' ')

              
      #correcting spelling
      text_blob=text_blob.correct()

      #lemmatization
      text_blob=Word(text_blob).lemmatize()
    else:
      text_blob = TextBlob(text).lower()
    
    #sentiment analysis
    score_vader = analyser.polarity_scores(text_blob)
def genQuestion(sentence, ner):

    #print("ner: ",ner)
    """
    outputs question from the given text
    """

    time_flag = 0
    word_ner_map = {}
    for i in range(len(ner)):
        word_ner_map[ner[i][0]] = ner[i][1]
        if ner[i][1] == "TIME" or ner[i][1] == "DATE":
            time_flag = 1

    #print(word_ner_map)

    if type(sentence) is str:
        line = TextBlob(sentence)
    else:
        line = TextBlob(str(sentence))

    bucket = {}  # Create an empty dictionary

    for i, j in enumerate(line.tags):
        #print(j)
        if j[1] not in bucket:
            bucket[j[1]] = i

    #print(bucket)
    question = ''

    l1 = ['NNP', 'VBG', 'VBZ', 'IN']
    l2 = ['NNP', 'VBG', 'VBZ']

    l3 = ['PRP', 'VBG', 'VBZ', 'IN']
    l4 = ['PRP', 'VBG', 'VBZ']
    l5 = ['PRP', 'VBG', 'VBD']
    l6 = ['NNP', 'VBG', 'VBD']
    l7 = ['NN', 'VBG', 'VBZ']

    l8 = ['NNP', 'VBZ', 'JJ']
    l9 = ['NNP', 'VBZ', 'NN']

    l10 = ['NNP', 'VBZ']
    l11 = ['PRP', 'VBZ']
    l12 = ['NNP', 'NN', 'IN']
    l13 = ['NN', 'VBZ']

    # Who question generation rules
    l14 = ['NNP', 'VBD', 'NN']
    l15 = ['NNP', 'VBZ', 'NN']
    l16 = ['NNP', 'VB', 'NN']

    questions = []

    # Compare the POS-tag bucket against the pattern lists defined above.
    # Question starting with WHO
    if all((key in bucket for key in l15)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)

    elif all((key in bucket for key in l16)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)

    elif all((key in bucket for key in l14)) and (word_ner_map.get(
            line.words[bucket['NNP']], "") == "PERSON"):
        question = line.replace(line.words[bucket['NNP']], "Who") + "?"
        questions.append(question)

    # Question starting with WHEN
    if all((key in bucket for key in l15)) and time_flag:
        start_index = bucket['VBZ']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VBZ']]

        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])

        question += (" " + "?")
        questions.append(question)

    elif all((key in bucket for key in l14)) and time_flag:
        start_index = bucket['VBD']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VBD']]

        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])

        question += (" " + "?")
        questions.append(question)

    elif all((key in bucket for key in l16)) and time_flag:
        start_index = bucket['VB']
        end_index = bucket['IN']
        question = "When " + line.words[bucket['VB']]

        for i in range(end_index):
            if i != start_index:
                question += (" " + line.words[i])

        question += (" " + "?")
        questions.append(question)

    # Question starting with WHAT
    if all(key in bucket
           for key in l1):  #'NNP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l2):  #'NNP', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + ' ' + line.words[bucket['VBG']] + '?'
        questions.append(question)

    elif all(key in bucket
             for key in l3):  #'PRP', 'VBG', 'VBZ', 'IN' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['PRP']] + ' ' + line.words[bucket['VBG']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l4):  #'PRP', 'VBG', 'VBZ' in sentence.
        question = 'What ' + line.words[
            bucket['PRP']] + ' ' + ' does ' + line.words[
                bucket['VBG']] + ' ' + line.words[bucket['VBG']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l7):  #'NN', 'VBG', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + ' ' + line.words[bucket['VBG']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l8):  #'NNP', 'VBZ', 'JJ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l9):  #'NNP', 'VBZ', 'NN' in sentence
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NNP']] + '?'
        questions.append(question)

    elif all(key in bucket for key in l11):  #'PRP', 'VBZ' in sentence.
        if line.words[bucket['PRP']] in ['she', 'he']:
            question = 'What' + ' does ' + line.words[bucket['PRP']].lower(
            ) + ' ' + line.words[bucket['VBZ']].singularize() + '?'
            questions.append(question)

    elif all(key in bucket for key in l10):  #'NNP', 'VBZ' in sentence.
        question = 'What' + ' does ' + line.words[bucket[
            'NNP']] + ' ' + line.words[bucket['VBZ']].singularize() + '?'
        questions.append(question)

    elif all(key in bucket for key in l13):  #'NN', 'VBZ' in sentence.
        question = 'What' + ' ' + line.words[bucket['VBZ']] + ' ' + line.words[
            bucket['NN']] + '?'
        questions.append(question)

    # The tagger splits "’s" into "’" and "s"; rejoin it as "'s" here.
    if 'VBZ' in bucket and line.words[bucket['VBZ']] == "’":
        question = question.replace(" ’ ", "'s ")
        questions.append(question)

    if "because" in sentence.lower() and line.words[bucket['VBZ']]:
        question = 'Why ' + line.words[bucket['VBZ']]

        end_index = sentence.split().index("because")
        for i in range(end_index):
            if i != bucket['VBZ']:
                question += (" " + line.words[i])

        question += (" " + "?")
        questions.append(question)

    return questions
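# Usage sketch (assumes TextBlob is imported): `ner` is a list of
# (token, label) pairs such as an NER tagger might produce; the sentence and
# tags below are made up.
sample_sentence = "Alice writes novels"
sample_ner = [("Alice", "PERSON"), ("writes", "O"), ("novels", "O")]
for q in genQuestion(sample_sentence, sample_ner):
    print(q)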