Example no. 1
from textblob import TextBlob
from fuzzywuzzy import fuzz  # assumed source of fuzz.ratio used below
import CFG  # local grammar module providing CFG.CFG with prune()/gen_random_convergent()

def get_best_syntax_tree(text, cfg):

    blob = TextBlob(text)
    ph_list = []
    # Find the tokens and generate the simplified rules
    # Prune the tree at increasing depths and generate candidate sentences
    #cfg.print_grammar()
    #print()
    for depth in range(4):
        cfg_copy = CFG.CFG(cfg.prod)
        cfg_copy.prune(blob.upper().words, depth)
        if "S" in cfg_copy.prod:
            #Genera un set di frasi
            for i in range(10):
                sent, tree = cfg_copy.gen_random_convergent('S')
                ph_list.append({"S": sent, "Tree": tree, "Score": 0})
        else:
            break

    for ph in ph_list:
        ph["Score"] = fuzz.ratio(ph["S"], blob.upper())

    best_ph = max(ph_list, key=lambda ph: ph["Score"])

    #print(best_ph)
    return best_ph
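
As a usage note: a minimal invocation sketch for the function above, assuming the local CFG module can be constructed from a production dict; grammar_rules and the sample sentence are hypothetical, not part of the original.

# Hypothetical usage sketch
grammar = CFG.CFG(grammar_rules)  # grammar_rules: your production dict
best = get_best_syntax_tree("find the red ball", grammar)
print(best["S"], best["Score"])
print(best["Tree"])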
Example no. 2
 def on_status(self, status):
     if from_creator(status):
         try:
             # Access extended_tweet: raises AttributeError for tweets
             # that only have the regular-length form
             getattr(status, 'extended_tweet')
             analysis = TextBlob(status.extended_tweet['full_text'])
             # Map polarity to a sentiment label
             if analysis.sentiment.polarity > 0:
                 sentiment = 'positive'
             elif analysis.sentiment.polarity == 0:
                 sentiment = 'neutral'
             else:
                 sentiment = 'negative'
             print(sentiment.upper() + " " + status.user.screen_name + " " + status.extended_tweet['full_text'])
             client.publish(
                 TargetArn="", # Input AWS ARN for SNS
                 Message=sentiment.upper() + " " + status.user.screen_name + " " + status.extended_tweet['full_text']
             )
             # Save the extended tweet to a file
             with open("Test.txt", 'a') as file:
                 file.write(sentiment.upper() + status.extended_tweet['full_text'])
         except AttributeError:
             # Handle the regular-length tweet
             analysis = TextBlob(status.text)
             # Map polarity to a sentiment label
             if analysis.sentiment.polarity > 0:
                 sentiment = 'positive'
             elif analysis.sentiment.polarity == 0:
                 sentiment = 'neutral'
             else:
                 sentiment = 'negative'
             print(sentiment.upper() + " " + status.user.screen_name + " " + status.text)
             client.publish(
                 TargetArn="", # Input AWS ARN for SNS
                 Message=sentiment.upper() + " " + status.user.screen_name + " " + status.text
             )
             # Save the regular-length tweet to a file
             with open("Test.txt", 'a') as file:
                 file.write(sentiment.upper() + status.text)
         return True
     return True
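
A minimal wiring sketch for the handler above, assuming the pre-4.0 tweepy StreamListener API, a boto3 SNS client, and the project's from_creator helper; the credential variables and creator_id are placeholders.

import boto3
import tweepy

client = boto3.client('sns')  # used by on_status via client.publish

class CreatorListener(tweepy.StreamListener):
    # the on_status method shown above goes here
    ...

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  # credentials assumed
auth.set_access_token(access_token, access_token_secret)
stream = tweepy.Stream(auth=auth, listener=CreatorListener())
stream.filter(follow=[creator_id])  # creator_id: account to track (placeholder)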
Example no. 3
from textblob import TextBlob

def chatbot_extractor(document):

    blob = TextBlob(document)
    words = blob.upper().words
    #print(words)
    feat = {}
    # question words
    feat["contain(WHERE)"] = "WHERE" in words
    feat["contain(WHAT)"] = "WHAT" in words
    feat["contain(HOW)"] = "HOW" in words
    feat["contain(WHICH)"] = "WHICH" in words
    # punctuation: .words strips punctuation, so check the raw text instead
    feat["contain(?)"] = "?" in document
    feat["contain(.)"] = "." in document
    #command
    feat["contain(DO)"] = "DO" in words
    feat["contain(FIND)"] = "FIND" in words
    feat["contain(GET)"] = "GET" in words
    feat["contain(TELL)"] = "TELL" in words
    feat["contain(SEARCH)"] = "SEARCH" in words
    #approval
    feat["contain(YES)"] = "YES" in words
    feat["contain(NO)"] = "NO" in words
    feat["contain(OK)"] = "OK" in words
    feat["contain(GOOD)"] = "GOOD" in words
    # greetings
    feat["contain(HELLO)"] = "HELLO" in words
    feat["contain(HI)"] = "HI" in words
    feat["contain(BYE)"] = "BYE" in words
    feat["contain(HEY)"] = "HEY" in words
    #statements
    feat["contain(THINK)"] = "THINK" in words
    feat["contain(LIKE)"] = "LIKE" in words
    feat["contain(FEEL)"] = "FEEL" in words

    return feat
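
These boolean features match the featureset format nltk's classifiers consume, so a hedged training sketch could look like this; labeled_utterances is hypothetical toy data.

from nltk import NaiveBayesClassifier

# Hypothetical toy data; chatbot_extractor upper-cases internally,
# so the utterances can be in any case.
labeled_utterances = [
    ("where is the station?", "question"),
    ("find my keys", "command"),
    ("hello there", "greeting"),
]
train_set = [(chatbot_extractor(text), label) for text, label in labeled_utterances]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify(chatbot_extractor("what time is it?")))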
Example no. 4
               "Simple is better than complex.")
print(zen.words,zen.sentences)
w=Word('Cats')
v=Word('went')
print(w.lemmatize())
print(v.lemmatize('v'))
for sentence in zen.sentences:
    print(sentence.sentiment)
b = TextBlob("I havv goood speling!")
print(b.correct())
w1 = Word('falibility')
print(w1.spellcheck())
monty = TextBlob("We are no longer the Knights who say Ni. "
                 "We are now the Knights who say Ekki ekki ekki PTANG.")
print(monty.word_counts['ekki'])
print(monty.words.count('ekki'))
print(monty.words.count('ekki', case_sensitive=True))
# Note: translate() and detect_language() were removed from newer TextBlob
# releases; these calls require an older version.
en_blob = TextBlob(u'Simple is better than complex.')
print(en_blob.translate(to='es'))
chinese_blob = TextBlob(u"美丽优于丑陋")
print(chinese_blob.translate(from_lang="zh-CN", to='en'))
b = TextBlob(u"بسيط هو أفضل من مجمع")
print(b.detect_language())
print(zen[0:19])
print(zen.upper())
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print(apple_blob + ' and ' + banana_blob)
print("{0} and {1}".format(apple_blob, banana_blob))

def cooc_graph(search_term, tweets_dataframe, save_path, NUM_OF_COOCS=5):
    def dict_value_sort(dictionary):
        return (dict(
            sorted(dictionary.items(), key=lambda item: item[1],
                   reverse=True)))

    def dice_significance(cooc_array, word, word_list):
        """Dice statistical significance for "word" against all
        other words in "word_list", using the corresponding 2-d
        co-occurrence array.
        """
        word_index = word_list.index(word.upper())
        ki = np.sum(cooc_array[:, word_index])
        kj = np.sum(cooc_array, axis=0)
        kij = cooc_array[word_index, :]
        dice_stat = 2 * kij / (ki + kj)
        stat_dict = {word: d for word, d in zip(word_list, dice_stat)}
        stat_dict.pop(word.upper())
        return stat_dict

    ALL_SEARCH_TERMS = TextBlob(search_term).words

    tweets = tweets_dataframe["Tweet"].dropna().values

    # Sort out fonts
    font_files = font_manager.findSystemFonts(
        fontpaths=
        "/Users/jamesashford/Documents/Projects/Hackathons/Oxford Hack 2020/OxHack-2020/TCARS/rsc"
    )
    # createFontList was removed from newer matplotlib; register fonts directly
    for font_file in font_files:
        font_manager.fontManager.addfont(font_file)

    # Extract and clean words
    all_words = TextBlob(" ".join(tweets).upper()).words.lemmatize()
    # Get stop-words
    stop_words = list(set(stopwords.words('english'))) + ['thi']
    # Remove Stop and Short Words
    words = [
        w for w in all_words if len(w) > 3 and w.lower() not in stop_words
    ]

    # Remove words that only occur once
    counts = dict(Counter(words))
    key_words = [word for word in counts if counts[word] > 1]

    # Create dtm
    dtm = np.array(
        [[1 if (tweet.upper().count(word) > 0) else 0 for word in key_words]
         for tweet in tweets])
    # Co-occurrences
    cooc = np.dot(dtm.T, dtm)

    graph_data = []
    layer1_names = []
    layer2_names = []
    for term in ALL_SEARCH_TERMS:
        # Statistical significance of search_term
        term_dice_stats = dice_significance(cooc, term, key_words)
        term_dice_stats = dict_value_sort(term_dice_stats)
        # Get NUM_OF_COOCS most similar words
        most_similar = list(term_dice_stats.keys())[0:NUM_OF_COOCS]
        layer1_names += most_similar
        # Create a structure to hold the node links
        graph_data += [{
            "from": term.upper(),
            "to": set_name,
            "stat": term_dice_stats[set_name]
        } for set_name in most_similar]
        # Iterate over each of the chosen coocs, and find their closest
        for word in most_similar:
            # Find stats for this word
            word_dice_stats = dice_significance(cooc, word, key_words)
            word_dice_stats = dict_value_sort(word_dice_stats)
            # Choose top nearby matches
            top_neighbours = list(word_dice_stats.keys())[0:10]
            layer2_names += top_neighbours
            new_graph_data = [{
                "from": word.upper(),
                "to": set_name,
                "stat": word_dice_stats[set_name]
            } for set_name in top_neighbours]
            # Add to existing graph data
            graph_data += new_graph_data

    # Convert graph data to pandas dataframe
    gd = pd.DataFrame.from_dict(graph_data)
    # Create co-occurrence graph
    # G = nx.from_numpy_matrix(cooc)
    G = nx.from_pandas_edgelist(gd, "from", "to", "stat")

    # Generate colours
    colours, sizes = [], []
    l0, l1, l2 = {}, {}, {}
    for node in G:
        if node in ALL_SEARCH_TERMS.upper():
            col = 'darkblue'  #'red'
            size = min(counts[node] * 100, 5000)  #5000
            l0[node] = node
        elif node in layer1_names:
            col = 'lightblue'  #'orange'
            size = min(counts[node] * 100, 3000)  #2500
            l1[node] = node
        else:
            col = 'cyan'  #'blue'
            size = counts[node] * 10  #1000
            l2[node] = node
        colours.append(col)
        sizes.append(size)

    # Visualisation
    fig = plt.figure(figsize=(5, 3), dpi=200)
    pos = nx.spring_layout(G)
    if len(ALL_SEARCH_TERMS) == 1:
        pos = nx.nx_agraph.graphviz_layout(G, prog='twopi')
    # Draw edges
    edges = nx.draw_networkx_edges(
        G, pos, alpha=1, width=1,
        edge_color='white')  #width=gd["stat"].values*10)
    # Draw nodes, once white for background then again with colour+alpha
    nodes = nx.draw_networkx_nodes(G,
                                   pos,
                                   alpha=1,
                                   node_color='white',
                                   node_size=sizes)
    nodes = nx.draw_networkx_nodes(G,
                                   pos,
                                   alpha=0.8,
                                   node_color=colours,
                                   node_size=sizes)
    nodes.set_edgecolor('black')
    # Draw labels for each layer, with proportional sizes
    labels0 = nx.draw_networkx_labels(G,
                                      pos,
                                      labels=l0,
                                      font_family="Swiss911 UCm BT",
                                      font_size=30)
    labels1 = nx.draw_networkx_labels(G,
                                      pos,
                                      labels=l1,
                                      font_family="Swiss911 UCm BT",
                                      font_size=15)
    labels2 = nx.draw_networkx_labels(G,
                                      pos,
                                      labels=l2,
                                      font_family="Swiss911 UCm BT",
                                      font_size=11,
                                      font_color="white")
    labels2 = nx.draw_networkx_labels(G,
                                      pos,
                                      labels=l2,
                                      font_family="Swiss911 UCm BT",
                                      font_size=10,
                                      font_color="black")

    # Save
    file_name = f"{save_path}/{search_term}_coocgraph.png"
    plt.savefig(file_name, transparent=True)
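
A hedged invocation sketch for cooc_graph, assuming a dataframe with a "Tweet" column of raw tweet text; tweets.csv is a hypothetical file.

tweets_df = pd.read_csv("tweets.csv")  # must contain a "Tweet" column
cooc_graph("climate", tweets_df, save_path=".", NUM_OF_COOCS=5)
# writes ./climate_coocgraph.png with the search term as the central node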
Example no. 6
print "---------------------------------------------------------------"

sentence = TextBlob('Use 4 spaces per indentation level.')
print sentence.words
print sentence.words[2].singularize()
print sentence.words[-1].pluralize()

animals = TextBlob("cat dog octopus")
print animals.words
print animals.words.pluralize()

# TextBlobs Are Like Python Strings
print animals[0:10]

# You can use common string methods.
print animals.upper()
print animals.find('dog')

print "---------------------------------------------------------------"
from textblob import Word

w = Word("octopi")
print w.lemmatize()

w = Word("went")
print w.lemmatize("v")  # Pass in WordNet part of speech (verb)

print "---------------------------------------------------------------"

b = TextBlob("I havv goood speling!")
print(b.correct())
"""
Created on Wed Jun 13 16:07:51 2018

@author: akansal2
"""
# importing libraries
from textblob import TextBlob


#TextBlob Strings
Str1 = TextBlob('Amazing')
Str2 = TextBlob('Spider Man')

#Textblob string operations
Str1.lower()
Str1.upper()
Str1[1:4]
Str1 + " " + Str2
Str1.detect_language()  # removed from newer TextBlob releases



#Paragraph and sentence operations
para = TextBlob("My name is aditya. \n I live in Modinagar.\n My apple id is [email protected]")
para.sentences  # sentences are split on a combination of '.' and '\n'
para.sentences[0]
para.sentences[1]
para.sentences[2]
para.sentences[0].words
for n in para.sentences[1].noun_phrases:
    print(n)
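
As a small follow-on, each Sentence from para.sentences is itself blob-like, so per-sentence words and sentiment (both used elsewhere in this listing) can be read directly:

for s in para.sentences:
    print(s, s.words, s.sentiment)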
Example no. 8
        layer2_names += top_neighbours
        new_graph_data = [{"from":word.upper(), "to":set_name, "stat":word_dice_stats[set_name]} for set_name in top_neighbours]
        # Add to existing graph data
        graph_data += new_graph_data

# Convert graph data to pandas dataframe
gd = pd.DataFrame.from_dict(graph_data)
# Create co-occurrence graph
# G = nx.from_numpy_matrix(cooc)
G = nx.from_pandas_edgelist(gd, "from", "to", "stat")

# Generate colours
colours, sizes = [], []
l0, l1, l2 = {}, {}, {}
for node in G:
    if node in ALL_SEARCH_TERMS.upper():
        col = 'darkblue' #'red'
        size = counts[node]*1000 #5000
        l0[node] = node
    elif node in layer1_names:
        col = 'lightblue' #'orange'
        size = counts[node]*1000 #2500
        l1[node] = node
    else:
        col = 'cyan' #'blue'
        size = counts[node]*100 #1000
        l2[node] = node
    colours.append(col)
    sizes.append(size)

# Visualisation
Example no. 9
from textblob import TextBlob, Word

sent = TextBlob("I haawve goood speling")
correct_sent = sent.correct()

w = Word("haave")
spellcheck = w.spellcheck()

#Get Word and Noun Phrase Frequencies
words = TextBlob('We are no longer together. We are enemies now.')
word_counts = words.word_counts
#You can specify whether or not the search should be case-sensitive (default is False).

#Translation and Language Detection (removed from newer TextBlob releases)
en_blob = TextBlob("You are my best friend")
pl_blob = en_blob.translate(to='pl')

blob = TextBlob("Mam na imię Piotr")
detected_lang = blob.detect_language()

#Parsing
text = TextBlob('I know You')
text_parse = text.parse()

#string
text = TextBlob("Hello World")
upper_text = text.upper()
find_world = text.find("World")

#ngrams
blob = TextBlob("Now is better than never.")
ngram = blob.ngrams(n=3)
# WordLists (A WordList is just a Python list with additional methods.)
animals = TextBlob("cat dog octopus")
print(animals.words)
print(animals.words.pluralize())
# Spelling Correction (Use the correct() method to attempt spelling correction.)
b = TextBlob("I havv goood speling!")
print(b.correct())
w = Word('falibility')
print(w.spellcheck())

# Get Word and Noun Phrase Frequencies
monty = TextBlob("We are no longer the Knights who say Ni. "
                 "We are now the Knights who say Ekki ekki ekki PTANG.")
print(monty.word_counts['ekki'])
# The second way is to use the count() method.
print(monty.words.count('ekki'))
print(monty.words.count('Ekki', case_sensitive=True))

# TextBlobs are like Python strings
zen = TextBlob("Beautiful is better than ugly. "
               "Explicit is better than implicit. "
               "Simple is better than complex.")
print(zen.upper())

# You can make comparisons between TextBlobs and strings.
apple_blob = TextBlob('apples')
banana_blob = TextBlob('bananas')
print(apple_blob < banana_blob)
# You can concatenate and interpolate TextBlobs and strings.
print(apple_blob + ' and ' + banana_blob)
print("{0} and {1}".format(apple_blob, banana_blob))
# n-grams (The TextBlob.ngrams() method returns a list of tuples of n successive words.)
blob = TextBlob("Now is better than never.")
print(blob.ngrams(n=3))
from textblob import TextBlob

text = "I love to watch football, but I have never played it"

text_blob_object = TextBlob(text)
text2 = text_blob_object.upper()
text3 = text_blob_object.lower()

print(text)
print(text2)
print(text3)
Example no. 12
###############################################################################
"""
Basic
"""

from textblob import TextBlob
## creating a textblob object
blob = TextBlob("Hyderabad is a great place to learn software courses.")

## textblobs are like python strings
blob[0:5]

blob.upper()
blob.lower()

blob2 = TextBlob("It contains a lot of institutes to learn and get the job.")

## concat
blob + " And " + blob2

###############################################################################
"""
Tokenization


blob = TextBlob("Hyderabad is a great place to learn software courses. \n It contains a lot of institutes to learn and get the job.")
blob.sentences
blob.sentences[0]

"""
###############################################################################
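
To round off the tokenization example, a short sketch iterating the sentences of the blob defined above; it uses only the standard TextBlob API:

for sentence in blob.sentences:
    print(sentence)        # each sentence is itself a blob-like object
    print(sentence.words)  # WordList of tokens, punctuation stripped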