Example #1
def preprocessing(tweet, sentiment):

    global count
    tweet = re.sub('(www\.[^\s]+)', '', str(tweet))  # remove url
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', str(tweet))  # remove hyperlink
    tweet = re.sub(r'&\w*', '', str(tweet))  #remove &amp
    tweet = re.sub('@[^\s]+', '', tweet)  #remove @
    tweet = re.sub(r'#\w*', '', str(tweet))  #remove hashtags
    tweet = re.sub(r'\$\w*', '', str(tweet))  # Remove tickers
    tweet = tweet.strip(
        ' ')  #remove white spaces from the front and end of a string
    tweet = tweet.lower()  # remove upper case
    negations_dic = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not"
    }
    t = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    tweet = t.sub(lambda x: negations_dic[x.group()], str(tweet))
    tweet = re.sub('[^a-zA-Z]', ' ', str(tweet))  # take alphabet only
    tweet = TextBlob(tweet).correct()
    tweet = re.sub('[\s]+', ' ', str(tweet))  #Remove additional white spaces
    tweet = tweet.strip(
        ' ')  #remove white spaces from the front and end of a string
    tweet = tweet.split()
    ps = PorterStemmer()  #stemmer for removing suffixes, like “ing”, “ly”, “s”, etc
    tweet = [ps.stem(word) for word in tweet]  #apply the stemmer to each token
    tweet = ' '.join(tweet)

    length = len(tweet.split())

    if length != 0:

        fp2.writelines(sentiment + '\n')
        fp1.writelines(tweet + '\n')
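
A minimal driver for the snippet above. The imports it relies on and the `count`, `fp1` and `fp2` globals it references without defining are supplied here as assumptions; the file names and the sample tweet are hypothetical.

import re
from textblob import TextBlob
from nltk.stem import PorterStemmer

# Assumed globals referenced by preprocessing(); file names are placeholders.
count = 0
fp1 = open('clean_tweets.txt', 'w')   # cleaned tweet text
fp2 = open('sentiments.txt', 'w')     # matching sentiment labels

preprocessing("I can't believe it isn't working!!! #fail @someone", "negative")

fp1.close()
fp2.close()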
Example #2
def bed_availability(beds):
    data = html2text.html2text(requests.get(beds).text)
    time.sleep(2)
    blob = TextBlob(data)
    # print(blob)
    x = blob.split("####")
    # print(x)
    j = [i for i in x if i.startswith(" **")]
    extract_out = []
    for hospital in j:
        try:
            contact = re.findall("[0-9]{10}", hospital)[0]
        except IndexError:
            contact = None
        hospital_name = hospital.split("\n")[0].replace('*', '')
        vacant_index = hospital.split("\n").index('Vacant')
        icu_vacant_index = hospital.split("\n").index('ICU Vacant')
        non_icu_vacant_index = hospital.split("\n").index('Non ICU Vacant')
        # print(vacant_index,icu_vacant_index,non_icu_vacant_index)

        vacant = hospital.split("\n")[vacant_index - 2].replace(
            '*', '').replace(' ', '').replace('_', '')
        icu_vacant = hospital.split("\n")[icu_vacant_index - 2].replace(
            '*', '').replace(' ', '').replace('_', '')
        non_icu_vacant = hospital.split("\n")[non_icu_vacant_index -
                                              2].replace('*', '').replace(
                                                  ' ', '').replace('_', '')
        # print(vacant,icu_vacant,non_icu_vacant)

        extract_out.append(
            (hospital_name, contact, int(vacant), int(icu_vacant),
             int(non_icu_vacant)))
    return extract_out
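
A hedged call sketch for the scraper above; the URL is hypothetical, the page layout (the '####' separators and 'Vacant' labels) is assumed to match what the function expects, and the requests/html2text/TextBlob imports used inside it are assumed to be in place.

if __name__ == "__main__":
    # Hypothetical bed-availability dashboard URL.
    url = "https://example.org/hospital-beds"
    for name, contact, vacant, icu_vacant, non_icu_vacant in bed_availability(url):
        print(f"{name.strip()}: {vacant} vacant ({icu_vacant} ICU, {non_icu_vacant} non-ICU), contact: {contact}")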
Example #3
def update(table, field):
    conn = boto.dynamodb.connect_to_region('us-west-2',
                                           aws_access_key_id='',
                                           aws_secret_access_key='')
    table = conn.get_table(table)
    for line in table.scan():
        newline = line[field]
        text = TextBlob(newline)
        text = text.lower()
        textwords = text.split()
        wordcount = 0
        wordlist = []
        for word in textwords:
            wordcount += 1
            if word not in wordlist:
                wordlist.append(word)
        # avoid division by zero when the field has no words
        if wordcount == 0:
            lexdiv = 0
        else:
            lexdiv = round((len(wordlist) * 1.0) / wordcount, 2)

        polarity = text.sentiment.polarity
        subjectivity = text.sentiment.subjectivity
        line.put_attribute('subjectivity', subjectivity)
        line.put_attribute('polarity', polarity)
        line.put_attribute('lexical diversity', lexdiv)
        line.save()
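
A minimal call sketch for the function above; the table and field names are hypothetical, and real AWS credentials would have to replace the empty key strings passed to `connect_to_region`.

# Hypothetical DynamoDB table whose items carry a free-text field to score.
update('tweets', 'tweet_text')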
Example #4
    def get_text(self):
        """ NOTE: THIS SHOULD NOT REBUILD DICT EVERY TIME -- REFACTOR """
        blob = TextBlob(self.content.decode('utf-8'))
        words_ = blob.split()
        d = parser.build_ngram_dict(words_)
        s = parser.build_sentence(d)
        # TODO: add check for max text length
        self.text = s
Example #5
def clean(doc, stop_words, exclude):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop_words])
    #print('stop free is')
    #print(stop_free)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    blob = TextBlob(punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in blob.split())
    return normalized
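
A sketch of the inputs the function above expects. `lemma` is a global it uses without defining, so a WordNetLemmatizer is assumed here; the stop word and punctuation sets are typical choices rather than the original author's.

import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

lemma = WordNetLemmatizer()                  # assumed global used inside clean()
stop_words = set(stopwords.words('english'))
exclude = set(string.punctuation)

print(clean("The striped bats were hanging on their feet!", stop_words, exclude))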
Example #6
def get_best_lines(sentence):
    """
    Takes a list of words and searches a csv file for lines that are similar
    Returns the line/s from csv that have most words in common
    Also returns the number of words in common found
    """
    bestlines = []
    blcount = 0

    parsed = TextBlob(sentence)
    userinput = parsed.split(" ")
    userinput = list(set(userinput))  #remove dupe words
    filelines = []

    priceflag, priceop, pricenum = get_filter_variables("price", sentence)
    cpuflag, cpuop, cpunum = get_filter_variables("cpu", sentence)

    dir_path = os.path.dirname(os.path.realpath(__file__))
    #print(dir_path)
    # open before the try block so the finally-close cannot reference an undefined fp
    fp = open(dir_path + '/newtrim.csv', 'r')
    try:
        line = fp.readline()
        count = 0
        while line:
            if (priceflag):
                if (not compare_string_op(float(get_price_from_sentence(line)),
                                          float(pricenum), priceop)):
                    line = fp.readline()
                    continue
            if (cpuflag):
                if (not compare_string_op(float(get_cpu_from_sentence(line)),
                                          float(cpunum), cpuop)):
                    line = fp.readline()
                    continue

            filteredline = " ".join(list(set(
                line.split(","))))  #remove dupe words in line
            for word in userinput:
                if (re.search(word.lower(), filteredline.lower())):
                    count += 1
            if (count > blcount):
                blcount = count
                bestlines = [line]
            elif (count == blcount):
                bestlines.append(line)
            line = fp.readline()
            count = 0
    finally:
        fp.close()

    ec2instances = []

    for line in bestlines:

        ec2instances.append(get_instance_from_sentence(line))
        #print(ec2instances)
    return list(set(ec2instances))
Example #7
def dk():
    word3 = TextBlob(varname1.get())
    lan = word3.detect_language()
    lan_todict = languages.get()
    lan_to = lan_dict[lan_todict]
    word3 = word3.translate(from_lang=lan, to=lan_to)
    sp = word3.split()
    label3.configure(text=word3)
    varname2.set(word3)
Example #8
    def __init__(self,article):
        article = TextBlob(article)
        #words = [word.singularize() for word in article.words]
        #sentences = article.sentences

        words = article.split()
        sentences = article.split('.')

        self['polarity'] = article.sentiment.polarity
        self['subjectivity'] = article.sentiment.subjectivity

        word_lens = [len(word) for word in words]
        sentence_lens = [len(sentence.split()) for sentence in sentences]
        punct = [char for char in article if char in punctuation]

        freq_items = {
                        'freq_question_marks':[punct,'?',sentences],\
                        'freq_exclamation_marks':[punct,'!',sentences],\
                        'freq_quotation_marks':[punct,'"',sentences]
                     }

        freq_items_per_thousand = {
                                'freq_commas':[punct,',',words], \
                                'freq_semi_colons':[punct,';',words],\
                                'freq_ands': [words, 'and', words],\
                                'freq_buts': [words, 'but', words],\
                                'freq_howevers': [words, 'however', words],\
                                'freq_ifs': [words, 'if', words],\
                                'freq_thats': [words, 'that', words],\
                                'freq_mores': [words, 'more', words],\
                                'freq_verys': [words, 'very', words]
                                }

        for item,params in freq_items.items():
            self[item] = self.find_freq(params[0],params[1],params[2])

        for item,params in freq_items_per_thousand.items():
            self[item] = self.find_freq_per_thousand(params[0],params[1],params[2])

        self['article_len'] = len(words)
        self['type_token_ratio'] = len(set(words)) / self['article_len']
        self['mean_word_len'] = np.mean(word_lens)
        self['mean_sentence_len'] = np.mean(sentence_lens)
        self['std_sentence_len'] = np.std(sentence_lens)
Example #9
def tweet_processor(path, part, freq=1):  
    myFile = pd.read_csv(path, sep=',')
    tweets = myFile["text"]
    if "May" in path: 
        part = 1
    # if "May" not in path:
    tweets = tweets[int(len(tweets)*(part-1)*0.5):int(len(tweets)*part*0.5)] 
    blob = " ".join(myFile["text"])  # full corpus as one string (used by the freq == 0 branch)
    processed_tweets = []
    compound_sent = [] 
    print("n tweets: ",len(tweets))
    sid = SentimentIntensityAnalyzer()
    for tweet in tweets:
        cleaned_tweet = p.clean(tweet.lower())
        filtered_tweet= clean_tweets(cleaned_tweet) 
        ss = sid.polarity_scores(filtered_tweet) 
        cur_sent = [ss['neg'],ss['pos'], ss['neu'], ss['compound']]  
        tweet_blob = TextBlob(filtered_tweet)  # avoid shadowing the corpus-level blob
        Sentiment = tweet_blob.sentiment
        polarity = Sentiment.polarity
        subjectivity = Sentiment.subjectivity
        if filtered_tweet != "" and len(filtered_tweet) >2: 
            processed_tweets.append(filtered_tweet)  
            compound_sent.append(cur_sent)
    # np.savetxt("processed_tweets.csv", processed_tweets, delimiter=",", fmt='%s') 
    compound_sent = np.asarray(compound_sent)
    freqs = []
    
    print("number of words: ",len((" ".join(processed_tweets).split(" ")))) 
    print("unique words: ",len(set(" ".join(processed_tweets).split(" ")))) 

    if freq ==0: #Use blob counting
        words = set(blob.split(" "))
        for word in set(blob.split(" ")):
            if word != "" and len(word)>2: 
                freqs.append([word,blob.count(word)])    
        freqs = np.asarray(freqs)
        freqs = freqs[np.argsort(freqs[:, 1])][::-1]

    if freq ==1: #Use NLTK freqdist
        freqs = pfreq_dist(" ".join(processed_tweets).split(" "))
        freqs = np.asarray(freqs)  
    return processed_tweets, freqs, compound_sent
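
A hedged driver for the function above; the CSV path is hypothetical, and the function additionally depends on helpers not shown in the snippet (`p.clean` from the tweet-preprocessor package and the user-defined `clean_tweets` and `pfreq_dist`).

# Hypothetical CSV with a "text" column of raw tweets.
processed, freqs, sentiments = tweet_processor("tweets_May.csv", part=1, freq=1)
print(processed[:3])      # first few cleaned tweets
print(freqs[:10])         # frequency output from pfreq_dist
print(sentiments.shape)   # (n_tweets, 4): neg, pos, neu, compound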
Example #10
def makeGraph2(url):
    article = Article(url)
    article.download()

    # Parse
    article.parse()

    # nlp
    article.nlp()
    blob = TextBlob(article.text)

    # Seaborn

    # configure size of heatmap
    #sns.set(rc={'figure.figsize':(35,3)})

    # function to visualize
    def visualise_sentiments(data):
        svm = sns.heatmap(pd.DataFrame(data).set_index("Sentence").T,
                          center=0,
                          annot=True,
                          cmap="PiYG")
        image_object = BytesIO()
        figure = svm.get_figure()
        figure.savefig(image_object, format="PNG", facecolor="#36393E")

        #sns.subplots_adjust(left=0.0, bottom=0.1, right=0.45)

        image_object.seek(0)
        return image_object

    # visualization
    return visualise_sentiments({
        "Sentence": ["SENTENCE"] + blob.split(),
        "Sentiment": [blob.sentiment.polarity] +
        [blob.sentiment.polarity for word in blob.split()],
        "Subjectivity":
        [blob.sentiment] + [blob.sentiment for word in blob.split()],
    })
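
A brief usage sketch; the article URL is hypothetical, and the snippet assumes the imports it relies on (newspaper's Article, seaborn, pandas, TextBlob and BytesIO) are in place.

# Render the per-word sentiment heatmap for a hypothetical article and save it.
image = makeGraph2("https://example.com/2021/05/some-news-article")
with open("sentiment_heatmap.png", "wb") as out:
    out.write(image.read())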
Example #11
def click(event=None):
    try:
        wrd3 = TextBlob(varname.get())
        ln = wrd3.detect_language()
        lang_todict = languages.get()
        ln_to = lang_dict[lang_todict]

        wrd3 = wrd3.translate(from_lang=ln, to=ln_to)
        label3.configure(text=wrd3)
        varname1.set(wrd3)
        sp = wrd3.split()
    except:
        varname1.set("try another keyword")
Example #12
def clean_process(text):
    #make lowercase
    clean_text = text.lower()

    #remove punctuation and numbers#
    clean_text = [
        char for char in clean_text if char not in string.punctuation
    ]
    clean_text = [char for char in clean_text if char not in string.digits]
    clean_text = ''.join(clean_text)

    #remove extra whitespace at the start/end of the review#
    clean_text = clean_text.strip()

    #Spelling Correction#
    clean_text = TextBlob(clean_text).correct()

    #remove stopwords#
    clean_text = [
        word for word in clean_text.split(' ')
        if word not in stopwords.words('english')
    ]
    clean_text = [word for word in clean_text if word not in new_stopwords]

    #make it whole again#
    clean_text = ' '.join(clean_text)

    #tokenize the string first into word tokens#
    clean_text = clean_text.split()

    #lemmatize each token
    new_string = []
    for word in clean_text:
        x_word = lemmatizer.lemmatize(word)
        new_string.append(x_word)

    return new_string
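
A sketch of the setup the function above assumes: `lemmatizer` and `new_stopwords` are globals it references without defining, so plausible stand-ins are supplied here.

import string
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()          # assumed global used by clean_process()
new_stopwords = ['im', 'ive', 'dont']     # hypothetical extra stopwords

print(clean_process("The rooms were really clean, but the staff weren't helpful!!!"))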
Example #13
def translate_func():
    dict1 = googletrans.LANGUAGES
    dict2 = {}
    #for i in dict1.items():
    #    dict2[i[1]] = i[0]
    #print(dict2)
    try:
        word3 = TextBlob(l1_txt.get())
        lan = word3.detect_language()
        lan_todict = languages.get()
        lan_to = ln_dict[lan_todict]
        word3 = word3.translate(from_lang=lan, to=lan_to)
        sp = word3.split()
        var2.set(word3)
    except:
        var2.set("Try any other word or sentence.")
Example #14
def main():
    filename = sys.argv[1]

    with open(filename, encoding='utf-8') as f:
        content = f.read()

    blob = TextBlob(content)

    words = blob.split()
    d = build_ngram_dict(words)
    pprint(d)
    print()
    s = build_sentence(d)
    print(s)
    if s in content:
        print("\nBummer! This sentence is just a copy of one in the corpus.")
Example #15
def respond(sentence):
    """Parse the user's inbound sentence and find candidate terms that make up a best-fit response"""
    cleaned = preprocess_text(sentence)
    parsed = TextBlob(cleaned)

    pronoun, noun, adjective, verb = find_candidate_parts_of_speech(parsed)

    # If we said something about the bot and used some kind of direct noun, construct the
    # sentence around that, discarding the other candidates
    resp = check_for_comment_about_bot(pronoun, noun, adjective)
    #check whether asking for name
    if not resp:
        for word in parsed.words:
            if word == "Name" or word == "name":
                resp = random.choice(RESPONSES_TO_NAME)
            elif word == "am":
                resp = random.choice(GREET_WITH_NAME).format(
                    **{'word': parsed.split()[-1]})

    # If we just greeted the bot, we'll use a return greeting
        if not resp:
            resp = check_for_greeting(parsed)
    #any issues regarding app or service
    if not resp:
        for word in parsed.words:
            if word == "app" or word == "APP":
                resp = random.choice(APP_SERVICES_INFO)

            if pronoun and word == "problem" or word == "issues" or word == "issue" or word == "help":
                resp = random.choice(APP_PROBLEM)

    if not resp:
        # If we didn't override the final sentence, try to construct a new one:
        if not pronoun:
            resp = random.choice(NONE_RESPONSES)
        elif pronoun == 'I' and not verb:
            resp = random.choice(COMMENTS_ABOUT_SELF)
        else:
            resp = construct_response(pronoun, noun, verb)

    # If we got through all that with nothing, use a random response
    if not resp:
        resp = random.choice(NONE_RESPONSES)

    print(resp)

    return resp
Example #16
def get_sentiment(clean_words):
    #joined list to string to use text blob library
    word_blob = ' '.join(clean_words)
    blob = TextBlob(word_blob)  #create blob object
    for word in blob.split():
        print(word)
        analysis = TextBlob(word)
        # determines polarity and subjectivity scores of each word
        print(analysis.sentiment)

        #categorizing words based upon sentiment value between -1 and 1
        if analysis.sentiment[0] > 0:
            print('Positive')
        elif analysis.sentiment[0] < 0:
            print('Negative')
        else:
            print('Neutral')
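
A small usage sketch for the function above, assuming the TextBlob import it relies on is in place; the token list is hypothetical and stands in for the output of an earlier cleaning step.

# Hypothetical pre-cleaned tokens; each word is scored and categorized in turn.
get_sentiment(['great', 'service', 'terrible', 'wait', 'friendly', 'staff'])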
Example #17
def callback(ch, method, properties, body):

    # We pass each tweet/reddit comment to TextBlob for decoding
    message = TextBlob(body.decode("utf-8"))

    #Figure out whether it's a tweet or a reddit comment
    message_type = message.split()[:1]
    #print("***********" + message_type[0] + "***********")

    # Get the timestamp and the polarity
    result = {}
    result["date"] = datetime.datetime.utcnow()
    result["polarity"] = message.sentiment.polarity

    if message_type[0] == 'REDDIT':
        # Insert into the reddit collection
        reddit_col.insert_one(result)
    else:
        # Insert into the tweet collection
        tweet_col.insert_one(result)
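
A hedged sketch of the wiring the consumer above assumes: the `reddit_col` and `tweet_col` globals plus a RabbitMQ subscription feeding it message bodies. The connection parameters, queue name and collection names are all hypothetical.

import datetime
import pika
from pymongo import MongoClient
from textblob import TextBlob

# Assumed Mongo collections referenced by callback().
client = MongoClient("mongodb://localhost:27017/")
db = client["sentiment_db"]
tweet_col = db["tweets"]
reddit_col = db["reddit"]

# Hypothetical RabbitMQ queue that delivers tweet / reddit message bodies.
connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="messages")
channel.basic_consume(queue="messages", on_message_callback=callback, auto_ack=True)
channel.start_consuming()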
Example #18
class TransformText:
    def __init__(self, text):
        self.t_textext = str(text)

        self.t_textmp_phrases = defaultdict(int)

        self.stop_list = [
            "(", ")", "]", ".", "\\", "/", "[", '...', '–', ':', ';', '____',
            '___', '+', '/w', '>'
        ]

        self.sub_chunks = []

        #the first grammar in the list is the noun-adj/verb-noun phrase grammar
        #this current grammar parses sentences in some very interesting ways.
        #definitely worth keeping this list of expressions

        #NPS and VPH patterns
        self.grammar_II = """
                    NNP: {<J.*|N.*>+}
                    NMM: {<SYM>?<CD>?<N.+>+}
                    VAN: {<V.*>?<J.*|N.*>*<HYPH>*<J.*|N.*>+}
                    NUM: {<CD>+}
                """

        self.grammar_III = """
                    NUM: {<CD>+}
                    VPH: {<V.*|N.*>*<IN>*<V.*|N.*>+}
                """
        # sequence to strip stop words and characters from the text
        # split text into string list

        self.t_text = TextBlob(self.t_textext)

        self.nlp = spacy.load('en',
                              parser=False)  #spaCY corpus https://spacy.io

    """*******************************************************************************
        The following set of functions use the TextBlob package to perform some simple
        text processing procedures. Sentence chunking, basic word tokenization as well
        as n-grams, noun phrase chunking and a simple POS tagger.
        
        Note: I would only use these for basic processing -  sentence chunking/Bi-Grams.
    ******************************************************************************"""

    def get_sentences(self):
        return self.t_text.sentences

    def get_words(self):
        return self.t_text.words

    def get_bigrams(self):
        return self.t_text.ngrams(n=2)

    def get_trigrams(self):
        return self.t_text.ngrams(n=3)

    def get_np_chunks(self, text=None):
        if text == None:
            return self.t_text.noun_phrases
        else:
            self.t_text = TextBlob(text)
            return self.t_text.noun_phrases

    def simple_tagger(self, text=None):
        if text == None:
            return self.t_text.tags
        else:
            self.t_text = TextBlob(text)
            return self.t_text.tags

    """***********************************************************************************
        The following set of methods break out of Textblob and use NLTK (directly), GENSIM or 
        home-rolled solutions.
        
        E.g. the tokenizer and dictionary methods provide a more sophisticated model for 
        accessing and updating terms in the corpus. 
        
        #full tokener currently only breaks down words, another version will include a 
        unicode character input model. The TextBlob tool to split and return a parsed word 
        list is not always accurate, or in the format we want for high performance 
        prediction.
        
        Output can be altered by modifying the stop_list parameters.
        
    ************************************************************************************"""

    def full_tokener(self):
        #takes a textblob input, splits words in a list
        self.clean_list = []  # initialise here so the method works on its own
        tmp = self.t_text.split()
        for j in tmp:
            for word in j.split():
                list(word)  #split list strings into list of characters
                tmp2 = []
                # iterate through sublist items and remove if present in the stop_list
                for chars in list(word):
                    if chars not in self.stop_list:  #IMPORTANT adding or removing values will change the output
                        tmp2.append(chars)
                        joined_chars = ''.join(tmp2)
                    else:
                        del chars

                self.clean_list.append(joined_chars)

        return self.clean_list

    #method requires
    def sent_tokener(self):
        self.sent_tokens = []
        for i in self.t_text.sentences:
            self.sent_tokens.append(str(i).split())

        return self.sent_tokens

    def clean_sent_tokens(self, sent_list):
        #takes a textblob input, splits words in a list
        self.clean_sent = []

        for j in sent_list:
            list(j)  #split list strings into list of characters
            tmp = []
            # iterate through sublist items and remove if present in the stop_list
            for chars in list(j):
                if chars not in self.stop_list:  #IMPORTANT adding or removing values will change the output
                    tmp.append(chars)
                    joined_chars = ''.join(tmp)
                else:
                    del chars

            self.clean_sent.append(joined_chars)

        return self.clean_sent

    ########################ReGex POS Tagging#######################

    #baseline nltk parser
    def nltk_tagger(self, pros_list):
        """
            This provides basic functionality for tagging and parsing a single document.
            This method takes a tokenized document or list of words with 'most' special
            characters removed from the file.
        
        """

        self.tagged = nltk.pos_tag(
            pros_list)  #Implements the MAXENT POS Tagger

        #returns a list of objects. Relevant phrase objects will have a label associated
        return self.tagged

    """#######################################################################################
    
    Spacy Chunker/POS Tool
    
    In order to use the spaCy tools (space_tagger and space_ent), you must pass raw text strings,
    not lists, as is the case with the other methods.
    
    ########################################################################################"""

    def space_tagger(self, text_doc):

        self.docs = []

        doc = self.nlp(text_doc)  #load text

        for word in doc:
            seq = (word.text, word.tag_)
            self.docs.append(seq)

        #self.docs.append(sents)

        return self.docs

    def space_ent(self, text_doc):

        self.doc_ents = []

        doc = self.nlp(text_doc)  #load text

        for word in doc.ents:
            seq = (word.text, word.label_)
            self.doc_ents.append(seq)

        #self.docs.append(sents)

        return self.doc_ents

    def parse(self, tagged_text):

        cp = nltk.RegexpParser(self.grammar_II)  #

        self.result = cp.parse(tagged_text)

        return self.result

    """***********************************************************************************
        Sentence based chunker methods - As of 3/22/2017 these produce the best results.
    
    ************************************************************************************"""

    def np_sub_chunks(self, result):
        """
            Finds NP (nounphrase) leaf nodes of a chunk tree.
            Takes on the result list from the re_parse_chunker function.
        
            Checks to see if object in sentence tree has a label. 
            Items without labels are ignored.
        """
        self.clean_list = []

        #uses the nltk.Tree class and tree class methods | worth reading
        for tree in result.subtrees():
            phrases = []
            #load label for each expression - labels are defined by the grammar attribute
            phrases.append(tree.label())

            for i in tree.pos():
                phrases.append(i[0][0])

            self.sub_chunks.append(phrases)

        return self.sub_chunks

    def return_sub_chunks(self, lower=True):
        """
        Process NP Chunks and returns a list of unique phrases for a given document.
        """
        phrase_set = []
        self.sent_phrases = []

        tag_set = ["NMM"]

        [
            phrase_set.append(chunk) for chunk in self.sub_chunks
            if chunk[0] in tag_set
        ]

        for phrase in phrase_set:

            if lower == False:
                phrase[1:] = [' '.join(phrase[1:])]
            else:
                phrase[1:] = [' '.join(phrase[1:]).lower()]

            x = phrase[1:]
            self.sent_phrases.append(x[0])

        print(list(set(self.sent_phrases)))

        self.doc_phrases = list(set(self.sent_phrases))

        return self.doc_phrases

    ################################
    ################################
    ################################
    def return_chunks(self):
        """
        Process NP Chunks and returns a list of unique phrases for a given document.
        """
        phrase_set = []
        for chunk in self.chunks:
            phrase = ' '.join(chunk)
            phrase_set.append(phrase)

        self.doc_phrases = list(set(phrase_set))
        return self.doc_phrases

    #generate a data dump object to be passed to json.dumps method for saving dict list as json.
    def phrase_dump(self, doc_term, phrase_list, sentences):
        self.t_textmp_phrases.update(document=doc_term,
                                     phrases=phrase_list,
                                     sentences=sentences)

        return self.t_textmp_phrases

    """***********************************************************************************
        The following set of methods are currently in experimental phase.
        These include LDA and Non-Parametric Bayesian Inference Models for Topic Analysis.
        The combination of noun phrasing/named entity extraction along with topic models
        can provide a baseline for creating hierachical classification tools to detect
        hierachies of text/conceptual relationships.
    
    ************************************************************************************"""

    def gen_bag_words(self, documents):
        self.dictionary = corpora.Dictionary(documents)
        self.dictionary.save(
            'data_dump.dic')  #Note uses a different approach than SDRCake
        self.corpus = [
            self.dictionary.doc2bow(document) for document in documents
        ]
        return self.corpus

    def bayesian_topic(self):
        #this leverages gensim non-parametric bayesian algorithm
        self.model = models.HdpModel(self.corpus, id2word=self.dictionary)
        return self.model

    #IMPORTANT Method - Turns a document into an edge list - Document Terms as Network
    def pairwise(self, iterable):
        "s -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = tee(iterable)
        next(b, None)
        self.pairs = zip(a, b)
        return self.pairs

    """**********************************************************************************
    
        Baseline html tag removal. Should be called on all documents to ensure special characters 
        are removed. Works relatively well for our current purposes. It is an older method 
        originally supplied within the NLTK package but was removed in the most recent iterations. 
        I saved it because it actually works very well compared to other methods I have seen.
        
    ************************************************************************************"""

    def clean_html(self, html):
        """
        Remove HTML markup from the given string.

        :param html: the HTML string to be cleaned
        :type html: str
        :rtype: str
        """
        self.str_html = str(html)

        # First we remove inline JavaScript/CSS:
        self.cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "",
                              self.str_html.strip())
        # Then we remove html comments. This has to be done before removing regular
        # tags since comments can contain '>' characters.
        self.cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", self.cleaned)
        # Next we can remove the remaining tags:
        self.cleaned = re.sub(r"(?s)<.*?>", " ", self.cleaned)
        # Finally, we deal with whitespace
        self.cleaned = re.sub(r"&nbsp;", " ", self.cleaned)
        self.cleaned = re.sub(r"[\s]", "  ", self.cleaned)
        self.cleaned = re.sub(r"  ", " ", self.cleaned)
        self.cleaned = re.sub(r"  ", "\n", self.cleaned)

        self.clean = self.cleaned.split()

        # drop single-character fragments; filtering into a new list avoids
        # skipping items by mutating the list while iterating over it
        self.clean = [i for i in self.clean if len(i) > 1]

        self.clean = ' '.join(self.clean)

        return self.clean
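
A short usage sketch for the class above; the sample text is hypothetical. Note that `__init__` calls `spacy.load('en', parser=False)`, which presumes an older spaCy with the `en` shortcut link installed, so the spaCy-based methods may need a model such as `en_core_web_sm` on newer versions.

sample = ("Natural language processing tools split raw text into sentences, "
          "tokens and noun phrases before any modelling is done.")

tt = TransformText(sample)
print(tt.get_sentences())                # TextBlob sentence chunking
print(tt.get_bigrams())                  # bigrams via TextBlob.ngrams
tagged = tt.nltk_tagger(sample.split())  # NLTK POS tags
tree = tt.parse(tagged)                  # chunk with the regex grammar
print(tt.np_sub_chunks(tree))            # labelled phrase chunks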
Example #19
# [2-3] ON YOUR OWN:

# Using the code above for figures, create a new table that lists the top 10 most frequent words and how many times they occur in that text.

import plotly.graph_objects as go

fig = go.Figure(data=[
    go.Table(header=dict(values=['A Scores', 'B Scores']),
             cells=dict(values=[[100, 90, 80, 90], [95, 85, 75, 95]]))
])
fig.show()

#%%
from collections import Counter

split_words = blob.split()

word_counts = Counter(split_words)  # avoid shadowing the Counter class itself

most_occur = word_counts.most_common(10)

##print(most_occur)

#split into two variables
mostusedword, countofuse = zip(*most_occur)

print(mostusedword)
print(countofuse)

#I tried to install "Plotly" to do this question, but I do not have the administrative
#privileges to download plotly. Thus, I can't make a table for this question, but I
Example #20
if __name__ == "__main__":

	count = 0
	days = {}
	polarity = {}
	num_files = int(sys.argv[1])
	#print num_files
	for i in range (1,num_files+1):
		input_file = open(sys.argv[1+i],'r')
		#print i
		for line in input_file:
			tweet_json = json.loads(line)
			tweet = TextBlob(tweet_json['text'])
			blob = TextBlob(tweet_json['created_at'])
			#print blob
			date = blob.split(' ')
			day = date[0]
			if day == 'Sun':
				day = "2016-04-03"
			if day == 'Mon':
				day = "2016-04-04"
			if day == 'Tue':
				day = "2016-04-05"
			if day == 'Wed':
				day = "2016-04-06"
			if day == 'Thu':
				day = "2016-04-07"
			if day == 'Fri':
				day = "2016-04-08"
			if day == 'Sat':
				day = "2016-04-09"
Example #21
class Article_Reading:
    link = None
    url = None
    article = None
    analysis = None
    ParseText = None
    file_name = 'data/saved_article.csv'

    def __init__(self,url):
        self.url=url
        self.article = Article(self.url)
        self.article.download()
        self.article.parse()
        self.article.nlp()
        self.ParseText=TextBlob(self.article.text)
        
    def analize_total_text(self):
        self.analysis=TextBlob(self.article.text)
        print(self.analysis.polarity)
    
    def analize_by_sentence(self):
        for sentence in self.ParseText.sentences:
            print(sentence)
    

    def get_article_title(self):
        print(self.article.title)

        return self.article.title
    
    def get_article_author(self):
        print(self.article.authors)

        return self.article.authors
    
    def get_article_date(self):
        print(self.article.publish_date)

        return self.article.publish_date
    
    def get_article_summary(self):
        print(self.article.summary)

        return self.article.summary

    
    def get_article_tags(self):
        print(self.article.tags)

        return self.article.tags
    
    def save_article(self):
        from string import punctuation

        # print(punctuation)

        punctuation=list(punctuation)
        punctuation.append('\n')
        
        # for sent in self.ParseText:
            # print(sent)
        test=self.ParseText.split(' ')
        print(test)

        # tokens=[ token for token in self.ParseText if token not in punctuation ]

        # print(tokens)

        # punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~''\“\”/'

        # for sentence in self.ParseText.sentences:
        #     for ele in sentence:
        #         if ele in punc:
        #             sentence=sentence.replace(ele,"")
        # print(sentence)


        


        # row_contents=[f"{self.get_article_title()}",f"{self.get_article_author()}",f"{self.get_article_date()}",f"{self.get_article_summary()}",f"{self.get_article_tags()}"]

        # with open(self.file_name,'a+',newline='') as  write_obj:

        #     csv_writter=writer(write_obj)

        #     csv_writter.writerow(row_contents)


        # print(row_contents)
        
        pass

    
   



    


    
    def test(self):
        print("hello")
Example #22
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from pathlib import Path
from wordcloud import WordCloud
import imageio

nltk.download("stopwords")
stops = stopwords.words("english")

old_john = TextBlob(Path("book of John text.txt").read_text())
john = old_john.split()

# Creates a new list of words containing no stop words
new_john = [jan for jan in john if jan not in stops]

# This block converts the book of John into string tokens.
# If any of these tokenized words qualifies as a "noun" part of speech,
# the noun is added to the list "noun_john" - a frequency distribution is constructed from the list of nouns
# .most_common(x) can be used to return a tuple of "x" most common words and a counter
is_noun = lambda pos: pos[:2] == 'NN'
tokenized_john = nltk.word_tokenize(str(old_john))
noun_john = [
    word for (word, pos) in nltk.pos_tag(tokenized_john) if is_noun(pos)
]
stopwords = nltk.corpus.stopwords.words('english')
john_frequency = nltk.FreqDist(w.lower() for w in noun_john
                               if w not in stopwords)
top15_johns = john_frequency.most_common(15)
john_wc = " ".join([str(jee) for jee in top15_johns])
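
The excerpt above imports WordCloud and imageio without using them; a plausible continuation, assuming the intent was to render the most common nouns as a word cloud, might look like this.

# Join the nouns from the (word, count) pairs and render them as a word cloud.
cloud_text = " ".join(word for word, _ in top15_johns)
cloud = WordCloud(width=800, height=600, background_color="white").generate(cloud_text)
cloud.to_file("john_wordcloud.png")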
Example #23
    def Getsearch(self):

        auth = OAuthHandler(self.API_KEY, self.API_SECRET_KEY)
        auth.set_access_token(self.ACESS_TOKEN_KEY,
                              self.ACESS_TOKEN_SECRET_KEY)
        api = tweepy.API(auth)
        searchTerm = str(self.search1.get())
        NoOfTerms = int(self.search2.get())
        tweets = tweepy.Cursor(api.search, q=searchTerm,
                               lang="en").items(NoOfTerms)
        fp1 = open("tweets.csv", 'w')
        count = 1
        xx = []
        count = 1
        for tweet in tweets:
            xx.append(tweet.text)
        for tweet in xx:
            tweet = re.sub('(www\.[^\s]+)', '', str(tweet))  # remove url
            tweet = re.sub(r'https?:\/\/.*\/\w*', '',
                           str(tweet))  # remove hyperlink
            tweet = re.sub(r'&\w*', '', str(tweet))  #remove &amp
            tweet = re.sub('@[^\s]+', '', str(tweet))  #remove @
            tweet = re.sub(r'#\w*', '', str(tweet))  #remove hashtags
            tweet = re.sub(r'\$\w*', '', str(tweet))  # Remove tickers
            tweet = tweet.strip(
                ' ')  #remove white spaces from the front and end of a string
            tweet = tweet.lower()  # remove upper case
            negations_dic = {
                "isn't": "is not",
                "aren't": "are not",
                "wasn't": "was not",
                "weren't": "were not",
                "haven't": "have not",
                "hasn't": "has not",
                "hadn't": "had not",
                "won't": "will not",
                "wouldn't": "would not",
                "don't": "do not",
                "doesn't": "does not",
                "didn't": "did not",
                "can't": "can not",
                "couldn't": "could not",
                "shouldn't": "should not",
                "mightn't": "might not",
                "mustn't": "must not"
            }
            t = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
            tweet = t.sub(lambda x: negations_dic[x.group()], str(tweet))
            tweet = re.sub('[^a-zA-Z]', ' ', str(tweet))  # take alphabet only
            tweet = TextBlob(tweet).correct()

            tweet = re.sub(r'\b\w{1,2}\b', '',
                           str(tweet))  # Remove words with 2 or fewer letters
            tweet = re.sub('[\s]+', ' ',
                           str(tweet))  #Remove additional white spaces
            tweet = tweet.strip(
                ' ')  #remove white spaces from the front and end of a string
            tweet = tweet.split()
            ps = PorterStemmer()  #stemmer for removing suffixes, like “ing”, “ly”, “s”, etc
            tweet = [ps.stem(word) for word in tweet]  #apply the stemmer to each token
            tweet = str(' '.join(tweet))

            tweetx = "\n" + tweet + "\n"
            self.TxtBox.insert(0.0, tweetx)
            count = count + 1
            fp1.writelines(tweet + '\n')
        fp1.close()
Example #24
    def clean_text(self, text):

        text = text.lower()
        text = re.sub(
            r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
            "", text)
        text = re.sub(
            r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}",
            "", text)

        if self.clean_wiki_tokens:
            # pictures
            text = re.sub(r"image:[a-zA-Z0-9]*\.jpg", " ", text)
            text = re.sub(r"image:[a-zA-Z0-9]*\.png", " ", text)
            text = re.sub(r"image:[a-zA-Z0-9]*\.gif", " ", text)
            text = re.sub(r"image:[a-zA-Z0-9]*\.bmp", " ", text)

            # css
            text = re.sub(r"#([A-Fa-f0-9]{6}|[A-Fa-f0-9]{3})", " ", text)
            text = re.sub(r"\{\|[^\}]*\|\}", " ", text)

            # templates
            text = re.sub(r"\[?\[user:.*\]", " ", text)
            text = re.sub(r"\[?\[user:.*\|", " ", text)
            text = re.sub(r"\[?\[wikipedia:.*\]", " ", text)
            text = re.sub(r"\[?\[wikipedia:.*\|", " ", text)
            text = re.sub(r"\[?\[special:.*\]", " ", text)
            text = re.sub(r"\[?\[special:.*\|", " ", text)
            text = re.sub(r"\[?\[category:.*\]", " ", text)
            text = re.sub(r"\[?\[category:.*\|", " ", text)

        # clean char type
        for typo, correct in self.clean_word_dict.items():
            text = re.sub(typo, " " + correct + " ", text)
            # text = re.sub(typo, correct, text)

        # abbr convert
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r",", " ", text)
        text = re.sub(r"\.", " ", text)
        text = re.sub(r"!", " ! ", text)
        text = re.sub(r"\/", " ", text)
        text = re.sub(r"\?", " ? ", text)
        text = re.sub(r"\!", " ! ", text)
        text = re.sub(r"\"", " ", text)
        text = re.sub(r"\^", " ^ ", text)
        text = re.sub(r"\+", " + ", text)
        text = re.sub(r"\-", " - ", text)
        text = re.sub(r"\=", " = ", text)
        text = re.sub(r"'", " ", text)
        text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
        text = re.sub(r":", " : ", text)
        text = re.sub(r" e g ", " eg ", text)
        text = re.sub(r" b g ", " bg ", text)
        text = re.sub(r" u s ", " american ", text)
        text = re.sub(r"\0s", "0", text)
        text = re.sub(r" 9 11 ", "911", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r"j k", "jk", text)
        text = re.sub(r"\s{2,}", " ", text)
        # Numeric chars
        text = re.sub(r'\d+', " ", text)
        # Get rid of punctuation (after the abbreviations are expanded, punctuation is useless)
        text = re.sub(r"^\w\s", "", text)

        # In toxic, words like fuckkkkk or fffffuck are explicit
        if self.convert_typo:
            convert_text = text.split()
            convert_text = [
                w if "f**k" not in w else "f**k" for w in convert_text
            ]
            convert_text = [
                w if "dick" not in w else "dick" for w in convert_text
            ]
            convert_text = [
                w if "bitch" not in w else "bitch" for w in convert_text
            ]
            text = " ".join(convert_text)

        if self.stem_words:
            text = text.split()
            stemmer = SnowballStemmer('english')
            stemmed_words = [stemmer.stem(word) for word in text]
            text = " ".join(stemmed_words)

        if self.lemmatize:
            text = text.split()
            wnl = WordNetLemmatizer()
            lemmed_words = [wnl.lemmatize(word) for word in text]
            text = " ".join(lemmed_words)

        if self.remove_stopwords:
            raise NotImplementedError

        if self.error_correct:
            text = TextBlob(text).correct()

        if self.count_null_words:
            text = text.split()
            for t in text:
                self.word_count_dict[t] += 1
            text = " ".join(text)

        # Get rid of unnecessary blanks (when char_level == True, too many blanks may hurt the result)
        if not self.count_null_words and not self.stem_words and not self.lemmatize:
            text = " ".join(text.split())

        return text
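
The method above reads several configuration attributes from its owning object (`clean_wiki_tokens`, `clean_word_dict`, `convert_typo`, `stem_words`, `lemmatize`, `remove_stopwords`, `error_correct`, `count_null_words`, `word_count_dict`), which the excerpt does not show. A minimal, hypothetical harness that supplies them lets the method be exercised on its own:

import re
from collections import defaultdict
from types import SimpleNamespace

# Hypothetical stand-in for the unshown owning class; every flag value is an assumption.
config = SimpleNamespace(
    clean_wiki_tokens=True,
    clean_word_dict={},            # typo -> correction mapping
    convert_typo=False,
    stem_words=False,
    lemmatize=False,
    remove_stopwords=False,
    error_correct=False,
    count_null_words=False,
    word_count_dict=defaultdict(int),
)

print(clean_text(config, "Check https://example.com -- it's GREAT, isn't it?!"))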
Example #25
ax1.grid(zorder=1)
ax1.xaxis.grid(False)
plt.hist(td['duration'],range(0,4000,250),zorder=0,color = "#66B266")
plt.xlabel('duration(seconds)')
plt.ylabel('How many talks in that duration')
plt.title('TED duration Distribution')
plt.axvline(x=td['duration'].mean(),linestyle='--')
plt.axvline(x=td['duration'].median(),color = '#FFFF7F',linestyle='-.')
plt.legend(['mean of duration','median of duration'], loc='upper right')
plt.show()
# top 10 tags
m = ['[',"'",',',']']
tags_split = []
indi_tag = []
for t in title_rank['tags']:
    t = t.split("'")
    #print(t)
    # filter into a new list instead of removing items while iterating
    t = [i for i in t if i and i[0] not in m]
    tags_split.append(t)
title_rank['tags_split'] = tags_split
for row in tags_split:
    for w in row:
        if w in indi_tag:
            continue
        else:
            indi_tag.append(w)
tags_count = []
for t in title_rank['tags_split']:
    tags_count.append(len(t))
Example #26
from textblob.decorators import requires_nltk_corpus
from textblob.base import BaseTagger
import os

# importing a text:
text1 = filehandle = open('corpus1.txt')
text = filehandle.read()

# removes \n from the text and adds each line to lines_list
lines_list = text.splitlines()
# joins the elements of lines_list and makes raw text
text_raw = "".join(lines_list)
# implement raw text to textblob
blob = TextBlob(text_raw)
# parsing sentences
sentences = blob.split('.')
# tag sentences
tags = blob.tags
# print the sentences
print('\n\n{:-^160}'.format(' Parsed Sentences ') + '\n\n')
for i in range(0, len(sentences)):
    print(sentences[i])
# put tags into a list
list3_tags = []
for i in range(0, len(sentences)):
    list3_tags.append(tag(sentences[i]))
tags_list_final = []
tags_final = []
# separate tags by sentence
for i in range(0, (len(list3_tags) - 1)):
    for j in range(0, len(list3_tags[i])):
Example #27
def sentiment_page():
    uid = request.cookies.get("UID")
    location = request.cookies.get("Location")
    keyword = request.cookies.get("Keyword")
    coordDict = {
        "NY": (40.7829, -73.9682),
        "LA": (34.0522, -118.2436),
        "CH": (41.8781, -87.6232),
        "US": (39.8283, -98.5795)
    }
    if location != "US":
        zoom = 11
    else:
        zoom = 5
    print("Starting Analysis...")
    db = get_db()
    c = db.cursor()

    gmap = gmplot.GoogleMapPlotter(
        coordDict[location][0],
        coordDict[location][1],
        zoom,
        apikey='AIzaSyCEYyEKiSKuoEW20-XKL53kJ3CuySnWVbI')
    posNounPhrases = dict()
    negNounPhrases = dict()
    count = 0
    for row in c.execute('''SELECT * FROM Tweets'''):
        #text is 2, lats are 3, lons are 4
        tweetBlob = TextBlob(row[2])
        nounList = tweetBlob.split()
        if keyword in nounList or keyword == "NONE":
            if float(tweetBlob.sentiment.polarity) > .75:
                gmap.marker(float(row[3]), float(row[4]), 'maroon')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    try:
                        posNounPhrases[i] += 1
                    except KeyError:
                        posNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) > .5:
                gmap.marker(float(row[3]), float(row[4]), 'red')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    try:
                        posNounPhrases[i] += 1
                    except KeyError:
                        posNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) > .25:
                gmap.marker(float(row[3]), float(row[4]), 'deeppink')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    try:
                        posNounPhrases[i] += 1
                    except KeyError:
                        posNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) > 0:
                gmap.marker(float(row[3]), float(row[4]), 'pink')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    try:
                        posNounPhrases[i] += 1
                    except KeyError:
                        posNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) == 0:
                pass  #gmap.marker(float(row[3]), float(row[4]), '#FFFFFF')
                #print(tweetBlob.sentiment.polarity, row[2])
                count += 1
            elif float(tweetBlob.sentiment.polarity) > -.25:
                gmap.marker(float(row[3]), float(row[4]), 'lightblue')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    #print(i)
                    try:
                        negNounPhrases[i] += 1
                    except KeyError:
                        negNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) > -.5:
                gmap.marker(float(row[3]), float(row[4]), 'deepskyblue')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    #print(i)
                    try:
                        negNounPhrases[i] += 1
                    except KeyError:
                        negNounPhrases[i] = 1
            elif float(tweetBlob.sentiment.polarity) > -.75:
                gmap.marker(float(row[3]), float(row[4]), 'cornflowerblue')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    #print(i)
                    try:
                        negNounPhrases[i] += 1
                    except KeyError:
                        negNounPhrases[i] = 1
            else:
                gmap.marker(float(row[3]), float(row[4]), 'darkslateblue')
                #print(tweetBlob.sentiment.polarity, row[2])
                for i in nounList:
                    #print(i)
                    try:
                        negNounPhrases[i] += 1
                    except KeyError:
                        negNounPhrases[i] = 1

    dir_path = os.path.dirname(os.path.realpath(__file__))
    newPath = dir_path + "/templates/{0}.html".format(uid)
    gmap.draw(newPath)
    pos = []
    neg = []
    #print(posNounPhrases)
    #print(negNounPhrases)
    for i in posNounPhrases:
        pos.append((posNounPhrases[i], i))
    for i in negNounPhrases:
        neg.append((negNounPhrases[i], i))
    pos = sorted(pos, reverse=True)
    neg = sorted(neg, reverse=True)
    posStr = ""
    negStr = ""
    posCount = 0
    negCount = 0
    for i in range(100):
        try:
            if posCount < 10 and len(pos[i][1]) > 3:
                posStr += pos[i][1] + ", "
                posCount += 1
        except IndexError:
            posCount = 10
        try:
            if negCount < 10 and len(neg[i][1]) > 3:
                negStr += neg[i][1] + ", "
                negCount += 1
        except IndexError:
            negCount = 10
    print("Most positive phrases: " + posStr)
    print("Most negative phrases: " + negStr)
    print("Total tweets with neutral sentiment: " + str(count))
    count = 0
    flag = False
    flag2 = True
    with open(newPath, "r+") as f:
        data = f.readlines()
        f.seek(0)  # rewind so the rewritten page replaces, rather than appends to, the file
        for line in data:
            #if count==2:
            #                f.write('<meta http-equiv="refresh" content="120; url=http://127.0.0.1:5000/SentimentPins"/>')
            count += 1
            if flag == True and flag2 == True:
                flag2 = False
                flag = False

                f.write('<ul class="list-group">\n')
                f.write(
                    '<li class="list-group-item">People in {0} are upset about . . .</li>\n'
                    .format(location))
                f.write(
                    '<li class="list-group-item">{0}</li>\n'.format(negStr))
                f.write('</ul>\n\n')

                f.write('<ul class="list-group">\n')
                f.write(
                    '<li class="list-group-item">People in {0} are happy about . . .</li>\n'
                    .format(location))
                f.write(
                    '<li class="list-group-item">{0}</li>\n'.format(posStr))
                f.write('</ul>\n')
            if "body" in line:
                flag = True
            f.write(line)

    f.close()
    wb.open_new_tab("file://" + newPath)
    return redirect('/Home')
Example #28
def calc_sentiment(text):
    blob = TextBlob(clean_tweet(text))
    return [blob.sentiment.polarity,len(blob.split(" "))]
Example #29
def inputNumber(message):
    while True:
        try:
            userInput = int(input(message))
        except ValueError:
            print("Invalid input. Please enter a number: 1, 2, 3, or 4.")
            continue
        if userInput not in [1, 2, 3, 4]:
            print("Invalid integer. Please enter 1, 2, 3, or 4.")
            continue
##############################################################################################################
#######--------CHOICE-#1:-DOCUMENT-FILE----------------------------------------------------------##############
##############################################################################################################
        if userInput == 1:
            docchoice = input("Please enter the name of the Text File.\n")
            sourcedoc = open(docchoice, 'r')
            readsource = sourcedoc.read()
            lowfile = readsource.lower()
            #            filesoup = BeautifulSoup(lowfile,'lxml')
            #            filetext = filesoup.get_text(strip = True)
            #            sent = TextBlob(filetext)
            sent = TextBlob(lowfile)
            slashsplice = sent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            sentblob = TextBlob(lowfile)
            filepunct = TextBlob(str(remove_punctuation(dashsplice2)))
            finaltext = str(remove_punctuation(dashsplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sentblob.sentiment.polarity
            subject = sentblob.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in filepunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            #            splitpunct = filepunct.split()
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '2', '1', '0',
                '–', '’', '’', '“', '”'
            ]
            #            tokens = [w for w in splitpunct]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n-------30 MOST COMMON WORDS-------: \n")
            for key, value in count.most_common(30):
                print("   " + str(value) + " - " + key)
            print("\n-------FREQUENCY CHART-------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(15, cumulative=False)
            ##---------------PHRASE (1,2,3,4 WORDS) COUNTER----------------------------------------
            bitokens = nltk.word_tokenize(finaltext)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
####---------------------READABILITY INDEX--------------------###########
            flesh = int(textstat.flesch_reading_ease(readsource))
            print("--------FLESCH READING EASE TEST--------\n",
                  "\n    Readability Score: ", flesh)
            if flesh < 30:
                print(
                    "    Very difficult to read. Best understood by university graduates."
                )
            elif flesh < 50:
                print("    Difficult to read.")
            elif flesh < 60:
                print("    Fairly difficult to read.")
            elif flesh < 70:
                print(
                    "    Plain English. Easily understood by 13- to 15-year-old students."
                )
            elif flesh < 80:
                print("    Fairly easy to read.")
            elif flesh < 90:
                print("    Easy to read.")
            else:
                print(
                    "    Very easy to read. Easily understood by an average 11-year-old student."
                )
            print("-----------------------------------\n")

            ##################---END. LOOP---##########################################################################################################
            again = input(
                "\nThank you for using BTL 0.6. Run Again? [Y / N]\n")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

##############################################################################################################
####----------CHOICE-#2:-URL/LINK-------------------------------------------------------------------------------
##############################################################################################################
        if userInput == 2:
            webchoice = input("Please enter the URL of the website.\n")
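            # Fetch the page and strip the HTML tags to leave plain text for analysis.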
            webdoc = urllib.request.urlopen(webchoice)
            # decode the response bytes so the string/TextBlob calls below get text
            readweb = webdoc.read().decode('utf-8', errors='ignore')
            websoup = w3lib.html.remove_tags(readweb)
            #            websoup = BeautifulSoup(readweb,'html5lib')
            #  websoup2 = websoup.text
            print(websoup)
            lowweb = websoup.lower()
            websent = TextBlob(lowweb)
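            # Normalize slashes, dashes and page references ('p.', 'pp.') to spaces
            # so they do not glue neighbouring words together before tokenization.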
            slashsplice = websent.replace('/', ' ')
            dashsplice = (slashsplice.replace('-', ' '))
            dashsplice2 = (dashsplice.replace('–', ' '))
            dashsplice3 = (dashsplice2.replace(' – ', ' '))
            pagesplice = dashsplice3.replace(' p. ', ' ')
            pagesplice2 = pagesplice.replace(' pp.', ' ')
            webpunct = TextBlob(str(remove_punctuation(pagesplice2)))
            finalweb = str(remove_punctuation(pagesplice2))
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = websent.sentiment.polarity
            subject = websent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in webpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            tokens = [w.translate(punctuate) for w in lemmatized_list]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', " ",
                'mwparseroutput', 'wwww3org', 'xmlnshttp', 'also', '1', '0',
                'svg', '2', 'jw', '’', '“', '”', 'u'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n---------MOST COMMON WORDS---------: \n")
            for key, value in count.most_common(30):
                print("   " + key + " - " + str(value))
            print("\n---------FREQUENCY CHART---------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (2, 3, 4 WORDS) COUNTER----------------------------------------
            ###################################################################################
            bitokens = nltk.word_tokenize(finalweb)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(20)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(20)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(20)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
    #################################################################################################
    ##---------------READABILITY INDEX----------------------------------------
    ###################################################################################
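            # Readability index for the fetched page, mirroring the readability
            # checks in the other branches; applied to the tag-stripped text in websoup.
            flesh = int(textstat.flesch_reading_ease(websoup))
            print("\n--------FLESCH READING EASE TEST--------\n",
                  "\n    Readability Score: ", flesh)
            if flesh < 30:
                print(
                    "    Very difficult to read. Best understood by university graduates."
                )
            elif flesh < 50:
                print("    Difficult to read.")
            elif flesh < 60:
                print("    Fairly difficult to read.")
            elif flesh < 70:
                print(
                    "    Plain English. Easily understood by 13- to 15-year-old students."
                )
            elif flesh < 80:
                print("    Fairly easy to read.")
            elif flesh < 90:
                print("    Easy to read.")
            else:
                print(
                    "    Very easy to read. Easily understood by an average 11-year-old student."
                )
            print("-----------------------------------\n")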
    ##########---------------END LOOP---------------------##############################
            again = input("\nThank you for using BTL 0.6. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break

########################################################################################################################
############--------CHOICE-#3:-MANUAL-INPUT----------########################################
############################################################################################################

        if userInput == 3:
            manchoice = input("Please enter your text here:\n")
            lowman = manchoice.lower()
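            # Strip any HTML markup pasted along with the text; sentiment is scored
            # on the original-case input, word counts on the lowercased version.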
            mansoup = BeautifulSoup(lowman, 'html5lib')
            mantext = mansoup.get_text(strip=True)
            mansent = TextBlob(mantext)
            sent = TextBlob(manchoice)
            manpunct = TextBlob(str(remove_punctuation(mansent)))
            finalman = str(remove_punctuation(mansent))
            splitpunct = manpunct.split()
            print("\n-----------------------------------------------")
            print("-----Sentiment Analysis Guide------------------")
            print("-----------------------------------------------")
            print(
                "    Polarity(Emotion): \n    [ -1:Negative,   0:Neutral,   1:Positive ]"
            )
            print(
                "\n    Subjectivity(Fact VS Opinion): \n    [ 0:Objective    1:Subjective ]"
            )
            print("------------------------------------------------")
            polar = sent.sentiment.polarity
            subject = sent.sentiment.subjectivity
            print("\n|------------------------------------|")
            print("|-----SENTIMENT ANALYSIS RESULTS-----|")
            print("|------------------------------------|")
            print("|    Polarity: ", polar,
                  "                \n|    Subjectivity: ", subject,
                  "            ")
            print("|------------------------------------|")
            tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
            words_and_tags = [(w, tag_dict.get(pos[0], 'n'))
                              for w, pos in manpunct.tags]
            lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
            punctuate = str.maketrans('', '', string.punctuation)
            #            tokens = [w.translate(punctuate) for w in lemmatized_list]
            tokens = [w for w in splitpunct]
            stoplist = stopwords.words('english') + [
                'ie', 'may', 'us', 'shall', 'etc', 'thereof', '—'
            ]
            clean_tokens = tokens[:]
            for token in tokens:
                if token in stoplist:
                    clean_tokens.remove(token)
            count = Counter(clean_tokens)
            print("\n------35 MOST COMMON WORDS------: \n")
            for key, value in count.most_common(35):
                print("   " + key + " - " + str(value))
            print("\n------FREQUENCY CHART------:")
            freq = nltk.FreqDist(clean_tokens)
            freq.plot(10, cumulative=False)
            #################################################################################################
            ##---------------PHRASE (2, 3, 4 WORDS) COUNTER----------------------------------------
            ##################################################################################
            bitokens = nltk.word_tokenize(finalman)
            bgs = nltk.ngrams(bitokens, 2)
            fdist = nltk.FreqDist(bgs)
            count = fdist.most_common(10)
            tgs = nltk.ngrams(bitokens, 3)
            fdist2 = nltk.FreqDist(tgs)
            count2 = fdist2.most_common(10)
            qgs = nltk.ngrams(bitokens, 4)
            fdist3 = nltk.FreqDist(qgs)
            count3 = fdist3.most_common(10)
            print("\n--------COMMON PHRASES (2 WORDS)--------:\n")
            for (key, key2), value in count:
                print("   ", key, "", key2, "", "-", value)
            print("\n--------COMMON PHRASES (3 WORDS)--------:\n")
            for (key, key2, key3), value in count2:
                print("   ", key, "", key2, "", key3, "-", value)
            print("\n--------COMMON PHRASES (4 WORDS)--------:\n")
            for (key, key2, key3, key4), value in count3:
                print("   ", key, "", key2, "", key3, "", key4, "-", value)
    ######---------------READABILITY INDEX----------------######
            flesh = int(textstat.flesch_reading_ease(manchoice))
            print("\n----------FLESCH READING EASE TEST----------:\n",
                  "\n    Readability Score: ", flesh, "\n")
            if flesh < 30:
                print(
                    "    --Very difficult to read. Best understood by university graduates.--"
                )
            elif flesh < 50:
                print("    --Difficult to read.--")
            elif flesh < 60:
                print("    --Fairly difficult to read.--")
            elif flesh < 70:
                print(
                    "    --Plain English. Easily understood by 13- to 15-year-old students.--"
                )
            elif flesh < 80:
                print("    --Fairly easy to read.--")
            elif flesh < 90:
                print("    --Easy to read.--")
            else:
                print(
                    "    --Very easy to read. Easily understood by an average 11-year-old student.--"
                )
            print("\n------------------------------------------\n")

            again = input("\nThank you for using BTL 0.3. Run Again? [Y / N]")
            acceptable = ["Y", "y", "N", "n"]
            if again in ["Y", "y"]:
                print("What kind of document?")
                return inputNumber(message)
            if again in ["N", "n"]:
                print("Bye!")
                quit()
            if again not in acceptable:
                print(
                    "\nSorry, didn't catch that. Please select an option below:"
                )
                return inputNumber(message)
            break
###################################################################################################################
##########---------CHOICE 4: QUIT PROGRAM-------------------------------------------------------------------------------
######################################################################################################################
        if userInput == 4:
            print("Thank you for using BTL 0.5. Bye!")
            quit()
            break
Exemple #30
0
#textblob for spelling correction
check_spel = pos_str
pos_str = TextBlob(check_spel).correct()  # run TextBlob's spell checker over the query

try:
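    # Proceed only if the raw query token exists in the embedding vocabulary
    # (gensim KeyedVectors .vocab); failures are handled by the surrounding try block.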
    if check_spel in word_vectors.vocab:

        pos_str = str(pos_str)
        # remove spaces at both the beginning and the end of the string
        pos_str = re.sub("^\s+|\s+$", "", pos_str, flags=re.UNICODE)

        # replace any character that is NOT a-z, A-Z, 0-9, '-', '_', '*' or '.' with a space
        pos_str = re.sub('[^a-zA-Z0-9-_*.]', ' ', pos_str)
        pos_str = re.sub(' +', ' ', re.sub('\W', ' ', pos_str))

        pos_words = pos_str.split(' ')

        if (len(pos_words[0]) > 0):

            st.write('SIMILAR TO ', pos_str)
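            # Query the word2vec model for the 10 terms most similar to the
            # input word(s) and keep the word/cosine-similarity pairs in a DataFrame.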
            df = pd.DataFrame(model.wv.most_similar(positive=pos_words,
                                                    topn=10),
                              columns=['SIMILAR_word', 'similarity'])
            df1 = df[['SIMILAR_word']]
            link_list = []
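            # Build a Google Scholar search link for each similar term.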
            for i in df['SIMILAR_word']:

                word = 'https://scholar.google.nl/scholar?hl=nl&as_sdt=0%2C5&q=' + i

                link_list.append(word)
            # rename column as SIMILAR for UI