Example no. 1
def split_sentence_based_on_verbs(reviewText):
    review_spacy = nlp(reviewText)
    review_textblob = TextBlob(reviewText)
    if not review_textblob.detect_language() == 'en':
        review_textblob = review_textblob.translate(to='en')
        review_spacy = nlp(review_textblob.string)
    else:
        contains_romanian_words = 0

        for word in review_textblob.words:
            word_textblob = TextBlob(word)
            if len(word_textblob.string) >= 3 and word_textblob.detect_language() == 'ro':
                contains_romanian_words = 1
                break

        if contains_romanian_words == 1:
            new_reviewText = ''
            for word in review_spacy:
                word_textblob = TextBlob(word.orth_)
                if not word.is_title and len(word_textblob.string) >= 3:
                    if word_textblob.detect_language() != 'ro':
                        new_reviewText = new_reviewText + ' ' + word_textblob.string
                    else:
                        new_word = word_textblob.translate(to='en')
                        new_reviewText = new_reviewText + ' ' + new_word.string
                else:
                    new_reviewText = new_reviewText + ' ' + word_textblob.string
                    # only_english_words = 0
                    # break
            review_textblob = TextBlob(new_reviewText)
            review_spacy = nlp(review_textblob.string)

    new_sentences = []
    verbs_positions = []
    for k in range(0, len(review_spacy)):
        if review_spacy[k].pos == VERB and review_spacy[k].dep_ == 'ROOT':
            verbs_positions.append(k)
    start = 0
    if len(verbs_positions) > 0:
        for p in range(0, len(verbs_positions)):
            if p == len(verbs_positions) - 1:
                new_sentences.append(review_spacy[start:len(review_spacy)].text)
            else:
                q = verbs_positions[p] + 1
                while q < len(review_spacy):
                    if review_spacy[q].is_stop and (
                            (review_spacy[q].pos == CONJ and q < len(review_spacy) - 1 and review_spacy[q - 1].pos != review_spacy[q + 1].pos)
                            or (review_spacy[q].pos == DET and review_spacy[q].lower_ in ['the', 'this', 'those', 'which', 'other', 'another'])
                            or (review_spacy[q].pos == PUNCT and review_spacy[q].text in [',', ';'])):  # compare the token's text, not the Token object
                        new_sentences.append(review_spacy[start:q].text)
                        start = q
                        break
                    q += 1
    else:
        new_sentences.append(reviewText)
    return new_sentences
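
A minimal driver for the snippet above, offered as a sketch: the function leans on module-level names it never defines, so the spaCy model choice and import locations below are assumptions.

import spacy
from spacy.symbols import VERB, CONJ, DET, PUNCT  # the POS constants used above
from textblob import TextBlob

nlp = spacy.load("en_core_web_sm")  # any English spaCy model would do

for part in split_sentence_based_on_verbs("The room was clean and the staff answered quickly."):
    print(part)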
Example no. 2
def translate_msg(message):
    try:
        if (len(message.text) > 3):
            b = TextBlob(unicode(message.text))
            if (b.detect_language() == "ru"):
                tr_text = unicode(b.translate(to="en"))
                bot.send_message(message.chat.id, tr_text)
            elif (b.detect_language() == "en"):
                tr_text = unicode(b.translate(to="ru"))
                bot.send_message(message.chat.id, tr_text)
    except Exception as e:
        print (e.message)
        bot.send_message(message.chat.id, "Sorry Boss, can't translate :("
                                          " Try another message, please " +
                                          telegram.Emoji.KISSING_FACE)
Example no. 3
def findLanguage(reducedList3):
	languageMap = {}
	currentNumber = 0

	shuffle(reducedList3)
	for i in reducedList3:
		if currentNumber < 5000:
			if len(i[0]) > 5:
				try:
					b = TextBlob(unicode(i[0]))
					currentLanguage = b.detect_language()
					if currentLanguage in languageMap:
						languageMap[currentLanguage] += 1
					else:
						languageMap[currentLanguage] = 1
				except: 
					pass
			currentNumber += 1
			print currentNumber
		else:
			break  # 5000 entries sampled; no need to walk the rest of the list

	# languageMap already stores per-language counts, so Counter can consume it directly
	listOfWordsCounter = collections.Counter(languageMap)
	print 'Best Languages:', listOfWordsCounter.most_common(5)

	print languageMap
Example no. 4
def update_book(book):
    blob = TextBlob(book.description)

    if blob.detect_language() == 'en':
        description = ''
        nouns = filter(lambda x: x[1] == 'NN' or x[1] == 'NNP', blob.tags)

        for noun, tag in nouns:
            description += noun + " "

            if len(noun) > 2:
                description += TextBlob(noun).translate(to='ko').string + " "

    else:
        description = book.description

    book_document = search.Document(
        doc_id=book.ISBN,
        fields=[
            search.TextField(name='title', value=remove_punc(book.title)),
            search.TextField(name='author', value=remove_punc(book.author)),
            search.TextField(name='description', value=remove_punc(description))
        ]
    )

    index = get_book_index()
    index.put(book_document)
Example no. 5
    def scrape(self, links=None, ads=True, translator=False):
        if links is None:
            links = []  # avoid a shared mutable default argument
        responses = []
        data = []
        
        if ads:
            for link in links:
                r = requests.get(link)
                responses.append(r)
        else:
            for link in links:
                r = requests.get(link)
                text = unidecode(r.text)
                html = lxml.html.fromstring(text)

                ad_links = html.xpath("//div[@class='cat']/a/@href")  # separate name so the outer `links` loop keeps iterating its own list
                for link in ad_links:
                    if len(self.base_urls) > 1 or len(self.base_urls[0]) > 3:
                        time.sleep(random.randint(5,27))
                    try:
                        responses.append(requests.get(link))
                        print link
                    except requests.exceptions.ConnectionError:
                        print "hitting connection error"
                        continue

        for r in responses:
            values = {}  # fresh dict per response so each appended entry is independent
            text = r.text
            html = lxml.html.fromstring(text)
            values["title"] = html.xpath("//div[@id='postingTitle']/a/h1")[0].text_content()
            values["link"] = unidecode(r.url)
            values["new_keywords"] = []
            try:
                values["images"] = html.xpath("//img/@src")
            except IndexError:
                values["images"] = "weird index error"
            pre_decode_text = html.xpath("//div[@class='postingBody']")[0].text_content().replace("\n","").replace("\r","")  
            values["text_body"] = pre_decode_text 
            try:
                values["posted_at"] = html.xpath("//div[@class='adInfo']")[0].text_content().replace("\n"," ").replace("\r","")
            except IndexError:
                values["posted_at"] = "not given"
            values["scraped_at"] = str(datetime.datetime.now())
            body_blob = TextBlob(values["text_body"])
            title_blob = TextBlob(values["title"])
            values["language"] = body_blob.detect_language() #requires the internet - makes use of google translate api
            values["polarity"] = body_blob.polarity
            values["subjectivity"] = body_blob.sentiment[1]
            if values["language"] != "en" and not translator:
                values["translated_body"] = body_blob.translate(from_lang="es")
                values["translated_title"] = title_blob.translate(from_lang="es")
            else:
                values["translated_body"] = "none"
                values["translated_title"] = "none"
            text_body = values["text_body"]
            title = values["title"]
            values["phone_numbers"] = self.phone_number_parse(values)
            data.append(values)
        
        return data
Example no. 6
def answer(question):
    global IsAnswer,detected,u
    IsAnswer = True
    DetectLang = TextBlob(question)
    detected = DetectLang.detect_language()
    if detected == 'en':
        print("language detected: en")
        u = 'en'
        print(len(words),"len(words)")
        low = question.lower()
        questions = re.sub('[^\w]',' ',low).split() #list
        BadWords(questions)
        print(questions)
        def writeout(words,question,IsAnswer):
            r = []
            if len(words) > 3000:
                a1 = len(questions)
                for x in range(0,a1):
                    words.remove(random.choice(words))
                print(len(words),"len(words)")
            else:
                pass
            os.remove('newwords.txt')
            words.extend(questions)
            r.extend(words)
            s = ' '.join(r)
            with open('newwords.txt', 'w') as file:  # context manager ensures the file is closed
                file.write(s)
        writeout(words,question,IsAnswer)
        randomthought()
    else:
        u = detected
        print("language detected:",u)
        randomthought()
Example no. 7
def review_features_romanian(reviewText, type):
    review_spacy = nlp(reviewText)
    review_textblob = TextBlob(reviewText)
    review_spacy_ents = review_spacy.ents
    word_features_array = []
    # print(review_textblob)
    if not review_textblob.detect_language() == 'en':
        review_textblob = review_textblob.translate(to='en')
        review_spacy = nlp(review_textblob.string)
    else:
        contains_romanian_words = 0

        for word in review_textblob.words:
            word_textblob = TextBlob(word)
            if len(word_textblob.string) >= 3 and word_textblob.detect_language() == 'ro':
                contains_romanian_words = 1
                break

        if contains_romanian_words == 1:
            new_reviewText = ''
            for word in review_spacy:
                word_textblob = TextBlob(word.orth_)
                if not word.is_title and len(word_textblob.string) >= 3:
                    if word_textblob.detect_language() != 'ro':
                        new_reviewText = new_reviewText + ' ' + word_textblob.string
                    else:
                        new_word = word_textblob.translate(to='en')
                        new_reviewText = new_reviewText + ' ' + new_word.string
                else:
                    new_reviewText = new_reviewText + ' ' + word_textblob.string
            review_textblob = TextBlob(new_reviewText)
            review_spacy = nlp(review_textblob.string)
            # print(review_spacy)
    for i in range(len(review_spacy)):
        word = review_spacy[i]
        # if not word.is_stop and not word.is_punct:
        if (word.pos == NOUN or (word.pos == VERB and TextBlob(word.orth_).sentiment.polarity > 0) or word.pos == ADJ or word.pos == ADV) and not word.is_punct:
        # if word.pos == NOUN:
            if type == labelType.Label.aspect:
                word_features_array.append(word_aspect_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.attribute:
                word_features_array.append(word_attribute_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.polarity:
                word_features_array.append(word_polarity_features(review_spacy, review_textblob, review_spacy_ents, i))
            elif type == labelType.Label.emotion:
                word_features_array.append(word_emotion_features(review_spacy, review_textblob, review_spacy_ents, i))
    return word_features_array
Example no. 8
def translate_this(jenni, msg):
    t_msg = TextBlob(msg.groups()[0])
    from_lang = t_msg.detect_language()
    if from_lang != 'en':
        translated = t_msg.translate(from_lang=from_lang, to='en')
        jenni.reply("{}".format(translated))
    else:
        return
Example no. 9
def scanForMultipleLanguages(target, words):
    langmap = makeLangPrefixMapping()
    langprefs = set()
    for word in words:
        blob = TextBlob(word)  # detect each word individually
        detect = blob.detect_language()
        if detect != langmap[target]:  # compare language codes by value, not identity
            langprefs.add(detect)
    return langprefs
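
A hypothetical call, assuming makeLangPrefixMapping() maps full language names to ISO prefixes (e.g. 'english' -> 'en'); each word costs one network round trip.

words = ["hello", "bonjour", "hola"]
print(scanForMultipleLanguages("english", words))  # e.g. {'fr', 'es'}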
Example no. 10
def find_loc(p1):
    if p1 != "None":
        for city in cities_l:
            if city in p1.lower():
                return city
        for k, v in cities_nn.iteritems():
            if k in p1.lower():
                return v
        t1 = TextBlob(p1.lower())
        if "la" in p1.lower() and t1.detect_language() == "en":
            return "los angeles"
    return "None"
Example no. 11
    def rating(self):
        if self._rating:
            return self._rating
        elif len(self.text) > 3:
            blob = TextBlob(self.text)

            try:
                if blob.detect_language() == 'en':
                    return round(min(max(blob.sentiment.polarity, -0.5), 0.5) * 4 + 3)
            except urllib.error.HTTPError:
                LOG.warning("Rating detection failed: HTTPError")
                return None
Example no. 12
    def getEngTag(self, tag):
        "Get the tag in English"
        tagName = TextBlob(tag.decode('utf-8'))
        tagName = tagName.words[0].singularize()
        
        if len(tagName) >= 3:
            lang = tagName.detect_language()

            if lang != 'en':
                tagName = tagName.translate(from_lang=lang, to='en')

        return tagName.encode('utf-8')     
Example no. 13
def handle(request):
    page = request.match_info.get('page')
    content = yield from fetch_page(page)
    text = strip(content)  # strip() is assumed to be a project helper that removes markup
    blob = TextBlob(text.decode('utf-8'))
    words = list({ w for w in blob.words if len(w) > 4})
    words.sort()
    body = { 'sentences': len(blob.sentences),
             'words': len(words),
             'language': blob.detect_language(),
             'blob': words }
    return web.Response(body=json.dumps(body).encode('utf-8'),
                        content_type="application/json; charset=utf-8")
Example no. 14
def echo(word, word_eol, userdata):
    global my_language
    try:
        original = TextBlob(word_eol[3][1:].decode("utf-8"))
        lang = original.detect_language()
        nick = word[0].split("!")[0].replace(":","")
        if lang != my_language:
            res = original.translate(from_lang=lang, to=my_language)
            if len(res) > 0:  # nested so res is only used when a translation happened
                print("\037\00312" + nick + " said: " + str(res).replace( \
                      "\n","") + " (From lang=%s)" % str(lang))
        return hexchat.EAT_NONE
    except:
        return hexchat.EAT_NONE
Example no. 15
    def parse_text_meta_data(self, html, values):
        if self.debug: print "Processing textual information - language, polarity, subjectivity.."
        body_blob = TextBlob(values["text_body"])
        title_blob = TextBlob(values["title"])
        values["language"] = body_blob.detect_language()  # requires the internet - makes use of google translate api
        values["polarity"] = body_blob.polarity
        values["subjectivity"] = body_blob.sentiment[1]
        if values["language"] != "en" and not translator:  # `translator` is assumed to be a module-level flag here
            values["translated_body"] = body_blob.translate(from_lang="es")
            values["translated_title"] = title_blob.translate(from_lang="es")
        else:
            values["translated_body"] = "none"
            values["translated_title"] = "none"
        return values
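
Since detect_language() is a remote call (see the comment above), a defensive wrapper around the detection step can keep a scrape from crashing; this is a sketch, not part of the original class.

def safe_detect(blob, default="en"):
    """Return the detected language, or a default when the Google endpoint is unreachable."""
    try:
        return blob.detect_language()
    except Exception:  # network failures surface as urllib/HTTP errors
        return default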
Example no. 16
def filter_lang(texts, lang):
    """ Keep only texts identified as written in lang

    :texts: A list of texts to process
    :lang: The language we want to retain texts for
    :returns: list of texts classified as written in lang

    """
    lang_texts = []
    for text in texts:
        if len(text) > 3:
            blob = TextBlob(text)
            if blob.detect_language() == lang:
                lang_texts.append(text)
    return lang_texts
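
A minimal usage sketch with made-up inputs; every detect_language() call goes over the network.

texts = ["Hello there, how are you?", "Bonjour tout le monde", "ok"]
print(filter_lang(texts, "en"))  # keeps only the English text; "ok" is skipped by the length guard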
Example no. 17
    def on_status(self, status):
        tweet = TextBlob(re.sub(r"http\S+", "", status.text))

        if len(tweet) < 4:
            return

        if tweet.detect_language() == 'en':
            result = str(tweet + " [" + str(tweet.polarity) + "]")

            tweet_text = str(tweet)
            if tweet.polarity >= POS_PARAM and tweet_text not in pos_tweets:
                pos_tweets.append(tweet_text)
                print("POSITIVE: " + result)
            if tweet.polarity <= NEG_PARAM and tweet_text not in neg_tweets:
                neg_tweets.append(tweet_text)
                print("NEGATIVE: " + result)
Example no. 18
    def onButtonPressed(self, button):
        textbuffer = tview_translate.get_buffer()
        start = textbuffer.get_start_iter()
        end = textbuffer.get_end_iter()
        textbuffer.delete(start, end)

        textbuffer = tview_text.get_buffer()
        start = textbuffer.get_start_iter()
        end = textbuffer.get_end_iter()

        text = u"{0}".format(textbuffer.get_text(start, end, False))
        tree_iter = comboboxtext_to.get_active_iter()
        language_to = None
        if tree_iter is not None:
            model = comboboxtext_to.get_model()
            key, language_to = model[tree_iter][:2]

        tree_iter = comboboxtext_from.get_active_iter()
        language_from = None
        if tree_iter is not None:
            model = comboboxtext_from.get_model()
            key, language_from = model[tree_iter][:2]

        if language_to is not None:
            value = r.hget(text + ":" + language_to, language_from)
            if value:
                textbuffer = tview_translate.get_buffer()
                textbuffer.set_text(value)
                return

            blob = TextBlob(text)
            if language_from == 'detect':
                language_from = blob.detect_language()

            if language_from is None:
                translate = u"{0}".format(blob.translate(to=language_to))
            else:
                translate = u"{0}".format(blob.translate(
                    from_lang=language_from, to=language_to))

            textbuffer = tview_translate.get_buffer()
            textbuffer.set_text(translate)
            if value is None:
                mapping = {language_from: translate}
                r.hmset(text + ":" + language_to, mapping)
Example no. 19
def filter_data(line):
    global relevant_terms
    try:
        tweet = json.loads(line)
        if 'delete' not in tweet.keys():
            blob = TextBlob(tweet['text'])
            if blob.detect_language() != 'pt':
                text_pt = unicode(str(blob.translate(to="pt")),'utf-8')
            else:
                text_pt = tweet['text']
            text_pt = remove_accents(text_pt)

            for term in relevant_terms:
                if term in text_pt or term in tweet['text']:
                    return True
    except:
        pass
    return False
Example no. 20
    def translate(text, from_language, to_language):
        """
            translate: Translate from/to language. Uses Google Translate.

            Params:
            - text: The text that will be translated.
            - from_language: The source language (auto-detected when empty).
            - to_language: The language to translate into.

            Results:
            - the translated text.
        """
        textBlob = TextBlob(text.decode(ENCODING, 'ignore'))
        
        if not from_language:
            from_language = textBlob.detect_language()

        return textBlob.translate(from_lang=from_language, to=to_language)
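
A hypothetical call, assuming the helper is reachable as a plain function and that the enclosing module's ENCODING constant is something like "utf-8":

result = translate("Hola mundo".encode("utf-8"), None, "en")
print(result)  # translated TextBlob; requires network access to Google Translate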
Example no. 21
	def returnEntryVersusTarget(self, datalist):
		'''Some users write in a language that is different from their target language
		   (i.e. if they are practicing a language that they didn't specify that they were
		   learning, or if they are writing an entry in their native language asking someone
		   to translate something for them). This function counts how many of these instances
		   exist in the specified dataset.'''
		t0 = time()
		prefmap = makePrefixLangMapping()
		not_orig_lang = 0
		for data in datalist:
			blob = TextBlob(data[self.ENTRY])
			entrylang = blob.detect_language()
			for d in data[self.STUDYING].split():
				if entrylang not in prefmap: continue
				if prefmap[entrylang] == d: continue
				not_orig_lang += 1
		print("Took %s seconds" % (time() - t0))
		print("Of %s entries, there are %s entries written in a different language than specified" % 
			(len(datalist), not_orig_lang))
Example no. 22
def _get_detailed_stats(no_code_text):
    """
    Returns detailed stats on text
    :param no_code_text: String to analyse
    :return: list of details
    """
    results = []
    group_by = 'Detailed Text Statistics'
    tb = TextBlob(no_code_text)
    # Spell check here...it's very slow
    results.append(TextFeature('Number of sentences', textstat.sentence_count(no_code_text), group_by))
    results.append(TextFeature('Number of sentences (again)', len(tb.sentences), group_by))
    results.append(TextFeature('Number of words', len(tb.words), group_by))
    results.append(TextFeature('Sentiment Polarity', tb.sentiment.polarity, group_by))
    results.append(TextFeature('Sentiment Subjectivity', tb.sentiment.subjectivity, group_by))
    results.append(TextFeature('Detected Language', tb.detect_language(), group_by))
    results.append(TextFeature('Number of important phrases', len(tb.noun_phrases), group_by))
    results.append(TextFeature('Number of word bi-grams', len(tb.ngrams(2)), group_by))
    results.append(TextFeature('Number of word tri-grams', len(tb.ngrams(3)), group_by))
    results.append(TextFeature('Number of word 4-grams', len(tb.ngrams(4)), group_by))
    return results
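
To exercise the helper outside its project, its dependencies can be stubbed; TextFeature as a namedtuple is an assumption about the real class, and the textstat package must be installed.

from collections import namedtuple

TextFeature = namedtuple("TextFeature", ["name", "value", "group_by"])  # hypothetical stand-in

for feature in _get_detailed_stats("The quick brown fox jumps over the lazy dog. It barked."):
    print(feature.name, "->", feature.value)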
Example no. 23
def compute_relevance(line):
    global relevant_terms
    tweet = json.loads(line)
    blob = TextBlob(tweet['text'])
    if blob.detect_language() != 'pt':
        text_pt = unicode(str(blob.translate(to="pt")),'utf-8')
    else:
        text_pt = tweet['text']
    text_pt = remove_accents(text_pt)

    frequency = 0.0 
    for term in relevant_terms:
        if term in text_pt or term in tweet['text']:
            if term == 'socialbasebr':
                frequency += 10.0
            else:
                frequency += 1.0

    # weighted relevance score
    value = 0.2*int(tweet['retweet_count']) + 0.3*int(tweet['favorite_count']) + 0.1*len(tweet['entities']['user_mentions']) + 0.4*frequency
    return (tweet['text'], value)
Example no. 24
def detectLangauge(string):
    # Note: languages can be found here:
    # https://cloud.google.com/translate/v2/using_rest#language-params
    nlp = TextBlob(unicode(string, 'utf-8'))
    return nlp.detect_language()
Example no. 25
class NLP(object):

    '''
    NLP tools 
    required : 
    corpus_path

    '''

    def __init__(self, _text, *args, **kwargs):
        # print "init NLP"

        # try: 
        #     _text.decode("utf-8")
        # except UnicodeDecodeError :
        #     print "ok"
        #     pass


        self.text = _text
        self.blob = TextBlob(_text)
        self.sentences = self.blob.sentences

    def words(self):
        return self.blob.words

    def count_words(self):
        return dict(self.blob.word_counts)

    def get_language(self):
        return self.blob.detect_language()

    def keywords(self):

        t0 = time()

        # Used when tokenizing words
        sentence_re = r'''(?x)      # set flag to allow verbose regexps
              ([A-Z])(\.[A-Z])+\.?  # abbreviations, e.g. U.S.A.
            | \w+(-\w+)*            # words with optional internal hyphens
            | \$?\d+(\.\d+)?%?      # currency and percentages, e.g. $12.40, 82%
            | \.\.\.                # ellipsis
            | [][.,;"'?():-_`]      # these are separate tokens
        '''

        toks = nltk.regexp_tokenize(self.text, sentence_re)

        postoks = nltk.tag.pos_tag(toks)
        # postoks = self.blob.pos_tags

        tree = chunker.parse(postoks)
        # print tree

        def leaves(tree):
            """Finds NP (nounphrase) leaf nodes of a chunk tree."""
            for subtree in tree.subtrees(filter=lambda t: t.node == 'NP'):
                yield subtree.leaves()

        def normalise(word):
            """Normalises words to lowercase and stems and lemmatizes it."""
            word = word.lower()
            word = stemmer.stem_word(word)
            word = lemmatizer.lemmatize(word)
            return word

        def acceptable_word(word):
            """Checks conditions for acceptable word: length, stopword."""
            accepted = bool(2 <= len(word) <= 40
                            and word.lower() not in stopwords)
            return accepted

        def get_terms(tree):
            for leaf in leaves(tree):
                # print leaf
                term = [normalise(w) for w, t in leaf if acceptable_word(w)]
                yield term

        terms = get_terms(tree)

        words = []
        for term in terms:
            for word in term:
                words.append(word)

        print "Done in %fs" % (time() - t0)
        return words

    def analyze_sentiment(self):
        sentiments = []
        for sentence in self.sentences:
            sentiments.append({
                "polarity": sentence.sentiment.polarity,
                "subjectivity": sentence.sentiment.subjectivity
            })
        return sentiments

    def get_adjectives(self):
        adj = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "JJ":
                adj.append(str(word))
        return adj

    # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

    def start_with_number(self, s):
        data = [c for c in s if c in '0123456789Xx']
        if len(data) != 0:
            return True
        else:
            return False

    def filter_out_nastyness(self):
        '''Filter out proper nouns (NNP & NNPS), numbers (CD) and symbols (SYM)'''
        
        to_filter_out = ["ed.", '"']

        for word, tag in self.blob.tags:
            if tag == "NNP"         \
                or tag == "NNPS"        \
                or tag == "CD"           \
                or tag == "SYM"         \
                or word[0:4] == "doi:" or word[0:4] == "isbn" or word[0:4] == "ISBN" \
                    or self.start_with_number(word) == True  \
                    or any(x.isupper() for x in word[2:]) == True \
                    or tag == "FW":
                to_filter_out.append(word)

        regex = re.compile('\\b(%s)\\W'%('|'.join(map(re.escape,to_filter_out))),re.UNICODE)
        clean = regex.sub(" ", self.text)
        # clean.decode("utf-8")

        clean_ok = ''.join([i for i in clean if i not in '()#'])
        clean_ok=clean_ok.replace("R.sub","")
        # clean_ok.decode("utf-8")
        return clean_ok

    def get_clean_text(self):
        txt=self.filter_out_nastyness()
        return txt

    def get_verbs(self):
        verbs = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "VB":
                verbs.append(str(word))
        return verbs

    def get_nouns(self):
        nouns = []
        for word, POStag in sorted(set(self.blob.tags)):
            if POStag == "NN":
                nouns.append(str(word))
        return nouns

    def get_noun_phrases(self):
        nouns = []
        for word in self.blob.noun_phrases:
            nouns.append(str(word))
        return nouns

    def translate_to(self, _language):
        return self.blob.translate(to=_language)
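
keywords() relies on module-level globals (chunker, stemmer, lemmatizer, stopwords) that the class never defines. A plausible setup is sketched below as an assumption about the original module; note that stem_word() and tree.node only exist in pre-3.0 NLTK.

import nltk
from nltk.corpus import stopwords as _sw

grammar = r"""
    NP: {<JJ>*<NN.*>+}   # assumed noun-phrase rule; the original grammar is not shown
"""
chunker = nltk.RegexpParser(grammar)
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
stopwords = set(_sw.words("english"))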
Example no. 26
business_info = lookup.business_dict(YELP_BIZ_FILE)

# Organize reviews into Business - User - Review(s) tree structure
n, review_tree = 0, {}
with open(YELP_DATA_FILE) as data_file: 
    for line in data_file: 
        review_data = json.loads(line)
        business_id = review_data['business_id']
        user_id = review_data['user_id']
        review_text = review_data['text'].replace('\n', ' ')
        review_rate = int(review_data['stars'])
        if business_info[business_id][0] not in US_STATES: 
            if len(review_text) < 3: 
                continue
            blob_review = TextBlob(review_text)
            if blob_review.detect_language() != 'en': 
                continue
        if n % 50000 == 0: print n
        n += 1
        if review_tree.get(business_id) is None: 
            review_tree[business_id] = {user_id: [(review_text, review_rate)]}
        elif review_tree[business_id].get(user_id) is None: 
            review_tree[business_id][user_id] = [(review_text, review_rate)]
        else: 
            review_tree[business_id][user_id].append((review_text, review_rate))

# Truncate on two conditions: 1 user-business has 5+ reviews; 1 business has 500+ reviews
def append_review(review_per_business): 
    n_per_business = 0
    for user_id, reviews in review_per_business.iteritems():
        n_per_user = min(len(reviews), REVIEW_LIMIT[0])
Example no. 27
def is_target_language(text):
    from textblob import TextBlob
    blob = TextBlob(text)
    if len(text) >= 3 and blob.detect_language() in TARGET_LANGUAGE:
        return True
    return False
Example no. 28
def findlanguage(inputs):
    # Uses Textblob to find the language of the inputted text
    words = TextBlob(inputs)
    #print("Detected Language : ", words.detect_language())

    return words.detect_language()
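
A usage sketch for the wrapper above; detect_language() goes over the network.

print(findlanguage("Guten Morgen, wie geht es dir?"))  # expected: 'de'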
Example no. 29
import speech_recognition as sr
from textblob import TextBlob

r = sr.Recognizer()
mic = sr.Microphone()

with mic as source:
    r.adjust_for_ambient_noise(source)
    print('Recording... Please speak now.')
    audio = r.listen(source)

trans = r.recognize_google(audio, language='es-ES')
print(trans)

blob2 = TextBlob(trans)
lang = blob2.detect_language()

newline = '\n'
print(f'Detected language: {lang}. {newline}Getting sentiment polarity...')

if lang == 'en':
    blob2_ready = blob2
else:
    blob2_ready = blob2.translate(to='en')

sentiment = blob2_ready.sentiment.polarity

print(f'{newline}Sentiment polarity: {sentiment}. {newline}This means:')

if sentiment == 0:
    print('Customer was neutral.')
Example no. 30
# Authentification
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

# Get API
api = tweepy.API(auth)

# Get user
user = api.get_user(args.user)

# Get statuses
for status in api.user_timeline(screen_name=args.user, count=200):
    # Analyse tweet
    tweet = TextBlob(status.text)

    # Show sentiment analysis
    print(u"Tweet \"{}\"".format(status.text))
    print(u"Polarity {}, Subjectivity {}".format(tweet.sentiment.polarity,
                                                 tweet.sentiment.subjectivity))
    print(u"Language : {}".format(tweet.detect_language()))
    try:
        print(u"French : {}".format(tweet.translate(from_lang="en-US",
                                                    to='fr')))
    except textblob.exceptions.NotTranslated:
        pass
    # end try
    print(u"Tokens : {}".format(tweet.words))
    print(u"")
# end for
Example no. 31
Lapochkin D. 40%
"""

from textblob import TextBlob

text = input('Введите текст: ')  # prompt: "Enter text: "
blob = TextBlob(text)
syllables = 0
sentence = 0
fre = 0

for b in ['.', '!', '?']:
    if text.count(b) > 0:
        sentence += text.count(b)

if blob.detect_language() == 'ru':
    syllables = sum(1 for x in text.lower() if x in 'уеоаыяиюэ')
else:
    syllables = sum(1 for x in text.lower() if x in 'aeiouy')
asl = (text.count(' ') + 1) / sentence
asw = syllables / (text.count(' ') + 1)

if blob.detect_language() == 'ru':
    fre = 206.835 - (1.3 * asl) - (60.1 * asw)
else:
    fre = 206.835 - (1.015 * asl) - (84.6 * asw)

if blob.detect_language() == 'ru':
    blob = blob.translate(to="en")
if blob.sentiment.polarity > 0.33:
    tonality = 'положительный'  # Russian for 'positive'
Example no. 32
def main():
    """Text Analysis App """

    st.title("Language Detector & Translator")

    image = Image.open("people_speaking.jpg")
    st.sidebar.image(image,
                     caption="Different languages",
                     use_column_width=True)

    activities = ["Detector & Translator", "About"]
    choice = st.sidebar.selectbox("Menu", activities)

    if choice == 'Detector & Translator':
        st.subheader("Text Area")
        lista_modos = ("For 23 languages", "For selected languages")
        modo = st.sidebar.radio("Choose", lista_modos)
        texto_default = 'Text'
        raw_text = st.text_area("Copy&Paste -> Ctrl+Enter", texto_default)
        blob = TextBlob(raw_text)

        # Audioplay
        #if st.button("Audio"):
        #    play(raw_text)

        if modo == "For selected languages":
            #texto_default = 'Texto'
            #raw_text = st.text_area("Copy&Paste -> Ctrl+Enter",texto_default)
            #blob = TextBlob(raw_text)
            try:

                if raw_text.strip() == "":
                    st.error("Please write something in the text area")
                elif raw_text != texto_default:
                    dict_idioma_full = lista_idiomas_full()

                    idioma_original = get_value(blob.detect_language(),
                                                dict_idioma_full)

                    original_key = get_key(idioma_original, dict_idioma_full)

                    st.success("Original Language" + ":  " + idioma_original +
                               " (" + original_key + ")")

                    dict_idioma = lista_idiomas(idioma_original)
                    options = st.multiselect("Choose a language",
                                             tuple(dict_idioma.values()))

                    idioma_final = get_key(idioma_original, dict_idioma)

                    #st.write("Original language:",idioma_original)
                    for i in range(len(options)):
                        value = options[i]
                        idioma_final = get_key(value, dict_idioma)
                        if (idioma_original != idioma_final):
                            texto_convertido = blob.translate(to=idioma_final)
                            st.success("Language" + ": " + value + " (" +
                                       idioma_final + ")")
                            st.text(texto_convertido)
                            #play(texto_convertido,idioma_final)

            except:
                st.error(
                    "ERROR: text must be at least 3 letters and the word must exist in the formal language"
                )

        else:
            try:
                flag = False
                if raw_text.strip() == "":
                    st.error("Please write something in the text area")
                elif raw_text != texto_default:
                    dict_idioma_full = lista_idiomas_full()

                    idioma_original = get_value(blob.detect_language(),
                                                dict_idioma_full)
                    original_key = get_key(idioma_original, dict_idioma_full)
                    st.success("Original Language" + ":  " + idioma_original +
                               " (" + original_key + ")")

                    dict_idioma = lista_idiomas(idioma_original)
                    options = dict_idioma.values()

                    st.write("Original Language:", idioma_original)
                    idioma_lista = list(options)

                    for i in range(len(idioma_lista)):
                        value = idioma_lista[i]
                        #st.text(value)
                        idioma_final = get_key(value, dict_idioma)
                        if (idioma_original != idioma_final):
                            texto_convertido = blob.translate(to=idioma_final)
                            st.success("Language" + ": " + value + " (" +
                                       idioma_final + ")")
                            st.text(texto_convertido)
                            flag = True

            except:
                if not flag:
                    st.error(
                        "ERROR: text must be at least 3 letters and the word must exist in the formal language"
                    )

    elif choice == 'About':
        st.subheader("I hope you enjoy it and use to learn something")
        st.subheader("Built with Streamlit and Textblob")
        #st.write("Problems:")
        #st.write(" - sometimes the original language can't be correctly detected")
        #st.write(" - sometimes the sound will fail.")
        st.subheader("by Silvio Lima")

        if st.button("Linkedin"):
            js = "window.open('https://www.linkedin.com/in/silviocesarlima/')"
            html = '<img src onerror="{}">'.format(js)
            div = Div(text=html)
            st.bokeh_chart(div)

    else:
        # Audioplay
        st.subheader("Text Area")
        texto_default = 'Text'
        raw_text = st.text_area("Copy&Paste -> Ctrl+Enter", texto_default)
        blob = TextBlob(raw_text)
        try:
            if raw_text == texto_default or raw_text.strip() == "":
                st.error("Please write something in the text area")
            else:
                dict_idioma_full = lista_idiomas_full()
                idioma_original = get_value(blob.detect_language(),
                                            dict_idioma_full)
                original_key = get_key(idioma_original, dict_idioma_full)

                st.success("Original Language" + ":  " + idioma_original +
                           " (" + original_key + ")")
                play(raw_text, original_key)

                dict_idioma = lista_idiomas(idioma_original)
                options = st.multiselect("Choose a language",
                                         tuple(dict_idioma.values()))

                for i in range(len(options)):
                    value = options[i]
                    idioma_final_key = get_key(value, dict_idioma)
                    try:
                        if (idioma_original != idioma_final_key):
                            texto_convertido = str(
                                blob.translate(to=idioma_final_key))
                            st.success("Language" + ": " + value + " (" +
                                       idioma_final_key + ")")
                            st.text(texto_convertido)
                            play(texto_convertido, idioma_final_key)

                    except:
                        st.error(
                            "ERROR: some languages will fail to play the sound."
                        )
        except:
            st.error(
                "ERROR: text must be at least 3 letters and the word must exist in the formal language"
            )
Example no. 33
from textblob import TextBlob

text = input("Enter text here=> ")
obj = TextBlob(text)

print("Detecting language\n", obj.detect_language())

print("Translate to")
print("1. bengali\t 2.chinese \t 3. german\t 4. gujrati\t 5. japanese:")
to = int(input("Enter your choice=> "))
if to < 1 or to > 5:  # reject out-of-range menu choices
    print("Wrong choice")
    exit()
elif to == 1:
    to = 'bn'
elif to == 2:
    to = 'zh'
elif to == 3:
    to = 'de'
elif to == 4:
    to = 'gu'
else:
    to = 'ja'  # ISO code for Japanese

print(obj.translate(to=to))
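
The if/elif ladder above can also be written as a lookup table; a sketch, assuming `to` still holds the integer menu choice and using 'ja' for Japanese:

LANG_CODES = {1: 'bn', 2: 'zh', 3: 'de', 4: 'gu', 5: 'ja'}
code = LANG_CODES.get(to)
if code is None:
    print("Wrong choice")
else:
    print(obj.translate(to=code))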
Example no. 34
from textblob import TextBlob, Word

sent = TextBlob("I haawve goood speling")
correct_sent = sent.correct()

w = Word("haave")
spellcheck = w.spellcheck()

#Get Word and Noun Phrase Frequencies
words = TextBlob('We are no longer together. We are enemies now.')
word_counts = words.word_counts
#You can specify whether or not the search should be case-sensitive (default is False).

#Translation and Language Detection
en_blob = TextBlob("You are my best friend")
pl_blob = en_blob.translate(to='pl')

blob = TextBlob("Mam na imię Piotr")
detected_lang = blob.detect_language()

#Parsing
text = TextBlob('I know You')
text_parse = text.parse()

#string
text = TextBlob("Hello World")
upper_text = text.upper()
find_world = text.find("World")

#ngrams
blob = TextBlob("Now is better than never.")
ngram = blob.ngrams(n=3)
  
Example no. 35
public_tweets = api.search(key_word)

for tweet in public_tweets:  
	print("\n\n")  
	tweet_text = clean_tweet(tweet.text)  

	tType = tweetType(tweet_text)  

	if(tType == 'Retweet'):  
		tweet_text = tweet_text.replace("RT ","")  


    
	analysis = TextBlob(tweet_text)  
	if(analysis.detect_language() != 'en' and len(tweet_text) > 3):  
		lang = analysis.detect_language()  
		try:  
			analysis = analysis.translate(from_lang=lang, to='en')  # translate() returns a new blob
		except:  
			pass  

	            
	print("Tweet : ",tweet_text)  
	print()  
	print("Result of sentiment analysis : ",end="")  
	if(analysis.sentiment.polarity  > 0):  
		print("Happy")  

	elif(analysis.sentiment.polarity == 0):  
		print("Neutral")  
Example no. 36
import pandas as pd
df = pd.read_csv("liveChatData.csv")

df.describe()

## Top 50 Most active users
top = df.Author.value_counts().head(50)

a = df[df.Author == "trevor wasike"].reset_index(drop=True)["Message"]

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=20, stop_words="english")
X = vectorizer.fit_transform(a)
print(vectorizer.get_feature_names())

from textblob import TextBlob
from textblob.exceptions import TranslatorError

t1 = TextBlob(a[1])

t1.detect_language()

t1.translate(to="en")

from langdetect import detect
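
The TranslatorError import above goes unused in the visible cell; it is typically wrapped around translate(), as in this sketch:

try:
    print(t1.translate(to="en"))
except TranslatorError:
    pass  # raised when the text cannot be translated, e.g. it is already English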
Example no. 37
def call_from_here(text):

    resp = client.message(text)
    # print(resp)
    # print('entity value is {}'.format(list(resp['entities'])[0]))
    entity = None
    value = None
    timeit = 0
    subject = ''
    try:
        entity = list(resp['entities'])[0]
        value = resp['entities'][entity][0]['value']
        # print(entity,value)
        # time.sleep(.10)
        if entity != None and value != None:
            if entity == 'niku':
                # print('in niku')
                if value == 'read':
                    print('reading')
                    data = startread()
                    speakit(data)
            if entity == 'translate':
                # print(entity,value)

                if value == 'tamil':
                    data = startread()
                    if data == '':
                        os.system('aplay notification_sound/error.wav')
                        speakit('sorry, no data selected')
                        print('\n\n sorry, no data selected')
                    else:
                        speakit('trying to translate selected text in tamil')
                        translate = TextBlob(data)
                        try:
                            translate_lang = translate.detect_language()
                            # print('detected language is ', translate_lang)
                            if translate_lang == 'ta':
                                speakit(
                                    'the selected text is already in tamil'
                                )
                            else:
                                speakit('translating data from {}'.format(
                                    translate_lang))
                                translated_data = translate.translate(
                                    from_lang=translate_lang, to='ta')
                                translated_data = str(translated_data)
                                # print(translated_data)
                                with open('translated.txt',
                                          'w') as result_file:
                                    result_file.write(translated_data)
                                    os.system(
                                        'aplay notification_sound/translating.wav'
                                    )
                                    speakit(
                                        'the data was translated successfully')
                                    print(
                                        '\n\n the data was translated successfully and saved in translated.txt \n\n'
                                    )
                            # niku.trans_text.setText(translated_data)
                        except Exception as e:
                            os.system('aplay notification_sound/error.wav')
                            speakit(
                                'sorry, some error happened while translating')
                            print(
                                'sorry, some error happened while translating; the error is',
                                e)

            elif entity == 'datetime' or entity == 'subject':
                timeit, subject = set_reminder(resp)

            elif entity == 'open':
                open_software(value)
            elif entity == 'show':
                window_movement(value)

            elif entity == 'weather':
                data = weather_api(value)
                speakit('the weather in {} is {}'.format(value, data))
                print('the weather in {} is {}'.format(value, data))
            elif entity == 'search_niku':
                ddgs(value)

                #
            # check_reminder(timeit,subject)

        # else:
        #     print(text)
        #     call_correct(text)

    except IndexError:
        # print(resp)
        # print(text)
        text = text.lower()
        call_correct(text)
Example no. 38
# -*- coding: utf-8 -*-
from textblob import TextBlob

word1 = TextBlob("thank you for using this")
lang = word1.detect_language()
z = word1.translate(from_lang='en', to='hi')
print(lang)
print(z)

# first install textblob library for using this
# pip install textblob
# encoding format should be exact and placed at beginning of file
Example no. 39
def detect_language(text):
    blob = TextBlob(text)  # local name that does not shadow the input() builtin
    language = blob.detect_language()
    return language
Example no. 40
import requests
import json
import textblob
from textblob import TextBlob

cidade = input('Informe sua cidade: ')  # prompt: "Enter your city: "

req = requests.get('http://api.openweathermap.org/data/2.5/weather?q=' +
                   cidade + '&APPID=574708452380626a25e411bfeab9dd7a')

#print(req.text)

# Parse the response into a Python dictionary
tempo = json.loads(req.text)

# Translate the returned weather condition
condicao_us = TextBlob(tempo['weather'][0]['main'])
condicao_us.detect_language()  # detection result is not used
condicao_traduzida = condicao_us.translate(to="pt_br")

print('Condição do tempo:', condicao_traduzida)  # "Weather condition:"
# Convert from Kelvin to Celsius
print('Temperatura: ', float(tempo['main']['temp']) - 273.15, '°C')
Example no. 41
india_trends = api.trends_place(2282863, )[0]['trends'][:25]

text_list = []
hashtags_list = []
message = ''

try:
    for i in india_trends:
        hashtag = str(i['name'])
        if i['name'][0] == '#':
            temp = str(i['name'])[1:]
        else:
            temp = str(i['name'])
        lang = TextBlob(temp)
        if lang.detect_language() == 'en':
            driver.get(
                "https://news.google.com/topstories?hl=en-IN&gl=IN&ceid=IN:en")
            time.sleep(5)
            google_news = driver.find_element_by_xpath("//a[@title='News']")
            action = webdriver.common.action_chains.ActionChains(driver)
            action.move_to_element_with_offset(google_news, 500, 0)
            action.click()
            action.send_keys(temp)
            action.send_keys(Keys.ENTER)
            action.perform()
            time.sleep(5)
            bodyText = driver.find_element_by_tag_name("body").text
            if 'No results found.' not in bodyText:
                try:
                    link_element = driver.find_element_by_xpath(
Example no. 42
def main():
    
    """Ouça e Fale App """
    
    st.title("Reader & Voice")
   
    activities = ["Home","PDF","TXT","About"]
    choice = st.sidebar.radio("Home",activities)

    if choice == 'Home':
        st.write("Only files:")
        st.markdown("### PDF or TXT")
        st.write("After uploading you can convert to 7 languages")
        st.markdown("### English, Spanish, French, Italian, Japanese, Russian  and Chinese")

        #st.write("Definitions")
        #st.write("PCA is not a statistical method to infer parameters or test hypotheses. Instead, it provides a method to reduce a complex dataset to lower dimension to reveal sometimes hidden, simplified structure that often underlie it.")
        #st.write("")
        #st.write("PCA is a statistical method routinely used to analyze interrelationships among large numbers of objects.")
        #st.write("")
        #st.write("Principal component analysis (PCA) is a mathematical algorithm that reduces the dimensionality of the data while retaining most of the variation in the data set.")
        
    if choice == 'PDF':
        
        file = carregar_texto('pdf')
        pdf = pdftotext.PDF(file)
            #for page in pdf:
            #    st.text(page)
            
        blob = TextBlob(pdf[0])
        st.text(blob)
        st.write(blob.detect_language())

        #dict_idioma_full = lista_idiomas_full()
        #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
            #original_key = get_key(idioma_original, dict_idioma_full)
                    
            #st.success("Original Language"+":  "+ idioma_original + " ("+original_key+")")

            # Original sound
            #play(raw_text,original_key)
                    
                              
            #dict_idioma = lista_idiomas(idioma_original)
            #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
                                      
                    

            #for i in range(len(options)):
            #    value = options[i]
            #    idioma_final_key = get_key(value, dict_idioma)
            #    try:
            #        if (idioma_original != idioma_final_key):
            #            texto_convertido = str(blob.translate(to=idioma_final_key))
            #            st.success("Language"+": "+ value + " ("+idioma_final_key+")")
            #            st.write(texto_convertido)
            #            #st.text(idioma_final_key)
            #            play(texto_convertido,idioma_final_key)
            #            
            #    except:
            #        st.error("ERROR: some languages will fail to play the sound.")

            #dict_idioma_full = lista_idiomas_full()
            #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
            #original_key = get_key(idioma_original, dict_idioma_full)
                    
            #st.success("Original Language"+":  "+ idioma_original + " ("+original_key+")")

            # Original sound
            #play(blob,original_key)
            #convert(blob)
        #except:
        #    st.warning("PDF please")

      
    if choice == 'TXT':
        try:
            file = carregar_texto('txt')
            blob= TextBlob(file.getvalue())
            st.markdown(blob)
            #dict_idioma_full = lista_idiomas_full()
            #idioma_original = get_value(blob.detect_language(),dict_idioma_full)
            #original_key = get_key(idioma_original, dict_idioma_full)
                    
            #st.success("Original Language"+":  "+ idioma_original + " ("+original_key+")")
            # Original sound
            #play(file.getvalue(),original_key)

            #st.write(blob.detect_language())
            #st.subheader(blob)
            convert(file, blob)
                          
                              
            #dict_idioma = lista_idiomas(idioma_original)
            #options = st.multiselect("Choose a language", tuple(dict_idioma.values()))
                                      
                    

            #for i in range(len(options)):
            #    value = options[i]
            #    idioma_final_key = get_key(value, dict_idioma)
            #    try:
            #        if (idioma_original != idioma_final_key):
            #            texto_convertido = str(blob.translate(to=idioma_final_key))
            #            st.success("Language"+": "+ value + " ("+idioma_final_key+")")
            #            st.write(texto_convertido)
            #            #st.text(idioma_final_key)
            #            play(texto_convertido,idioma_final_key)
            #            
            #    except:
            #        st.error("ERROR: some languages will fail to play the sound.")

        except:
            st.warning("TXT please")
Example no. 43
    def recognize_languages(self, files):
        for i in range(len(files)):
            blob = TextBlob(files[i])
            self.languages[blob.detect_language()] += [i]  # assumes self.languages maps language codes to lists of file indices
Example no. 44
!python -m textblob.download_corpora

from textblob import TextBlob
#import nltk
#nltk.download()


tx = df.loc[0,'full_text']
blob = TextBlob(tx)
blob.tags
blob.sentences[0].words
blob.noun_phrases
blob.ngrams(3)
blob.correct()
blob.words[3].spellcheck()
blob.detect_language()
blob.translate(to='ar')

verbs = list()
for word, tag in blob.tags:
  if tag == 'VB':
    verbs.append(word.lemmatize())

nouns = list()
for word, tag in blob.tags:
	if tag == 'NN':
		nouns.append(word.lemmatize())

blob.sentiment.polarity
blob.sentiment.subjectivity
Example no. 45
@author: akansal2
"""
#importing libraies
from textblob import TextBlob


#TextBlob Strings
Str1 = TextBlob('Amazing')
Str2 = TextBlob('Spider Man')

#Textblob string operations
Str1.lower()
Str1.upper()
Str1[1:4]
Str1 + " " + Str2
Str1.detect_language()



#Paragraph and sentence operations
para = TextBlob("My name is aditya. \n I live in Modinagar.\n My apples id is [email protected]")
para.sentences  # distinguish sentences with combination of . and \n
para.sentences[0]
para.sentences[1]
para.sentences[2]
para.sentences[0].words
for n in para.sentences[1].noun_phrases:
    print(n)
for t in para.sentences[1].tags:
    print(t)
    
Example no. 46
def main():
    """NLP App with Streamlit and TextBlob"""

    #st.title("NLP Simple Examples")

    title_templ = """
    <div style="background-color:blue;padding:8px;">
    <h1 style="color:cyan">NLP Simple Examples</h1>
    </div>
    """

    st.markdown(title_templ,unsafe_allow_html=True)

    subheader_templ = """
    <div style="background-color:cyan;padding:8px;">
    <h3 style="color:blue">Natural Language Processing On the Go...</h3>
    </div>
    """

    st.markdown(subheader_templ,unsafe_allow_html=True)

    st.sidebar.image("https://www.centreofexcellence.com/app/uploads/2016/09/nlp-diploma-course.jpg", use_column_width=True)

    activity = ["Text Analysis", "Translation", "Sentiment Analysis", "About"]
    choice = st.sidebar.selectbox("Menu",activity)



    # Text Analysis CHOICE
    if choice == 'Text Analysis':

        st.subheader("Text Analysis")
        st.write("")
        st.write("")

        raw_text = st.text_area("Write something","Enter a Text in English...",height=250)

        if st.button("Analyze"):
            if len(raw_text) == 0:
                st.warning("Enter a Text...")
            else:
                blob = TextBlob(raw_text)
                st.write("")

                if blob.detect_language() != 'en':
                    st.warning("Enter a Text in English...")
                else:
                    st.info("Basic Functions")
                    col1, col2 = st.beta_columns(2)

                    with col1:
                        with st.beta_expander("Basic Info"):
                            st.success("Text Stats")
                            word_desc = nt.TextFrame(raw_text).word_stats()
                            result_desc = {"Length of Text": word_desc['Length of Text'],
                                           "Num of Vowels": word_desc['Num of Vowels'],
                                           "Num of Consonants": word_desc['Num of Consonants'],
                                           "Num of Stopwords": word_desc['Num of Stopwords']}
                            st.write(result_desc)

                        with st.beta_expander("Stopwords"):
                            st.success("Stop Words List")
                            stop_w = nt.TextExtractor(raw_text).extract_stopwords()
                            st.error(stop_w)

                    with col2:
                        with st.beta_expander("Processed Text"):
                            st.success("Stopwords Excluded Text")
                            processed_text = str(nt.TextFrame(raw_text).remove_stopwords())
                            st.write(processed_text)

                        with st.beta_expander("Plot Wordcloud"):
                            st.success("Wordcloud")
                            plot_wordcloud(raw_text)

                    st.write("")
                    st.write("")
                    st.info("Advanced Features")
                    col3, col4 = st.beta_columns(2)

                    with col3:
                        with st.beta_expander("Tokens&Lemmas"):
                            st.write("T&L")
                            processed_text_mid = str(nt.TextFrame(raw_text).remove_stopwords())
                            processed_text_mid = str(nt.TextFrame(processed_text_mid).remove_puncts())
                            processed_text_fin = str(nt.TextFrame(processed_text_mid).remove_special_characters())
                            tandl = text_analyzer(processed_text_fin)
                            st.json(tandl)

                    with col4:
                        with st.beta_expander("Summarize"):
                            st.success("Summarize")
                            summary_text = summarize(raw_text, ratio=0.4)
                            if summary_text != "":
                                st.success(summary_text)
                            else:
                                st.warning("Please insert a Longer Text")


        


    # Translation CHOICE
    elif choice == 'Translation':

        st.subheader("Text Translation")

        st.write("")
        st.write("")
        raw_text = st.text_area("","Write something to be translated...")
        if len(raw_text) < 3:
            st.warning("Please provide a string with at least 3 characters...")
        else:
            blob = TextBlob(raw_text)
            lang = blob.detect_language()
            #st.write(lang)
            tran_options = st.selectbox("Select translation language",['Chinese', 'English', 'German', 'Italian', 'Russian', 'Spanish'])
            if st.button("Translate"):
                if tran_options == 'Italian' and lang != 'it':
                    st.text("Translating to Italian...")
                    tran_result = blob.translate(from_lang=lang, to='it')
                elif tran_options == 'Spanish' and lang != 'es':
                    st.text("Translating to Spanish...")
                    tran_result = blob.translate(from_lang=lang, to='es')
                elif tran_options == 'Chinese' and lang != 'zh-CN':
                    st.text("Translating to Chinese...")
                    tran_result = blob.translate(from_lang=lang, to='zh-CN')
                elif tran_options == 'Russian' and lang != 'ru':
                    st.text("Translating to Russian...")
                    tran_result = blob.translate(from_lang=lang, to='ru')
                elif tran_options == 'German' and lang != 'de':
                    st.text("Translating to German...")
                    tran_result = blob.translate(from_lang=lang, to='de')
                elif tran_options == 'English' and lang != 'en':
                    st.text("Translating to English...")
                    tran_result = blob.translate(from_lang=lang, to='en')
                else:
                    tran_result = "Text is already in " + "'" + lang + "'"


                st.success(tran_result)
            
        
    

    # Sentiment Analysis CHOICE
    elif choice == 'Sentiment Analysis':
        
        st.subheader("Sentiment Analysis")

        st.write("")
        st.write("")

        raw_text = st.text_area("", "Enter a Text...")

        if st.button("Evaluate"):
            if len(raw_text) == 0:
                st.warning("Enter a Text...")
            else:
                blob = TextBlob(raw_text)
                lang = blob.detect_language()

                if lang != 'en':
                    tran_result = blob.translate(from_lang=lang, to='en')
                    blob = TextBlob(str(tran_result))

                result_sentiment = blob.sentiment
                st.info("Sentiment Polarity: {}".format(result_sentiment.polarity))
                st.info("Sentiment Subjectivity: {}".format(result_sentiment.subjectivity))

        



    # About CHOICE
    else:  # choice == 'About'
        st.subheader("About")

        st.write("")
        st.write("")

        st.markdown("""
        ### NLP Simple Examples (App with Streamlit and TextBlob)
        
        ##### By
        + **[Rosario Moscato LAB](https://www.youtube.com/channel/UCDn-FahQNJQOekLrOcR7-7Q)**
        + [[email protected]](mailto:[email protected])
        """)
Esempio n. 47
0
    def instaBot(self, c_id, c_secret, file, duration):
        "Retrieve the Instagram posts and analyze them"
        api = InstagramAPI(client_id=c_id, client_secret=c_secret)
        
        posts, next = api.tag_recent_media(tag_name='food', count=30)
        temp, max_tag = next.split('max_tag_id=')
        max_tag = str(max_tag)

        stop = time.time() + duration * 60

        while time.time() < stop:
            print "[*] " + str(len(posts)) + " posts retrieved."
            for post in posts:
                if self.isNewPost("log/posts.log", post.id):
                    count = 0
                    langs = {}
                    print post.id
                    
                    for tag in post.tags:                       
                        tagName = TextBlob(tag.name)
                        tagName = tagName.words[0].singularize()
                        
                        if len(tagName) >= 3 and tagName != 'food':
                            try:
                                lang = tagName.detect_language()
                            except:
                                print "[-] Fail to detect the language."
                                continue
                            
                            print "[*] " + tagName, '->', lang
                            langs.setdefault(lang, 0)
                            langs[lang] += 1
                            
                            if lang != 'en':
                                try:
                                    tagName = tagName.translate(from_lang=lang, to='en')
                                except:
                                    print "[-] Fail to translate the tag."
                                    continue
                                print "[*] Traduction: ", tagName

                            tagRelatedToFood = self.isTagRelatedToFood(tagName)
                            if tagRelatedToFood:
                                count += 1
                                print "[+] Tag related to food."
                            elif tagRelatedToFood == False:
                                print "[-] Tag not related to food."
                            else: # tagRelatedToFood == None
                                if self.isRelatedTo(tagName, self.foodWords):
                                    count += 1
                                    self.updateTags(self.foodTagsFile, tagName)
                                    self.writeTagLog("log/newTags.log", tag, True)
                                    print "[+] Tag related to food."
                                else:
                                    self.updateTags(self.noFoodTagsFile, tagName)
                                    self.writeTagLog("log/newTags.log", tag, False)
                                    print "[-] Tag not related to food."
                    if count > 0:
                        self.savePost(file, post)
                        self.writePostLog("log/posts.log", post, langs, True)
                        print "[+] Post saved."
                    else:
                        self.writePostLog("log/posts.log", post, langs, False)
                        print "[-] Post forget."
                    print '-------------------'
                
            posts, next = api.tag_recent_media(tag_name='food', max_tag_id=max_tag)
            if not next:  # check before splitting, otherwise an empty page token crashes the loop
                break
            temp, max_tag = next.split('max_tag_id=')
            max_tag = str(max_tag)
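
instaBot leans on helper methods that are not shown (isNewPost, isTagRelatedToFood, savePost, the log writers). A minimal guess at isNewPost, assuming posts.log stores one post id at the start of each line (a hypothetical format):

    def isNewPost(self, log_path, post_id):
        "Return True if post_id has not been logged yet (sketch; log format assumed)"
        try:
            with open(log_path) as log_file:
                seen = {line.split()[0] for line in log_file if line.strip()}
        except IOError:
            return True  # no log file yet, so every post counts as new
        return str(post_id) not in seen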
Esempio n. 48
0
def simple_identification():
    client_from = MongoClient()
    db_from = client_from["SSD"]
    coll_from = db_from["raw_data"]
    start_time = time()
    date = datetime.today().strftime("%Y_%m_%d-%H_%M_%S")
    path_to_file = date + " - DetectEmojisWithSpacymoji_Performance.txt"
    p_file = codecs.open(path_to_file, encoding='utf-8', mode='a')
    p_file.write(date +
                 " Detecting Emojis with Spacymoji Test - Local Execution" +
                 "\n")
    p_file.flush()
    # 2. Prepare data
    p_file.write("Preparing initial data ... " + "\n")
    path_to_configuration = food_detection_root.ROOT_DIR + os.path.sep + 'configuration' + os.path.sep \
                            + 'configuration.ini'
    config = ConfigParser(interpolation=ExtendedInterpolation())
    config.read_file(codecs.open(path_to_configuration, "r", "utf8"))
    path = food_detection_root.ROOT_DIR + os.path.sep + 'data' + os.path.sep
    # Read complementary characters
    complementary_characters_list_file = codecs.open(
        path + "list - complementary_characters.txt", encoding='utf-8')
    complementary_characters_list = complementary_characters_list_file.read().splitlines()
    complementary_characters_list_file.close()
    complementary_characters_dict = {}
    for aux in complementary_characters_list:
        aux_char = aux.split('\t')
        complementary_characters_dict[aux_char[2]] = [aux_char[1], aux_char[3]]
    # print(complementary_characters_dict)
    # 3. Configure Spanish POS tagger
    spanish_pipeline = spacy.load('es')
    emoji = Emoji(spanish_pipeline)
    spanish_pipeline.add_pipe(emoji, first=True)
    tag_map = spacy.es.TAG_MAP
    # start
    all_from_tweets = coll_from.find()
    count = 0
    stop = 1000
    p_file.write("Total data to process: " + str(stop) + "\n")
    for raw_data in all_from_tweets:
        if 'text' in raw_data.keys() and 'lang' in raw_data.keys():
            if "place" in raw_data.keys():
                place = raw_data["place"]
                if place is not None:
                    if "country_code" in place.keys():
                        raw_data_country_code = raw_data["place"][
                            "country_code"]
                        if raw_data_country_code in ["CO"]:
                            lang = raw_data["lang"]
                            text = raw_data['text']
                            if lang == 'es':
                                identify_special_characters(
                                    text, spanish_pipeline, tag_map, p_file)
                                count += 1
                            else:
                                if len(text) >= 3:
                                    blob = TextBlob(text)
                                    detected_language = ''
                                    for _ in range(3):  # bounded retries: a persistent failure must not loop forever
                                        try:
                                            detected_language = blob.detect_language()
                                            break
                                        except Exception:
                                            print('error while getting detected language')
                                    if detected_language == 'es':
                                        identify_special_characters(
                                            text, spanish_pipeline, tag_map,
                                            p_file)
                                        count += 1
                            print(count)
                            if count == stop:
                                break
    all_from_tweets.close()
    client_from.close()
    p_file.write("Total elements in new list: " + str(count) + "\n")
    execution_time = time() - start_time
    p_file.write("Execution time: " + str(timedelta(seconds=execution_time)) +
                 "\n")
    p_file.flush()
    p_file.close()
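
For reference, this is what the spacymoji component wired in above exposes, as a sketch for the spaCy 2.x / spacymoji generation this snippet targets (the Spanish model must be installed, and the printed values are approximate):

import spacy
from spacymoji import Emoji

nlp = spacy.load('es')
nlp.add_pipe(Emoji(nlp), first=True)
doc = nlp(u'Me encanta la pizza 🍕')
print(doc._.has_emoji)  # True
print(doc._.emoji)      # [('🍕', 4, 'pizza')] as (emoji, token index, description)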
Esempio n. 49
0
# assumed imports for this fragment (text and textLower are defined earlier in the original program, not shown)
import os
import wave
from gtts import gTTS
from googletrans import Translator
from textblob import TextBlob

os.system('say "{}"'.format(textLower))

wav = gTTS(text=textLower, lang='en')

wav.save("KYC.wav")

translator = Translator()

print(translator.translate(text, dest='zh-CN').text)
'''
“了解您的客户”表格是投资行业的标准表格,可确保投资顾问了解客户的风险承受能力,投资知识和财务状况的详细信息。 KYC表格保护客户和投资顾问。
'''

text = TextBlob(text)

print(text.detect_language())

print(text.sentiment)

print(text.translate(to='ja'))
'''
en
Sentiment(polarity=0.13333333333333333, subjectivity=0.25)
「あなたのクライアントを知る」フォームは、投資顧問会社が顧客のリスク許容度、投資知識および財政状態に関する詳細な情報を確実に把握する、投資業界の標準的なフォームです。 KYCフォームは、クライアントと投資顧問の両方を保護します。
'''

# Convert to WAV or FLAC with a format check, then load and recognize it for compatibility

try:
    wav = wave.open("KYC.wav", 'r')
except wave.Error as error:
    # the source cuts the snippet off here; note gTTS writes MP3 data even when
    # saved with a .wav extension, so this open can legitimately fail
    print(error)
Esempio n. 50
0
from textblob import TextBlob, Word

sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words[2].singularize()) #similarly you can use pluralize
print()

#word lemmatization
w = Word("octopi")
print("octopi -> ",w.lemmatize())
w = Word("went")
print("went -> ",w.lemmatize("v"))
print()

#definition
print("Octopus : ",Word("octopus").definitions)
print()

#translation and language detection
en_blob = TextBlob(u'Simple is better than complex.')
print('Simple is better than complex.')
print("SPANISH : ",en_blob.translate(to='es'))
en_blob = TextBlob(u'Comment allez vous?')
print('Comment allez vous?')
print("language : ",en_blob.detect_language())
print()

#spell-check
w = Word("banama")
print("banama")
print("correction : ",w.correct())
print("suggestions : ",w.spellcheck())
print()
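As the comment on the singularize example notes, pluralize() works the same way; a quick sketch:

from textblob import Word

print(Word("space").pluralize())  # spaces
print(Word("word").pluralize())   # words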
Esempio n. 51
0
            adj_ctr += 1
            adj.append(pair[0])
        elif tag == 'NN' or tag == 'NNS':
            noun_ctr += 1
            noun.append(pair[0])

print('\nTotal number of adjectives in tweet collection= ', adj_ctr)
print("List of adjectives in tweet collection: ", adj)
print('\nTotal number of nouns in tweet collection= ', noun_ctr)
print("List of nouns in tweet collection: ", noun)

#working on string entered manually
st = "I amm an ostrich and nobody can see me, not even I myself."
st2 = "It's so sad that turtles can only walk slow."
st3 = "Not being smart and amazing is not worst thing in the world."
blob = TextBlob(st)
blob2 = TextBlob(st2)
blob3 = TextBlob(st3)
print('\nOriginal string: ', st)
print('Spell checked string: ', blob.correct())
print("Detecting language in above sentence...", blob.detect_language())
print(blob.translate(to='hi'))
if blob2.sentiment.polarity > 0:
    print("\n'", st2, "'", 'is positive')
else:
    print("\n'", st2, ",", 'is negative')

if blob3.sentiment.polarity > 0:
    print("\n'", st3, "'", 'is positive')
else:
    print("\n'", st3, "'", 'is negative')
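
A polarity of exactly 0 falls into the "negative" branch above; a three-way check (sketch) keeps neutral text separate:

def label_sentiment(blob):
    p = blob.sentiment.polarity
    if p > 0:
        return 'positive'
    if p < 0:
        return 'negative'
    return 'neutral'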
Esempio n. 52
0
#message = 'Persona Natural'
#context = [1,  'Jubilacion Patronal',  -0.024999999999999994,  'cuanto cuesta un estudio actuarial para una empresa pequena']
context = [0, None, None, None, None, 0]
out_message, context = proc_message(message, context)
out_message
context

out_message, context = proc_message(message, context)
out_message

message = 'No'

blob = TextBlob(message)

if blob.detect_language() != 'en':

    blob = blob.translate(to='en').lower()

#def set_greeting_text(self, text):
#        data = {"setting_type": "greeting", "greeting": {"text": text}}
#        fmt = self.graph_url + "me/thread_settings?access_token={token}"
#        return requests.post(fmt.format(token=self.access_token),
#                             headers={"Content-Type": "application/json"},
#                             data=json.dumps(data))

topics = ['jubilacion patronal', 'consultoria', 'recursos humanos', 'IESS']

rep = 'su tema {0} es el que tratamos'

print(rep.format(topics[0]))
Esempio n. 53
0
from textblob import TextBlob

eb = TextBlob('Meu coração bate feliz quando te vê.')
print(eb.detect_language())
Esempio n. 54
0
# assumed imports for this snippet (translator is presumably a googletrans Translator)
import numpy as np
import pandas as pd
from textblob import TextBlob
from googletrans import Translator

translator = Translator()

def readfile(country, language):
    df = pd.read_json('/home/saad/Data/Twitter/country/' + country +
                      '/stats/data.json')
    words = [
        'flat', 'curv', 'distance', 'lockdown', 'lock', 'pandamic', 'safe',
        'quaran', 'social distan', 'social_distan', 'distancing', 'stay',
        'remote', 'home', 'indoor'
    ]
    trans_word = []
    for j in range(0, len(words)):
        try:
            t_word = translator.translate([words[j]], dest=language)
            for translation in t_word:
                trans_word.append(translation.text)  # the original append() had no argument
        except:
            continue

    tweets = list(df['tweet'])
    countries = list(df['country'])
    trends = list(df['trend'])
    retweets = list(df['retweets'])
    favorites = list(df['favorites'])
    dates = list(df['date'])
    dfObj = {}
    dfObj['Date'] = []
    dfObj['Tweet'] = []

    for i in range(0, len(tweets)):
        a = 0
        b = 0
        c = 0

        if trends[i] != 'roadsafety' and (any(word in trends[i]
                                              for word in words)
                                          or any(word in trends[i]
                                                 for word in trans_word)):
            a = 1
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue
            # dfObj['Date'].append(dates[i])
            # dfObj['Tweet'].append(tweets[i])

        if any(word in tweets[i]
               for word in words) or any(word in tweets[i]
                                         for word in trans_word):
            b = 1
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue

        if (a == 1) and (b == 0):
            try:
                blob = TextBlob(tweets[i])
                lang = blob.detect_language()
                if (lang != 'en'):
                    print("doing")
                    tweets[i] = blob.translate(from_lang=lang, to='en')
                dfObj['Date'].append(dates[i])
                dfObj['Tweet'].append(tweets[i])
            except:
                continue

    ds = dict(Date=np.array(dfObj['Date']), Tweet=np.array(dfObj['Tweet']))
    ds = pd.DataFrame({key: pd.Series(value) for key, value in ds.items()})
    ds['Date'] = pd.to_datetime(ds['Date'])
    ds = ds.sort_values(by='Date')
    ds.to_pickle("/home/saad/Data/Twitter/country/" + country +
                 "/stats/nlpmood.pkl")
Esempio n. 55
0
#partnerfunds.com
for i in range(0, len(test)):
    siteurl = str(test[i][0])
    text = str(test[i][1])

    #Cortical.io
    termKeyWords = client.extractKeywords(text)
    termBitmap = client.getTextBitmap(text)['fingerprint']['positions']

    #TextBlob
    blob = TextBlob(text)

    MySqlKeyWordDat = (','.join(termKeyWords), siteurl)
    MySqlBitMapDat = (str(termBitmap), siteurl)
    MySqlTextBlobDat = (str(blob.sentiment), siteurl)
    MySqLangDat = (str(blob.detect_language()), siteurl)

    print "---For "+siteurl+" keywords = " + ",".join(termKeyWords) + " sentiment = " + MySqlTextBlobDat[0] + " lang:" + MySqLangDat[0]

    MySqlKeyWordDatQ = """UPDATE """+dbtable+""" SET cortical_io_keywords = %s WHERE siteurl = %s"""
    MySqlBitMapDatQ = """UPDATE """+dbtable+""" SET cortical_io = %s WHERE siteurl = %s"""
    MySqBlobDatQ = """UPDATE """+dbtable+""" SET opencalais = %s WHERE siteurl = %s"""
    MySqLangDatQ = """UPDATE """+dbtable+""" SET watson = %s WHERE siteurl = %s"""

    #upload keywords and bitmap to database
    cur.execute(MySqlKeyWordDatQ, MySqlKeyWordDat)
    cur.execute(MySqlBitMapDatQ, MySqlBitMapDat)
    cur.execute(MySqBlobDatQ, MySqlTextBlobDat)
    cur.execute(MySqLangDatQ, MySqLangDat)
    con.commit()
Esempio n. 56
0
def detect():
    word = TextBlob(text_in.get('1.0', 'end'))
    if len(word) > 2:
        label_detected_lang.configure(text=lang_dict_rev[word.detect_language()].upper())
    else:
        label_detected_lang.configure(text='')
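
lang_dict_rev is not defined in this snippet; presumably it reverses a name-to-code mapping along these lines (hypothetical names and contents):

lang_dict = {'english': 'en', 'french': 'fr', 'german': 'de', 'spanish': 'es'}
lang_dict_rev = {code: name for name, code in lang_dict.items()}  # 'en' -> 'english'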
Esempio n. 57
0
# text=parsed["content"]
# print(text)
# print("\n")

#London is the capital. And most populous city. England.
text = sys.argv[1]  #"Parlez-vous anglais?"

outfile = "output.txt"
f = open(outfile, "w")
f.write(str(text))
f.close()

text1 = open("output.txt")
text1 = text1.read()
translation = TextBlob(text1)
if translation.detect_language() != 'en':
    en_blob = translation.translate(to='en')
    text1 = en_blob

text1 = str(text1)
#text="Parlez-vous anglais? London is the capital and most populous city of England and  the United Kingdom. Today Machine learning (ML) is the scientific study of algorithms in  statistical models that computer systems use to progressively improve their performance on a specific task. Machine learning algorithms build a mathematical model of sample data in India, known as training data, in order to make predictions or decisions for Google without being explicitly programmed to perform the task. Machine learning algorithms are used in the applications of google email filtering, detection of network intruders, and computer vision, where it is infeasible to develop an algorithm of specific instructions for performing the task. Machine learning is closely related to computational statistics, which focuses on making predictions using computers. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a field of study within machine learning, and focuses on exploratory data analysis through unsupervised learning.In its application across business problems, machine learning is also referred to as predictive analytics. Manchester United is a very famous football club."

doc = nlp(text1)

# Add some Stop Words
my_stop_words = [':', '.', ',', '-', '(', ')', '"', ' ']
for stopword in my_stop_words:
    add_word = nlp.vocab[stopword]
    add_word.is_stop = True

#Printing After Removing StopWords
stopwords = list(STOP_WORDS)
Esempio n. 58
0
from textblob import TextBlob
from textblob.exceptions import TranslatorError
import numpy as np
import pandas as pd
df=pd.read_csv("liveChatData.csv")
language=[]
msgs=df.Message

for i in range(len(msgs)):
    try:
        t1=TextBlob(msgs[i])
        lan=t1.detect_language()
        language.append(lan)
    except (TypeError,TranslatorError):
        lan=np.nan
        language.append(lan)
        print("Translator Error")
    print(i)

df["Language"]=language
df.to_csv("langDf.csv")
Esempio n. 59
0
def is_english(t):
    t = t.replace("#", "")
    chk = TextBlob(t)
    if chk.detect_language() == 'en':
        return True
    return False
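
detect_language() raises for strings shorter than three characters; a guarded variant (sketch) returns False instead of propagating the error:

from textblob import TextBlob
from textblob.exceptions import TranslatorError

def is_english_safe(t):
    t = t.replace("#", "")
    if len(t.strip()) < 3:
        return False
    try:
        return TextBlob(t).detect_language() == 'en'
    except TranslatorError:
        return False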
Esempio n. 60
0
from textblob import TextBlob

input_str = input('\n enter the string: ')
textblob_obj = TextBlob(input_str)
detected_language = textblob_obj.detect_language()
print('\n detected language:', detected_language)

input_string = input('\n enter the text: ')

textblob_obj_2 = TextBlob(input_string)
arabic_op = textblob_obj_2.translate(to='ar')

print('\n input string translated to Arabic:', arabic_op)

chinese_op = textblob_obj_2.translate(to='zh-CN')
print('\n input string translated to Chinese:', chinese_op)

french_op = textblob_obj_2.translate(to='fr')
print('\n input string translated to French:', french_op)

greek_op = textblob_obj_2.translate(to='el')
print('\n input string translated to Greek:', greek_op)

hindi_op = textblob_obj_2.translate(to='hi')
print('\n input string translated to Hindi:', hindi_op)