def get_similar_words(cleaned_query_words, doc):
    global expanded_query_terms
    try:
        command = "/home/sneha/phoenix/galago/galago-3.6-bin/bin/galago doc --index=/phoenix/ir_code/galago-index-rb4/ --id=" + doc + " --text=true --metadata=false --tokenize=true | sed -n '/Term vector:/,/<TEXT>/p'"
        outl = subprocess.check_output(command, shell=True) 
        out = outl.split('\n')
        word_array = list()
        for i in range(1, len(out) - 2):
            words = out[i].split()
            if(len(words) > 0):
                word_array.append(words[len(words)-1])
       
        q_syn_list = list()
        for q in cleaned_query_words:
            q_wn = Word(q)
            q_syn_list.append(q_wn.get_synsets(NOUN))

        for w in word_array:
            if w not in cleaned_query_words:
                w_wn = Word(w)
                w_syn = w_wn.get_synsets(NOUN)
                for q_syn in q_syn_list:
                    max_syn = 0
                    for i in range(0, min(2, len(q_syn))):
                        for j in range(0, min(2, len(w_syn))):
                            syn = q_syn[i].path_similarity(w_syn[j])
                            max_syn = max(max_syn, syn)
                    if(max_syn > 0.3):
                        expanded_query_terms[w] = max_syn
    except:
        pass
    print "Done processing " + doc
def default_adv_xxx_bigram_polarity(bigram, negation=None, prior_polarity_score=False, linear_score=None):
	"""Calculates the bigram polarity based on a empirical factor from each adverb group
		and SENTIWORDNET word polarity
	"""

	second_word_polarity = word_polarity(bigram['second_word'], 
								bigram['second_word']['tag'], 
								prior_polarity_score = prior_polarity_score,
								linear_score = linear_score)

	# If it is a verb, try again with the lemmatized form
	if bigram['second_word']['tag'] in util.PENN_VERBS_TAGS and \
		(second_word_polarity == None or second_word_polarity[0] == 0):
			w = Word(bigram['second_word']['raw'])
			bigram['second_word']['lemma'] = w.lemmatize("v")
			second_word_polarity = word_polarity(bigram['second_word'],
											bigram['second_word']['tag'], 
											prior_polarity_score = prior_polarity_score,
											linear_score = linear_score)

	# if the second word still has no polarity, stop here
	if second_word_polarity == None:
		return None

	return apply_adverb_factor(bigram['first_word']['raw'],second_word_polarity[0], negation)
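A minimal usage sketch for the function above, with a hypothetical bigram dict; it assumes word_polarity, apply_adverb_factor and util.PENN_VERBS_TAGS are provided by the surrounding module, as the snippet implies.
# Hypothetical adverb + verb bigram; the dict layout mirrors the keys used above.
sample_bigram = {
    'first_word': {'raw': 'very', 'tag': 'RB'},
    'second_word': {'raw': 'liked', 'tag': 'VBD'},
}
polarity = default_adv_xxx_bigram_polarity(sample_bigram, negation=None,
                                           prior_polarity_score=True)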
Example #3
def preprocess(tagged):
    word = Word(tagged[0])
    if word.isalpha() and word not in stopwords:
        tag = penn_to_wn(tagged[1])
        l = word.lemmatize(tag)
    else:
        l = ''
    return l
def getword(word):
    w = Word(word[0])
    result = word[0]
    if word[1] == 'JJR' or word[1] == 'JJS':
        result = w.lemmatize('a')
    elif word[1] == 'NNS' or word[1] == 'NNP' or word[1] == 'NNPS':
        result = w.lemmatize('n')
    return result
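For illustration, assuming the input is a (token, Penn-tag) pair from a POS tagger such as TextBlob's:
# comparative/superlative adjectives and plural nouns get lemmatized, everything else passes through
base = getword(('cars', 'NNS'))      # expected to yield 'car'
same = getword(('quickly', 'RB'))    # unchanged: 'quickly'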
Example #5
 def get_verbs(self, words_tags):
     result = []
     for idx, (word, tag) in enumerate(words_tags):
         if tag in self.verb_tags:
             verbz = Word(word)
             if verbz.lemmatize("v") not in self.neglect and word not in self.neglect:
                 src_verb = verbz.lemmatize("v")
                 result.append((idx, src_verb))
     return result
Example #6
def getTags(string):
  tb = TextBlob(string)
  # tb = TextBlob(str(tb.correct()))
  nouns = tb.noun_phrases
  real_nouns_of_NY = list()
  sentiment = tb.sentiment.polarity
  for noun in nouns:
    n = Word(noun)
    real_nouns_of_NY.append(n.lemmatize())
  return sentiment, real_nouns_of_NY
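A quick call of the function above (requires the NLTK corpora used by TextBlob's noun_phrases, sentiment and lemmatizer to be downloaded):
polarity, lemmatized_nouns = getTags("The little cafes in Brooklyn were wonderful")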
def extractWordPosPredicate(word, line,after):
	numofwords = len(word.split())
	i=0
	#print "WordPOSPREDICATE",word,line
	w = Word(word.split()[0])
	result = list()
	found = 0
	while i <= (len(line.split()) - numofwords):
		if re.match("[A-Za-z]+$",line.split()[i]):
			w = Word(word.split()[0])
			w1 = Word(line.split()[i])
			if w.lemmatize("v") == w1.lemmatize("v"):
				j = 1
				flag = 0
				for j in xrange(numofwords):
					#print line.split()[i+j],word.split()[j],numofwords
					if re.match("^[A-Za-z]+$",line.split()[i+j]):
						w = Word(word.split()[j])
						w1 = Word(line.split()[i+j])
						if w.lemmatize("v") != w1.lemmatize("v"):
							flag = 1
							break
				if flag == 0:
					found = 1
					if after==1:
						result.append(i+numofwords)						#Returns the position from where we have to take n gram after
					else:
						result.append(i - 1)   							#Returns the position from where we have to take n gram before
		
		i+=1

	if found == 1:
		return result
	else:
		return [-1]
Example #8
def extractWordPosPredicate(word, line,after):
	numofwords = len(word.split())
	i=0
	w = Word(word.split()[0])
	while i <= (len(line.split()) - numofwords):
		if re.match("[A-Za-z]+$",line.split()[i]):
			w = Word(word.split()[0])
			w1 = Word(line.split()[i])
			if w.lemmatize("v") == w1.lemmatize("v"):
				j = 1
				flag = 0
				for j in xrange(numofwords):
					#print line.split()[i+j],word.split()[j],numofwords
					if re.match("^[A-Za-z]+$",line.split()[i+j]):
						w = Word(word.split()[j])
						w1 = Word(line.split()[i+j])
						if w.lemmatize("v") != w1.lemmatize("v"):
							flag = 1
							break
				if flag == 0:
					if after==1:
						return i+numofwords						#Returns the position from where we have to take n gram after
					else:
						return i - 1   							#Returns the position from where we have to take n gram before
		
		i+=1

	return -1
 def transformToWords(self,x):
     '''
     Returns a list of lemmatized words.
     '''
     words = []
     for word in x.textBlob.words:
         myWord = Word(word.lemmatize('v').encode('utf-8'))
         myWord = Word(myWord.lemma)
         myWord = Word(myWord.singularize().upper().encode('utf-8'))
         words.append(myWord)
     return words
def lemmatize(textblob):
    '''
    Returns a list of lemmatized words.
    '''
    words = []
    for word in textblob.words:
        myWord = Word(word.lemmatize('v').encode('utf-8'))
        myWord = Word(myWord.lemma)
        myWord = Word(myWord.singularize().upper().encode('utf-8'))
        words.append(myWord)
    return words
	def sentenceToFeatures(self, sentence):
		feat = []

		for word in sentence:
			werd = Word(word)
			syns = [w.lemma_names for w in werd.get_synsets()]
			for syn in syns:
				try:
					feat.append(self.word_to_idx[syn])
				except KeyError:
					continue
		return list(set(feat))
Example #12
def imprimir_resto(clase, puesto, descrip, req):
    #lineaTotal = filter(lambda x: x in string.printable, lineaTotal)
    archEscritura.write(clase)
    archEscritura.write(",")
    blobPuesto = TextBlob(puesto.decode('utf-8'))
    blobDescrip = TextBlob(descrip.decode('utf-8'))
    blobReq = TextBlob(req.decode('utf8', 'ignore'))
    wordsPuesto = blobPuesto.words
    wordsDescrip = blobDescrip.words
    wordsReq = blobReq.words
    
    for wordP in wordsPuesto:
        nword = strip_accents(wordP)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        nword = nword.lower()
        nword = filter(lambda x: x in string.printable, nword)
        archEscritura.write(nword)
        archEscritura.write(" ")
    archEscritura.write(",")
    stemmer = SnowballStemmer("spanish")
    cad = ""
    for wordD in wordsDescrip:
        nwordD = strip_accents(wordD)
        exclude = set(string.punctuation)
        nwordD = ''.join(ch for ch in nwordD if ch not in exclude)
        nwordD = filter(lambda x: x in string.printable, nwordD)
        if nwordD not in (stopwords.words('spanish')):  # remove stop words
            w=Word(nwordD)        
        #comentarios.append(w)
            word2= stemmer.stem(w.lower())
            archEscritura.write(word2)
            archEscritura.write(" ")
    archEscritura.write(",")
    lista = []
    for wordP in wordsReq:
        nwordP = strip_accents(wordP)
        exclude = set(string.punctuation)
        nwordP = ''.join(ch for ch in nwordP if ch not in exclude)
        nwordP = filter(lambda x:x in string.printable, nwordP)
        if nwordP not in (stopwords.words('spanish')):
            w=Word(nwordP)
            word3 = stemmer.stem(w.lower())
            if word3 not in lista:
                lista.append(word3)
    for pal in lista:
        archEscritura.write(pal)
        archEscritura.write(" ")
    archEscritura.write("\n")
	def initialize(self, sentences):
		self.max_feat_len = 0
		self.word_to_idx = {}
		idx = self.index_offset
		for sentence in sentences:
			syn_count = 0
			for word in sentence:
				werd = Word(word)
				syns = [w.lemma_names for w in werd.get_synsets()]
				for syn in syns:
					syn_count += 1
					if syn not in self.word_to_idx:
						self.word_to_idx[syn] = idx
						idx += 1
			self.max_feat_len = max(self.max_feat_len, syn_count)
	def loadData(self, filename):

		with open(filename, "r") as reviewfile:
			review = TextBlob("")
			for line in reviewfile:
				review += TextBlob(line).lower()
			for sentence in review.sentences:
				tmp = []
				for word in sentence.words:
					if word not in self.stopwords:
						w = Word(word)
						tmp.append(w.lemmatize())
						if word not in self.wordfrequency:
							self.wordfrequency[word] = review.word_counts[word]
				self.sentences.append(tmp)
 def split_sentences(self, line):
     correction_line = ""
     tokens = line.split()
     for token in tokens:
         if token.lower() in self.features:
             correction_line = correction_line + str(" ") + token
             continue
         b = Word(token)
         possible_values = b.spellcheck()
         result = possible_values[0][0]
         for word in possible_values:
             if word[0].lower() in self.features:
                 result = word[0]
                 break
         correction_line = correction_line + str(" ") + result
     return correction_line
Example #16
def spell_check(line):
    modified_line=line
    word_list=word_tokenize(line)
    for word in word_list:
        word=word.lower()
        if word in spell_dict:
            modified_line = re.sub(word, spell_dict[word], modified_line)
        elif word.isalnum():
            with open('English_words.txt', 'r') as search:
                known_words = search.read()
            if word not in english_dict and word not in known_words:
                w = Word(word)
                suggestion = w.spellcheck()
                best = max(suggestion, key=lambda s: s[1])  # highest-confidence suggestion
                if best[1] > 0.9:
                    word_checked = best[0]
                    spell_dict[word] = word_checked
                    modified_line = re.sub(word, spell_dict[word], modified_line)
    return modified_line
Example #17
def stopWordStem(linea):
    blob=TextBlob(linea.decode('utf-8'))
    words=blob.words
    comentarios=""
    stemmer = SnowballStemmer("spanish")
    primero=True
    for word in words:    
        if word not in (stopwords.words('spanish')):  # remove stop words
            w=Word(word)        
            if (primero):
                comentarios+=(stemmer.stem(w.lower()))
                primero=False
            else:
                comentarios+=" "
                comentarios+=(stemmer.stem(w.lower()))

    return comentarios
def preposition(line):
	first_letter = line[0]
	for word in line.split():
		tb = TextBlob(word)
		for w, t in tb.tags:
			if t == 'NN':
				b = Word(word)
				if word == str(b.singularize()):
					# print word + " is probably singular like " + b.singularize()
					if not_a_vowel(first_letter):
						return random.choice(['The ', 'A ', '']) + line
					else:
						return random.choice(['The ', 'An ', '']) + line
				elif word == str(b.pluralize()):
					return random.choice(['The ', 'Some ', 'Many ', 'Of ', 'For all of the ', '']) + line
	## if it gets to this point, we dont know if it is plural, so just figure out if 'a' or 'an'
	if not_a_vowel(first_letter):
		return random.choice(['A ', 'The ', '']) + line
	else:
		return random.choice(['An ', 'The ', '']) + line
Example #19
def imprimir_resto(lineaTotal):
    blob=TextBlob(lineaTotal.decode('utf-8'))
#split into words
    words=blob.words
    for word in words:
        nword = strip_accents(word)
        exclude = set(string.punctuation)
        nword = ''.join(ch for ch in nword if ch not in exclude)
        ncomments.append(nword)
    comentarios=[]
    stemmer = SnowballStemmer("spanish")
    count = 0
    for word in ncomments:    
        if count < 2:
            exclude = set(string.punctuation)
            word = ''.join(ch for ch in word if ch not in exclude)
            word = word.lower()
                    #word = strip_accents(word)
            comentarios.append(word)
        else:
            if word not in (stopwords.words('spanish')):  # remove stop words
                           
        #comentarios.append(w)
                exclude = set(string.punctuation)
                word = ''.join(ch for ch in word if ch not in exclude)
                word = word.lower()
                word = strip_accents(word)
                w=Word(word) 
                comentarios.append(stemmer.stem(w.lower()))
        count += 1
        
    ulist = []
    for com in comentarios:
        if com not in ulist:
            ulist.append(com)
         
    for com in ulist:
        com = filter(lambda x: x in string.printable, com)
        archEscritura.write(com)
        archEscritura.write(",")
    archEscritura.write("\n")
Example #20
def processRow(row):
    import re
    import nltk
    from textblob import TextBlob
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from textblob import Word
    from nltk.util import ngrams
    import re
    from wordcloud import WordCloud, STOPWORDS
    from nltk.tokenize import word_tokenize
    tweet = row
    #Lower case
    tweet = tweet.lower()
    #Removes unicode strings like "\u002c" and "x96"
    tweet = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', tweet)
    tweet = re.sub(r'[^\x00-\x7f]',r'',tweet)
    #convert any url to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert any @Username to "AT_USER"
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    tweet = re.sub('[\n]+', ' ', tweet)
    #Remove not alphanumeric symbols white spaces
    tweet = re.sub(r'[^\w]', ' ', tweet)
    #Replace #word with word (removes the hashtag symbol)
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #Remove :( or :)
    tweet = tweet.replace(':)','')
    tweet = tweet.replace(':(','')
    #remove numbers
    tweet = ''.join([i for i in tweet if not i.isdigit()])
    #remove multiple exclamation
    tweet = re.sub(r"(\!)\1+", ' ', tweet)
    #remove multiple question marks
    tweet = re.sub(r"(\?)\1+", ' ', tweet)
    #remove multistop
    tweet = re.sub(r"(\.)\1+", ' ', tweet)
    #lemma
    from textblob import Word
    tweet =" ".join([Word(word).lemmatize() for word in tweet.split()])
    #stemmer
    #st = PorterStemmer()
    #tweet=" ".join([st.stem(word) for word in tweet.split()])
    #Removes emoticons from text
    tweet = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-­p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:','', tweet)
    #trim
    tweet = tweet.strip('\'"')
    row = tweet
    return row
def tokenize_tweet(document):
    global uselessTerm
    document = document.lower()
    a = document.index("username")
    b = document.index("clusterno")
    c = document.rindex("tweetid") - 1
    d = document.rindex("errorcode")
    e = document.index("text")
    f = document.index("timestr") - 3
    # extract the three main pieces of information: username, tweet text, and tweetid
    document = document[c:d] + document[a:b] + document[e:f]
    terms = TextBlob(document).words.singularize()

    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        if expected_str not in uselessTerm:
            result.append(expected_str)

    return result
Example #22
def nb_of_speliing_errors(essay):
    
    """Feature 4: Utiliza a bilbioteca textblob para correção de erros de ortografia"""
    essay = re.sub('@\S+','',essay)                                                 # Retira nomes padrão do ASAP iniciados em @
    
    vector = CountVectorizer()
    vector.fit_transform([essay])
    
    list_tokens = vector.get_feature_names()
    
    spell_errors = 0    
    for i in list_tokens:
        __result = None
        __result = re.search('@\S+', i)  # skip ASAP placeholder names such as @Location
        if __result is None:
            w = Word(i)  # textblob Word
            result_tuple = w.spellcheck()  # spellcheck() returns (suggestion, confidence) tuples
            if i != result_tuple[0][0] and result_tuple[0][1] == 1.0:  # only count corrections the checker is 100% sure of, avoiding singular suggestions for plural forms
                spell_errors += 1
    nb_of_spell_errors = spell_errors
    return nb_of_spell_errors
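A sketch of computing the feature for a single essay; CountVectorizer (scikit-learn) and re are assumed to be imported at module level, as the snippet implies.
essay_text = "The studint wrote an esay about @Location last summer"
errors = nb_of_speliing_errors(essay_text)   # counts tokens whose top correction differs with 100% confidence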
Example #23
def standardize_note(note, abbrDict):
    '''
    Standardize a note (collection of words).
    Assumes a first pass of abbreviation approximation
    has been performed.
    '''
    noteLw = note.lower()
    noteLw = noteLw.replace("'s", "")
    blob = TextBlob(noteLw)
    lematBlob = []
    for k, (word, pos) in enumerate(blob.tags):
        # clean the word
        word = word.strip("/")
        word = word.strip(".")
        # check once more to see if there is something
        word = Word(_replace_abbr(word, abbrDict))
        if "." in word:
            splitMore = word.split('.')
            for spWord in splitMore:
                fixedWord = _replace_abbr(spWord, abbrDict)
                for w in fixedWord.split():
                    lematBlob.append(_word_std(Word(w), pos))
        else:
            for w in word.split():
                lematBlob.append(_word_std(Word(w), pos))
    return " ".join(filter(lambda x: x != None, lematBlob))
Example #24
def textblob_adj(filepath, outfilepath, countfilepath, minpos):
    file = open(filepath)
    t = file.read()
    blobed = TextBlob(t)
    #counts = Counter(tag for word,tag in blobed.tags)
    adj_list = []
    adv_list = []
    adj_tag_list = ['JJ', 'JJR', 'JJS']
    adv_tag_list = ['RB', 'RBR', 'RBS']
    for (a, b) in blobed.tags:
        if b in adj_tag_list:
            expected_str = Word(a)
            expected_str = expected_str.lemmatize('a')
            adj_list.append(expected_str)
        elif b in adv_tag_list:
            expected_str = Word(a)
            expected_str = expected_str.lemmatize('r')
            adv_list.append(expected_str)
        else:
            pass

    with open(outfilepath, "w") as txt_file:
        for line in adj_list:
            txt_file.write(line + " ")
        for line in adv_list:
            txt_file.write(line + " ")
    # return adj_list, adv_list, counts['JJ']+counts['JJR']+counts['JJS'], counts['RB']+counts['RBR']+counts['RBS']

    count_from_text_file(outfilepath, countfilepath, minpos)
Example #25
def spell_check(query):
    #split query
    splitted_query = query.split()
    #empty list for spell checked query
    corrected_query = []
    #searching freq_dict in db
    dict_collection = mongo.db["dict_collection"]
    freq_dict = dict_collection.find_one({"name": "freq_dict"})["freq_dict"]
    #for each word in splitted query
    for word in splitted_query:
        #convert to testblob word
        blob_word = Word(word)
        #all the possible corrections to word
        possible_corrections = blob_word.spellcheck()
        #initial counter
        freq_counter = 1
        #for the case when spelling is incorrected but no word in document to correct it
        at_least_one = False
        #in case the spelling is correct
        corrected_word = blob_word
        #for each possible correction in the word
        for p in possible_corrections:
            #p[0]'s are the corrections and p[1] scores
            if p[0] in freq_dict.keys():
                #signifies at least one correction is present in dictionary so frequency based correction
                at_least_one = True
                #frequency of p[0]
                frequency = freq_dict[p[0]]
            else:
                frequency = 0
            #keeping highest frequency and corresponding word in record
            if frequency >= freq_counter:
                freq_counter = frequency
                corrected_word = p[0]
        #no correction was present in dictionary
        if at_least_one is False:
            #return correction with highest score
            corrected_word = blob_word.correct()
        corrected_query.append(corrected_word)
    return " ".join(corrected_query)
Example #26
def save_file_dict(file_path, save_file_dict_path):
    # file_dict = {}
    # file_dict_keys = []
    stop_words = stopwords.words('english')
    count = 0
    for fname in file_path:
        with open(fname, 'r', encoding='ISO-8859-1') as f:
            # tokenization / normalization
            s = f.read().lower()
            s = s.replace('/', ' ')
            s = s.replace('-', ' ')
            w = TextBlob(s).words

            clean_w_list = []
            for word in w:
                # stemming (w = w.stem() another way)
                word = Word(word)
                word = word.lemmatize()
                word = Word(word)
                word = word.lemmatize("v")
                # stopwords (nltk.download("stopwords") download the dataset)
                if(word not in stop_words and (word not in ['\'s', '\'ll', '\'t'])):
                    clean_w_list.append(word)

            clean_w_dict = dict(collections.Counter(clean_w_list))
            
            path_list = fname.split('\\')
            save_dict_name = path_list[-2] + '_' + path_list[-1]
            save_dict_path = save_file_dict_path + '/' + save_dict_name + '.txt'
            save_dict(clean_w_dict, save_dict_path)
            # file_dict[save_dict_name] = clean_w_dict
            # file_dict_keys.append(save_dict_name)

            print("The %dth file finished."%(count))
            count += 1
Example #27
def process_data_cleaning(data):
    # remove undesired unicode characters
    print("start data cleaning")

    data["message"] = data["message"].apply(lambda x: x.replace(
        "\\/", "/").encode("ascii", "ignore").decode("ascii"))

    # remove html tags, urls etc.
    tag_cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    url_cleaner = re.compile('http\S+|http\S+|www\S+')

    data["message"] = [re.sub(tag_cleaner, " ", x) for x in data["message"]]
    data["message"] = [re.sub(url_cleaner, " ", x) for x in data["message"]]

    # transform into lowercase
    data["message"] = data["message"].apply(lambda x: x.lower())

    # remove smileys, symbols and all the crap
    data["message"] = [remove_emoji(x) for x in data["message"]]

    # remove all numbers
    data["message"] = data["message"].apply(lambda x: " ".join(
        x for x in x.split() if check_if_number(x) is not True))

    # remove all special characters, which might be left
    spec = set(string.punctuation)
    data["message"] = data["message"].apply(
        lambda x: x.translate({ord(i): None
                               for i in spec}))

    # remove stopwords - check on lowercase level
    stop = nltk.corpus.stopwords.words('english')
    data["message"] = data["message"].apply(
        lambda x: " ".join(x for x in x.split() if x not in stop))

    # remove rows which include empty messages
    empty_string_filter = data["message"] != ""
    data = data[empty_string_filter]

    print("end data cleaning")
    print("start spelling correction")
    # spelling correction
    data["message"] = data["message"].apply(lambda x:
                                            (str(TextBlob(x).correct())))
    print("end spelling correction")
    print("start lemmatisation")
    # perform lemmatisation
    data["message"] = data["message"].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
    print("start lemmatisation")

    return data
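A hypothetical call with a tiny DataFrame; it assumes pandas is imported as pd and that the helpers remove_emoji and check_if_number exist elsewhere in the module (neither is shown in the snippet). The spelling-correction step can be slow on large data.
sample = pd.DataFrame({"message": ["Check http://example.com :-) 123 GREAT producttt!!"]})
cleaned = process_data_cleaning(sample)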
	def Clean(self):

		data = pd.read_csv(self.path)

		data = data.drop('author', axis=1)


		data = data.drop(data[data.sentiment == 'anger'].index)
		data = data.drop(data[data.sentiment == 'boredom'].index)
		data = data.drop(data[data.sentiment == 'enthusiasm'].index)
		data = data.drop(data[data.sentiment == 'empty'].index)
		data = data.drop(data[data.sentiment == 'fun'].index)
		data = data.drop(data[data.sentiment == 'relief'].index)
		data = data.drop(data[data.sentiment == 'surprise'].index)
		data = data.drop(data[data.sentiment == 'love'].index)
		data = data.drop(data[data.sentiment == 'hate'].index)
		data = data.drop(data[data.sentiment == 'neutral'].index)
		data = data.drop(data[data.sentiment == 'worry'].index)

		data['content'] = data['content'].apply(lambda x: " ".join(x.lower() for x in x.split()))


		data['content'] = data['content'].str.replace('[^\w\s]',' ')



		stop = stopwords.words('english')
		data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))


		from textblob import Word
		data['content'] = data['content'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

		import re
		def del_repeat(text):
		    pattern = re.compile(r"(.)\1{2,}")
		    return pattern.sub(r"\1\1", text)

		data['content'] = data['content'].apply(lambda x: " ".join(del_repeat(x) for x in x.split()))

		freq = pd.Series(' '.join(data['content']).split()).value_counts()[-10000:]


		freq = list(freq.index)
		data['content'] = data['content'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))



		lbl_enc = preprocessing.LabelEncoder()
		y = lbl_enc.fit_transform(data.sentiment.values)

		return data
 def transformToWords(self, x):
     '''
     Returns a list of lemmatized words.
     '''
     words = []
     for word in x.textBlob.words:
         myWord = Word(word.lemmatize('v').encode('utf-8'))
         myWord = Word(myWord.lemma)
         myWord = Word(myWord.singularize().upper().encode('utf-8'))
         words.append(myWord)
     return words
Example #30
def get_hypernym(askedfor):
    # put in a word, get a list of other words
    word = Word(askedfor)
    if word.synsets:
        generalsenses = word.synsets[0]
        next_level = [
            h.name().split(".")[0] for h in generalsenses.hypernyms()
        ]
        print('A {} is a/an {}'.format(askedfor, next_level[0]))
        return (next_level[0])
    else:
        print('{} is NOT a word'.format(askedfor))
        return ('{} is NOT a word'.format(askedfor))
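Example call (WordNet corpus required); the exact wording depends on the installed WordNet data.
hypernym = get_hypernym("dog")   # prints something like "A dog is a/an canine" and returns the first hypernym name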
Example #31
	def defineWord(self, word):
		with humanfriendly.AutomaticSpinner("Loading, this can take awhile for the first time, but repeating the command again will be considerably faster..."):
			try:
				blob = Word(word)
				defs = blob.definitions
				s = ""
				if len(defs) > 0:
					for item in defs[0:4]:
						s += item.capitalize()+".\n"
					return s
				else:
					return "No result found."
			except RuntimeError:	
				blob = Word(word)
				defs = blob.definitions
				s = ""
				if len(defs) > 0:
					for item in defs[0:4]:
						s += item.capitalize()+".\n"
					return s
				else:
					return "No result found."
def main():
    f = open('relativity.txt', 'r')
    content = f.read()

    wiki = TextBlob("My namee is John!")
    #wiki.tags
    sentiment = wiki.sentiment
    w = Word('hullo')
    print(wiki.correct())
    to_fr = wiki.translate(to='fr')
    print(to_fr)

    input()
def Input_pipeline(Tweet, filename):
    # str.replace() is literal, so the regex patterns need re.sub (assumes `import re` at module level)
    Tweet = str(Tweet).lower()
    Tweet = re.sub(r'http\S+', '', Tweet)       # drop URLs
    Tweet = re.sub(r'[^\w\s]', '', Tweet)       # drop punctuation
    Tweet = re.sub(r'\s+', ' ', Tweet).strip()  # collapse whitespace
    Tweet = ' '.join(
        [Word(item).lemmatize() for item in Tweet.split() if item not in stop])
    TFIDF = Vectorizer.transform([Tweet]).toarray()
    Classifier_ = pickle.load(open(filename, 'rb'))
    Prediction = Classifier_.predict(TFIDF)
    if Prediction[0] == 0:
        return 'Real'
    elif Prediction[0] == 1:
        return 'Fake'
    return Prediction
Example #34
def predict():
    if request.method == 'POST':
        message = request.form['message']
        if (len(message) > 2):
            text = message
            pre_processed_reviews = []
            data = gensim.utils.simple_preprocess(text, min_len=2)
            review = ' '.join(WordNetLemmatizer().lemmatize(word)
                              for word in data if word not in stop_words)
            pre_processed_reviews.append(review.strip())
            tfidf_model = joblib.load(MODEL_tfidf)
            vect = tfidf_model.transform(pre_processed_reviews)
            lr_model = joblib.load(MODEL_lr)
            my_prediction = lr_model.predict(vect)
        else:
            my_prediction = 3
            return render_template('home.html', prediction=my_prediction)

        blob = TextBlob(text)
        nouns = list()
        for word, tag in blob.tags:
            if tag == 'NN':
                nouns.append(word.lemmatize())
        display = []
        output = ""
        for item in random.sample(nouns, len(nouns)):
            word = Word(item)
            if word not in display:
                display.append(word.capitalize())

        for i in display:
            if len(i) > 2:
                output = output + " " + i
            else:
                output = ""

        return render_template('home.html',
                               prediction=my_prediction,
                               summary=output)
def findLemmas(adj):
    synonyms = []
    antonyms = []
    word = Word(adj)
    for syn in word.synsets[:]:
        # for syn in list(wn.senti_synsets(adj)):
        for l in syn.lemmas():
            # for l in syn.synset.lemmas():
            synonyms.append(l.name())
            if l.antonyms():
                antonyms.append(l.antonyms()[0].name())

    return synonyms, antonyms
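For example (WordNet corpus required):
syns, ants = findLemmas("happy")   # syns typically include 'felicitous', ants include 'unhappy'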
Example #36
def index(request):
    Note = "Not Found"
    if request.method == 'POST':
        text = request.POST['text']
        lower = text.lower()
        defination = Word(lower).definitions
        final = ' '.join(map(str, defination))
        if final is "":
            messages.error(request,
                           'Either spelling mistake or can\'t find your word')
        context = {"defination": final}
        return render(request, 'index.html', context)
    return render(request, 'index.html')
 def answer_yes_no_question(self, qcorpus, tk, relevant_sentences):
     relevant_sentence = nlp(str(relevant_sentences[0]))
     q_tags = tk.tags
     keyword = ""
     key_tag = ""
     for (token, tag) in q_tags:
         if tag[0] == 'N' or tag[0] == "V":
             keyword = token
             key_tag = tag
     keyword_found = False
     neg = False
     for token in relevant_sentence:
         if str(token) == keyword or (key_tag[0] == "V"
                                      and Word(str(token)).lemmatize("v")
                                      == Word(keyword).lemmatize("v")):
             keyword_found = True
         if token.dep_ == 'neg':
             neg = not neg
     if keyword_found:
         if neg: return "No"
         else: return "Yes"
     return "No"
Example #38
def recuperer_mots_tweet(data, seuil):
    # Returns the unique, lemmatized words of a set of tweets, keeping only the infrequent ones.

    ## Build the list of words
    L = []

    ## Collect the words
    for tweet in data['tweet_textual_content']:
        t = TextBlob(tweet)
        L.extend(t.words)

    ## Drop the words that are too frequent
    L = [word for word in L if L.count(word) <= seuil]

    ## Lemmatization
    L = [Word(word).lemmatize() for word in L]

    return L
Example #39
def synonyms(word, maxSyns):
    syns, ants = [], []
    for syn in Word(word).synsets:
        for l in syn.lemmas():
            syns.append(l.name())
            if l.antonyms():
                ants.append(l.antonyms()[0].name())
    final = [
        syns[i] for i in sample(range(0, len(syns)), min(maxSyns, len(syns)))
    ]
    for ant in ants:
        final.append(ant)
    return final
Example #40
 def place_mark(self, word):
     if word in self.previous_words:
         self.status = 2
         return self.letters
     elif len(word) <= 2 or len(word) > 6:
         self.status = 0
         return self.letters
     elif not (self.dictionary.meaning(word) == None):
         self.previous_words.append(word)
         self.calc_score(len(word))
         self.status = 1
     else:
         w = WordDict(word)
         check = w.spellcheck()
         if (check[0][1] == 1.0) and (word == check[0][0]):
             self.previous_words.append(word)
             self.calc_score(len(word))
             self.status = 1
         else:
             self.calc_score(0)
             self.status = 0
     return self.letters
Example #41
def normalization(array):
    sw = stopwords.words("english")
    tempArray = []

    for i in range(0, len(array)):
        comment = array[i]  # obtaining the words inside of the sentence
        sentence = ""  # used for storing words after processes
        for word in comment.split():  # each word
            if word in sw:  # if word is stopword, skip this word for process in below
                continue
            word = Word(word).lemmatize()  # lemmatize
            word = word.lower()  # lowercase
            word = "".join(
                char for char in word
                if char.isalpha() or char == " " or char == "'"
            )  # keep only letters, spaces and apostrophes so stop words are not missed because of "'" or spaces
            sentence += word + " "

        tempArray.append(sentence)
        sentiment_score = TextBlob(sentence).sentiment
        print(sentiment_score)
    return tempArray
Example #42
def find_synms(word, c=None, pos=None):
    from textblob import Word
    from itertools import chain

    synonyms = Word(word).get_synsets(pos)

    #for wl in synonyms:
    #   print(synonyms[0], wl.path_similarity(synonyms[0]), wl.lemma_names())

    lemmas = chain.from_iterable([word.lemma_names() for word in synonyms])
    lemmas = mm.remove_dup_list(lemmas, case=True)

    return lemmas[0:c]
Example #43
def paragraph_lemma(txt):
    """
    Lemmatize a paragraph.

    Parameters
    ----------
    txt : str
        Texts after removing numbers and punctuations.

    Returns
    -------
    lemma_txt : str
        lemmatized paragraph with
        each word being lowercase.

    """
    token_word = word_tokenize(txt)
    lemma_txt = ' '.join([
        Word(word.lower()).lemmatize() for word in token_word
        if len(Word(word.lower()).lemmatize()) > 1
    ])
    return lemma_txt
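A small illustration, assuming numbers and punctuation were already stripped as the docstring expects:
paragraph_lemma("The cats were running")   # roughly 'the cat were running' (single-character tokens are dropped)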
def get_correct(text: TextBlob) -> Dict[str, Union[str, Dict]]:
    """ Функция, возвращающая текст без ошибок и варианты правильного написания слов. """

    corrected_word = text.correct()
    correctly_vars: Dict = {
        x: str(Word(x).spellcheck()[0][0])
        for x in text.words
    }

    return {
        "corrected": str(corrected_word),
        "correctly words": correctly_vars
    }
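Usage sketch:
result = get_correct(TextBlob("I havv goood speling"))
# result["corrected"] should be roughly "I have good spelling";
# result["correctly words"] maps each word to its top spellcheck suggestion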
Example #45
def definition(text='hate'):
    e = ''
    mean = []
    try:
        if request.method == 'POST':
            text = request.form['mean']
            mean = Word(text).definitions
        else:
            e = 'Sorry, I don\'t have anything about this.'

    except:
        e = 'Sorry, I don\'t have anything about this.'
    return render_template('home.html', e=e, mean=mean[:5])
Example #46
    def __to_singular(row, ne):

        if 'NIW' not in ne.izen_lexikografikoa.values:
            return row

        word = Word(row.word).singularize()

        try:
            singular = ne[ne.word == word].reset_index().loc[0]
            row.izen_lexikografikoa = singular.izen_lexikografikoa
        except:
            pass
        return row
Example #47
def spell_checker(deduped_text, final_text):
    temp = list()
    for text in deduped_text:
        zen = text.split(' ')
        num_words = len(zen)
        crt_words = 0
        empty_words = 0

        for word in zen:
            if word == '':
                empty_words += 1
            else:
                w = Word(word)
                if w.spellcheck()[0][1] > 0.9:
                    crt_words += 1

        num_words -= empty_words
        if crt_words / num_words >= 0.6:
            temp.append(text)

    if len(temp) > 0:
        final_text.extend(temp)
Example #48
def lemmatize(textblob):
    '''
    Returns a list of lemmatized words.
    '''
    words = []
    for word in textblob.words:
        myWord = Word(word.lemmatize('v').encode('utf-8'))
        myWord = Word(myWord.lemma)
        myWord = Word(myWord.singularize().upper().encode('utf-8'))
        words.append(myWord)
    return words
Example #49
def preprocess(sentence, use_stemmer=False, use_lowercase=False,
               use_stopwords = False,remove_nonalpha=False, use_lemma = False):
    # We always tokenize
    blob = TextBlob(sentence)

    # words = blob.words
    words_and_tags = blob.tags

    words_and_tags_list = [list(x) for x in words_and_tags]


    if use_stopwords:
        stopwords = list(set(nltk_stopwords.words('english')))
    else:
        stopwords = []

    # Matching stop words without lowercasing first makes little sense, since not all of them would be found.
    # Therefore, we lowercase all the words before checking whether they appear in the stopword list.
    # Note that we still return the original (unlowered) form of each word that is not a stop word.
    # words = [word for word in words if word.lower() not in stopwords]

    # List comprehension sorting out. Take the first element for each nested list, and check if it's
    # in the stopwords list. Then root out nested lists, which have only pos remaining.
    words_and_tags_list = [[word for word in wordpos if word.lower() not in stopwords] for wordpos in words_and_tags_list]

    words_and_tags_list = [x for x in words_and_tags_list if len(x) > 1]


    if use_lowercase:
        # words = [word.lower() for word in words if word.lower() not in stopwords]
        words_and_tags_list = [[word[0].lower(), word[1]] for word in words_and_tags_list]

    # Note that this also removes all words, which have a number in them
    if remove_nonalpha:
        # words = [word for word in words if word.isalpha() and word not in stopwords]
        words_and_tags_list = [[word for word in wordpos if word.isalpha()] for wordpos in words_and_tags_list]

        words_and_tags_list = [x for x in words_and_tags_list if len(x) > 1]


    # We stem if asked to, although it will cause problems with synset searches
    # Stemmer also makes words lowercase by default
    if use_stemmer:
        stemmer = SnowballStemmer("english")
        # words = [stemmer.stem(word) for word in words]
        words_and_tags_list = [[stemmer.stem(word[0]), word[1]] for word in words_and_tags_list]

    if use_lemma:
        words_and_tags_list = [[Word(word[0]).lemmatize(pos=posTb2Wn.get(word[1])), word[1]] for word in words_and_tags_list]

    return words_and_tags_list
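An illustrative call (NLTK stopwords and POS-tagger corpora must be available; posTb2Wn is only needed when use_lemma=True):
tokens = preprocess("The cats are running fast!",
                    use_lowercase=True, use_stopwords=True, remove_nonalpha=True)
# -> list of [word, POS] pairs, e.g. [['cats', 'NNS'], ['running', 'VBG'], ['fast', 'RB']]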
def Sim2(text1, text2) :
    
    stop = stopwords.words('english')
    
    text1=regexpProcessing(text1)
    text2=regexpProcessing(text2)
    
    # normalize both texts to lower case
    TEXT1=text1.strip()
    TEXT2=text2.strip()
    TEXT1=TEXT1.lower()
    TEXT2=TEXT2.lower()
    
    token1 = generateTokens(TEXT1)
    token2 = generateTokens(TEXT2)
    
    t1List=[]
    for tok1 in token1:
        word1 = Word(tok1)
        w1=word1.spellcheck()
        correctw=w1[0][0]
        confidence = w1[0][1]
        
        if (confidence > 0.8) and (correctw not in stop):
            t1List.append(correctw)
            
            
    t2List=[]
    for tok2 in token2:
        word2 = Word(tok2)
        w2=word2.spellcheck()
        correctw=w2[0][0]
        confidence = w2[0][1]
        
        if (confidence > 0.8) and (correctw not in stop):
            t2List.append(correctw)

    # term-frequency vectors for the two spell-corrected, stopword-filtered token lists
    # (assumes `from collections import Counter` and `import math` at module level)
    vec1 = Counter(t1List)
    vec2 = Counter(t2List)

    # cosine similarity between the two vectors
    dot = sum(vec1[t] * vec2[t] for t in set(vec1) & set(vec2))
    norm1 = math.sqrt(sum(c * c for c in vec1.values()))
    norm2 = math.sqrt(sum(c * c for c in vec2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    CosineSimilarity = dot / (norm1 * norm2)
    return CosineSimilarity
Example #51
def processSentence(sentence):
    result_list = list()
    wiki = TextBlob(sentence)
    for word_tuple in wiki.tags:
#        print(word_tuple)
        try:
            w = Word(word_tuple[0])
            if word_tuple[1].startswith('JJ'):
                k = 'a'
            else:
                k = word_tuple[1][0].lower()
            if k in {'c','p','i','w','d','t','m','e','u','f','s'}:
                norm = w.lemmatize()
            else:
                norm = w.lemmatize(k)
#            print(k,norm)
        except:
            #print word_tuple
            norm = ''

        if norm == word_tuple[0]:
            norm = ''
        result_list.append(word_tuple[0]+'/'+norm+'/'+word_tuple[1])
    return ' '.join(result_list)
print()

#sentiment analysis
testimonial = TextBlob("Textblob is amazingly simple to use. What great fun!")
print("Textblob is amazingly simple to use. What great fun!")
a = testimonial.sentiment
print(a) #returns polarity[-1(for worst) to 1(for best)] and subjectivity
print()

#word inflection
sentence = TextBlob('Use 4 spaces per indentation level.')
print(sentence.words[2].singularize()) #similarly you can use pluralize
print()

#word lemmatization
w = Word("octopi")
print("octopi -> ",w.lemmatize())
w = Word("went")
print("went -> ",w.lemmatize("v"))
print()

#definition
print("Octopus : ",Word("octopus").definitions)
print()

#translation and language detection
en_blob = TextBlob(u'Simple is better than complex.')
print('Simple is better than complex.')
print("SPANISH : ",en_blob.translate(to='es'))
en_blob = TextBlob(u'Comment allez vous?')
print('Comment allez vous?')
def asprate(review):
    food_selected_sent = []
    service_selected_sent = []
    price_selected_sent = []
    ambience_selected_sent = []
    category = []

    zen = TextBlob(review)
    sentences = zen.sentences
    for sentence in sentences:
        words = sentence.words
        for i in words:
            w = Word(i)
            i = w.lemmatize()
            if i in food:
                food_selected_sent.append(sentence)

                break
            elif i in service:
                service_selected_sent.append(sentence)

                break
            elif i in price:
                price_selected_sent.append(sentence)

                break
            elif i in ambience:
                ambience_selected_sent.append(sentence)

                break
    # print (food_selected_sent,service_selected_sent,price_selected_sent,ambience_selected_sent)
    food_polarity = []
    service_polarity = []
    price_polarity = []
    ambience_polarity = []
    for i in food_selected_sent:
        food_polarity.append(i.sentiment.polarity)

    for i in service_selected_sent:
        service_polarity.append(i.sentiment.polarity)

    for i in price_selected_sent:
        price_polarity.append(i.sentiment.polarity)

    for i in ambience_selected_sent:
        ambience_polarity.append(i.sentiment.polarity)

    # print (food_polarity,service_polarity,price_polarity,ambience_polarity)
    if food_polarity:

        sum_food = 0
        for i in food_polarity:
            sum_food += i
        print "food", scale_rating(sum_food / len(food_polarity))

    if service_polarity:

        sum_service = 0
        for i in service_polarity:
            sum_service += i
        print "service", scale_rating(sum_service / len(service_polarity))

    if price_polarity:

        sum_price = 0
        for i in price_polarity:
            sum_price += i
        print "price", scale_rating(sum_price / len(price_polarity))

    if ambience_polarity:

        sum_ambience = 0
        for i in ambience_polarity:
            sum_ambience += i
        print "ambience", scale_rating(sum_ambience / len(ambience_polarity))
     im=enhancer.enhance(5)
     im=im.convert('1')
     #Saving the image
     im.save('final.jpg')
     #Reading the image
     text=pytesseract.image_to_string(Image.open('final.jpg'))
     #Getting the text from the image using pytesseract
     if len(text)!=0:
         print(text)
         token=nltk.word_tokenize(text)
         list_sugg=[]
         for i in range(0,len(token)):
             print("...................")
             t_line=TextBlob(token[i])
             w_line=Word(token[i])
             suggestions=w_line.spellcheck()
             print("are you looking for")
             for j in range(0,len(suggestions)):
                 print(str(j+1)+"->"+str(suggestions[j][0]))
             
             print("according to me   :"+str(t_line.correct()))
             list_sugg.append(str(t_line.correct()))
         print("according to me......")    
         print(" ".join(list_sugg))
     #'q' for exit
     if cv2.waitKey(1) &0xFF == ord('q'):
         break
 except:
     break
Example #55
import pandas as pd

from textblob import TextBlob
from textblob import Word

mon_ami_photos = pd.read_pickle(r'C:\Users\LauraM\Desktop\mon_ami_gabi_photos.pkl')
mon_ami_reviews = pd.read_pickle(r'C:\Users\LauraM\Desktop\mon_ami_gabi_reviews.pkl')
#print(mon_ami_photos['caption'])
#print(mon_ami_reviews['text'])

reviewsEntities=[]
counter=0
totalRev=len(mon_ami_reviews['text'])
for review in mon_ami_reviews['text']:
    print '%d : %s'%(len(reviewsEntities),review)
    textTB = TextBlob(review)
    textTB = textTB.correct()  # correct() returns a new TextBlob; it does not modify in place
    print textTB.sentiment
    entities = []
    for word in textTB.noun_phrases:
        w = Word(word)
        w = w.singularize()    # singularize() and lemmatize() also return new values
        w = Word(w).lemmatize()
        entities.append(w)
        print(w)
    reviewsEntities.append(entities)
    print '%d/%d : %s'%(len(reviewsEntities),totalRev,entities)

def decodeWeatherData(data):
    status = Word(data['status'].lower())
    status = status.lemmatize()

    # Set origin
    setStructureOrigin(status)

    if status == 'clear':
        pass
    elif status == 'rain':
        status = 'rainy'
    elif status == 'cloud':
        status = 'cloudy'


    poemStructure['weather_status'] = [status]
    poemStructure['actual_temp'] = [str(data['temp'])] 

    '''
    Above 120F: Torrid (R34,G0,B0)
    110 to 120F: Extremely hot (R58,G0,B0)
    100 to 110F: Excessively hot (R88,G0,B0)
    90 to 100F: Very hot (R192,G0,B0)
    80 to 90F: Hot (R255,B0,G0)
    70 to 80F: Very warm (R255,G192,B0)
    60 to 70F: Warm (R255,G255,B0)
    50 to 60F: Mild (R204,G102,B0)
    40 to 50F: Cool (R146,G208,B80)
    30 to 40F: Chilly (R115,G190,B211)
    10 to 30F: Cold (R0,G112,B192)
    10 to -20F: Very cold (R112,G48,B160)
    -20 to -40F: Bitterly cold (R214,G0,B147)
    Below -40F: Brutally cold (R255,G102,B153)

    Read more: http://www.city-data.com/forum/weather/1620160-your-personal-temperature-colors-descriptors-climate.html#ixzz47Ga7mIFW
    '''

    if data['temp'] < 20:
        poemStructure['temp_descriptor'] = ['biting','frigid','frosty','glacial','icy','numbing','polar','wintry','arctic','bitter','chill','chilled','cutting']
    elif data['temp'] < 40:
        poemStructure['temp_descriptor'] = ['breezy','brisk','cool','crisp','freezing','frosty','icy','wintry','arctic','icebox','sharp','biting','blowy','drafty','fresh','glacial','hawkish','nippy','penetrating','snappy']
    elif data['temp'] < 60:
        poemStructure['temp_descriptor'] = ['mild','moderate','pleasant','refreshing','summerlike','summery','temperate']
    elif data['temp'] < 80:
        poemStructure['temp_descriptor'] = ['balmy','broiling','clement','flushed','glowing','heated','hot','lukewarm','pleasant','snug','summery','sweaty','temperate','thermal','warmish']
    elif data['temp'] < 100:
        poemStructure['temp_descriptor'] = ['baking','blistering','broiling','burning','fiery','hot','red-hot','roasting','scalding','scorching','sizzling','torrid','tropical','warm'] 

    # Wind descriptors found here - http://gyre.umeoce.maine.edu/data/gomoos/buoy/php/variable_description.php?variable=wind_speed
    wind = data['wind']
    if wind < 3:
        poemStructure['wind_descriptor'] = ['smoke rises vertically', 'the air is calm']
        wind = ['calmly','stilly']
    elif wind < 7:
        poemStructure['wind_descriptor'] = ['weather vanes are quiet','smoke drifts calmly']
        wind = ['lightly']
    elif wind < 12:
        poemStructure['wind_descriptor'] = ['small twigs move', 'light flags extend']
        wind = ['gently']
    elif wind < 18:    
        poemStructure['wind_descriptor'] = ['small branches sway','paper blows about']
        wind = ['moderately']
    elif wind < 24:
        poemStructure['wind_descriptor'] = ['trees lazily sway', 'waves are breaking']
        wind = ['freshly']
    elif wind < 31:
        poemStructure['wind_descriptor'] = ['the wind tugs', 'wind rushes', 'umbrellas revolt']
        wind = ['strongly']
    elif wind < 38:
        poemStructure['wind_descriptor'] = ['people walk at acute angles', 'twigs break']
        wind = ['gusting']
    else:
        poemStructure['wind_descriptor'] = ['feels like a hurricane', 'trees are falling']
        wind = ['severely', 'violently']
    
    return {}
Example #57
for i in range(20):
	random_ballet = story_ballets[randint(0,len(story_ballets)-1)]
	section = len(random_ballet) / 20
	random_line = random_ballet[randint(i*section, (i+1)*section)]

	random_line = random_line.decode('utf-8')
	blob = TextBlob(random_line)
	nouns = []
	verbs = []
	adjs = []
	advs = []
	prps = []
	for word,pos in blob.tags:
		#print word, pos
		if pos == "NNS" or pos == "NNP" or pos == "NN" or pos == "PRP":
			w = Word(word.lower())
			nouns.append(w.lemmatize())
		elif pos == "VBZ" or pos == "VBG" or pos == "VBD":
			w = Word(word)
			verbs.append(w.lemmatize())
			#print w.lemmatize(word)
		elif pos == "JJ":
			adjs.append(word)
		elif pos == "RB":
			advs.append(word)
		elif pos == "PRP$":
			prps.append(word)

	loop = True
	search_for = []
	print random_line
#!/usr/bin/python

from textblob import Word
from textblob.wordnet import NOUN
word = Word("plant")
print word.get_synsets(NOUN)
from textblob import Word
import sys

string_tocheck = Word(sys.argv[1])
print string_tocheck.spellcheck()[0][0]
Example #60
#archFreq=open('Frecuencia.txt','w')
#archTF_IDF=open('TF_IDF.txt','w')
#archDoc=open('Documentos.txt','w')

lineas=archLectura.readlines()
#print "Read Line: %s" % (lineas)

lineaTotal=""
for linea in lineas:
    lineaTotal+=linea
blob=TextBlob(lineaTotal.decode('utf-8'))



#split into words
words=blob.words
#print words
#remove the stop words
#apply Snowball stemming
comentarios=[]
stemmer = SnowballStemmer("spanish")
for word in words:    
    if word not in (stopwords.words('spanish')):  # remove stop words
        w=Word(word)        
        comentarios.append(stemmer.stem(w.lower()))


print "Nuevo: %s " % (comentarios)

#print stopwords.words('spanish')