def sanitize_string_for_search(text_string, find_synonyms=True):
    """Normalize a raw text string into a cleaned, stemmed form for Solr indexing/search.

    Pipeline: lowercase/decode -> strip control chars and HTML entities ->
    drop internet acronyms and short words -> remove English stopwords ->
    tokenize -> stem (optionally expanding synonyms first).

    :param text_string: raw byte string to sanitize
                        (NOTE(review): ``.decode('utf-8')`` assumes a Python 2
                        byte string; an already-decoded unicode input or
                        non-UTF-8 bytes would raise here — confirm callers)
    :param find_synonyms: when True, delegate to ``find_synonyms_and_stem``
                          (sibling helper) instead of plain Porter stemming
    :return: the sanitized, stemmed string
    """
    porter = stem.porter.PorterStemmer()

    # Avoid encoding problems when adding to Solr.
    text_string = text_string.strip().lower().decode('utf-8')
    # Avoid errors when adding to Solr.
    text_string = sw_utils.remove_control_characters(text_string)
    # Strip HTML entities.
    text_string = sw_utils.remove_ascii_codes_from_string(text_string)
    # Remove internet acronyms (e.g. chat abbreviations).
    text_string = sw_utils.remove_internet_acronyms_from_string(text_string)
    # Remove short words.
    text_string = sw_utils.remove_small_words_from_string(text_string)

    # Apply English stopword removal. The pattern is rebuilt per call;
    # acceptable since stopwords.words() is itself a runtime lookup.
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    text_string = pattern.sub('', text_string)

    string_tokens = nltk.word_tokenize(text_string)
    if find_synonyms:
        text_string = find_synonyms_and_stem(string_tokens)
    else:
        # Plain Porter stemming of each token.
        text_string = " ".join([porter.stem(kw) for kw in string_tokens])
    return text_string
def join_tweet_texts(tweets): print strftime("%Y-%m-%d %H:%M:%S", gmtime()) + " Juntando texto de los tweets..." text = '' for tweet in tweets: text += sw_utils.remove_internet_acronyms_from_string(tweet[u'content_stemmed']) + ' . ' return text