def clean_string(cls, s):
    try:
        assert (isinstance(s, str) and len(s) > 0), 'The string is invalid'
        cucco = Cucco()
        normalizations = [
            'remove_extra_white_spaces',
            'remove_accent_marks',
            ('replace_symbols', {'replacement': '_'}),
            ('replace_emojis', {'replacement': ''}),
            ('replace_urls', {'replacement': ''}),
        ]
        new_s = cucco.normalize(s, normalizations).lower().rstrip('-_')
        return True, new_s
    except AssertionError as e:
        logger.exception(str(e))
        return False, None
    except Exception as e:
        logger.exception('Error while cleaning string {}'.format(str(e)))
        return False, None
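A quick way to exercise clean_string as defined above; a minimal sketch that supplies the module-level logger the snippet assumes, and passes None for cls since the body never uses it:

import logging
from cucco import Cucco

logger = logging.getLogger(__name__)

# clean_string expects a class as its first argument; None works here because
# the body never touches `cls`.
ok, cleaned = clean_string(None, '  Héllo,   Wörld!  🙂  https://example.com ')
print(ok, cleaned)  # True and the lower-cased, de-accented, emoji/URL-free string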
def get_page_sentences(url):
    paragraphs_normalized = []
    normEsp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    paragraphs = soup.find_all('p')
    stripped_paragraph = [tag.get_text().strip() for tag in paragraphs]
    for sentence in stripped_paragraph:
        paragraphs_normalized.append(normEsp.normalize(sentence, norms))
    return paragraphs_normalized
def normalize2(text):
    from cucco import Cucco
    from cucco.config import Config
    import re

    text = text.lower()
    cucco_config = Config()
    cucco_config.language = detect_language(text)
    if cucco_config.language in ('es', 'en', 'fr'):
        cucco = Cucco(config=cucco_config)
        normalizations = [
            'remove_stop_words',
            # 'remove_accent_marks',  # french accents
            ('replace_hyphens', {'replacement': ' '}),
            ('replace_symbols', {'replacement': ' '}),
            ('replace_punctuation', {'replacement': ' '}),
            'remove_extra_white_spaces',
        ]
    else:
        cucco = Cucco()
        normalizations = [
            # 'remove_stop_words',  # not an identified language
            # 'remove_accent_marks',  # french accents
            ('replace_hyphens', {'replacement': ' '}),
            ('replace_symbols', {'replacement': ' '}),
            ('replace_punctuation', {'replacement': ' '}),
            'remove_extra_white_spaces',
        ]
    text = cucco.normalize(text, normalizations)
    text = re.sub(r'(\d+)%', '%', text)  # convert percentages to %
    text = re.sub(r'(\d+)', '#', text)   # convert numbers to #
    # text = re.sub('#(?P<word>([a-zA-Z])+)', '\g<word>', text)  # remove numbers before and after strings
    # text = re.sub('(?P<word>([a-zA-Z])+)#', '\g<word>', text)  # remove numbers before and after strings
    text = text.split()
    # text = [w for w in text if (len(w) > 2 and len(w) < 20)]  # remove short and very long words
    text = ' '.join(text)
    return text
def get_page_sentences(url):
    stripped_sentences, final_sentences = ([] for i in range(2))
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    list_paragraphs = soup.find_all('p')
    stripped_sentences = [tag.get_text().strip() for tag in list_paragraphs]
    norm_esp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    for sentence in stripped_sentences:
        if len(sentence) > 0:
            final_sentences.append(norm_esp.normalize(sentence, norms))
    return final_sentences
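A small driver for the get_page_sentences variants above; a sketch in which the URL is only a placeholder and the imports the snippets assume are spelled out:

import requests
from bs4 import BeautifulSoup
from cucco import Cucco

sentences = get_page_sentences('https://example.com/some-article')
for s in sentences[:5]:
    print(s)  # normalized paragraph text, punctuation and extra spaces removed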
def normalize_df_headers(df):
    norm_esp = Cucco()
    headers = df.columns
    normalized_headers = []
    for header in headers:
        normalized_header = norm_esp.normalize(
            str.lower(header).replace(' ', '')).replace('–', '_')
        normalized_headers.append(
            normalized_header if len(normalized_header) != 0 else str.lower(header))
    df.columns = normalized_headers
    return df
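For context, a minimal sketch of running normalize_df_headers on a small pandas DataFrame; the column names here are invented purely for illustration:

import pandas as pd
from cucco import Cucco

df = pd.DataFrame({'First Name': ['Ana'], 'País – Región': ['ES']})
df = normalize_df_headers(df)
print(list(df.columns))  # lower-cased, space-free headers, '–' mapped to '_'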
def preprocessing(doc_set):
    print("Iniciando preprocesamiento...")
    tokenizer = RegexpTokenizer(r'\w+')
    es_stop = get_stop_words('es')
    es_stop.append(u'rt')
    es_stop.append(u'RT')
    es_stop.append(u'Rt')
    normEsp = Cucco(language='es')
    norms = [
        'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces',
        'remove_accent_marks'
    ]
    stemmer = SnowballStemmer('spanish')
    #stemmer = Stemmer.Stemmer('spanish')
    out_set = []
    for doc in doc_set:
        doc = normEsp.normalize(doc, norms)
        raw = doc.lower()
        tokens = tokenizer.tokenize(raw)
        stooped_tokens = [i for i in tokens if i not in es_stop]
        #stemmer_words = stemmer.stemWords(stooped_tokens)
        stemmer_words = [parse(s, lemmata=True) for s in stooped_tokens]
        stemmer_words = [a[4] for a in [b.split("/") for b in stemmer_words]]
        #stemmer_words = []
        #for word in stooped_tokens:
        #    #stemmer_words.append(stemmer.stem(word))
        #    stemmer_words.append(word)
        out_set.append(stemmer_words)
    dictionary = corpora.Dictionary(out_set)  # dictionary mapping each word to an id
    corpus = [dictionary.doc2bow(doc) for doc in out_set]
    #print(corpus[0])  # prints the bag of words: (termID, termFrequency) tuples for document 0
    #print(corpus[1])
    print("Done")
    return dictionary, corpus, out_set
def load_sentences(list_urls):
    paragraphs_normalized = []
    token_paragraphs = []
    normEsp = Cucco()
    norms = ['replace_punctuation', 'remove_extra_whitespaces']
    for i in range(len(list_urls)):
        url = list_urls[i]
        soup = BeautifulSoup(requests.get(url).text, "lxml")
        # headline = soup.find('h1').get_text()
        paragraphs = soup.find_all('p')
        stripped_paragraph = [tag.get_text().strip() for tag in paragraphs]
        for sentence in stripped_paragraph:
            paragraphs_normalized.append(normEsp.normalize(sentence, norms))
    for j in paragraphs_normalized:
        token_paragraphs.append(word_tokenize(j))
    return token_paragraphs
def Wordcloud(listaTexto, nombreArchivo, termino):
    # Normalize the text
    cucco = Cucco()
    text_tweets = ''
    for x in listaTexto:
        text_tweets += cucco.normalize(str(x)) + ' '
    stopwords_spa = stopwords.words('spanish')
    # Tokenize tweets
    tokenized_words_tweets = word_tokenize(text_tweets)
    words_tweets = [
        word.lower() for word in tokenized_words_tweets
        if (len(word) > 3 and word.lower() != termino.lower()
            and word.lower() not in termino.lower())
    ]
    texto_words_tweets = [word for word in words_tweets if word not in stopwords_spa]
    '''# NER (disabled)
    java_path = JavaPath()
    os.environ['JAVAHOME'] = java_path
    _model_filename = ModelPath()
    _path_to_jar = JarPath()
    st = StanfordNERTagger(model_filename=_model_filename, path_to_jar=_path_to_jar)
    classified_text_tweets = st.tag(texto_words_tweets)
    dict_tweets = dict()
    for element in classified_text_tweets:
        if element[1] != 'O':
            if element[0] in dict_tweets:
                dict_tweets[element[0]] += 1
            else:
                dict_tweets[element[0]] = 1
    sorted(dict_tweets.items(), key=operator.itemgetter(1), reverse=True)[0:10]'''
    wordcloud = WordCloud(max_font_size=50, max_words=200,
                          background_color="white").generate(
                              str(texto_words_tweets).replace("'", ""))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.savefig('media/' + nombreArchivo + '.jpg')
    plt.close()
def test_remove_stop_words(self, request):
    for after, before, _, kwargs, message in self._tests_generator(request.node.name):
        assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    # Force language
    self._cucco = Cucco()
    for after, before, _, kwargs, message in self._tests_generator(request.node.name):
        kwargs['language'] = 'en'
        assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    # Force invalid language
    self._cucco = Cucco()
    for after, before, _, kwargs, message in self._tests_generator(request.node.name):
        kwargs['language'] = 'invalid'
        assert self._cucco.remove_stop_words(before, **kwargs) == before, message

    # Test lazy load
    self._cucco = Cucco(lazy_load=True)
    for after, before, _, kwargs, message in self._tests_generator(request.node.name):
        kwargs['language'] = 'en'
        assert self._cucco.remove_stop_words(before, **kwargs) == after, message
def load_data(filepath):
    captions = []
    tags = []
    zipped = ()
    cucco = Cucco()
    with open(filepath, 'r+') as file:
        doc = file.read()
        doc = json.loads(doc)
        for obj in doc:
            for post in doc[obj]:
                hashtags = doc[obj][post]['tags']
                if len(hashtags) > 0:
                    capt = [cucco.replace_emojis(
                        str(doc[obj][post]['caption']).lower(), '')]
                    tags += hashtags
                    cap = capt * len(hashtags)
                    captions += cap
    return captions, tags
def remove_stop_words(sentence):
    normEng = Cucco(language='en')
    normEsp = Cucco(language='es')
    norms = [
        'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces'
    ]
    sent = normEng.normalize(sentence, norms)
    return normEsp.normalize(sent, norms)
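A quick sketch of calling the bilingual helper above on a mixed English/Spanish sentence; the exact output depends on the installed cucco version's stop-word lists:

from cucco import Cucco

print(remove_stop_words('This is the house de la casa, with extra   spaces!'))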
class EngPreprocessing:

    def __init__(self):
        self.norm_eng = Cucco(language='en')
        self.norm_ops = ['replace_punctuation', 'remove_extra_whitespaces']

    def process(self, sentences):
        result = []
        for sentence in sentences:
            print('preprocessing sentence: ', sentence)
            expand_contraction = self.__expand_contraction(sentence.lower())
            steamming = self.__steaming(expand_contraction)
            remove_number = self.__remove_number(steamming)
            normalising = self.__normalise(remove_number)
            result.append(normalising)
        return result

    def __expand_contraction(self, sentence):
        def replace(match):
            return eng_cList[match.group(0)]
        return eng_c_re.sub(replace, sentence)

    def __steaming(self, sentence):
        return ' '.join(
            lemEng.Sentence(lemEng.parse(sentence, lemmata=True)).lemmata)

    def __remove_number(self, sentence):
        """
        Removes all numbers from strings, both alphabetic (in English) and numeric.
        Intended to be part of a text normalisation process.
        If the number contains 'and' or commas, these are left behind on the
        assumption the text will be cleaned further to remove punctuation and
        stop-words.
        """
        query = sentence.replace('-', ' ').lower().split(' ')
        resultwords = [word.strip() for word in query if word not in eng_nums]
        noText = ' '.join(resultwords)
        noNums = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", r" ", noText)  # remove numeric numbers
        noNums = re.sub(r"\s\s+", " ", noNums)  # collapse runs of whitespace into a single space
        return noNums

    def __normalise(self, sentence):
        return self.norm_eng.normalize(text=sentence, normalizations=self.norm_ops)
def text_processor(
        language='en',
        num=False,
        lower=False,
        level='token',
        normalize=True,
        max_len=None,
        min_len=0,
):
    normalizations = [('replace_emails', {'replacement': '<email>'}),
                      ('replace_emojis', {'replacement': '<emoji>'}),
                      ('replace_urls', {'replacement': '<url>'})]
    normalizer = None
    try:
        from normalizr import Normalizr
        normalizer = Normalizr().normalize
    except ImportError:
        try:
            from cucco import Cucco
            normalizer = Cucco().normalize
        except ImportError:
            warnings.warn(
                "Try installing normalizr or cucco for better normalization")

    NUM = re.compile('[0-9]+')

    def processor(sent):
        if normalize and normalizer is not None:
            sent = normalizer(sent, normalizations)
        if num:
            sent = NUM.sub('<num>', sent)  # number substitution
        if lower:
            sent = sent.lower()  # downcase
        sent = segmenter(sent, level=level)
        if len(sent) <= min_len:
            return None
        if max_len is not None and len(sent) > max_len:
            return sent[:max_len]
        else:
            return sent

    return processor
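A rough usage sketch for text_processor. It assumes the enclosing module already imports re and warnings and defines a segmenter; here a trivial whitespace segmenter stands in for the real one, purely for illustration:

import re
import warnings

def segmenter(sent, level='token'):
    # Stand-in for the module's real segmenter: plain whitespace tokenization.
    return sent.split() if level == 'token' else [sent]

process = text_processor(num=True, lower=True)
print(process('Email me at someone@example.com about the 3 tickets'))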
def clean_str_for_update(s):
    '''
    Make all characters lower case.
    Replace all spaces and special characters with '_'.
    Drop all trailing '-' and '_' characters.
    Add the character 'x' if the string starts with number(s).
    '''
    try:
        assert (isinstance(s, str) and len(s) > 0), 'The column name is invalid'
        cucco = Cucco()
        normalizations = [
            'remove_extra_white_spaces',
            'remove_accent_marks',
            ('replace_characters', {'characters': ['-', '*'], 'replacement': ''})
        ]
        new_s = cucco.normalize(s, normalizations).strip('-_\n ')
        return new_s
    except Exception as e:
        logging.exception('Error while cleaning column name')
        raise
def normalize(text):
    """
    Normalize the input text. Intended steps:
    - remove stop words (nltk.corpus.stopwords.words() gives a list of stop words)
    - markup cleaning / new lines / punctuation and " (string.punctuation) / 's
    - lower-case / names? / numbers changed to another character (e.g. #)
    - stemming (normalizing), e.g. nltk.stem.porter.PorterStemmer()
    - remove tokens if length <= 3 or >= 20 or containing http
    - roman numerals / bullet points / text+number or the reverse / ' / -

    For normalizing, only the essentials are needed: stop words (overrepresented),
    lower-casing, numbers replaced by #, punctuation.
    REMOVE PREPOSITIONS IN ALL LANGUAGES.

    :param text:
    :return:
    """
    from cucco import Cucco
    import re

    cucco = Cucco()
    text = text.lower()
    text = cucco.normalize(text)
    text = re.sub(r'(\d+)%', '%', text)  # convert percentages to %
    text = re.sub(r'(\d+)', '#', text)   # convert numbers to #
    text = re.sub('(•|“|‘|”|’s|(.|)’)', "", text)
    # remove bullet points in lists and “‘”
    # remove the English possessive ('s) and its
    # remove French l’ and d’ or ‘key
    # e.g. Mr.Ging > mrging, 'genderbasedviolence', ascertain, iraniansupported,
    # fuelefficient, logisticsrelated
    # 19 471 no in 780 996 00 10pm a as 425 abovementioned avenirstrongp genderrelated
    # word_counts still contains numbers and short words
    text = re.sub(r'#(?P<word>([a-zA-Z])+)', r'\g<word>', text)  # remove numbers before strings
    text = re.sub(r'(?P<word>([a-zA-Z])+)#', r'\g<word>', text)  # remove numbers after strings
    text = text.split()
    text = [w for w in text if (len(w) > 2 and len(w) < 20)]  # remove short and very long words
    text = ' '.join(text)
    return text
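A small illustration of the substitutions performed by this normalize variant; a sketch only, since the exact output also depends on cucco's default normalizations:

sample = 'In 2019, 45% of the teams shipped 3 projects • “great” results'
print(normalize(sample))
# digits become '#' or '%', bullets and curly quotes are stripped, short tokens dropped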
#from Skylar.models import Flow
#from Skylar.utils.utils import format_message
import os

import spacy
from ai.model.keras_similarr import keras_similar
from ai.model.utils.feature_extractor import extract_features
from ai.model.utils.nltk_util import mark_negation
from ai.model.utils.qclassifier import Qclassifier
from ai.model.utils.spelling.spelling import Spelling
from ai.skysentiment import get_sentiment_values_2 as get_sentiment_values
from sematch.semantic.similarity import WordNetSimilarity
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from cucco import Cucco

normalizr = Cucco()
normalizations = ['remove_extra_white_spaces', 'replace_punctuation',
                  'replace_symbols', 'remove_accent_marks']


class fmodel(object):

    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell = Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en = []
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'utils', 'stopwords_en.txt')) as f:
from cucco import Cucco

cucco = Cucco()
print(cucco.normalize('Who let the cucco out?'))
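The same call can take an explicit list of normalizations instead of relying on the defaults; a minimal sketch:

from cucco import Cucco

cucco = Cucco()
norms = ['remove_stop_words', ('replace_punctuation', {'replacement': ''})]
print(cucco.normalize('Who let the cucco out?', norms))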
number_rows = len(urls_list)
number_cols = 3
array_data = np.empty(shape=(number_rows, number_cols), dtype='object')
for i in range(number_rows):
    url_second = urls_list[i]
    array_data[i][0] = url_second
    p_tags = BeautifulSoup(requests.get(url_second).text).find_all('p')
    array_data[i][1] = BeautifulSoup(
        requests.get(url_second).text).find_all('h1')
    array_data[i][2] = [tag.get_text().strip() for tag in p_tags]

# save to csv if wanted
# pd.DataFrame(array_data).to_csv("HypothesisA.csv")

# Cleaning the text
normEsp = Cucco()
norms = ['replace_punctuation', 'remove_extra_whitespaces']
new_stopwords = set(
    stopwords.words('spanish')) - {'ella', 'ellas', 'una', 'unas', 'él'}
for i in range(number_rows):
    p_tags_text = [
        normEsp.normalize(sentence, norms) for sentence in array_data[i][2]
    ]
    espTokens = [word_tokenize(text) for text in p_tags_text]
    flatList = [word for sentList in espTokens for word in sentList]
    filtered = [word for word in flatList if word not in new_stopwords]
    array_data[i][2] = filtered

espFreq = FreqDist(word for word in array_data[0][2])
for word, frequency in espFreq.most_common(20):
    print(u'{}: {}'.format(word, frequency))
import re

from cucco import Cucco

_CUCCO = Cucco()
NORMALIZATIONS = ['remove_extra_white_spaces']


def normalize(text: str) -> str:
    """
    Text normalization.

    >>> normalize("ООО 'ВЫМПЕЛКОМ' ")
    "ООО 'ВЫМПЕЛКОМ'"
    >>> normalize('ЗАО "ЮВЕЛИРНЫЙ завод')
    'ЗАО "ЮВЕЛИРНЫЙ завод'
    >>> normalize("ОАО 'ЁЛКИ и ПАЛКИ' ")
    "ОАО 'ЁЛКИ и ПАЛКИ'"
    >>> normalize('Столовая №1')
    'Столовая №1'

    :param text: some hand typed text
    :return: normalized text
    """
    return _CUCCO.normalize(text, NORMALIZATIONS)


def company_name_normalization(name: str) -> str:
    """
    Company name normalization
def setup_method(self):
    self._cucco = Cucco()
class TestCucco(object):

    _cucco = None

    @staticmethod
    def _tests_generator(test):
        for test in TESTS_DATA['tests'][test[5:]]:
            yield (test['after'], test['before'],
                   test['characters'] if 'characters' in test else '',
                   test['kwargs'] if 'kwargs' in test else dict(),
                   test['message'])

    def setup_method(self):
        self._cucco = Cucco()

    def test_normalize(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.normalize(before, **kwargs) == after, message

    def test_remove_accent_marks(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_accent_marks(before) == after, message

    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before, **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    def test_replace_characters(self, request):
        for after, before, characters, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_characters(
                text=before, characters=characters, **kwargs) == after, message

    def test_replace_emails(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emails(text=before, **kwargs) == after, message

    def test_replace_emojis(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emojis(text=before, **kwargs) == after, message

    def test_remove_extra_whitespaces(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_extra_whitespaces(before) == after, message

    def test_replace_hyphens(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_hyphens(text=before, **kwargs) == after, message

    def test_replace_punctuation(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_punctuation(text=before, **kwargs) == after, message

    def test_replace_symbols(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_symbols(text=before, **kwargs) == after, message

    def test_replace_urls(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_urls(text=before, **kwargs) == after, message
def searchTweets(query):
    db = firestore.client()
    maxCount = 100
    max_id = -1
    count = 0
    obj = {
        query: {
            "regioes": {
                "Norte": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0},
                "Nordeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0},
                "Centro-Oeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0},
                "Sul": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0},
                "Sudeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0}
            }
        }
    }
    other_obj = {
        "regioes": {
            "Norte": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0, "count": 0},
            "Nordeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0, "count": 0},
            "Centro-Oeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0, "count": 0},
            "Sul": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0, "count": 0},
            "Sudeste": {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0, "count": 0}
        }
    }
    users_ref = db.collection(query)
    docs = users_ref.stream()
    jsonT = ""
    for doc in docs:
        jsonT = doc.to_dict()["porcentagem"]
    if jsonT == "":
        while count < maxCount:
            if max_id <= 0:
                searched_tweets = api.search(q=query + " -filter:retweets",
                                             lang="pt-br",
                                             tweet_mode='extended',
                                             count=maxCount * 5)
            else:
                searched_tweets = api.search(q=query + " -filter:retweets",
                                             lang="pt-br",
                                             tweet_mode='extended',
                                             count=maxCount * 5,
                                             max_id=str(max_id - 1))
            if not searched_tweets:
                print("tem nada aq mona")
                break
            else:
                for tweet in searched_tweets:
                    if (tweet.place is not None) and (count < maxCount):
                        text = json.dumps(tweet._json['full_text'], sort_keys=True,
                                          indent=4,
                                          ensure_ascii=False).encode('utf8').decode()
                        finalText = text.split(" ")
                        text = ""
                        for aux in finalText:
                            if '@' not in aux and 'https://' not in aux:
                                text += aux + " "
                        count += 1
                        # replace_emojis is an instance method, so instantiate Cucco
                        text = Cucco().replace_emojis(text)
                        text = text.replace('"', '')
                        municipio = (json.dumps(tweet._json['place']['full_name'],
                                                sort_keys=True, indent=4,
                                                ensure_ascii=False).encode('utf8')
                                     ).decode().split(",")[0].replace('"', "")
                        try:
                            if municipio == 'Sao Paulo':
                                municipio = 'São Paulo'
                            regiao = regioes.getRegion(ufbr.get_cidade(municipio).codigo)
                            em = classify(text)
                            other_obj["regioes"][regiao][em] += 1
                            other_obj["regioes"][regiao]["count"] += 1
                        except Exception as identifier:
                            count -= 1
                max_id = searched_tweets[-1].id
        arr_reg = ["Norte", "Nordeste", "Centro-Oeste", "Sul", "Sudeste"]
        arr_emo = ["tristeza", "alegria", "amor", "raiva"]
        for i in arr_reg:
            for j in arr_emo:
                total = other_obj["regioes"][i]["count"]
                if total == 0:
                    obj[query]["regioes"][i][j] = 0
                else:
                    obj[query]["regioes"][i][j] = round(
                        (other_obj["regioes"][i][j] / total) * 100, 2)
        db.collection(query).add({
            "tweets_classificados": json.dumps(other_obj),
            "porcentagem": json.dumps(obj)
        })
        objs = [obj, other_obj]
        return objs
    else:
        users_ref = db.collection(query)
        docs = users_ref.stream()
        jsonP = ""
        for doc in docs:
            jsonP = doc.to_dict()["porcentagem"]
            jsonT = doc.to_dict()["tweets_classificados"]
        arr = [json.loads(jsonP), json.loads(jsonT)]
        return arr
import glob
import errno

from cucco import Cucco

cucco = Cucco()
positive_files = './pos/*.txt'
negative_files = './neg/*.txt'
normalizations = [
    'remove_accent_marks', 'remove_extra_whitespaces', 'remove_stop_words',
    'replace_characters', 'replace_emails', 'replace_emojis',
    'replace_hyphens', 'replace_punctuation', 'replace_symbols',
    'replace_urls'
]
iterations = 0
files = glob.glob(negative_files)
for name in files:
    with open(name, "r+") as f:
        text = f.read().decode('utf8')
        words = text.split(" ")
        a = [word for word in words if '@'.decode('utf-8') not in word]
        for i, word in enumerate(a):
            # undo HTML entity escaping
            a[i] = a[i].replace("&amp;", "&")
            a[i] = a[i].replace("&lt;", "<")
            a[i] = a[i].replace("&gt;", ">")
            a[i] = a[i].replace("&quot;", '"')
        output = ' '.join(a)
        normalized_out = cucco.normalize(output, normalizations)
        print(normalized_out)
def get_tasks(task_id):
    abc = []
    graph = facebook.GraphAPI(access_token=token, version=3.1)
    node = "/%s" % task_id
    video = graph.request(
        node + "/comments?fields=id,message,comment_count,"
        "reactions.type(LIKE).limit(0).summary(total_count).as(like),"
        "reactions.type(LOVE).limit(0).summary(total_count).as(love),"
        "reactions.type(WOW).limit(0).summary(total_count).as(wow),"
        "reactions.type(HAHA).limit(0).summary(total_count).as(haha),"
        "reactions.type(SAD).limit(0).summary(total_count).as(sad),"
        "reactions.type(ANGRY).limit(0).summary(total_count).as(angry)")
    # video = graph.request(node + '?fields='
    #                       'reactions.type(LIKE).limit(0).summary(total_count).as(like),'
    #                       'reactions.type(LOVE).limit(0).summary(total_count).as(love),'
    #                       'reactions.type(WOW).limit(0).summary(total_count).as(wow),'
    #                       'reactions.type(HAHA).limit(0).summary(total_count).as(haha),'
    #                       'reactions.type(SAD).limit(0).summary(total_count).as(sad),'
    #                       'reactions.type(ANGRY).limit(0).summary(total_count).as(angry)')

    # Wrap this block in a while loop so we can keep paginating requests until
    # finished.

    # Read the datasets
    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"), "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"), "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"), "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    # Filter away words that are less than 3 letters to form the training data
    dataku = []
    for (words, sentiment) in (joy_feel + disgust_feel + sadness_feel +
                               anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    lines = []
    labels = []
    for words, sentiment in dataku:
        html_parser = HTMLParser()
        lines.append(html_parser.unescape(words))
        labels.append(sentiment)

    headlines, labels = lines, labels
    pipeline = Pipeline([
        (
            "count_vectorizer",
            CountVectorizer(
                ngram_range=(2, 3),
                min_df=1,
                max_df=0.8,
                stop_words=frozenset([
                    "saya", "sedang", "lagi", "adalah", "di", "dari", "karena",
                    "dan", "dengan", "ke", "yang", "untuk", "itu", "orang",
                ]),
            ),
        ),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)

    angerx = 0
    joyx = 0
    surprisex = 0
    sadnessx = 0
    fearx = 0
    disgustx = 0

    while True:
        try:
            # print("Get post comments data :")
            for each_video in video["data"]:
                if each_video["message"] != "":
                    # connect to database
                    init_tag()
                    html_parser = HTMLParser()
                    spell_check = jalanSpellCheck()
                    koreksi_slang = slangWordCorrect()
                    cucco = Cucco()
                    kata = cucco.replace_emojis(each_video["message"])
                    # Unescape HTML entities
                    kata = html_parser.unescape(each_video["message"])
                    kata = " ".join(kata.split())
                    # Remove emojis
                    kata = cucco.replace_emojis(kata)
                    normalizations = ["remove_extra_white_spaces"]
                    # Remove extra spaces
                    kata = cucco.normalize(kata, normalizations)
                    kata = kata.replace("/", " ")
                    # Convert to lowercase
                    kata = kata.lower()
                    # Collapse characters repeated more than twice
                    kata = re.sub(r"(.)\1+", r"\1\1", kata)
                    # Reduce the remaining doubled punctuation
                    kata = kata.replace("..", ".")
                    kata = kata.replace(",,", ",")
                    kata = kata.replace("!!", "!")
                    kata = kata.replace("??", "?")
                    # Add a space after a full stop
                    rx = r"\.(?=\S)"
                    kata = re.sub(rx, ". ", kata)
                    # Slang correction
                    kata = koreksi_slang.jalan(kata)
                    # Spellcheck error
                    # tampung_kata_1 = []
                    # tampung_1 = kata.split()
                    # for word in tampung_1:
                    #     tampung_kata_1.append(spell_check.correctSpelling(word))
                    # kata = " ".join(tampung_kata_1)
                    asdqwe = kata
                    # Check whether there is punctuation at the end
                    if (re.match(".*[^.?!]$", kata) is not None) == True:
                        kata = kata + " ."
                    resultx = do_tag(kata)
                    kata = " ".join(resultx)
                    # print(words)
                    # xxx = "".join([" " + i for i in words]).strip()
                    # kata = xxx
                    if kata != "":
                        linesz = []
                        linesz.append(kata)
                        words = []
                        for y in linesz:
                            lines = y.split()
                            for x in lines:
                                word = x.split("/")
                                chars_to_remove = set((
                                    ",", "IN", "CC", "SC", "CDO", "CDC", "CDP",
                                    "CDI", "DT", "MD", "OP", "CP", "SYM", ".",
                                ))
                                if word[1] not in chars_to_remove:
                                    words.append(word[0] + "_" + word[1])
                        resultx = "".join([" " + i for i in words]).strip()
                        # print(resultx)
                        cobaa = []
                        cobaa.append(resultx)
                        for x in pipeline.predict(cobaa):
                            hasilx = x
                        if hasilx == "anger":
                            angerx = angerx + 1
                        elif hasilx == "joy":
                            joyx = joyx + 1
                        elif hasilx == "sadness":
                            sadnessx = sadnessx + 1
                        elif hasilx == "fear":
                            fearx = fearx + 1
                        elif hasilx == "disgust":
                            disgustx = disgustx + 1
                        elif hasilx == "surprise":
                            surprisex = surprisex + 1
                        comments_data = {
                            "id": each_video["id"],
                            "komen": each_video["message"],
                            "asdqwe": asdqwe,
                            "komen_edit": resultx,
                            "prediksi": hasilx,
                            "like_count": each_video["like"]["summary"]["total_count"],
                            "love_count": each_video["love"]["summary"]["total_count"],
                            "wow_count": each_video["wow"]["summary"]["total_count"],
                            "haha_count": each_video["haha"]["summary"]["total_count"],
                            "sad_count": each_video["sad"]["summary"]["total_count"],
                            "angry_count": each_video["angry"]["summary"]["total_count"],
                        }
                        abc.append(comments_data)
            # Attempt to make a request to the next page of data, if it exists.
            video = requests.get(video["paging"]["next"]).json()
        except KeyError:
            # When there are no more pages (['paging']['next']), break from the
            # loop and end the script.
            break

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }
    # comments_data = {
    #     'id': video['comment_count'],
    #     'video_like': video['like']['summary']['total_count'],
    #     'video_love': video['love']['summary']['total_count'],
    #     'video_wow': video['wow']['summary']['total_count'],
    #     'video_haha': video['haha']['summary']['total_count'],
    #     'video_sad': video['sad']['summary']['total_count'],
    #     'video_angry': video['angry']['summary']['total_count']
    # }
    # abc.append(comments_data)
    return jsonify({"tasks": abc}, {"ASD": ctrku})
def __init__(self):
    self.norm_eng = Cucco(language='en')
    self.norm_ops = ['replace_punctuation', 'remove_extra_whitespaces']
def remove_emoji(text):
    cucco = Cucco()
    return cucco.replace_emojis(text)
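A one-line check of the wrapper above; a minimal sketch assuming cucco is installed and imported at module level:

from cucco import Cucco

print(remove_emoji('Deploy finished 🎉🚀 all green ✅'))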
Write a Python program for searching and replacing a pattern.

import re

sentence = "This is a phone number 672-123-456-9910"
pattern = r".*(phone).*?([\d-]+)"
match = re.match(pattern, sentence)
match.groups()
match.group()
match.group(0)
match.group(1)
match.group(2)
match.group(1, 2)

Write a Python program for searching and replacing flags.

# This library checks for emojis and replaces them using regular expressions
from cucco import Cucco

cucco = Cucco()
a = cucco.replace_emojis(':) :)) :( FSDFSDDFSDfv')
print(a)

Write the syntax and a simple program for a regular expression pattern in Python.

import re

text = 'You can try to find an ant in this string'
pattern = 'an?\w'
for match in re.finditer(pattern, text):
    sStart = match.start()
    sEnd = match.end()
    sGroup = match.group()
    print('Match "{}" found at: [{},{}]'.format(sGroup, sStart, sEnd))
class TestCucco(object):

    _cucco = None

    @staticmethod
    def _tests_generator(test):
        for test in TESTS_DATA['tests'][test[5:]]:
            yield (test['after'], test['before'],
                   test['characters'] if 'characters' in test else '',
                   test['kwargs'] if 'kwargs' in test else dict(),
                   test['message'])

    def setup_method(self):
        self._cucco = Cucco()

    def test_normalize(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.normalize(before, **kwargs) == after, message

    def test_remove_accent_marks(self, request):
        for after, before, _, _, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_accent_marks(before) == after, message

    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before, **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    def test_replace_characters(self, request):
        for after, before, characters, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_characters(
                text=before, characters=characters, **kwargs) == after, message

    def test_replace_emails(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_emails(text=before, **kwargs) == after, message

    def test_replace_emojis(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_emojis(text=before, **kwargs) == after, message

    def test_remove_extra_white_spaces(self, request):
        for after, before, _, _, message in self._tests_generator(request.node.name):
            assert self._cucco.remove_extra_white_spaces(before) == after, message

    def test_replace_hyphens(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_hyphens(text=before, **kwargs) == after, message

    def test_replace_punctuation(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_punctuation(text=before, **kwargs) == after, message

    def test_replace_symbols(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_symbols(text=before, **kwargs) == after, message

    def test_replace_urls(self, request):
        for after, before, _, kwargs, message in self._tests_generator(request.node.name):
            assert self._cucco.replace_urls(text=before, **kwargs) == after, message
def test(task_id):
    video = []
    conn = create_connection("datafacebook/Kompas/" + str(task_id) + ".db")
    cursor = conn.execute(
        "SELECT comment_id, comment_content, like_count, love_count, wow_count, haha_count, sad_count, angry_count from Comments"
    )
    for row in cursor:
        video.append({
            "id": row[0],
            "message": row[1],
            "like": row[2],
            "love": row[3],
            "wow": row[4],
            "haha": row[5],
            "sad": row[6],
            "angry": row[7],
        })
    conn.close()

    abc = []
    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"), "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"), "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"), "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    dataku = []
    for (words, sentiment) in (joy_feel + disgust_feel + sadness_feel +
                               anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    lines = []
    labels = []
    for words, sentiment in dataku:
        html_parser = HTMLParser()
        lines.append(html_parser.unescape(words))
        labels.append(sentiment)

    headlines, labels = lines, labels
    pipeline = Pipeline([
        (
            "count_vectorizer",
            CountVectorizer(
                ngram_range=(2, 3),
                min_df=1,
                max_df=0.8,
                stop_words=frozenset([
                    "saya", "sedang", "lagi", "adalah", "di", "dari", "karena",
                    "dan", "dengan", "ke", "yang", "untuk", "itu", "orang",
                ]),
            ),
        ),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)

    angerx = 0
    joyx = 0
    surprisex = 0
    sadnessx = 0
    fearx = 0
    disgustx = 0

    for each_video in video:
        if each_video["message"] != "":
            # connect to database
            init_tag()
            html_parser = HTMLParser()
            spell_check = jalanSpellCheck()
            koreksi_slang = slangWordCorrect()
            cucco = Cucco()
            kata = cucco.replace_emojis(each_video["message"])
            # Unescape HTML entities
            kata = html_parser.unescape(each_video["message"])
            kata = " ".join(kata.split())
            # Remove emojis
            kata = cucco.replace_emojis(kata)
            normalizations = ["remove_extra_white_spaces"]
            # Remove extra spaces
            kata = cucco.normalize(kata, normalizations)
            kata = kata.replace("/", " ")
            # Convert to lowercase
            kata = kata.lower()
            # Collapse characters repeated more than twice
            kata = re.sub(r"(.)\1+", r"\1\1", kata)
            # Reduce the remaining doubled punctuation
            kata = kata.replace("..", ".")
            kata = kata.replace(",,", ",")
            kata = kata.replace("!!", "!")
            kata = kata.replace("??", "?")
            # Add a space after a full stop
            rx = r"\.(?=\S)"
            kata = re.sub(rx, ". ", kata)
            # Slang correction
            kata = koreksi_slang.jalan(kata)
            # Spellcheck error
            # tampung_kata_1 = []
            # tampung_1 = kata.split()
            # for word in tampung_1:
            #     tampung_kata_1.append(spell_check.correctSpelling(word))
            # kata = " ".join(tampung_kata_1)
            asdqwe = kata
            # Check whether there is punctuation at the end
            if (re.match(".*[^.?!]$", kata) is not None) == True:
                kata = kata + " ."
            resultx = do_tag(kata)
            kata = " ".join(resultx)
            if kata != "":
                linesz = []
                linesz.append(kata)
                words = []
                for y in linesz:
                    lines = y.split()
                    for x in lines:
                        word = x.split("/")
                        chars_to_remove = set((
                            ",", "IN", "CC", "SC", "CDO", "CDC", "CDP",
                            "CDI", "DT", "MD", "OP", "CP", "SYM", ".",
                        ))
                        if word[1] not in chars_to_remove:
                            words.append(word[0] + "_" + word[1])
                resultx = "".join([" " + i for i in words]).strip()
                cobaa = []
                cobaa.append(resultx)
                for x in pipeline.predict(cobaa):
                    hasilx = x
                if hasilx == "anger":
                    angerx = angerx + 1
                elif hasilx == "joy":
                    joyx = joyx + 1
                elif hasilx == "sadness":
                    sadnessx = sadnessx + 1
                elif hasilx == "fear":
                    fearx = fearx + 1
                elif hasilx == "disgust":
                    disgustx = disgustx + 1
                elif hasilx == "surprise":
                    surprisex = surprisex + 1
                comments_data = {
                    "id": each_video["id"],
                    "komen": each_video["message"],
                    "asdqwe": asdqwe,
                    "komen_edit": resultx,
                    "prediksi": hasilx,
                    "like_count": each_video["like"],
                    "love_count": each_video["love"],
                    "wow_count": each_video["wow"],
                    "haha_count": each_video["haha"],
                    "sad_count": each_video["sad"],
                    "angry_count": each_video["angry"],
                }
                abc.append(comments_data)

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }
    return jsonify({"tasks": abc}, {"ASD": ctrku})
# -*- coding: utf-8 -*-
import sys
import enchant
import os
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
#from pattern.en import parse
from cucco import Cucco

# The first line must contain the name of the file or page
#for line in sys.stdin:
tokenizer = RegexpTokenizer(r'\w+')
cuccoEn = Cucco()
norms = [
    'remove_stop_words', 'replace_punctuation', 'remove_extra_whitespaces',
    'remove_accent_marks'
]
dic = enchant.Dict("en_US")
en_stop = get_stop_words('en')
fileName = os.environ["map_input_file"]
#fileName = "Test.cpp";
raw = None
tokens = []
final_tokens = []
parse_tokens = []
def dialogue_iterator(filename, test=False, raw=False):
    """
    Iterate dialogues in the specified file.

    One may specify whether to read a test dataset (without evaluation scores
    and user types) and to return raw dialogue phrases (without postprocessing).
    """
    cu = Cucco()
    normalizations = [
        'remove_accent_marks',
        ('replace_emojis', {'replacement': ' '}),
        ('replace_hyphens', {'replacement': ''}),
        ('replace_punctuation', {'replacement': ''}),
        ('replace_urls', {'replacement': ' '}),
        'remove_extra_whitespaces'
    ]
    with open(filename) as input_file:
        for r in json.load(input_file):
            if not raw:
                r['context'] = cu.normalize(r['context'])
            # form the thread list
            th_list = []
            for i in r['thread']:
                if not raw:
                    i['text'] = i['text'].rstrip()
                    if not i['text']:
                        continue
                    i['text'] = cu.normalize(i['text'], normalizations)
                    i['text'] = i['text'].lower()
                th_list.append(Thread(i['text'], i['userId'], i.get('time')))
            # if we're dealing with the test dataset, do not return user types
            # and evaluation scores
            if test:
                d = Dialogue(r['context'], r['dialogId'], None, th_list, None)
            else:
                # form the evaluation dictionary
                ev_dict = {}
                for i in r['evaluation']:
                    if i['userId'] == 'Alice':
                        ev_dict['Alice'] = i['quality']
                    elif i['userId'] == 'Bob':
                        ev_dict['Bob'] = i['quality']
                    else:
                        raise ValueError('incorrect user ID')
                # form the user list
                us_dict = {}
                for i in r['users']:
                    if i['id'] == 'Alice':
                        us_dict['Alice'] = i['userType']
                    elif i['id'] == 'Bob':
                        us_dict['Bob'] = i['userType']
                    else:
                        raise ValueError('incorrect user ID')
                d = Dialogue(r['context'], r['dialogId'],
                             Evaluation(ev_dict['Alice'], ev_dict['Bob']),
                             th_list,
                             User(us_dict['Alice'], us_dict['Bob']))
            yield concat_phrases(d)