# Load caption/hashtag pairs from a scraped JSON dump. Each caption is
# repeated once per hashtag so the two returned lists stay aligned.
import json

from cucco import Cucco


def load_data(filepath):
    captions = []
    tags = []
    cucco = Cucco()
    with open(filepath, 'r') as file:
        doc = json.loads(file.read())
        for obj in doc:
            for post in doc[obj]:
                hashtags = doc[obj][post]['tags']
                if len(hashtags) > 0:
                    # Lowercase the caption and strip emojis before pairing.
                    capt = [cucco.replace_emojis(
                        str(doc[obj][post]['caption']).lower(), '')]
                    tags += hashtags
                    captions += capt * len(hashtags)
    return captions, tags
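# A minimal usage sketch for load_data. The nested JSON shape here is an
# assumption inferred from the doc[obj][post]['caption'] / ['tags'] lookups
# above; the real scrape may differ.
if __name__ == '__main__':
    sample = {
        "some_user": {
            "post_1": {
                "caption": "Sunset 😍 at the beach",
                "tags": ["#sunset", "#beach"],
            }
        }
    }
    with open('sample.json', 'w') as f:
        json.dump(sample, f)
    captions, tags = load_data('sample.json')
    # The caption is repeated once per hashtag, so both lists have length 2.
    print(captions, tags)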
# Fetch all comments on a Facebook post via the Graph API, run each one
# through an Indonesian preprocessing + emotion-classification pipeline,
# and return per-comment predictions plus aggregate emotion counts.
import re

import facebook
import requests
from cucco import Cucco
from flask import jsonify
from html.parser import HTMLParser
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Project-local helpers assumed to be in scope: read_dataset, get_full_path,
# init_tag, slangWordCorrect, do_tag, and the page access `token`.


def get_tasks(task_id):
    abc = []
    graph = facebook.GraphAPI(access_token=token, version="3.1")
    node = "/%s" % task_id
    video = graph.request(
        node + "/comments?fields=id,message,comment_count,"
        "reactions.type(LIKE).limit(0).summary(total_count).as(like),"
        "reactions.type(LOVE).limit(0).summary(total_count).as(love),"
        "reactions.type(WOW).limit(0).summary(total_count).as(wow),"
        "reactions.type(HAHA).limit(0).summary(total_count).as(haha),"
        "reactions.type(SAD).limit(0).summary(total_count).as(sad),"
        "reactions.type(ANGRY).limit(0).summary(total_count).as(angry)")

    # Load the labelled emotion datasets.
    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"),
                            "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"),
                              "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"),
                             "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    # Strip trailing whitespace to form the training data.
    dataku = []
    for words, sentiment in (joy_feel + disgust_feel + sadness_feel +
                             anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    html_parser = HTMLParser()
    lines = []
    labels = []
    for words, sentiment in dataku:
        lines.append(html_parser.unescape(words))
        labels.append(sentiment)
    headlines = lines

    pipeline = Pipeline([
        ("count_vectorizer", CountVectorizer(
            ngram_range=(2, 3),
            min_df=1,
            max_df=0.8,
            stop_words=frozenset([
                "saya", "sedang", "lagi", "adalah", "di", "dari", "karena",
                "dan", "dengan", "ke", "yang", "untuk", "itu", "orang",
            ]),
        )),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)

    angerx = joyx = surprisex = sadnessx = fearx = disgustx = 0

    # Keep paginating requests until finished.
    while True:
        try:
            for each_video in video["data"]:
                if each_video["message"] != "":
                    init_tag()
                    koreksi_slang = slangWordCorrect()
                    cucco = Cucco()
                    # Unescape HTML entities and collapse whitespace.
                    kata = html_parser.unescape(each_video["message"])
                    kata = " ".join(kata.split())
                    # Remove emojis.
                    kata = cucco.replace_emojis(kata)
                    # Remove extra whitespace.
                    kata = cucco.normalize(kata, ["remove_extra_white_spaces"])
                    kata = kata.replace("/", " ")
                    # Convert to lowercase.
                    kata = kata.lower()
                    # Collapse characters repeated more than twice.
                    kata = re.sub(r"(.)\1+", r"\1\1", kata)
                    # Collapse the doubled punctuation that remains.
                    kata = kata.replace("..", ".")
                    kata = kata.replace(",,", ",")
                    kata = kata.replace("!!", "!")
                    kata = kata.replace("??", "?")
                    # Add a space after a period that touches the next word.
                    kata = re.sub(r"\.(?=\S)", ". ", kata)
                    # Slang correction. (An optional per-word spell-check
                    # pass was disabled here in the original.)
                    kata = koreksi_slang.jalan(kata)
                    asdqwe = kata
                    # Append a period if there is no closing punctuation.
                    if re.match(r".*[^.?!]$", kata):
                        kata = kata + " ."
                    resultx = do_tag(kata)
                    kata = " ".join(resultx)
                    if kata != "":
                        # Drop punctuation and function-word POS tags and
                        # rewrite the remaining tokens as word_TAG.
                        chars_to_remove = {
                            ",", "IN", "CC", "SC", "CDO", "CDC", "CDP",
                            "CDI", "DT", "MD", "OP", "CP", "SYM", ".",
                        }
                        words = []
                        for x in kata.split():
                            word = x.split("/")
                            if word[1] not in chars_to_remove:
                                words.append(word[0] + "_" + word[1])
                        resultx = " ".join(words)
                        hasilx = pipeline.predict([resultx])[0]
                        if hasilx == "anger":
                            angerx += 1
                        elif hasilx == "joy":
                            joyx += 1
                        elif hasilx == "sadness":
                            sadnessx += 1
                        elif hasilx == "fear":
                            fearx += 1
                        elif hasilx == "disgust":
                            disgustx += 1
                        elif hasilx == "surprise":
                            surprisex += 1
                        comments_data = {
                            "id": each_video["id"],
                            "komen": each_video["message"],
                            "asdqwe": asdqwe,
                            "komen_edit": resultx,
                            "prediksi": hasilx,
                            "like_count": each_video["like"]["summary"]["total_count"],
                            "love_count": each_video["love"]["summary"]["total_count"],
                            "wow_count": each_video["wow"]["summary"]["total_count"],
                            "haha_count": each_video["haha"]["summary"]["total_count"],
                            "sad_count": each_video["sad"]["summary"]["total_count"],
                            "angry_count": each_video["angry"]["summary"]["total_count"],
                        }
                        abc.append(comments_data)
            # Attempt to request the next page of data, if it exists.
            video = requests.get(video["paging"]["next"]).json()
        except KeyError:
            # No more pages (['paging']['next'] is missing): stop.
            break

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }
    return jsonify({"tasks": abc}, {"ASD": ctrku})
# Same pipeline as get_tasks, but reads previously scraped comments from a
# local SQLite database instead of the live Graph API. Uses the same
# imports and project helpers as get_tasks above.
def test(task_id):
    video = []
    conn = create_connection("datafacebook/Kompas/" + str(task_id) + ".db")
    cursor = conn.execute(
        "SELECT comment_id, comment_content, like_count, love_count, "
        "wow_count, haha_count, sad_count, angry_count FROM Comments")
    for row in cursor:
        video.append({
            "id": row[0],
            "message": row[1],
            "like": row[2],
            "love": row[3],
            "wow": row[4],
            "haha": row[5],
            "sad": row[6],
            "angry": row[7],
        })
    conn.close()

    abc = []
    # Load the labelled emotion datasets.
    joy_feel = read_dataset(get_full_path("dataset/cf/pp/filter/joy.txt"),
                            "joy")
    disgust_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/disgust.txt"), "disgust")
    sadness_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/sadness.txt"), "sadness")
    anger_feel = read_dataset(get_full_path("dataset/cf/pp/filter/anger.txt"),
                              "anger")
    fear_feel = read_dataset(get_full_path("dataset/cf/pp/filter/fear.txt"),
                             "fear")
    surprise_feel = read_dataset(
        get_full_path("dataset/cf/pp/filter/surpriseExtra.txt"), "surprise")

    dataku = []
    for words, sentiment in (joy_feel + disgust_feel + sadness_feel +
                             anger_feel + fear_feel + surprise_feel):
        dataku.append((words.rstrip(), sentiment))

    html_parser = HTMLParser()
    lines = []
    labels = []
    for words, sentiment in dataku:
        lines.append(html_parser.unescape(words))
        labels.append(sentiment)
    headlines = lines

    pipeline = Pipeline([
        ("count_vectorizer", CountVectorizer(
            ngram_range=(2, 3),
            min_df=1,
            max_df=0.8,
            stop_words=frozenset([
                "saya", "sedang", "lagi", "adalah", "di", "dari", "karena",
                "dan", "dengan", "ke", "yang", "untuk", "itu", "orang",
            ]),
        )),
        ("tfidf_transformer", TfidfTransformer()),
        ("classifier", MultinomialNB()),
    ])
    pipeline.fit(headlines, labels)

    angerx = joyx = surprisex = sadnessx = fearx = disgustx = 0

    for each_video in video:
        if each_video["message"] != "":
            init_tag()
            koreksi_slang = slangWordCorrect()
            cucco = Cucco()
            # Unescape HTML entities and collapse whitespace.
            kata = html_parser.unescape(each_video["message"])
            kata = " ".join(kata.split())
            # Remove emojis.
            kata = cucco.replace_emojis(kata)
            # Remove extra whitespace.
            kata = cucco.normalize(kata, ["remove_extra_white_spaces"])
            kata = kata.replace("/", " ")
            # Convert to lowercase.
            kata = kata.lower()
            # Collapse characters repeated more than twice.
            kata = re.sub(r"(.)\1+", r"\1\1", kata)
            # Collapse the doubled punctuation that remains.
            kata = kata.replace("..", ".")
            kata = kata.replace(",,", ",")
            kata = kata.replace("!!", "!")
            kata = kata.replace("??", "?")
            # Add a space after a period that touches the next word.
            kata = re.sub(r"\.(?=\S)", ". ", kata)
            # Slang correction. (An optional per-word spell-check pass was
            # disabled here in the original.)
            kata = koreksi_slang.jalan(kata)
            asdqwe = kata
            # Append a period if there is no closing punctuation.
            if re.match(r".*[^.?!]$", kata):
                kata = kata + " ."
            resultx = do_tag(kata)
            kata = " ".join(resultx)
            if kata != "":
                # Drop punctuation and function-word POS tags and rewrite
                # the remaining tokens as word_TAG.
                chars_to_remove = {
                    ",", "IN", "CC", "SC", "CDO", "CDC", "CDP",
                    "CDI", "DT", "MD", "OP", "CP", "SYM", ".",
                }
                words = []
                for x in kata.split():
                    word = x.split("/")
                    if word[1] not in chars_to_remove:
                        words.append(word[0] + "_" + word[1])
                resultx = " ".join(words)
                hasilx = pipeline.predict([resultx])[0]
                if hasilx == "anger":
                    angerx += 1
                elif hasilx == "joy":
                    joyx += 1
                elif hasilx == "sadness":
                    sadnessx += 1
                elif hasilx == "fear":
                    fearx += 1
                elif hasilx == "disgust":
                    disgustx += 1
                elif hasilx == "surprise":
                    surprisex += 1
                comments_data = {
                    "id": each_video["id"],
                    "komen": each_video["message"],
                    "asdqwe": asdqwe,
                    "komen_edit": resultx,
                    "prediksi": hasilx,
                    "like_count": each_video["like"],
                    "love_count": each_video["love"],
                    "wow_count": each_video["wow"],
                    "haha_count": each_video["haha"],
                    "sad_count": each_video["sad"],
                    "angry_count": each_video["angry"],
                }
                abc.append(comments_data)

    ctrku = {
        "anger": angerx,
        "joy": joyx,
        "sadness": sadnessx,
        "fear": fearx,
        "surprise": surprisex,
        "disgust": disgustx,
    }
    return jsonify({"tasks": abc}, {"ASD": ctrku})
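# create_connection is not shown in this section; a minimal sqlite3-based
# sketch that matches how test() uses it above (the returned object's
# .execute() and .close() are called directly):
import sqlite3


def create_connection(db_file):
    # sqlite3.Connection.execute() is a shortcut that creates a cursor,
    # so conn.execute(...) in test() works on a plain connection.
    return sqlite3.connect(db_file)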
class TestCucco(object):

    _cucco = None

    @staticmethod
    def _tests_generator(test):
        # Test cases are keyed by method name with the 'test_' prefix
        # stripped (test[5:]).
        for test in TESTS_DATA['tests'][test[5:]]:
            yield (test['after'],
                   test['before'],
                   test['characters'] if 'characters' in test else '',
                   test['kwargs'] if 'kwargs' in test else dict(),
                   test['message'])

    def setup_method(self):
        self._cucco = Cucco()

    def test_normalize(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.normalize(before, **kwargs) == after, message

    def test_remove_accent_marks(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_accent_marks(before) == after, message

    def test_remove_stop_words(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

        # Force invalid language
        self._cucco = Cucco()
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'invalid'
            assert self._cucco.remove_stop_words(before, **kwargs) == before, message

        # Test lazy load
        self._cucco = Cucco(lazy_load=True)
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            kwargs['language'] = 'en'
            assert self._cucco.remove_stop_words(before, **kwargs) == after, message

    def test_replace_characters(self, request):
        for after, before, characters, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_characters(
                text=before, characters=characters, **kwargs) == after, message

    def test_replace_emails(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emails(text=before, **kwargs) == after, message

    def test_replace_emojis(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_emojis(text=before, **kwargs) == after, message

    def test_remove_extra_white_spaces(self, request):
        for after, before, _, _, message in self._tests_generator(
                request.node.name):
            assert self._cucco.remove_extra_white_spaces(before) == after, message

    def test_replace_hyphens(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_hyphens(text=before, **kwargs) == after, message

    def test_replace_punctuation(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_punctuation(text=before, **kwargs) == after, message

    def test_replace_symbols(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_symbols(text=before, **kwargs) == after, message

    def test_replace_urls(self, request):
        for after, before, _, kwargs, message in self._tests_generator(
                request.node.name):
            assert self._cucco.replace_urls(text=before, **kwargs) == after, message
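# TESTS_DATA is loaded elsewhere (cucco's test suite reads it from a
# fixture file); the shape assumed by _tests_generator above is a 'tests'
# mapping keyed by normalization name, with 'characters' and 'kwargs' as
# optional keys. A minimal hand-written stand-in:
TESTS_DATA = {
    'tests': {
        'replace_emojis': [
            {'before': 'Hi 😀', 'after': 'Hi ',
             'message': 'emoji should be replaced'},
        ],
    },
}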
sentence = "This is a phone number 672-123-456-9910" pattern = r".*(phone).*?([\d-]+)" match = re.match(pattern, sentence) match.groups() match.group() match.group(0) match.group(1) match.group(2) match.group(1,2) write a python program for searching and replacing flags. ## THis library checks for Emojis and replaces it with the regular expressions from cucco import Cucco cucco = Cucco() a=cucco.replace_emojis(':) :)) :( FSDFSDDFSDfv') print(a) write the syntax and a simple program for regular expression pattern in python. import re text = 'You can try to find an ant in this string' pattern = 'an?\w' for match in re.finditer(pattern, text): sStart = match.start() sEnd = match.end() sGroup = match.group() print('Match "{}" found at: [{},{}]'.format(sGroup, sStart,sEnd))
def remove_emoji(text):
    cucco = Cucco()
    return cucco.replace_emojis(text)
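# A quick usage check for remove_emoji; the expected output assumes cucco's
# default of replacing emojis with the empty string.
print(remove_emoji('good morning 😀'))  # -> 'good morning '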
# Search recent Portuguese tweets for a query, classify each tweet's
# emotion, aggregate the counts per Brazilian region, and cache the
# percentages in Firestore; cached results are served for repeat queries.
import json

from cucco import Cucco
from firebase_admin import firestore

# `api` (an authenticated tweepy handle) and the classify/regioes/ufbr
# helpers are defined elsewhere in the project.


def searchTweets(query):
    db = firestore.client()
    maxCount = 100
    max_id = -1
    count = 0

    emotions = {"tristeza": 0, "alegria": 0, "amor": 0, "raiva": 0}
    regions = ["Norte", "Nordeste", "Centro-Oeste", "Sul", "Sudeste"]
    # Percentages per region, keyed by the query.
    obj = {query: {"regioes": {r: dict(emotions) for r in regions}}}
    # Raw classification counts per region.
    other_obj = {"regioes": {r: dict(emotions, count=0) for r in regions}}

    # Check the cache: any stored document means the query was processed.
    docs = db.collection(query).stream()
    jsonT = ""
    for doc in docs:
        jsonT = doc.to_dict()["porcentagem"]

    if jsonT == "":
        cucco = Cucco()
        while count < maxCount:
            if max_id <= 0:
                searched_tweets = api.search(
                    q=query + " -filter:retweets", lang="pt-br",
                    tweet_mode='extended', count=maxCount * 5)
            else:
                searched_tweets = api.search(
                    q=query + " -filter:retweets", lang="pt-br",
                    tweet_mode='extended', count=maxCount * 5,
                    max_id=str(max_id - 1))
            if not searched_tweets:
                print("nothing here")  # was: "tem nada aq mona"
                break
            for tweet in searched_tweets:
                if tweet.place is not None and count < maxCount:
                    text = json.dumps(tweet._json['full_text'],
                                      sort_keys=True, indent=4,
                                      ensure_ascii=False)
                    # Drop mentions and links.
                    finalText = text.split(" ")
                    text = ""
                    for aux in finalText:
                        if '@' not in aux and 'https://' not in aux:
                            text += aux + " "
                    count += 1
                    # replace_emojis is an instance method, so it must be
                    # called on a Cucco instance, not on the class.
                    text = cucco.replace_emojis(text)
                    text = text.replace('"', '')
                    municipio = json.dumps(
                        tweet._json['place']['full_name'], sort_keys=True,
                        indent=4, ensure_ascii=False)
                    municipio = municipio.split(",")[0].replace('"', "")
                    try:
                        if municipio == 'Sao Paulo':
                            municipio = 'São Paulo'
                        regiao = regioes.getRegion(
                            ufbr.get_cidade(municipio).codigo)
                        em = classify(text)
                        other_obj["regioes"][regiao][em] += 1
                        other_obj["regioes"][regiao]["count"] += 1
                    except Exception:
                        # City lookup or classification failed; do not
                        # count this tweet.
                        count -= 1
            max_id = searched_tweets[-1].id

        # Convert the raw counts into percentages per region.
        for i in regions:
            for j in emotions:
                total = other_obj["regioes"][i]["count"]
                if total == 0:
                    obj[query]["regioes"][i][j] = 0
                else:
                    obj[query]["regioes"][i][j] = round(
                        (other_obj["regioes"][i][j] / total) * 100, 2)

        db.collection(query).add({
            "tweets_classificados": json.dumps(other_obj),
            "porcentagem": json.dumps(obj),
        })
        return [obj, other_obj]
    else:
        # Serve the cached percentages and raw counts.
        docs = db.collection(query).stream()
        jsonP = ""
        for doc in docs:
            jsonP = doc.to_dict()["porcentagem"]
            jsonT = doc.to_dict()["tweets_classificados"]
        return [json.loads(jsonP), json.loads(jsonT)]
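# A usage sketch (requires an authenticated tweepy `api`, initialized
# firebase_admin credentials, and the classify/regioes/ufbr helpers):
# percentages, raw_counts = searchTweets("eleições")
# print(percentages["eleições"]["regioes"]["Sudeste"])
# # e.g. {'tristeza': 12.5, 'alegria': 50.0, 'amor': 25.0, 'raiva': 12.5}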