def main():
    """Split the tweet dump into parallel text / label / id files.

    Reads ``tweets_file`` one JSON object per line (each object must carry
    ``'id'`` and ``'text'``).  A tweet is kept only when the set of distinct
    emoji codes reported by ``emojilib.emoji_list`` has exactly one element
    and that code is a key of ``mapping``.  For every kept tweet one line is
    appended to each of ``tweets_file + ".text"`` (cleaned text),
    ``tweets_file + ".labels"`` (``mapping`` value) and
    ``tweets_file + ".ids"`` (tweet id).

    Progress is printed every 10000 input lines and a summary at the end.
    """
    tot = 0
    ok = 0
    # All handles are managed by ``with`` so they are flushed and closed even
    # when a malformed line raises part-way through the dump.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file + ".ids", 'w') as out_ids, \
            open(tweets_file) as f_in:
        for line in f_in:
            j = json.loads(line)
            tweet_id = j['id']
            # Flatten the tweet to a single line so outputs stay line-aligned.
            text = j['text'].replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set) == 1:
                # NOTE(review): on Python 3 this produces ``bytes``, so
                # ``mapping`` would have to be keyed by UTF-8 bytes -- confirm.
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    ct = clean_text(text)
                    out_text.write(ct + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    out_ids.write(str(tweet_id) + "\n")
                    ok += 1
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1
    print(str(ok) + " good examples out of " + str(tot))
def main():
    """Write the text, label and id of every usable tweet to sibling files.

    ``tweets_file`` is read line by line; each line is parsed as JSON and
    must provide ``'id'`` and ``'text'``.  A tweet is usable when
    ``emojilib.emoji_list`` yields exactly one distinct ``'code'`` and that
    code is present in ``mapping``.  Usable tweets produce one line in each
    of ``<tweets_file>.text`` (output of ``clean_text``),
    ``<tweets_file>.labels`` (``mapping`` value) and ``<tweets_file>.ids``.

    Prints the running line count every 10000 lines, then a final tally.
    """
    tot = 0
    ok = 0
    # Context managers guarantee the three outputs are closed (and their
    # buffers flushed) if parsing or cleaning raises mid-file.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file + ".ids", 'w') as out_ids, \
            open(tweets_file) as f_in:
        for line in f_in:
            j = json.loads(line)
            tweet_id = j['id']
            # Embedded newlines are removed so each tweet occupies one line.
            text = j['text'].replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set) == 1:
                # NOTE(review): ``encode`` returns ``bytes`` on Python 3; the
                # lookup only matches if ``mapping`` uses byte keys -- confirm.
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    ct = clean_text(text)
                    out_text.write(ct + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    out_ids.write(str(tweet_id) + "\n")
                    ok += 1
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1
    print(str(ok) + " good examples out of " + str(tot))
def get_emojis():
    """Extract single-emoji tweets from the sadness corpus.

    Reads ``DATA_PATH_SADNESS_TWEETS`` one tweet per line.  Lines for which
    ``emojilib.emoji_list`` reports exactly one distinct emoji ``'name'`` are
    kept; for each of them one line is written to each of
    ``tweets.sadness.text`` (output of ``clean_text``),
    ``tweets.sadness.emoji.name.txt`` and ``tweets.sadness.emoji.code.txt``.
    All other lines are dropped.  The zero-based line index is printed every
    10000 lines.
    """
    # ``with`` closes the outputs even if a line fails while being processed.
    with open("tweets.sadness.text", 'w') as out_text, \
            open("tweets.sadness.emoji.name.txt", 'w') as out_emojis_name, \
            open("tweets.sadness.emoji.code.txt", 'w') as out_emojis_code, \
            open(DATA_PATH_SADNESS_TWEETS) as f_in:
        for tot, line in enumerate(f_in):
            text = line.replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set_name = {d['name'] for d in emo_list if 'name' in d}
            emo_set_code = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set_name) == 1:
                emo_name = emo_set_name.pop()
                # NOTE(review): assumes an entry with a 'name' also has a
                # 'code'; ``pop`` raises KeyError on an empty set otherwise.
                emo_code = emo_set_code.pop()
                ct = clean_text(text)
                out_text.write(ct + "\n")
                out_emojis_name.write(emo_name + "\n")
                out_emojis_code.write(emo_code + "\n")
            if tot % 10000 == 0:
                print(str(tot))
def get_emojis():
    """Produce cleaned text plus emoji name/code listings for every tweet.

    Loads ``DATA_PATH_TWEETS`` as a header-less CSV with columns ``tweet`` and
    ``label``, dumps the ``tweet`` column to
    ``./annotated data/no_annotated_tweets.txt`` and then re-reads that file
    line by line.  Every line yields exactly one line in each output file, so
    the three outputs stay aligned:

    * ``tweets.text.cleaned.txt`` -- output of ``clean_text``;
    * ``tweets.emoji.name.txt`` -- the emoji name when there is exactly one
      distinct name, otherwise all names each followed by a space (an empty
      line when there are none);
    * ``tweets.emoji.code.txt`` -- same layout for emoji codes.

    The zero-based line index is printed every 10000 lines.
    """
    tweets = pandas.read_csv(DATA_PATH_TWEETS, header=None, names=['tweet', 'label'])
    tweets['tweet'].to_csv('./annotated data/no_annotated_tweets.txt',
                           encoding='utf-8', index=False, header=False)

    # NOTE(review): the dump above is written as UTF-8 but re-read with the
    # platform default encoding -- confirm this is fine on the target system.
    # ``with`` closes every handle even if a line fails while being processed.
    with open("./annotated data/tweets.text.cleaned.txt", 'w') as out_text, \
            open("./annotated data/tweets.emoji.name.txt", 'w') as out_emojis_name, \
            open("./annotated data/tweets.emoji.code.txt", 'w') as out_emojis_code, \
            open('./annotated data/no_annotated_tweets.txt') as f_in:
        for tot, line in enumerate(f_in):
            text = line.replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set_name = {d['name'] for d in emo_list if 'name' in d}
            emo_set_code = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set_name) == 1:
                # Keyed on the *name* set; one code is taken to go with it.
                names_line = emo_set_name.pop() + "\n"
                codes_line = emo_set_code.pop() + "\n"
            else:
                # Zero or several emojis: space-terminated items, so the line
                # keeps the trailing space the downstream files already have.
                names_line = "".join(name + " " for name in emo_set_name) + "\n"
                codes_line = "".join(code + " " for code in emo_set_code) + "\n"
            out_text.write(clean_text(text) + "\n")
            out_emojis_name.write(names_line)
            out_emojis_code.write(codes_line)
            if tot % 10000 == 0:
                print(str(tot))
# Build a CSV of tweets that contain exactly one distinct emoji code whose
# mapped label, read as an integer, is below TopN.
TopN = 20  # max:20
full_data = []
line_count = 0

with open(raw_data_path, 'r') as fr:
    # One JSON-encoded tweet per line; line_count is the 1-based line number.
    for line_count, line in enumerate(fr, start=1):
        js_dict = json.loads(line)
        Label, data_line = None, None

        UserName = js_dict['user']['screen_name']
        Text = js_dict['text'].strip()
        Id = js_dict['id']

        # Emojis are detected on the raw text, before it is cleaned.
        emojis = emojilib.emoji_list(Text)
        Text = clean_text(Text)

        emoji_set = {d['code'] for d in emojis if 'code' in d}
        if len(emoji_set) == 1:
            emoji = emoji_set.pop()
            if emoji in mapping:
                Label = mapping[emoji]

        # A falsy Label (no single mapped emoji) is skipped.
        if Label and int(Label) < TopN:
            data_line = [Id, UserName, Text, Label]
            full_data.append(data_line)

        if line_count % 10000 == 0:
            # print(data_line)
            print("Finish extracting {} lines.".format(line_count))

full_data_df = pd.DataFrame(
    full_data,
    columns=["Id", "UserName", "Text", "Label"],
)
full_data_df.to_csv(
    "../../data/tweet/multi/top{}/tweet.csv".format(TopN),
    index=False,
)