def main():
    """Build a single-emoji labelled dataset from the tweets in ``tweets_file``.

    ``tweets_file`` is read as one JSON object per line. A tweet is kept only
    when the emoji found in its text reduce to exactly one distinct ``code``
    and that code (UTF-8 encoded) is a key of ``mapping``. For every kept
    tweet one line is written to each of three sibling files:

    * ``tweets_file + ".text"``   -- the text after ``clean_text``
    * ``tweets_file + ".labels"`` -- ``mapping[code]``
    * ``tweets_file + ".ids"``    -- the tweet ``id``

    The running line index is printed every 10000 input lines and a summary
    of kept vs. total lines is printed at the end.
    """
    tot = 0
    ok = 0
    # All four handles are owned by the ``with`` statement so that they are
    # closed (and the outputs flushed) even if a malformed line raises
    # part-way through the run.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file + ".ids", 'w') as out_ids, \
            open(tweets_file) as f_in:
        for line in f_in:
            j = json.loads(line)
            tweet_id = j['id']
            text = j['text'].replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set) == 1:
                # NOTE(review): on Python 3 encode() yields bytes -- confirm
                # that the keys of `mapping` are UTF-8 encoded as well.
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    out_text.write(clean_text(text) + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    out_ids.write(str(tweet_id) + "\n")
                    ok += 1
            # Progress indicator; `tot` is the zero-based index of this line.
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1

    print(str(ok) + " good examples out of " + str(tot))
def main():
    """Split the tweets of ``tweets_file`` into text / label / id files.

    Each line of ``tweets_file`` is parsed as a JSON object. Only tweets whose
    text contains exactly one distinct emoji ``code`` are considered, and of
    those only the ones whose UTF-8 encoded code is a key of ``mapping`` are
    written out, one line per tweet, to:

    * ``tweets_file + ".text"``   -- result of ``clean_text`` on the text
    * ``tweets_file + ".labels"`` -- the value ``mapping`` holds for the emoji
    * ``tweets_file + ".ids"``    -- the tweet ``id``

    Prints the current line index every 10000 lines and a final summary.
    """
    tot = 0
    ok = 0
    # Context managers guarantee that every file is closed even when an
    # exception (bad JSON, missing key, ...) interrupts the loop.
    with open(tweets_file + ".text", 'w') as out_text, \
            open(tweets_file + ".labels", 'w') as out_labels, \
            open(tweets_file + ".ids", 'w') as out_ids, \
            open(tweets_file) as f_in:
        for line in f_in:
            j = json.loads(line)
            tweet_id = j['id']
            text = j['text'].replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set) == 1:
                # NOTE(review): encode() returns bytes under Python 3; verify
                # the key type of `mapping` matches.
                emo = emo_set.pop().encode('utf_8')
                if emo in mapping:
                    ct = clean_text(text)
                    out_text.write(ct + "\n")
                    out_labels.write(mapping[emo] + "\n")
                    out_ids.write(str(tweet_id) + "\n")
                    ok += 1
            # Report progress before counting the current line.
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1

    print(str(ok) + " good examples out of " + str(tot))
Example #3
0
def get_emojis():
    """Extract the single-emoji tweets from ``DATA_PATH_SADNESS_TWEETS``.

    The input is read line by line. When the emoji found in a line reduce to
    exactly one distinct ``name``, one line is written to each of:

    * ``tweets.sadness.text``            -- the line after ``clean_text``
    * ``tweets.sadness.emoji.name.txt``  -- the emoji name
    * ``tweets.sadness.emoji.code.txt``  -- the emoji code

    Lines with zero or several distinct emoji names are skipped. The current
    line index is printed every 10000 lines.
    """
    tot = 0
    # The ``with`` statement closes every file even if processing a line
    # raises, so no handle is leaked and buffered output is flushed.
    with open("tweets.sadness.text", 'w') as out_text, \
            open("tweets.sadness.emoji.name.txt", 'w') as out_emojis_name, \
            open("tweets.sadness.emoji.code.txt", 'w') as out_emojis_code, \
            open(DATA_PATH_SADNESS_TWEETS) as f_in:
        for line in f_in:
            text = line.replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set_name = {d['name'] for d in emo_list if 'name' in d}
            emo_set_code = {d['code'] for d in emo_list if 'code' in d}
            if len(emo_set_name) == 1:
                emo_name = emo_set_name.pop()
                # NOTE(review): only the name set is checked for size; this
                # pop() raises KeyError if no entry carried a 'code'.
                emo_code = emo_set_code.pop()
                out_text.write(clean_text(text) + "\n")
                out_emojis_name.write(emo_name + "\n")
                out_emojis_code.write(emo_code + "\n")
            # Progress indicator; `tot` is the zero-based index of this line.
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1
def get_emojis():
    """Write cleaned tweet texts plus their emoji names and codes.

    Steps:

    1. Load ``DATA_PATH_TWEETS`` as a header-less CSV with the columns
       ``tweet`` and ``label`` and dump the ``tweet`` column to
       ``./annotated data/no_annotated_tweets.txt``.
    2. Re-read that dump line by line. For every line, write one record to
       each of three output files under ``./annotated data/``:

       * ``tweets.text.cleaned.txt`` -- the line after ``clean_text``
       * ``tweets.emoji.name.txt``   -- the emoji name when exactly one
         distinct name was found, otherwise every distinct name followed by
         a space (an empty line when there is none)
       * ``tweets.emoji.code.txt``   -- same layout, for the emoji codes

    The current line index is printed every 10000 lines.
    """
    tweets = pandas.read_csv(DATA_PATH_TWEETS,
                             header=None,
                             names=['tweet', 'label'])
    tweets['tweet'].to_csv('./annotated data/no_annotated_tweets.txt',
                           encoding='utf-8',
                           index=False,
                           header=False)

    tot = 0
    # The ``with`` statement closes every file even if processing a line
    # raises, so no handle is leaked and buffered output is flushed.
    # NOTE(review): the dump above is written as UTF-8 but re-read with the
    # platform default encoding -- confirm these agree on the target system.
    with open("./annotated data/tweets.text.cleaned.txt", 'w') as out_text, \
            open("./annotated data/tweets.emoji.name.txt", 'w') as out_emojis_name, \
            open("./annotated data/tweets.emoji.code.txt", 'w') as out_emojis_code, \
            open('./annotated data/no_annotated_tweets.txt') as f_in:
        for line in f_in:
            text = line.replace("\n", "")
            emo_list = emojilib.emoji_list(text)
            emo_set_name = {d['name'] for d in emo_list if 'name' in d}
            emo_set_code = {d['code'] for d in emo_list if 'code' in d}

            if len(emo_set_name) == 1:
                name_record = emo_set_name.pop() + "\n"
                # NOTE(review): only the name set is checked for size; this
                # pop() raises KeyError if no entry carried a 'code'.
                code_record = emo_set_code.pop() + "\n"
            else:
                # Zero or several emoji: space-terminated items on one line.
                name_record = "".join(name + " " for name in emo_set_name) + "\n"
                code_record = "".join(code + " " for code in emo_set_code) + "\n"

            out_text.write(clean_text(text) + "\n")
            out_emojis_name.write(name_record)
            out_emojis_code.write(code_record)

            # Progress indicator; `tot` is the zero-based index of this line.
            if tot % 10000 == 0:
                print(str(tot))
            tot += 1
# Only tweets whose label, read as an integer, is below TopN are kept.
TopN = 20  # max:20

full_data = []
line_count = 0

# Each line of the raw file is one JSON-encoded tweet. A tweet becomes a row
# when its text holds exactly one distinct emoji code, that code is a key of
# `mapping`, and the mapped label passes the TopN filter.
with open(raw_data_path, 'r') as fr:
    for line_count, line in enumerate(fr, start=1):
        tweet = json.loads(line)
        screen_name = tweet['user']['screen_name']
        raw_text = tweet['text'].strip()
        tweet_id = tweet['id']

        # Emoji are collected from the stripped text before it is cleaned.
        codes = {
            item['code']
            for item in emojilib.emoji_list(raw_text)
            if 'code' in item
        }
        cleaned_text = clean_text(raw_text)

        label = None
        if len(codes) == 1:
            (code,) = codes
            if code in mapping:
                label = mapping[code]

        if label and int(label) < TopN:
            full_data.append([tweet_id, screen_name, cleaned_text, label])

        if line_count % 10000 == 0:
            print("Finish extracting {} lines.".format(line_count))

full_data_df = pd.DataFrame(
    full_data,
    columns=["Id", "UserName", "Text", "Label"],
)

full_data_df.to_csv("../../data/tweet/multi/top{}/tweet.csv".format(TopN), index=False)