f = open("processed_data/mId2Genre.txt", "w")
    # Emit one line per movie in the form "<mId> <genre count> <genre id> ...".
    # genreIdx hands out an index per genre name; id_base is added to every
    # index before it is written.
    genreIdx = Indexer()
    for idx, row in movies.iterrows():
        mId, raw_genres = row['mId'], row['genres']
        # Swap single quotes for double quotes so json.loads can parse the
        # value (presumably the column holds Python-repr style text).
        # NOTE(review): a genre name containing an apostrophe would make this
        # produce invalid JSON -- confirm the data never contains one.
        raw_genres = raw_genres.replace("\'", "\"")
        genres_l = json.loads(raw_genres)
        f.write("%d %d" % (mId, len(genres_l)))
        for g in genres_l:
            # Judging by its name, add_and_get_index registers unseen genre
            # names and returns the index either way -- verify against Indexer.
            f.write(" %d" % (genreIdx.add_and_get_index(g['name']) + id_base))
        f.write("\n")
    f.close()

    # Dump the genre vocabulary: one "<id> <genre name>" line per entry of
    # genreIdx, where <id> is the entry's position shifted by id_base.
    # The context manager closes the handle even if a write raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open("processed_data/Genre2Id.txt", "w") as f:
        num_genres = len(genreIdx)
        for i in range(num_genres):
            f.write("%d %s\n" % (i + id_base, genreIdx.get_object(i)))
    # Advance the running id offset by the number of genres just written
    # (presumably so ids assigned afterwards do not overlap the genre range).
    id_base += num_genres
    ''' create credits
	mId2CC.txt: 45476 lines
	each line includes (mId, num of crew/casts, cIds)
	'''
    # Load the credits table; it exposes .shape and .iterrows(), so it is
    # presumably a pandas DataFrame -- confirm against readCreditData.
    credits = readCreditData(args, tmid2mid)
    print("credits.shape %s" % (str(credits.shape)))
    # Separate indexer for the credits section; not used in the lines visible
    # here (presumably it indexes cast/crew ids further down).
    cIdx = Indexer()
    f = open("processed_data/mId2CC.txt", "w")
    for idx, row in credits.iterrows():
        mId, raw_cast, raw_crew = row['mId'], row['cast'], row['crew']
        # ast.literal_eval parses the stored text as a Python literal without
        # executing arbitrary code.
        cast_l = ast.literal_eval(raw_cast)
        crew_l = ast.literal_eval(raw_crew)
        # Starts empty for every row.
        attr = []
Example #2
0
    # Blank out, in place, every word whose count in word_cnts is 10 or less.
    # Blanked words stay in the list as empty strings, so the joined line may
    # contain runs of consecutive spaces.
    for idx, word in enumerate(words):
        if word_cnts[word] <= 10:
            words[idx] = ""
    line = " ".join(words)

    # Keep the datapoint only if some non-whitespace text survived the filter;
    # otherwise count it as bad.
    if line and not line.isspace():
        data.text = line
        new_dataset.append(data)
    else:
        count_of_bad += 1

    # Periodic progress report.
    if i % 200000 == 0:
        print("iterated", i, "cleaned datapoints")

# Tally the cleaned datapoints per label, keyed by the object the indexer
# returns for each datapoint's label.
emoji_sample_count = {}

for dp in new_dataset:
    sample_key = indexer.get_object(dp.label)
    emoji_sample_count[sample_key] = emoji_sample_count.get(sample_key, 0) + 1

print("emoji_sample count ", emoji_sample_count)

# Randomise the dataset order in place before any sampling takes place.
shuffle(new_dataset)

# Optional down-sampling: collect a capped number of datapoints per label.
sample_dataset = []
# Per-label counter (keyed by dp.label) used to enforce the cap below.
emoji_sample_counter = {}

# Sampling only runs when the second command-line argument is exactly
# "Sample"; sys.argv[2] raises IndexError if fewer than two arguments are given.
if sys.argv[2] == "Sample":
    for dp in new_dataset:
        if dp.label in emoji_sample_counter:
            # Append while the label's counter is still below 50000.
            if emoji_sample_counter[dp.label] < 50000:
                sample_dataset.append(dp)
                emoji_sample_counter[dp.label] += 1
        else:
            # NOTE(review): in the lines visible here, the first datapoint of
            # each label only initialises the counter and is not appended to
            # sample_dataset, which would cap each label at 49,999 samples --
            # confirm whether that is intended.
            emoji_sample_counter[dp.label] = 1