def compressTweets(data_dir):
    """Rewrite each file in ``data_dir`` in place without repeated ``user`` objects.

    Every file is read as one JSON document per line. The first line that
    parses to a JSON object is kept unchanged; every later object has its
    ``'user'`` key removed. The surviving objects are re-serialised with
    ``json.dumps`` and written back to the same path, joined by ``'\\n'``
    with no trailing newline.

    Lines that are not valid JSON, or that do not decode to a JSON object,
    are dropped. An object that has no ``'user'`` key is kept as-is, so
    running this function again on already-compressed files is harmless.

    Presumably each file holds the tweets of a single user, which is what
    makes the repeated ``user`` object redundant -- verify against the
    code that produces these files.

    Args:
        data_dir: Directory whose entries are all rewritten in place.

    Raises:
        AssertionError: If ``data_dir`` does not exist (only when
            assertions are enabled).
    """
    # NOTE: stripped under ``python -O``; os.listdir() below still raises
    # for a missing directory in that case.
    assert(os.path.exists(data_dir))
    tick = Tick()  # project helper; presumably a progress counter
    for fname in os.listdir(data_dir):
        path = os.path.join(data_dir, fname)
        lines = []
        is_first = True
        # Close the read handle before the same path is reopened for
        # writing below.
        with open(path) as fin:
            for line in fin:
                try:
                    jobj = json.loads(line)
                except ValueError:
                    # Malformed JSON (including blank lines): skip it.
                    continue
                if not isinstance(jobj, dict):
                    # Valid JSON but not an object, so it cannot be a tweet.
                    continue
                if not is_first:
                    # pop() rather than del: a tweet without a 'user' key
                    # must be kept, not discarded via KeyError.
                    jobj.pop('user', None)
                lines.append(json.dumps(jobj))
                is_first = False
        with open(path, 'w') as fout:
            fout.write('\n'.join(lines))
        tick.tick()
def tweets2Texts(input_dir, output_dir):
    """Extract the ``text`` field of every tweet into plain-text files.

    Each file in ``input_dir`` is read as one JSON document per line. For
    every line that yields a ``"text"`` value, runs of whitespace are
    collapsed to a single space so that each tweet occupies exactly one
    output line. The texts are joined with ``"\\n"``, non-ASCII characters
    are discarded, and the result is written to a file of the same name in
    ``output_dir``.

    Lines that are not valid JSON, that lack a ``"text"`` key, or whose
    JSON value / ``"text"`` field has an unexpected type are skipped.

    Args:
        input_dir: Directory containing the line-delimited JSON files.
        output_dir: Directory receiving one text file per input file; it
            must already exist.
    """
    ticker = Tick()  # project helper; presumably a progress counter
    for fname in os.listdir(input_dir):
        # collect texts
        texts = []
        with open(os.path.join(input_dir, fname)) as fin:
            for line in fin:
                try:
                    raw_text = json.loads(line)["text"]
                    texts.append(re.sub(r"\s+", " ", raw_text))
                except (KeyError, ValueError, TypeError):
                    # TypeError covers JSON that is not an object and a
                    # "text" value that is not a string (e.g. null).
                    continue
        payload = "\n".join(texts).encode("ascii", "ignore")

        # write to a file
        # Binary mode: the payload is already ASCII-encoded bytes, which a
        # text-mode handle rejects on Python 3.
        with open(os.path.join(output_dir, fname), "wb") as fout:
            fout.write(payload)
        ticker.tick()