new_graph = {k:v for (k, v) in graph.iteritems() if k != 'children'} new_graph['children'] = [] for child in graph['children']: (total_children, total_children_score) = get_total_children_score(child) (immediate_children, immediate_children_score) = get_immediate_children_score(child) new_child = {k:v for k in child if k != 'children'} new_child['total_children'] = total_children new_child['total_children_score'] = total_children_score new_child['immediate_children'] = immediate_children new_child['immediate_children_score'] = immediate_children_score new_graph['children'].append(new_child) return new_graph if __name__ == '__main__': remake_folder('singles') remake_folder('convos') subs = get_post_groups('../comments_by_posts') for (subdir, sub) in subs: remake_folder('singles/' + sub) link_groups = os.listdir(subdir) for link_group in link_groups: with open(subdir + '/' + link_group, 'r') as f: comments = [json.loads(line) for line in f.readlines()] comments = sorted(comments, key=lambda comment: comment['link_id']) singles = [] convos = [] for link, g in itertools.groupby(comments, key=lambda comment: comment['link_id']): graph = create_graph(g) singles.append(make_singles(graph)) convo_graph = make_convos(graph)
self.df = pd.DataFrame(index=filenames, columns=["done"]) self.df.done = False if os.path.exists(self.fname): saved_df = pd.read_csv(self.fname) saved_df.columns = ["fnames", "done"] self.df.ix[saved_df[saved_df.done == True].fnames, "done"] = True def completed(self, filename): try: self.df.ix[filename, "done"] = True self.df.to_csv(self.fname) finally: self.df.ix[filename, "done"] = True self.df.to_csv(self.fname) if __name__ == "__main__": sub_filenames = sorted(get_sub_files("../sub_files")) df = StatusDF(sub_filenames) for sub_filename in sub_filenames: if df.df.ix[sub_filename, "done"]: print "%s ALREADY COMPLETED" % sub_filename else: print "%s" % sub_filename sub_name = sub_filename.split("/")[-1] remake_folder(sub_name) with open(sub_filename, "r") as sub_file: for i, lines in enumerate(get_chunks_of_file(sub_file, True)): save_lines(lines, sub_name + "/", i + 1) df.completed(sub_filename)