# Example #1
import itertools
import json
import os
import re

from utils import get_chunks_of_file

def get_comment_files_in_folder(folder):
  """Return paths of Reddit comment dump files under *folder*.

  A comment dump is any file named exactly like ``RC_YYYY-MM`` (e.g.
  ``RC_2015-05``); the ``len == 10`` check turns the prefix match into a
  full-name match.
  """
  # Raw string: '\d' in a plain string is a deprecated escape in Python 3.
  # Compile once instead of re-parsing the pattern for every file name.
  pattern = re.compile(r'RC_20\d\d-\d\d')
  files = []
  for dirpath, _dirnames, filenames in os.walk(folder):
    for name in filenames:
      if len(name) == 10 and pattern.match(name) is not None:
        files.append(os.path.join(dirpath, name))
  return files

def save_lines(lines, chunk=None):
	if chunk is not None:
		print 'SORTING CHUNK %d ...' % chunk,
	lines = sorted(lines, key=lambda line: json.loads(line)['subreddit'])
	if chunk is not None:
		print 'DONE\nSAVING CHUNK %d ...' % chunk,
	for sub, group in itertools.groupby(lines, key=lambda line: json.loads(line)['subreddit']):
		with open(sub, 'a') as sub_file:
			sub_file.write(''.join(group))
	print 'DONE'

if __name__ == '__main__':
	for month_fname in get_comment_files_in_folder('../reddit_data_comments/'):
		with open(month_fname, 'r') as month_file:
			print 'BEGINNING FILE: %s' % month_fname
			i = 1
			for lines in get_chunks_of_file(month_file, True):
				i += 1
				save_lines(lines, i)
# Example #2
import json
from utils import get_chunks_of_file
import itertools

# Splits the Reddit submissions dump into one file per subreddit.
# Reads '../RS_full_corpus' chunk by chunk, sorts each chunk's JSON lines by
# subreddit, and appends every group to a file named after that subreddit.
# NOTE: Python 2 script — prints with trailing commas deliberately keep each
# "X CHUNK n ... DONE" status on a single output line.
if __name__ == '__main__':
	with open('../RS_full_corpus', 'r') as f:
		chunk = 1
		print 'READING CHUNK %d ...' % chunk,
		for lines in get_chunks_of_file(f):
			print 'DONE\nSORTING CHUNK %d ...' % chunk,
			# Stable sort by subreddit so groupby sees contiguous groups;
			# submissions without a subreddit land in 'NO_SUBREDDIT'.
			lines = sorted(lines, key=lambda l: json.loads(l).get('subreddit', 'NO_SUBREDDIT'))
			print 'DONE\nSAVING CHUNK %d ...' % chunk,
			for sub, g in itertools.groupby(lines, key=lambda l: json.loads(l).get('subreddit', 'NO_SUBREDDIT')):
				# Append mode: the same subreddit recurs across chunks.
				with open(sub, 'a') as subfile:
					subfile.write(''.join(g))
			chunk += 1
			print 'DONE\nREADING CHUNK %d ...' % chunk,
		print 'END'
        self.df = pd.DataFrame(index=filenames, columns=["done"])
        self.df.done = False
        if os.path.exists(self.fname):
            saved_df = pd.read_csv(self.fname)
            saved_df.columns = ["fnames", "done"]
            self.df.ix[saved_df[saved_df.done == True].fnames, "done"] = True

    def completed(self, filename):
        try:
            self.df.ix[filename, "done"] = True
            self.df.to_csv(self.fname)
        finally:
            self.df.ix[filename, "done"] = True
            self.df.to_csv(self.fname)


if __name__ == "__main__":
    sub_filenames = sorted(get_sub_files("../sub_files"))
    df = StatusDF(sub_filenames)
    for sub_filename in sub_filenames:
        if df.df.ix[sub_filename, "done"]:
            print "%s ALREADY COMPLETED" % sub_filename
        else:
            print "%s" % sub_filename
            sub_name = sub_filename.split("/")[-1]
            remake_folder(sub_name)
            with open(sub_filename, "r") as sub_file:
                for i, lines in enumerate(get_chunks_of_file(sub_file, True)):
                    save_lines(lines, sub_name + "/", i + 1)
            df.completed(sub_filename)