def download_and_process_posts(post_ids, st_time):
    # download and pre-process original posts
    subreddit_names = set()
    lines = defaultdict(list)
    # read lines from posts
    for i, (name, l) in enumerate(get_posts(post_ids)):
        if i % 1000000 == 0:
            print(
                "read %d lines, found %d"
                % (i, sum([len(ls) for ls in lines.values()])),
                time() - st_time,
            )
        lines[name] += [l.strip()]
        subreddit_names.add(name)
    print("tokenizing and selecting specific posts %.2f" % (time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    key_list = ['id', 'score', 'url', 'title', 'selftext']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if reddit_dct.get('num_comments', 1) > 0:
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print("Total found %d" % (len(processed_items)), time() - st_time)
    return subreddit_names, processed_items
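# Illustrative usage (a minimal sketch, not called anywhere in this module):
# shows the shape of the value returned by download_and_process_posts, assuming
# get_posts yields (subreddit_name, raw_json_line) pairs for the requested IDs.
def _example_count_processed_posts(post_ids):
    names, items = download_and_process_posts(post_ids, time())
    # items maps each subreddit name to a list of dicts whose 'title' and
    # 'selftext' entries are (tokenized_text, url_list) tuples
    return {name: len(items[name]) for name in names}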
def download_and_process(file_url, mode, subreddit_names, st_time, output_dir):
    # download and pre-process original posts
    reddit_tmp_dir = pjoin(output_dir, 'reddit_tmp')
    f_name = pjoin(reddit_tmp_dir, file_url.split('/')[-1])
    tries_left = 4
    # open monthly dumps and download lines in posts
    while tries_left:
        try:
            print("downloading %s %.2f" % (f_name, time() - st_time))
            subprocess.run(
                ['wget', '-P', reddit_tmp_dir, file_url], stdout=subprocess.PIPE
            )
            print("decompressing and filtering %s %.2f" % (f_name, time() - st_time))
            if f_name.split('.')[-1] == 'xz':
                f = lzma.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'bz2':
                f = bz2.open(f_name, 'rt')
            elif f_name.split('.')[-1] == 'zst':
                fh = open(f_name, 'rb')
                dctx = zstd.ZstdDecompressor()
                stream_reader = dctx.stream_reader(fh)
                f = io.TextIOWrapper(stream_reader, encoding='utf-8')
            lines = dict([(name, []) for name in subreddit_names])
            for i, l in enumerate(f):
                if i % 1000000 == 0:
                    print(
                        "read %d lines, found %d"
                        % (i, sum([len(ls) for ls in lines.values()])),
                        time() - st_time,
                    )
                for name in subreddit_names:
                    subreddit_field = f'"subreddit":"{name}"'
                    if subreddit_field in l:
                        lines[name] += [l.strip()]
            if f_name.split('.')[-1] == 'zst':
                fh.close()
            else:
                f.close()
            os.remove(f_name)
            tries_left = 0
        except EOFError:
            sleep(10)
            print(
                "failed reading file %s, another %d tries" % (f_name, tries_left)
            )
            os.remove(f_name)
            tries_left -= 1
    print("tokenizing and selecting %s %.2f" % (f_name, time() - st_time))
    processed_items = dict([(name, []) for name in subreddit_names])
    if mode == 'submissions':
        key_list = ['id', 'score', 'url', 'title', 'selftext']
    else:
        key_list = ['id', 'link_id', 'parent_id', 'score', 'body']
    for name in subreddit_names:
        for line in lines[name]:
            reddit_dct = json.loads(line)
            if (
                reddit_dct.get('num_comments', 1) > 0
                and reddit_dct.get('score', 0)
                and reddit_dct.get('score', 0) >= 2
                and (mode == 'submissions' or valid_comment(reddit_dct))
            ):
                reddit_res = {}
                for k in key_list:
                    if k in ['title', 'selftext', 'body']:
                        if reddit_dct[k].lower() in ['[removed]', '[deleted]']:
                            reddit_dct[k] = ''
                        txt, url_list = word_url_tokenize(reddit_dct[k])
                        reddit_res[k] = (' '.join(txt.split()), url_list)
                    else:
                        reddit_res[k] = reddit_dct[k]
                processed_items[name] += [reddit_res]
    print("Total found %d" % (len(processed_items)), time() - st_time)
    return processed_items
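# Illustrative driver (hypothetical, not used elsewhere in this module): shows
# the calling convention for download_and_process on a single monthly dump,
# assuming the caller supplies a Pushshift-style dump URL (the URL below is a
# placeholder) and the set of subreddit names to keep.
def _example_process_month(output_dir):
    st_time = time()
    names = {'explainlikeimfive'}
    dump_url = 'https://files.pushshift.io/reddit/submissions/RS_2019-01.zst'
    return download_and_process(dump_url, 'submissions', names, st_time, output_dir)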
def main():
    opt = setup_args()
    output_dir = pjoin(
        opt['datapath'], opt['output_dir'], 'processed_data/collected_docs'
    )
    wet_urls_path = pjoin(opt['datapath'], opt['output_dir'], opt['wet_urls'])
    f = open(wet_urls_path, buffering=4096)
    url_lst = [line.strip() for line in f if line.strip() != '']
    f.close()
    if opt['urls']:
        with PathManager.open(opt['urls']) as f:
            specific_urls = json.load(f)
        using_specific_urls = True
        using_specific_ids = False
    elif opt['ccuids']:
        with PathManager.open(opt['ccuids']) as f:
            specific_ids = json.load(f)
        using_specific_urls = False
        using_specific_ids = True
    else:
        sr_names = json.loads(opt['subreddit_names'])
        using_specific_urls = False
        using_specific_ids = False
    print("loading URL selection")
    ccrawl_ids_maps = {}
    reddit_id_group = {}
    sr_names = json.loads(opt['subreddit_names'])
    # make a list of the CommonCrawl UIDs or URLs we want to process and keep
    if using_specific_urls:
        select_urls = select_specific_urls(specific_urls)
    elif using_specific_ids:
        select_ccid = select_specific_ids(specific_ids)
    else:
        for name in sr_names:
            print(name)
            ccrawl_ids_maps[name] = json.load(
                open('pre_computed/%s_ccrawl_ids.json' % (name,))
            )
            for i, (k, _) in enumerate(ccrawl_ids_maps[name]):
                reddit_id_group[k] = (i * 10) // len(ccrawl_ids_maps[name])
        select_ccid = make_ccid_filter(ccrawl_ids_maps, opt['n_urls'])
    print("loaded URL selection")
    # organize directories
    if not isdir(output_dir):
        subprocess.run(['mkdir', output_dir], stdout=subprocess.PIPE)
    if not isdir(pjoin(output_dir, 'tmp')):
        subprocess.run(['mkdir', pjoin(output_dir, 'tmp')], stdout=subprocess.PIPE)
    if using_specific_ids:
        make_docs_directory(output_dir, 'specific_ids')
    elif using_specific_urls:
        make_docs_directory(output_dir, 'specific_urls')
    else:
        for name in sr_names:
            make_docs_directory(output_dir, name)
    # check whether some ccrawl files have already been processed for this slice
    if using_specific_ids:
        articles = dict([('specific_ids', dict([(i, []) for i in range(10)]))])
        mode = 'ids'
    elif using_specific_urls:
        articles = dict([('specific_urls', dict([(i, []) for i in range(10)]))])
        mode = 'urls'
    else:
        articles = dict(
            [(name, dict([(i, []) for i in range(10)])) for name in sr_names]
        )
        mode = 'subreddits'
    # check progress of slice or if slice is finished
    if isfile(pjoin(output_dir, 'tmp', 'counts_%s_%d.json' % (mode, opt['slnum']))):
        start_line = json.load(
            open(pjoin(output_dir, 'tmp', 'counts_%s_%d.json' % (mode, opt['slnum'])))
        )
        if start_line == 'finished':
            return True
        for name in sr_names:
            for i_st in range(10):
                d_name = pjoin(output_dir, name, str(i_st))
                articles[name][i_st] = json.load(
                    open(pjoin(d_name, "docs_slice_%05d.json" % (opt['slnum'])))
                )
        print(
            "loaded previously downloaded pages:",
            start_line - opt['slnum'] * opt['slsize'],
        )
    else:
        start_line = opt['slnum'] * opt['slsize']
    # Download and parse slice of args.slsize WET files
    st_time = time()
    for i in range(start_line, min((opt['slnum'] + 1) * opt['slsize'], len(url_lst))):
        # Download wet file from amazon AWS
        dl_time = time()
        fname = url_lst[i].split('/')[-1][:-3]
        # download and unzip if necessary
        fpath = pjoin(output_dir, 'tmp', fname)
        print("processing", fpath)
        if not isfile(fpath):
            ct_try = 0
            while not isfile(fpath):
                subprocess.run(['rm', fpath + ".gz"], stdout=subprocess.PIPE)
                while not isfile(fpath + ".gz"):
                    url = "https://commoncrawl.s3.amazonaws.com/" + url_lst[i]
                    subprocess.run(
                        ['wget', '-P', pjoin(output_dir, 'tmp'), url],
                        stdout=subprocess.PIPE,
                    )
                    print("download:", time() - dl_time)
                    ct_try += 1
                    if ct_try > 5 and not isfile(fpath + ".gz"):
print("giving up on file", fname) break downloaded = isfile(fpath + ".gz") if downloaded: subprocess.run(['gunzip', fpath + ".gz"], stdout=subprocess.PIPE) print("download and gunzip:", time() - dl_time) if ct_try > 5 and not isfile(fpath): print("giving up on file", fname) break else: downloaded = isfile(fpath) if not downloaded: print("FAILED DOWNLOADING ", fpath) continue # Extract, tokenize, and filter articles by language f = open(fpath, buffering=4096) article_url = '' article_id = '' article_txt = '' last_line = '' read_text = False ct = 0 start_time = time() ccid_path_tuple = False # check and save pages by IDs if getting posts by IDs, or by URLs # if using URLs for line in f: if line.startswith("WARC/1.0"): if ccid_path_tuple: ct += 1 article = { 'ccid': article_id, 'url': article_url, 'text': word_url_tokenize(article_txt), } if not using_specific_urls and not using_specific_ids: name, eli_k, num = ccid_path_tuple articles[name][reddit_id_group[eli_k]] += [(eli_k, num, article)] else: name, num = ccid_path_tuple articles[name][num % 10] += [(num, article)] article_txt = '' read_text = False if line.startswith("WARC-Target-URI"): try: article_url = line.strip().split()[-1] if using_specific_urls: ccid_path_tuple = check_url(select_urls, article_url) except Exception: article_url = '<UNK>' if using_specific_urls: ccid_path_tuple = False if line.startswith("WARC-Record-ID"): try: article_id = line.strip().split()[-1] if not using_specific_urls: ccid_path_tuple = select_ccid.get(article_id, False) except Exception: article_id = '<UNK>' if not using_specific_urls: ccid_path_tuple = False if read_text and (last_line.strip() + line.strip()) != '': article_txt += line + '\n' last_line = line if line.startswith("Content-Length: ") and ccid_path_tuple: read_text = True if ccid_path_tuple: ct += 1 article = { 'ccid': article_id, 'url': article_url, 'text': word_url_tokenize(article_txt), } if not using_specific_urls and not using_specific_ids: name, eli_k, num = ccid_path_tuple articles[name][reddit_id_group[eli_k]] += [(eli_k, num, article)] else: name, num = ccid_path_tuple articles[name][num % 10] += [(num, article)] f.close() subprocess.run(['rm', fpath], stdout=subprocess.PIPE) # periodically save slice print(">>>>>>>>>> ARTICLES FOUND %d in %.2f" % (ct, time() - start_time)) if i % opt['save_freq'] == opt['save_freq'] - 1: for name, elik_maps in articles.items(): print('saving', name, i, len(elik_maps)) for i_st, ls in elik_maps.items(): d_name = pjoin(output_dir, name, str(i_st)) if not isdir(d_name): subprocess.run(['mkdir', d_name], stdout=subprocess.PIPE) json.dump( ls, open( pjoin(d_name, "docs_slice_%05d.json" % (opt['slnum'])), 'w'), ) json.dump( i + 1, open( pjoin(output_dir, 'tmp', 'counts_%s_%d.json' % (mode, opt['slnum'])), 'w', ), ) print('saved json files %.2f' % (time() - start_time, )) subprocess.run(['rm', fpath], stdout=subprocess.PIPE) # save items to slices for name, elik_maps in articles.items(): print('saving', name, i, len(elik_maps)) for i_st, ls in elik_maps.items(): d_name = pjoin(output_dir, name, str(i_st)) if not isdir(d_name): subprocess.run(['mkdir', d_name], stdout=subprocess.PIPE) json.dump( ls, open(pjoin(d_name, "docs_slice_%05d.json" % (opt['slnum'])), 'w')) print('saved json files %.2f' % (time() - start_time, )) json.dump( 'finished', open( pjoin(output_dir, 'tmp', 'counts_%s_%d.json' % (mode, opt['slnum'])), 'w'), ) print("processing slice %d took %f seconds" % (i, time() - st_time))