def build_word_count():
    if os.path.isfile('pickled/wcount.pickle'):
        return read_pickle('pickled/wcount.pickle')
    wcount = Counter()
    # Count lowercased word tokens across the five NLTK corpora.
    for corpus in (words, gutenberg, brown, reuters, inaugural):
        for fid in corpus.fileids():
            for word in corpus.words(fid):
                word = word.lower()
                if only_words.match(word) is not None:
                    wcount[word] += 1
    dump_pickle(wcount, 'pickled/wcount.pickle')
    return wcount
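# The "compute once, then cache as a pickle" guard used above recurs in most
# functions in this module. A minimal sketch of a decorator that could factor
# the pattern out, assuming this module's own read_pickle/dump_pickle helpers;
# the decorator name `pickled` is hypothetical, not existing code:

import functools

def pickled(path):
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Return the cached result if present, otherwise compute and cache.
            if os.path.isfile(path):
                return read_pickle(path)
            result = fn(*args, **kwargs)
            dump_pickle(result, path)
            return result
        return wrapper
    return decorator

# Usage sketch:
#
# @pickled('pickled/wcount.pickle')
# def build_word_count(): ...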
def get_link_df():
    if not os.path.exists(wsmall_latest_linkdf):
        names = ['wsmall_file', 'href']
        ldf = pd.read_csv(wsmall_latest_links, sep=',', names=names)
        dump_pickle(ldf, wsmall_latest_linkdf)
        return ldf
    else:
        return read_pickle(wsmall_latest_linkdf)
def check_webarchive(ldf):
    if not os.path.exists(wsmall_latest_webarchive):
        # Collect every link whose href points at a known archiving site.
        archive_sites = []
        for archive_site in archiving_sites:
            archive_sites.append(ldf[ldf.href.str.startswith(archive_site)])
        about_wba = pd.concat(archive_sites)
        dump_pickle(about_wba, wsmall_latest_webarchive)
        return about_wba
    else:
        return read_pickle(wsmall_latest_webarchive)
def save_result_file(seq_name, pred_dir, tracks):
    seq_filepaths = glob.glob(f'{paths.DATA_ROOT}/3dpw/sequenceFiles/*/*.pkl')
    seq_path = next(
        p for p in seq_filepaths if os.path.basename(p) == f'{seq_name}.pkl')
    rel_path = '/'.join(util.split_path(seq_path)[-2:])
    out_path = f'{pred_dir}/{rel_path}'
    n_frames = len(
        glob.glob(f'{paths.DATA_ROOT}/3dpw/imageFiles/{seq_name}/image_*.jpg'))
    # Division by 1000 presumably converts millimeters to meters.
    coords3d_raw = np.array(
        [complete_track(track, n_frames) for track in tracks]) / 1000
    util.dump_pickle(dict(jointPositions=coords3d_raw), out_path)
def get_users2(ureviews, mset):
    if not os.path.exists(usr_pickle2):
        def user_trans(lines):
            return map(
                lambda line: UserWReviews(
                    split=usr_split.split(line), ureviews=ureviews, mset=mset),
                lines)

        with SelectFromFile(user_file, transformer=user_trans,
                            selector=lambda x: list(x)) as it:
            usrs = it
        dump_pickle(usrs, usr_pickle2)
    else:
        usrs = read_pickle(usr_pickle2)
    return usrs
def get_movies():
    if not os.path.exists(movie_pickle):
        def move_clean_split(line):
            return Movie(msplit.split(msanity.sub('|', line.rstrip())))

        movie_map = {}
        with codecs.open(movie_f, 'r', encoding='utf-8',
                         errors='replace') as movs:
            for mov in map(move_clean_split, movs):
                movie_map[mov.mid] = mov
        dump_pickle(movie_map, movie_pickle)
    else:
        movie_map = read_pickle(movie_pickle)
    return movie_map
def domain_info(ldf=None):
    if not os.path.exists(wsmall_latest_no_wlink):
        # Keep external links (www/http) that do not point back into the wiki.
        no_wlinks = ldf[ldf.href.str.contains('www|http')
                        & ~ldf.href.str.contains('wiki')]
        no_wlinks['href'] = no_wlinks['href'].apply(front_slash_nuke)
        dump_pickle(no_wlinks, wsmall_latest_no_wlink)
    else:
        no_wlinks = read_pickle(wsmall_latest_no_wlink)
    if not os.path.exists(wsmall_latest_no_wlinkd):
        no_wlinks['domain'] = no_wlinks.href.map(domain_getter)
        dump_pickle(no_wlinks, wsmall_latest_no_wlinkd)
    else:
        no_wlinks = read_pickle(wsmall_latest_no_wlinkd)
    return no_wlinks
def get_users(reviews=None):
    if not os.path.exists(usr_pickle):
        def user_trans(lines):
            return map(lambda line: User(usr_split.split(line), reviews),
                       lines)

        usrs = {}
        with SelectFromFile(user_file, transformer=user_trans,
                            selector=lambda x: list(x)) as it:
            for u in it:
                usrs[u.id] = u
        dump_pickle(usrs, usr_pickle)
    else:
        usrs = read_pickle(usr_pickle)
    return usrs
def check_ar_outlinks(ars):
    if not os.path.exists(wsmall_statues_ar):
        result = {}
        temp = []
        processed = 0
        c = 0
        with FuturesSession(
                session=requests.Session(),
                executor=ProcessPoolExecutor(max_workers=10)) as session:

            def check_batch(batch):
                nonlocal c, processed
                pending = []
                for url in batch:
                    result[url] = -1  # default marker: timed out / errored
                    # Rotate through the three user-agent strings.
                    pending.append(
                        session.head(url,
                                     headers={'User-Agent': useragents[c]},
                                     timeout=5.0))
                    c = (c + 1) % 3
                for future in pending:
                    try:
                        response = future.result()
                        result[response.url] = response.status_code
                    except Exception:
                        pass  # keep the -1 marker for failed requests
                    processed += 1
                    if processed % 100 == 0:
                        print(processed)
                batch.clear()

            for href in ars.href:
                temp.append(href)
                if len(temp) >= 100:
                    check_batch(temp)
            check_batch(temp)  # flush the final partial batch
        print('outside the with')
        had_status = {'archive': [], 'status': []}
        timed_out = {'archive': [], 'status': []}
        for k, v in result.items():
            ar = archives_map(k)
            if v == -1:
                timed_out['archive'].append(ar)
                timed_out['status'].append(v)
                continue
            had_status['archive'].append(ar)
            had_status['status'].append(v)
        hs = pd.DataFrame(had_status)
        to = pd.DataFrame(timed_out)
        dump_pickle((hs, to), wsmall_statues_ar)
        return hs, to
    else:
        return read_pickle(wsmall_statues_ar)
def generate_poseviz_gt(i_subject, activity_name, camera_id):
    camera_names = ['54138969', '55011271', '58860488', '60457274']
    camera_name = camera_names[camera_id]
    data, camera = get_examples(
        i_subject, activity_name, camera_id, frame_step=1, correct_S9=True)
    results = []
    examples = []
    for image_relpath, world_coords, bbox in data:
        results.append({
            'gt_poses': [world_coords.tolist()],
            'camera_intrinsics': camera.intrinsic_matrix.tolist(),
            'camera_extrinsics': camera.get_extrinsic_matrix().tolist(),
            'image_path': image_relpath,
            'bboxes': [bbox.tolist()]
        })
        ex = ps3d.Pose3DExample(
            image_relpath, world_coords, bbox, camera,
            activity_name=activity_name)
        examples.append(ex)

    joint_names = ('rhip,rkne,rank,lhip,lkne,lank,tors,neck,head,htop,'
                   'lsho,lelb,lwri,rsho,relb,rwri,pelv'.split(','))
    edges = ('htop-head-neck-lsho-lelb-lwri,neck-rsho-relb-rwri,'
             'neck-tors-pelv-lhip-lkne-lank,pelv-rhip-rkne-rank')
    joint_info = ps3d.JointInfo(joint_names, edges)
    ds = ps3d.Pose3DDataset(joint_info, test_examples=examples)
    util.dump_pickle(
        ds, f'{paths.DATA_ROOT}/h36m/poseviz/'
        f'S{i_subject}_{activity_name}_{camera_name}.pkl')

    output = {}
    output['joint_names'] = joint_info.names
    output['stick_figure_edges'] = joint_info.stick_figure_edges
    output['world_up'] = camera.world_up.tolist()
    output['frame_infos'] = results
    util.dump_json(
        output, f'{paths.DATA_ROOT}/h36m/poseviz/'
        f'S{i_subject}_{activity_name}_{camera_name}.json')
def get_movies2():
    if not os.path.exists(movie_pickle2):
        def move_clean_split(line):
            return Movie(msplit.split(msanity.sub('|', line.rstrip())))

        movies = []
        movie_idx = {}
        with codecs.open(movie_f, 'r', encoding='utf-8',
                         errors='replace') as movs:
            for idx, mov in enumerate(map(move_clean_split, movs)):
                movies.append(mov)
                movie_idx[mov.mid] = idx
        dump_pickle((movies, movie_idx), movie_pickle2)
    else:
        movies, movie_idx = read_pickle(movie_pickle2)
    return movies, movie_idx
def get_reviews2():
    if not os.path.exists(usr_review_pickle2):
        def review_mapper(line):
            return UserReview(split=usr_ratting.split(line.rstrip()))

        def trans(rvs):
            return seq(rvs).map(review_mapper).group_by(
                lambda ur: ur.uid).to_dict()

        with SelectFromFile(usr_review_file, transformer=trans,
                            selector=lambda x: x) as r:
            reviews = r
        dump_pickle(reviews, usr_review_pickle2)
    else:
        reviews = read_pickle(usr_review_pickle2)
    return reviews
def check_wiki_live_web_links():
    if not os.path.exists(wsmall_latest_nwl_status):
        link_df = get_link_df()
        di = domain_info(link_df)
        result = {}
        temp = []
        # Bare IP addresses (optionally behind www/http(s)) are skipped.
        num = re.compile(
            r'^(?:(?:www\.)|(?:http://)|(?:https://))?(?:[0-9]{1,3}\.){3}.+$')
        c = 0
        processed = 0
        with FuturesSession(
                session=requests.Session(),
                executor=ProcessPoolExecutor(max_workers=10)) as session:

            def check_batch(batch):
                nonlocal c, processed
                pending = []
                for url in batch:
                    result[url] = -1  # default marker: timed out / errored
                    pending.append(
                        session.head(url,
                                     headers={'User-Agent': useragents[c]},
                                     timeout=5.0))
                    c = (c + 1) % 3
                for future in pending:
                    try:
                        response = future.result()
                        result[response.url] = response.status_code
                    except Exception:
                        pass  # keep the -1 marker for failed requests
                    processed += 1
                    if processed % 1000 == 0:
                        print(processed)
                batch.clear()

            for href in di.href.unique():
                # 'https://...' also startswith 'http', so one check suffices.
                if not href.startswith('http'):
                    href = 'http://%s' % href
                if num.match(href) is not None:
                    result[href] = -2
                    continue
                temp.append(href)
                if len(temp) >= 1000:
                    check_batch(temp)
            check_batch(temp)  # flush the final partial batch
        print('outside the with')
        dump_pickle(result, wsmall_latest_nwl_status)
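# Status conventions shared by the two link checkers above, as inferred from
# the code: -2 = skipped (bare IP address), -1 = request raised or timed out,
# anything else = the HTTP status of the HEAD request. A small sketch of
# summarizing the pickled result dict into per-bucket counts (this helper is
# an illustration, not part of the existing code):

from collections import Counter

def summarize_link_statuses(result):
    buckets = Counter()
    for status in result.values():
        if status == -2:
            buckets['skipped_ip'] += 1
        elif status == -1:
            buckets['timeout_or_error'] += 1
        else:
            buckets['http_%s' % status] += 1
    return buckets

# e.g. summarize_link_statuses(read_pickle(wsmall_latest_nwl_status))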
def get_reviews(movie_map):
    if not os.path.exists(usr_review_pickle):
        def review_mapper(line):
            ur = URating(usr_ratting.split(line.rstrip()))
            ur.mname = movie_map.get(ur.itemid, None)
            return ur

        def trans(rvs):
            return seq(rvs).map(review_mapper).group_by(
                lambda ur: ur.uid).to_dict()

        with SelectFromFile(usr_review_file, transformer=trans,
                            selector=lambda x: x) as r:
            reviews = r
        dump_pickle(reviews, usr_review_pickle)
    else:
        reviews = read_pickle(usr_review_pickle)
    return reviews
def tidy_uris(s_list):
    if not os.path.exists(tidied_uirs):
        t = []
        for w in s_list:
            original = os.path.basename(w)
            m = match_end.findall(original)
            if len(m) > 0:
                p_name = original.replace(m[0], '')
            else:
                p_name = original.replace('.html', '')
            t.append({
                'quoted': quote(p_name),
                'unquoted': p_name,
                'original': original
            })
        dump_pickle(t, tidied_uirs)
        return t
    else:
        return read_pickle(tidied_uirs)
def compute_old_wsmall_links():
    if not os.path.exists('pickled/wsmallo_outl_temp.pickle'):
        old_link_df = get_oldl_df()
        out = defaultdict(LinkDict)
        for owfile in old_link_df.wsmall_file.unique():
            owdf = old_link_df[old_link_df.wsmall_file == owfile]
            links_to = old_link_df[old_link_df.wsmall_file.isin(owdf.href)]
            links_to_other = owdf[~owdf.href.isin(links_to.href)]
            out[owfile]['outlink_wsmall'] += links_to.wsmall_file.unique().size
            out[owfile]['outlink_other'] += (
                links_to_other.wsmall_file.unique().size)
            out[owfile]['total_outlinks'] += (
                out[owfile]['outlink_wsmall'] + out[owfile]['outlink_other'])
            for other in links_to.wsmall_file.unique():
                out[other]['inlink'] += 1
        dump_pickle(out, 'pickled/wsmallo_outl_temp.pickle')
        return out
    else:
        return read_pickle('pickled/wsmallo_outl_temp.pickle')
def galago_postingd_csv():
    cline = './rungalago.sh dump-index index'
    with open('output_files/idx3.csv', 'w') as retOut:
        runner = Popen(shlex.split(cline), stdout=retOut, stderr=PIPE)
        print(runner.stderr.read())
        runner.wait()
    idx = Idx()
    # Read the dump back from the same path it was just written to.
    with open('output_files/idx3.csv', 'r') as gal:
        for line in gal:
            lsplit = line.strip().split(',')
            word = lsplit[0]
            doc = lsplit[1]
            at = lsplit[2:]
            idx[word].add_doc_where(doc, at)
    dump_pickle(idx, 'pickled/idx3.pickle')
    with open('output_files/idx3_terms.txt', 'w') as termOut:
        termOut.write(' '.join(
            sorted(filter(lambda x: isWord.match(x) is not None,
                          list(idx.keys())))))
def dump_window_idx():
    cline = './rungalago.sh dump-index windowIdx/od.n2.w5.h2'
    with open('output_files/window5/ordered5idx.txt', 'w') as retOut:
        runner = Popen(shlex.split(cline), stdout=retOut, stderr=PIPE)
        print(runner.stderr.read())
        runner.wait()
    ab_count = Counter()
    ab_count_wins = Counter()
    wins = 0
    with open('output_files/window5/ordered5idx.txt', 'r') as oin:
        for line in oin:
            splitted = line.rstrip().split(',')
            if only_words.match(splitted[0]) is not None:
                # Terms look like 'a~b': an ordered pair from a 5-word window.
                a, b = splitted[0].split('~')
                ab_count[a] += 1
                ab_count[b] += 1
                wins += 1
                ab_count_wins[a, b] += 1
    dump_pickle((ab_count, ab_count_wins, wins),
                'pickled/window5Counts.pickle')
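# The window counts pickled above are the usual ingredients for word
# association scores. A sketch of pointwise mutual information over them,
# assuming the tuple layout dumped by dump_window_idx and the module's
# read_pickle helper; the normalization (each window contributes two unigram
# counts) is one common choice, not necessarily the project's:

import math

def window_pmi(a, b):
    ab_count, ab_count_wins, wins = read_pickle(
        'pickled/window5Counts.pickle')
    p_ab = ab_count_wins[a, b] / wins
    p_a = ab_count[a] / (2 * wins)
    p_b = ab_count[b] / (2 * wins)
    return math.log(p_ab / (p_a * p_b)) if p_ab > 0 else float('-inf')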
def dl_pages(tidied_uris):
    if not os.path.exists(wsmall_statues):
        useragent = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:44.0) '
                     'Gecko/20100101 Firefox/44.01')
        statuses = {}
        with requests.session() as session:
            session.headers.update({'User-Agent': useragent})
            for tidy in tidied_uris:
                uri = base % tidy['unquoted']
                r = session.get(uri)
                scode = r.status_code
                print(uri, scode)
                statuses[tidy['original']] = scode
                if scode == 200:
                    with open('wiki_small_latest/%s' % tidy['original'],
                              'w') as out:
                        out.write(r.text)
        dump_pickle(statuses, wsmall_statues)
        return statuses
    else:
        return read_pickle(wsmall_statues)
def compute_wnew_link_stats(ldf):
    if not os.path.exists('pickled/wsmall_nlinkstats.pickle'):
        only_wiki = pd.concat([
            ldf[ldf.href.str.contains('/wiki')],
            ldf[ldf.href.str.contains('en.wikipedia.org/wiki')]
        ])
        only_wiki.href = only_wiki.href.apply(lambda h: os.path.basename(h))
        only_wiki.wsmall_file = only_wiki.wsmall_file.apply(
            clean_new_wiki_name)
        wn_lstats = defaultdict(LinkDict)
        for owfile in only_wiki.wsmall_file.unique():
            owdf = only_wiki[only_wiki.wsmall_file == owfile]
            links_to = only_wiki[only_wiki.wsmall_file.isin(owdf.href)]
            links_to_other = owdf[~owdf.href.isin(links_to.href)]
            wn_lstats[owfile]['outlink_wsmall'] += (
                links_to.wsmall_file.unique().size)
            wn_lstats[owfile]['outlink_other'] += (
                links_to_other.wsmall_file.unique().size)
            wn_lstats[owfile]['total_outlinks'] += (
                wn_lstats[owfile]['outlink_wsmall']
                + wn_lstats[owfile]['outlink_other'])
            for other in links_to.wsmall_file.unique():
                wn_lstats[other]['inlink'] += 1
        it = {
            'wsmall': [],
            'outlink_wsmall': [],
            'outlink_other': [],
            'total_outlinks': [],
            'inlinks': []
        }
        for name in wn_lstats.keys():
            it['wsmall'].append(name)
            it['outlink_wsmall'].append(wn_lstats[name]['outlink_wsmall'])
            it['outlink_other'].append(wn_lstats[name]['outlink_other'])
            it['total_outlinks'].append(wn_lstats[name]['total_outlinks'])
            it['inlinks'].append(wn_lstats[name]['inlink'])
        lstats_df = pd.DataFrame(it)
        dump_pickle(lstats_df, 'pickled/wsmall_nlinkstats.pickle')
        return lstats_df
    else:
        return read_pickle('pickled/wsmall_nlinkstats.pickle')
def predict_rated(method='pearson_seq', k=10):
    want = review_file_rated_pickle % (method, k)
    if not os.path.exists(want):
        user_predictions = defaultdict(UserReviewedPred)
        review_df = get_review_df()
        reviewer_sims = get_reviewer_sim(review_df)
        movies = get_movies_df()
        users = review_df.user_id.sort_values().unique()
        for user in users:
            user_reviews = review_df[review_df.user_id == user]
            similar_to_user = reviewer_sims[reviewer_sims.user == user]
            knn = pick_knn(similar_to_user, method=method, k=k)
            user_mean = user_reviews.rating.mean()
            sum_user = 1 / knn.sim.sum()
            neighbors = review_df[
                review_df.user_id.isin(knn.other_user.unique())]
            reviewed_same = neighbors[
                neighbors.item_id.isin(user_reviews.item_id)]
            neighbor_reviewed_ids = reviewed_same.item_id.unique()
            for neighbor_review_id in neighbor_reviewed_ids:
                movie_reviewed = reviewed_same[
                    reviewed_same.item_id == neighbor_review_id]
                the_movie = movies[
                    movies.movie_id.isin(movie_reviewed.item_id)]
                movie_reviewed_title = the_movie.movie_title.iat[0]
                neighbor_reviews = neighbors[
                    neighbors.item_id == neighbor_review_id]
                acum = []
                for _, row in neighbor_reviews.iterrows():
                    neighbor_who_did_id = row['user_id']
                    neighbor_who_did_rating = row['rating']
                    neighbor_who_did_mean_rating = review_df[
                        review_df.user_id ==
                        neighbor_who_did_id].rating.mean()
                    neighbor_who_did_sim_to_cur_u = knn[
                        knn.other_user == neighbor_who_did_id].sim.iat[0]
                    acum.append(neighbor_who_did_sim_to_cur_u
                                * (neighbor_who_did_rating
                                   - neighbor_who_did_mean_rating))
                sum_neighbors = sum(acum)
                # Mean-centered kNN prediction:
                # pred(u, i) = mean(u) + [sum over neighbors v who rated i of
                # sim(u, v) * (r(v, i) - mean(v))] / [sum of all k neighbor sims]
                predicted_score = user_mean + (sum_user * sum_neighbors)
                the_movie_id = the_movie.movie_id.iat[0]
                actual_r = user_reviews[
                    user_reviews.item_id == the_movie_id].rating.iat[0]
                user_predictions[user].add_movie(
                    the_movie_id, movie_reviewed_title, predicted_score,
                    actual_r)
        dump_pickle(user_predictions, want)
        return user_predictions
    else:
        return read_pickle(want)
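# A tiny worked instance of the prediction formula used above (toy numbers,
# not project data): the active user's mean is 3.0, and two neighbors with
# similarities 0.8 and 0.4, personal means 3.5 and 2.5, rated the item 4 and 3.

user_mean = 3.0
neighbor_rows = [(0.8, 4.0, 3.5), (0.4, 3.0, 2.5)]  # (sim, rating, neighbor mean)
sim_total = sum(sim for sim, _, _ in neighbor_rows)            # 1.2
weighted = sum(sim * (r - m) for sim, r, m in neighbor_rows)   # 0.4 + 0.2 = 0.6
prediction = user_mean + weighted / sim_total                  # 3.0 + 0.5 = 3.5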
def save_overall_mask(people, i_out):
    def _get_mask(i):
        i_subject, i_seq, i_cam, i_frame = people[i]
        encoded_mask = get_mask(int(i_subject), int(i_seq), int(i_cam),
                                int(i_frame))
        return improc.decode_mask(encoded_mask)

    def _get_chair_mask(i):
        i_subject, i_seq, i_cam, i_frame = people[i]
        encoded_mask = get_chair_mask(int(i_subject), int(i_seq), int(i_cam),
                                      int(i_frame))
        return improc.decode_mask(encoded_mask)

    # Union the person and chair masks of all four composited people
    # (elementwise max of binary masks).
    overall_mask = _get_mask(0)
    np.maximum(overall_mask, _get_chair_mask(0), out=overall_mask)
    for i in range(1, 4):
        np.maximum(overall_mask, _get_mask(i), out=overall_mask)
        np.maximum(overall_mask, _get_chair_mask(i), out=overall_mask)

    s = f'{i_out + 1:06d}'
    out_path = f'{paths.DATA_ROOT}/muco/masks/{s[:2]}/{s[:4]}/{s}.pkl'
    util.dump_pickle(improc.encode_mask(overall_mask), out_path)
def old_wsmall_lstats_df():
    if not os.path.exists('pickled/wsmallo_outl_stats.pickle'):
        old_wsmall_links = compute_old_wsmall_links()
        it = {
            'wsmall': [],
            'outlink_wsmall': [],
            'outlink_other': [],
            'total_outlinks': [],
            'inlinks': []
        }
        for name in old_wsmall_links.keys():
            it['wsmall'].append(name)
            it['outlink_wsmall'].append(
                old_wsmall_links[name]['outlink_wsmall'])
            it['outlink_other'].append(old_wsmall_links[name]['outlink_other'])
            it['total_outlinks'].append(
                old_wsmall_links[name]['total_outlinks'])
            it['inlinks'].append(old_wsmall_links[name]['inlink'])
        lstats_df = pd.DataFrame(it)
        dump_pickle(lstats_df, 'pickled/wsmallo_outl_stats.pickle')
    else:
        lstats_df = read_pickle('pickled/wsmallo_outl_stats.pickle')
    return lstats_df
    except Exception:
        pass
    return changed


max_iter = 5
for i in range(max_iter):
    changed = replace_roots_recursive()
    if not changed:
        print("Nothing changed after %d iterations." % i)
        break

# In[ ]:

# word_roots_dict = load_pickle("word_roots_dict.pkl")
str_lines = []
for word, roots in word_roots_dict.items():
    str_lines.append(word + "\t" + ", ".join(roots))
write_lines("word_roots_dict.txt", str_lines)
dump_pickle("word_roots_dict.pkl", word_roots_dict)

# In[ ]:

all_roots = list(itertools.chain(*list(word_roots_dict.values())))
freq_dist = nltk.FreqDist(all_roots).most_common()
write_lines("root_freq_dist.txt", freq_dist)
dump_pickle("root_freq_dist.pkl", freq_dist)
# print(freq_dist)

# In[ ]:
file = "words.pkl" fp = Path(file) if fp.is_file(): pages = load_pickle(file) else: pdf_file = open('/playpen/home/tongn/Word Frequency List of American English.pdf', 'rb') read_pdf = PyPDF2.PdfFileReader(pdf_file) start = 1906 end = 2053 pages = [] for i in range(start, end): page = read_pdf.getPage(i) page_content = page.extractText() pages.append(page_content) dump_pickle(file, pages) # In[ ]: entries = [] bad_pages = [] for (i, page) in enumerate(pages): if test_page >= 0: i = test_page page = pages[test_page] if i == 0: header_len = 50 else:
def main():
    print('main function called')
    print(ARGS.output)
    if not os.path.exists(ARGS.output):
        print(ARGS.output + ' does not exist, creating the folder...')
        os.makedirs(ARGS.output)

    if ARGS.weighted_training:
        svm_classifier = SVMBuilder.get_svm_pipeline(C=ARGS.C,
                                                     class_weight='balanced')
    else:
        svm_classifier = SVMBuilder.get_svm_pipeline(C=ARGS.C)
    print('SVM classifier pipeline created:')
    # print(svm_classifier.get_params())

    if ARGS.cross_validate:
        for cv_set in glob.glob(ARGS.data_dir + '/set*'):
            ix = int(cv_set[-1:])
            print('--------------------------set {}----------------------------'
                  .format(ix))
            print('Read data from {}'.format(cv_set))
            train = TsvReader.from_tsv(tsv_dir=cv_set + '/train.tsv')
            print('Model training...')
            svm_classifier.fit(train.features, train.labels)
            util.dump_pickle(
                svm_classifier,
                ARGS.output + '/cv_model_set_{}'.format(ix),
                log='SVM model classifier from cross validation set {}'.format(
                    ix),
                overwrite=True)
            print('\tTraining finished. Model saved to {}'.format(
                ARGS.output + '/cv_model_set_{}'.format(ix)))
            if ARGS.prediction:
                print('Predict on test set...')
                test = TsvReader.from_tsv(tsv_dir=cv_set + '/dev.tsv')
                if ARGS.predict_prob:
                    prediction = svm_classifier.predict_proba(test.features)
                    # pdb.set_trace()
                    np.savetxt(
                        ARGS.output +
                        '/cv_test_pred_prob_set_{}.txt'.format(ix),
                        prediction, delimiter='\t')
                    print('\tPrediction finished. Result saved to {}'.format(
                        ARGS.output +
                        '/cv_test_pred_prob_set_{}.txt'.format(ix)))
                else:
                    prediction = svm_classifier.predict(test.features)
                    with open(
                            ARGS.output +
                            '/cv_test_pred_set_{}.txt'.format(ix),
                            'w') as handle:
                        for pred in prediction:
                            handle.write('{}\n'.format(int(pred)))
                    print('\tPrediction finished. Result saved to {}'.format(
                        ARGS.output + '/cv_test_pred_set_{}.txt'.format(ix)))
    else:
        print('Import training data...')
        if ARGS.training_data is not None:
            train = TsvReader.from_tsv(tsv_dir=ARGS.training_data)
        else:
            train = TsvReader.from_tsv(tsv_dir=ARGS.data_dir + '/train.tsv')
        # pdb.set_trace()
        print('Model training...')
        svm_classifier.fit(train.features, train.labels)
        util.dump_pickle(svm_classifier,
                         ARGS.output + '/model_trained.pkl',
                         log='SVM model classifier from training set.',
                         overwrite=True)
        print('\tTraining finished. Model saved to {}'.format(
            ARGS.output + '/model_trained.pkl'))
        if ARGS.prediction:
            print('Predict on test set...')
            test = TsvReader.from_tsv(tsv_dir=ARGS.data_dir + '/test.tsv')
            # pdb.set_trace()
            if ARGS.predict_prob:
                prediction = svm_classifier.predict_proba(test.features)
                np.savetxt(ARGS.output + '/test_pred_prob.txt',
                           prediction, delimiter='\t')
                print('\tPrediction finished. Result saved to {}'.format(
                    ARGS.output + '/test_pred_prob.txt'))
            else:
                prediction = svm_classifier.predict(test.features)
                with open(ARGS.output + '/test_pred.txt', 'w') as handle:
                    for pred in prediction:
                        handle.write('{}\n'.format(int(pred)))
                print('\tPrediction finished. Result saved to {}'.format(
                    ARGS.output + '/test_pred.txt'))
def make_efficient_example(ex):
    image_relpath = ex.image_path
    max_rotate = np.pi / 6
    padding_factor = 1 / 0.85
    scale_up_factor = 1 / 0.85
    scale_down_factor = 1 / 0.85
    shift_factor = 1.2
    base_dst_side = 256

    # Expand the box so that any augmentation rotation up to max_rotate
    # still fits inside the crop.
    box_center = boxlib.center(ex.bbox)
    s, c = np.sin(max_rotate), np.cos(max_rotate)
    w, h = ex.bbox[2:]
    rot_bbox_side = max(c * w + s * h, c * h + s * w)
    rot_bbox = boxlib.box_around(box_center, rot_bbox_side)

    scale_factor = min(
        base_dst_side / np.max(ex.bbox[2:]) * scale_up_factor, 1)
    expansion_factor = padding_factor * shift_factor * scale_down_factor
    expanded_bbox = boxlib.expand(rot_bbox, expansion_factor)
    expanded_bbox = boxlib.intersect(expanded_bbox,
                                     np.array([0, 0, 2048, 2048]))

    new_camera = ex.camera.copy()
    new_camera.intrinsic_matrix[:2, 2] -= expanded_bbox[:2]
    new_camera.scale_output(scale_factor)
    new_camera.undistort()
    dst_shape = improc.rounded_int_tuple(scale_factor * expanded_bbox[[3, 2]])

    new_im_relpath = ex.image_path.replace('3dhp', '3dhp_downscaled')
    new_im_path = os.path.join(paths.DATA_ROOT, new_im_relpath)
    if not (util.is_file_newer(new_im_path, "2019-11-14T23:32:07")
            and improc.is_image_readable(new_im_path)):
        im = improc.imread_jpeg(f'{paths.DATA_ROOT}/{image_relpath}')
        new_im = cameralib.reproject_image(im, ex.camera, new_camera,
                                           dst_shape)
        util.ensure_path_exists(new_im_path)
        imageio.imwrite(new_im_path, new_im)

    new_bbox_topleft = cameralib.reproject_image_points(
        ex.bbox[:2], ex.camera, new_camera)
    new_bbox = np.concatenate([new_bbox_topleft, ex.bbox[2:] * scale_factor])

    mask_rle_relpath = new_im_path.replace('Images', 'FGmaskImages').replace(
        '.jpg', '.pkl')
    mask_rle_path = os.path.join(paths.DATA_ROOT, mask_rle_relpath)
    if util.is_file_newer(mask_rle_path, "2020-03-11T20:46:46"):
        mask_runlength = util.load_pickle(mask_rle_path)
    else:
        mask_relpath = ex.image_path.replace('Images', 'FGmaskImages').replace(
            '.jpg', '.png')
        mask = imageio.imread(os.path.join(paths.DATA_ROOT, mask_relpath))
        mask_reproj = cameralib.reproject_image(
            mask, ex.camera, new_camera, dst_shape)
        mask_runlength = get_mask_with_highest_iou(mask_reproj, new_bbox)
        util.dump_pickle(mask_runlength, mask_rle_path)

    return p3ds.Pose3DExample(
        new_im_relpath, ex.world_coords, new_bbox, new_camera,
        mask=mask_runlength, univ_coords=ex.univ_coords)
if os.path.exists(intermediate_file):
    new_vocab = load_pickle(intermediate_file)
    print(f"Loaded intermediate .pkl file with {len(new_vocab)} entries")
else:
    new_vocab = OrderedDict()

# In[6]:

for i, (word, lst) in tqdm(enumerate(list(vocab.items()))):
    if word in new_vocab:
        continue
    try:
        num_related = get_num_related(word)
    except Exception:
        num_related = -1
    new_vocab[word] = [lst[0], str(num_related)] + lst[2:]
    if i % 25 == 0 and i > 0:
        # store intermediate result every 25 iterations
        dump_pickle("intermediate_importance.pkl", new_vocab)

updated_entries = ['\t'.join([k] + v) for k, v in new_vocab.items()]
write_lines('importance.txt', updated_entries)

driver.close()
# In[18]:

if __name__ == "__main__":
    # Parse arguments
    args = parse_args()
    strategy = args.strategy
    path = args.path
    bpe = args.bpe

    # Get vocab and text
    source_path = join(path, "raw")
    target_path = join(path, "indexed")
    prefix = "bpe_" if bpe else ""
    vocab_file = join(path, f"{prefix}vocab.txt")
    vocab_lines = read_lines(vocab_file)
    vocab = [line.split()[0] for line in vocab_lines]
    indexer = Indexer(vocab)

    splits = ["training", "valid", "test"]
    for split in splits:
        source_filename = f"{prefix}raw_{split}_text.{strategy}.txt"
        dialogues = read_lines(join(source_path, source_filename))
        indexed_dialogues = [
            indexer.index(dialogue) for dialogue in tqdm(dialogues)
        ]
        target_filename = f"{prefix}indexed_{split}_text.{strategy}.pkl"
        dump_pickle(join(target_path, target_filename), indexed_dialogues)
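# Indexer is defined elsewhere in this repo; a minimal sketch of the behavior
# the loop above relies on (token -> vocab-id lookup over the vocab-file line
# order). The unknown-token fallback here is an assumption for illustration,
# not the repo's actual code:

class Indexer:
    def __init__(self, vocab, unk_id=0):
        # Map each vocab token to its line index in the vocab file.
        self.token_to_id = {token: i for i, token in enumerate(vocab)}
        self.unk_id = unk_id  # assumed fallback for out-of-vocabulary tokens

    def index(self, text):
        return [self.token_to_id.get(tok, self.unk_id)
                for tok in text.split()]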