def find_requested_hrefs(all_href_list, py_version, pkg_list):
    """ Filters out everything but the requested urls
    Returns the urls to download the requested installers
    """
    print('Filtering to only requested HREFS')
    # Try progressively looser matching criteria; each pass only looks at
    # the packages the previous pass failed to match.
    found1, unmatched = filter_href_list(all_href_list, pkg_list,
                                         OS_VERSION, py_version)
    found2, unmatched = filter_href_list(all_href_list, unmatched,
                                         OS_VERSION, py_version)
    found3, unmatched = filter_href_list(all_href_list, unmatched, 'x64',
                                         py_version.replace('p', 'P'))
    found4, unmatched = filter_href_list(all_href_list, unmatched, 'any',
                                         py_version.replace('cp', 'py')[0:3])
    if len(unmatched) > 0:
        print('Could not find a match for missing4=%r' % (unmatched,))
        for pkg in unmatched:
            # Crude similarity: distance 0 when pkg is a substring of the
            # href, 100 otherwise (placeholder for a real edit distance).
            dists = [0 if pkg in href else 100 for href in all_href_list]
            ranked = ut.list_argsort(dists)
            print('Perhaps pkg=%r could match one of these?' % (pkg,))
            suggestions = ut.take(all_href_list, ranked[0:3])
            print(ut.indentjoin(suggestions, '\n '))
    return found1 + found2 + found3 + found4
def get_summary(profile_block_list, maxlines=20):
    """
    References:
        https://github.com/rkern/line_profiler
    """
    # Total time per block; blocks without a time sort to the front via -1.
    totals = [get_block_totaltime(block) for block in profile_block_list]
    totals = [-1 if t is None else t for t in totals]
    block_ids = [get_block_id(block) for block in profile_block_list]
    # Sort ids and times together, ascending by time.
    order = ut.list_argsort(totals)
    times_sorted = ut.take(totals, order)
    ids_sorted = ut.take(block_ids, order)
    # Column-align the ids on ':' so the report lines up.
    ids_aligned = ut.util_str.align_lines(ids_sorted, ':')
    summary_lines = [
        ('%6.2f seconds - ' % t) + line
        for t, line in zip(times_sorted, ids_aligned)
    ]
    # Keep only the last (slowest) maxlines entries.
    clipped = ut.listclip(summary_lines, maxlines, fromback=True)
    return '\n'.join(clipped)
def find_requested_hrefs(all_href_list, py_version, pkg_list):
    """ Filters out everything but the requested urls
    Returns the urls to download the requested installers
    """
    print('Filtering to only requested HREFS')
    # Matching criteria, from strictest to loosest; each stage retries only
    # the packages left unmatched by the previous stage.
    stages = [
        (OS_VERSION, py_version),
        (OS_VERSION, py_version),
        ('x64', py_version.replace('p', 'P')),
        ('any', py_version.replace('cp', 'py')[0:3]),
    ]
    matched_lists = []
    pending = pkg_list
    for platform_tag, pyver_tag in stages:
        matched, pending = filter_href_list(all_href_list, pending,
                                            platform_tag, pyver_tag)
        matched_lists.append(matched)
    if len(pending) > 0:
        print('Could not find a match for missing4=%r' % (pending,))
        for pkg in pending:
            # Substring test stands in for a true string distance.
            dist_list = [0 if pkg in href else 100
                         for href in all_href_list]
            closest_xs = ut.list_argsort(dist_list)
            print('Perhaps pkg=%r could match one of these?' % (pkg,))
            closest_hrefs = ut.take(all_href_list, closest_xs[0:3])
            print(ut.indentjoin(closest_hrefs, '\n '))
    href_list = (matched_lists[0] + matched_lists[1] +
                 matched_lists[2] + matched_lists[3])
    return href_list
def biggest_files(drive):
    """ Print the ten largest registered files on ``drive``. """
    print('Biggest Files in %r' % (drive,))
    # Indices of files ordered largest-first.
    order = ut.list_argsort(drive.fpath_bytes_list)[::-1]
    top10 = order[0:10]
    top_nbytes = ut.take(drive.fpath_bytes_list, top10)
    top_fpaths = ut.take(drive.fpath_list, top10)
    # Pair a human-readable size with each path.
    biginfo_list = [(ut.byte_str2(nbytes), fpath)
                    for nbytes, fpath in zip(top_nbytes, top_fpaths)]
    print(ut.list_str(biginfo_list, strvals=True))
def biggest_files(drive):
    """ Print the ten largest registered files on ``drive``. """
    print('Biggest Files in %r' % (drive,))
    # Largest files first.
    descending = ut.list_argsort(drive.fpath_bytes_list)[::-1]
    selection = descending[0:10]
    sizes = ut.take(drive.fpath_bytes_list, selection)
    paths = ut.take(drive.fpath_list, selection)
    # (human-readable size, path) pairs for display.
    report_rows = [(ut.byte_str2(size), path)
                   for size, path in zip(sizes, paths)]
    print(ut.repr2(report_rows, strvals=True))
def biggest_dirs(drive):
    """ Print the ten directories on ``drive`` with the largest total file size.

    Args:
        drive: registry object exposing ``dpath_list``, ``dpath_to_fidx``
            (dir path -> file indices), ``fpath_bytes_list``.

    Side effects:
        Prints a ranked (size, dpath) report; returns None.
    """
    print('Biggest Dirs in %r' % (drive,))
    dpath_list = drive.dpath_list
    # File indices grouped per directory.
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
    unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list,
                                                  fidxs_list)
    # Total bytes per directory.
    dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))
    sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
    sel = sortx[0:10]
    biggest_nbytes = ut.take(dpath_nbytes_list, sel)
    biggest_dpaths = ut.take(dpath_list, sel)
    biginfo_list = list(zip(map(ut.byte_str2, biggest_nbytes),
                            biggest_dpaths))
    print(ut.list_str(biginfo_list, strvals=True))
    # NOTE: dead trailing `pass` removed; the function already ends here.
def biggest_dirs(drive):
    """ Print the ten directories on ``drive`` with the largest total file size.

    Args:
        drive: registry object exposing ``dpath_list``, ``dpath_to_fidx``
            (dir path -> file indices), ``fpath_bytes_list``.

    Side effects:
        Prints a ranked (size, dpath) report; returns None.
    """
    print('Biggest Dirs in %r' % (drive, ))
    dpath_list = drive.dpath_list
    # File indices grouped per directory.
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
    unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list,
                                                  fidxs_list)
    # Total bytes per directory.
    dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))
    sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
    sel = sortx[0:10]
    biggest_nbytes = ut.take(dpath_nbytes_list, sel)
    biggest_dpaths = ut.take(dpath_list, sel)
    biginfo_list = list(
        zip(map(ut.byte_str2, biggest_nbytes), biggest_dpaths))
    print(ut.repr2(biginfo_list, strvals=True))
    # NOTE: dead trailing `pass` removed; the function already ends here.
def ensure_pz_mtest_mergesplit_test():
    r"""
    Make a test database for MERGE and SPLIT cases

    CommandLine:
        python -m ibeis.init.sysres --test-ensure_pz_mtest_mergesplit_test

    Example:
        >>> # SCRIPT
        >>> from ibeis.init.sysres import *  # NOQA
        >>> ensure_pz_mtest_mergesplit_test()
    """
    import ibeis
    ibeis.ensure_pz_mtest()
    workdir = ibeis.sysres.get_workdir()
    mtest_dbpath = join(workdir, 'PZ_MTEST')
    source_dbdir = mtest_dbpath
    dest_dbdir = join(workdir, 'PZ_MERGESPLIT_MTEST')
    # --reset wipes any previous copy; otherwise an existing copy is reused.
    if ut.get_argflag('--reset'):
        ut.delete(dest_dbdir)
    if ut.checkpath(dest_dbdir):
        return
    copy_ibeisdb(source_dbdir, dest_dbdir)
    ibs = ibeis.opendb('PZ_MERGESPLIT_MTEST')
    # Sanity-check the copied MTEST contents before mutating names.
    assert len(ibs.get_valid_aids()) == 119
    assert len(ibs.get_valid_nids()) == 41
    aid_list = ibs.get_valid_aids()
    aids_list, nid_list = ibs.group_annots_by_name(aid_list)
    num_aids = list(map(len, aids_list))
    # num cases wanted
    num_merge = 3
    num_split = 1
    num_combo = 1
    # num inputs needed
    num_merge_names = num_merge
    num_split_names = num_split * 2
    num_combo_names = num_combo * 3
    total_names = num_merge_names + num_split_names + num_combo_names
    # Take the names with the most annotations so each case has material.
    modify_aids = ut.take(aids_list,
                          ut.list_argsort(num_aids,
                                          reverse=True)[0:total_names])
    merge_nids1 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    merge_nids2 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    split_nid = ibs.make_next_nids(num_split, location_text='XSPLIT')[0]
    combo_nids = ibs.make_next_nids(num_combo * 2, location_text='XCOMBO')
    # the first 3 become merge cases
    #left = 0
    #right = left + num_merge
    for aids, nid1, nid2 in zip(modify_aids[0:3], merge_nids1, merge_nids2):
        #ibs.get_annot_nids(aids)
        aids_ = aids[::2]
        # NOTE(review): both assignments target the same even-indexed subset
        # `aids_`, so nid1 is immediately overwritten by nid2 — possibly the
        # second call was meant to use the odd half; confirm intent.
        ibs.set_annot_name_rowids(aids_, [nid1] * len(aids_))
        ibs.set_annot_name_rowids(aids_, [nid2] * len(aids_))
    # the next 2 become split cases
    #left = right
    #right = left + num_split_names
    for aids in modify_aids[3:5]:
        # Collapse each of these two names onto a single shared nid.
        ibs.set_annot_name_rowids(aids, [split_nid] * len(aids))
    #left = right
    #right = left + num_combo_names
    # The final 3 are a combination case
    for aids in modify_aids[5:8]:
        # Alternate annots between the two combo nids.
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even,
                                  [combo_nids[0]] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd,
                                  [combo_nids[1]] * len(aids_odd))
    final_result = ibs.unflat_map(ibs.get_annot_nids, modify_aids)
    print('final_result = %s' % (ut.list_str(final_result),))
def ensure_pz_mtest_mergesplit_test():
    r"""
    Make a test database for MERGE and SPLIT cases

    CommandLine:
        python -m ibeis.init.sysres --test-ensure_pz_mtest_mergesplit_test

    Example:
        >>> # SCRIPT
        >>> from ibeis.init.sysres import *  # NOQA
        >>> ensure_pz_mtest_mergesplit_test()
    """
    import ibeis
    ibeis.ensure_pz_mtest()
    workdir = ibeis.sysres.get_workdir()
    mtest_dbpath = join(workdir, 'PZ_MTEST')
    source_dbdir = mtest_dbpath
    dest_dbdir = join(workdir, 'PZ_MERGESPLIT_MTEST')
    # --reset wipes any previous copy; otherwise an existing copy is reused.
    if ut.get_argflag('--reset'):
        ut.delete(dest_dbdir)
    if ut.checkpath(dest_dbdir):
        return
    copy_ibeisdb(source_dbdir, dest_dbdir)
    ibs = ibeis.opendb('PZ_MERGESPLIT_MTEST')
    # Sanity-check the copied MTEST contents before mutating names.
    assert len(ibs.get_valid_aids()) == 119
    assert len(ibs.get_valid_nids()) == 41
    aid_list = ibs.get_valid_aids()
    aids_list, nid_list = ibs.group_annots_by_name(aid_list)
    num_aids = list(map(len, aids_list))
    # num cases wanted
    num_merge = 3
    num_split = 1
    num_combo = 1
    # num inputs needed
    num_merge_names = num_merge
    num_split_names = num_split * 2
    num_combo_names = num_combo * 3
    total_names = num_merge_names + num_split_names + num_combo_names
    # Take the names with the most annotations so each case has material.
    modify_aids = ut.take(
        aids_list,
        ut.list_argsort(num_aids, reverse=True)[0:total_names])
    merge_nids1 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    merge_nids2 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    split_nid = ibs.make_next_nids(num_split, location_text='XSPLIT')[0]
    combo_nids = ibs.make_next_nids(num_combo * 2, location_text='XCOMBO')
    # the first 3 become merge cases
    #left = 0
    #right = left + num_merge
    for aids, nid1, nid2 in zip(modify_aids[0:3], merge_nids1, merge_nids2):
        #ibs.get_annot_nids(aids)
        aids_ = aids[::2]
        # NOTE(review): both assignments target the same even-indexed subset
        # `aids_`, so nid1 is immediately overwritten by nid2 — possibly the
        # second call was meant to use the odd half; confirm intent.
        ibs.set_annot_name_rowids(aids_, [nid1] * len(aids_))
        ibs.set_annot_name_rowids(aids_, [nid2] * len(aids_))
    # the next 2 become split cases
    #left = right
    #right = left + num_split_names
    for aids in modify_aids[3:5]:
        # Collapse each of these two names onto a single shared nid.
        ibs.set_annot_name_rowids(aids, [split_nid] * len(aids))
    #left = right
    #right = left + num_combo_names
    # The final 3 are a combination case
    for aids in modify_aids[5:8]:
        # Alternate annots between the two combo nids.
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even,
                                  [combo_nids[0]] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd,
                                  [combo_nids[1]] * len(aids_odd))
    final_result = ibs.unflat_map(ibs.get_annot_nids, modify_aids)
    print('final_result = %s' % (ut.list_str(final_result), ))
def fix_duplicates(drive):
    r"""
    for every duplicate file passing a (eg avi) filter, remove the file
    that is in the smallest directory. On a tie use the smallest dpath.
    This will filter all duplicate files in a folder into a single folder.

    but... need to look at non-duplicates in that folder and decide if they
    should be moved as well. So, should trigger on folders that have at
    least 50% duplicate. Might not want to move curated folders.

    Example:
        cd ~/local/scripts
        >>> from register_files import *  # NOQA
        >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
        >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
        >>> E = drive = drives[0]
        >>> #D, E, F = drives
    """
    print('Fixing Duplicates in %r' % (drive,))
    # Map each file hash to the indices of files with that hash.
    list_ = drive.fpath_hashX_list
    multiindex_dict_ = build_multindex(list_)
    # Hashes that occur more than once are duplicate candidates.
    duplicate_hashes = [
        key for key, val in six.iteritems(multiindex_dict_)
        if len(val) > 1
    ]
    duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    # Check if any dups have been removed
    still_exists = ut.unflat_map(exists, unflat_fpaths)
    unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
    # Keep only groups that still have 2+ surviving files on disk.
    duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
    # Look at duplicate files
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                       duplicate_idxs)
    # Find highly coupled directories
    if True:
        coupled_dirs = []
        for fpaths in unflat_fpaths:
            #basedir = ut.longest_existing_path(commonprefix(fpaths))
            # Every unordered pair of parent directories in this dup group
            # counts as one coupling.
            dirs = sorted(list(map(dirname, fpaths)))
            _list = list(range(len(dirs)))
            idxs = ut.upper_diag_self_prodx(_list)
            coupled_dirs.extend(list(map(tuple,
                                         ut.list_unflat_take(dirs, idxs))))
        # Rank directory pairs by how often they share duplicates.
        hist_ = ut.dict_hist(coupled_dirs)
        coupled_idxs = ut.list_argsort(hist_.values())[::-1]
        most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
        print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
    print('%d unique files are duplicated' % (len(unflat_sizes),))
    #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
    print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
    # Find duplicate directories
    dpath_list = list(drive.dpath_to_fidx.keys())
    # NOTE(review): keys are taken from dpath_to_fidx but the lookup below
    # iterates drive.dpath_list — if their orderings differ, dpath_list and
    # fidxs_list are misaligned when indexed together later; confirm.
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
    #exists_list = list(map(exists, drive.fpath_list))
    #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
    fname_registry = [basename(fpath) for fpath in drive.fpath_list]
    unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
    def unsorted_list_hash(list_):
        # Order-insensitive fingerprint of a directory's filenames.
        return ut.hashstr27(str(sorted(list_)))
    unflat_fname_sets = list(map(unsorted_list_hash,
                                 ut.ProgIter(unflat_fnames, freq=10000)))
    fname_based_duplicate_dpaths = []
    # Directories whose filename-set fingerprints collide are likely clones.
    multiindex_dict2_ = build_multindex(unflat_fname_sets)
    fname_based_duplicate_hashes = [key for key, val
                                    in multiindex_dict2_.items()
                                    if len(val) > 1]
    print('#fname_based_duplicate_dpaths = %r' %
          (len(fname_based_duplicate_hashes),))
    fname_based_duplicate_didxs = ut.dict_take(
        multiindex_dict2_, fname_based_duplicate_hashes)
    fname_based_duplicate_dpaths = ut.list_unflat_take(
        dpath_list, fname_based_duplicate_didxs)
    print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
def fix_duplicates(drive):
    r"""
    for every duplicate file passing a (eg avi) filter, remove the file
    that is in the smallest directory. On a tie use the smallest dpath.
    This will filter all duplicate files in a folder into a single folder.

    but... need to look at non-duplicates in that folder and decide if they
    should be moved as well. So, should trigger on folders that have at
    least 50% duplicate. Might not want to move curated folders.

    Example:
        cd ~/local/scripts
        >>> from register_files import *  # NOQA
        >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
        >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
        >>> E = drive = drives[0]
        >>> #D, E, F = drives
    """
    print('Fixing Duplicates in %r' % (drive, ))
    # Map each file hash to the indices of files with that hash.
    list_ = drive.fpath_hashX_list
    multiindex_dict_ = build_multindex(list_)
    # Hashes that occur more than once are duplicate candidates.
    duplicate_hashes = [
        key for key, val in six.iteritems(multiindex_dict_)
        if len(val) > 1
    ]
    duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    # Check if any dups have been removed
    still_exists = ut.unflat_map(exists, unflat_fpaths)
    unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
    # Keep only groups that still have 2+ surviving files on disk.
    duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
    # Look at duplicate files
    unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
    unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                       duplicate_idxs)
    # Find highly coupled directories
    if True:
        coupled_dirs = []
        for fpaths in unflat_fpaths:
            #basedir = ut.longest_existing_path(commonprefix(fpaths))
            # Every unordered pair of parent directories in this dup group
            # counts as one coupling.
            dirs = sorted(list(map(dirname, fpaths)))
            _list = list(range(len(dirs)))
            idxs = ut.upper_diag_self_prodx(_list)
            coupled_dirs.extend(
                list(map(tuple, ut.list_unflat_take(dirs, idxs))))
        # Rank directory pairs by how often they share duplicates.
        hist_ = ut.dict_hist(coupled_dirs)
        coupled_idxs = ut.list_argsort(hist_.values())[::-1]
        most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
        print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
    print('%d unique files are duplicated' % (len(unflat_sizes), ))
    #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
    #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
    print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
    # Find duplicate directories
    dpath_list = list(drive.dpath_to_fidx.keys())
    # NOTE(review): keys are taken from dpath_to_fidx but the lookup below
    # iterates drive.dpath_list — if their orderings differ, dpath_list and
    # fidxs_list are misaligned when indexed together later; confirm.
    fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
    #exists_list = list(map(exists, drive.fpath_list))
    #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
    fname_registry = [basename(fpath) for fpath in drive.fpath_list]
    unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
    def unsorted_list_hash(list_):
        # Order-insensitive fingerprint of a directory's filenames.
        return ut.hashstr27(str(sorted(list_)))
    unflat_fname_sets = list(
        map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
    fname_based_duplicate_dpaths = []
    # Directories whose filename-set fingerprints collide are likely clones.
    multiindex_dict2_ = build_multindex(unflat_fname_sets)
    fname_based_duplicate_hashes = [
        key for key, val in multiindex_dict2_.items() if len(val) > 1
    ]
    print('#fname_based_duplicate_dpaths = %r' %
          (len(fname_based_duplicate_hashes), ))
    fname_based_duplicate_didxs = ut.dict_take(
        multiindex_dict2_, fname_based_duplicate_hashes)
    fname_based_duplicate_dpaths = ut.list_unflat_take(
        dpath_list, fname_based_duplicate_didxs)
    print(ut.repr3(fname_based_duplicate_dpaths[0:10]))