Example 1
def find_requested_hrefs(all_href_list, py_version, pkg_list):
    """
    Filters out everything but the requested urls
    Returns the urls to download the requested installers
    """
    print('Filtering to only requested HREFS')
    href_list1, missing1 = filter_href_list(all_href_list, pkg_list, OS_VERSION, py_version)
    #print('missing1 = %r' % (missing1,))
    href_list2, missing2 = filter_href_list(all_href_list, missing1, OS_VERSION, py_version)
    #print('missing2 = %r' % (missing2,))
    #print(href_list2)
    href_list3, missing3 = filter_href_list(all_href_list, missing2, 'x64', py_version.replace('p', 'P'))
    #print('missing3 = %r' % (missing3,))
    href_list4, missing4 = filter_href_list(all_href_list, missing3, 'any', py_version.replace('cp', 'py')[0:3])

    if len(missing4) > 0:
        print('Could not find a match for missing4=%r' % (missing4,))
        #import Levenshtein
        for pkg in missing4:
            #dist_list = [Levenshtein.distance(href, pkg) for href in all_href_list]
            dist_list = [0 if (href.find(pkg) > -1) else 100 for href in all_href_list]
            closest_matche_xs = ut.list_argsort(dist_list)[::1]
            print('Perhaps pkg=%r could match one of these?' % (pkg,))
            closest_hrefs = ut.take(all_href_list, closest_matche_xs[0:3])
            print(ut.indentjoin(closest_hrefs, '\n   '))

    href_list = href_list1 + href_list2 + href_list3 + href_list4
    return href_list
Example 2
def get_summary(profile_block_list, maxlines=20):
    """
    References:
        https://github.com/rkern/line_profiler
    """
    time_list = [get_block_totaltime(block) for block in profile_block_list]
    time_list = [time if time is not None else -1 for time in time_list]
    blockid_list = [get_block_id(block) for block in profile_block_list]
    sortx = ut.list_argsort(time_list)
    sorted_time_list = ut.take(time_list, sortx)
    sorted_blockid_list = ut.take(blockid_list, sortx)

    aligned_blockid_list = ut.util_str.align_lines(sorted_blockid_list, ':')
    summary_lines = [('%6.2f seconds - ' % time) + line
                     for time, line in
                     zip(sorted_time_list, aligned_blockid_list)]
    #summary_header = ut.codeblock(
    #    '''
    #    CLEANED PROFILE OUTPUT

    #    The Pystone timings are not from kernprof, so they may include kernprof
    #    overhead, whereas kernprof timings do not (unless the line being
    #    profiled is also decorated with kernprof)

    #    The kernprof times are reported in Timer Units

    #    ''')
    summary_lines_ = ut.listclip(summary_lines, maxlines, fromback=True)
    summary_text = '\n'.join(summary_lines_)
    return summary_text
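
A minimal sketch of what the ut.list_argsort / ut.take / ut.listclip chain above is assumed to do, using only the standard library (the helper name is made up, and the ':'-column alignment done by ut.util_str.align_lines is omitted):

def summarize_blocks(time_list, blockid_list, maxlines=20):
    # Rough stand-in for the utool calls in get_summary (assumed semantics)
    time_list = [-1 if t is None else t for t in time_list]
    # argsort ascending, then gather both lists in that index order
    sortx = sorted(range(len(time_list)), key=time_list.__getitem__)
    lines = ['%6.2f seconds - %s' % (time_list[x], blockid_list[x]) for x in sortx]
    # keep only the last maxlines entries, i.e. the slowest blocks
    return '\n'.join(lines[-maxlines:])

# summarize_blocks([0.5, None, 3.2], ['foo.py:10', 'bar.py:22', 'baz.py:7'])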
Example 3
def find_requested_hrefs(all_href_list, py_version, pkg_list):
    """
    Filters out everything but the requested urls
    Returns the urls to download the requested installers
    """
    print('Filtering to only requested HREFS')
    href_list1, missing1 = filter_href_list(all_href_list, pkg_list,
                                            OS_VERSION, py_version)
    #print('missing1 = %r' % (missing1,))
    href_list2, missing2 = filter_href_list(all_href_list, missing1,
                                            OS_VERSION, py_version)
    #print('missing2 = %r' % (missing2,))
    #print(href_list2)
    href_list3, missing3 = filter_href_list(all_href_list, missing2, 'x64',
                                            py_version.replace('p', 'P'))
    #print('missing3 = %r' % (missing3,))
    href_list4, missing4 = filter_href_list(
        all_href_list, missing3, 'any',
        py_version.replace('cp', 'py')[0:3])

    if len(missing4) > 0:
        print('Could not find a match for missing4=%r' % (missing4, ))
        #import Levenshtein
        for pkg in missing4:
            #dist_list = [Levenshtein.distance(href, pkg) for href in all_href_list]
            dist_list = [
                0 if (href.find(pkg) > -1) else 100 for href in all_href_list
            ]
            closest_matche_xs = ut.list_argsort(dist_list)[::1]
            print('Perhaps pkg=%r could match one of these?' % (pkg, ))
            closest_hrefs = ut.take(all_href_list, closest_matche_xs[0:3])
            print(ut.indentjoin(closest_hrefs, '\n   '))

    href_list = href_list1 + href_list2 + href_list3 + href_list4
    return href_list
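
The commented-out Levenshtein idea can be approximated without that dependency. Below is a sketch of the "suggest the closest hrefs for an unmatched package" step; the helper is hypothetical (not part of the original script) and assumes ut.list_argsort sorts ascending:

import difflib

def suggest_closest_hrefs(all_href_list, pkg, num=3):
    # Higher ratio means more similar; negate so an ascending sort puts
    # the best candidates first (mirroring the dist_list/argsort pattern above)
    dist_list = [-difflib.SequenceMatcher(None, href, pkg).ratio()
                 for href in all_href_list]
    sortx = sorted(range(len(dist_list)), key=dist_list.__getitem__)
    return [all_href_list[x] for x in sortx[:num]]

# suggest_closest_hrefs(['numpy-1.9.2-cp27-none-win_amd64.whl',
#                        'scipy-0.16.0-cp27-none-win_amd64.whl'], 'numpy')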
Example 4
    def biggest_files(drive):
        print('Biggest Files in %r' % (drive,))
        sortx = ut.list_argsort(drive.fpath_bytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(drive.fpath_bytes_list, sel)
        biggest_files = ut.take(drive.fpath_list, sel)
        biginfo_list = list(zip(map(ut.byte_str2, biggest_nbytes), biggest_files))
        print(ut.list_str(biginfo_list, strvals=True))
Example 5
    def biggest_files(drive):
        print('Biggest Files in %r' % (drive, ))
        sortx = ut.list_argsort(drive.fpath_bytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(drive.fpath_bytes_list, sel)
        biggest_files = ut.take(drive.fpath_list, sel)
        biginfo_list = list(
            zip(map(ut.byte_str2, biggest_nbytes), biggest_files))
        print(ut.repr2(biginfo_list, strvals=True))
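
The same top-k selection can be written with the standard library alone; a small illustrative sketch (the function name is made up for this example):

import heapq

def biggest_files_plain(fpath_list, fpath_bytes_list, num=10):
    # Equivalent in spirit to ut.list_argsort(...)[::-1][0:num] followed by ut.take
    return heapq.nlargest(num, zip(fpath_bytes_list, fpath_list))

# biggest_files_plain(['a.avi', 'b.txt'], [10**9, 42])
# -> [(1000000000, 'a.avi'), (42, 'b.txt')]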
Example 6
    def biggest_dirs(drive):
        print('Biggest Dirs in %r' % (drive,))
        dpath_list = drive.dpath_list
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
        unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list, fidxs_list)
        dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))

        sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(dpath_nbytes_list, sel)
        biggest_dpaths = ut.take(dpath_list, sel)
        biginfo_list = list(zip(map(ut.byte_str2, biggest_nbytes), biggest_dpaths))
        print(ut.list_str(biginfo_list, strvals=True))
        pass
Example 7
    def biggest_dirs(drive):
        print('Biggest Dirs in %r' % (drive, ))
        dpath_list = drive.dpath_list
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, dpath_list)
        unflat_dpath_bytes_list = ut.list_unflat_take(drive.fpath_bytes_list,
                                                      fidxs_list)
        dpath_nbytes_list = list(map(sum, unflat_dpath_bytes_list))

        sortx = ut.list_argsort(dpath_nbytes_list)[::-1]
        sel = sortx[0:10]
        biggest_nbytes = ut.take(dpath_nbytes_list, sel)
        biggest_dpaths = ut.take(dpath_list, sel)
        biginfo_list = list(
            zip(map(ut.byte_str2, biggest_nbytes), biggest_dpaths))
        print(ut.repr2(biginfo_list, strvals=True))
        pass
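
The per-directory totals can also be computed directly from the flat file registry. A sketch with only the standard library (the helper name is hypothetical; it groups by immediate parent directory only, which is a simplification of the dpath_to_fidx index above):

import os
from collections import defaultdict

def biggest_dirs_plain(fpath_list, fpath_bytes_list, num=10):
    # Sum file sizes per parent directory, then rank directories by total size
    dpath_nbytes = defaultdict(int)
    for fpath, nbytes in zip(fpath_list, fpath_bytes_list):
        dpath_nbytes[os.path.dirname(fpath)] += nbytes
    return sorted(dpath_nbytes.items(), key=lambda kv: kv[1], reverse=True)[:num]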
Example 8
def ensure_pz_mtest_mergesplit_test():
    r"""
    Make a test database for MERGE and SPLIT cases

    CommandLine:
        python -m ibeis.init.sysres --test-ensure_pz_mtest_mergesplit_test

    Example:
        >>> # SCRIPT
        >>> from ibeis.init.sysres import *  # NOQA
        >>> ensure_pz_mtest_mergesplit_test()
    """
    import ibeis
    ibeis.ensure_pz_mtest()
    workdir = ibeis.sysres.get_workdir()
    mtest_dbpath = join(workdir, 'PZ_MTEST')

    source_dbdir = mtest_dbpath
    dest_dbdir = join(workdir, 'PZ_MERGESPLIT_MTEST')

    if ut.get_argflag('--reset'):
        ut.delete(dest_dbdir)
    if ut.checkpath(dest_dbdir):
        return

    copy_ibeisdb(source_dbdir, dest_dbdir)

    ibs = ibeis.opendb('PZ_MERGESPLIT_MTEST')
    assert len(ibs.get_valid_aids()) == 119
    assert len(ibs.get_valid_nids()) == 41

    aid_list = ibs.get_valid_aids()
    aids_list, nid_list = ibs.group_annots_by_name(aid_list)
    num_aids = list(map(len, aids_list))

    # num cases wanted
    num_merge = 3
    num_split = 1
    num_combo = 1

    # num inputs needed
    num_merge_names = num_merge
    num_split_names = num_split * 2
    num_combo_names = num_combo * 3

    total_names = num_merge_names + num_split_names + num_combo_names

    modify_aids = ut.take(aids_list, ut.list_argsort(num_aids, reverse=True)[0:total_names])

    merge_nids1 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    merge_nids2 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    split_nid = ibs.make_next_nids(num_split, location_text='XSPLIT')[0]
    combo_nids = ibs.make_next_nids(num_combo * 2, location_text='XCOMBO')

    # the first 3 become merge cases
    #left = 0
    #right = left + num_merge
    for aids, nid1, nid2 in zip(modify_aids[0:3], merge_nids1, merge_nids2):
        #ibs.get_annot_nids(aids)
        # assign alternating annots to two different new names so they need merging
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even, [nid1] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd, [nid2] * len(aids_odd))

    # the next 2 become split cases
    #left = right
    #right = left + num_split_names
    for aids in modify_aids[3:5]:
        ibs.set_annot_name_rowids(aids, [split_nid] * len(aids))

    #left = right
    #right = left + num_combo_names
    # The final 3 are a combination case
    for aids in modify_aids[5:8]:
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even, [combo_nids[0]] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd, [combo_nids[1]] * len(aids_odd))

    final_result = ibs.unflat_map(ibs.get_annot_nids, modify_aids)
    print('final_result = %s' % (ut.list_str(final_result),))
Example 9
def ensure_pz_mtest_mergesplit_test():
    r"""
    Make a test database for MERGE and SPLIT cases

    CommandLine:
        python -m ibeis.init.sysres --test-ensure_pz_mtest_mergesplit_test

    Example:
        >>> # SCRIPT
        >>> from ibeis.init.sysres import *  # NOQA
        >>> ensure_pz_mtest_mergesplit_test()
    """
    import ibeis
    ibeis.ensure_pz_mtest()
    workdir = ibeis.sysres.get_workdir()
    mtest_dbpath = join(workdir, 'PZ_MTEST')

    source_dbdir = mtest_dbpath
    dest_dbdir = join(workdir, 'PZ_MERGESPLIT_MTEST')

    if ut.get_argflag('--reset'):
        ut.delete(dest_dbdir)
    if ut.checkpath(dest_dbdir):
        return

    copy_ibeisdb(source_dbdir, dest_dbdir)

    ibs = ibeis.opendb('PZ_MERGESPLIT_MTEST')
    assert len(ibs.get_valid_aids()) == 119
    assert len(ibs.get_valid_nids()) == 41

    aid_list = ibs.get_valid_aids()
    aids_list, nid_list = ibs.group_annots_by_name(aid_list)
    num_aids = list(map(len, aids_list))

    # num cases wanted
    num_merge = 3
    num_split = 1
    num_combo = 1

    # num inputs needed
    num_merge_names = num_merge
    num_split_names = num_split * 2
    num_combo_names = num_combo * 3

    total_names = num_merge_names + num_split_names + num_combo_names

    modify_aids = ut.take(
        aids_list,
        ut.list_argsort(num_aids, reverse=True)[0:total_names])

    merge_nids1 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    merge_nids2 = ibs.make_next_nids(num_merge, location_text='XMERGE')
    split_nid = ibs.make_next_nids(num_split, location_text='XSPLIT')[0]
    combo_nids = ibs.make_next_nids(num_combo * 2, location_text='XCOMBO')

    # the first 3 become merge cases
    #left = 0
    #right = left + num_merge
    for aids, nid1, nid2 in zip(modify_aids[0:3], merge_nids1, merge_nids2):
        #ibs.get_annot_nids(aids)
        # assign alternating annots to two different new names so they need merging
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even, [nid1] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd, [nid2] * len(aids_odd))

    # the next 2 become split cases
    #left = right
    #right = left + num_split_names
    for aids in modify_aids[3:5]:
        ibs.set_annot_name_rowids(aids, [split_nid] * len(aids))

    #left = right
    #right = left + num_combo_names
    # The final 3 are a combination case
    for aids in modify_aids[5:8]:
        aids_even = aids[::2]
        aids_odd = aids[1::2]
        ibs.set_annot_name_rowids(aids_even, [combo_nids[0]] * len(aids_even))
        ibs.set_annot_name_rowids(aids_odd, [combo_nids[1]] * len(aids_odd))

    final_result = ibs.unflat_map(ibs.get_annot_nids, modify_aids)
    print('final_result = %s' % (ut.list_str(final_result), ))
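
The densest step above is the modify_aids selection, which keeps the names with the most annotations. A plain-Python sketch of just that step, assuming ut.list_argsort(reverse=True) returns indices ordered by descending value (the helper name is made up):

def take_largest_groups(aids_list, total_names):
    # Rank name-groups by annotation count and keep the largest ones
    num_aids = [len(aids) for aids in aids_list]
    sortx = sorted(range(len(num_aids)), key=num_aids.__getitem__, reverse=True)
    return [aids_list[x] for x in sortx[:total_names]]

# take_largest_groups([[1], [2, 3, 4], [5, 6]], 2) -> [[2, 3, 4], [5, 6]]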
Example 10
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that is in the smallest directory; on a tie use the smallest dpath.
        This consolidates all duplicates of a file into a single folder.

        However, the non-duplicates in that folder also need to be examined to
        decide whether they should be moved as well, so this should only
        trigger on folders that are at least 50% duplicates. Curated folders
        probably should not be moved.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive,))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_)
            if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list, duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.list_str(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes),))
        #print('Duplicate sizes: ' + ut.list_str(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.list_str(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)
        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))
        unflat_fname_sets = list(map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [key for key, val in multiindex_dict2_.items() if len(val) > 1]
        print('#fname_based_duplicate_dpaths = %r' % (len(fname_based_duplicate_hashes),))
        fname_based_duplicate_didxs = ut.dict_take(multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
Example 11
    def fix_duplicates(drive):
        r"""
        For every duplicate file passing a filter (e.g. avi), remove the copy
        that is in the smallest directory; on a tie use the smallest dpath.
        This consolidates all duplicates of a file into a single folder.

        However, the non-duplicates in that folder also need to be examined to
        decide whether they should be moved as well, so this should only
        trigger on folders that are at least 50% duplicates. Curated folders
        probably should not be moved.

        Example:
            cd ~/local/scripts
            >>> from register_files import *  # NOQA
            >>> dpaths = ut.get_argval('--drives', type_=list, default=['E:/'])#'D:/', 'E:/', 'F:/'])
            >>> drives = [Drive(root_dpath) for root_dpath in dpaths]
            >>> E = drive = drives[0]
            >>> #D, E, F = drives
        """
        print('Fixing Duplicates in %r' % (drive, ))
        list_ = drive.fpath_hashX_list
        multiindex_dict_ = build_multindex(list_)
        duplicate_hashes = [
            key for key, val in six.iteritems(multiindex_dict_) if len(val) > 1
        ]
        duplicate_idxs = ut.dict_take(multiindex_dict_, duplicate_hashes)
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        # Check if any dups have been removed
        still_exists = ut.unflat_map(exists, unflat_fpaths)
        unflat_idxs2 = ut.zipcompress(duplicate_idxs, still_exists)
        duplicate_idxs = [idxs for idxs in unflat_idxs2 if len(idxs) > 1]
        # Look at duplicate files
        unflat_fpaths = ut.list_unflat_take(drive.fpath_list, duplicate_idxs)
        unflat_sizes = ut.list_unflat_take(drive.fpath_bytes_list,
                                           duplicate_idxs)
        # Find highly coupled directories
        if True:
            coupled_dirs = []
            for fpaths in unflat_fpaths:
                #basedir = ut.longest_existing_path(commonprefix(fpaths))
                dirs = sorted(list(map(dirname, fpaths)))
                _list = list(range(len(dirs)))
                idxs = ut.upper_diag_self_prodx(_list)
                coupled_dirs.extend(
                    list(map(tuple, ut.list_unflat_take(dirs, idxs))))
            hist_ = ut.dict_hist(coupled_dirs)
            coupled_idxs = ut.list_argsort(hist_.values())[::-1]
            most_coupled = ut.take(list(hist_.keys()), coupled_idxs[0:100])
            print('Coupled fpaths: ' + ut.repr2(most_coupled, nl=True))
        print('%d unique files are duplicated' % (len(unflat_sizes), ))
        #print('Duplicate sizes: ' + ut.repr2(unflat_sizes[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0:10], nl=True))
        #print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths[0::5], nl=True))
        print('Duplicate fpaths: ' + ut.repr2(unflat_fpaths, nl=True))
        # Find duplicate directories
        dpath_list = list(drive.dpath_to_fidx.keys())
        fidxs_list = ut.dict_take(drive.dpath_to_fidx, drive.dpath_list)
        #exists_list = list(map(exists, drive.fpath_list))
        #unflat_exists = ut.list_unflat_take(exists_list, fidxs_list)
        fname_registry = [basename(fpath) for fpath in drive.fpath_list]
        unflat_fnames = ut.list_unflat_take(fname_registry, fidxs_list)

        def unsorted_list_hash(list_):
            return ut.hashstr27(str(sorted(list_)))

        unflat_fname_sets = list(
            map(unsorted_list_hash, ut.ProgIter(unflat_fnames, freq=10000)))
        fname_based_duplicate_dpaths = []
        multiindex_dict2_ = build_multindex(unflat_fname_sets)
        fname_based_duplicate_hashes = [
            key for key, val in multiindex_dict2_.items() if len(val) > 1
        ]
        print('#fname_based_duplicate_dpaths = %r' %
              (len(fname_based_duplicate_hashes), ))
        fname_based_duplicate_didxs = ut.dict_take(
            multiindex_dict2_, fname_based_duplicate_hashes)
        fname_based_duplicate_dpaths = ut.list_unflat_take(
            dpath_list, fname_based_duplicate_didxs)
        print(ut.repr3(fname_based_duplicate_dpaths[0:10]))
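
build_multindex itself is not shown in these examples. A plausible stand-in, plus the duplicate-grouping step it feeds, written with only the standard library (both helper names here are assumptions, not the original API):

from collections import defaultdict

def build_multiindex_plain(list_):
    # Map each value to the list of indices where it occurs
    index = defaultdict(list)
    for idx, item in enumerate(list_):
        index[item].append(idx)
    return index

def find_duplicate_groups(fpath_list, hash_list):
    # Keep only the hashes that occur more than once and return their paths
    index = build_multiindex_plain(hash_list)
    return [[fpath_list[i] for i in idxs]
            for idxs in index.values() if len(idxs) > 1]

# find_duplicate_groups(['a.avi', 'b.avi', 'c.avi'], ['h1', 'h2', 'h1'])
# -> [['a.avi', 'c.avi']]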