Example #1
import itertools
import json
import os.path as op
from collections import defaultdict
from pathlib import Path


def rename_remove(monolith_path, images_json):
    """Take new "file" paths and rename, and also remove those which are not known"""
    from datalad.support.annexrepo import AnnexRepo
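    # AnnexRepo provides programmatic git/git-annex access to the repository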
    repo = AnnexRepo(monolith_path)
    monolith_path = Path(monolith_path)
    with open(images_json) as f:
        data = json.load(f)
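    # Expected structure: data['collections'] maps pk -> record with
    # 'full_name'; data['images'] maps collection name -> list of records
    # with 'file', 'file_orig', and 'collection' fields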

    # JSON does not support non-string keys, so convert them back to int
    for pk in list(data['collections']):
        data['collections'][int(pk)] = data['collections'].pop(pk)

    # Collect known collections so we can later remove those which are not included
    known_collections = {
        r['full_name']: int(pk)
        for pk, r in data['collections'].items()
    }
    # Add those which might have been renamed but are still present under
    # their original names in the actual container images file tree
    for col, containers in data['images'].items():
        for r in containers:
            col_ = op.join(*Path(r['file_orig']).parts[:2])
            if col_ in known_collections:
                # In the case of narrative/remoll there is 214 with an image
                # and 254 without :-/ so we cannot assert
                # assert known_collections[col_] == int(r['collection'])
                # Let's just inform - it seems we have just a few
                if known_collections[col_] != int(r['collection']):
                    print(
                        f"WARNING: for {col_} known as {known_collections[col_]} we also have {r['collection']}"
                    )
            else:
                known_collections[col_] = r['collection']
                # and we need to adjust the mapping since that is where it will be found now
                rcol = data['collections'][r['collection']]
                if 'full_name_orig' not in rcol and rcol['full_name'] != col_:
                    rcol['full_name_orig'] = rcol['full_name']
                rcol['full_name'] = col_

    # Image directories sit four levels deep under the monolith;
    # skip hidden and private ('.'- and '_'-prefixed) paths
    dirs_under_monolith = set(
        str(p.relative_to(monolith_path))
        for p in monolith_path.glob('*/*/*/*'))
    dirs_under_monolith = set(x for x in dirs_under_monolith
                              if not (x.startswith('.') or x.startswith('_')))
    # group by collection
    cols_under_monolith = defaultdict(list)
    for d in dirs_under_monolith:
        cols_under_monolith[op.join(*Path(d).parts[:2])].append(d)

    # directories of all known images, under their original paths
    dirs_images = set(
        itertools.chain(*([op.dirname(x['file_orig']) for x in recs]
                          for recs in data['images'].values())))

    # Let's first remove all those collections which aren't known:
    # Do it first so that when we get to renaming container dirs, we fail
    # if an entire collection is gone (which should not happen)
    to_remove = []
    for c in sorted(cols_under_monolith):
        if c not in known_collections:
            print(f"{c}: removing entirely")
            to_remove.append(c)
            cols_under_monolith.pop(c)
            continue
        # remove individual image directories
        keep = []
        for d in cols_under_monolith[c]:
            if d not in dirs_images:
                print(f"{c}: removing {d}")
                to_remove.append(d)
            else:
                keep.append(d)
        cols_under_monolith[c] = keep
    if to_remove:
        print(f"REMOVING {len(to_remove)} directories")
        repo.call_git(['rm', '-rf'] + to_remove)
        repo.call_git(['clean', '-dfx'])

    # Now rename all that is left
    for c, containers in sorted(data['images'].items()):
        for con in containers:
            src = op.dirname(con['file_orig'])
            # sanity check - should have at least a committish and a checksum
            assert len(src) > 74
            dest = op.dirname(con['file'])
            # we do this migration only once, so source must exist and destination must not yet
            assert (monolith_path / src).exists()
            assert not (monolith_path / dest).exists()
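            # ensure the destination's parent directory exists before git mv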
            pdest = (monolith_path / dest).parent
            if not pdest.exists():
                pdest.mkdir(parents=True)
            print(f"Moving {src} to {dest}")
            repo.call_git(['mv', src, dest])

    # final cleanup of anything untracked left behind by the moves
    repo.call_git(['clean', '-dfx'])

    # we might have adjusted collection names above - write the updated mapping back
    with open(images_json, 'w') as f:
        json.dump(data, f, indent=2)
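
A minimal invocation sketch, assuming a local checkout of the git-annex "monolith" dataset and an images JSON file with the 'collections' and 'images' entries described above (both paths below are hypothetical placeholders, not from the original source):

if __name__ == '__main__':
    # hypothetical paths - adjust to the actual dataset checkout and JSON dump
    rename_remove('/data/singularity-monolith', 'images.json')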