Example #1
def file_info(repo):
    """Info from looking at set of metadata files
    
    @param repo: GitPython.Repository object
    @returns: dict
    """
    data = {}
    paths = util.find_meta_files(
        repo.working_dir,
        recursive=True,
        force_read=True
    )
    identifiers = [
        identifier.Identifier(path)
        for path in paths
    ]
    data['total objects'] = len(identifiers)
    # model totals
    for i in identifiers:
        key = '%s objects' % i.model
        if not data.get(key):
            data[key] = 0
        data[key] = data[key] + 1
    # role totals
    roles = identifier.VALID_COMPONENTS['role']
    for role in roles:
        key = '%s files' % role
        data[key] = 0
    for i in identifiers:
        if i.model == 'file':
            for role in roles:
                if role in i.path_abs():
                    key = '%s files' % role
                    data[key] = data[key] + 1
    return data
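
A minimal usage sketch for file_info, assuming GitPython is installed; the repository path below is purely illustrative.

import git  # GitPython

repo = git.Repo('/var/www/media/base/ddr-testing-141')  # illustrative path
info = file_info(repo)
for key, count in sorted(info.items()):
    print(key, count)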
Example #2
def test_pick_signatures():
    cpath = os.path.join(TESTING_BASE_DIR, COLLECTION_ID)
    paths = util.find_meta_files(cpath, recursive=True, force_read=True, testing=True)
    parents = signatures.choose(paths)
    updates = signatures.find_updates(parents)
    files_written = signatures.write_updates(updates)
    return files_written
Example #3
def test_00_pick_signatures(tmpdir, collection):
    paths = util.find_meta_files(collection.path_abs,
                                 recursive=True,
                                 force_read=True)
    print('paths')
    for x in paths:
        print(x)
    parents = signatures.choose(paths)
    print('parents')
    for x in parents:
        print(x)
    updates = signatures.find_updates(parents)
    print('updates')
    for x in updates:
        print(x)
    files_written = signatures.write_updates(updates)
    print('files_written')
    for x in files_written:
        print(x)

    for oid, expected in SIGNATURES:
        oi = identifier.Identifier(id=oid,
                                   base_path=collection.identifier.basepath)
        o = oi.object()
        print('expected ', expected)
        print('o.signature_id ', o.signature_id)
        assert o.signature_id == expected
Example #4
def file_info(repo):
    """Info from looking at set of metadata files
    
    @param repo: GitPython.Repository object
    @returns: dict
    """
    data = {}
    paths = util.find_meta_files(repo.working_dir,
                                 recursive=True,
                                 force_read=True)
    identifiers = [identifier.Identifier(path) for path in paths]
    data['total objects'] = len(identifiers)
    # model totals
    for i in identifiers:
        key = '%s objects' % i.model
        if not data.get(key):
            data[key] = 0
        data[key] = data[key] + 1
    # role totals
    roles = identifier.VALID_COMPONENTS['role']
    for role in roles:
        key = '%s files' % role
        data[key] = 0
    for i in identifiers:
        if i.model == 'file':
            for role in roles:
                if role in i.path_abs():
                    key = '%s files' % role
                    data[key] = data[key] + 1
    return data
Example #5
def fieldcsv(collectionsdir, cidpattern, model, fieldname, csvfile):
    """Export value of specified field for all model objects in collections
    
    @param collectionsdir str: 
    @param cidpattern str: 
    @param model str: 
    @param fieldname str: 
    @param csvfile str: 
    """
    collection_paths = [
        path for path in util.natural_sort(
            util.find_meta_files(basedir=collectionsdir,
                                 model='collection',
                                 recursive=1,
                                 force_read=1)) if cidpattern in path
    ]
    for collection_path in collection_paths:
        print(collection_path)
        try:
            batch.Exporter.export_field_csv(
                collection=identifier.Identifier(collection_path).object(),
                model=model,
                fieldname=fieldname,
                csv_path=csvfile,
            )
        except Exception as err:
            print('ERROR: %s' % err)
Example #6
def export_files( collection_path, csv_path ):
    """
    @param collection_path: Absolute path to collection repo.
    @param csv_path: Absolute path to CSV data file.
    """
    started = datetime.now(settings.TZ)
    print('%s starting export' % started)
    make_tmpdir(CSV_TMPDIR)
    fieldnames = [field['name'] for field in filemodule.FILE_FIELDS]
    print(fieldnames)
    paths = []
    for path in util.find_meta_files(basedir=collection_path, recursive=True):
        if ('master' in path) or ('mezzanine' in path):
            paths.append(path)
    
    with open(csv_path, 'wb') as csvfile:
        writer = make_csv_writer(csvfile)
        # headers
        writer.writerow(fieldnames)
        # everything else
        for n,path in enumerate(paths):
            rowstarted = datetime.now(settings.TZ)
            
            # load file object
            filename = os.path.basename(path)
            file_id = os.path.splitext(filename)[0]
            file_ = DDRLocalFile.from_json(path)
            if file_:
                # seealso DDR.modules.Module.function
                values = []
                for f in filemodule.FILE_FIELDS:
                    value = ''
                    if hasattr(file_, f['name']):
                        key = f['name']
                        # run csvexport_* functions on field data if present
                        val = modules.Module(filemodule).function(
                            'csvexport_%s' % key,
                            getattr(file_, f['name'])
                        )
                        if not (isinstance(val, str) or isinstance(val, unicode)):
                            val = unicode(val)
                        if val:
                            value = val.encode('utf-8')
                    values.append(value)
                writer.writerow(values)
            
                rowfinished = datetime.now(settings.TZ)
                rowelapsed = rowfinished - rowstarted
                print('%s %s/%s %s (%s)' % (dtfmt(rowfinished), n+1, len(paths), file_id, rowelapsed))
            else:
                print('NO FILE FOR %s' % path)
    
    finished = datetime.now(settings.TZ)
    elapsed = finished - started
    print('%s DONE (%s files)' % (dtfmt(finished), len(paths)))
    print('%s elapsed' % elapsed)
    if os.path.exists(csv_path):
        return csv_path
    return 'no file written'
Example #7
def export_entities( collection_path, csv_path ):
    """
    @param collection_path: Absolute path to collection repo.
    @param csv_path: Absolute path to CSV data file.
    """
    started = datetime.now(settings.TZ)
    print('%s starting export' % started)
    make_tmpdir(CSV_TMPDIR)
    fieldnames = [field['name'] for field in entitymodule.ENTITY_FIELDS]
    # exclude 'files' bc not hard to convert to CSV and not different from files export.
    fieldnames.remove('files')
    print(fieldnames)
    paths = []
    for path in util.find_meta_files(basedir=collection_path, recursive=True):
        if os.path.basename(path) == 'entity.json':
            paths.append(path)
    
    with open(csv_path, 'wb') as csvfile:
        writer = make_csv_writer(csvfile)
        # headers
        writer.writerow(fieldnames)
        # everything else
        for n,path in enumerate(paths):
            rowstarted = datetime.now(settings.TZ)
            
            entity_dir = os.path.dirname(path)
            entity_id = os.path.basename(entity_dir)
            entity = DDRLocalEntity.from_json(entity_dir)
            # seealso DDR.modules.Module.function
            values = []
            for f in entitymodule.ENTITY_FIELDS:
                value = ''
                if hasattr(entity, f['name']) and f.get('form',None):
                    key = f['name']
                    label = f['form']['label']
                    # run csvexport_* functions on field data if present
                    val = modules.Module(entitymodule).function(
                        'csvexport_%s' % key,
                        getattr(entity, f['name'])
                    )
                    if not (isinstance(val, str) or isinstance(val, unicode)):
                        val = unicode(val)
                    if val:
                        value = val.encode('utf-8')
                values.append(value)
            writer.writerow(values)
            
            rowfinished = datetime.now(settings.TZ)
            rowelapsed = rowfinished - rowstarted
            print('%s %s/%s %s (%s)' % (dtfmt(rowfinished), n+1, len(paths), entity_id, rowelapsed))
    
    finished = datetime.now(settings.TZ)
    elapsed = finished - started
    print('%s DONE (%s entities)' % (dtfmt(finished), len(paths)))
    print('%s elapsed' % elapsed)
    if os.path.exists(csv_path):
        return csv_path
    return 'no file written'
Example #8
def all_paths(collection_path, model):
    """Get all .json paths for specified model.
    
    @param collection_path: str Absolute path to collection repo
    @param model: str One of ['collection', 'entity', 'file']
    """
    return util.find_meta_files(
        basedir=collection_path, model=model, recursive=1, force_read=1
    )
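
A short sketch of how the paths returned by all_paths are typically consumed in the surrounding examples, assuming the identifier module is available; the collection path and model are illustrative.

for path in all_paths('/var/www/media/base/ddr-testing-141', 'file'):  # illustrative args
    o = identifier.Identifier(path).object()
    print(o.id)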
Example #9
def check_encoding(repo_url,
                   destdir,
                   verbose=False,
                   csv=False,
                   headers=False,
                   json=False):
    collection_id = extract_collection_id(repo_url)
    repo_path = os.path.join(destdir, collection_id)
    out(verbose, collection_id)
    out(verbose, repo_path)

    # if verbose, add marker to important lines
    if verbose:
        prefix = '%% '
    else:
        prefix = ''

    if csv and headers:
        print('{} collection id, files, defects, elapsed'.format(prefix))

    start = datetime.now()
    out(verbose, start)

    out(verbose, 'clone {} {}'.format(repo_url, repo_path))
    repo = clone(repo_url, repo_path)
    out(verbose, repo)

    out(verbose, 'analyzing')
    paths = util.find_meta_files(repo_path, recursive=True)
    defects = analyze_files(paths, verbose)

    out(verbose, 'cleaning up')
    clean(repo_path)

    end = datetime.now()
    elapsed = end - start
    out(verbose, end)

    if csv:
        print('{}{}'.format(
            prefix, ','.join([
                str(collection_id),
                str(len(paths)),
                str(len(defects)),
                str(elapsed)
            ])))
    elif json:
        data = {
            'collection id': collection_id,
            'files': len(paths),
            'defects': len(defects),
            'elapsed': str(elapsed),
        }
        # the json flag shadows the json module, so import it under an alias here
        import json as jsonlib
        print('{}{}'.format(prefix, jsonlib.dumps(data)))
    else:
        print('{}{}, {} bad, {} files, {} elapsed'.format(
            prefix, collection_id, len(defects), len(paths), elapsed))
Example #10
def all_paths(collection_path, model):
    """Get all .json paths for specified model.
    
    @param collection_path: str Absolute path to collection repo
    @param model: str One of ['collection', 'entity', 'file']
    """
    return util.find_meta_files(basedir=collection_path,
                                model=model,
                                recursive=1,
                                force_read=1)
Example #11
def _child_jsons( path, testing=False ):
    """List all the .json files under path directory; excludes specified dir.
    
    @param path: Absolute directory path.
    @return list of paths
    """
    return [
        p for p in util.find_meta_files(basedir=path, recursive=True, testing=testing)
        if os.path.dirname(p) != path
    ]
Example #12
def test_find_meta_files():
    basedir = os.path.join(TESTING_BASE_DIR, 'find-meta-files')
    if os.path.exists(basedir):
        shutil.rmtree(basedir, ignore_errors=1)
    
    # build sample repo
    sampledir = os.path.join(basedir, 'ddr-test-123')
    for d in SAMPLE_DIRS:
        path = os.path.join(sampledir, d)
        os.makedirs(path)
    for fn in SAMPLE_FILES:
        path = os.path.join(sampledir, fn)
        with open(path, 'w') as f:
            f.write('testing')
    
    # cache
    cache_path = os.path.join(basedir, 'cache')
    if os.path.exists(cache_path):
        os.remove(cache_path)
    assert not os.path.exists(cache_path)

    def clean(paths):
        base = '%s/' % sampledir
        cleaned = [path.replace(base, '') for path in paths]
        cleaned.sort()
        return cleaned
    
    paths0 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, testing=1))
    assert paths0 == META_ALL

    for model in ['collection', 'entity', 'file']:
        paths2 = clean(util.find_meta_files(sampledir, model=model, recursive=True, force_read=True, testing=1))
        assert paths2 == META_MODEL[model]
    
    paths3 = clean(util.find_meta_files(sampledir, recursive=False, force_read=True, testing=1))
    assert paths3 == META_MODEL['collection']
    
    paths4 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, files_first=True, testing=1))
    assert paths4 == META_ALL
    
    paths5 = clean(util.find_meta_files(sampledir, recursive=True, force_read=False, testing=1))
    assert paths5 == META_ALL
Example #13
def test_find_meta_files(tmpdir):
    basedir = str(tmpdir / 'find-meta-files')
    if os.path.exists(basedir):
        shutil.rmtree(basedir, ignore_errors=1)
    
    # build sample repo
    sampledir = os.path.join(basedir, 'ddr-test-123')
    for d in SAMPLE_DIRS:
        path = os.path.join(sampledir, d)
        os.makedirs(path)
    for fn in SAMPLE_FILES:
        path = os.path.join(sampledir, fn)
        with open(path, 'w') as f:
            f.write('testing')
    
    # cache
    cache_path = os.path.join(basedir, 'cache')
    if os.path.exists(cache_path):
        os.remove(cache_path)
    assert not os.path.exists(cache_path)

    def clean(paths):
        base = '%s/' % sampledir
        cleaned = [path.replace(base, '') for path in paths]
        cleaned.sort()
        return cleaned
    
    paths0 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True))
    assert paths0 == META_ALL

    for model in ['collection', 'entity', 'file']:
        paths2 = clean(util.find_meta_files(sampledir, model=model, recursive=True, force_read=True))
        assert paths2 == META_MODEL[model]
    
    paths3 = clean(util.find_meta_files(sampledir, recursive=False, force_read=True))
    assert paths3 == META_MODEL['collection']
    
    paths4 = clean(util.find_meta_files(sampledir, recursive=True, force_read=True, files_first=True))
    assert paths4 == META_ALL
    
    paths5 = clean(util.find_meta_files(sampledir, recursive=True, force_read=False))
    assert paths5 == META_ALL
Example #14
def _child_jsons(path: str, testing: bool = False) -> List[str]:
    """List all the .json files under path directory; excludes specified dir.
    
    @param path: str Absolute directory path.
    @param testing: boolean
    @returns: list of paths
    """
    return [
        p for p in util.find_meta_files(basedir=path, recursive=True)
        if os.path.dirname(p) != path
    ]
Example #15
def ddrcheckbinaries(repo, verbose=False):
    """ddrcheckbinaries - Find binaries that don't match metadata hashes.
    
    \b
    Example:
        $ ddrcheckbinaries /var/www/media/base/ddr-testing-141
    """
    filepaths = util.find_meta_files(
        repo, recursive=1, model='file', force_read=True
    )
    hits = check_files(filepaths, verbose)
Example #16
def ddrcheck(help, collection_path):
    print('Gathering files in %s' % collection_path)
    paths = util.find_meta_files(
        collection_path, recursive=1,
        model=None, files_first=False, force_read=False, testing=0
    )
    print('Checking files...')
    for item in util.validate_paths(paths):
        n,path,err = item
        print('%s/%s ERROR %s - %s' % (n, len(paths), path, err))
    print('Checked %s files' % len(paths))
Example #17
def ddrcheckbinaries(repo, verbose=False):
    """ddrcheckbinaries - Find binaries that don't match metadata hashes.
    
    \b
    Example:
        $ ddrcheckbinaries /var/www/media/base/ddr-testing-141
    """
    filepaths = util.find_meta_files(
        repo, recursive=1, model='file', force_read=True
    )
    hits = check_files(filepaths, verbose)
Example #18
def _child_jsons(path, testing=False):
    """List all the .json files under path directory; excludes specified dir.
    
    @param path: Absolute directory path.
    @return list of paths
    """
    return [
        p for p in util.find_meta_files(
            basedir=path, recursive=True, testing=testing)
        if os.path.dirname(p) != path
    ]
Example #19
def ddrcheck(collection_path):
    print('Gathering files in %s' % collection_path)
    paths = util.find_meta_files(
        collection_path, recursive=1,
        model=None, files_first=False, force_read=False, testing=0
    )
    print('Checking files...')
    for item in util.validate_paths(paths):
        n,path,err = item
        print('%s/%s ERROR %s - %s' % (n, len(paths), path, err))
    print('Checked %s files' % len(paths))
Example #20
    def _children_paths(self):
        """Searches fs for (entity) children's .jsons, returns natsorted paths
        
        @returns: list
        """
        if os.path.exists(self.files_path):
            return natsorted([
                f
                for f in util.find_meta_files(self.files_path, recursive=True)
                # only direct children, no descendants
                if Identifier(f).parent_id() == self.id
            ])
        return []
Example #21
def check_file_hashes(collection_path):
    """Check that hashes are present in file JSONs
    """
    paths = util.find_meta_files(collection_path,
                                 recursive=True,
                                 model='file',
                                 force_read=True)
    for path in paths:
        f = identifier.Identifier(path).object()
        if not (f.sha1 and f.sha256 and f.md5 and f.size):
            print('f.sha1   %s' % f.sha1)
            print('f.sha256 %s' % f.sha256)
            print('f.md5    %s' % f.md5)
            print('f.size   %s' % f.size)
            raise Exception('Hash data missing')
Example #22
    def _file_paths(self, rel=False):
        """Searches filesystem for children's metadata files, returns relative paths.
        @param rel: bool Return relative paths
        @returns: list
        """
        if os.path.exists(self.files_path):
            prefix_path = 'THISWILLNEVERMATCHANYTHING'
            if rel:
                prefix_path = '{}/'.format(os.path.normpath(self.files_path))
            return sorted([
                f.replace(prefix_path, '')
                for f in util.find_meta_files(self.files_path, recursive=False)
            ], key=lambda f: util.natural_order_string(f))
        return []
Example #23
    def identifiers(self, model=None, force_read=False):
        """Lists Identifiers for all or subset of Collection's descendants.
        
        TODO how is this different from children?
        
        >>> c = Collection.from_json('/tmp/ddr-testing-123')
        >>> c.descendants()
        [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]
        
        @param model: str Restrict list to model.
        @returns: list of Identifiers
        """
        return [
            Identifier(path) for path in util.find_meta_files(
                self.path, recursive=1, model=model, force_read=force_read)
        ]
Example #24
    def _file_paths(self, rel=False):
        """Searches filesystem for children's metadata files, returns relative paths.
        @param rel: bool Return relative paths
        @returns: list
        """
        if os.path.exists(self.files_path):
            prefix_path = 'THISWILLNEVERMATCHANYTHING'
            if rel:
                prefix_path = '{}/'.format(os.path.normpath(self.files_path))
            return sorted(
                [
                    f.replace(prefix_path, '')
                    for f in util.find_meta_files(self.files_path, recursive=False)
                ],
                key=lambda f: util.natural_order_string(f)
            )
        return []
Example #25
def csv_export_model( collection_path, model ):
    """Export collection {model} metadata to CSV file.
    
    @param collection_path: Absolute path to collection.
    @param model: 'entity' or 'file'.
    """
    collection = Collection.from_identifier(Identifier(path=collection_path))
    csv_path = settings.CSV_EXPORT_PATH[model] % collection.id
    
    logger.info('All paths in %s' % collection_path)
    paths = util.find_meta_files(
        basedir=collection_path, model=model, recursive=1, force_read=1
    )
    logger.info('Exporting %s paths' % len(paths))
    batch.Exporter.export(
        paths, model, csv_path, required_only=False
    )
    return csv_path
Example #26
    def child_field_values(self, model, fieldname):
        """Get all values of fieldname from specified model in collection.
        
        @param model str
        @param fieldname str
        """
        rows = []
        paths = util.find_meta_files(self.path_abs,
                                     model=model,
                                     recursive=True)
        for path in paths:
            o = Identifier(path).object()
            if getattr(o, fieldname):
                rows.append([
                    o.id,
                    fieldname,
                    getattr(o, fieldname),
                ])
        return rows
Example #27
def collect_hashes(collection_path):
    """Make dict of existing file hash data
    
    @param collection_path: str
    @returns: dict {file_id: {'sha1':..., 'sha256':..., 'md5':..., 'size':...}}
    """
    paths = util.find_meta_files(collection_path,
                                 recursive=True,
                                 model='file',
                                 force_read=True)
    data = OrderedDict()
    for path in paths:
        o = identifier.Identifier(path).object()
        data[o.id] = OrderedDict()
        data[o.id]['sha1'] = o.sha1
        data[o.id]['sha256'] = o.sha256
        data[o.id]['md5'] = o.md5
        data[o.id]['size'] = o.size
    return data
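
A hedged sketch of inspecting collect_hashes output as JSON; the collection path is illustrative.

import json

hashes = collect_hashes('/var/www/media/base/ddr-testing-141')  # illustrative path
print(json.dumps(hashes, indent=2))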
Example #28
def _ids_in_local_repo(rowds, model, collection_path):
    """Lists which IDs in CSV are present in local repo.
    
    @param rowds: list of dicts
    @param model: str
    @param collection_path: str Absolute path to collection repo.
    @returns: list of IDs.
    """
    metadata_paths = util.find_meta_files(
        collection_path,
        model=model,
        recursive=True, force_read=True
    )
    # compare plain ID strings so the membership test below can match
    existing_ids = [
        identifier.Identifier(path=path).id
        for path in metadata_paths
    ]
    new_ids = [rowd['id'] for rowd in rowds]
    already = [i for i in new_ids if i in existing_ids]
    return already
Example #29
def check( collection_path ):
    if not os.path.exists(settings.MEDIA_BASE):
        raise Exception('base_dir does not exist: %s' % settings.MEDIA_BASE)
    paths = util.find_meta_files(
        collection_path, recursive=1,
        model=None, files_first=False, force_read=False, testing=0
    )
    bad_files = util.validate_paths(paths)
    output = [
        'Checked %s files' % len(paths),
    ]
    if bad_files:
        for item in bad_files:
            n,path,err = item
            output.append(
                '%s/%s ERROR %s - %s' % (n, len(paths), path, err)
            )
    else:
        output.append('No bad files.')
    output.append('DONE')
    return '\n'.join(output)
Example #30
def ddrsignatures(collection, nowrite, nocommit, user, mail):
    """ddrsignatures - Picks signature files for each collection object.
    """
    if not nocommit:
        if not (user and mail):
            logging.debug('You must specify a user and email address! >:-0')
            sys.exit(1)

    logging.debug('-----------------------------------------------')
    logging.debug('Loading collection')
    collection = identifier.Identifier(path=collection).object()
    logging.debug(collection)

    # Read data files, gather *published* Identifiers, map parents->nodes
    # assign signatures, write files
    updates = signatures.find_updates(
        signatures.choose(
            util.find_meta_files(collection.path, recursive=True, force_read=True)
        )
    )

    if nowrite:
        logging.debug('Not writing changes')
        files_written = []
    else:
        files_written = signatures.write_updates(updates)

    if nocommit:
        logging.debug('Not committing changes')
    elif files_written:
        if (not user) or (not mail):
            logging.debug('You must specify a user and email address! >:-0')
            sys.exit(1)
        status,msg = signatures.commit_updates(
            collection,
            files_written,
            user, mail, agent='ddr-signature',
            commit=True
        )
    logging.debug('DONE')
Example #31
    def delete(self, document_id, recursive=False):
        """Delete a document and optionally its children.
        
        TODO refactor after upgrading Elasticsearch past 2.4.
        delete_by_query was removed sometime during elasticsearch-py 2.*
        I think it was added back in a later version so the code stays for now.
        
        For now, instead of deleting based on document_id, we start with
        document_id, find all paths beneath it in the filesystem,
        and issue a DELETE for each individual document in Elasticsearch.
        
        @param document_id:
        @param recursive: True or False
        """
        logger.debug('delete(%s, %s)' % (document_id, recursive))
        oi = Identifier(document_id, config.MEDIA_BASE)
        if recursive:
            paths = util.find_meta_files(oi.path_abs(),
                                         recursive=recursive,
                                         files_first=1)
        else:
            paths = [oi.path_abs()]
        identifiers = [Identifier(path) for path in paths]
        num = len(identifiers)
        for n, oi in enumerate(identifiers):
            # TODO hard-coded models here!
            if oi.model == 'segment':
                model = 'entity'
            else:
                model = oi.model
            try:
                result = self.es.delete(index=self.index_name(model), id=oi.id)
                print(
                    f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> {result["result"]}'
                )
            except docstore.NotFoundError as err:
                print(
                    f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> 404 Not Found'
                )
Example #32
def find_missing_annex_binaries(repo):
    """Find binaries that are not placed in .git/annex
    """
    files = [
        identifier.Identifier(path_json).object()
        for path_json in util.find_meta_files(
            repo.working_dir, recursive=True, model='file', force_read=True)
    ]
    binaries_missing = [
        f.path_rel for f in files
        if (not f.external) and (not dvcs.file_in_git_annex(repo, f.path_rel))
    ]
    accessfiles_missing = [
        f.access_rel for f in files if (os.path.exists(f.access_abs)) and (
            not dvcs.file_in_git_annex(repo, f.path_rel))
    ]
    missing = binaries_missing + accessfiles_missing
    if missing:
        print('Binaries missing from git-annex')
        for path in missing:
            print(path)
    return missing
Example #33
def find_binaries_in_git_objects(repo):
    """Find binaries mistakenly placed into .git/objects
    """
    files = [
        identifier.Identifier(path_json).object()
        for path_json in util.find_meta_files(
            repo.working_dir, recursive=True, model='file', force_read=True)
    ]
    binaries_in_git_objects = [
        f.path_rel for f in files
        if (not f.external) and dvcs.file_in_git_objects(repo, f.path_rel)
    ]
    accessfiles_in_git_objects = [
        f.access_rel for f in files if os.path.exists(f.access_abs)
        and dvcs.file_in_git_objects(repo, f.path_rel)
    ]
    binaries = binaries_in_git_objects + accessfiles_in_git_objects
    if binaries:
        print('Found binaries in %s' %
              os.path.join(repo.working_dir, '.git/objects/'))
        for path in binaries:
            print(path)
    return binaries
Example #34
    def post_multi(self, path, recursive=False, force=False):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug('index(%s, %s, %s, %s)' %
                     (self.indexname, path, recursive, force))

        publicfields = _public_fields()

        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            paths = util.find_meta_files(path, recursive, files_first=1)

        # Store value of public,status for each collection,entity.
        # Values will be used by entities and files to inherit these values
        # from their parent.
        parents = _parents_status(paths)

        # Determine if paths are publishable or not
        paths = _publishable(paths, parents, force=force)

        skipped = 0
        successful = 0
        bad_paths = []

        num = len(paths)
        for n, path in enumerate(paths):
            oi = path.get('identifier')
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue
            # TODO write logs instead of print
            print('%s | %s/%s %s %s %s' % (datetime.now(
                config.TZ), n + 1, num, path['action'], oi.id, path['note']))

            try:
                document = oi.object()
            except Exception as err:
                path['note'] = 'Could not instantiate: %s' % err
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue

            # see if document exists
            existing_v = None
            d = self.get(oi.model, oi.id)
            if d:
                existing_v = d.meta.version

            # post document
            if path['action'] == 'POST':
                created = self.post(document, parents=parents, force=True)
                # force=True bypasses _publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' %
                      (datetime.now(config.TZ), n + 1, num))
                self.delete(oi.id)

            if path['action'] == 'SKIP':
                skipped += 1
                continue

            # version is incremented with each updated
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
            d = self.get(es_model, oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)

        logger.debug('INDEXING COMPLETED')
        return {
            'total': len(paths),
            'skipped': skipped,
            'successful': successful,
            'bad': bad_paths
        }
Example #35
def transform(collection,
              filter='',
              models='',
              topics=None,
              created=None,
              commit=None,
              user=None,
              mail=None):

    if commit and ((not user) or (not mail)):
        logging.error('You must specify a user and email address! >:-0')
        sys.exit(1)
    elif not commit:
        logging.info('Not committing changes')

    start = datetime.now()

    if filter:
        logging.info('FILTER: "%s"' % filter)
    ONLY_THESE = []
    if models:
        logging.info('MODELS: "%s"' % models)
        ONLY_THESE = models.split(',')

    logging.info('Loading collection')
    cidentifier = identifier.Identifier(os.path.normpath(collection))
    collection = cidentifier.object()
    logging.info(collection)

    logging.info('Finding metadata files')
    paths = util.find_meta_files(collection.identifier.path_abs(),
                                 recursive=True,
                                 force_read=True)
    logging.info('%s paths' % len(paths))

    TOPICS = vocab.get_vocabs(config.VOCABS_URL)['topics']
    # filter out paths
    these_paths = []
    for path in paths:
        oi = identifier.Identifier(path)
        if filter and (not fnmatch.fnmatch(oi.id, filter)):
            continue
        if models and (oi.model not in ONLY_THESE):
            continue
        these_paths.append(path)
    if len(these_paths) != len(paths):
        logging.info('%s after filters' % len(these_paths))

    logging.info('Writing')
    num = len(these_paths)
    for n, path in enumerate(these_paths):
        logging.info('%s/%s %s' % (n, num, path))
        o = identifier.Identifier(path).object()
        if filter and (not fnmatch.fnmatch(o.id, filter)):
            continue
        if models and (o.identifier.model not in ONLY_THESE):
            continue

        if o.identifier.model in ['entity', 'segment']:
            o.children(force_read=True)

        if topics and o.identifier.model in ['entity', 'segment']:
            before = o.topics
            after = vocab.repair_topicdata(o.topics, TOPICS)
            o.topics = after

        if created and hasattr(o, 'record_created'):
            record_created_before = o.record_created
            # use a separate name so the commit flag is not clobbered
            earliest = dvcs.earliest_commit(path, parsed=True)
            o.record_created = earliest['ts']

        o.write_json()

    if commit:
        logging.info('Committing changes')
        status, msg = commands.update(user,
                                      mail,
                                      collection,
                                      paths,
                                      agent='ddr-transform')
        logging.info('ok')
    else:
        logging.info('Changes not committed')

    end = datetime.now()
    elapsed = end - start
    per = elapsed / num
    logging.info('DONE (%s elapsed, %s per object)' % (elapsed, per))
Example #36
def index( hosts, index, path, recursive=False, public=True ):
    """(Re)index with data from the specified directory.
    
    After receiving a list of metadata files, index() iterates through the list several times.  The first pass weeds out paths to objects that can not be published (e.g. object or its parent is unpublished).
    
    The second pass goes through the files and assigns a signature file to each entity or collection ID.
    There is some logic that tries to pick the first file of the first entity to be the collection signature, and so on.  Mezzanine files are preferred over master files.
    
    In the final pass, a list of public/publishable fields is chosen based on the model.  Additional fields not in the model (e.g. parent ID, parent organization/collection/entity ID, the signature file) are packaged.  Then everything is sent off to post().

    @param hosts: list of dicts containing host information.
    @param index: Name of the target index.
    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param public: For publication (fields not marked public will be omitted).
    @returns: number successful,list of paths that didn't work out
    """
    logger.debug('index(%s, %s, %s)' % (hosts, index, path))
    
    publicfields = public_fields()
    
    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)
    
    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values from their parent.
    parents = _parents_status(paths)
    
    # Determine if paths are publishable or not
    successful_paths,bad_paths = _publishable_or_not(paths, parents)
    
    # iterate through paths, storing signature_url for each collection, entity
    # paths listed files first, then entities, then collections
    signature_files = _choose_signatures(successful_paths)
    print('Signature files')
    for key in sorted(signature_files.keys()):
        print(key, signature_files[key])
    
    successful = 0
    for path in successful_paths:
        identifier = Identifier(path=path)
        parent_id = identifier.parent_id()
        
        document_pub_fields = []
        if public and identifier.model:
            document_pub_fields = publicfields[identifier.model]
        
        additional_fields = {'parent_id': parent_id}
        if identifier.model == 'collection': additional_fields['organization_id'] = parent_id
        if identifier.model == 'entity': additional_fields['collection_id'] = parent_id
        if identifier.model == 'file': additional_fields['entity_id'] = parent_id
        if identifier.model in ['collection', 'entity']:
            additional_fields['signature_file'] = signature_files.get(identifier.id, '')
        
        # HERE WE GO!
        document = load_document_json(path, identifier.model, identifier.id)
        try:
            existing = get(hosts, index, identifier.model, identifier.id, fields=[])
        except:
            existing = None
        result = post(hosts, index, document, document_pub_fields, additional_fields)
        # success: created, or version number incremented
        if result.get('_id', None):
            if existing:
                existing_version = existing.get('version', None)
                if not existing_version:
                    existing_version = existing.get('_version', None)
            else:
                existing_version = None
            result_version = result.get('version', None)
            if not result_version:
                result_version = result.get('_version', None)
            if result['created'] or (existing_version and (result_version > existing_version)):
                successful += 1
        else:
            bad_paths.append((path, result['status'], result['response']))
            #print(status_code)
    logger.debug('INDEXING COMPLETED')
    return {'total':len(paths), 'successful':successful, 'bad':bad_paths}
Example #37
    def post_multi(self, path, recursive=False, force=False):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug('index(%s, %s, %s, %s)' % (self.indexname, path, recursive, force))
        
        publicfields = _public_fields()
        
        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            paths = util.find_meta_files(path, recursive, files_first=1)
        
        # Store value of public,status for each collection,entity.
        # Values will be used by entities and files to inherit these values
        # from their parent.
        parents = _parents_status(paths)
        
        # Determine if paths are publishable or not
        paths = _publishable(paths, parents, force=force)
        
        skipped = 0
        successful = 0
        bad_paths = []
        
        num = len(paths)
        for n,path in enumerate(paths):
            oi = path.get('identifier')
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue
            # TODO write logs instead of print
            print('%s | %s/%s %s %s %s' % (
                datetime.now(config.TZ), n+1, num, path['action'], oi.id, path['note'])
            )
            
            try:
                document = oi.object()
            except Exception as err:
                path['note'] = 'Could not instantiate: %s' % err
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue
            
            # see if document exists
            existing_v = None
            d = self.get(oi.model, oi.id)
            if d:
                existing_v = d.meta.version
            
            # post document
            if path['action'] == 'POST':
                created = self.post(document, parents=parents, force=True)
                # force=True bypasses _publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' % (datetime.now(config.TZ), n+1, num))
                self.delete(oi.id)
            
            if path['action'] == 'SKIP':
                skipped += 1
                continue
            
            # version is incremented with each updated
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            es_model = ELASTICSEARCH_CLASSES_BY_MODEL[oi.model]._doc_type.name
            d = self.get(es_model, oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)
            
        logger.debug('INDEXING COMPLETED')
        return {'total':len(paths), 'skipped':skipped, 'successful':successful, 'bad':bad_paths}
Example #38
    def post_multi(self, path, recursive=False, force=False, backblaze=None):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @param backblaze: storage.Backblaze object Look in b2sync tmpdir and mark
                   files uploaded to Backblaze.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug(f'post_multi({path}, {recursive}, {force}, {backblaze})')
        # Check that path points to a collection or a subdirectory of one
        try:
            ci = Identifier(path).collection()
        except:
            raise Exception(
                'Docstore.post_multi path must point to a collection or subdirectory.'
            )
        ci_path = Path(ci.id)

        publicfields = _public_fields()

        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            logger.debug(f'Finding files in {path}')
            paths = util.find_meta_files(path, recursive, files_first=1)

        # Determine if paths are publishable or not
        logger.debug(f'Checking for publishability')
        identifiers = [Identifier(path) for path in paths]
        parents = {
            oid: oi.object()
            for oid, oi in _all_parents(identifiers).items()
        }
        paths = publishable(identifiers, parents, force=force)

        # list files in b2 bucket
        # TODO do this in parallel with util.find_meta_files?
        b2_files = []
        if backblaze:
            logger.debug(
                f'Checking Backblaze for uploaded files ({backblaze.bucketname})'
            )
            b2_files = backblaze.list_files(folder=ci.id)
            logger.debug(f'{len(b2_files)} files')

        skipped = 0
        successful = 0
        bad_paths = []

        num = len(paths)
        for n, path in enumerate(paths):
            oi = path.get('identifier')
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue
            try:
                document = oi.object()
            except Exception as err:
                path['note'] = f'Could not instantiate: {err}'
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue

            # see if file uploaded to Backblaze
            b2_synced = False
            b2str = ''
            if (oi.model == 'file') and b2_files:
                dir_filename = str(ci_path / Path(document.path).name)
                if dir_filename in b2_files:
                    b2_synced = True
                    b2str = '(b2)'
                    b2_files.remove(dir_filename)

            # TODO write logs instead of print
            now = datetime.now(config.TZ)
            action = path['action']
            path_note = path['note'].strip()
            print(f'{now} | {n+1}/{num} {action} {oi.id} {path_note}{b2str}')

            # see if document exists
            existing_v = None
            d = self.get(model=oi.model,
                         es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
                         document_id=oi.id)
            if d:
                existing_v = d.meta.version

            # post document
            if path['action'] == 'POST':
                try:
                    created = self.post(document,
                                        parents=parents,
                                        b2=b2_synced,
                                        force=True)
                except Exception as err:
                    traceback.print_exc()
                # force=True bypasses publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' %
                      (datetime.now(config.TZ), n + 1, num))
                self.delete(oi.id)

            if path['action'] == 'SKIP':
                skipped += 1
                continue

            # version is incremented with each updated
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            d = self.get(model=oi.model,
                         es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
                         document_id=oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)

        logger.debug('INDEXING COMPLETED')
        return {
            'total': len(paths),
            'skipped': skipped,
            'successful': successful,
            'bad': bad_paths
        }