Example #1
def create(request):
    if request.method == 'POST':
        dataset = Dataset()
        dataset.owner = request.user
        dataset.name = request.POST['name']
        dataset.number_of_labels = request.POST['number_of_labels']
        dataset.description = request.POST['description']

        if not dataset.privacy_validation(request.POST['privacy']):
            return render(request, '400.html', status=400)

        dataset_file = request.FILES['dataset']
        # Decode the uploaded file to text so the standard csv module can parse it.
        reader = csv.reader(dataset_file.read().decode('utf-8').splitlines())
        header_list = next(reader)

        label_name = request.POST.get('label_name', 'CLASS')
        append_label_column = request.POST.get('append_label_column', False)
        if not append_label_column:
            label_index = header_list.index(label_name)
            header_list.pop(label_index)

        header_list.append(label_name)
        dataset.header = csvlist_to_string(header_list).strip()
        dataset.save()

        samples_count = 0
        for row_list in reader:
            samples_count += 1
            if not append_label_column:
                label_string = row_list.pop(label_index)

            row = csvlist_to_string(row_list).strip()
            sample = Sample(dataset=dataset,
                            data=row,
                            original_index=samples_count)
            sample.save()

            if not append_label_column and label_string:
                label = Label(owner=request.user,
                              sample=sample,
                              label=label_string)
                label.save()
                sample.times_labeled = 1
                sample.save()

        dataset.number_of_samples = samples_count
        dataset.save()

        return HttpResponseRedirect(
            reverse('datasets_show', args=(dataset.id, )))
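
Both this example and the nearly identical one below serialize rows back to CSV through a csvlist_to_string helper that is not shown in the snippet. A minimal sketch of such a helper, built on the standard library's csv.writer, could look like the following; the project's actual implementation may quote or delimit differently.

import csv
import io

def csvlist_to_string(values):
    # Hypothetical helper: serialize a list of cell values into a single
    # CSV-encoded line. The returned string ends with a newline, which is
    # why the view above calls .strip() on the result.
    buffer = io.StringIO()
    csv.writer(buffer).writerow(values)
    return buffer.getvalue()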
Example #2
def create(request):
    if request.method == 'POST':
        dataset = Dataset()
        dataset.owner = request.user
        dataset.name = request.POST['name']
        dataset.number_of_labels = request.POST['number_of_labels']
        dataset.description = request.POST['description']

        if not dataset.privacy_validation(request.POST['privacy']):
            return render(request, '400.html', status=400)

        dataset_file = request.FILES['dataset']
        # Decode the uploaded file to text so the standard csv module can parse it.
        reader = csv.reader(dataset_file.read().decode('utf-8').splitlines())
        header_list = next(reader)

        label_name = request.POST.get('label_name', 'CLASS')
        append_label_column = request.POST.get('append_label_column', False)
        if not append_label_column:
            label_index = header_list.index(label_name)
            header_list.pop(label_index)

        header_list.append(label_name)
        dataset.header = csvlist_to_string(header_list).strip()
        dataset.save()

        samples_count = 0
        for row_list in reader:
            samples_count += 1
            if not append_label_column:
                label_string = row_list.pop(label_index)

            row = csvlist_to_string(row_list).strip()
            sample = Sample(dataset=dataset, data=row,
                            original_index=samples_count)
            sample.save()

            if not append_label_column and label_string:
                label = Label(owner=request.user, sample=sample,
                              label=label_string)
                label.save()
                sample.times_labeled = 1
                sample.save()

        dataset.number_of_samples = samples_count
        dataset.save()

        return HttpResponseRedirect(
            reverse('datasets_show', args=(dataset.id,)))
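
A hedged sketch of how a view like this could be exercised with Django's RequestFactory; the field values, CSV payload, and user lookup below are illustrative only, and the create view plus its models are assumed to be importable from the example's project.

from django.contrib.auth import get_user_model
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test import RequestFactory

factory = RequestFactory()
csv_bytes = b"feature_a,feature_b,CLASS\n1,2,yes\n3,4,no\n"
request = factory.post('/datasets/create/', {
    'name': 'demo',
    'number_of_labels': '2',
    'description': 'smoke test',
    'privacy': 'public',          # assumes privacy_validation() accepts this value
    'label_name': 'CLASS',
    'dataset': SimpleUploadedFile('demo.csv', csv_bytes, content_type='text/csv'),
})
request.user = get_user_model().objects.first()  # any saved user will do

response = create(request)        # redirects on success, renders 400.html otherwise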
Example #3
def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    from app import db
    from app.models import ArkId
    from app.models import Dataset as DBDataset
    from app.models import DatasetAncestry as DBDatasetAncestry
    from sqlalchemy import exc
    from datalad import api
    from datalad.api import Dataset as DataladDataset
    import fnmatch
    import json
    from pathlib import Path
    import git

    datasetsdir = Path(app.config['DATA_PATH']) / 'conp-dataset'
    datasetsdir.mkdir(parents=True, exist_ok=True)

    # Initialize the git repository object
    try:
        repo = git.Repo(datasetsdir)
    except git.exc.InvalidGitRepositoryError:
        repo = git.Repo.clone_from('https://github.com/CONP-PCNO/conp-dataset',
                                   datasetsdir,
                                   branch='master')

    # Update to latest commit
    origin = repo.remotes.origin
    origin.pull('master')
    repo.submodule_update(recursive=False, keep_going=True)

    d = DataladDataset(path=datasetsdir)
    if not d.is_installed():
        api.clone(source='https://github.com/CONP-PCNO/conp-dataset',
                  path=datasetsdir)
        d = DataladDataset(path=datasetsdir)

    try:
        d.install(path='', recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR  ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO   ] conp-dataset update complete')
    print('[INFO   ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO   ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(source=ds['gitmodule_url'], path=ds['path'])
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print(
                    "[ERROR  ] An exception occurred in datalad install for " +
                    str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        # The following relates to the DATS.json files
        # of the projects directory in the conp-dataset repo.
        # Skip directories that aren't projects.
        patterns = [app.config['DATA_PATH'] + '/conp-dataset/projects/*']
        if not any(
                fnmatch.fnmatch(ds['path'], pattern) for pattern in patterns):
            continue

        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR  ] DATS.json file can't be found in " + ds['path'] +
                  ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR  ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # use dats.json data to fill the datasets table
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(
            dataset_id=ds['gitmodule_name']).first()

        # pull the timestamp of the first commit in the git log for the dataset create date
        createDate = datetime.utcnow()
        try:
            createTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct --reverse | head -1".format(
                    ds['path'])).read()
            createDate = datetime.fromtimestamp(int(createTimeStamp))
        except Exception:
            print("[ERROR  ] Create Date couldnt be read.")

        firstMergeDate = datetime.utcnow()
        try:
            firstMergeTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct --reverse {} | head -1".
                format(app.config['DATA_PATH'] + "/conp-dataset",
                       ds['path'])).read()
            firstMergeDate = datetime.fromtimestamp(int(firstMergeTimeStamp))
        except Exception:
            print(
                "[ERROR  ] First merge date of the submodule dataset could not be read."
            )

        # last commit in the git log for the dataset update date
        updateDate = datetime.utcnow()
        try:
            createTimeStamp = os.popen(
                "git -C {} log --pretty=format:%ct | head -1".format(
                    ds['path'])).read()
            updateDate = datetime.fromtimestamp(int(createTimeStamp))
        except Exception:
            print("[ERROR  ] Update Date couldnt be read.")

        # get the remote URL
        remoteUrl = None
        try:
            remoteUrl = os.popen(
                "git -C {} config --get remote.origin.url".format(
                    ds['path'])).read()
        except Exception:
            print("[ERROR  ] Remote URL couldnt be read.")

        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = createDate
            dataset.date_added_to_portal = firstMergeDate

        if dataset.date_created != createDate:
            dataset.date_created = createDate

        # check for dataset ancestry
        extraprops = dats.get('extraProperties', [])
        for prop in extraprops:
            if prop.get('category') == 'parent_dataset_id':
                for x in prop.get('values', []):
                    if x.get('value', None) is None:
                        continue
                    datasetAncestry = DBDatasetAncestry()
                    datasetAncestry.id = str(uuid.uuid4())
                    datasetAncestry.parent_dataset_id = 'projects/' + \
                        x.get('value', None)
                    datasetAncestry.child_dataset_id = dataset.dataset_id
                    try:
                        db.session.merge(datasetAncestry)
                        db.session.commit()
                    except exc.IntegrityError:
                        # we already have a record of this ancestry
                        db.session.rollback()

        if not dataset.date_added_to_portal:
            dataset.date_added_to_portal = firstMergeDate

        dataset.date_updated = updateDate
        dataset.fspath = ds['path']
        dataset.remoteUrl = remoteUrl
        dataset.description = dats.get('description',
                                       'No description in DATS.json')
        dataset.name = dats.get('title', os.path.basename(dataset.dataset_id))

        db.session.merge(dataset)
        db.session.commit()

        # if the dataset does not have an ARK identifier yet, generate it
        dataset_with_ark_id_list = [
            row[0] for row in db.session.query(ArkId.dataset_id).all()
        ]
        if dataset.dataset_id not in dataset_with_ark_id_list:
            new_ark_id = ark_id_minter(app, 'dataset')
            save_ark_id_in_database(app, 'dataset', new_ark_id,
                                    dataset.dataset_id)
        print('[INFO   ] ' + ds['gitmodule_name'] + ' updated.')
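
The date lookups in this example shell out through os.popen and a head -1 pipeline; a hedged alternative using subprocess.run is sketched below. The function name and return convention are assumptions for illustration, not part of the original module.

import subprocess
from datetime import datetime

def first_commit_date(repo_path):
    # Read all commit timestamps (oldest first) and convert the first one,
    # mirroring the `git log --pretty=format:%ct --reverse` call above.
    result = subprocess.run(
        ['git', '-C', str(repo_path), 'log', '--pretty=format:%ct', '--reverse'],
        capture_output=True, text=True)
    if result.returncode != 0 or not result.stdout:
        return None
    return datetime.fromtimestamp(int(result.stdout.splitlines()[0]))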
Example #4
def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    from app import db, config
    from app.models import Dataset as DBDataset
    from datalad import api
    from datalad.api import Dataset as DataladDataset
    import fnmatch
    import json

    datasetspath = app.config['DATA_PATH']

    d = DataladDataset(path=datasetspath + '/conp-dataset')
    if not d.is_installed():
        api.clone(source='https://github.com/CONP-PCNO/conp-dataset',
                  path=datasetspath + '/conp-dataset')
        d = DataladDataset(path=datasetspath + '/conp-dataset')
        d.install(path='', recursive=True)

    try:
        d.update(path='', merge=True, recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR  ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO   ] conp-dataset update complete')
    print('[INFO   ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO   ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(source=ds['gitmodule_url'], path=ds['path'])
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print(
                    "[ERROR  ] An exception occurred in datalad install for " +
                    str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR  ] DATS.json file can't be found in " + ds['path'] +
                  ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR  ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # use dats.json data to fill the datasets table
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(
            dataset_id=ds['gitmodule_name']).first()
        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = datetime.utcnow()

        dataset.date_updated = datetime.utcnow()
        dataset.fspath = ds['path']
        dataset.description = dats.get('description',
                                       'No description in DATS.json')
        dataset.name = dats.get('title', os.path.basename(dataset.dataset_id))

        db.session.merge(dataset)
        db.session.commit()
        print('[INFO   ] ' + ds['gitmodule_name'] + ' updated.')
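
Every variant of _update_datasets repeats the same DATS.json discovery and fallback logic; a small helper capturing that pattern is sketched below. The function name is hypothetical, and the name fallback here uses the dataset directory rather than the gitmodule name used in the examples.

import fnmatch
import json
import os

def read_dats_metadata(dataset_path):
    # Locate a DATS.json descriptor case-insensitively and return
    # (name, description), with defaults similar to the examples above.
    descriptor = next(
        (f for f in os.listdir(dataset_path)
         if fnmatch.fnmatch(f.lower(), 'dats.json')),
        None)
    if descriptor is None:
        return None
    with open(os.path.join(dataset_path, descriptor), 'r') as f:
        dats = json.load(f)
    name = dats.get('title', os.path.basename(dataset_path))
    description = dats.get('description', 'No description in DATS.json')
    return name, description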
Example #5
def _update_datasets(app):
    """
    Updates from conp-datasets
    """
    from app import db, config
    from app.models import Dataset as DBDataset
    from datalad import api
    from datalad.api import Dataset as DataladDataset
    import fnmatch
    import json
    from pathlib import Path
    import git

    datasetsdir = Path(app.config['DATA_PATH']) / 'conp-dataset'
    datasetsdir.mkdir(parents=True, exist_ok=True)

    # Initialize the git repository object
    try:
        repo = git.Repo(datasetsdir)
    except git.exc.InvalidGitRepositoryError as e:
        repo = git.Repo.clone_from(
            'https://github.com/CONP-PCNO/conp-dataset',
            datasetsdir,
            branch='master'
        )

    # Update to latest commit
    origin = repo.remotes.origin
    origin.pull('master')
    repo.submodule_update(recursive=False, keep_going=True)

    d = DataladDataset(path=datasetsdir)
    if not d.is_installed():
        api.clone(
            source='https://github.com/CONP-PCNO/conp-dataset',
            path=datasetsdir
        )
        d = DataladDataset(path=datasetsdir)

    try:
        d.install(path='', recursive=True)
    except Exception as e:
        print("\033[91m")
        print("[ERROR  ] An exception occurred in datalad update.")
        print(e.args)
        print("\033[0m")
        return

    print('[INFO   ] conp-dataset update complete')
    print('[INFO   ] Updating subdatasets')

    for ds in d.subdatasets():
        print('[INFO   ] Updating ' + ds['gitmodule_url'])
        subdataset = DataladDataset(path=ds['path'])
        if not subdataset.is_installed():
            try:
                api.clone(
                    source=ds['gitmodule_url'],
                    path=ds['path']
                )
                subdataset = DataladDataset(path=ds['path'])
                subdataset.install(path='')
            except Exception as e:
                print("\033[91m")
                print("[ERROR  ] An exception occurred in datalad install for " + str(ds) + ".")
                print(e.args)
                print("\033[0m")
                continue

        dirs = os.listdir(ds['path'])
        descriptor = ''
        for file in dirs:
            if fnmatch.fnmatch(file.lower(), 'dats.json'):
                descriptor = file

        if descriptor == '':
            print("\033[91m")
            print("[ERROR  ] DATS.json file can't be found in " + ds['path'] + ".")
            print("\033[0m")
            continue

        try:
            with open(os.path.join(ds['path'], descriptor), 'r') as f:
                dats = json.load(f)
        except Exception as e:
            print("\033[91m")
            print("[ERROR  ] Descriptor file can't be read.")
            print(e.args)
            print("\033[0m")
            continue

        # use dats.json data to fill the datasets table
        # avoid duplication / REPLACE instead of insert
        dataset = DBDataset.query.filter_by(dataset_id=ds['gitmodule_name']).first()
        if dataset is None:
            dataset = DBDataset()
            dataset.dataset_id = ds['gitmodule_name']
            dataset.date_created = datetime.utcnow()

        dataset.date_updated = datetime.utcnow()
        dataset.fspath = ds['path']
        dataset.description = dats.get('description', 'No description in DATS.json')
        dataset.name = dats.get(
            'title',
            os.path.basename(dataset.dataset_id)
        )

        db.session.merge(dataset)
        db.session.commit()
        print('[INFO   ] ' + ds['gitmodule_name'] + ' updated.')
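
The columns touched across these examples suggest a Dataset model roughly like the following Flask-SQLAlchemy sketch; the column names and types are inferred only from the attribute assignments above and are not the project's actual schema (app.models.Dataset may define more fields).

from datetime import datetime
from flask_sqlalchemy import SQLAlchemy

db = SQLAlchemy()  # in the real application this instance comes from `app`

class Dataset(db.Model):
    # Columns inferred from what _update_datasets reads and writes.
    dataset_id = db.Column(db.String(255), primary_key=True)
    name = db.Column(db.String(255))
    description = db.Column(db.Text)
    fspath = db.Column(db.String(1024))
    remoteUrl = db.Column(db.String(1024))
    date_created = db.Column(db.DateTime, default=datetime.utcnow)
    date_updated = db.Column(db.DateTime)
    date_added_to_portal = db.Column(db.DateTime)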