Example #1
from pilot.client import PilotClient


def test_get_subject_url():
    pc = PilotClient()
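    # Positional args are (filename, directory, test); the final boolean
    # toggles the test index.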
    args = ('dataframe.dat', 'my_folder', False)
    test_args = ('dataframe.dat', 'my_folder', True)
    assert pc.get_globus_url(*args) == \
           pc.get_subject_url(*args)
    assert pc.get_globus_url(*test_args) == \
           pc.get_subject_url(*test_args)
Example #2
def publish_gather_metadata(**data):
    import traceback
    from pilot.client import PilotClient
    from pilot.exc import PilotClientException, FileOrFolderDoesNotExist

    try:
        dataset, destination = data['dataset'], data.get('destination', '/')
        index, project = data['index'], data['project']
        groups = data.get('groups', [])

        # Bootstrap Pilot
        pc = PilotClient(config_file=None, index_uuid=index)
        pc.project.set_project(project)
        # short_path is how pilot internally refers to datasets, implicitly accounting for
        # the endpoint and base project path. After publication, you may refer to your
        # dataset via the short path -- ``pilot describe short_path``
        short_path = pc.build_short_path(dataset, destination)
        return {
            'search': {
                'id': data.get('id', 'metadata'),
                'content': pc.gather_metadata(dataset, destination,
                                              custom_metadata=data.get('metadata')),
                'subject': pc.get_subject_url(short_path),
                'visible_to': [f'urn:globus:groups:id:{g}'
                               for g in groups + [pc.get_group()]],
                'search_index': index,
            },
            'transfer': {
                'source_endpoint_id': data['source_globus_endpoint'],
                'destination_endpoint_id': pc.get_endpoint(),
                'transfer_items': [
                    {
                        'source_path': src,
                        'destination_path': dest,
                        # 'recursive': False,  # each file is explicit in pilot, no directories
                    }
                    for src, dest in pc.get_globus_transfer_paths(dataset, destination)
                ],
            },
        }
    except (PilotClientException, FileOrFolderDoesNotExist):
        return traceback.format_exc()
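
# Hedged usage sketch for the flow function above; every value below is a
# placeholder, not a real index, project, or endpoint UUID.
sample = {
    'dataset': '/local/data/dataframe.dat',           # file or folder to publish
    'destination': 'my_folder',                       # path relative to the project base
    'index': '<search-index-uuid>',                   # placeholder search index
    'project': 'foo-project',                         # placeholder pilot project
    'groups': [],                                     # extra group UUIDs granted visibility
    'source_globus_endpoint': '<source-endpoint-uuid>',
}
result = publish_gather_metadata(**sample)
# On success ``result`` holds the 'search' and 'transfer' documents; on a
# known pilot error it is the formatted traceback string instead.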
Example #3
from pilot.client import PilotClient


def test_get_subject_url(mock_projects):
    pc = PilotClient()
    pc.project.current = 'foo-project'
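    # This exercises the single short-path call form ('folder/file') rather
    # than the separate (filename, directory, test) arguments in Example #1.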
    args = ('myfolder/dataframe.dat', )
    assert pc.get_globus_url(*args) == pc.get_subject_url(*args)
Example #4
import json
import os
import urllib.parse

import click

from pilot.client import PilotClient

# Assumed context: PORTAL_DETAIL_PAGE_PREFIX, fetch_format, get_dates,
# get_size, and get_identifier are helpers defined elsewhere in the
# surrounding pilot CLI module.


def describe(path, test, output_json):
    pc = PilotClient()
    if not pc.is_logged_in():
        click.echo('You are not logged in.')
        return

    old_entry = False
    fname, dirname = os.path.basename(path), os.path.dirname(path)
    entry = pc.get_search_entry(fname, dirname, test)
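    # Fall back to an entry published under the old metadata format; the
    # ``old_entry`` flag is reused later when building the subject URL.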
    if not entry:
        old_entry = True
        entry = pc.get_search_entry(fname, dirname, old=True)

    if not entry:
        click.echo('Unable to find entry')
        return

    if output_json:
        click.echo(json.dumps(entry, indent=4))
        return

    general_fmt = '{:21.20}{}'
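    # Two-column layout: a 21-wide label field truncated to 20 characters,
    # followed by the value.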
    general_columns = [
        ('Title', lambda r: r['dc']['titles'][0]['title']),
        ('Authors', lambda r: [c['creatorName'] for c in r['dc']['creators']]),
        ('Publisher', lambda r: r['dc']['publisher']),
        ('Subjects', lambda r: [s['subject'] for s in r['dc']['subjects']]),
        ('Dates', get_dates),
        ('Data', lambda r: r['ncipilot']['data_type']),
        ('Dataframe', lambda r: r['ncipilot']['dataframe_type']),
        ('Rows', lambda r: str(r['field_metadata']['numrows'])),
        ('Columns', lambda r: str(r['field_metadata']['numcols'])),
        ('Formats', lambda r: r['dc']['formats']),
        ('Version', lambda r: r['dc']['version']),
        ('Size', get_size),
        ('Filename', get_identifier),
        ('Description', lambda r: r['dc']['descriptions'][0]['description']),
    ]

    def format_list(name, content):
        return [general_fmt.format(name, content[0])] + \
               [general_fmt.format('', item) for item in content[1:]]

    def format_entry(name, content):
        return [general_fmt.format(name, content)]

    output = fetch_format(general_columns, entry, format_entry, format_list)

    fmt = ('{:21.20}'
           '{:8.7}{:7.6}{:5.4}{:12.11}{:7.6}'
           '{:7.6}{:7.6}{:7.6}{:7.6}'
           '{:8.7}{:8.7}{:8.7}'
           )
    field_metadata = [
        ('Column Name', 'name'),

        ('Type', 'type'),
        ('Count', 'count'),
        ('Freq', 'frequency'),
        ('Top', 'top'),
        ('Unique', 'unique'),

        ('Min', 'min'),
        ('Max', 'max'),
        ('Mean', 'mean'),
        ('Std', 'std'),

        ('25-PCTL', '25'),
        ('50-PCTL', '50'),
        ('75-PCTL', '75'),
    ]
    names = [n for n, f in field_metadata]
    keys = [f for n, f in field_metadata]
    fm_output = []
    try:
        for field in entry['field_metadata']['field_definitions']:
            f_metadata = [str(field.get(key, '')) for key in keys]
            fm_output.append(fmt.format(*f_metadata))

        field_metadata_names = fmt.format(*names)
        output = '{}\n\nField Metadata\n{}\n{}'.format(output,
                                                       field_metadata_names,
                                                       '\n'.join(fm_output))
    except KeyError:
        output = '{}\n\nField Metadata\nNo Field Metadata'.format(output)

    if not test:
        sub = pc.get_subject_url(fname, dirname, test, old=old_entry)
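        # Double-encode the subject so it survives a round of URL decoding
        # when embedded in the portal detail-page path.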
        qsub = urllib.parse.quote_plus(urllib.parse.quote_plus(sub))
        portal_url = '{}{}'.format(PORTAL_DETAIL_PAGE_PREFIX, qsub)
        other_data = [general_fmt.format('Subject', sub),
                      general_fmt.format(path, portal_url)]
        output = '{}\n\nOther Data\n{}'.format(output, '\n'.join(other_data))

    click.echo(output)
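
# Hedged usage sketch: as listed here ``describe`` is a plain function, so it
# can be called directly; in the full CLI it is presumably registered as a
# click command and invoked as ``pilot describe PATH`` (see the comment in
# Example #2). The path below is a placeholder short path.
describe('my_folder/dataframe.dat', test=False, output_json=False)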