    def search_with_dataset(dataset_path, query=None, **kwargs):
        """Search the NYU datamart using a dataset"""
        if not isfile(dataset_path):
            user_msg = 'The dataset file could not be found.'
            return err_resp(user_msg)

        search_url = get_nyu_url() + '/search'

        # --------------------------------
        # Behavioral logging
        # --------------------------------
        if 'user_workspace' in kwargs:
            log_data = dict(feature_id=f'POST|by-dataset|{search_url}',
                            activity_l1=bl_static.L1_DATA_PREPARATION,
                            activity_l2=bl_static.L2_DATA_SEARCH,
                            path=search_url)

            LogEntryMaker.create_datamart_entry(kwargs['user_workspace'],
                                                log_data)
        # --------------------------------

        # --------------------------------
        # Query the datamart
        # --------------------------------
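        # The dataset itself goes up as a multipart/form-data file under the
        # 'data' key; when a query is supplied it rides along as a second
        # form field.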
        try:
            with open(dataset_path, 'rb') as dataset_p:
                search_files = dict(data=dataset_p)
                if query:
                    search_files['query'] = query

                try:
                    response = requests.post(
                        search_url,
                        files=search_files,
                        timeout=settings.DATAMART_LONG_TIMEOUT)

                except requests.exceptions.Timeout as err_obj:
                    return err_resp('Request timed out. Responded with: %s' %
                                    err_obj)

        except IOError as err_obj:
            user_msg = (f'Failed to search with the dataset file.'
                        f'  Technical: {err_obj}')
            return err_resp(user_msg)

        if response.status_code != 200:
            print(str(response))
            print(response.text)
            return err_resp(('NYU Datamart internal server error.'
                             ' status_code: %s') % response.status_code)

        json_results = response.json()['results']

        if not json_results:
            return err_resp('No datasets found. (%s)' %
                            get_timestamp_string_readable(time_only=True))

        print('num results: ', len(json_results))

        return ok_resp(json_results)
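
    # A minimal usage sketch (assumptions: this function lives on
    # DatamartJobUtilNYU, `workspace` is an existing UserWorkspace, and the
    # dataset path is hypothetical):
    #
    #   search_info = DatamartJobUtilNYU.search_with_dataset(
    #       '/ravens_volume/test_data/tables/learningData.csv',
    #       query=None,
    #       user_workspace=workspace)
    #   if search_info.success:
    #       print('num results:', len(search_info.result_obj))
    #   else:
    #       print(search_info.err_msg)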
    def datamart_augment(user_workspace,
                         data_path,
                         search_result,
                         exact_match=False,
                         **kwargs):
        """Augment the data file via the ISI API"""
        if not isinstance(user_workspace, UserWorkspace):
            return err_resp('user_workspace must be a UserWorkspace')

        if not isfile(data_path):
            user_msg = f'Original data file not found: {data_path}'
            return err_resp(user_msg)

        # ----------------------------
        # mock call
        # ----------------------------
        # 291780000
        """
        right_data = '291770000'
        left_columns= '[[2]]'
        right_columns = '[[6]]'
        exact_match = True
        data_path = '/Users/ramanprasad/Documents/github-rp/TwoRavens/ravens_volume/test_data/TR1_Greed_Versus_Grievance/TRAIN/dataset_TRAIN/tables/learningData.csv'
        """
        # ----------------------------
        LOGGER.info('(1) build path')
        datamart_id = search_result[dm_static.KEY_ISI_DATAMART_ID]

        dest_filepath_info = DatamartJobUtilISI.get_output_filepath(
            user_workspace,
            f'{datamart_id}-{get_timestamp_string()}',
            dir_type=dm_static.KEY_AUGMENT)

        if not dest_filepath_info.success:
            return err_resp(dest_filepath_info.err_msg)

        augment_filepath = dest_filepath_info.result_obj

        augment_url = get_isi_url() + '/augment'

        # ----------------------------
        # Behavioral logging
        # ----------------------------
        log_data = dict(feature_id=f'POST|{augment_url}',
                        activity_l1=bl_static.L1_DATA_PREPARATION,
                        activity_l2=bl_static.L2_DATA_AUGMENT,
                        path=augment_url)

        LogEntryMaker.create_datamart_entry(user_workspace, log_data)
        # ----------------------------

        try:
            with open(data_path, 'rb') as data_p:
                try:
                    response = requests.post(
                        augment_url,
                        data={
                            'task': json.dumps(search_result),
                            'format': 'd3m'
                        },
                        files={'data': data_p},
                        verify=False,
                        timeout=settings.DATAMART_VERY_LONG_TIMEOUT)

                except requests.exceptions.Timeout as err_obj:
                    return err_resp('Request timed out. Responded with: %s' %
                                    err_obj)

        except IOError as err_obj:
            user_msg = (f'Failed to augment with the data file.'
                        f'  Technical: {err_obj}')
            return err_resp(user_msg)

        if response.status_code != 200:
            user_msg = (f'ISI Augment response failed with status code: '
                        f'{response.status_code}.')
            return err_resp(user_msg)

        save_info = DatamartJobUtilISI.save_datamart_file(
                        augment_filepath,
                        response)

        if not save_info.success:
            return err_resp(save_info.err_msg)
        save_info = save_info.result_obj

        # -----------------------------------------
        # Retrieve preview rows and return response
        # -----------------------------------------

        # preview rows
        #
        preview_info = read_file_rows(save_info[dm_static.KEY_DATA_PATH],
                                      dm_static.NUM_PREVIEW_ROWS)
        if not preview_info.success:
            user_msg = (f'Failed to retrieve preview rows.'
                        f' {preview_info.err_msg}')
            return err_resp(user_msg)

        # Format/return response
        #
        info_dict = DatamartJobUtilISI.format_materialize_response(
            datamart_id,
            dm_static.DATAMART_ISI_NAME,
            save_info[dm_static.KEY_DATA_PATH],
            preview_info,
            **save_info)

        return ok_resp(info_dict)
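
    # A minimal usage sketch (assumptions: `workspace` is a UserWorkspace and
    # `isi_search_result` is one entry from a prior ISI search response; the
    # path is hypothetical):
    #
    #   augment_info = DatamartJobUtilISI.datamart_augment(
    #       workspace,
    #       '/ravens_volume/test_data/tables/learningData.csv',
    #       isi_search_result)
    #   if augment_info.success:
    #       print(augment_info.result_obj)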
    def datamart_materialize(user_workspace, search_result):
        """Materialize the dataset"""
        LOGGER.info('-- attempt to materialize ISI dataset --')
        if not isinstance(user_workspace, UserWorkspace):
            return err_resp('user_workspace must be a UserWorkspace')

        if not isinstance(search_result, dict):
            return err_resp('search_result must be a python dictionary')

        if dm_static.KEY_ISI_DATAMART_ID not in search_result:
            user_msg = (f'"search_result" did not contain'
                        f' "{dm_static.KEY_ISI_DATAMART_ID}" key')
            return err_resp(user_msg)

        # -----------------------------------------
        # Format output file path
        # -----------------------------------------
        LOGGER.info('(1) build path')
        datamart_id = search_result[dm_static.KEY_ISI_DATAMART_ID]

        dest_filepath_info = DatamartJobUtilISI.get_output_filepath(
                                        user_workspace,
                                        datamart_id,
                                        dir_type='materialize')

        if not dest_filepath_info.success:
            return err_resp(dest_filepath_info.err_msg)

        dest_filepath = dest_filepath_info.result_obj

        LOGGER.info('(2) Download file')

        # -----------------------------------------
        # Has the file already been downloaded?
        # -----------------------------------------
        print('dest_filepath', dest_filepath)
        if isfile(dest_filepath):
            LOGGER.info('(2a) file already downloaded')

            # Get preview rows
            #
            preview_info = read_file_rows(dest_filepath,
                                          dm_static.NUM_PREVIEW_ROWS)
            if not preview_info.success:
                user_msg = (f'Failed to retrieve preview rows.'
                            f' {preview_info.err_msg}')
                return err_resp(user_msg)

            info_dict = DatamartJobUtilISI.format_materialize_response(
                            datamart_id, dm_static.DATAMART_ISI_NAME,
                            dest_filepath, preview_info)

            return ok_resp(info_dict)

        # -----------------------------------------
        # Download the file
        # -----------------------------------------
        # can this be streamed to a file?
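        # (requests does support streaming via stream=True plus
        # Response.iter_content; the NYU datamart_materialize example below
        # downloads with stream=True.)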

        LOGGER.info('(2b) attempting download')

        # ----------------------------
        # Behavioral logging
        # ----------------------------
        isi_materialize_url = get_isi_url() + f'/download/{datamart_id}'

        log_data = dict(feature_id=f'GET|{isi_materialize_url}',
                        activity_l1=bl_static.L1_DATA_PREPARATION,
                        activity_l2=bl_static.L2_DATA_DOWNLOAD,
                        path=isi_materialize_url)

        LogEntryMaker.create_datamart_entry(user_workspace, log_data)

        try:
            print('isi_materialize_url', isi_materialize_url)
            response = requests.get(
                        isi_materialize_url,
                        params={'id': datamart_id, 'format': 'd3m'},
                        verify=False,
                        timeout=settings.DATAMART_LONG_TIMEOUT)
        except requests.exceptions.Timeout as err_obj:
            return err_resp('Request timed out. Responded with: %s' % err_obj)

        if response.status_code != 200:
            user_msg = (f'Materialize failed.  Status code:'
                        f' {response.status_code}.  response: {response.text}')
            return err_resp(user_msg)

        LOGGER.info('(3) Download complete.  Save file')

        # -----------------------------------------
        # Save the downloaded file
        # -----------------------------------------
        save_info = DatamartJobUtilISI.save_datamart_file(
                        dest_filepath,
                        response)

        if not save_info.success:
            return err_resp(save_info.err_msg)
        save_info = save_info.result_obj

        # -----------------------------------------
        # Retrieve preview rows and return response
        # -----------------------------------------

        LOGGER.info('(4) File saved')

        # preview rows
        #
        preview_info = read_file_rows(save_info[dm_static.KEY_DATA_PATH],
                                      dm_static.NUM_PREVIEW_ROWS)
        if not preview_info.success:
            user_msg = (f'Failed to retrieve preview rows.'
                        f' {preview_info.err_msg}')
            return err_resp(user_msg)

        # Format/return response
        #
        info_dict = DatamartJobUtilISI.format_materialize_response(
            datamart_id, dm_static.DATAMART_ISI_NAME,
            save_info[dm_static.KEY_DATA_PATH], preview_info, **save_info)

        return ok_resp(info_dict)
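
    # A minimal usage sketch (assumption: `isi_search_result` contains the
    # ISI datamart id key checked above):
    #
    #   materialize_info = DatamartJobUtilISI.datamart_materialize(
    #       workspace, isi_search_result)
    #   if materialize_info.success:
    #       print(materialize_info.result_obj)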
    def datamart_search(query_dict=None, dataset_path=None, **kwargs):
        """Search the ISI datamart"""

        if query_dict is None and dataset_path is None:
            return err_resp('Either a query or dataset path must be supplied.')

        if query_dict is not None and not isinstance(query_dict, dict):
            user_msg = ('There is something wrong with the search parameters.'
                        ' Please try again. (expected a dictionary)')
            return err_resp(user_msg)

        search_url = get_isi_url() + '/search'

        # --------------------------------
        # Behavioral logging
        # --------------------------------
        if 'user' in kwargs:
            log_data = dict(feature_id=f'POST|{search_url}',
                            activity_l1=bl_static.L1_DATA_PREPARATION,
                            activity_l2=bl_static.L2_DATA_SEARCH,
                            path=search_url)

            LogEntryMaker.create_datamart_entry(kwargs['user'], log_data)
        # --------------------------------

        # --------------------------------
        # Query the datamart
        # --------------------------------

        query_json = None
        if query_dict:
            formatted_json_info = json_dumps(query_dict)
            if not formatted_json_info.success:
                return err_resp('Failed to convert query to JSON. %s' %
                                formatted_json_info.err_msg)
            query_json = formatted_json_info.result_obj

        print(f'formatted query: {query_json}')

        if dataset_path:
            limit = kwargs.get('limit', 20)
            if not isinstance(limit, int):
                user_msg = ('The results limit must be an'
                            ' integer (datamart_search)')
                return err_resp(user_msg)

            if not USE_CACHED_SEARCH:
                try:
                    with open(dataset_path, 'rb') as dataset_p:
                        try:
                            response = requests.post(
                                search_url,
                                params={'max_return_docs': limit},
                                json={'query_json': query_json},
                                files={'data': dataset_p},
                                verify=False,
                                timeout=settings.DATAMART_LONG_TIMEOUT)

                        except requests.exceptions.Timeout as err_obj:
                            return err_resp(
                                'Request timed out. Responded with: %s' %
                                err_obj)

                except IOError as err_obj:
                    user_msg = (f'Failed to search with the dataset file.'
                                f'  Technical: {err_obj}')
                    return err_resp(user_msg)

        else:
            raise NotImplementedError(
                'Augmentations on results without a dataset path are not implemented by ISI.'
            )

        if not USE_CACHED_SEARCH:
            if response.status_code != 200:
                return err_resp(response.reason)

            response_json = response.json()

            if response_json['code'] != "0000":
                return err_resp(response_json['message'])

        else:
            import json
            print('loading file')
            cache_path = '/datamart_endpoints/cached_isi_search_response.json'
            with open(cache_path) as cache_file:
                response_json = json.load(cache_file)
        json_results = response_json['search_results']['results']

        #num_datasets = len(response['data'])
        #print('num_datasets', num_datasets)
        #print('iterating through....')

        sorted_data = sorted(json_results,
                             key=lambda k: k['score'],
                             reverse=True)

        #print([ds['score'] for ds in sorted_data])

        return ok_resp(sorted_data[:limit])
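
    # A minimal usage sketch (hypothetical query dict and path; `limit`
    # defaults to 20 above):
    #
    #   search_info = DatamartJobUtilISI.datamart_search(
    #       query_dict={'keywords': ['poverty']},
    #       dataset_path='/ravens_volume/test_data/tables/learningData.csv',
    #       limit=10)
    #   if search_info.success:
    #       top_results = search_info.result_obj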
    def search_with_dataset(dataset_path, query=None, **kwargs):
        """Search the ISI datamart using a dataset"""
        if not isfile(dataset_path):
            user_msg = 'The dataset file could not be found.'
            return err_resp(user_msg)

        search_url = get_isi_url() + '/search'

        # --------------------------------
        # Behavioral logging
        # --------------------------------
        if 'user_workspace' in kwargs:
            log_data = dict(feature_id=f'POST|by-dataset|{search_url}',
                            activity_l1=bl_static.L1_DATA_PREPARATION,
                            activity_l2=bl_static.L2_DATA_SEARCH,
                            path=search_url)

            LogEntryMaker.create_datamart_entry(kwargs['user_workspace'],
                                                log_data)
        # --------------------------------

        # --------------------------------
        # Query the datamart
        # --------------------------------

        query_json = None
        if query:
            formatted_json_info = json_dumps(query)
            if not formatted_json_info.success:
                return err_resp('Failed to convert query to JSON. %s' %
                                formatted_json_info.err_msg)
            query_json = formatted_json_info.result_obj

        print(f'formatted query: {query_json}')

        limit = kwargs.get('limit', 20)
        if not isinstance(limit, int):
            user_msg = ('The results limit must be an'
                        ' integer (datamart_search)')
            return err_resp(user_msg)

        if not USE_CACHED_SEARCH:
            try:
                with open(dataset_path, 'rb') as dataset_p:
                    try:
                        response = requests.post(
                            search_url,
                            params={'max_return_docs': limit},
                            json={'query_json': query_json},
                            files={'data': dataset_p},
                            verify=False,
                            timeout=settings.DATAMART_LONG_TIMEOUT)

                    except requests.exceptions.Timeout as err_obj:
                        return err_resp(
                            'Request timed out. Responded with: %s' % err_obj)

            except IOError as err_obj:
                user_msg = (f'Failed to search with the dataset file.'
                            f'  Technical: {err_obj}')
                return err_resp(user_msg)

            if response.status_code != 200:
                print(str(response))
                print(response.text)
                return err_resp(('ISI Datamart internal server error.'
                                 ' status_code: %s') % response.status_code)

            response_json = response.json()
        else:
            import json
            print('loading file')
            cache_path = '/datamart_endpoints/cached_isi_search_response.json'
            with open(cache_path) as cache_file:
                response_json = json.load(cache_file)

        #print('response_json', response_json)

        if 'results' not in response_json:
            return err_resp('No datasets found. (%s)' %
                            get_timestamp_string_readable(time_only=True))

        json_results = response_json['results']

        print('num results: ', len(json_results))

        return ok_resp(json_results)
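
    # Usage mirrors the NYU search_with_dataset above; a minimal sketch
    # (hypothetical path; the query dict is serialized via json_dumps):
    #
    #   search_info = DatamartJobUtilISI.search_with_dataset(
    #       '/ravens_volume/test_data/tables/learningData.csv',
    #       query={'keywords': ['poverty']},
    #       user_workspace=workspace)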
    def datamart_augment(user_workspace, dataset_path, task_data, **kwargs):
        """Augment the file via the NYU API"""
        if not isinstance(user_workspace, UserWorkspace):
            return err_resp('user_workspace must be a UserWorkspace')

        # Make sure the source file exists
        #
        if not isfile(dataset_path):
            user_msg = f'Original data file not found: {dataset_path}'
            return err_resp(user_msg)

        # Make sure the NYU datamart id is in the task_data
        #
        if dm_static.KEY_NYU_DATAMART_ID not in task_data:
            user_msg = (f'"task_data" did not contain'
                        f' "{dm_static.KEY_NYU_DATAMART_ID}" key')
            return err_resp(user_msg)

        # used for folder naming
        #
        datamart_id = task_data[dm_static.KEY_NYU_DATAMART_ID]

        # ---------------------------------
        # The augment url...
        # ---------------------------------
        augment_url = f"{ get_nyu_url() }/augment"

        # ----------------------------
        # Behavioral logging
        # ----------------------------
        log_data = dict(feature_id=f'POST|{augment_url}',
                        activity_l1=bl_static.L1_DATA_PREPARATION,
                        activity_l2=bl_static.L2_DATA_AUGMENT,
                        path=augment_url)

        LogEntryMaker.create_datamart_entry(user_workspace, log_data)
        # ----------------------------

        # ---------------------------------
        # Ready the query parameters and
        #   make the augment request
        # ---------------------------------
        try:
            with open(dataset_path, 'rb') as dataset_p:
                data_params = dict(data=dataset_p,
                                   task=json.dumps(task_data))

                try:
                    response = requests.post(augment_url,
                                             files=data_params,
                                             stream=True,
                                             allow_redirects=True,
                                             verify=False,
                                             timeout=settings.DATAMART_LONG_TIMEOUT)
                except requests.exceptions.Timeout as err_obj:
                    return err_resp('Request timed out. Responded with: %s' %
                                    err_obj)

        except IOError as err_obj:
            user_msg = (f'Failed to augment with the data file.'
                        f'  Technical: {err_obj}')
            return err_resp(user_msg)

        # Any errors?
        #
        if response.status_code != 200:
            user_msg = (f'NYU Datamart internal server error. Status code:'
                        f' "{response.status_code}".'
                        f' <hr />Technical: {response.content}')
            # print(response.content)

            return err_resp(user_msg)

        # Write the augmented file
        #
        dest_folderpath_info = DatamartJobUtilNYU.get_output_folderpath(\
                                        user_workspace,
                                        datamart_id,
                                        dir_type=dm_static.KEY_AUGMENT)

        if not dest_folderpath_info.success:
            return err_resp(dest_folderpath_info.err_msg)

        augment_folderpath = dest_folderpath_info.result_obj

        # Set the output file
        #
        dest_filepath = join(augment_folderpath, 'tables', 'learningData.csv')

        save_info = DatamartJobUtilNYU.save_datamart_file(
                                    augment_folderpath,
                                    response,
                                    expected_filepath=dest_filepath)

        if not save_info.success:
            return err_resp(save_info.err_msg)
        save_info = save_info.result_obj

        # -----------------------------------------
        # Retrieve preview rows and return response
        # -----------------------------------------

        # preview rows
        #
        preview_info = read_file_rows(save_info[dm_static.KEY_DATA_PATH],
                                      dm_static.NUM_PREVIEW_ROWS)
        if not preview_info.success:
            user_msg = (f'Failed to retrieve preview rows.'
                        f' {preview_info.err_msg}')
            return err_resp(user_msg)

        # Format/return response
        #
        info_dict = DatamartJobUtilNYU.format_materialize_response(
                        datamart_id,
                        dm_static.DATAMART_NYU_NAME,
                        save_info[dm_static.KEY_DATA_PATH],
                        preview_info,
                        **save_info)

        return ok_resp(info_dict)
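
    # A minimal usage sketch (assumption: `nyu_search_result` is one entry
    # from a prior NYU search response containing the NYU datamart id; the
    # path is hypothetical):
    #
    #   augment_info = DatamartJobUtilNYU.datamart_augment(
    #       workspace,
    #       '/ravens_volume/test_data/tables/learningData.csv',
    #       nyu_search_result)
    #   if augment_info.success:
    #       print(augment_info.result_obj)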
    def datamart_materialize(user_workspace, search_result):
        """Materialize an NYU dataset!"""
        LOGGER.info('-- attempt to materialize NYU dataset --')
        if not isinstance(user_workspace, UserWorkspace):
            return err_resp('user_workspace must be a UserWorkspace')

        if not isinstance(search_result, dict):
            return err_resp('search_result must be a python dictionary')

        print('\nsearch_result', search_result)
        print('\nsearch_result.keys()', search_result.keys())
        if dm_static.KEY_NYU_DATAMART_ID not in search_result:
            user_msg = (f'"search_result" did not contain'
                        f' "{dm_static.KEY_NYU_DATAMART_ID}" key')
            return err_resp(user_msg)

        # -----------------------------------------
        # Build the folder path where the .zip will
        #   be unbundled
        # -----------------------------------------
        LOGGER.info('(1) build path')
        datamart_id = search_result[dm_static.KEY_NYU_DATAMART_ID]

        dest_folderpath_info = DatamartJobUtilNYU.get_output_folderpath(
                                        user_workspace,
                                        datamart_id,
                                        dir_type=dm_static.KEY_MATERIALIZE)

        # Failed to get/create the output folder
        #
        if not dest_folderpath_info.success:
            return err_resp(dest_folderpath_info.err_msg)

        # Set the output folder
        #
        dest_folderpath = dest_folderpath_info.result_obj

        # Set the output file path
        #
        dest_filepath = join(dest_folderpath, 'tables', 'learningData.csv')

        LOGGER.info('(2) Download file')

        # -----------------------------------------
        # Has the file already been downloaded?
        # -----------------------------------------
        print('dest_filepath', dest_filepath)

        LOGGER.info('(2a) Has the file already been downloaded?')
        if isfile(dest_filepath):
            LOGGER.info('Yes, already downloaded')

            # Get preview rows
            #
            preview_info = read_file_rows(dest_filepath,
                                          dm_static.NUM_PREVIEW_ROWS)
            if not preview_info.success:
                user_msg = (f'Failed to retrieve preview rows.'
                            f' {preview_info.err_msg}')
                return err_resp(user_msg)

            info_dict = DatamartJobUtilNYU.format_materialize_response(
                            datamart_id, dm_static.DATAMART_NYU_NAME,
                            dest_filepath, preview_info)

            return ok_resp(info_dict)

        # -----------------------------------------
        # Download the file
        # -----------------------------------------
        LOGGER.info('(2b) File not yet downloaded. Attempting download')

        if 'id' not in search_result:
            user_msg = 'search_result did not contain the key "id"'
            return err_resp(user_msg)

        download_url = (f'{get_nyu_url()}/download/'
                        f'{search_result[dm_static.KEY_NYU_DATAMART_ID]}')

        # ----------------------------
        # Behavioral logging
        # ----------------------------
        log_data = dict(feature_id=f'GET|{download_url}',
                        activity_l1=bl_static.L1_DATA_PREPARATION,
                        activity_l2=bl_static.L2_DATA_DOWNLOAD,
                        path=download_url)

        LogEntryMaker.create_datamart_entry(user_workspace, log_data)

        # ----------------------------
        # Download the file!
        # ----------------------------
        try:
            response = requests.get(
                        download_url,
                        params={'format': 'd3m'},
                        verify=False,
                        stream=True,
                        timeout=settings.DATAMART_LONG_TIMEOUT)
        except requests.exceptions.Timeout as err_obj:
            return err_resp('Request timed out. Responded with: %s' % err_obj)

        if response.status_code != 200:
            user_msg = (f'Materialize failed.  Status code:'
                        f' {response.status_code}.  response: {response.text}')
            return err_resp(user_msg)

        save_info = DatamartJobUtilNYU.save_datamart_file(
                                    dest_folderpath,
                                    response,
                                    expected_filepath=dest_filepath)

        if not save_info.success:
            return err_resp(save_info.err_msg)
        save_info = save_info.result_obj

        # ----------------------------
        # Get preview rows
        # ----------------------------
        preview_info = read_file_rows(save_info[dm_static.KEY_DATA_PATH],
                                      dm_static.NUM_PREVIEW_ROWS)
        if not preview_info.success:
            user_msg = (f'Failed to retrieve preview rows.'
                        f' {preview_info.err_msg}')
            return err_resp(user_msg)

        info_dict = DatamartJobUtilNYU.format_materialize_response(
            datamart_id,
            dm_static.DATAMART_NYU_NAME,
            dest_filepath,
            preview_info,
            **save_info)

        return ok_resp(info_dict)
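
    # A minimal usage sketch (assumption: `nyu_search_result` includes both
    # the NYU datamart id and "id" keys checked above):
    #
    #   materialize_info = DatamartJobUtilNYU.datamart_materialize(
    #       workspace, nyu_search_result)
    #   if materialize_info.success:
    #       print(materialize_info.result_obj)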
    def datamart_search(query_dict=None, dataset_path=None, **kwargs):
        """Search the NYU datamart"""

        if query_dict is None and dataset_path is None:
            return err_resp('Either a query or dataset path must be supplied.')

        if query_dict is not None and not isinstance(query_dict, dict):
            user_msg = ('There is something wrong with the search parameters.'
                        ' Please try again. (expected a dictionary)')
            return err_resp(user_msg)

        search_url = get_nyu_url() + '/search'

        # --------------------------------
        # Behavioral logging
        # --------------------------------
        if 'user' in kwargs:
            log_data = dict(feature_id=f'POST|{search_url}',
                            activity_l1=bl_static.L1_DATA_PREPARATION,
                            activity_l2=bl_static.L2_DATA_SEARCH,
                            path=search_url)

            LogEntryMaker.create_datamart_entry(kwargs['user'], log_data)
        # --------------------------------

        # --------------------------------
        # Query the datamart
        # --------------------------------

        if dataset_path:
            try:
                with open(dataset_path, 'rb') as dataset_p:
                    try:
                        response = requests.post(
                            search_url,
                            json=query_dict,
                            files=dict(data=dataset_p),
                            timeout=settings.DATAMART_LONG_TIMEOUT)

                    except requests.exceptions.Timeout as err_obj:
                        return err_resp(
                            'Request timed out. Responded with: %s' % err_obj)

            except IOError as err_obj:
                user_msg = (f'Failed to search with the dataset file.'
                            f'  Technical: {err_obj}')
                return err_resp(user_msg)

        else:
            try:
                response = requests.post(
                    search_url,
                    json=query_dict,
                    stream=True,
                    timeout=settings.DATAMART_LONG_TIMEOUT)
            except requests.exceptions.Timeout as err_obj:
                return err_resp('Request timed out. Responded with: %s' %
                                err_obj)

        # Check the response status for both the dataset and query-only paths
        #
        if response.status_code != 200:
            print(str(response))
            print(response.text)
            return err_resp(('NYU Datamart internal server error.'
                             ' status_code: %s') % response.status_code)

        json_results = response.json()['results']

        if not json_results:
            return err_resp('No datasets found. (%s)' %
                            get_timestamp_string_readable(time_only=True))

        # print('num results: ', len(json_results))

        return ok_resp(json_results)
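
    # A minimal usage sketch (hypothetical query; at least one of query_dict
    # or dataset_path must be supplied):
    #
    #   search_info = DatamartJobUtilNYU.datamart_search(
    #       query_dict={'keywords': ['poverty']})
    #   if search_info.success:
    #       for dataset in search_info.result_obj:
    #           print(dataset.get('id'), dataset.get('score'))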