def make_mini(project_id):
    '''
    Create a sample version of the selected file (call just after upload).

    GET:
        - project_id

    POST:
        - data_params: {
                module_name: 'INIT' (must be 'INIT')
                file_name:
            }
        - module_params: {
                sample_size:
                randomize:
            }
    '''
    data_params, module_params = _parse_request()

    proj = ESNormalizer(project_id=project_id)
    proj.load_data(data_params['module_name'], data_params['file_name'])
    proj.make_mini(module_params)

    # Write transformations and log
    proj.write_data()
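# Hedged usage sketch, not part of the original API: how a client might call
# the make_mini endpoint, assuming it is routed at
# POST .../normalize/make_mini/<project_id> and that _parse_request reads
# data_params and module_params from the JSON body (the route decorator and
# _parse_request are not shown in this excerpt; url, ids and values below are
# illustrative).
def _example_make_mini_request(base_url, project_id):
    import requests
    return requests.post(
        '{0}/normalize/make_mini/{1}'.format(base_url, project_id),
        json={'data_params': {'module_name': 'INIT',
                              'file_name': 'source.csv'},
              'module_params': {'sample_size': 500, 'randomize': True}})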
def _concat_with_init(project_id, data_params, *argv):
    '''
    Concatenate transformed columns with the original file.

    ARGUMENTS (GET):
        project_id: ID for the "normalize" project

    ARGUMENTS (POST):
        - data_params: file to concatenate to the original {
                "module_name": module to fetch from
                "file_name": file to fetch
            }
        - module_params: none
    '''
    proj = ESNormalizer(project_id=project_id)

    # Fall back to the last written file when no data_params were passed
    if data_params is None:
        (module_name, file_name) = proj.get_last_written()
    else:
        module_name = data_params['module_name']
        file_name = data_params['file_name']

    proj.load_data(module_name, file_name)

    _, run_info = proj.transform('concat_with_init', None)

    # Write transformations and logs
    proj.write_data()
    return run_info
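# Hedged usage sketch (ids and file names are illustrative): calling
# _concat_with_init directly, first with explicit data_params, then with None
# to fall back to the project's last written file.
def _example_concat_with_init():
    # Concatenate the output of a previous module with the original file
    run_info = _concat_with_init(
        'my_project_id',
        {'module_name': 'replace_mvs', 'file_name': 'source.csv'})
    # Or let the project pick up whatever was written last
    run_info_last = _concat_with_init('my_project_id', None)
    return run_info, run_info_last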
def upload(project_id):
    '''
    Upload files to a normalization project (NB: you cannot upload directly
    to a link-type project). Also creates the mini version of the file.

    GET:
        - project_id: ID of the normalization project

    POST:
        file: (csv file) A csv to upload to the chosen normalization project.
            NB: the "filename" property will be used to name the file.
        json:
            - module_params:
                - make_mini: (default True) Set to False to NOT create a
                    mini version of the file
                - sample_size
                - randomize
    '''
    # Load project
    proj = ESNormalizer(project_id=project_id)

    _, module_params = _parse_request()
    if module_params is None:
        module_params = {}
    make_mini = module_params.get('make_mini', True)

    # Stream the upload to a named temporary file rather than reading it
    # entirely into memory
    def custom_stream_factory(total_content_length, filename, content_type,
                              content_length=None):
        tmpfile = tempfile.NamedTemporaryFile('wb+', prefix='flaskapp')
        app.logger.info('start receiving file ... filename => '
                        + str(tmpfile.name))
        return tmpfile

    _, _, files = werkzeug.formparser.parse_form_data(
        flask.request.environ, stream_factory=custom_stream_factory)

    # Upload data
    file_name = files['file'].filename
    stream = files['file'].stream
    _, run_info = proj.upload_init_data(stream, file_name)

    # Make mini
    if make_mini:
        proj.load_data('INIT', run_info['file_name'])
        proj.make_mini(module_params)

    # Write transformations and log. The data itself only changes when a mini
    # version was created; otherwise only the metadata needs saving.
    if proj.metadata['has_mini']:
        proj.write_data()
    else:
        proj._write_metadata()

    return jsonify(run_info=run_info, project_id=proj.project_id)
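# Hedged usage sketch: uploading a csv from a client, assuming the endpoint is
# routed at POST .../normalize/upload/<project_id> and that module_params
# travel in a "json" form field (both are assumptions: the route and the exact
# transport expected by _parse_request are not shown in this excerpt).
def _example_upload_request(base_url, project_id, csv_path):
    import json
    import requests
    with open(csv_path, 'rb') as f:
        return requests.post(
            '{0}/normalize/upload/{1}'.format(base_url, project_id),
            files={'file': f},
            data={'json': json.dumps({'module_params': {
                'make_mini': True, 'sample_size': 500, 'randomize': True}})})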
def _replace_mvs(project_id, data_params, module_params):
    '''
    Run the mvs (missing values) replacement module.

    ARGUMENTS (GET):
        project_id: ID for the "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
            }
        - module_params: same as the result of infer_mvs
    '''
    proj = ESNormalizer(project_id=project_id)
    proj.load_data(data_params['module_name'], data_params['file_name'])

    _, run_info = proj.transform('replace_mvs', module_params)

    # Write transformations and log
    proj.write_data()
    return run_info
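# Hedged usage sketch: module_params for _replace_mvs is the output of the
# infer_mvs module; the dict below is a hypothetical shape for illustration
# only (infer_mvs is not shown in this excerpt, so its real output may differ).
def _example_replace_mvs():
    inferred = {'mvs_dict': {'all': ['NA', 'NULL'], 'columns': {}}}  # hypothetical
    return _replace_mvs(
        'my_project_id',
        {'module_name': 'INIT', 'file_name': 'source.csv'},
        inferred)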
def _run_all_transforms(project_id, data_params, *argv):
    '''
    Run all transformations that were already run (based on the presence of
    run_info.json files), re-using the parameters stored in those
    run_info.json files.

    ARGUMENTS (GET):
        project_id: ID for the "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "file_name": file to run the transforms on
                    (module_name is always 'INIT')
            }
    '''
    proj = ESNormalizer(project_id=project_id)

    file_name = data_params['file_name']
    proj.load_data('INIT', file_name)

    all_run_infos = proj.run_all_transforms()

    # Write transformations and logs
    proj.write_data()
    return all_run_infos
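# Hedged usage sketch (ids illustrative): replaying every previously run
# transformation on a fresh copy of the original file, e.g. after re-uploading
# a corrected version of the same csv.
def _example_run_all_transforms():
    return _run_all_transforms('my_project_id', {'file_name': 'source.csv'})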
columns_to_index = {
    # assumption: the first key was truncated in the source; it is restored
    # here as 'localite_acheminement_uai', the reference column queried with
    # the '.french' analyzer in the query_template below
    'localite_acheminement_uai': {'french', 'whitespace', 'integers', 'n_grams'},
    'departement': {'french', 'whitespace', 'integers', 'n_grams'},
    'code_postal_uai': {},
    'full_name': {'french', 'whitespace', 'integers', 'n_grams'}
}
ref.create_index(ref_path, columns_to_index, force=False)

# Link
index_name = proj.metadata['files']['ref']['project_id']

query_template = (
    ('must', 'commune', 'localite_acheminement_uai', '.french', 1),
    ('must', 'lycees_sources', 'full_name', '.french', 1))
threshold = 3.5
must = {'full_name': ['lycee']}
must_not = {'full_name': ['ass', 'association', 'sportive', 'foyer']}

params = dict()
params['index_name'] = index_name
params['query_template'] = query_template
params['thresh'] = threshold
params['must'] = must
params['must_not'] = must_not

proj.linker('es_linker', None, params)
proj.write_data()

import pprint
pprint.pprint(proj.metadata)
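# Reading the query_template above (structure inferred from the tuples; the
# es_linker implementation is not shown in this excerpt): each entry appears
# to be (bool_level, source_column, ref_column, analyzer_suffix, boost), so
# ('must', 'commune', 'localite_acheminement_uai', '.french', 1) asks for the
# source column 'commune' to match the reference column
# 'localite_acheminement_uai' through the '.french' analyzer with boost 1.
def _describe_query_template(query_template):
    for bool_lvl, source_col, ref_col, analyzer, boost in query_template:
        print('{0}: source[{1}] ~ ref[{2}{3}] (boost {4})'.format(
            bool_lvl, source_col, ref_col, analyzer, boost))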