def make_mini(project_id):
    '''
    Create a sample ("mini") version of the selected file (call just after
    upload).

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": must be 'INIT'
                "file_name": file to create the sample from
                }
        - module_params: {
                "sample_size": number of rows to keep in the sample
                "randomize": whether to sample rows at random
                }
    '''
    data_params, module_params = _parse_request()   
    proj = ESNormalizer(project_id=project_id)
    
    proj.load_data(data_params['module_name'], data_params['file_name'])
    proj.make_mini(module_params)
    
    # Write transformations and log
    proj.write_data()
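A minimal client-side sketch of how this endpoint might be called. The route,
host, and payload layout are assumptions (they do not appear in this excerpt);
only the data_params / module_params shape comes from the docstring above.

import requests

# Hypothetical route: POST /api/make_mini/<project_id>
payload = {
    'data_params': {'module_name': 'INIT', 'file_name': 'source.csv'},
    'module_params': {'sample_size': 500, 'randomize': True},
}
resp = requests.post('http://localhost:5000/api/make_mini/my_project_id',
                     json=payload)
resp.raise_for_status()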
Example #2
def _concat_with_init(project_id, data_params, *argv):
    '''
    Concatenate transformed columns with the original file.

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: file to concatenate to the original (if None, the
                last-written file is used)
                {
                    "module_name": module to fetch from
                    "file_name": file to fetch
                }
        - module_params: none
    '''
    proj = ESNormalizer(project_id=project_id)

    # TODO: not clean
    if data_params is None:
        (module_name, file_name) = proj.get_last_written()
    else:
        module_name = data_params['module_name']
        file_name = data_params['file_name']

    proj.load_data(module_name, file_name)

    # TODO: there was a pdb breakpoint here; is everything alright?

    _, run_info = proj.transform('concat_with_init', None)

    # Write transformations and logs
    proj.write_data()
    return run_info
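Called directly (rather than over HTTP), the None fallback picks up the most
recently written file. Project, module, and file names below are placeholders.

# Concatenate the last-written output back onto the original file
run_info = _concat_with_init('my_project_id', None)

# Or target a specific module output explicitly
run_info = _concat_with_init('my_project_id',
                             {'module_name': 'replace_mvs',
                              'file_name': 'source.csv'})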
import tempfile

import flask
import werkzeug.formparser
from flask import jsonify


def upload(project_id):
    '''
    Upload a file to a normalization project (NB: you cannot upload directly
    to a link-type project).

    Also creates the mini version of the file.

    ARGUMENTS (GET):
        project_id: ID of the normalization project

    ARGUMENTS (POST):
        file: (csv file) a csv to upload to the chosen normalization project
                NB: the "filename" property will be used to name the file
        json:
            - module_params:
                - make_mini: (default True) set to False to NOT create a
                        mini version of the file
                - sample_size
                - randomize
    '''
    # Load project
    proj = ESNormalizer(project_id=project_id) 
    _, module_params = _parse_request()   
    if module_params is None:
        module_params = {}
    make_mini = module_params.get('make_mini', True) # TODO: can remove ?
    
    # Stream the incoming file to a named temporary file on disk
    def custom_stream_factory(total_content_length, filename, content_type, content_length=None):
        tmpfile = tempfile.NamedTemporaryFile('wb+', prefix='flaskapp')
        app.logger.info("start receiving file ... filename => " + str(tmpfile.name))
        return tmpfile

    _, _, files = werkzeug.formparser.parse_form_data(flask.request.environ, stream_factory=custom_stream_factory)

    # Fetch the uploaded file's name and stream
    file_name = files['file'].filename
    stream = files['file'].stream
    
    _, run_info = proj.upload_init_data(stream, file_name)
    
    # Make mini
    if make_mini:
        proj.load_data('INIT', run_info['file_name'])
        proj.make_mini(module_params)
        
        # Write transformations and log # TODO: not clean
        if proj.metadata['has_mini']:
            proj.write_data()
        else:
            proj._write_metadata()
    
    return jsonify(run_info=run_info, project_id=proj.project_id)
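A hedged client-side counterpart, assuming the route and that module_params
travels in a "json" form field alongside the file (neither is confirmed by
this excerpt):

import json

import requests

# Hypothetical route: POST /api/upload/<project_id>
with open('source.csv', 'rb') as f:
    resp = requests.post(
        'http://localhost:5000/api/upload/my_project_id',
        files={'file': ('source.csv', f, 'text/csv')},
        data={'json': json.dumps({'module_params': {'make_mini': True,
                                                    'sample_size': 500,
                                                    'randomize': True}})})
print(resp.json())  # expected keys: run_info, project_id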
Example #4
def _replace_mvs(project_id, data_params, module_params):
    '''
    Run the missing-value (mvs) replacement module.

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: {
                "module_name": module to fetch from
                "file_name": file to fetch
                }
        - module_params: same as the result of infer_mvs
    '''
    proj = ESNormalizer(project_id=project_id)

    proj.load_data(data_params['module_name'], data_params['file_name'])

    _, run_info = proj.transform('replace_mvs', module_params)

    # Write transformations and log
    proj.write_data()
    return run_info
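Per the docstring, module_params is whatever infer_mvs returns, so the two
calls chain directly. The _infer_mvs helper name and all IDs below are
assumptions for illustration.

# Hypothetical chain: infer missing-value markers, then replace them
data_params = {'module_name': 'INIT', 'file_name': 'source.csv'}
inferred = _infer_mvs('my_project_id', data_params, None)  # assumed sibling helper
run_info = _replace_mvs('my_project_id', data_params, inferred)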
Example #5
def _run_all_transforms(project_id, data_params, *argv):
    '''
    Run all transformations that were already performed (based on the
    presence of run_info.json files), using the parameters stored in those
    run_info.json files.

    ARGUMENTS (GET):
        project_id: ID for "normalize" project

    ARGUMENTS (POST):
        - data_params: file to run the transforms on
                {
                    "file_name": file to use for transform (module_name is 'INIT')
                }
    '''
    proj = ESNormalizer(project_id=project_id)

    file_name = data_params['file_name']

    proj.load_data('INIT', file_name)
    all_run_infos = proj.run_all_transforms()

    # Write transformations and logs
    proj.write_data()
    return all_run_infos
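This makes replaying a recorded pipeline on a fresh upload a one-liner;
project and file names below are placeholders.

# Re-apply every previously recorded transform to a newly uploaded file
all_run_infos = _run_all_transforms('my_project_id',
                                    {'file_name': 'new_source.csv'})
print(all_run_infos)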
Example #6
        {'french', 'whitespace', 'integers', 'n_grams'},
        'departement': {'french', 'whitespace', 'integers', 'n_grams'},
        'code_postal_uai': {},
        'full_name': {'french', 'whitespace', 'integers', 'n_grams'}
    }

    ref.create_index(ref_path, columns_to_index, force=False)

    # Link
    index_name = proj.metadata['files']['ref']['project_id']
    query_template = (
        ('must', 'commune', 'localite_acheminement_uai', '.french', 1),
        ('must', 'lycees_sources', 'full_name', '.french', 1),
    )
    threshold = 3.5
    must = {'full_name': ['lycee']}
    must_not = {'full_name': ['ass', 'association', 'sportive', 'foyer']}

    params = {
        'index_name': index_name,
        'query_template': query_template,
        'thresh': threshold,
        'must': must,
        'must_not': must_not,
    }

    proj.linker('es_linker', None, params)

    proj.write_data()

    import pprint
    pprint.pprint(proj.metadata)