def update_index(self, ref_gen):
    """Add the elements in ref_gen to an existing index."""
    # Flag forwarded to es_insert.index; its exact semantics live in
    # es_insert — TODO confirm (same flag the other index methods pass).
    in_test_mode = True
    logging.warning('Updating index')
    es_insert.index(es, ref_gen, self.index_name, in_test_mode,
                    action="update")
    logging.warning('Finished updating')
def create_index(self, ref_path, columns_to_index, force=False): ''' INPUT: - ref_path: path to the file to index - columns_to_index ''' # To solve http.client.HTTPException: got more than 100 headers import http http.client._MAXHEADERS = 1000 testing = True ref_gen = pd.read_csv(ref_path, usecols=columns_to_index.keys(), dtype=str, chunksize=self.es_insert_chunksize) if self.has_index() and (force or (not self.valid_index())): self.ic.delete(self.index_name) if not self.has_index(): logging.info('Creating new index') log = self._init_active_log('INIT', 'transform') index_settings = es_insert.gen_index_settings( DEFAULT_ANALYZER, columns_to_index, INDEX_SETTINGS_TEMPLATE) logging.warning('Creating index') logging.warning(index_settings) self.ic.create(self.index_name, body=json.dumps(index_settings)) logging.warning('Inserting in index') es_insert.index(es, ref_gen, self.index_name, testing) log = self._end_active_log(log, error=False) logging.warning('Finished indexing') else: logging.info('Index already exists') logging.info('Finished indexing') self.valid_index() self._write_log_buffer(written=False)
# Flag forwarded to es_insert.index; its exact semantics live in
# es_insert — TODO confirm.
testing = True

# (Re-)create the Elasticsearch index for the reference file when forced
# or when it does not exist yet, then insert the csv rows.
if force_re_index or (not ic.exists(ref_table_name)):
    if ic.exists(ref_table_name):
        ic.delete(ref_table_name)
    # Lazy chunked reader over the reference csv (all columns as str).
    ref_gen = pd.read_csv(ref_file_path,
                          usecols=columns_to_index.keys(),
                          dtype=str,
                          chunksize=40000)
    index_settings = es_insert.gen_index_settings(columns_to_index)
    ic.create(ref_table_name, body=json.dumps(index_settings))
    es_insert.index(ref_gen, ref_table_name, testing)

# =============================================================================
# Initiate the labellers
# =============================================================================

if test_num == 2:
    # Presumably: equal SIRET values on both sides mark a certain match —
    # verify against ConsoleLabeller.auto_label.
    columns_certain_match = {'source': ['SIRET'], 'ref': ['SIRET']}

# NOTE(review): columns_certain_match is only bound when test_num == 2, so
# the auto_label calls below raise NameError otherwise — confirm the
# intended scoping of this script section (the source paste lost its
# original indentation).
labellers = dict()
for i in range(3):
    labellers[i] = ConsoleLabeller(es, source, ref_table_name,
                                   match_cols, columns_to_index)
    labellers[i].auto_label(columns_certain_match)

# import cProfile
force_re_index = True # Usually set to false # Create the index es_insert.create_index(es, ref_table_name, columns_to_index, default_analyzer=default_analyzer, analyzer_definitions=ANALYZERS, force=force_re_index) # Insert documents in the index ref_gen = pd.read_csv(ref_file_path, usecols=columns_to_index.keys(), dtype=str, chunksize=40000) es_insert.index(es, ref_gen, ref_table_name, testing=True) # ============================================================================= # 3. Initiate the labeller # ============================================================================= # ----------------------------------------------------------------------------- # NB.1: # Enter `h` or `help` in # # NB.2: # Advanced users may want to skip the labelling process and go directly to # linking (step 6.) and enter custom parameters instead of learning them # # EX.1: # For the provide example it might be usefull to add the following filters (=f)
def create_index(self, ref_path, columns_to_index, force=False,
                 no_delete=False):
    '''Index a csv file in Elasticsearch.

    Unless force is set to True, this method will check if an index
    already exists with a mapping that includes that requested by
    columns_to_index. If not it will delete the existing index and
    fully re-index.

    # TODO: look into re-indexing a single column

    Parameters
    ----------
    ref_path: str
        Path to the csv file to index.
    columns_to_index: dict like {col1: list_of_analyzers1, float_col: 'float' ...}
        The analyzers (or type if not string) to use for each column.
    force: bool
        Force deleting any existing index in all cases.
    no_delete: bool
        Prevent deleting if set to True unless force is also set to True.
    '''
    # To solve http.client.HTTPException: got more than 100 headers
    import http
    http.client._MAXHEADERS = 1000

    # Flag forwarded to es_insert.index; its exact semantics live in
    # es_insert — TODO confirm.
    testing = True

    # Per-column pandas dtypes (delegated to self._choose_dtype).
    dtype = {col: self._choose_dtype(col)
             for col in columns_to_index.keys()}
    ref_gen = pd.read_csv(ref_path,
                          usecols=columns_to_index.keys(),
                          dtype=dtype,
                          chunksize=self.es_insert_chunksize)

    # 1) Unconditional delete: forced, or the existing index is invalid.
    if self.has_index() and (force or (not self.valid_index())):
        print('[create_index] Deleting index')
        self.ic.delete(self.index_name)

    # Keep only string columns: their value is a list of analyzers,
    # whereas non-string columns map to a type name (e.g. 'float').
    columns_to_index_str = {key: val
                           for key, val in columns_to_index.items()
                           if not isinstance(val, str)}

    # 2) Delete if the existing mapping lacks any requested analyzer.
    if self.has_index() and (not no_delete):
        # Fixed: was a bare `ic`; the rest of this method uses self.ic.
        # NOTE(review): the mapping is fetched under self.project_id while
        # deletion targets self.index_name — confirm the two coincide.
        mapping = self.ic.get_mapping(self.project_id)[
            self.project_id]['mappings']['structure']['properties']
        for col, analyzers in columns_to_index_str.items():
            # Analyzers requested for this column but absent from the
            # current mapping. A column missing entirely counts as
            # missing all of its analyzers (the previous lookup crashed
            # with AttributeError in that case: the {'fields': None}
            # default has no .get).
            col_fields = mapping.get(col, {}).get('fields') or {}
            missing = [(col, a) for a in analyzers
                       if col_fields.get(a) is None]
            if missing:
                print('Mapping is: {0}\nCol: {1}\nAnalyzers:{2}'.format(
                    mapping, col, analyzers))
                print('[create_index] Deleting index because of analyzers')
                print('Missing:\n', missing)
                logging.warning(
                    '[create_index] Deleting index because of missing'
                    ' analyzers')
                self.ic.delete(self.index_name)
                break

    # 3) (Re-)create and fill the index when it no longer exists.
    if not self.has_index():
        logging.info('Creating new index')
        log = self._init_active_log('INIT', 'transform')  # TODO: is this right ?
        logging.warning('Creating index')
        es_insert.create_index(self.es, self.index_name, columns_to_index,
                               default_analyzer=DEFAULT_ANALYZER,
                               analyzer_definitions=ANALYZERS,
                               force=force)
        logging.warning('Inserting in index')
        # Fixed: was a bare `es`; consistent with self.es above.
        es_insert.index(self.es, ref_gen, self.index_name, testing,
                        action='index')
        log = self._end_active_log(log, error=False)
        logging.warning('Finished indexing')
        time.sleep(5)  # TODO: why is this necessary?
    else:
        logging.info('Index already exists')
        logging.info('Finished indexing')

    # Re-validate and flush the buffered log entries.
    self.valid_index()
    self._write_log_buffer(written=False)