def __init__(self, verbose=False, quiet=True):
    ETL.__init__(self, verbose=verbose)
    self.read_configfiles()
    self.config["plugins"] = []
def run_etl_groups(cls, logger, data_manager, neo_transactor):
    """This function runs each group of ETLs in parallel."""
    etl_time_tracker_list = []

    for etl_group in cls.etl_groups:
        etl_group_start_time = time.time()
        logger.info("Starting ETL group: %s" % etl_group)
        thread_pool = []

        for etl_name in etl_group:
            logger.info("ETL Name: %s" % etl_name)
            config = data_manager.get_config(etl_name)
            if config is not None:
                etl = cls.etl_dispatch[etl_name](config)
                process = multiprocessing.Process(target=etl.run_etl)
                process.start()
                thread_pool.append(process)
            else:
                logger.info("No Config found for: %s" % etl_name)

        ETL.wait_for_threads(thread_pool)

        logger.info("Waiting for Queues to sync up")
        neo_transactor.check_for_thread_errors()
        neo_transactor.wait_for_queues()

        etl_elapsed_time = time.time() - etl_group_start_time
        etl_time_message = ("Finished ETL group: %s, Elapsed time: %s"
                            % (etl_group,
                               time.strftime("%H:%M:%S", time.gmtime(etl_elapsed_time))))
        logger.info(etl_time_message)
        etl_time_tracker_list.append(etl_time_message)

    return etl_time_tracker_list
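# A minimal, self-contained sketch of the group-scheduling pattern used above.
# The real `etl_groups`/`etl_dispatch` class attributes and ETL classes are
# project-specific; the names below (DemoETL, etl_groups, etl_dispatch) are
# hypothetical and only illustrate how one group runs in parallel and is joined
# before the next group starts.
import multiprocessing
import time


class DemoETL:
    def __init__(self, name):
        self.name = name

    def run_etl(self):
        print("running", self.name)


etl_groups = [["GO", "DO"], ["BGI"]]                       # hypothetical group layout
etl_dispatch = {name: DemoETL for name in ["GO", "DO", "BGI"]}


def run_groups():
    for group in etl_groups:
        start = time.time()
        processes = []
        for etl_name in group:
            etl = etl_dispatch[etl_name](etl_name)
            process = multiprocessing.Process(target=etl.run_etl)
            process.start()
            processes.append(process)
        for process in processes:      # stands in for ETL.wait_for_threads()
            process.join()
        print("group %s finished in %.2fs" % (group, time.time() - start))


if __name__ == "__main__":
    run_groups()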
def export_row_data_to_index(self, data, rownumber):
    parameters = self.config.copy()

    # todo: run all configured content plugins, not only these
    parameters['plugins'] = [
        'enhance_path',
        'enhance_entity_linking',
        'enhance_multilingual',
    ]

    etl = ETL()

    try:
        etl.process(parameters=parameters, data=data)

    # if the user interrupted via keyboard, respect this and abort
    except KeyboardInterrupt:
        raise KeyboardInterrupt

    except BaseException as e:
        sys.stderr.write("Exception adding CSV row {} : {}".format(rownumber, e))
        if 'raise_pluginexception' in self.config:
            if self.config['raise_pluginexception']:
                raise e
def __init__(self, verbose=False):
    ETL.__init__(self, verbose=verbose)
    self.verbose = verbose
    self.read_configfiles()

    # Watched events
    #
    # We need IN_MOVE_SELF to track moved folder paths
    # pyinotify-internally. If omitted, the OS instructions
    #   mv /docs/src /docs/dest; touch /docs/dest/doc.pdf
    # will produce an IN_MOVED_TO pathname=/docs/dest/ followed by
    # IN_CLOSE_WRITE pathname=/docs/src/doc.pdf
    # where we would like an IN_CLOSE_WRITE pathname=/docs/dest/doc.pdf
    self.mask = (pyinotify.IN_DELETE | pyinotify.IN_CLOSE_WRITE |
                 pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM |
                 pyinotify.IN_MOVE_SELF)

    self.watchmanager = pyinotify.WatchManager()  # Watch Manager
    self.handler = EventHandler()
    self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
def set_configdefaults(self):
    #
    # Standard config
    #
    # Do not edit the config here! Overwrite options in /etc/etl/ or
    # /etc/opensemanticsearch/connector-files
    #
    ETL.set_configdefaults(self)

    self.config['force'] = False

    # filename to URI mapping
    self.config['mappings'] = {"/": "file:///"}

    self.config['facet_path_strip_prefix'] = [
        "file://",
        "http://www.",
        "https://www.",
        "http://",
        "https://",
    ]

    self.config['plugins'] = [
        'enhance_mapping_id',
        'filter_blacklist',
        'filter_file_not_modified',
        'enhance_extract_text_tika_server',
        'enhance_detect_language_tika_server',
        'enhance_contenttype_group',
        'enhance_pst',
        'enhance_csv',
        'enhance_file_mtime',
        'enhance_path',
        'enhance_extract_hashtags',
        'enhance_warc',
        'enhance_zip',
        'clean_title',
        'enhance_multilingual',
    ]

    self.config['blacklist'] = [
        "/etc/opensemanticsearch/blacklist/blacklist-url"
    ]
    self.config['blacklist_prefix'] = [
        "/etc/opensemanticsearch/blacklist/blacklist-url-prefix"
    ]
    self.config['blacklist_suffix'] = [
        "/etc/opensemanticsearch/blacklist/blacklist-url-suffix"
    ]
    self.config['blacklist_regex'] = [
        "/etc/opensemanticsearch/blacklist/blacklist-url-regex"
    ]
    self.config['whitelist'] = [
        "/etc/opensemanticsearch/blacklist/whitelist-url"
    ]
    self.config['whitelist_prefix'] = [
        "/etc/opensemanticsearch/blacklist/whitelist-url-prefix"
    ]
    self.config['whitelist_suffix'] = [
        "/etc/opensemanticsearch/blacklist/whitelist-url-suffix"
    ]
    self.config['whitelist_regex'] = [
        "/etc/opensemanticsearch/blacklist/whitelist-url-regex"
    ]
def export_row_data_to_index(self, data, rownumber):
    parameters = self.config.copy()

    # todo: run all configured content plugins, not only this one
    parameters['plugins'] = [
        'enhance_path',
    ]

    etl = ETL()

    try:
        etl.process(parameters=parameters, data=data)

    # if the user interrupted via keyboard, respect this and abort
    except KeyboardInterrupt:
        raise KeyboardInterrupt

    except BaseException as e:
        # BaseException has no .message attribute in Python 3; use the exception itself
        sys.stderr.write("Exception adding CSV row {} : {}".format(rownumber, e))
        if 'raise_pluginexception' in self.config:
            if self.config['raise_pluginexception']:
                raise e
def getTargetDdl(self, tableformat, external=True, _ddl=""): ddl = _ddl databasename = self.metaresultlist[0].target_database tablename = self.metaresultlist[0].target_table tablelocation = self.metaresultlist[0].target_file_path if external is True and ETL.isNullOrEmpty(databasename) is None: ddl = f"CREATE EXTERNAL TABLE {tablename}(\n" elif external is True and ETL.isNullOrEmpty(databasename) is not None: ddl = f"CREATE EXTERNAL TABLE {databasename}.{tablename}(\n" elif external is False and ETL.isNullOrEmpty(databasename) is None: ddl = f"CREATE TABLE {tablename}(\n" elif external is False and ETL.isNullOrEmpty(databasename) is not None: ddl = f"CREATE TABLE {databasename}.{tablename}(\n" else: ddl = f"CREATE EXTERNAL TABLE {databasename}.{tablename}(\n" for metares in self.metaresultlist: if int(str(metares.src_table_order).strip()).__eq__(0): ddl = f"{ddl}`{metares.target_col}` {metares.target_col_datatype},\n" ddl = f"{ddl}--End" ddl = ddl.strip(',\n--End') ddl = f"{ddl}\n)" \ f"STORED AS {tableformat}" if ETL.isNullOrEmpty(tablelocation) is not None and external is True: ddl = f"{ddl}\nLOCATION {tablelocation}" return ddl
def count_word(self):
    """ Count stop words and output statistics. """
    word_list = []
    result = ETL().extract_word()
    count, total = 0, len(result)

    for row in result:
        count += 1
        test_id = row[0]
        print(f"{test_id}, {count}/{total}")

        try:
            dump = ETL().extract_cdb(test_id)
            processed = Process(dump).internal_process()
        except (IndexError, UnicodeDecodeError):
            continue

        if "\n\n" in dump:
            exceptions = dump[dump.index("\n\n") + len("\n\n"):]
            try:
                header = "exception throw location:\n"
                stack = exceptions[exceptions.index(header) + len(header):]
            except ValueError:
                continue

            # extract the root cause from the exceptions
            if dump.count(header) > 1:
                stack = stack[:stack.index("\n\n")]
            roots = re.findall(r"^\d+:[ ](.+)[ ]at[ ].+", stack, re.M)
            words = self.obtain_word(roots, processed)
            word_list += words

    Log().chart_print(Counter(word_list).most_common(10))
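# Self-contained check of the root-cause extraction regex used above, run on a
# synthetic stack fragment (the dump format shown here is assumed, not real data).
import re

sample_stack = (
    "0: frame_helper at helper.cc\n"
    "1: parse_config failed at config.cc\n"
    "2: load_settings aborted at settings.cc\n"
)
roots = re.findall(r"^\d+:[ ](.+)[ ]at[ ].+", sample_stack, re.M)
print(roots)  # ['frame_helper', 'parse_config failed', 'load_settings aborted']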
class Task:
    def __init__(self, name, question, db_table, row_handler, answer_cql):
        self._name = name
        self._question = question
        self._db_table = db_table
        self._answer_cql = answer_cql
        self._etl = ETL(
            f"{name} ETL",
            [helper.join_path(TEMP_DIR, 'staging.csv')],
            {
                "target": 'Cassandra',
                "table": db_table["table_name"],
                "is_file": False
            },
            [i[0] for i in db_table["cols"]],
            row_handler,
        )
        logger.info(f"{self._name} - Question: {self._question}")

    def _create_table(self):
        table_name, cols, key = self._db_table.values()
        cql_col_types = ', '.join([f"{col[0]} {col[1]}" for col in cols])
        db.execute(
            f"CREATE TABLE IF NOT EXISTS {table_name} ({cql_col_types}, PRIMARY KEY({key}))"
        )
        logger.info(f"{self._name} - Create '{table_name}' table")

    def _get_result(self):
        result = db.fetch(self._answer_cql)
        result_col_name = helper.get_str_between(
            self._answer_cql, 'SELECT', 'FROM')[0].strip().split(', ')
        try:
            with open(helper.join_path(RESULT_DIR, f'{self._name}.csv'), 'a') as target_file:
                writer = csv.writer(target_file, dialect='Dialect')
                writer.writerow(result_col_name)
                for row in result:
                    writer.writerow(row)
        except IOError as e:
            logger.error(e)
        except:
            logger.error(f"Unexpected error: {sys.exc_info()[0]}")
        logger.info(f"{self._name} - Complete generating '{self._name}.csv' file")

    def run(self):
        self._create_table()
        self._etl.run()
        logger.info(f"{self._name} - Complete ETL process")
        self._get_result()
def transform(self):
    # Get unique source table names for the transformation
    srctables = set()
    for metares in self.model.metaresultlist:
        srctables.add(metares.src_table)

    # For each source table create a SourceTable object and assign transform columns
    for srctable in srctables:
        tablemetaresult = self.model.filterMetaResultBySourceTable(srctbl=srctable)
        tblinfo: MetaResult = tablemetaresult[0]

        fklist = []
        for item in self.model.datamodel.keys():
            fk = self.model.datamodel[item]['fk']
            if fk is not None and fk != {}:
                if srctable in fk.keys():
                    fklist.extend(fk[srctable]['fk_pk'])

        sourcetable: SourceTable = SourceTable(
            sourcesystem=tblinfo.src_system,
            tablename=tblinfo.src_table,
            pk=self.model.datamodel[tblinfo.src_table]['pk'],
            fk=fklist,
            database=tblinfo.src_database,
            filepath=tblinfo.src_file_path,
            filetype=tblinfo.src_filetype,
            modeltableorder=tblinfo.src_table_order)
        self.sourcetables.append(sourcetable)

        for tbl in tablemetaresult:
            sourcetable.addColumn(
                name=tbl.src_col,
                type=tbl.src_col_datatype,
                pk=tbl.src_key_constraints == 'pk',
                udf=tbl.udf,
                udfargs=tbl.udfarguments,
                casttype=tbl.target_col_datatype,
                aliasname=tbl.target_col,
                filterclause=tbl.src_col_filter,
                fk={})

        # Read the file as a dataframe
        sourcetable.readFileFromSource(spark=self.spark)

    ETL.registerAllUDF(sc=self.spark)

    for sourcetable in self.sourcetables:
        sourcetable.applyTransform()

    self.applyJoin()
    self.applyFilters()
    self.applyGroupAggregation()
    self.targetdf.show()
def __init__(self, verbose=False, quiet=True):
    ETL.__init__(self, verbose=verbose)
    self.quiet = quiet
    self.set_configdefaults()
    self.read_configfiles()
def applyColTransform(self, query, src_table, src_col, target_col,
                      target_col_datatype, udf, udfarguments):
    if ETL.isNullOrEmpty(udf) is not None and len(udfarguments) != 0:
        query = f"{query} CAST({udf}({src_table}.`{src_col}`, {','.join(udfarguments)}) AS {target_col_datatype}) AS {target_col},"
    elif ETL.isNullOrEmpty(udf) is not None and len(udfarguments) == 0:
        query = f"{query} CAST({udf}({src_table}.`{src_col}`) AS {target_col_datatype}) AS {target_col},"
    else:
        query = f"{query} CAST({src_table}.`{src_col}` AS {target_col_datatype}) AS {target_col},"
    return query
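# Hedged sketch of the three column-transform cases above, with hard-coded
# arguments instead of metadata objects; sketch_col_transform and the sample
# UDF name are hypothetical, but the output strings mirror the f-strings in
# applyColTransform.
def sketch_col_transform(query, src_table, src_col, target_col, target_type, udf=None, udfargs=()):
    if udf and udfargs:
        return f"{query} CAST({udf}({src_table}.`{src_col}`, {','.join(udfargs)}) AS {target_type}) AS {target_col},"
    if udf:
        return f"{query} CAST({udf}({src_table}.`{src_col}`) AS {target_type}) AS {target_col},"
    return f"{query} CAST({src_table}.`{src_col}` AS {target_type}) AS {target_col},"


print(sketch_col_transform("SELECT", "purchase", "amount", "amount_usd", "double"))
# SELECT CAST(purchase.`amount` AS double) AS amount_usd,
print(sketch_col_transform("SELECT", "purchase", "ts", "purchase_date", "date",
                           udf="to_date_udf", udfargs=("'yyyy-MM-dd'",)))
# SELECT CAST(to_date_udf(purchase.`ts`, 'yyyy-MM-dd') AS date) AS purchase_date,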
def _load_and_process_data(self):
    thread_pool = []

    for sub_type in self.data_type_config.get_sub_type_objects():
        process = multiprocessing.Process(target=self._process_sub_type, args=(sub_type,))
        process.start()
        thread_pool.append(process)

    ETL.wait_for_threads(thread_pool)
def set_configdefaults(self):
    #
    # Standard config
    #
    # Do not edit the config here! Overwrite options in /etc/etl/ or
    # /etc/opensemanticsearch/connector-files
    #
    ETL.set_configdefaults(self)

    self.config['force'] = False
def setUp(self):
    self.etl = ETL()
    data = {
        'customer_id': [1, 1],
        'order_id': [1, 2],
        'order_item_id': [5, 6],
        'num_items': [2, 3],
        'revenue': [90, 50],
        'created_at_date': [datetime(2017, 10, 5), datetime(2017, 10, 12)]
    }
    self.test_df = pd.DataFrame.from_dict(data)
def _load_and_process_data(self):
    thread_pool = []

    ensg_to_gene_primary_id_map = self._get_primary_gene_ids_to_ensembl_ids()

    for sub_type in self.data_type_config.get_sub_type_objects():
        process = multiprocessing.Process(
            target=self._process_sub_type,
            args=(sub_type, ensg_to_gene_primary_id_map))
        process.start()
        thread_pool.append(process)

    ETL.wait_for_threads(thread_pool)
def ETI_TEST():
    print("Starting ETL Test Job !!!!")
    var = Variable()
    var.INPUT_DATA = "/input/employee.csv"
    csvParser = csv_parser(var.INPUT_DATA)
    if csvParser.file_exist(var.INPUT_DATA):
        # Takes the table name from the filename
        TABLE_NAME = csvParser.get_table_name(var.INPUT_DATA)
        if var.CREATE_TABLE and not var.RELATION:
            ob = ETL(TABLE_NAME, var.INPUT_DATA)
            ob.etl_process(csvParser.check_header(), 20)
def main(): print("Staring ETL Job !!!!") var = Variable() csvParser = csv_parser(var.INPUT_DATA) if csvParser.file_exist(var.INPUT_DATA): TABLE_NAME = csvParser.get_table_name( var.INPUT_DATA) # Takes Table name from Filename if var.CREATE_TABLE and not var.RELATION: ob = ETL(TABLE_NAME, var.INPUT_DATA) ob.etl_process(csvParser.check_header())
def enrich(plugins, uri, wait=0):
    if wait:
        time.sleep(wait)

    etl = ETL()
    etl.read_configfile('/etc/opensemanticsearch/etl')
    etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf')

    etl.config['plugins'] = plugins.split(',')

    filename = uri

    # if present, strip the protocol prefix file://
    if filename.startswith("file://"):
        filename = filename.replace("file://", '', 1)

    parameters = etl.config.copy()
    parameters['id'] = uri
    parameters['filename'] = filename

    parameters, data = etl.process(parameters=parameters, data={})

    return data
def main(): """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`. Args: -c --config <config_file> the Stetl config file. -s --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]). -a --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc. """ args = parse_args() # Do the ETL etl = ETL(vars(args), args.config_args) etl.run()
def main():
    parser = argparse.ArgumentParser(description='CLI for the pldb application.')
    parser.add_argument(
        '--update',
        action='store_true',
        help='Download season data and update the JSON file and the database.')
    parser.add_argument(
        '--table',
        action='store_true',
        help='Display the current standings table (calculated from data in the database).')
    parser.add_argument(
        '--club',
        type=str,
        default=None,
        help='Display info for all the matches for the given club in the season.')
    args = parser.parse_args()

    if args.update:
        print("Updating season data...")
        etl = ETL()
        etl.run()
        print("done.")
    elif args.table:
        query = Query()
        table_data = query.table()
        print("#\tClub\tPlayed\tWon\tDrawn\tLost\tGD\tPoints")
        for rank in range(len(table_data)):
            row = table_data[rank]
            print(
                f"{rank + 1}\t{row['club']}\t{row['matches_played']}\t"
                f"{row['wins']}\t{row['draws']}\t{row['losses']}\t{row['goal_diff']}\t"
                f"{row['points']}")
    elif args.club:
        query = Query()
        for match in query.club(args.club):
            kick_time = match['kickoff'] / 1000
            kick_time = datetime.datetime.fromtimestamp(kick_time).strftime("%a %d %b %H:%M")
            if match['status'] == 'C':
                score = f"{match['away_goals']} {match['home_goals']}"
            else:
                score = ' @ '
            print(
                f"{kick_time} {match['away_club']['abbr']} "
                f"{score} {match['home_club']['abbr']} {match['ground']['name']}")
def __init__(self, verbose=False):
    ETL.__init__(self, verbose=verbose)
    self.verbose = verbose
    self.read_configfiles()

    # watched events
    self.mask = (pyinotify.IN_DELETE | pyinotify.IN_CLOSE_WRITE |
                 pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM)

    self.watchmanager = pyinotify.WatchManager()  # Watch Manager
    self.handler = EventHandler()
    self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
def __init__(self, verbose=False):
    ETL.__init__(self, verbose=verbose)
    self.verbose = verbose
    self.read_configfiles()

    # watched events
    self.mask = (pyinotify.IN_DELETE | pyinotify.IN_CLOSE_WRITE |
                 pyinotify.IN_MOVED_TO | pyinotify.IN_MOVED_FROM)

    self.watchmanager = pyinotify.WatchManager()  # Watch Manager
    self.handler = EventHandler()
    self.notifier = pyinotify.Notifier(self.watchmanager, self.handler)
def __init__(self, plugins=[], verbose=False):
    ETL.__init__(self, plugins=plugins, verbose=verbose)

    self.read_configfile('/etc/etl/config')
    self.read_configfile('/etc/opensemanticsearch/etl')
    self.read_configfile('/etc/opensemanticsearch/enhancer-rdf')

    self.fields = self.getfieldnames_from_plugins()

    # init exporter (todo: exporter as extended PySolr)
    self.export_solr = export_solr.export_solr()

    # init PySolr
    solr_uri = self.config['solr']
    if not solr_uri.endswith('/'):
        solr_uri += '/'
    solr_uri += self.config['index']
    self.solr = pysolr.Solr(solr_uri)

    self.threads_max = None

    # if not set explicitly, autodetect the count of CPUs for the number of threads
    if not self.threads_max:
        import multiprocessing
        self.threads_max = multiprocessing.cpu_count()
        if self.verbose:
            print("Setting threads to count of CPUs: " + str(self.threads_max))

    self.rows_per_step = 100
    if self.rows_per_step < self.threads_max * 2:
        self.rows_per_step = self.threads_max * 2

    self.work_in_progress = []
    self.delete_from_work_in_progress_lock = threading.Lock()
    self.delete_from_work_in_progress_after_commit = []
    self.work_in_progress_lock = threading.Lock()

    self.e_job_done = threading.Event()
def run_etl():
    # initialize auth_controller
    auth_controller = AuthController('http://restservice:8001/ETL/api/v1.0/', 'auths')
    # initialize data_controller
    data_controller = DataController('http://restservice:8001/ETL/api/v1.0/')
    # initialize sftp_controller
    sftp_controller = SftpController('sftpserver.pyc.test', 'etluser',
                                     'sftpserver_keys/sftpserver_rsa', 'IntuitPYC')
    # initialize vertica_controller
    vertica_controller = VerticaController('config/vsql.config')

    etl = ETL(auth_controller, data_controller, sftp_controller, vertica_controller)
    etl.run()
def getWhereClauses(self):
    def matchEqualityOperator(expression):
        # translate prefix operators like gte(...) into SQL comparisons;
        # longer tokens are checked before their prefixes (gte before gt, notin before in)
        expr = str(expression).strip()
        if 'notin(' in expr:
            expr = expr.replace('notin(', 'NOT IN').replace(')', '', 1)
        elif 'gte(' in expr:
            expr = expr.replace('gte(', '>=').replace(')', '', 1)
        elif 'lte(' in expr:
            expr = expr.replace('lte(', '<=').replace(')', '', 1)
        elif 'gt(' in expr:
            expr = expr.replace('gt(', '>').replace(')', '', 1)
        elif 'lt(' in expr:
            expr = expr.replace('lt(', '<').replace(')', '', 1)
        elif 'ne(' in expr:
            expr = expr.replace('ne(', '<>').replace(')', '', 1)
        elif 'eq(' in expr:
            expr = expr.replace('eq(', '=').replace(')', '', 1)
        elif 'in(' in expr:
            expr = expr.replace('in(', 'IN').replace(')', '', 1)
        else:
            expr = str(expression).strip()
        return expr

    query, joindict = self.joinSQL(self.datamodel, 'purchase', 'product', 'store')

    wherequery = ""
    for metares in self.metaresultlist:
        if str(metares.src_col_filter).strip() != "":
            wherequery = f"{metares.src_table}.`{metares.src_col}` {matchEqualityOperator(metares.src_col_filter)}"

    if ETL.isNullOrEmpty(wherequery) is not None:
        query = f"{query} WHERE {wherequery}"

    return query
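# Stand-alone sketch of the filter-expression rewriting above: prefix operators
# like "gte(100)" become SQL comparisons. The operator table is ordered longest
# token first so that e.g. "gte" is not mistaken for "gt"; all example values
# are made up.
def sketch_match_equality_operator(expression):
    operators = [("notin(", "NOT IN"), ("gte(", ">="), ("lte(", "<="),
                 ("gt(", ">"), ("lt(", "<"), ("ne(", "<>"),
                 ("eq(", "="), ("in(", "IN")]
    expr = str(expression).strip()
    for token, sql_op in operators:
        if token in expr:
            return expr.replace(token, sql_op).replace(")", "", 1)
    return expr


print(sketch_match_equality_operator("gte(100)"))        # >=100
print(sketch_match_equality_operator("in('NY','CA')"))   # IN'NY','CA'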
def __init__(self, src_system, src_database, src_table, src_filetype, src_file_path,
             src_col, src_col_datatype, src_key_constraints, src_col_filter,
             src_col_aggregator, src_col_aggregator_filter, src_table_order,
             target_database, target_table, target_filetype, target_file_path,
             target_col, target_col_datatype, udf="", udfarguments=""):
    self.metacolumnslist = {}
    self.src_system = src_system
    self.src_database = src_database
    self.src_table = src_table
    self.src_filetype = src_filetype
    self.src_file_path = src_file_path
    self.src_col = src_col
    self.src_col_datatype = str(src_col_datatype).lower()
    self.src_key_constraints = str(src_key_constraints).lower()
    self.src_col_filter = src_col_filter
    self.src_col_aggregator = src_col_aggregator
    self.src_col_aggregator_filter = src_col_aggregator_filter
    self.src_table_order = int(str(src_table_order).strip())
    self.target_database = target_database
    self.target_table = target_table
    self.target_filetype = target_filetype
    self.target_file_path = target_file_path
    self.target_col = target_col
    self.target_col_datatype = target_col_datatype
    self.target_col_aggregator = ""
    self.target_col_aggregator_filter = ""
    self.udf = udf

    if ETL.isNullOrEmpty(udfarguments) is not None:
        self.udfarguments = udfarguments.split('|')
    else:
        self.udfarguments = []

    self.metacolumnslist.update({'src_filetype': self.src_filetype})
    self.metacolumnslist.update({'src_system': self.src_system})
    self.metacolumnslist.update({'src_database': self.src_database})
    self.metacolumnslist.update({'src_table': self.src_table})
    self.metacolumnslist.update({'src_file_path': self.src_file_path})
    self.metacolumnslist.update({'src_col': self.src_col})
    self.metacolumnslist.update({'src_col_datatype': self.src_col_datatype})
    self.metacolumnslist.update({'src_key_constraints': self.src_key_constraints})
    self.metacolumnslist.update({'src_col_filter': self.src_col_filter})
    self.metacolumnslist.update({'src_col_aggregator': self.src_col_aggregator})
    self.metacolumnslist.update({'src_col_aggregator_filter': self.src_col_aggregator_filter})
    self.metacolumnslist.update({'src_table_order': self.src_table_order})
    self.metacolumnslist.update({'target_database': self.target_database})
    self.metacolumnslist.update({'target_table': self.target_table})
    self.metacolumnslist.update({'target_filetype': self.target_filetype})
    self.metacolumnslist.update({'target_file_path': self.target_file_path})
    self.metacolumnslist.update({'target_col': self.target_col})
    self.metacolumnslist.update({'target_col_datatype': self.target_col_datatype})
    self.metacolumnslist.update({'target_col_aggregator': self.target_col_aggregator})
    self.metacolumnslist.update({'target_col_aggregator_filter': self.target_col_aggregator_filter})
    self.metacolumnslist.update({'udf': self.udf})
    self.metacolumnslist.update({'udfarguments': self.udfarguments})
def __init__(self, verbose=False, quiet=True):
    ETL.__init__(self, verbose=verbose)
    self.quiet = quiet
    self.set_configdefaults()
    self.read_configfiles()

    # read which DB or search server software hosts our index
    export = self.config['export']

    # import and instantiate the configured exporter/connector
    module = importlib.import_module(export)
    objectreference = getattr(module, export)
    self.connector = objectreference()
def enrich(plugins, uri, wait=0): if wait: time.sleep(wait) etl = ETL() etl.read_configfile('/etc/opensemanticsearch/etl') etl.read_configfile('/etc/opensemanticsearch/enhancer-rdf') etl.config['plugins'] = plugins.split(',') filename = uri # if exist delete protocoll prefix file:// if filename.startswith("file://"): filename = filename.replace("file://", '', 1) parameters = etl.config.copy() parameters['id'] = uri parameters['filename'] = filename parameters, data = etl.process (parameters=parameters, data={}) return data
def __init__(self, name, question, db_table, row_handler, answer_cql):
    self._name = name
    self._question = question
    self._db_table = db_table
    self._answer_cql = answer_cql
    self._etl = ETL(
        f"{name} ETL",
        [helper.join_path(TEMP_DIR, 'staging.csv')],
        {
            "target": 'Cassandra',
            "table": db_table["table_name"],
            "is_file": False
        },
        [i[0] for i in db_table["cols"]],
        row_handler,
    )
    logger.info(f"{self._name} - Question: {self._question}")
def _load_and_process_data(self):
    thread_pool = []

    query_tracking_list = multiprocessing.Manager().list()
    for sub_type in self.data_type_config.get_sub_type_objects():
        process = multiprocessing.Process(target=self._process_sub_type,
                                          args=(sub_type, query_tracking_list))
        process.start()
        thread_pool.append(process)

    ETL.wait_for_threads(thread_pool)

    queries = []
    for item in query_tracking_list:
        queries.append(item)

    Neo4jTransactor.execute_query_batch(queries)
def update_mapping_new_product_id(self):
    logging.info("Create new product id after pruning dataset")
    _r_map = self.dataset[['PRODUCT_ID', 'NEW_PRODUCT_ID']]
    mm = [_r_map['PRODUCT_ID'].unique(), _r_map['NEW_PRODUCT_ID'].unique()]
    mapping = pd.DataFrame(data=np.array(mm).T, columns=["product_id", "new_product_id"])

    logging.info("Save new product id into db")
    ETL(DATA_PATH, CONFIG_PATH, SCHEMA_PATH).insert_new_product_id_table(mapping)
def __init__(self, verbose=False, quiet=True):
    ETL.__init__(self, verbose=verbose)
    self.quiet = quiet
    self.set_configdefaults()
    self.read_configfiles()

    # if not set explicitly, autodetect the count of CPUs for the number of threads
    if not self.threads_max:
        import multiprocessing
        self.threads_max = multiprocessing.cpu_count()
        if self.verbose:
            print("Setting threads to count of CPUs: " + str(self.threads_max))

    self.e_job_done = threading.Event()
def _load_and_process_data(self):
    sub_types = []

    for sub_type in self.data_type_config.get_sub_type_objects():
        sub_types.append(sub_type.get_data_provider())

    thread_pool = []
    query_tracking_list = multiprocessing.Manager().list()
    for sub_type in self.data_type_config.get_sub_type_objects():
        process = multiprocessing.Process(target=self._process_sub_type,
                                          args=(sub_type, sub_types, query_tracking_list))
        process.start()
        thread_pool.append(process)

    ETL.wait_for_threads(thread_pool)

    queries = []
    for item in query_tracking_list:
        queries.append(item)

    algo_queries = []
    for item in queries:
        if "algorithm" in item[1]:
            algo_queries.append(item)

    main_list = self.get_randomized_list(sub_types)

    for file_set in main_list:
        for pair in file_set:
            for item in queries:
                if pair[0] + "_" + pair[1] in item[1]:
                    self.logger.debug("Pair: %s Item: %s", pair, item[1])
                    Neo4jTransactor.execute_query_batch([item])
        Neo4jTransactor().wait_for_queues()

    Neo4jTransactor.execute_query_batch(algo_queries)
    self.error_messages()
def set_configdefaults(self):
    #
    # Standard config
    #
    # Do not edit the config here! Overwrite options in /etc/etl/ or
    # /etc/opensemanticsearch/connector-files
    #
    ETL.set_configdefaults(self)

    self.config['force'] = False

    # filename to URI mapping
    self.config['mappings'] = {"/": "file:///"}

    self.config['facet_path_strip_prefix'] = [
        "file://",
        "http://www.",
        "https://www.",
        "http://",
        "https://",
    ]

    self.config['plugins'] = [
        'enhance_mapping_id',
        'filter_blacklist',
        'filter_file_not_modified',
        'enhance_extract_text_tika_server',
        'enhance_detect_language_tika_server',
        'enhance_contenttype_group',
        'enhance_pst',
        'enhance_csv',
        'enhance_file_mtime',
        'enhance_path',
        'enhance_extract_hashtags',
        'enhance_warc',
        'enhance_zip',
        'clean_title',
        'enhance_multilingual',
    ]

    self.config['blacklist'] = ["/etc/opensemanticsearch/blacklist/blacklist-url"]
    self.config['blacklist_prefix'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-prefix"]
    self.config['blacklist_suffix'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-suffix"]
    self.config['blacklist_regex'] = ["/etc/opensemanticsearch/blacklist/blacklist-url-regex"]
    self.config['whitelist'] = ["/etc/opensemanticsearch/blacklist/whitelist-url"]
    self.config['whitelist_prefix'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-prefix"]
    self.config['whitelist_suffix'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-suffix"]
    self.config['whitelist_regex'] = ["/etc/opensemanticsearch/blacklist/whitelist-url-regex"]
def main(): """The `main` function, to be called from commandline, like `python src/main.py -c etl.cfg`. Args: -c --config <config_file> the Stetl config file. -s --section <section_name> the section in the Stetl config (ini) file to execute (default is [etl]). -a --args <arglist> substitutable args for symbolic, {arg}, values in Stetl config file, in format "arg1=foo arg2=bar" etc. -d --doc <class> Get component documentation like its configuration parameters, e.g. stetl --doc stetl.inputs.fileinput.FileInput -h --help get help info """ args = parse_args() if args.config_file: # Do the ETL etl = ETL(vars(args), args.config_args) etl.run() elif args.doc_args: print_doc(args.doc_args) else: print('Unknown option, try stetl -h for help')
def etl_graph(self, parameters):
    if self.verbose:
        print("Graph has {} triples.".format(len(self.graph)))

    count_triple = 0
    count_subjects = 0

    part_parameters = {}
    part_parameters['plugins'] = []
    part_parameters['export'] = parameters['export']

    property2facet = {}
    if 'property2facet' in parameters:
        property2facet = parameters['property2facet']

    etl_processor = ETL()
    etl_processor.verbose = self.verbose

    class_properties = []
    class_properties.append(rdflib.term.URIRef(u'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'))
    class_properties.append(rdflib.term.URIRef(u'http://www.wikidata.org/prop/direct/P31'))

    # Since there can be multiple triples/values for the same property in/from different graphs,
    # or the graph describes an existing other file/document, do not overwrite the document
    # but add the value to the existing document & values of the facet/field/property
    part_parameters['add'] = True

    # use a SPARQL query with DISTINCT to get subjects only once
    res = self.graph.query(
        """SELECT DISTINCT ?subject
           WHERE {
               ?subject ?predicate ?object .
           }""")

    for row in res:
        count_subjects += 1

        if self.verbose:
            print("Importing entity / subject {}".format(count_subjects))

        # get the subject of the concept from the first column
        subj = row[0]

        if self.verbose:
            print("Processing RDF subject {}".format(subj))

        part_data = {}
        part_data['content_type_group_ss'] = 'Knowledge graph'

        # subject as URI/ID
        part_parameters['id'] = subj

        preferred_label = self.get_preferred_label(subject=subj)
        part_data['title_txt'] = preferred_label

        count_subject_triple = 0

        # get all triples for this subject
        for pred, obj in self.graph.predicate_objects(subject=subj):
            count_triple += 1
            count_subject_triple += 1

            if self.verbose:
                print("Importing subject's triple {}".format(count_subject_triple))
                print("Predicate / property: {}".format(pred))
                print("Object / value: {}".format(obj))

            try:
                # If it is a class, add the preferred label of this entity to the facet of its
                # class (RDF rdf:type or Wikidata "instance of" (Property:P31)), so its name
                # (label) will be available in the entities view and as a filter for faceted search
                if pred in class_properties:
                    class_facet = str(obj)
                    # map the class to a facet, if a mapping for the class exists
                    if class_facet in property2facet:
                        class_facet = property2facet[class_facet]
                        if class_facet in parameters['facets']:
                            part_data['content_type_ss'] = 'Knowledge graph class {}'.format(
                                parameters['facets'][class_facet]['label'])
                    etl.append(data=part_data, facet=class_facet, values=preferred_label)

                #
                # Predicate/property to facet/field
                #

                # set Solr datatype strings so facets not yet available in the Solr schema
                # can be inserted automatically (dynamic fields) with the right datatype
                facet = pred + '_ss'
                facet_uri = facet + '_uri_ss'
                facet_preferred_label_and_uri = facet + '_preflabel_and_uri_ss'

                if self.verbose:
                    print("Facet: {}".format(facet))

                #
                # get values or labels of this object
                #
                values = self.get_values(obj=obj)
                if self.verbose:
                    print("Values: {}".format(values))

                # insert or append the value (object of the triple) to data
                etl.append(data=part_data, facet=facet, values=values)

                # if the object is a reference/URI, append the URI
                if type(obj) == rdflib.URIRef:
                    uri = obj

                    etl.append(data=part_data, facet=facet_uri, values=uri)

                    # append a mixed field with the preferred label and URI of the object for
                    # disambiguation of different entities/IDs/URIs with the same names/labels
                    # in faceted search
                    preferredlabel_and_uri = "{} <{}>".format(self.get_preferred_label(subject=obj), obj)

                else:
                    preferredlabel_and_uri = self.get_preferred_label(subject=obj)

                etl.append(data=part_data, facet=facet_preferred_label_and_uri,
                           values=preferredlabel_and_uri)

            except KeyboardInterrupt:
                raise KeyboardInterrupt

            except BaseException as e:
                sys.stderr.write("Exception while triple {} of subject {}: {}\n".format(
                    count_subject_triple, subj, e))

        # index triple
        etl_processor.process(part_parameters, part_data)
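# Minimal, self-contained illustration of the two rdflib access patterns used by
# etl_graph above (a DISTINCT-subject SPARQL query, then predicate_objects() per
# subject), run on a tiny in-memory graph; it assumes only that rdflib is
# installed and the example triples are invented.
import rdflib

g = rdflib.Graph()
ex = rdflib.Namespace("http://example.org/")
g.add((ex.alice, rdflib.RDF.type, ex.Person))
g.add((ex.alice, ex.knows, ex.bob))

res = g.query("SELECT DISTINCT ?subject WHERE { ?subject ?predicate ?object . }")
for row in res:
    subj = row[0]
    for pred, obj in g.predicate_objects(subject=subj):
        print(subj, pred, obj)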
def run(self): print("running") etl = ETL() etl.run()
def process(self, parameters={}, data={}):
    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'id' in data:
        docid = data['id']
    else:
        docid = parameters['id']

    # default classifier
    classifier = 'en_core_web_sm'
    if 'spacy_ner_classifier_default' in parameters:
        classifier = parameters['spacy_ner_classifier_default']

    # set a language specific classifier, if configured and the document language was detected
    if 'spacy_ner_classifiers' in parameters and 'language_s' in data:
        # is there a language specific classifier for the detected language?
        if data['language_s'] in parameters['spacy_ner_classifiers']:
            classifier = parameters['spacy_ner_classifiers'][data['language_s']]

    analyse_fields = ['content_txt', 'ocr_t', 'ocr_descew_t']

    text = ''
    for field in analyse_fields:
        if field in data:
            text = "{}{}\n".format(text, data[field])

    # extract sentences from the text
    url = "http://localhost:8080/sents"
    headers = {'content-type': 'application/json'}
    d = {'text': text, 'model': classifier}

    response = requests.post(url, data=json.dumps(d), headers=headers)
    sentences = response.json()

    etl = ETL()

    sentencenumber = 0

    for sentence in sentences:
        sentencenumber += 1

        partdocid = docid + '#sentence' + str(sentencenumber)

        partparameters = parameters.copy()
        partparameters['plugins'] = [
            'enhance_path',
            'enhance_detect_language_tika_server',
            'enhance_entity_linking',
            'enhance_multilingual',
        ]

        if 'enhance_ner_spacy' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_spacy')
        if 'enhance_ner_stanford' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_stanford')

        sentencedata = {}
        sentencedata['id'] = partdocid
        sentencedata['container_s'] = docid
        if 'author_ss' in data:
            sentencedata['author_ss'] = data['author_ss']
        sentencedata['content_type_group_ss'] = "Sentence"
        sentencedata['content_type_ss'] = "Sentence"
        sentencedata['content_txt'] = sentence

        # index the sentence
        try:
            partparameters, sentencedata = etl.process(partparameters, sentencedata)
        except BaseException as e:
            sys.stderr.write("Exception adding sentence {} : {}".format(sentencenumber, e))

    data['sentences_i'] = sentencenumber

    return parameters, data
def segment_pdf_to_pages(self, parameters={}, data={}):
    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    if 'id' in data:
        docid = data['id']
    else:
        docid = parameters['id']

    filename = parameters['filename']

    # defaults, if pdfinfo does not detect them
    pages = 1
    title = 'No title'
    author = None

    # get the page count with the pdfinfo command line tool
    pdfinfo = subprocess.check_output(['pdfinfo', '-enc', 'UTF-8', filename])

    # decode
    pdfinfo = pdfinfo.decode(encoding='UTF-8')

    # get the count of pages from the pdfinfo result:
    # it is text with one line per parameter
    for line in pdfinfo.splitlines():
        line = line.strip()

        # we want only the line with the page count
        if line.startswith('Pages:'):
            pages = int(line.split()[1])

        if line.startswith('Title:'):
            title = line.replace("Title:", '', 1)
            title = title.strip()

        if line.startswith('Author:'):
            author = line.replace("Author:", '', 1)
            author = author.strip()

    etl = ETL()

    # export and index each page
    for pagenumber in range(1, pages + 1):
        if verbose:
            print("Extracting PDF page {} of {}".format(pagenumber, pages))

        # generate a temporary filename
        md5hash = hashlib.md5(filename.encode('utf-8')).hexdigest()
        temp_filename = (tempfile.gettempdir() + os.path.sep +
                         "opensemanticetl_pdftotext_" + md5hash + "_" + str(pagenumber))

        # call pdftotext to write the text of the page into the tempfile
        try:
            result = subprocess.check_call(
                ['pdftotext', '-enc', 'UTF-8', '-f', str(pagenumber),
                 '-l', str(pagenumber), filename, temp_filename])
        except BaseException as e:
            sys.stderr.write("Exception extracting text from PDF page {}: {}\n".format(pagenumber, e))

        # read the text from the tempfile
        f = open(temp_filename, "r", encoding="utf-8")
        text = f.read()
        f.close()
        os.remove(temp_filename)

        partdocid = docid + '#page=' + str(pagenumber)

        partparameters = parameters.copy()
        partparameters['plugins'] = [
            'enhance_path',
            'enhance_detect_language_tika_server',
            'enhance_entity_linking',
            'enhance_multilingual',
        ]

        if 'enhance_ner_spacy' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_spacy')
        if 'enhance_ner_stanford' in parameters['plugins']:
            partparameters['plugins'].append('enhance_ner_stanford')

        pagedata = {}
        pagedata['id'] = partdocid
        pagedata['page_i'] = pagenumber
        pagedata['pages_i'] = pages
        pagedata['container_s'] = docid
        pagedata['title_txt'] = title
        if author:
            pagedata['author_ss'] = author
        pagedata['content_type_group_ss'] = "Page"
        pagedata['content_type_ss'] = "PDF page"
        pagedata['content_txt'] = text

        if verbose:
            print("Indexing extracted page {}".format(pagenumber))

        # index the page
        try:
            partparameters, pagedata = etl.process(partparameters, pagedata)
        except BaseException as e:
            sys.stderr.write("Exception adding PDF page {} : {}".format(pagenumber, e))

    data['pages_i'] = pages

    return parameters, data
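# The pdfinfo parsing step above, replayed on a hard-coded sample of pdfinfo
# output so it runs without poppler installed; the sample text is invented and
# only mirrors the "one line per parameter" format the loop expects.
sample_pdfinfo = """Title:          Quarterly report
Author:         Jane Doe
Pages:          12
Encrypted:      no"""

pages, title, author = 1, 'No title', None
for line in sample_pdfinfo.splitlines():
    line = line.strip()
    if line.startswith('Pages:'):
        pages = int(line.split()[1])
    if line.startswith('Title:'):
        title = line.replace("Title:", '', 1).strip()
    if line.startswith('Author:'):
        author = line.replace("Author:", '', 1).strip()

print(pages, title, author)  # 12 Quarterly report Jane Doe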