def import_class_proxy(module_name, class_name):
    """Return the class named *class_name* from the module *module_name*.

    Parameters
    ----------
    module_name : str
        Dotted import path of the module.
    class_name : str
        Name of the attribute (class) to fetch from the module.

    Raises
    ------
    Exception
        Re-raised after logging when the attribute lookup fails.
    """
    module_proxy = import_module_proxy(module_name)
    try:
        class_proxy = getattr(module_proxy, class_name)
    except Exception:
        # BUG FIX: module_name is a str and has no __name__ attribute,
        # so the original handler raised AttributeError while trying to
        # report the failure; format the string itself instead.
        error('Cannot import class "%s" from module "%s"' % (class_name, module_name))
        raise
    return class_proxy
def import_module_proxy(module_name):
    """Import and return the module named *module_name*.

    Logs a hint and re-raises when the import fails (typo in the name,
    missing package, broken environment, ...).
    """
    try:
        module_proxy = importlib.import_module(module_name)
    except Exception:
        # BUG FIX: the two implicitly-concatenated string literals were
        # missing a separating space, producing "...typosand configure...".
        error('Cannot import module "%s". Make sure there are no typos '
              'and configure your environment properly.' % module_name)
        raise
    return module_proxy
def _collect_tasks(self, tokenized_sentences):
    """Build the (operators, arguments) dispatch tables for this step.

    Always registers the 'filter_unknown_words' task; optionally adds
    'persist_sentences' / 'persist_unknown_words' when the matching
    instance flags are set.

    Parameters:
        tokenized_sentences: pandas DataFrame holding at least the
            operant column; for the optional tasks it must also carry
            the tweet-id column and the per-task column.

    Returns:
        (operators, arguments): two dicts keyed by task name; operators
        maps to a callable, arguments to the positional-argument list
        the callable will be applied with.
    """
    # bookkeeping
    datasets = {
        'filter_unknown_words': tokenized_sentences[self.operant_column_name]
    }
    # helping stuff
    # NOTE(review): the chained comparison `str != type(w) == int` keeps
    # tokens whose type is int (presumably embedding indices for known
    # words); is_unknown keeps everything else -- confirm intent.
    is_known = lambda tok_snt: [
        w for w in tok_snt if str != type(w) == int
    ]
    is_unknown = lambda tok_snt: [
        w for w in tok_snt if not str != type(w) == int
    ]
    # standard task: drop unknown tokens from every sentence
    operators = {
        'filter_unknown_words': lambda df: df.apply(is_known).values
    }
    arguments = {
        'filter_unknown_words': [datasets['filter_unknown_words']]
    }
    # append optional tasks (zip pairs each task name with its enable
    # flag, its persistence function, its token filter and its column)
    for tsk_nam, exec_tsk, tsk_func, fltr_fnc, col_nam in zip(
            ['persist_sentences', 'persist_unknown_words'],
            [self.persist_sentences, self.persist_unknown_words],
            [persist_sentences, persist_unknown_words],
            [is_known, is_unknown],
            [self.operant_column_name, self.language_column_name]):
        if exec_tsk:
            try:
                # slice out the id column plus the task's column ...
                data_part = tokenized_sentences[[
                    self.tweet_ids_column_name, col_nam
                ]]
                # ... then replace the operant column with the filtered
                # token lists
                # NOTE(review): assigning into a slice of the original
                # frame may trigger pandas' SettingWithCopyWarning --
                # verify data_part is an independent copy.
                data_slice = data_part[self.operant_column_name].apply(
                    fltr_fnc)
                data_part[self.operant_column_name] = data_slice
                datasets[tsk_nam] = data_part
            except KeyError:
                error('Cannot locate "%s" required by task "%s"' %
                      (col_nam, tsk_nam))
                raise
            except Exception as err:
                error('Caught unknown exception while preparing datasets')
                print(err)
                raise
            operators[tsk_nam] = tsk_func
            arguments[tsk_nam] = [
                self.db, datasets[tsk_nam], self.table_names[tsk_nam]
            ]
    return operators, arguments
def transform(self, sents):
    """Coerce *sents* into a pandas DataFrame.

    Parameters
    ----------
    sents : DataFrame or any object the DataFrame constructor accepts

    Returns
    -------
    DataFrame
        *sents* unchanged when it already is a DataFrame, otherwise a
        freshly constructed one.

    Raises
    ------
    Exception
        Re-raised after logging when *sents* cannot be parsed.
    """
    # BUG FIX: use isinstance instead of a type() equality check so that
    # DataFrame subclasses are passed through instead of being re-wrapped.
    if not isinstance(sents, DataFrame):
        try:
            sents = DataFrame(sents)
        except Exception as err:
            error('Cannot parse data into frame')
            print(err)
            raise
    return sents
def has_valid_db_backend(class_instance):
    """Validate that *class_instance* carries a usable db backend.

    Raises
    ------
    AssertionError
        When the instance has no ``db`` attribute, or when its ``db``
        object has no ``cursor`` attribute (logged with a hint first).
    """
    try:
        assert hasattr(class_instance, 'db')
    # BUG FIX: the assert raises AssertionError, not KeyError, so the
    # original handler (and its configuration hint) could never run.
    except AssertionError:
        exmpl = "conf['map_word_to_embeding_indices_conf']['kwargs']['wrapper_db']=<db-backend-isntance>"
        error('Specify a db backend isntance in your main file, e.g.: %s' % exmpl)
        raise
    try:
        assert hasattr(class_instance.db, 'cursor')
    except AssertionError:
        # BUG FIX: this is a free function -- there is no ``self`` here;
        # also the message now names the attribute actually checked.
        error('Make sure db instance "%s" has a "cursor" attribute' % class_instance.db)
        raise
def _connect(self, pwd):
    """Open and return a psycopg connection.

    Uses this instance's ``user``, ``database`` and ``host`` settings
    together with the supplied password; logs and re-raises on failure.
    """
    try:
        connection = psycopg.connect(
            user=self.user,
            password=pwd,
            database=self.database,
            host=self.host,
        )
    except Exception as err:
        error('Error while connecting to "%s@%s" as %s' % (self.database, self.host, self.user))
        error(err)
        raise
    return connection
def _check_derived_class_argument(self, arguments, default_values):
    """Apply positional defaults for missing instance attributes.

    For each (attribute-name, default) pair, check whether ``self``
    already has the attribute; when it does not, log a warning and set
    the default.

    Parameters
    ----------
    arguments : sequence of str
        Attribute names expected on the instance.
    default_values : sequence
        Defaults, matched positionally with *arguments*.
    """
    for arg, val in zip(arguments, default_values):
        if not hasattr(self, arg):
            class_name = self.__class__.__name__
            try:
                warn('%s: argument "%s" has no value using defaults:' % (class_name, arg))
                debug(val)
                setattr(self, arg, val)
            except Exception:
                # BUG FIX: typo in the log message ("valeus" -> "values").
                error('Cannot set default values for argument %s' % arg)
                raise
def configure(self, **kwargs):
    """Configure the pipeline from a conf dict.

    Reads the configuration from ``kwargs['conf']`` when supplied,
    falling back to ``self.conf``; validates the mandatory keys, builds
    the step list and hands it to the pipeline backend's ``__init__``.

    Raises
    ------
    KeyError
        When a mandatory configuration key is missing.
    AssertionError
        When no steps are configured, or the backend dropped steps.
    """
    conf = kwargs.pop('conf', self.conf)
    # parse configuration
    for arg in ['pipeline_version', 'pipeline_name']:
        try:
            # BUG FIX: the original read self.conf.get(arg), which (a)
            # silently ignored a caller-supplied conf and (b) returned
            # None instead of raising, so the "mandatory" handler below
            # was dead code. Index into the selected conf instead.
            setattr(self, arg, conf[arg])
        except Exception:
            error(
                '"%s" is mandatory, not found in the provided configuration:'
                % arg)
            raise
    memory = conf.get('memory', False)
    steps_cnf = conf.get('steps', None)
    # create pipline steps
    assert len(steps_cnf) >= 1, 'Pipeline without any components.'
    self.pipeline_steps = self._create_steps(steps_cnf, conf)
    # pipeline backend
    super().__init__(steps=self.pipeline_steps, memory=memory)
    try:
        # pipline backed
        assert len(steps_cnf) == len(self.steps)
    except AssertionError:
        error('Pipeline components where not appended properly.')
        error('The requested pipeline configuration:')
        pprint(steps_cnf)
        error('Was parsed into the pipeline backed as follows:')
        pprint(self.steps)
        raise
def persist(backend, insert_qry):
    """Execute *insert_qry* through *backend* and report the outcome.

    Duplicate-key violations (postgres code 23505) are tolerated with a
    warning; any other exception is logged and re-raised.

    Returns
    -------
    bool
        The backend's commit status, or False when the insert failed
        with a tolerated duplicate-key violation.
    """
    committed = False
    try:
        committed = backend.execute_insert(insert_qry)
        debug('Excecuted query: %s' % insert_qry)
    except Exception as err:
        # BUG FIX: non-psycopg exceptions carry no ``pgcode`` attribute,
        # so the original handler itself raised AttributeError; use
        # getattr with a None default.
        if getattr(err, 'pgcode', None) == '23505':
            warn('Caught primary key vioaltion, when %s' % insert_qry)
        else:
            error('Throwing unknown runtime exception, when: %s' % insert_qry)
            print(err, getattr(err, 'pgcode', None))
            raise
    return committed
def __init__(self, *args, nthreads=1):
    """Wire up a pipeline/streamer pair plus threading bookkeeping.

    Parameters
    ----------
    *args
        args[0]: the pipeline instance; args[1]: the streamer instance.
    nthreads : int
        Number of worker threads (default 1).
    """
    # parse args
    try:
        self._pipeline = args[0]
        self._streamer = args[1]
    except Exception as err:
        # BUG FIX: replace the placeholder message ('naaaaaa') and
        # re-raise -- the original swallowed the error and handed back a
        # half-initialised instance that failed later and obscurely.
        error('__init__ requires a pipeline and a streamer as positional arguments')
        print(err)
        raise
    self._num_threads = nthreads
    self.input_count = np.int64(0)
    self.output_count = np.int64(0)
    self.time_batch_excecution = False
    self._time_measurements = []
def persist_urls(*args):
    """Persist (sentence_id, url) pairs to the given table.

    Positional args: db backend, nested list of url lists (one list per
    sentence), the matching sentence ids, and the target table name.

    Returns
    -------
    list
        Per-row commit status from :func:`persist`.

    Raises
    ------
    RuntimeError
        When fewer than four positional arguments are supplied.
    """
    # TODO: This needs to be updated to be compatible with pandas
    try:
        # parse args
        db = args[0]     # db_backend
        urlsl = args[1]  # nested list of urls
        ids = args[2]    # sentence_ids
        name = args[3]   # table_name
    # BUG FIX: indexing a tuple raises IndexError, not KeyError, so the
    # original handler could never fire.
    except IndexError as err:
        error('Not enough arguments to persist urls')
        raise RuntimeError(err)
    # pair every url with its sentence id, then flatten
    urls_nested = [[(id, url) for url in urls] for id, urls in zip(ids, urlsl)]
    urls_flatned = [(id, url) for nurl in urls_nested for id, url in nurl]
    # one "('id','url')" values tuple per row (the original wrapped this
    # in a no-op join over a single-element list)
    insert_data = ["('%s','%s')" % tpl for tpl in urls_flatned]
    return [persist(db, insert_qry(name, row)) for row in insert_data]
def __init__(self, *args, **kwargs):
    """Initialise the step, apply attribute defaults and, when url
    persistence is enabled, validate the required data and backend."""
    super().__init__(*args, **kwargs)
    # fall back to defaults for any attribute the conf did not provide
    expected = ["persist_urls", "tweet_ids_column_name", "table_name"]
    defaults = [False, 'id', "urls"]
    self._check_derived_class_argument(expected, defaults)
    if not self.persist_urls:
        return
    # url persistence requested: sentence ids are mandatory
    try:
        assert self.sentence_ids
    except AssertionError:
        error(
            '"sentence_ids" argument is required when "persist_urls" is True.'
        )
        raise
    has_valid_db_backend(self)
    has_table(self.db, self.table_name)
def instansiate_engine(*arguments):
    """Import, validate and instantiate a class by module/class name.

    Positional args: module_name (str), class_name (str), optionally a
    list of constructor args and a dict of constructor kwargs.

    Returns
    -------
    object
        The freshly constructed instance.

    Raises
    ------
    AssertionError
        On malformed arguments (after logging a hint).
    Exception
        Re-raised after logging when construction fails.
    """
    # check required args
    # BUG FIX: the message contained a "%s" placeholder with nothing
    # interpolated into it; format the offending arguments in.
    assert len(arguments) >= 2, error('Parsed arguments "%s" cannot be used to instantiate class' % (arguments,))
    module_name = arguments[0]
    class_name = arguments[1]
    # use isinstance for type checks (accepts str subclasses)
    assert isinstance(module_name, str) and isinstance(class_name, str), \
        error('Module and class names must be of "str" type. Got "%s" and "%s" instead.' % (type(module_name), type(class_name)))
    # check optional args
    args = arguments[2] if len(arguments) >= 3 else []
    kwargs = arguments[3] if len(arguments) == 4 else {}
    if args:
        assert isinstance(args, list), error('Cannot parse "%s" args correctly' % class_name)
    if kwargs:
        assert isinstance(kwargs, dict), error('Cannot parse "%s" kwargs correctly' % class_name)
    # instansiate
    class_proxy = import_class_proxy(module_name, class_name)
    try:
        class_instance = class_proxy(*args, **kwargs)
    except Exception as err:
        error('Cannot instansiate class "%s"' % (class_proxy.__name__))
        print(err)
        raise
    debug('Instansiated class "%s"' % class_proxy.__name__)
    if args:
        debug(' args %s' % args)
    if kwargs:
        debug(' kwargs %s' % kwargs)
    return class_instance
def persist_unknown_words(*args):
    """Persist (unknown_word, language) pairs -- NOT functional yet.

    The guard assert below intentionally blocks execution until the
    implementation is ported to pandas.
    """
    # TODO: This needs to be updated to be compatible with pandas, like the above one
    # BUG FIX: fixed the duplicated word in the guard message ("is is")
    # and removed a leftover "import pdb; pdb.set_trace()" debugging
    # statement.
    assert False, 'Unknown words persistance is not ready yet'
    try:
        # parse args
        db = args[0]    # db_backend
        data = args[1]  # raw isnert data
        name = args[2]  # table_name
    # BUG FIX: tuple indexing raises IndexError, not KeyError, so the
    # original handler could never fire.
    except IndexError as err:
        error('Not enough arguments to persist unknown words')
        raise RuntimeError(err)
    # helping stuff: keep tokens that are strings (i.e. unknown words)
    uwrds = lambda snt: [w for w in snt if str == type(w) != int]
    # NOTE(review): ``snts`` and ``lang`` are undefined in this scope --
    # the code below predates the pandas port and cannot run as-is
    # (unreachable behind the guard assert); presumably both should be
    # derived from ``data``.
    unknown_words_nested = [[(uw, ln) for uw in uwrds(snt)]
                            for snt, ln in zip(snts, lang) if uwrds(snt)]
    unknown_words_flatned = [(uw, l) for unwnst in unknown_words_nested
                             for uw, l in unwnst]
    insert_data = ["('%s','%s')" % tpl for tpl in unknown_words_flatned]
    return [persist(db, insert_qry(name, row)) for row in insert_data]
def persist_sentences(*args):
    """Persist tokenised sentences as (id, '{tokens}') rows.

    Positional args: db backend, a two-column DataFrame of raw insert
    data (id column first, token-list column second), and the target
    table name.

    Returns
    -------
    list
        Per-row commit status from :func:`persist`; [] when the frame
        is empty.
    """
    try:
        # parse args
        db = args[0]    # db_backend
        data = args[1]  # raw isnert_data
        name = args[2]  # table_name
    # BUG FIX: tuple indexing raises IndexError, not KeyError, so the
    # original handler could never fire; a bare ``raise`` also keeps
    # the original traceback instead of re-raising the bound name.
    except IndexError:
        error('Not enough arguments to persist sentences')
        raise
    # helping stuff: render a row as "(id, '{tokens}')" and strip the
    # python list brackets so postgres sees an array literal
    row_to_string = lambda row: "(%s, '{%s}')" % (row.values[0], row.values[1])
    insert_frmter = lambda row: row_to_string(row).replace('[', '').replace(']', '')
    # prepare insert
    if data.shape[0] == 0:
        responce = []
        warn('Nothing to persist.')
    else:
        insert_data = data.apply(insert_frmter, axis=1)
        responce = [persist(db, insert_qry(name, row)) for row in insert_data]
    return responce
def __init__(self, *args, **kwargs):
    """Initialise the embedding-mapping step and validate its setup.

    Applies attribute defaults, verifies the db backend, the language
    model table and the query helper, and -- for each enabled
    persistence flag -- verifies the corresponding target table.
    Raises (after logging a hint) when any requirement is missing.
    """
    #TODO: reduce the size of checks ????.....
    super().__init__(*args, **kwargs)
    # check attributes
    self._check_derived_class_argument([
        "persist_sentences", "persist_unknown_words", "tweet_ids_column_name",
        "language_column_name", 'table_names'
    ], [False, False, "id", "lang", {}])
    # insertion metrics: one slot per enabled persistence task
    self.metrics = {
        key: None
        for key in ['persist_sentences', 'persist_unknown_words']
        if getattr(self, key)
    }
    # guarantee db engine
    has_valid_db_backend(self)
    # guarantedd language model (word embedings)
    try:
        self.language_model = self.table_names['language_model']
    except KeyError as err:
        error(
            'Specify "wrapper_table_names.language_model" in the pipeline conf file'
        )
        raise
    has_table(self.db, self.language_model)
    # the embedding-lookup query helper must be importable
    try:
        assert get_embeding_qry
    except AssertionError as err:
        error(
            'Cannot locate "get_embeding_qry" from module utilities.postgres_queries'
        )
        raise
    # guarante persistance of sentences and unknown words
    for flag_name in ['persist_sentences', 'persist_unknown_words']:
        if getattr(self, flag_name):
            try:
                # list of tables in the db
                # NOTE(review): a failing assert raises AssertionError,
                # which this KeyError handler cannot catch -- the hint
                # below is never logged; looks like a bug to confirm.
                assert flag_name in self.table_names.keys()
            except KeyError as err:
                msg = 'Specify wrapper_table_names."%s" in the pipeline conf file' % flag_name
                error(msg)
                raise
            has_table(self.db, self.table_names[flag_name])
def has_table(backend, table_name):
    """Assert that *table_name* exists in the database reachable via
    *backend*; logs and re-raises AssertionError when it does not."""
    try:
        # the table name sits at index 2 of each row returned by the
        # table-listing query
        known_tables = [row[2] for row in backend.execute(list_of_tables_qry)]
        assert table_name in known_tables
    except AssertionError:
        error('Cannot locate table "%s" in the database' % table_name)
        raise