def transform(self, sents):
    """Convert sentences to embedding tokens, run the collected tasks and
    record persistence metrics.

    The operand column of ``sents`` is first passed through the parent
    ``transform`` and then mapped with ``self.sentence_to_embeding_tokens``.
    ``self._collect_tasks`` supplies the operators and their positional
    arguments, both keyed by task name; every operator is executed once.

    For each of ``persist_sentences`` / ``persist_unknown_words`` whose
    flag attribute on ``self`` is truthy, the fraction ``sum / len`` of the
    task result is stored in ``self.metrics[key]['completed_inserts']``.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: the result of the ``filter_unknown_words`` task.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)
    sents = super().transform(sents)

    # Basic sentence filtering.
    column = self.operant_column_name
    sents[column] = sents[column].apply(self.sentence_to_embeding_tokens)

    # Collect the tasks and run each one with its own arguments.
    operators, arguments = self._collect_tasks(sents)
    results = {}
    for name, operator in operators.items():
        results[name] = operator(*arguments[name])

    # Measure the fraction of successful persistence operations.
    for key in ('persist_sentences', 'persist_unknown_words'):
        if not getattr(self, key):
            continue
        try:
            fraction = float(sum(results[key])) / float(len(results[key]))
            self.metrics[key] = {'completed_inserts': fraction}
        except ZeroDivisionError:
            # An empty task result makes the denominator zero.
            warn(
                'Caught zero division error. Try increasing the batch size.'
            )

    return results['filter_unknown_words']
def transform(self, sents):
    """Replace numeric tokens in the operand column with their spelled-out
    form.

    Each sentence is expected to be an iterable of string tokens. A token
    for which ``str.isnumeric()`` is true is converted with
    ``self.underlying_engine.number_to_words``; if that conversion raises,
    a warning is logged and the token becomes an empty string. All other
    tokens are kept unchanged.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: ``sents`` with the operand column rewritten.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)
    sents = super().transform(sents)

    def spell_out(number):
        # Best effort: any conversion failure degrades to an empty token.
        try:
            return self.underlying_engine.number_to_words(number)
        except NumOutOfRangeError:
            warn('NumOutOfRangeError caught from inflect engine for %s'
                 % number)
        except Exception:
            warn('Caught unknown exception from inflect engine')
        return ''

    def spell_out_numbers(sentence):
        return [
            spell_out(token) if token.isnumeric() else token
            for token in sentence
        ]

    column = self.operant_column_name
    sents[column] = sents[column].apply(spell_out_numbers)
    return sents
def transform(self, sents):
    """Drop every row whose operand-column text matches the step's regex.

    A row is kept only when ``re.match(self._regular_expresion, text)``
    finds no match at the start of the text (presumably the pattern
    identifies retweets -- confirm against the class definition).

    :param sents: row-oriented container supporting ``apply(..., axis=1)``
        and boolean-mask indexing (presumably a pandas DataFrame).
    :return: the subset of rows that did not match the pattern.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)

    def is_kept(row):
        # ``re.match`` yields None when the pattern does not match.
        text = row[self.operant_column_name]
        return re.match(self._regular_expresion, text) is None

    sents = super().transform(sents)
    keep_mask = sents.apply(is_kept, axis=1)
    return sents[keep_mask]
def transform(self, sents):
    """Tokenize every entry of the operand column.

    After the parent ``transform`` has run, ``self._tokenizer`` is applied
    to each value of the column named by ``self.operant_column_name`` and
    the column is overwritten with the results.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: ``sents`` with the operand column tokenized.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)

    sents = super().transform(sents)
    column = self.operant_column_name
    tokenized = sents[column].apply(self._tokenizer)
    sents[column] = tokenized
    return sents
def transform(self, sents):
    """Strip every match of the step's regex from the operand column.

    After the parent ``transform`` has run, each value of the column named
    by ``self.operant_column_name`` has all occurrences of
    ``self._regular_expresion`` replaced by the empty string.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: ``sents`` with the operand column cleaned.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)
    sents = super().transform(sents)

    def strip_matches(sentence):
        return re.sub(self._regular_expresion, '', sentence)

    column = self.operant_column_name
    sents[column] = sents[column].apply(strip_matches)
    return sents
def __init__(self):
    """Validate the environment and open-check the database connection.

    Steps performed:

    1. Verify that the temporary directory ``/tmp`` exists.
    2. Derive ``self._class_prefix`` from the class name: everything that
       precedes ``'PostgresDatabaseService'``.
    3. Call ``self._check_connection()`` to check password and connection.
    4. Log the database and host the client was instantiated for.

    :raises AssertionError: if ``/tmp`` does not exist. The error is raised
        explicitly (rather than through ``assert``) so the check is not
        stripped when Python runs with ``-O``.
    """
    # Check that the tmp dir path exists.
    tmp_path = '/tmp'
    if not os.path.exists(tmp_path):
        # The message previously contained an unformatted "%s" placeholder;
        # it is now filled with the path that was actually checked.
        raise AssertionError(
            'Path "%s" does not exist. Try specifieng path correctly or '
            'set the global property "tmp_directory_path" accordingly'
            % tmp_path
        )

    class_name = self.__class__.__name__
    self._class_prefix = class_name.split('PostgresDatabaseService')[0]

    # Check password and connection.
    self._check_connection()
    debug('Instantiated db client to: "%s" database @%s.'
          % (self.database, self.host))
def transform(self, sents):
    """Run the underlying engine over every entry of the operand column.

    After the parent ``transform`` has run, each value of the column named
    by ``self.operant_column_name`` is replaced by
    ``self.underlying_engine(value, delimiters=self.delimeters)``.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: ``sents`` with the operand column rewritten.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)
    sents = super().transform(sents)

    def run_engine(sentence):
        return self.underlying_engine(sentence, delimiters=self.delimeters)

    column = self.operant_column_name
    sents[column] = sents[column].apply(run_engine)
    return sents
def _check_derived_class_argument(self, arguments, default_values):
    """Give every missing attribute its default value.

    ``arguments`` and ``default_values`` are walked pairwise (``zip``, so
    the shorter of the two bounds the iteration). For each attribute name
    that ``self`` does not already have, a warning and the default value
    are logged and the attribute is set to that default.

    :param arguments: iterable of attribute names.
    :param default_values: iterable of defaults, aligned with ``arguments``.
    :raises Exception: whatever logging or ``setattr`` raised; it is logged
        through ``error`` and re-raised unchanged.
    """
    for name, default in zip(arguments, default_values):
        if hasattr(self, name):
            # Already provided by the derived class; nothing to do.
            continue

        owner = self.__class__.__name__
        try:
            warn('%s: argument "%s" has no value using defaults:'
                 % (owner, name))
            debug(default)
            setattr(self, name, default)
        except Exception:
            error('Cannot set default valeus for argument %s' % name)
            raise
def transform(self, sents):
    """Remove stop words from every sentence of the operand column.

    Each sentence is expected to be an iterable of tokens; tokens contained
    in ``self._stop_words`` are dropped and the remaining ones are kept in
    their original order as a list.

    :param sents: sentence container indexed by column name
        (presumably a pandas DataFrame -- confirm against the caller).
    :return: ``sents`` with the operand column filtered.
    """
    step_info = (self.order, self.num_pipeline_steps,
                 self.__class__.__name__)
    debug('Progressing %s/%s steps (%s)' % step_info)
    sents = super().transform(sents)

    def without_stop_words(sentence):
        return [
            word for word in sentence
            if word not in self._stop_words
        ]

    column = self.operant_column_name
    sents[column] = sents[column].apply(without_stop_words)
    return sents
def word_to_embeding_token(self, wrd):
    """Look up the embedding token for a word, falling back to the word.

    The query built by ``get_embeding_qry(wrd, self.language_model)`` is
    executed through ``self.db.execute``. When the response is non-empty,
    the first column of its first row is returned. When the response is
    empty (or otherwise falsy), the word is considered unknown: this is
    logged at debug level and ``wrd`` itself is returned.

    The emptiness check is an explicit condition rather than an ``assert``
    so that the unknown-word fallback keeps working when Python runs with
    ``-O`` (which strips ``assert`` statements).

    :param wrd: the word to look up.
    :return: the stored embedding token, or ``wrd`` for an unknown word.
    :raises Exception: any non-assertion error from the lookup is logged
        and re-raised unchanged.
    """
    try:
        response = self.db.execute(
            get_embeding_qry(wrd, self.language_model))
        if response:
            return response[0][0]
    except AssertionError:
        # Backward compatibility: an AssertionError escaping the lookup was
        # historically treated the same way as an empty response.
        pass
    except Exception as err:
        prerror('Caught unknown exception')
        print(err)
        raise

    debug('Found unknown word "%s"' % wrd)
    return wrd
def persist(backend, insert_qry):
    """Execute an insert query, tolerating duplicate-key violations.

    :param backend: object exposing ``execute_insert(query)``.
    :param insert_qry: the insert query to execute.
    :return: the value returned by ``backend.execute_insert`` on success,
        or ``False`` when the insert was rejected with SQLSTATE ``23505``
        (PostgreSQL ``unique_violation``).
    :raises Exception: any other failure is logged and re-raised unchanged.
    """
    committed = False
    try:
        committed = backend.execute_insert(insert_qry)
        debug('Excecuted query: %s' % insert_qry)
    except Exception as err:
        # Not every exception carries ``pgcode`` (presumably only the
        # database driver's errors do). Reading it defensively keeps the
        # original error from being masked by an AttributeError.
        pgcode = getattr(err, 'pgcode', None)
        if pgcode == '23505':
            warn('Caught primary key vioaltion, when %s' % insert_qry)
        else:
            error('Throwing unknown runtime exception, when: %s'
                  % insert_qry)
            print(err, pgcode)
            raise
    return committed
def instansiate_engine(*arguments):
    """Import a class by module/class name and instantiate it.

    Positional layout of ``arguments``:

    0. module name (``str``, required)
    1. class name (``str``, required)
    2. positional arguments for the constructor (``list``, optional)
    3. keyword arguments for the constructor (``dict``, optional)

    Anything beyond the fourth element is ignored. ``None`` in slot 2 or 3
    is treated as "not given".

    Validation failures are logged through ``error`` and raised as
    ``AssertionError`` carrying the same message. They are raised
    explicitly so the checks survive ``python -O``.

    :return: the new instance of the requested class.
    :raises AssertionError: if the arguments are malformed.
    :raises Exception: whatever the constructor raised; it is logged and
        re-raised unchanged.
    """
    def _fail(message):
        # Log and raise with the same text so the exception is informative.
        error(message)
        raise AssertionError(message)

    # Check required args.
    if len(arguments) < 2:
        _fail('Parsed arguments "%s" cannot be used to instantiate class'
              % (arguments,))
    module_name, class_name = arguments[0], arguments[1]
    if not (isinstance(module_name, str) and isinstance(class_name, str)):
        _fail('Module and class names must be of "str" type. '
              'Got "%s" and "%s" instead.'
              % (type(module_name), type(class_name)))

    # Check optional args.
    args = arguments[2] if len(arguments) >= 3 else []
    # ">= 4" (not "== 4") so kwargs are not silently dropped when extra
    # trailing arguments are supplied; mirrors the handling of ``args``.
    kwargs = arguments[3] if len(arguments) >= 4 else {}
    if args is None:
        args = []
    if kwargs is None:
        kwargs = {}
    if args and not isinstance(args, list):
        _fail('Cannot parse "%s" args correctly' % class_name)
    if kwargs and not isinstance(kwargs, dict):
        _fail('Cannot parse "%s" kwargs correctly' % class_name)

    # Instantiate.
    class_proxy = import_class_proxy(module_name, class_name)
    try:
        class_instance = class_proxy(*args, **kwargs)
    except Exception as err:
        error('Cannot instansiate class "%s"' % (class_proxy.__name__))
        print(err)
        raise

    debug('Instansiated class "%s"' % class_proxy.__name__)
    if args:
        debug(' args %s' % args)
    if kwargs:
        debug(' kwargs %s' % kwargs)
    return class_instance