コード例 #1
0
    def transform(self, sents):
        """Convert sentences to embedding tokens and run the collected tasks.

        After logging progress and delegating to the parent ``transform``,
        every entry of the operant column is passed through
        ``self.sentence_to_embeding_tokens``. The operators returned by
        ``self._collect_tasks`` are then called, each with the positional
        arguments registered under the same name.

        For each enabled persistence flag (``persist_sentences``,
        ``persist_unknown_words``) the ratio ``sum(result) / len(result)`` of
        the matching task result is stored under
        ``self.metrics[flag]['completed_inserts']``.

        Returns the result of the ``filter_unknown_words`` task.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        # basic sentence filtering
        column = self.operant_column_name
        sents[column] = sents[column].apply(self.sentence_to_embeding_tokens)

        # collect the tasks and execute each one with its own arguments
        operators, arguments = self._collect_tasks(sents)
        results = {}
        for task_name, operator in operators.items():
            results[task_name] = operator(*arguments[task_name])

        # measure the persisted fraction for every enabled persistence flag
        for flag in ('persist_sentences', 'persist_unknown_words'):
            if not getattr(self, flag):
                continue
            try:
                outcomes = results[flag]
                fraction = float(sum(outcomes)) / float(len(outcomes))
                self.metrics[flag] = {'completed_inserts': fraction}
            except ZeroDivisionError:
                # an empty task result gives no ratio to report
                warn(
                    'Caught zero division error. Try increasing the batch size.'
                )

        return results['filter_unknown_words']
コード例 #2
0
    def transform(self, sents):
        """Replace numeric tokens in the operant column by their spelling.

        Each entry of the operant column is iterated token by token. Tokens
        for which ``str.isnumeric`` is true are handed to
        ``self.underlying_engine.number_to_words``; all other tokens are kept
        as they are. A token whose conversion raises is replaced by an empty
        string after a warning.

        Returns ``sents`` with the operant column rewritten.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        def spell_out(token):
            # non-numeric tokens pass through untouched
            if not token.isnumeric():
                return token
            try:
                return self.underlying_engine.number_to_words(token)
            except NumOutOfRangeError:
                warn('NumOutOfRangeError caught from inflect engine for %s' %
                     token)
            except Exception:
                warn('Caught unknown exception from inflect engine')
            # conversion failed: fall back to an empty token
            return ''

        def replace_numbers(sentence):
            return [spell_out(token) for token in sentence]

        column = self.operant_column_name
        sents[column] = sents[column].apply(replace_numbers)

        return sents
コード例 #3
0
    def transform(self, sents):
        """Drop the rows whose operant column matches the configured pattern.

        A row is kept only when ``re.match`` of ``self._regular_expresion``
        against its operant column value finds nothing, i.e. the pattern does
        not match at the start of the value.

        Returns the filtered selection of the parent-transformed ``sents``.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        def keep_row(row):
            matched = re.match(self._regular_expresion,
                               row[self.operant_column_name])
            return not matched

        # row-wise boolean mask, then boolean indexing
        mask = sents.apply(keep_row, axis=1)
        return sents[mask]
コード例 #4
0
    def transform(self, sents):
        """Apply ``self._tokenizer`` to every entry of the operant column.

        Returns ``sents`` with the operant column replaced by the tokenizer
        output.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        column = self.operant_column_name
        tokenized = sents[column].apply(self._tokenizer)
        sents[column] = tokenized

        return sents
コード例 #5
0
    def transform(self, sents):
        """Delete every match of the configured pattern from the operant column.

        Each entry of the operant column is rewritten with
        ``re.sub(self._regular_expresion, '', entry)``.

        Returns ``sents`` with the operant column rewritten.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        def strip_matches(sentence):
            return re.sub(self._regular_expresion, '', sentence)

        column = self.operant_column_name
        sents[column] = sents[column].apply(strip_matches)

        return sents
コード例 #6
0
ファイル: postgres.py プロジェクト: vsyropou/joker
    def __init__(self):
        """Check the temporary directory and the database connection.

        Stores in ``self._class_prefix`` the part of the class name that
        precedes ``'PostgresDatabaseService'``, then calls
        ``self._check_connection()``.

        Raises:
            AssertionError: if the temporary directory path does not exist.
        """

        # check tmp dir path exists
        tmp_directory_path = '/tmp'
        if not os.path.exists(tmp_directory_path):
            # Raised explicitly (same exception type as the former ``assert``)
            # so the check still runs under ``python -O``; the path is now
            # substituted into the message instead of leaving a raw "%s".
            raise AssertionError(
                'Path "%s" does not exist. Try specifieng path correctly or set the '
                'global property "tmp_directory_path" accordingly'
                % tmp_directory_path)

        self._class_prefix = self.__class__.__name__.split('PostgresDatabaseService')[0]

        # check password and connection
        self._check_connection()

        debug('Instantiated db client to: "%s" database @%s.'%(self.database,self.host))
コード例 #7
0
    def transform(self, sents):
        """Run ``self.underlying_engine`` over every operant column entry.

        The engine is called as ``engine(entry, delimiters=self.delimeters)``
        and its return value replaces the entry.

        Returns ``sents`` with the operant column rewritten.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        def run_engine(sentence):
            return self.underlying_engine(sentence,
                                          delimiters=self.delimeters)

        column = self.operant_column_name
        sents[column] = sents[column].apply(run_engine)

        return sents
コード例 #8
0
    def _check_derived_class_argument(self, arguments, default_values):
        """Give every missing attribute named in ``arguments`` its default.

        ``arguments`` and ``default_values`` are walked in lockstep. For each
        name that is not yet an attribute of the instance, a warning is
        logged and the paired default is assigned with ``setattr``. Any
        exception raised while doing so is logged and re-raised.
        """
        owner = self.__class__.__name__

        for name, default in zip(arguments, default_values):
            # attributes that are already present are left alone
            if hasattr(self, name):
                continue
            try:
                warn('%s: argument "%s" has no value using defaults:' %
                     (owner, name))
                debug(default)
                setattr(self, name, default)
            except Exception:
                error('Cannot set default valeus for argument %s' % name)
                raise
コード例 #9
0
    def transform(self, sents):
        """Remove the tokens listed in ``self._stop_words``.

        Each entry of the operant column is iterated token by token and
        rebuilt as a list without the tokens found in ``self._stop_words``.

        Returns ``sents`` with the operant column rewritten.
        """
        step_info = (self.order, self.num_pipeline_steps,
                     self.__class__.__name__)
        debug('Progressing %s/%s steps (%s)' % step_info)

        sents = super().transform(sents)

        def drop_stop_words(sentence):
            return [
                token for token in sentence
                if token not in self._stop_words
            ]

        column = self.operant_column_name
        sents[column] = sents[column].apply(drop_stop_words)

        return sents
コード例 #10
0
    def word_to_embeding_token(self, wrd):
        """Look up the embedding token of ``wrd`` in the database.

        The query built by ``get_embeding_qry(wrd, self.language_model)`` is
        run through ``self.db.execute``.

        Args:
            wrd: the word to look up.

        Returns:
            ``response[0][0]`` when the query returns a non-empty response,
            otherwise ``wrd`` itself (the word is unknown).

        Raises:
            Exception: any failure while querying or unpacking the response
                is logged and re-raised.
        """

        try:
            response = self.db.execute(
                get_embeding_qry(wrd, self.language_model))
            # Explicit emptiness test instead of ``assert``: assertions are
            # stripped under ``python -O``, which made an empty response
            # crash on ``response[0]`` rather than count as an unknown word.
            found = bool(response)
            result = response[0][0] if found else wrd
        except Exception as err:
            prerror('Caught unknown exception')
            print(err)
            raise

        if not found:
            debug('Found unknown word "%s"' % wrd)

        return result
コード例 #11
0
def persist(backend, insert_qry):
    """Execute an insert query, tolerating unique-constraint violations.

    Args:
        backend: object exposing ``execute_insert(query)``.
        insert_qry: the insert statement handed to the backend.

    Returns:
        The value returned by ``backend.execute_insert``, or ``False`` when
        the insert failed with SQLSTATE ``23505``.

    Raises:
        Exception: any other failure is logged and re-raised unchanged.
    """

    committed = False
    try:
        committed = backend.execute_insert(insert_qry)

    except Exception as err:
        # Not every exception has a ``pgcode`` attribute; reading it
        # unconditionally raised AttributeError here and hid the real error.
        pgcode = getattr(err, 'pgcode', None)

        # 23505 is PostgreSQL's unique_violation SQLSTATE
        if pgcode == '23505':
            warn('Caught primary key vioaltion, when %s'%insert_qry)
        else:
            error('Throwing unknown runtime exception, when: %s'%insert_qry)
            print(err, pgcode)
            raise

    else:
        debug('Excecuted query: %s'%insert_qry)

    return committed
コード例 #12
0
def instansiate_engine(*arguments):
    """Import a class by module and class name and return an instance of it.

    Positional arguments, in order:
        module_name (str): forwarded to ``import_class_proxy``.
        class_name (str): forwarded to ``import_class_proxy``.
        args (list, optional): positional arguments for the constructor.
        kwargs (dict, optional): keyword arguments for the constructor.

    Returns:
        The instance created by calling the imported class.

    Raises:
        AssertionError: if fewer than two arguments are given, the names are
            not strings, or non-empty ``args``/``kwargs`` have the wrong type.
        Exception: a failure inside the constructor is logged and re-raised.
    """

    def fail(message):
        # Log, then raise the exception type the former ``assert`` statements
        # produced; raising explicitly keeps validation alive under
        # ``python -O`` and gives the exception a readable message.
        error(message)
        raise AssertionError(message)

    # check required args
    if len(arguments) < 2:
        # the "%s" placeholder used to be logged unformatted
        fail('Parsed arguments "%s" cannot be used to instantiate class' % (arguments,))
    module_name = arguments[0]
    class_name = arguments[1]
    if not (isinstance(module_name, str) and isinstance(class_name, str)):
        fail('Module and class names must be of "str" type. Got "%s" and "%s" instead.'%(type(module_name),type(class_name)))

    # check optional args
    args = arguments[2] if len(arguments) >= 3 else []
    # ">= 4" mirrors the args check; "== 4" silently dropped kwargs whenever
    # more than four arguments were passed
    kwargs = arguments[3] if len(arguments) >= 4 else {}
    if args and not isinstance(args, list):
        fail('Cannot parse "%s" args correctly'%class_name)
    if kwargs and not isinstance(kwargs, dict):
        fail('Cannot parse "%s" kwargs correctly'%class_name)

    # instantiate
    class_proxy = import_class_proxy(module_name, class_name)
    try:
        class_instance = class_proxy(*args, **kwargs)
    except Exception as err:
        error('Cannot instansiate class "%s"'%(class_proxy.__name__))
        print(err)
        raise

    debug('Instansiated class "%s"'%class_proxy.__name__)
    if args:
        debug(' args %s'%args)
    if kwargs:
        debug(' kwargs %s'%kwargs)

    return class_instance