コード例 #1
0
def import_class_proxy(module_name, class_name):
    """Import a module and return the attribute named ``class_name`` from it.

    Args:
        module_name: Module path handed to ``import_module_proxy``.
        class_name: Name of the attribute (typically a class) to fetch.

    Returns:
        The object bound to ``class_name`` in the imported module.

    Raises:
        Exception: Whatever the import or the attribute lookup raised; the
            failure is logged through ``error`` and re-raised unchanged.
    """
    module_proxy = import_module_proxy(module_name)
    try:
        class_proxy = getattr(module_proxy, class_name)
    except Exception:
        # ``module_name`` is normally a plain string, which has no
        # ``__name__``; reading it here used to raise AttributeError inside
        # the handler and hide the real failure. Prefer the imported
        # module's own name and fall back to what the caller passed in.
        shown_name = getattr(module_proxy, '__name__', module_name)
        error('Cannot import class "%s" from module "%s"' % (class_name, shown_name))
        raise

    return class_proxy
コード例 #2
0
def import_module_proxy(module_name):
    """Import and return the module called ``module_name``.

    Args:
        module_name: Module path accepted by ``importlib.import_module``.

    Returns:
        The imported module object.

    Raises:
        Exception: Whatever ``importlib.import_module`` raised; the failure
            is logged through ``error`` and re-raised unchanged.
    """
    try:
        module_proxy = importlib.import_module(module_name)
    except Exception:
        # The two literals are concatenated by the parser, so the first one
        # needs a trailing space (the message used to read "typosand").
        error('Cannot import module "%s". Make sure there are no typos '
              'and configure your environment properly.' % module_name)
        raise

    return module_proxy
コード例 #3
0
    def _collect_tasks(self, tokenized_sentences):
        """Build the task tables (callables and their arguments) for one batch.

        Args:
            tokenized_sentences: Indexable by column name and by a list of
                column names (assumes a pandas DataFrame -- TODO confirm).

        Returns:
            A ``(operators, arguments)`` pair of dicts keyed by task name.
            ``operators`` maps each task to a callable and ``arguments`` maps
            it to the positional-argument list meant for that callable.

        Raises:
            KeyError: A column needed by an enabled optional task is missing
                (logged through ``error`` first).
            Exception: Any other failure while preparing an optional dataset
                (logged and printed first).
        """

        # per-task input data; the mandatory task always gets the operant column
        datasets = {
            'filter_unknown_words':
            tokenized_sentences[self.operant_column_name]
        }

        # Token filters. The chained comparison ``str != type(w) == int`` means
        # ``str != type(w) and type(w) == int``, i.e. it keeps tokens whose
        # type is exactly ``int``; ``is_unknown`` keeps every other token.
        is_known = lambda tok_snt: [
            w for w in tok_snt if str != type(w) == int
        ]
        is_unknown = lambda tok_snt: [
            w for w in tok_snt if not str != type(w) == int
        ]

        # mandatory task: apply ``is_known`` to each entry and return ``.values``
        operators = {
            'filter_unknown_words': lambda df: df.apply(is_known).values
        }
        arguments = {
            'filter_unknown_words': [datasets['filter_unknown_words']]
        }

        # Optional tasks, enabled by the flags of the same name on ``self``.
        # Each tuple is (task name, enabled flag, task callable, token filter,
        # extra column selected next to the tweet-id column).
        for tsk_nam, exec_tsk, tsk_func, fltr_fnc, col_nam in zip(
            ['persist_sentences', 'persist_unknown_words'],
            [self.persist_sentences, self.persist_unknown_words],
            [persist_sentences, persist_unknown_words], [is_known, is_unknown],
            [self.operant_column_name, self.language_column_name]):
            if exec_tsk:
                try:
                    data_part = tokenized_sentences[[
                        self.tweet_ids_column_name, col_nam
                    ]]
                    # NOTE(review): ``data_part`` only holds the id column and
                    # ``col_nam``; for 'persist_unknown_words' ``col_nam`` is
                    # the language column, so this lookup of the operant
                    # column looks like it raises KeyError unless both names
                    # coincide -- confirm.
                    data_slice = data_part[self.operant_column_name].apply(
                        fltr_fnc)

                    # NOTE(review): assigning into a column selection may
                    # trigger pandas' SettingWithCopyWarning -- confirm.
                    data_part[self.operant_column_name] = data_slice
                    datasets[tsk_nam] = data_part
                except KeyError:
                    error('Cannot locate "%s" required by task "%s"' %
                          (col_nam, tsk_nam))
                    raise
                except Exception as err:
                    error('Caught unknown exception while preparing datasets')
                    print(err)
                    raise

                # register the task with (db backend, dataset, target table)
                operators[tsk_nam] = tsk_func
                arguments[tsk_nam] = [
                    self.db, datasets[tsk_nam], self.table_names[tsk_nam]
                ]

        return operators, arguments
コード例 #4
0
    def transform(self, sents):
        """Return ``sents`` as a ``DataFrame``.

        Input whose type is exactly ``DataFrame`` is handed back untouched;
        anything else is passed to the ``DataFrame`` constructor.

        Raises:
            Exception: Whatever the ``DataFrame`` constructor raised; the
                failure is logged and printed before being re-raised.
        """
        # Guard clause: nothing to do for a frame.
        if type(sents) == DataFrame:
            return sents

        try:
            sents = DataFrame(sents)
        except Exception as err:
            error('Cannot parse data into frame')
            print(err)
            raise

        return sents
コード例 #5
0
def has_valid_db_backend(class_instance):
    """Check that ``class_instance`` carries a usable ``db`` attribute.

    Two conditions are verified: the instance has a ``db`` attribute, and
    that attribute exposes ``cursor``.

    Args:
        class_instance: Object expected to hold the database backend.

    Raises:
        AssertionError: Either condition does not hold; the reason is logged
            through ``error`` first.
    """
    # Explicit checks instead of ``assert`` so validation survives ``python -O``.
    # The first handler used to catch KeyError, which ``assert`` never raises,
    # so its hint was never logged.
    if not hasattr(class_instance, 'db'):
        exmpl = "conf['map_word_to_embeding_indices_conf']['kwargs']['wrapper_db']=<db-backend-isntance>"
        error('Specify a db backend isntance in your main file, e.g.: %s' % exmpl)
        raise AssertionError('instance has no "db" attribute')

    if not hasattr(class_instance.db, 'cursor'):
        # The old handler referenced an undefined ``self`` and named a "query"
        # method although the check is for ``cursor``.
        error('Make sure db instance "%s" has a "cursor" method' % (class_instance.db,))
        raise AssertionError('db backend has no "cursor" attribute')
コード例 #6
0
ファイル: postgres.py プロジェクト: vsyropou/joker
    def _connect(self, pwd):
        """Open a connection through ``psycopg.connect`` and return it.

        Uses ``self.user``, ``self.database`` and ``self.host`` together with
        the password given as ``pwd``.

        Raises:
            Exception: Whatever was raised while connecting; the target and
                the error are logged through ``error`` before re-raising.
        """
        try:
            # Collected inside the ``try`` so attribute lookups stay covered
            # by the same handler as the connect call itself.
            credentials = {
                'user': self.user,
                'password': pwd,
                'database': self.database,
                'host': self.host,
            }
            return psycopg.connect(**credentials)
        except Exception as err:
            target = (self.database, self.host, self.user)
            error('Error while connecting to "%s@%s" as %s' % target)
            error(err)
            raise
コード例 #7
0
    def _check_derived_class_argument(self, arguments, default_values):
        """Give every attribute in ``arguments`` that is missing a default.

        ``arguments`` and ``default_values`` are walked in lockstep; for each
        name that is not yet an attribute of ``self`` a warning is emitted,
        the default is logged at debug level and then set on the instance.

        Raises:
            Exception: Whatever was raised while logging or setting the
                default; it is reported through ``error`` and re-raised.
        """
        for name, fallback in zip(arguments, default_values):
            # Attributes that already exist are left alone.
            if hasattr(self, name):
                continue

            owner = self.__class__.__name__
            try:
                warn('%s: argument "%s" has no value using defaults:' %
                     (owner, name))
                debug(fallback)
                setattr(self, name, fallback)
            except Exception:
                error('Cannot set default valeus for argument %s' % name)
                raise
コード例 #8
0
    def configure(self, **kwargs):
        """Configure the pipeline from a configuration mapping.

        Args:
            **kwargs: May contain ``conf``, a mapping that is used instead of
                ``self.conf`` for this call.

        The mapping must provide ``pipeline_version`` and ``pipeline_name``
        (copied onto ``self``) and a non-empty ``steps`` entry; ``memory`` is
        optional and defaults to ``False``.

        Raises:
            KeyError: A mandatory entry is absent from the configuration.
            AssertionError: No steps are configured, or the backend ended up
                with a different number of steps than requested.
        """
        # Previously the popped value was never used and ``self.conf`` was read
        # everywhere, so a configuration passed by the caller was ignored.
        # ``self.conf`` is only touched when no override is supplied.
        conf = kwargs.pop('conf') if 'conf' in kwargs else self.conf

        # parse configuration
        # ``.get(arg)`` alone returns None for absent keys, so the "mandatory"
        # branch could never run; a sentinel tells "absent" from "set to None".
        missing = object()
        for arg in ['pipeline_version', 'pipeline_name']:
            try:
                value = conf.get(arg, missing)
                if value is missing:
                    raise KeyError(arg)
                setattr(self, arg, value)
            except Exception:
                error(
                    '"%s" is mandatory, not found in the provided configuration:'
                    % arg)
                raise

        memory = conf.get('memory', False)
        steps_cnf = conf.get('steps', None)

        # create pipeline steps
        # Explicit check: survives ``python -O`` and covers a missing "steps"
        # entry, which used to fail with TypeError on ``len(None)``.
        if steps_cnf is None or len(steps_cnf) < 1:
            raise AssertionError('Pipeline without any components.')
        self.pipeline_steps = self._create_steps(steps_cnf, conf)

        # pipeline backend
        super().__init__(steps=self.pipeline_steps, memory=memory)

        # the backend must hold exactly one step per configured step
        if len(steps_cnf) != len(self.steps):
            error('Pipeline components where not appended properly.')
            error('The requested pipeline configuration:')
            pprint(steps_cnf)
            error('Was parsed into the pipeline backed as follows:')
            pprint(self.steps)

            raise AssertionError('Pipeline components where not appended properly.')
コード例 #9
0
def persist(backend, insert_qry):
    """Run an insert statement through ``backend`` and report whether it committed.

    Args:
        backend: Object exposing ``execute_insert(query)``.
        insert_qry: The insert statement to execute.

    Returns:
        Whatever ``backend.execute_insert`` returned, or ``False`` when the
        insert failed with error code ``'23505'``, which this function
        reports as a primary key violation.

    Raises:
        Exception: Any other failure; it is logged and re-raised unchanged.
    """
    committed = False
    try:
        committed = backend.execute_insert(insert_qry)
    except Exception as err:
        # Not every exception carries ``pgcode``; reading it unconditionally
        # turned unrelated failures into an AttributeError raised from the
        # handler, hiding the original error.
        pgcode = getattr(err, 'pgcode', None)

        if pgcode == '23505':
            warn('Caught primary key vioaltion, when %s' % insert_qry)
        else:
            error('Throwing unknown runtime exception, when: %s' % insert_qry)
            print(err, pgcode)
            raise
    else:
        # Logged only on success, outside the guarded call.
        debug('Excecuted query: %s' % insert_qry)

    return committed
コード例 #10
0
ファイル: streaming.py プロジェクト: vsyropou/joker
    def __init__(self, *args, nthreads=1):
        """Store the pipeline, the streamer and the bookkeeping counters.

        Args:
            *args: ``args[0]`` is stored as the pipeline and ``args[1]`` as
                the streamer; further positional arguments are ignored.
            nthreads: Value stored as the number of threads.

        Raises:
            IndexError: Fewer than two positional arguments were given.
        """
        # parse args
        try:
            self._pipeline = args[0]
            self._streamer = args[1]
        except IndexError:
            # This used to log 'naaaaaa' and carry on, leaving an instance
            # without ``_pipeline``/``_streamer`` that failed later on.
            error('Expected a pipeline and a streamer as positional '
                  'arguments, got %d argument(s)' % len(args))
            raise

        self._num_threads = nthreads

        self.input_count = np.int64(0)
        self.output_count = np.int64(0)

        self.time_batch_excecution = False
        self._time_measurements = []
コード例 #11
0
def persist_urls(*args):
    """Persist one ``(id, url)`` row per url via ``persist``.

    Args:
        *args: Positional, in this order: the db backend, a nested list of
            urls (one inner list per sentence), the sentence ids paired with
            those inner lists, and the target table name.

    Returns:
        A list with the result of ``persist`` for every inserted row; empty
        when there are no urls.

    Raises:
        RuntimeError: Fewer than four positional arguments were given.
    """
    # TODO: This needs to be updated to be compatible with pandas
    try:  # parse args
        db = args[0]     # db_backend
        urlsl = args[1]  # nested list of urls
        ids = args[2]    # sentence_ids
        name = args[3]   # table_name
    except IndexError as err:
        # Indexing a tuple raises IndexError; the previous ``except KeyError``
        # never matched, so neither the log line nor the RuntimeError was
        # ever produced.
        error('Not enough arguments to persist urls')
        raise RuntimeError(err) from err

    # One "('<id>','<url>')" values tuple per url, in input order.
    # NOTE(review): values are interpolated straight into SQL text; prefer
    # parameterised queries if the urls can come from untrusted input.
    insert_data = [
        "('%s','%s')" % (sentence_id, url)
        for sentence_id, urls in zip(ids, urlsl)
        for url in urls
    ]

    return [persist(db, insert_qry(name, row)) for row in insert_data]
コード例 #12
0
    def __init__(self, *args, **kwargs):
        """Initialise the base class and validate the url-persistence setup.

        Missing ``persist_urls``, ``tweet_ids_column_name`` and ``table_name``
        attributes receive defaults. When ``persist_urls`` is truthy, the
        instance must also provide ``sentence_ids``, a valid db backend and
        the target table.

        Raises:
            AssertionError: ``persist_urls`` is set but ``sentence_ids`` is
                missing or empty, or one of the backend/table checks fails.
        """
        super().__init__(*args, **kwargs)

        # check attributes
        self._check_derived_class_argument(
            ["persist_urls", "tweet_ids_column_name", "table_name"],
            [False, 'id', "urls"])

        # check that urls can be persisted
        if self.persist_urls:
            # data availability
            # ``getattr`` with a default also covers a completely absent
            # attribute, which used to raise AttributeError and skip the
            # message; the explicit raise survives ``python -O``.
            if not getattr(self, 'sentence_ids', None):
                error(
                    '"sentence_ids" argument is required when "persist_urls" is True.'
                )
                raise AssertionError(
                    '"sentence_ids" argument is required when "persist_urls" is True.'
                )

            has_valid_db_backend(self)
            has_table(self.db, self.table_name)
コード例 #13
0
def _fail_engine_check(message):
    """Log ``message`` through ``error`` and raise it as an AssertionError."""
    error(message)
    raise AssertionError(message)


def instansiate_engine(*arguments):
    """Import a class by name and instantiate it.

    Args:
        *arguments: ``(module_name, class_name[, args[, kwargs]])`` where
            ``args`` is a list of positional arguments and ``kwargs`` a dict
            of keyword arguments for the class constructor.

    Returns:
        The new instance.

    Raises:
        AssertionError: The arguments do not have the expected form.
        Exception: Whatever the import or the constructor raised; logged
            and re-raised unchanged.
    """
    # check required args
    # ``assert cond, error(...)`` used the return value of ``error`` as the
    # assertion message and vanished under ``python -O``; the helper logs and
    # raises the same text instead.
    if len(arguments) < 2:
        _fail_engine_check(
            'Parsed arguments "%s" cannot be used to instantiate class' % (arguments,))
    module_name = arguments[0]
    class_name = arguments[1]
    if not (isinstance(module_name, str) and isinstance(class_name, str)):
        _fail_engine_check(
            'Module and class names must be of "str" type. Got "%s" and "%s" instead.'
            % (type(module_name), type(class_name)))

    # check optional args
    args = arguments[2] if len(arguments) >= 3 else []
    # ``>= 4`` mirrors the ``args`` test; with ``== 4`` the kwargs were
    # silently dropped as soon as a fifth argument was supplied.
    kwargs = arguments[3] if len(arguments) >= 4 else {}
    if args and not isinstance(args, list):
        _fail_engine_check('Cannot parse "%s" args correctly' % class_name)
    if kwargs and not isinstance(kwargs, dict):
        _fail_engine_check('Cannot parse "%s" kwargs correctly' % class_name)

    # instantiate
    class_proxy = import_class_proxy(module_name, class_name)
    try:
        class_instance = class_proxy(*args, **kwargs)
    except Exception as err:
        error('Cannot instansiate class "%s"' % (class_proxy.__name__))
        print(err)
        raise

    debug('Instansiated class "%s"' % class_proxy.__name__)
    if args:
        debug(' args %s' % args)
    if kwargs:
        debug(' kwargs %s' % kwargs)

    return class_instance
コード例 #14
0
def persist_unknown_words(*args):
    """Placeholder for persisting unknown words; currently always raises.

    Args:
        *args: Intended to be the db backend, the raw insert data and the
            table name, in this order.

    Raises:
        AssertionError: Always, because the feature is not implemented yet.
    """
    # TODO: This needs to be updated to be compatible with pandas
    # Raised explicitly: ``assert False`` is stripped under ``python -O``,
    # which would let execution fall through into the unfinished draft below.
    raise AssertionError('Unknown words persistance is is not ready yet')

    # ---- unfinished draft, unreachable until the raise above is removed ----
    try:  # parse args
        db = args[0]    # db_backend
        data = args[1]  # raw insert data
        name = args[2]  # table_name
    except IndexError as err:  # tuple indexing raises IndexError, not KeyError
        error('Not enough arguments to persist unknown words')
        raise RuntimeError(err) from err

    # helping stuff: keep tokens whose type is exactly ``str``
    uwrds = lambda snt: [w for w in snt if str==type(w)!=int]

    # NOTE(review): ``snts`` and ``lang`` are not defined anywhere in this
    # function; they presumably have to be derived from ``data`` -- confirm
    # before enabling this code.
    unknown_words_nested = [ [(uw,ln) for uw in uwrds(snt)] for snt, ln in zip(snts,lang) if uwrds(snt)]

    unknown_words_flatned = [(uw,l) for unwnst in unknown_words_nested for uw, l in unwnst ]

    insert_data = [row for row in [', '. join(["('%s','%s')"%tpl]) for tpl in unknown_words_flatned]]

    return [ persist(db, insert_qry(name, row)) for row in insert_data]
コード例 #15
0
def persist_sentences(*args):
    """Persist every row of ``data`` via ``persist``.

    Args:
        *args: Positional, in this order: the db backend, the raw insert
            data (must offer ``shape`` and ``apply(func, axis=1)``; assumes a
            pandas DataFrame -- TODO confirm) and the target table name.

    Returns:
        A list with the result of ``persist`` for every row; empty when
        ``data`` has no rows.

    Raises:
        IndexError: Fewer than three positional arguments were given.
    """
    try:  # parse args
        db = args[0]    # db_backend
        data = args[1]  # raw insert data
        name = args[2]  # table_name
    except IndexError:
        # Tuple indexing raises IndexError; the previous ``except KeyError``
        # never matched, so the message below was never logged.
        error('Not enough arguments to persist sentences')
        raise

    def insert_frmter(row):
        """Render a row as "(<first>, '{<second>}')" with square brackets removed."""
        rendered = "(%s, '{%s}')" % (row.values[0], row.values[1])
        return rendered.replace('[', '').replace(']', '')

    # prepare insert
    if data.shape[0] == 0:
        warn('Nothing to persist.')
        return []

    # NOTE(review): values are interpolated straight into SQL text; prefer
    # parameterised queries if the data can come from untrusted input.
    insert_data = data.apply(insert_frmter, axis=1)
    return [persist(db, insert_qry(name, row)) for row in insert_data]
コード例 #16
0
    def __init__(self, *args, **kwargs):
        """Initialise the base class and validate everything persistence needs.

        Missing ``persist_sentences``, ``persist_unknown_words``,
        ``tweet_ids_column_name``, ``language_column_name`` and
        ``table_names`` attributes receive defaults. The constructor then
        requires a valid db backend, a ``language_model`` entry in
        ``table_names`` whose table exists, the ``get_embeding_qry`` helper,
        and, for every enabled persistence flag, a configured existing table.

        Raises:
            KeyError: ``table_names`` has no ``language_model`` entry.
            NameError: ``get_embeding_qry`` is not defined.
            AssertionError: Any other check fails.
        """
        #TODO: reduce the size of checks
        super().__init__(*args, **kwargs)

        # check attributes
        self._check_derived_class_argument([
            "persist_sentences", "persist_unknown_words",
            "tweet_ids_column_name", "language_column_name", 'table_names'
        ], [False, False, "id", "lang", {}])

        # insertion metrics, one slot per enabled persistence task
        self.metrics = {
            key: None
            for key in ['persist_sentences', 'persist_unknown_words']
            if getattr(self, key)
        }

        # guarantee db engine
        has_valid_db_backend(self)

        # guarantee language model (word embeddings)
        try:
            self.language_model = self.table_names['language_model']
        except KeyError:
            error(
                'Specify "wrapper_table_names.language_model" in the pipeline conf file'
            )
            raise

        has_table(self.db, self.language_model)

        # A missing helper raises NameError rather than AssertionError, so
        # both are reported; the handler used to catch only the latter.
        try:
            assert get_embeding_qry
        except (AssertionError, NameError):
            error(
                'Cannot locate "get_embeding_qry" from module utilities.postgres_queries'
            )
            raise

        # guarantee persistence of sentences and unknown words
        for flag_name in ['persist_sentences', 'persist_unknown_words']:
            if not getattr(self, flag_name):
                continue

            # The old ``assert`` sat inside ``except KeyError``, so the hint
            # was never logged. The raised type stays AssertionError.
            if flag_name not in self.table_names:
                msg = 'Specify wrapper_table_names."%s" in the pipeline conf file' % flag_name
                error(msg)
                raise AssertionError(msg)

            has_table(self.db, self.table_names[flag_name])
コード例 #17
0
def has_table(backend, table_name):
    """Check that ``table_name`` is among the tables reported by ``backend``.

    Runs ``list_of_tables_qry`` through ``backend.execute`` and compares
    ``table_name`` with the third field of every returned row (presumably
    the table name column -- confirm against the query).

    Args:
        backend: Object exposing ``execute(query)`` that returns rows.
        table_name: Name to look for.

    Raises:
        AssertionError: The name is not found; the failure is logged through
            ``error`` first.
    """
    known_tables = [row[2] for row in backend.execute(list_of_tables_qry)]

    # Explicit check instead of ``assert`` so the validation is not stripped
    # when Python runs with ``-O``.
    if table_name not in known_tables:
        message = 'Cannot locate table "%s" in the database' % (table_name,)
        error(message)
        raise AssertionError(message)