Exemple #1
0
    def open(self, host, user, pwd, dbn, port=3306):
        '''
            open the mysql storage, creating the database if it does not exist

            connects with MySQLdb, then calls _create() and _use() when
            _exists() is false, or _use() and _load() otherwise, and finally
            rebuilds the table index.
        :param host: mysql server host
        :param user: user name used to connect
        :param pwd: password used to connect, masked in every log message
        :param dbn: database name
        :param port: int, mysql server port, 3306 by default
        :return: self
        :raise: any exception raised while opening is logged and re-raised
                with its original traceback
        '''
        try:
            #keep the connection parameters on the instance
            self.host, self.port, self.user, self.pwd = host, port, user, pwd
            self.dbn = dbn
            self.dbc = MySQLdb.connect(host=host,
                                       user=user,
                                       passwd=pwd,
                                       port=port)

            if not self._exists():
                #create database
                self._create()
                self._use()
            else:
                # load database
                self._use()
                self._load()

            self._rebuild_tindex()

            #log before returning, a statement placed after the return
            #is never executed
            logger.info(
                "open storage mysql://%s:%s@%s:%d/%s...success. %d tables.",
                user, "***", host, port, dbn, len(self.tables))
            return self
        except Exception as e:
            logger.error(
                "open storage mysql://%s:%s@%s:%d/%s...failed. error: %s",
                user, "***", host, port, dbn, str(e))
            #bare raise keeps the traceback of the original failure
            raise
Exemple #2
0
    def create_table(self, table):
        '''
            create @table in the current database unless an equal table is
            already loaded

            a loaded table with the same name as the created one is removed
            from the table list before the new one is appended.
        :param table: table definition, compared with the loaded tables and
                      handed to FSTable.create
        :return: None
        '''
        with clock(self.lock):
            # nothing to do when an equal table has been loaded already
            if any(loaded.table == table for loaded in self.tables):
                logger.info("create table %s...exists.", table.name)
                return

            # build the new table through FSTable
            created = FSTable().create(self.path, table)

            # drop the first loaded table carrying the same name
            for pos, loaded in enumerate(self.tables):
                if loaded.table.name == created.name:
                    self.tables.pop(pos)
                    break

            self.tables.append(created)
            self._rebuild_tindex()
Exemple #3
0
    def select(self):
        '''
            select all data from table

            the first line of the data file holds the comma separated field
            names, every following line is split into one record.
        :return: list of dict, one dict per record mapping each field name
                 to the value returned by str2obj
        :raise: any exception raised while reading is logged and re-raised
                with its original traceback
        '''
        try:
            with clock(self.lock):
                with open(self.data_file, "r") as fdata:
                    #read field names
                    nfields = strips(fdata.readline().strip().split(","))

                    #read data records until readline() returns ""
                    models = []
                    for line in iter(fdata.readline, ""):
                        vfields = strips(line.strip().split(","))
                        #look values up by position so that a record with
                        #too few values raises instead of losing fields
                        models.append(dict(
                            (nfield, str2obj(vfields[idx], ','))
                            for idx, nfield in enumerate(nfields)))

                    return models
        except Exception as e:
            #failures are errors, as in the other storage methods
            logger.error("select data from table %s...failed. error: %s", self.name, str(e))
            #bare raise keeps the traceback of the original failure
            raise
Exemple #4
0
    def load(self, dbpath, name):
        '''
            load table @name stored under @dbpath

            reads the table definition with desc(), then creates the data
            file when it is missing, or upgrades / replaces it when its
            header does not match the field names of the table.
        :param dbpath: path of the database the table directory lives in
        :param name: table name, also the name of the table directory
        :return:  self
        :raise: any exception raised while loading is logged and re-raised
                with its original traceback
        '''
        try:
            #initialize table parameters
            self.name = name

            self.path = join_paths(dbpath, name)
            self.table_file = join_paths(self.path, "table")
            self.data_file = join_paths(self.path, "data")

            self.table = self.desc()

            #load data file
            if not is_file(self.data_file):
                #create data file if not exists
                self._create_data_file()
            else:
                #only the header line is needed; the handle is closed again
                #before the data file is upgraded or replaced
                with open(self.data_file) as fdata:
                    nfields = strips(fdata.readline().split(","))

                #replace old data file if needed
                if self.table.nfields() != nfields:
                    if is_subset(nfields, self.table.nfields()):
                        self._upgrade_data_file()
                    else:
                        self._replace_data_file()

            logger.info("loading table %s...success.", self.name)
            return self
        except Exception as e:
            #failures are errors, as in create()
            logger.error("loading table %s...failed. error: %s", self.name, str(e))
            #bare raise keeps the traceback of the original failure
            raise
Exemple #5
0
    def open(self, path):
        '''
            open storage or create it if not exist

            under the storage lock: creates the storage with _create() when
            @path does not exist, loads it with _load() otherwise, then
            rebuilds the table index.
        :param path: path of the storage
        :return: self
        :raise: any exception raised while opening is logged and re-raised
                with its original traceback
        '''
        try:
            with clock(self.lock):
                #init storage path
                self.path = path

                if not path_exists(self.path):
                    #create database
                    self._create()
                else:
                    # load database
                    self._load()

                self._rebuild_tindex()

                #log before returning, a statement placed after the return
                #is never executed
                logger.info("open storage %s...success. %d tables.",
                            self.path, len(self.tables))
                return self
        except Exception as e:
            #log the argument: self.path is not assigned yet when the
            #failure happens while taking the lock
            logger.error("open storage %s...failed. error: %s", path,
                         str(e))
            #bare raise keeps the traceback of the original failure
            raise
Exemple #6
0
 def launch(self):
     '''
         launch extractor

         runs self._launch through Helper.timerun and logs the time used.
         an IOError is logged and suppressed, any other exception
         propagates to the caller.
     :return: None
     '''
     try:
         time_used, _ = Helper.timerun(self._launch)
         logger.info("extractor: launch extractor - %s, time used: %fs",
                     self.name(), time_used)
     except IOError as e:
         #still best effort, but the failure is no longer invisible
         logger.info("extractor: launch extractor - %s, error: %s",
                     self.name(), str(e))
Exemple #7
0
    def update(self, uri, extras):
        '''
            update uri context with crawl response extras data

            the work is done by self._update, run through Helper.timerun so
            that the time used can be logged.
        :param uri: object, Uri object
        :param extras: dict, extras data for crawled response
        :return: None
        '''
        elapsed, _ = Helper.timerun(self._update, uri, extras)
        logger.info(
            "linker: update link %s, updated. time used:%fs",
            uri.url(), elapsed)
Exemple #8
0
 def persist(self):
     '''
         persist extractor data

         runs self._persist through Helper.timerun and logs the time used.
         any exception is logged and suppressed.
     :return: None
     '''
     try:
         time_used, _ = Helper.timerun(self._persist)
         logger.info("extractor: persist extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         #str(e) instead of e.message: the attribute is deprecated and is
         #empty for exceptions built from several arguments
         logger.info("extractor: persist extractor - %s, error: %s",
                     self.name(), str(e))
Exemple #9
0
 def desc(self):
     '''
         describe the table by parsing the content of the table file
     :return:  Table, the result of Table().fromstr on the file content
     :raise: any exception raised while reading or parsing is logged and
             re-raised with its original traceback
     '''
     try:
         with open(self.table_file) as ftable:
             return Table().fromstr(ftable.read())
     except Exception as e:
         #failures are errors, as in create()
         logger.error("describe table %s...failed. error: %s", self.name, str(e))
         #bare raise keeps the traceback of the original failure
         raise
Exemple #10
0
 def shutdown(self):
     '''
         shutdown extractor

         runs self._shutdown through Helper.timerun and logs the time used.
         any exception is logged and suppressed.
     :return: None
     '''
     try:
         time_used, _ = Helper.timerun(self._shutdown)
         logger.info("extractor: shutdown extractor - %s, time used: %fs",
                     self.name(), time_used)
     except Exception as e:
         #str(e) instead of e.message: the attribute is deprecated and is
         #empty for exceptions built from several arguments
         logger.info("extractor: shutdown extractor - %s, error: %s",
                     self.name(), str(e))
Exemple #11
0
    def create(self, dbpath, table):
        '''
            create table @table under @dbpath

            makes the table directory, creates or replaces the table file,
            then creates, upgrades or replaces the data file depending on
            whether its header matches the field names of the table.
        :param dbpath: path of the database the table directory lives in
        :param table: table definition; its name is used as directory name
        :return self
        :raise: any exception raised while creating is logged and re-raised
                with its original traceback
        '''
        try:
            #initialize table parameters
            self.table = table
            self.name = table.name

            self.path = join_paths(dbpath, table.name)
            self.table_file = join_paths(self.path, "table")
            self.data_file = join_paths(self.path, "data")

            #create table directory if it is not exists
            make_dirs(self.path)

            #create or replace table file
            if not is_file(self.table_file):
                #create new table file
                self._create_table_file()
            elif self.table != self.desc():
                #stored definition differs from the new one, replace it
                self._replace_table_file()

            #create or upgrade or replace data file
            if not is_file(self.data_file):
                #create new data file
                self._create_data_file()
            else:
                #only the header line is needed; the handle is closed again
                #before the data file is upgraded or replaced
                with open(self.data_file) as fdata:
                    nfields = strips(fdata.readline().split(","))

                #replace old data file if needed
                if self.table.nfields() != nfields:
                    if is_subset(nfields, self.table.nfields()):
                        self._upgrade_data_file()
                    else:
                        self._replace_data_file()

            logger.info("create table %s...success.", self.name)
            return self
        except Exception as e:
            logger.error("create table %s...failed. error: %s", self.name, str(e))
            #bare raise keeps the traceback of the original failure
            raise
Exemple #12
0
 def close(self):
     '''
         close database

         closes the connection held in self.dbc when it is not None. the
         password is masked in the log messages.
     :return: None
     :raise: any exception raised while closing is logged and re-raised
             with its original traceback
     '''
     #close database connection
     try:
         if self.dbc is not None:
             self.dbc.close()
         logger.info("close storage mysql://%s:%s@%s:%d/%s...success.",
                     self.user, "***", self.host, self.port, self.dbn)
     except Exception as e:
         #a failed close is an error, not an informational event
         logger.error(
             "close storage mysql://%s:%s@%s:%d/%s...failed. error: %s",
             self.user, "***", self.host, self.port, self.dbn, str(e))
         #bare raise keeps the traceback of the original failure
         raise
Exemple #13
0
class Linker(Launcher):
    '''
        linker who manage crawl links from spider
    '''
    def __init__(self, workdir, name="linker"):
        '''
            initialize linker instance
        :param workdir: working directory, passed on to Launcher
        :param name: string, linker name, an unique identifier
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
            launch linker

            runs self._launch through Helper.timerun and logs the time
            used. any exception is logged and suppressed.
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("linker: launch linker - %s, time used: %fs",
                        self.name(), time_used)
        except Exception as e:
            #IOError is an Exception too: it is still suppressed, but it is
            #logged now instead of vanishing silently.
            #str(e) instead of e.message: the attribute is deprecated and
            #is empty for exceptions built from several arguments
            logger.info("linker: launch linker - %s, error: %s", self.name(),
                        str(e))
Exemple #14
0
class Extractor(Launcher):
    '''
        base class for all extractor
    '''
    def __init__(self, workdir, name="extractor"):
        '''
            initialize extractor instance
        :param workdir: working directory, passed on to Launcher
        :param name: string, extractor name, unique identifier for the extractor instance
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
            launch extractor

            runs self._launch through Helper.timerun and logs the time
            used. any exception is logged and suppressed.
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("extractor: launch extractor - %s, time used: %fs",
                        self.name(), time_used)
        except Exception as e:
            #IOError is an Exception too: it is still suppressed, but it is
            #logged now instead of vanishing silently.
            #str(e) instead of e.message: the attribute is deprecated and
            #is empty for exceptions built from several arguments
            logger.info("extractor: launch extractor - %s, error: %s",
                        self.name(), str(e))
Exemple #15
0
    def pull(self):
        '''
            pull next link from linker

            the link comes from self._pull, run through Helper.timerun so
            that the time used can be logged.
        :return: object, the value of uri() of the pulled link, or None
                 when there are no more links
        '''
        time_used, link = Helper.timerun(self._pull)

        # nothing left to pull
        if link is None:
            logger.info(
                "linker: pull link none, no more links. time used: %fs",
                time_used)
            return None

        logger.info("linker: pull link %s, pulled. time used: %fs",
                    link.uri().url(), time_used)
        return link.uri()
Exemple #16
0
    def parse(self, uri, content):
        '''
            parse @content with the actual @_parse method when @uri is
            accepted, logging the number of links and the time used
        :param uri: object, uri for the @content
        :param content: string, content for the @url
        :return: the links returned by @_parse, or None when @uri is not
                 accepted
        '''
        if self.accept(uri):
            elapsed, found = Helper.timerun(self._parse, uri, content)
            logger.info(
                "parser: parse links: %s, parsed. links: %d, time used: %fs",
                uri.url(), len(found), elapsed)
            return found

        # uri rejected, nothing is parsed
        return None
Exemple #17
0
    def extract(self, uri, content):
        '''
            extract data from @content with @_extract when @uri is
            accepted, logging the time used
        :param uri: object, @Uri object of content
        :param content: string, content of @uri
        :return: object, the result of @_extract, or None when @uri is not
                 accepted
        '''
        if self.accept(uri):
            elapsed, extracted = Helper.timerun(self._extract, uri, content)
            logger.info(
                "extractor: extract data from: %s, extracted. time used: %fs",
                uri.url(), elapsed)
            return extracted

        # uri rejected, nothing is extracted
        return None
Exemple #18
0
 def insert(self, models):
     '''
         insert data to table

         appends one comma separated line per model to the data file, with
         the values ordered like the field names of the table.
     :param models: iterable of records offering get(); a field missing
                    from a record is handed to objtostr as None
     :return: None
     :raise: any exception raised while writing is logged and re-raised
             with its original traceback
     '''
     try:
         with clock(self.lock):
             #the field order is the same for every record, look it up once
             nfields = self.table.nfields()
             with open(self.data_file, "a") as fdata:
                 #build every line first so that a conversion failure
                 #leaves the data file untouched
                 lines = [
                     "%s\n" % ",".join(
                         [objtostr(model.get(nfield), ',')
                          for nfield in nfields])
                     for model in models
                 ]
                 fdata.writelines(lines)
     except Exception as e:
         #failures are errors, as in create()
         logger.error("insert data to table %s...failed. error: %s", self.name, str(e))
         #bare raise keeps the traceback of the original failure
         raise
Exemple #19
0
    def register(self, linker):
        '''
            make @linker the current linker of the linker manager and log
            what happened to the previous one
        :param linker: object, linker to be loaded, may be None
        :return: object, old linker or None
        '''
        previous = self.__linker
        self.__linker = linker

        if linker is not None:
            logger.info("linker manager: register new linker %s.",
                        linker.name())
        elif previous is None:
            # nothing was registered before and nothing is registered now
            logger.warning(
                "linker manager: linker is none, no linker registered.")
        else:
            # registering None drops the previous linker
            logger.warning(
                "linker manager: linker is none, old linker %s is unregistered.",
                previous.name())

        return previous
Exemple #20
0
    def create_table(self, table):
        '''
            create @table in the current database unless an equal table is
            already loaded

            a loaded table with the same name as @table is removed from the
            table list before the new one is appended.
        :param table: table definition, compared with the loaded tables and
                      handed to DBTable.create
        :return: None
        '''
        # nothing to do when an equal table has been loaded already
        if any(loaded.table == table for loaded in self.tables):
            logger.info("create table %s...exists.", table.name)
            return

        # build the new table through DBTable
        created = DBTable().create(self.dbc, table)

        # drop the first loaded table carrying the same name
        for pos, loaded in enumerate(self.tables):
            if loaded.table.name == table.name:
                self.tables.pop(pos)
                break

        self.tables.append(created)
        self._rebuild_tindex()
Exemple #21
0
class Filter(Launcher):
    '''
        filter base class, use white list rules
    '''
    def __init__(self, workdir, name="filter"):
        '''
            initialize filter instance
        :param workdir: working directory, passed on to Launcher
        :param name: string, filter name
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
            launch filter

            runs self._launch through Helper.timerun and logs the time
            used. any exception is logged and suppressed.
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("filter: launch filter - %s, time used: %fs",
                        self.name(), time_used)
        except Exception as e:
            #IOError is an Exception too: it is still suppressed, but it is
            #logged now instead of vanishing silently.
            #str(e) instead of e.message: the attribute is deprecated and
            #is empty for exceptions built from several arguments
            logger.info("filter: launch filter - %s, error: %s", self.name(),
                        str(e))
Exemple #22
0
    def push(self, uri):
        '''
            push a uri to linker unless it exists already or is filtered

            the actual push is done by self._push, run through
            Helper.timerun so that the time used can be logged.
        :param uri: object, Uri object
        :return: None, the result of self._push is not handed back
        '''
        if self.exists(uri):
            logger.info("linker: push link %s, exists.", uri.url())
        elif not self.accept(uri):
            logger.info("linker: push link %s, filtered.", uri.url())
        else:
            elapsed, _ = Helper.timerun(self._push, uri)
            logger.info(
                "linker: push link %s, pushed. time used:%fs",
                uri.url(), elapsed)