def open(self, host, user, pwd, dbn, port=3306):
    '''
    open database or create it if it does not exist
    :param host: string, mysql server host
    :param user: string, mysql user name
    :param pwd: string, mysql password
    :param dbn: string, database name
    :param port: int, mysql server port, default 3306
    :return: self
    '''
    try:
        # remember connection parameters for later logging / close()
        self.host, self.port, self.user, self.pwd = host, port, user, pwd
        self.dbn = dbn
        self.dbc = MySQLdb.connect(host=host, user=user, passwd=pwd, port=port)
        if not self._exists():
            # create database
            self._create()
            self._use()
        else:
            # load existing database
            self._use()
            self._load()
        self._rebuild_tindex()
        # FIX: in the original the success log followed "return self" and
        # was unreachable; log success before returning.
        logger.info(
            "open storage mysql://%s:%s@%s:%d/%s...success. %d tables.",
            user, pwd, host, port, self.dbn, len(self.tables))
        return self
    except Exception as e:
        logger.error(
            "open storage mysql://%s:%s@%s:%d/%s...failed. error: %s",
            user, pwd, host, port, self.dbn, str(e))
        # bare raise preserves the original traceback ("raise e" resets it)
        raise
def create_table(self, table):
    '''
    create table in current database
    :param table: object, Table definition to create
    :return: None
    '''
    with clock(self.lock):
        # nothing to do when an identical table has already been loaded
        for t in self.tables:
            if t.table == table:
                logger.info("create table %s...exists.", table.name)
                return
        # FIX: keep the incoming Table in "table" instead of shadowing it
        # with the created FSTable (consistent with the DB variant, which
        # uses a separate "dbtable" name)
        fstable = FSTable().create(self.path, table)
        # drop a previously loaded table with the same name, if any
        for i in range(len(self.tables)):
            if self.tables[i].table.name == fstable.name:
                self.tables.pop(i)
                break
        self.tables.append(fstable)
        self._rebuild_tindex()
def select(self):
    '''
    select all data from table
    :return: list, records as dicts keyed by field name
    :raises: re-raises any I/O or parse error after logging
    '''
    try:
        with clock(self.lock):
            with open(self.data_file, "r") as fdata:
                models = []
                # first line of the data file holds the field names
                nfields = strips(fdata.readline().strip().split(","))
                # remaining lines are data records
                data = fdata.readline()
                while data:
                    vfields = strips(data.strip().split(","))
                    model = {}
                    for idx in range(len(nfields)):
                        model[nfields[idx]] = str2obj(vfields[idx], ',')
                    models.append(model)
                    data = fdata.readline()
                return models
    except Exception as e:
        # FIX: failures are logged at error level, consistent with create()
        logger.error("select data from table %s...failed. error: %s",
                     self.name, str(e))
        raise
def load(self, dbpath, name):
    '''
    load an existing table from @dbpath
    :param dbpath: string, database directory
    :param name: string, table name
    :return: self
    :raises: re-raises any error after logging
    '''
    try:
        # initialize table parameters
        self.name = name
        self.path = join_paths(dbpath, name)
        self.table_file = join_paths(self.path, "table")
        self.data_file = join_paths(self.path, "data")
        self.table = self.desc()
        # load data file
        if not is_file(self.data_file):
            # create data file if it does not exist
            self._create_data_file()
        else:
            # upgrade or replace the data file if the schema changed
            with open(self.data_file) as fdata:
                nfields = strips(fdata.readline().split(","))
                if self.table.nfields() != nfields:
                    if is_subset(nfields, self.table.nfields()):
                        # new schema is a superset: keep data, add columns
                        self._upgrade_data_file()
                    else:
                        # incompatible schema: rebuild the data file
                        self._replace_data_file()
        logger.info("loading table %s...success.", self.name)
        return self
    except Exception as e:
        # FIX: failures are logged at error level, consistent with create()
        logger.error("loading table %s...failed. error: %s",
                     self.name, str(e))
        raise
def open(self, path):
    '''
    open storage or create it if it does not exist
    :param path: string, storage root directory
    :return: self
    '''
    try:
        with clock(self.lock):
            # init storage path
            self.path = path
            if not path_exists(self.path):
                # create database
                self._create()
            else:
                # load existing database
                self._load()
            self._rebuild_tindex()
            # FIX: in the original the success log followed "return self"
            # and was unreachable; log success before returning.
            logger.info("open storage %s...success. %d tables.",
                        self.path, len(self.tables))
            return self
    except Exception as e:
        logger.error("open storage %s...failed. error: %s",
                     self.path, str(e))
        # bare raise preserves the original traceback
        raise
def launch(self):
    '''
    launch extractor, timing the underlying _launch call
    :return: None
    '''
    try:
        elapsed, _ = Helper.timerun(self._launch)
        logger.info("extractor: launch extractor - %s, time used: %fs",
                    self.name(), elapsed)
    except IOError:
        # best effort: IO errors during launch are deliberately ignored
        pass
def update(self, uri, extras):
    '''
    update uri context with crawl response extras data
    :param uri: object, Uri object
    :param extras: dict, extras data for crawled response
    :return: None
    '''
    elapsed, _ = Helper.timerun(self._update, uri, extras)
    logger.info("linker: update link %s, updated. time used:%fs",
                uri.url(), elapsed)
def persist(self):
    '''
    persist extractor data, timing the underlying _persist call
    :return: None
    '''
    try:
        time_used, _ = Helper.timerun(self._persist)
        logger.info("extractor: persist extractor - %s, time used: %fs",
                    self.name(), time_used)
    except Exception as e:
        # FIX: e.message is deprecated and absent in python 3; use str(e)
        logger.info("extractor: persist extractor - %s, error: %s",
                    self.name(), str(e))
def desc(self):
    '''
    describe table: read its schema back from the table file
    :return: Table
    :raises: re-raises any I/O or parse error after logging
    '''
    try:
        with open(self.table_file) as ftable:
            return Table().fromstr(ftable.read())
    except Exception as e:
        logger.info("describe table %s...failed. error: %s",
                    self.name, str(e))
        raise e
def shutdown(self):
    '''
    shutdown extractor, timing the underlying _shutdown call
    :return: None
    '''
    try:
        time_used, _ = Helper.timerun(self._shutdown)
        logger.info("extractor: shutdown extractor - %s, time used: %fs",
                    self.name(), time_used)
    except Exception as e:
        # FIX: e.message is deprecated and absent in python 3; use str(e)
        logger.info("extractor: shutdown extractor - %s, error: %s",
                    self.name(), str(e))
def create(self, dbpath, table):
    '''
    create table under @dbpath, reusing/upgrading existing files when possible
    :param dbpath: string, database directory
    :param table: object, Table definition
    :return: self
    :raises: re-raises any error after logging
    '''
    try:
        # initialize table parameters
        self.table = table
        self.name = table.name
        self.path = join_paths(dbpath, table.name)
        self.table_file = join_paths(self.path, "table")
        self.data_file = join_paths(self.path, "data")
        # create table directory if it does not exist
        make_dirs(self.path)
        # create or replace table file, then create/upgrade/replace data file
        self._init_table_file()
        self._init_data_file()
        logger.info("create table %s...success.", self.name)
        return self
    except Exception as e:
        logger.error("create table %s...failed. error: %s",
                     self.name, str(e))
        # bare raise preserves the original traceback
        raise

def _init_table_file(self):
    '''create the table file, or replace it when the schema changed'''
    if is_file(self.table_file):
        old_table = self.desc()
        if self.table != old_table:
            # schema changed: replace table file
            self._replace_table_file()
        # else: new table is identical to the existing one, keep it
    else:
        # create new table file
        self._create_table_file()

def _init_data_file(self):
    '''create the data file, or upgrade/replace it when fields changed'''
    if is_file(self.data_file):
        # compare the stored header with the new field list
        with open(self.data_file) as fdata:
            nfields = strips(fdata.readline().split(","))
        if self.table.nfields() != nfields:
            if is_subset(nfields, self.table.nfields()):
                # new schema is a superset: keep data, add columns
                self._upgrade_data_file()
            else:
                # incompatible schema: rebuild the data file
                self._replace_data_file()
    else:
        # create new data file
        self._create_data_file()
def close(self):
    '''
    close database connection
    :return: None
    :raises: re-raises any error after logging
    '''
    try:
        if self.dbc is not None:
            self.dbc.close()
        logger.info("close storage mysql://%s:%s@%s:%d/%s...success.",
                    self.user, self.pwd, self.host, self.port, self.dbn)
    except Exception as e:
        # FIX: failures are logged at error level, consistent with open()
        logger.error(
            "close storage mysql://%s:%s@%s:%d/%s...failed. error: %s",
            self.user, self.pwd, self.host, self.port, self.dbn, str(e))
        raise
class Linker(Launcher):
    '''
    linker who manages crawl links from spider
    '''
    def __init__(self, workdir, name="linker"):
        '''
        initialize linker instance
        :param workdir: string, working directory for the linker
        :param name: string, linker name, an unique identifier
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
        launch linker, timing the underlying _launch call
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("linker: launch linker - %s, time used: %fs",
                        self.name(), time_used)
        except IOError:
            # best effort: IO errors during launch are deliberately ignored
            pass
        except Exception as e:
            # FIX: e.message is deprecated and absent in python 3; use str(e)
            logger.info("linker: launch linker - %s, error: %s",
                        self.name(), str(e))
class Extractor(Launcher):
    '''
    base class for all extractors
    '''
    def __init__(self, workdir, name="extractor"):
        '''
        initialize extractor instance
        :param workdir: string, working directory for the extractor
        :param name: string, extractor name, unique identifier for the extractor instance
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
        launch extractor, timing the underlying _launch call
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("extractor: launch extractor - %s, time used: %fs",
                        self.name(), time_used)
        except IOError:
            # best effort: IO errors during launch are deliberately ignored
            pass
        except Exception as e:
            # FIX: e.message is deprecated and absent in python 3; use str(e)
            logger.info("extractor: launch extractor - %s, error: %s",
                        self.name(), str(e))
def pull(self):
    '''
    pull next link from linker
    :return: object, Uri object, or None when no more links are available
    '''
    elapsed, link = Helper.timerun(self._pull)
    if link is None:
        logger.info(
            "linker: pull link none, no more links. time used: %fs",
            elapsed)
        return None
    logger.info("linker: pull link %s, pulled. time used: %fs",
                link.uri().url(), elapsed)
    return link.uri()
def parse(self, uri, content):
    '''
    parse wrapper around the concrete @_parse implementation
    :param uri: object, uri for the @content
    :param content: string, content for the @url
    :return: list, Uri objects, or None when @uri is not accepted
    '''
    if not self.accept(uri):
        return None
    elapsed, links = Helper.timerun(self._parse, uri, content)
    logger.info(
        "parser: parse links: %s, parsed. links: %d, time used: %fs",
        uri.url(), len(links), elapsed)
    return links
def extract(self, uri, content):
    '''
    extract data from content
    :param uri: object, @Uri object of content
    :param content: string, content of @uri
    :return: object, extract result object, or None when @uri is not accepted
    '''
    if not self.accept(uri):
        return None
    elapsed, result = Helper.timerun(self._extract, uri, content)
    logger.info(
        "extractor: extract data from: %s, extracted. time used: %fs",
        uri.url(), elapsed)
    return result
def insert(self, models):
    '''
    append data records to the table's data file
    :param models: list, records as dicts keyed by field name
    :return: None
    :raises: re-raises any I/O error after logging
    '''
    try:
        with clock(self.lock):
            with open(self.data_file, "a") as fdata:
                # serialize every record into one csv line per model
                names = self.table.nfields()
                lines = [
                    "%s\n" % ",".join(
                        objtostr(model.get(n), ',') for n in names)
                    for model in models
                ]
                fdata.writelines(lines)
    except Exception as e:
        logger.info("insert data to table %s...failed. error: %s",
                    self.name, str(e))
        raise e
def register(self, linker):
    '''
    register @linker into linker manager, replacing the current one
    :param linker: object, linker to be loaded (may be None)
    :return: object, previously registered linker or None
    '''
    old, self.__linker = self.__linker, linker
    if self.__linker is not None:
        logger.info("linker manager: register new linker %s.",
                    self.__linker.name())
    elif old is None:
        logger.warning(
            "linker manager: linker is none, no linker registered.")
    else:
        logger.warning(
            "linker manager: linker is none, old linker %s is unregistered.",
            old.name())
    return old
def create_table(self, table):
    '''
    create table in current database
    :param table: object, Table definition to create
    :return: None
    '''
    # nothing to do when an identical table already exists
    for existing in self.tables:
        if existing.table == table:
            logger.info("create table %s...exists.", table.name)
            return
    # create the new table, then replace any stale entry of the same name
    dbtable = DBTable().create(self.dbc, table)
    for idx in range(len(self.tables)):
        if self.tables[idx].table.name == table.name:
            self.tables.pop(idx)
            break
    self.tables.append(dbtable)
    self._rebuild_tindex()
class Filter(Launcher):
    '''
    filter base class, use white list rules
    '''
    def __init__(self, workdir, name="filter"):
        '''
        initialize filter instance
        :param workdir: string, working directory for the filter
        :param name: string, filter name, an unique identifier
        '''
        Launcher.__init__(self, workdir, name)

    def launch(self):
        '''
        launch filter, timing the underlying _launch call
        :return: None
        '''
        try:
            time_used, _ = Helper.timerun(self._launch)
            logger.info("filter: launch filter - %s, time used: %fs",
                        self.name(), time_used)
        except IOError:
            # best effort: IO errors during launch are deliberately ignored
            pass
        except Exception as e:
            # FIX: e.message is deprecated and absent in python 3; use str(e)
            logger.info("filter: launch filter - %s, error: %s",
                        self.name(), str(e))
def push(self, uri):
    '''
    push a uri to linker, skipping duplicates and filtered links
    :param uri: object, Uri object
    :return: None
    '''
    # skip links that are already stored
    if self.exists(uri):
        logger.info("linker: push link %s, exists.", uri.url())
        return
    # skip links rejected by the filter rules
    if not self.accept(uri):
        logger.info("linker: push link %s, filtered.", uri.url())
        return
    elapsed, _ = Helper.timerun(self._push, uri)
    logger.info("linker: push link %s, pushed. time used:%fs",
                uri.url(), elapsed)