# NOTE(review): this fragment appears truncated — it parses the payload but
# never uses the result; compare the fuller copy inside CleaningCrawler below.
def clean_data(cls, content):
    # Parse the crawler payload, expected to be a JSON document.
    charset_name = ''
    try:
        dict = json.loads(content)  # shadows the builtin `dict` — left as-is
    except Exception, e:
        mlog.log().error(e)
        mlog.log().error(content)
def is_connected(self):
    """Probe the FTP control connection with a NOOP command.

    Returns:
        True when the server answers the NOOP, False on any error.
    """
    try:
        self.voidcmd("NOOP")
        return True
    except Exception as e:  # `except X, e` is Python-2-only; `as` works on 2.6+ and 3
        mlog.log().error('ftp error:%s' % e)
        return False
def test():
    """Smoke-test AnalysisEngine against a local backup directory."""
    # NOTE: hard-coded FTP credentials are test fixtures.
    config = {
        'ftp': {
            'type': 1,
            'host': '61.147.114.73',
            'port': 21,
            'user': '******',
            'passwd': '123456x',
            'timeout': 5,
            'local': './'
        }
    }
    analysis_engine = AnalysisEngine(config)
    file_list = analysis_engine.input_data('~/text_storage/60006bak')
    i = 0
    count = len(file_list)
    while i < count:
        # Bug fix: the slice was [i:i + 4] while the step was 5, silently
        # skipping every fifth file (cf. the i + 5 variants elsewhere here).
        tlist = file_list[i:i + 5]
        i += 5
        start_time = time.time()
        for t in tlist:
            analysis_engine.process_file_data(60006, '~/text_storage/60006bak', t, 0)
        end_time = time.time()
        mlog.log().info("analysis file count %d expend %d", i, end_time - start_time)
def main():
    """Create the local `search` table in ./text.db via SQLiteExt."""
    sqlmgr = SQLiteExt("./text.db", 0)
    try:
        create_table_sql = '''CREATE TABLE `search` ( `id` BIGINT NOT NULL, `uid` BIGINT NULL, `title` VARCHAR(128) NULL, `text` VARCHAR(40960) NULL, `created_at` INT NULL, `retweet_count` INT NULL, `reply_count` INT NULL, `fav_count` INT NULL, `retweet_id` INT NULL, `type` INT NULL, `source_link` VARCHAR(256), `edited_at` INT NULL, `pic` VARCHAR(256) NULL, `target` VARCHAR(256) NULL, `source` VARCHAR(256) NULL, PRIMARY KEY (`id`));'''
        sqlmgr.create_table(create_table_sql)
    except Exception as e:
        # The original bound `e` but never logged it — include the cause.
        mlog.log().error("Create table failed: %s" % e)
def create_table(self, sql):
    """Execute a CREATE TABLE statement and commit it.

    When `sql` is None or empty, only an error is logged and nothing runs.
    """
    if sql is None or sql == '':
        mlog.log().error('the [{}] is empty or equal None!'.format(sql))
        return
    cursor = self.__get_cursor()
    cursor.execute(sql)
    self.conn.commit()
    self.__close_all(cursor)
def run(self):
    """Pop one queued item (FIFO) and persist it via the engine.

    Any failure is logged and swallowed so the caller keeps running
    (deliberate best-effort, kept from the original).
    """
    try:
        if self.wait_queue:  # idiomatic truthiness instead of len()
            item = self.wait_queue.pop(0)
            self.engine.save(item['db'], item['data'])
    except Exception as e:
        mlog.log().error('save error:%s' % e)
def nickname_format(cls, str,content):
    # NOTE(review): reply_nickname is computed but never returned — a trailing
    # `return reply_nickname` was probably lost when this file was collapsed.
    # The parameter `str` shadows the builtin; left as-is (public signature).
    reply_nickname = None
    try:
        # Strip a 2-char prefix and 1-char suffix before XML-parsing —
        # assumes a wrapped markup payload; TODO confirm against the caller.
        tree = ET.fromstring(str[2:len(str) - 1])
        lst_node = tree.getiterator('a')
        for node in lst_node:
            # Drop the first character of the first anchor's text
            # (presumably a leading '@'); only the first <a> is used.
            reply_nickname = node.text[1:len(node.text)]
            break
    except Exception, e:
        mlog.log().error(str + "===>" + content)
def drop_table(self, table):
    """Drop `table` if it exists.

    This destroys any data the table holds — use with care. When `table`
    is None or empty, only an error is logged.
    """
    if table is None or table == '':
        mlog.log().error('the [{}] is empty or equal None!'.format(table))
        return
    cursor = self.__get_cursor()
    cursor.execute('DROP TABLE IF EXISTS ' + table)
    self.conn.commit()
    self.__close_all(cursor)
def save(self, sql, data):
    """Run `sql` once per row of `data`, then commit.

    An empty/None `sql` is logged as an error; a None `data` is a no-op.
    """
    if sql is None or sql == '':
        mlog.log().error('the [{}] is empty or equal None!'.format(sql))
        return
    if data is None:
        return
    cursor = self.__get_cursor()
    for row in data:
        cursor.execute(sql, row)
    self.conn.commit()
    self.__close_all(cursor)
def __get_uid(self, content):
    """Collect the distinct uids found in content['dict'].

    Each value in content['dict'] is a list of rows whose index 1 holds a
    uid (inferred from the access pattern below — TODO confirm). Returns a
    task dict {'pid': ..., 'result': {uid: uid}}.
    """
    uid_map = {}  # renamed from `dict`, which shadowed the builtin
    data = content.get('dict')
    for key in data:
        # Fixed 'tabel' typo in the log message.
        mlog.log().info("table name %s content %d", key, len(data[key]))
        for t in data[key]:
            uid = t[1]
            uid_map[uid] = uid
    return {
        'pid': local_task_opercode.XUEQIU_GET_DISCUSSION_UID,
        'result': uid_map
    }
def face_format(cls, content):
    # NOTE(review): `list` (which shadows the builtin) is filled but never
    # returned — a trailing `return list` was probably lost in this paste.
    # Scan the content for imedao.com face <img> tags.
    re_t = re.compile('(<img src=.//assets\\.imedao\\.com).*?(images).*?(face).*?(title).*?(alt).*?(>)', re.DOTALL)
    list = []
    for m in re_t.finditer(content):
        try:
            tree = ET.fromstring(m.group())
            # dict.has_key() is Python-2-only.
            if tree.attrib.has_key('title'):
                value = tree.attrib['title']
                # Record match span, raw tag text, and the face title.
                r = {'start': m.start(), 'end': m.end(), 'str': m.group(), 'value': value}
                list.append(r)
        except Exception, e:
            mlog.log().error(m.group())
def __conection_sql(self):
    """Open the SQLite connection: disk-backed when the file exists,
    in-memory otherwise.

    Bug fixed: sqlite3.connect(name) creates the file when it does not
    exist, so the original existence check performed *after* connecting
    was always true and the ':memory:' branch was unreachable. The check
    now happens first.
    """
    try:
        if os.path.exists(self.name) and os.path.isfile(self.name):
            self.conn = sqlite3.connect(self.name, self.timeout)
            self.type = 0  # disk-backed database
        else:
            self.conn = sqlite3.connect(':memory:')
            self.type = 1  # in-memory fallback
    except Exception as e:
        mlog.log().error('sqlite3 error:%s' % e)
        return
def __get_conn(self, path):
    """Return a connection for the database file at `path`.

    If `path` exists as a file, connect to it on disk; otherwise return an
    in-memory database. Bugs fixed: connecting before the existence check
    (a) created the file, making the memory branch unreachable, and
    (b) in that branch discarded the first connection without closing it.
    """
    if os.path.exists(path) and os.path.isfile(path):
        mlog.log().info('硬盘上面:[{}]'.format(path))
        return sqlite3.connect(path)
    mlog.log().info('内存上面:[:memory:]')
    return sqlite3.connect(':memory:')
def fetch(self, sql):
    """Run a SELECT and return all rows as a list.

    Returns [] when the query matches nothing, and None (after logging an
    error) when `sql` is None or empty — both match the original contract.
    """
    if sql is None or sql == '':
        mlog.log().error('the [{}] is empty or equal None!'.format(sql))
        return None
    queue = []
    cur = self.__get_cursor()
    cur.execute(sql)
    # extend() replaces the index-by-index copy loop (same result, idiomatic).
    queue.extend(cur.fetchall())
    # Close the cursor like the sibling methods do (original leaked it).
    self.__close_all(cur)
    return queue
def __handle_all_file(self, pid, path):
    """Feed every file under `path` to __handle_single_file in batches of
    five, logging the elapsed time per batch."""
    files = self.scheduler_engine.input_data(path)
    total = len(files)
    pos = 0
    while pos < total:
        batch = files[pos:pos + 5]
        pos += 5
        began = time.time()
        for name in batch:
            self.__handle_single_file(pid, path, name)
        finished = time.time()
        mlog.log().info("analysis file count %d expend %d", pos, finished - began)
def fetchall_data(self, pid):
    """Fetch data for every table, keyed by table name.

    The SQL used per table depends on `pid`. Bug fixed: the original elif
    read `pid == pid == local_task_opercode.XUEQIU_GET_DISCUSSION_UID`,
    a doubled comparison that chained to the plain equality intended here.
    """
    results = {}  # renamed from `dict`, which shadowed the builtin
    for t in self.sql_mgr.get_table():
        s_t = "".join(t)
        mlog.log().info("tablename %s ", s_t)
        if pid == local_task_opercode.XUEQIU_GET_MEMBER_MAX:
            sql = xqdb.get_member_max(s_t)
        elif pid == local_task_opercode.XUEQIU_GET_DISCUSSION_UID:
            sql = xqdb.get_user_discuss_max(s_t)
        else:
            sql = xqdb.get_id(s_t)
        results[s_t] = self.sql_mgr.get_data(sql)
    return results
def run(self):
    """Consume Kafka messages forever, parsing each value as JSON.

    The outer loop re-creates the consumer if the inner iteration ever
    ends (e.g. on a consumer timeout).
    """
    while True:
        consumer = KafkaConsumer(bootstrap_servers=self.host)
        consumer.subscribe([self.coname])
        for message in consumer:
            try:
                # Index 6 of the record is treated as the payload —
                # presumably the message value; TODO confirm against the
                # kafka client's record layout.
                json_info = json.loads(message[6])
                print(json_info)  # call form works on both Python 2 and 3
                #self.callback(json_info)
            except Exception as e:
                mlog.log().error(e)
def run(console):
    """Analyse every file under console.path in batches, logging timing."""
    analysis_engine = AnalysisEngine()
    file_list = analysis_engine.input_data(console.path)
    i = 0
    count = len(file_list)
    while i < count:
        # Bug fix: the slice was [i:i + 4] while the step was 5, silently
        # skipping every fifth file (cf. the i + 5 variants elsewhere here).
        tlist = file_list[i:i + 5]
        i += 5
        start_time = time.time()
        for t in tlist:
            analysis_engine.process_file_data(console.plt_id, console.path, t, 0)
        end_time = time.time()
        mlog.log().info("analysis file count %d expend %d", i, end_time - start_time)
def parser_ftp_method(config, path, pid):
    """Start an AnalysisEngine and process all files under `path` in
    batches of five, logging elapsed time per batch."""
    engine = AnalysisEngine(config)
    engine.start()
    files = engine.input_data(path)
    total = len(files)
    offset = 0
    while offset < total:
        batch = files[offset:offset + 5]
        offset += 5
        began = time.time()
        for entry in batch:
            engine.process_file_data(pid, path, entry, 0)
        ended = time.time()
        mlog.log().info("analysis file count %d expend %d", offset, ended - began)
def create_table(self, crate_table_sql, type = 1):
    """Create a table through the underlying engine.

    Args:
        crate_table_sql: the CREATE TABLE statement to run.
        type: 0 = drop any existing `self.table` first; 1 = keep it
              (default). The name shadows the builtin but is part of the
              public signature, so it is kept.
    """
    if type == 0:
        drop_table_sql = 'DROP TABLE IF EXISTS ' + self.table
        self.engine.drop_table(drop_table_sql)
    try:
        self.engine.create_table(crate_table_sql)
    except Exception as e:  # Python-3-compatible form of `except X, e`
        mlog.log().error('create_table error:%s' % e)
def day_heat(self, content):
    # NOTE(review): `symbol` and `tlist` are built but never returned — the
    # tail of this method was probably lost; compare quarter_heat below.
    symbol = ""
    tlist = []
    try:
        tree = ET.fromstring(content)
        # <Title id="..."> carries the instrument symbol.
        lst_node = tree.getiterator('Title')
        for node in lst_node:
            # has_key() is Python-2-only; the `> 0` comparison is redundant.
            if node.attrib.has_key("id") > 0:
                symbol = node.attrib['id']
        # <Individual> children carry (d=date, v=value) data points.
        lst_node = tree.getiterator('Individual')
        for node in lst_node:
            for c in node:
                if c.attrib.has_key("d") > 0:
                    d = c.attrib["d"]
                    v = c.attrib["v"]
                    tlist.append((d, int(v)))
    except Exception, e:
        mlog.log().error("error content")
def __clean_search_event(self, content):
    # NOTE(review): `dt` is filled but never returned — the trailing
    # `return ...` was probably lost when this file was collapsed.
    dt = {}
    d = content['dict']
    for key, value in d.items():
        lt = []
        for t in value:
            #replpy = xq_common.quote_format(t[3])
            try:
                # Parse the discussion text at column 3 and append its JSON
                # form to a copy of the row.
                dic = Discussion()
                reply = dic.parser_int(t[3])
                l = list(t)
                s = json.dumps(reply)
                # Undo \uXXXX escaping so the stored JSON keeps raw unicode;
                # str.decode() here is Python-2-only.
                l.append(s.decode('unicode-escape'))
                lt.append(l)
            except Exception, e:
                # Log the source URL of the row that failed to parse.
                mlog.log().error("https://xueqiu.com/" + str(t[1]) + "/" + str(t[0]))
        dt[key] = lt
def get(self, basic_path, filename, callback=None):
    """Download `filename` from `basic_path` over FTP.

    Walks the path one segment at a time (one cwd per component), then
    issues RETR using either the supplied callback or self.callback.

    Returns:
        True on success; False when the connection is down or any FTP
        call fails (the error is logged).
    """
    if not self.ping():
        return False
    try:
        # cwd into each path component separately — some servers reject a
        # multi-segment cwd in a single call.
        for segment in basic_path.split('/'):
            self.ftp.cwd(segment)
        file_size = self.ftp.size(filename)
        # The third retrbinary argument is passed as the file size here,
        # though ftplib treats it as the block size — kept as-is.
        if callback is None:
            self.ftp.retrbinary('RETR ' + filename, self.callback, file_size)
        else:
            self.ftp.retrbinary('RETR ' + filename, callback, file_size)
        return True
    except Exception as e:
        mlog.log().error("ftp error:%s url:%s", e, filename)
        return False
def quarter_heat(self, content):
    # NOTE(review): symbol/tlist (and the unused date/hour locals) are built
    # but never returned — the tail of this method was probably lost;
    # compare day_heat above.
    symbol = ""
    tlist = []
    try:
        tree = ET.fromstring(content)
        # <Title id="..."> carries the instrument symbol.
        lst_node = tree.getiterator('Title')
        for node in lst_node:
            # has_key() is Python-2-only; the `> 0` comparison is redundant.
            if node.attrib.has_key("id") > 0:
                symbol = node.attrib['id']
        # <Individual> children carry (d="date time", v=value,
        # changerate=...) data points.
        lst_node = tree.getiterator('Individual')
        for node in lst_node:
            for c in node:
                if c.attrib.has_key("d") > 0:
                    d = c.attrib["d"]
                    dlist = d.split(' ')
                    date = dlist[0]
                    hour = dlist[1]
                    v = c.attrib["v"]
                    changerate = c.attrib["changerate"]
                    tlist.append((d, int(v), changerate))
    except Exception, e:
        mlog.log().error("error content")
def __u_connect(self):
    """Connect and log in to the FTP server.

    Returns:
        True on success; False (after logging) on any failure.
    """
    # NOTE(review): ftplib's set_pasv takes a single flag — the extra host
    # argument suggests a custom FTP subclass; confirm.
    self.ftp.set_pasv(True, self.host)
    try:
        # connect()/login() return the server banner string, so a falsy
        # value means an empty reply — check kept from the original.
        if not self.ftp.connect(self.host, self.port, self.timeout):
            mlog.log().error("connect ftp server failed")
            return False
        if not self.ftp.login(self.name, self.pwd):
            mlog.log().error("login ftp server failed")
            return False
        # NOTE(review): if this class also defines an is_connected() method,
        # this attribute assignment shadows it — confirm.
        self.is_connected = True
        mlog.log().info("host : " + self.host + " ftp login success")
        return True
    except Exception as e:  # Python-3-compatible form of `except X, e`
        mlog.log().error("ftp error[%s]", e)
        return False
class CleaningCrawler():
    # NOTE(review): this copy of clean_data is cut off right after the
    # base32 step — the decompress/decode continuation appears further down
    # in this paste as an orphaned fragment.
    @classmethod
    def clean_data(cls, content):
        # Payload is a JSON envelope: {'content': <base32 data>,
        # 'charset': <codec name>, ...} — inferred from the fields read below.
        charset_name = ''
        try:
            dict = json.loads(content)  # shadows the builtin `dict`
        except Exception, e:
            mlog.log().error(e)
            mlog.log().error(content)
        data = ''
        # base64 decode — note b32decode is used despite the comment's name.
        try:
            data = base64.b32decode(dict['content'])
            charset_name = dict['charset']
        except Exception, e:
            mlog.log().error(e)
            return None
def run(self):
    # Drive the worker pool until interrupted or until no results remain,
    # then join any dismissed worker threads.
    while True:
        try:
            self.pool.poll()
        except KeyboardInterrupt:
            mlog.log().error("**** Interrupted!")
            break
        except NoResultsPending:
            mlog.log().error("**** No pending results.")
            break
    # NOTE(review): placement after the loop follows the standard threadpool
    # example this mirrors — confirm against the original formatting.
    if self.pool.dismissedWorkers:
        mlog.log().info("Joining all dismissed worker threads...")
        self.pool.joinAllDismissedWorkers()
def main():
    """Entry point: set up the runtime and run the analysis in a pool."""
    mlog.log().info('Python %s on %s' % (sys.version, sys.platform))
    sys_str = platform.system()
    mlog.log().info(sys_str)
    if sys_str in ("Darwin", "Linux"):
        # Python-2-only: force utf-8 as the default string encoding.
        reload(sys)
        sys.setdefaultencoding('utf-8')  # @UndefinedVariable
    os.chdir(os.getcwd())
    # Console input
    console = Console()
    console.input_info()
    pool = Pool(processes=3)
    result = pool.apply_async(run, (console, ))
    pool.close()
    pool.join()
    if result.successful():
        mlog.log().info("successful")
# NOTE(review): orphaned fragment — the middle of CleaningCrawler.clean_data
# (base32 decode + decompress + charset decode + url/pid handling), severed
# from its function header and truncated mid-`if`. Syntactically invalid on
# its own; kept verbatim for manual re-joining.
except Exception, e:
    mlog.log().error(e)
    mlog.log().error(content)
data = ''
# base64 decode
try:
    data = base64.b32decode(dict['content'])
    charset_name = dict['charset']
except Exception, e:
    mlog.log().error(e)
    return None
# decompress
try:
    data = zlib.decompress(data)
except Exception, e:
    mlog.log().error(e)
    return None
# decode bytes to text using the charset named in the envelope
try:
    data = data.decode(charset_name)
except Exception, e:
    mlog.log().error(e)
    return None
url = dict.get('url')
pid = dict.get('pid')
if url is not None:
    url = base64.b32decode(url)
if pid is not None and url is not None:
def log(self):
    """Dump the FTP connection settings to the info log."""
    for value in (self.host, self.port, self.name, self.pwd):
        mlog.log().info(value)