def load(self, mode="ro"): try: self.create_output_path() sh.chmod("700", self.output_path) except sh.ErrorReturnCode as e: ## Already mounted readonly. pass try: log.debug("Loading {0}".format(self)) self.loaded = self.mount_compat("rw") if self.loaded: try: sh.rm( "-rf", os.path.join(self.output_path, '._.Trashes'), os.path.join(self.output_path, '.Spotlight-V100'), os.path.join(self.output_path, 'lost+found'), os.path.join(self.output_path, '$RECYCLE.BIN'), os.path.join(self.output_path, 'System Volume Information')) except: pass try: sh.umount(self.output_path) except: self.loaded = False self.loaded = self.mount_compat(mode) return self.loaded else: return False except sh.ErrorReturnCode as e: self.unload() log.exception(e) return False
def load(self):
    self.create_output_path()
    try:
        #patoolib.extract_archive(self.path, outdir=self.output_path, interactive=False)
        #p = Process(target=patoolib.extract_archive, kwargs={"archive": self.path, "outdir": self.output_path, "interactive": False})
        command = "patool --non-interactive extract --outdir {0} {1}".format(
            self.output_path, self.path)
        child = pexpect.spawn(command)
        while True:
            #stdout, stderr = cmd_proc.communicate(input=bytes("\n", 'utf8'), timeout=10)
            child.sendline("")
            if child.isalive():
                time.sleep(1)
            else:
                break
        if child.exitstatus == 0:
            return True
        else:
            self.unload()
            return False
    except Exception as e:
        err_str = "Unable to unpack {0} @ {1}".format(self.path, self.output_path)
        log.error(err_str)
        log.exception(e)
        return False
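# A possible simplification (a sketch, not from the original code): instead of
# polling isalive() while feeding blank lines, pexpect can block until the
# child reaches EOF. Only expect/close/exitstatus from the pexpect API are used.
import pexpect

def wait_for_exit(child):
    """Block until a pexpect child exits; return its exit status."""
    child.expect(pexpect.EOF, timeout=None)  # wait until the process closes its output
    child.close()                            # reap the process so exitstatus is set
    return child.exitstatus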
def login(self):
    """Log in to Qzone (QQ space)."""
    log.run().debug("Executing Like.login()")
    author_info = False
    try:
        log.info().info("Reading user info")
        with open("config/user.json", "r", encoding="utf-8") as usr:
            infos = json.load(usr)
            account = infos['account']
            password = infos['password']
            author_info = True
            log.info().info("User info read successfully")
    except Exception as e:
        log.exception().exception(e)
        log.error().error("Failed to read user info")
    if author_info:
        # Login section
        log.info().info("Logging in to Qzone")
        driver = webdriver.PhantomJS()
        driver.maximize_window()
        url = "https://qzone.qq.com/"
        driver.get(url)
        driver.implicitly_wait(3)
        try:
            driver.switch_to.frame("login_frame")
            try:
                driver.find_element_by_id('switcher_plogin').click()
            except:
                log.run().info("Account/password login already shown; no switch needed")
            driver.find_element_by_id('u').clear()
            driver.find_element_by_id('u').send_keys(account)
            driver.find_element_by_id('p').click()
            driver.find_element_by_id('p').send_keys(password)
            driver.find_element_by_id('login_button').click()
            time.sleep(3)
            driver.implicitly_wait(20)
            log.debug().debug("About to verify QQ login")
            return self.login_on(driver)  # check whether we are logged in
        except Exception as login_01:
            log.exception().exception(login_01)
            log.error().info("Failed to load the Qzone login module")
            return 'error'
    else:
        return 'error'
def _put_file(self, file, local_path, dropbox_path):
    size = os.stat(file.fileno()).st_size
    if size < 1000:  # small file (bytes): single-shot upload
        self.client.put_file(dropbox_path, file, overwrite=True)
        self.send_progress(local_path, 1.0)
    else:
        chunk_size = 1024 * 1024
        offset = 0
        upload_id = None
        last_block = None
        while offset < size:
            next_chunk_size = min(chunk_size, size - offset)
            if last_block is None:
                last_block = file.read(next_chunk_size)
            try:
                (offset, upload_id) = self.client.upload_chunk(
                    last_block, next_chunk_size, offset, upload_id)
                last_block = None  # chunk acknowledged; read a fresh one next pass
                self.send_progress(local_path, min(offset, size) / size)
            except dropbox.rest.ErrorResponse as e:
                log.exception(e)  # keep last_block so the failed chunk is retried
        self.client.commit_chunked_upload('auto' + dropbox_path,
                                          upload_id,
                                          overwrite=True,
                                          parent_rev=None)
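# The retry invariant used above, in isolation (a minimal sketch; upload_chunk
# here is a hypothetical callable returning the new offset, not the Dropbox
# client method): keep the current block until it is acknowledged, and only
# then read the next one, so a failed chunk is re-sent rather than skipped.
def upload_in_chunks(file, size, upload_chunk, chunk_size=1024 * 1024):
    offset = 0
    last_block = None
    while offset < size:
        if last_block is None:  # previous chunk was acknowledged
            last_block = file.read(min(chunk_size, size - offset))
        try:
            offset = upload_chunk(last_block, offset)
            last_block = None   # acknowledged: advance to the next chunk
        except IOError:
            pass                # keep last_block and retry it; like the
                                # original, this loops on persistent errors
    return offset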
def load(self):
    log.debug("loading mapable drive {0}".format(self.path))
    try:
        if not re.search(r"block special",
                         str(sh.file(self.path).stdout, 'utf8'),
                         flags=re.IGNORECASE):
            self.lodev = sh.losetup("-f").split()[0]
            sh.losetup(self.lodev, self.path)
            sh.blkid(self.lodev)
            try:
                sh.partprobe(self.lodev)
            except:
                pass
        else:
            sh.blkid(self.path)
            try:
                sh.partprobe(self.path)
            except:
                pass
        sh.sync("/dev/")
        self.process_devicemap()
    except Exception as e:
        log.exception(e)
        return False
    return True
def _upload(self, event, dropbox_path):
    if event.isdir:
        if event.type != 'CREATE':
            return
        try:
            self.client.file_create_folder(dropbox_path)
        except dropbox.rest.ErrorResponse as e:
            log.exception(e)
        finally:
            return  # directories only need the folder created; nothing to put
    with open(event.source_absolute, 'rb') as file:
        self._put_file(file, event.source_absolute, dropbox_path)
def initialize_database(path=MASTER_DOC):
    global models
    CSVModel.clear()
    csv_docs = xls_parse_from_url(path)
    log.info('Downloaded %s' % path)
    model_instances = {}
    for k, doc in csv_docs.iteritems():
        if k in ['IDMap', 'AllScenarios']:
            continue
        try:
            csv_model = CSVModel(doc).create_model(k)
            models[csv_model.__name__] = csv_model
            model_instances[k] = csv_model.from_csv(doc)
            log.info("Parsed sheet %s" % k)
        except ArgumentError:
            log.exception("Couldn't load %s" % k)
            continue
        except TypeError:
            log.exception("Couldn't load %s" % k)
            continue
    # We want a late load so that the order is preserved and deterministic
    from model.refs.parameter_ref import ParameterRef
    log.info('Dropping view')
    drop_dp_view(engine)
    drop_view(engine)
    CSVModel.drop_all(engine)
    CSVModel.create_all(engine)
    log.info('Creating view')
    initialize_view(engine)
    initialize_dp_view(engine)
    for k, v in model_instances.iteritems():
        for inst in v:
            session.add(inst)
        try:
            session.commit()
        except Exception as e:
            session.rollback()
            from traceback import print_exc
            print_exc()
        log.info("Initialized %s" % k)
    log.info("Initializing Parameter References and Associations")
    pdicts = [(pdict.scenario, pdict.id, pdict.parameter_ids)
              for pdict in model_instances['ParameterDictionary']]
    log.info("Loaded ParameterDictionary into memory")
    params = {p.id: p.scenario for p in model_instances['ParameterDefs']}
    log.info("Loaded Parameters into Memory")
    if engine.name == 'postgresql':
        speedy_parameter_load(pdicts, params)
    else:
        linear_parameter_load(pdicts, params, session)
def initialize_saf(database='data/objects_20131126_112742.xls'):
    global models
    CSVModel.clear()
    csv_docs = xls_parse_from_url(database)
    log.info('Loaded %s' % database)
    model_instances = {}
    for k, doc in csv_docs.iteritems():
        try:
            csv_model = CSVModel(doc).create_model('saf_%s' % k)
            models[csv_model.__name__] = csv_model
            model_instances[csv_model.__name__] = csv_model.from_csv(doc)
            log.info("Parsed sheet %s" % k)
        except ArgumentError:
            log.exception("Couldn't load %s" % k)
            continue
        except TypeError:
            log.exception("Couldn't load %s" % k)
            continue
    from model.refs.saf_instrument_ref import SAFInstrumentRef
    log.info("Dropping SAF Views")
    drop_saf_instrument_view(engine)
    drop_qc_view(engine)
    log.info("Dropping SAF Models")
    CSVModel.drop_all(engine)
    log.info("Creating SAF Models")
    CSVModel.create_all(engine)
    log.info("Creating SAF Views")
    initialize_saf_instrument_view(engine)
    initialize_qc_view(engine)
    for k, v in model_instances.iteritems():
        for inst in v:
            session.add(inst)
        try:
            session.commit()
        except Exception as e:
            session.rollback()
            from traceback import print_exc
            print_exc()
            raise
        log.info('Initialized %s' % k)
    log.info('Initialized SAF Data instances')
    instruments = model_instances['saf_instrument']
    instruments = [(i.id, i.data_product_list) for i in instruments]
    log.info("Loaded instruments into memory")
    if engine.name == 'postgresql':
        speedy_saf_ref(instruments)
    else:
        linear_saf_ref(instruments, session)
def linear_saf_ref(instances, session):
    from model.refs.saf_instrument_ref import SAFInstrumentRef
    for i_id, dp_ids in instances:
        dp_ids = dp_ids.replace(' ', '')  # strip whitespace
        dp_ids = dp_ids.split(',')
        for dp_id in dp_ids:
            inst_ref = SAFInstrumentRef(instrument_id=i_id,
                                        data_product_id=dp_id)
            session.add(inst_ref)
    try:
        session.commit()
    except:
        log.exception("Couldn't load reference")
        session.rollback()
def linear_parameter_load(pdicts, params, session):
    from model.refs.parameter_ref import ParameterRef
    for pdict_scenario, pdict_id, pdict_parameter_ids in pdicts:
        param_ids = pdict_parameter_ids.replace(' ', '')  # strip whitespace
        param_ids = param_ids.split(',')
        for param_id in param_ids:
            param_scenario = params[param_id]
            pref = ParameterRef(pdict_id=pdict_id,
                                pdict_scenario=pdict_scenario,
                                param_id=param_id,
                                param_scenario=param_scenario)
            session.add(pref)
    try:
        session.commit()
    except:
        log.exception("Couldn't load reference")
        session.rollback()
def login_on(self, driver):
    """Check whether we are logged in to Qzone."""
    log.run().debug("Executing Like.login_on()")
    try:
        driver.find_element_by_id('QZ_Toolbar_Container')
        log.run().debug("QQ is logged in")
        return driver
    except Exception as login_02:
        log.exception().exception(login_02)
        log.info().info("QQ is not logged in")
        self.screenshot(driver)
        return 'error'
def get_metadata(full_name):
    metadata = None
    try:
        parser = createParser(full_name)
        metadata = extractMetadata(parser)
        if parser:
            parser.stream._input.close()
        del parser
    except hachoir.stream.input.InputStreamError:
        ## Path is a directory.
        metadata = None
    except Exception as err:
        log.exception(err)
        log.error("Cannot extract metadata")
        metadata = None
    finally:
        return metadata
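# Example use (a sketch; "/tmp/example.jpg" is a hypothetical path): hachoir
# metadata objects expose exportPlaintext(), which returns printable lines.
metadata = get_metadata("/tmp/example.jpg")
if metadata:
    for line in metadata.exportPlaintext():
        log.debug(line)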
def mount_compat(self, mode="ro"): status = True try: sh.mount("-o", "{0},{1}".format(mode, self.MOUNT_OPTIONS), self.path, self.output_path) except sh.ErrorReturnCode as e: log.debug("Legacy re-mount opts for {0}".format(self)) try: sh.mount("-o", "{0}".format(mode), self.path, self.output_path) except: try: sh.mount(self.path, self.output_path) except Exception as e: log.error("Cannot mount : {0}".format(self)) log.exception(e) status = False return status
def praise(self, driver):
    """Like (praise) posts on Qzone."""
    log.run().debug("Executing Like.praise()")
    log.info().info("Looking for a post that has not been liked yet")
    driver.refresh()
    driver.implicitly_wait(10)
    if self.login_on(driver) != 'error':  # still logged in
        # Check whether we already liked the latest post
        try:
            praise_person = driver.find_element_by_xpath(
                '//*[@id="feed_friend_list"]/li[1]/div[@class="f-single-foot"]/div[@class="f-like-list f-like _likeInfo"]/div[@class="user-list"]/a'
            ).get_attribute("class")
            if praise_person == 'item _ownerlike q_namecard':
                result = False
            else:
                result = True
        except:
            result = True
        log.info().info("Target found; about to like it")
        # Like it if we have not already
        try:
            if result:
                driver.find_element_by_css_selector(
                    "[class='fui-icon icon-op-praise']").click()
                log.info().info("Liked successfully")
        except Exception as e106:
            log.exception().exception(e106)
            log.error().error("Error while liking")
        time.sleep(3)
        return driver
    else:
        time.sleep(60)
        log.info().info("Trying to log in again")
        driver_now = self.login()  # re-login
        if driver_now != 'error':  # on success, continue with the fresh session
            return driver_now
        else:
            return 'error'
def create_file(self, path, parent=None):
    file_obj = None
    # Skip FIFOs and character devices
    if stat.S_ISFIFO(os.stat(path).st_mode) or stat.S_ISCHR(
            os.stat(path).st_mode):
        return None
    magic_str, mime_str = self.get_file_magic(path)
    metadata = get_metadata(path)
    for regex, file_class in self.CONTAINER_TYPES_MAP.items():
        if file_class and re.search(regex, magic_str, flags=re.IGNORECASE):
            try:
                file_obj = file_class(path,
                                      magic_str=magic_str,
                                      mime_type=mime_str,
                                      metadata=metadata,
                                      parent=parent)
                break
            except IncompatibleFS:
                log.error("Attempted to create filesystem from block device without success")
    if not file_obj:
        for regex, file_class in self.MIME_TYPES_MAP.items():
            if file_class and re.search(regex, mime_str, flags=re.IGNORECASE):
                try:
                    file_obj = file_class(path,
                                          magic_str=magic_str,
                                          mime_type=mime_str,
                                          metadata=metadata,
                                          parent=parent)
                    break
                except Exception as e:
                    log.exception(e)
    if not file_obj:
        file_obj = Data(path, magic_str)
    return file_obj
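# For illustration (hypothetical entries, not from the original code): both
# dispatch tables map a regex to a class. CONTAINER_TYPES_MAP is matched
# against file(1) magic output, MIME_TYPES_MAP against the MIME type; a None
# value disables an entry, which is why create_file guards on `file_class`.
CONTAINER_TYPES_MAP = {
    r"filesystem data": None,  # e.g. a filesystem container class
    r"archive data": None,     # e.g. an archive container class
}
MIME_TYPES_MAP = {
    r"^image/": None,          # e.g. an image file class
    r"^video/": None,          # e.g. a video file class
}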
def organize(self, mfile, root_call=True):
    loaded_mfiles = set()
    if root_call:
        log.info("Organizing {0}".format(mfile))
    try:
        if mfile.load():
            log.info("Organizing children of {0}".format(mfile))
            if mfile.is_source_container():
                log.debug("{0} is source".format(mfile.path))
                dump_dir_path = os.path.join(WORK_DIR, SW_PROJECTS_OUTPUT)
                sh.mkdir("-p", dump_dir_path)
                dump_dir_path = sh.mktemp("-d", "-p", dump_dir_path, "--suffix",
                                          os.path.basename(mfile.path)).stdout.strip()
                try:
                    sh.rsync("-rat", mfile.path, dump_dir_path)
                except sh.ErrorReturnCode_23:
                    ## Rsync errs related with attrs or others
                    pass
            else:
                loaded_mfiles.add(mfile)
                #self.dive(mfile)
                p = Process(target=Organizer.dive, args=[self, mfile])
                p.start()
                p.join()
        else:
            destination_path = self.index.put_file(mfile.path)
            metapath_file = open("{}.{}".format(destination_path, METAFPATHFILE), 'ab')
            metapath_file.write(bytes(mfile.path + "\n", 'utf8'))
            metapath_file.close()
            try:
                ordered_path = mfile.get_ordered_path()
                sh.mkdir("-p", os.path.join(ordered_path, 'NoMeta'))
                fname = os.path.basename(mfile.path)
                destination_fname = os.path.basename(destination_path)
                for link in mfile.gen_ordered_paths():
                    log.debug("{} to {}".format(mfile.path, link))
                    sh.mkdir("-p", link)
                    try:
                        has_ext = re.search(r"(\..*)", fname)
                        extension = has_ext.group(1)
                        link = os.path.join(link, u"{0}{1}".format(destination_fname, extension))
                    except AttributeError:
                        link = os.path.join(link, u"{0}".format(destination_fname))
                    log.info(u"File {0} @ {1}".format(str(mfile), ordered_path))
                    sh.ln("-s", destination_path, link)
            except sh.ErrorReturnCode_1:
                pass
    except sh.ErrorReturnCode as e:
        log.exception(e)
    except Exception as e:
        log.error("Organizer error {0}".format(mfile.path))
        log.exception(e)
    finally:
        for loaded_mfile in loaded_mfiles:
            try:
                loaded_mfile.unload()
            except Exception as e:
                log.error("Error unloading {0}".format(mfile.path))
                log.exception(e)
    return True
def deal_article_list(self, req_url, text):
    """
    @summary: Fetch the article list. The response comes in two forms:
        1. The first view of the history page returns HTML, which also
           contains the account profile.
        2. "Pull down for more" returns JSON. In both forms the article
           list itself is JSON with the same layout.
        Strategy:
        1. For the HTML form, parse the articles directly, then build the
           JSON URL of the next page.
        2. For the JSON form, parse it as-is.
    ---------
    @param req_url:
    @param text:
    ---------
    @result:
    """
    try:
        # A banned account has no article list
        __biz = tools.get_param(req_url, "__biz")
        if "list" in text:
            # Article list embedded in HTML
            if "action=home" in req_url:
                # Parse the account profile
                self.__parse_account_info(text, req_url)
                # Parse the article list
                regex = "msgList = '(.*?})';"
                article_list = tools.get_info(text, regex, fetch_one=True)
                article_list = article_list.replace("&quot;", '"')
                publish_time = self.__parse_article_list(
                    article_list, __biz, is_first_page=True)
                # More articles? If not, move on to the next account;
                # otherwise pull down for more
                regex = r"can_msg_continue = '(\d)'"
                can_msg_continue = tools.get_info(text, regex, fetch_one=True)
                if can_msg_continue == "0":  # no more articles
                    log.info("Reached the bottom of the list; account {} done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    if not new_last_publish_time:
                        # Mark it as a zombie account
                        log.info("Account {} is a zombie; monitoring stopped".format(__biz))
                        self._task_manager.sign_account_is_zombie(__biz)
                    else:
                        self._task_manager.update_account_last_publish_time(
                            __biz, new_last_publish_time)
                elif publish_time:
                    # Build the "pull down for more" URL for older articles
                    # appmsg_token lives in the HTML
                    regex = 'appmsg_token = "(.*?)";'
                    appmsg_token = tools.get_info(text, regex, fetch_one=True)
                    # The remaining parameters live in the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=10,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Fetching list: next_offset {}, reached {}".format(
                            10, publish_time),
                    )
            else:
                # JSON form
                text = tools.get_json(text)
                article_list = text.get("general_msg_list", {})
                publish_time = self.__parse_article_list(article_list, __biz)
                # More articles? If not, move on; otherwise pull down for more
                can_msg_continue = text.get("can_msg_continue")
                if not can_msg_continue:  # no more articles
                    log.info("Reached the bottom of the list; account {} done".format(__biz))
                    new_last_publish_time = self._task_manager.get_new_last_article_publish_time(
                        __biz)
                    self._task_manager.update_account_last_publish_time(
                        __biz, new_last_publish_time)
                elif publish_time:
                    # Build the "pull down for more" URL for older articles
                    # These parameters live in the URL
                    __biz = tools.get_param(req_url, "__biz")
                    pass_ticket = tools.get_param(req_url, "pass_ticket")
                    appmsg_token = tools.get_param(req_url, "appmsg_token")
                    # The offset lives in the JSON
                    offset = text.get("next_offset", 0)
                    next_page_url = "https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz={__biz}&f=json&offset={offset}&count=10&is_ok=1&scene=124&uin=777&key=777&pass_ticket={pass_ticket}&wxtoken=&appmsg_token={appmsg_token}&x5=0&f=json".format(
                        __biz=__biz,
                        offset=offset,
                        pass_ticket=pass_ticket,
                        appmsg_token=appmsg_token,
                    )
                    return self._task_manager.get_task(
                        next_page_url,
                        tip="Fetching list: next_offset {}, reached {}".format(
                            offset, publish_time),
                    )
        else:
            # This __biz account has been banned
            self._task_manager.sign_account_is_zombie(__biz)
    except Exception as e:
        log.exception(e)
    return self._task_manager.get_task()
config.quiet = True


def work(path):
    ffactory = FileFactory()
    organizer = Organizer()
    mfile = ffactory.create_file(path)
    organizer.organize(mfile)


work_queue = Queue()


def worker_loop():
    path = work_queue.get()
    log.info("Ordering {}".format(path))
    work(path)
    log.info("Finished {}".format(path))


if __name__ == "__main__":
    for param in sys.argv[1:]:
        try:
            p = Process(target=work, args=(param, ))
            p.start()
            p.join()
        except Exception as e:
            log.exception(e)
    log.info("Finished all tasks")