def _get_task_update_sql(self, src: dict, new: IscoutTask):
    """Build the UPDATE statement and parameter list for an iscouttask row.

    ``taskstatus``, ``objecttype`` and ``object`` are always written.
    ``cmdid``, ``source`` and ``periodnum`` are written only when the
    matching attribute of ``new`` is not none-or-empty.  ``src`` is not
    read by this method.

    :param src: the stored task row (unused here).
    :param new: the task whose values are written.
    :return: ``(sql, params)`` - the statement with ``?`` placeholders and
        the values in placeholder order; the row is addressed by
        ``batchid`` and ``taskid``.
    """
    assignments: list = ["taskstatus=?", "objecttype=?", "object=?"]
    values: list = [
        new.taskstatus.value,
        new._objecttype.value,
        new._object,
    ]

    # Optional columns, in the order they appear in the statement.
    optional_columns = (
        ("cmdid", "cmd_id"),
        ("source", "source"),
        ("periodnum", "periodnum"),
    )
    for column, attr in optional_columns:
        value = getattr(new, attr)
        if helper_str.is_none_or_empty(value):
            continue
        assignments.append(column + "=?")
        values.append(value)

    statement = "UPDATE iscouttask SET {} WHERE batchid=? AND taskid=?".format(
        ", ".join(assignments))
    values.extend((new.batchid, new.taskid))
    return statement, values
def _get_task_update_sql(self, src: dict, new: IscanTask):
    """Build the UPDATE statement and parameter list for an iscantask row.

    Fields updated for an already-known task: ``taskstatus`` (always),
    and ``cmdid``, ``source``, ``periodnum`` when the matching attribute
    of ``new`` is not none-or-empty.

    Original author's note: iscantask receives periodic tasks and pause
    tasks issued by the server, so the status is refreshed every time.

    :param src: the stored task row (unused here).
    :param new: the task whose values are written.
    :return: ``(sql, params)`` - the statement with ``?`` placeholders and
        the values in placeholder order; the row is addressed by
        ``taskid``.
    """
    assignments: list = ["taskstatus=?"]
    values: list = [new.taskstatus.value]

    # Optional columns, in the order they appear in the statement.
    optional_columns = (
        ("cmdid", "cmd_id"),
        ("source", "source"),
        ("periodnum", "periodnum"),
    )
    for column, attr in optional_columns:
        value = getattr(new, attr)
        if helper_str.is_none_or_empty(value):
            continue
        assignments.append(column + "=?")
        values.append(value)

    statement = "UPDATE iscantask SET {} WHERE taskid=?".format(
        ", ".join(assignments))
    values.append(new.taskid)
    return statement, values
def get_display_name(self):
    """Return the display name: filename and sign, each prefixed by a space.

    Either part is omitted when it is none-or-empty, so the result may be
    an empty string.
    """
    pieces = []
    for attr in ("filename", "sign"):
        value = getattr(self, attr)
        if not helper_str.is_none_or_empty(value):
            pieces.append(" {}".format(value))
    return "".join(pieces)
def _getstring(
        self,
        url: str,
        req_data: str = None,
        headers: str = None,
        encoding: str = "utf-8",
        params=None,
        json=None,
        files=None,
        auth=None,
        proxies=None,
        stream=True,
        verify=None,
        cert=None,
        timeout=None,
        allow_redirects: bool = True,
) -> (str, str):
    """Fetch ``url`` and return ``(body_text, redirection_url)``.

    The request itself is delegated to ``self.get_response``.  The second
    tuple element is:

    * the redirect target, joined onto the scheme and host of the response
      URL, when the response is a redirect (flagged as such or with a
      3xx status code);
    * the response URL when it is non-empty and differs from ``url``;
    * ``None`` otherwise.

    The body is decoded with ``encoding``, or with utf-8 when ``encoding``
    is none-or-empty.  Exceptions are not handled here; they propagate to
    the caller.
    """
    response = self.get_response(
        url=url,
        req_data=req_data,
        headers=headers,
        params=params,
        json=json,
        files=files,
        auth=auth,
        proxies=proxies,
        stream=stream,
        verify=verify,
        cert=cert,
        timeout=timeout,
        allow_redirects=allow_redirects)

    final_url: str = None
    if (response.is_redirect or response.is_permanent_redirect
            or 300 <= response.status_code < 400):
        target = self.get_redirect_target(response)
        origin = parse.urlparse(response.url)
        base = "{}://{}".format(origin.scheme, origin.netloc)
        final_url = parse.urljoin(base, target)
    elif not helper_str.is_none_or_empty(
            response.url) and response.url != url:
        final_url = response.url

    # utf-8 is the fallback; an explicit encoding overrides it.
    response.encoding = 'utf-8'
    if not helper_str.is_none_or_empty(encoding):
        response.encoding = encoding

    # The parsed headers are not used here; the call is kept as in the
    # original in case it has side effects.
    self._parse_resp_headers(response)

    return (response.text, final_url)
def get_parent_clientid_of_task(self, task: Task) -> str:
    """Return the ClientId stored for the parent of ``task``.

    Each connection from ``self.connect_all(5)`` is queried for a row
    matching the task's platform, parent task id and parent batch id.
    The first row found wins.  Every connection is closed after use.

    :return: the ``ClientId`` column of the first matching row, or
        ``None`` when the parent ids are missing, nothing matches, or an
        error occurs (errors are logged, not raised).
    """
    client_id: str = None
    try:
        # Both parent identifiers are required; a missing one is reported
        # through the outer handler below.
        if helper_str.is_none_or_empty(task.parenttaskid):
            raise Exception(
                "Invalid task parent_taskid for task, taskid={} batchid={}"
                .format(task.taskid, task.batchid))
        if helper_str.is_none_or_empty(task.parentbatchid):
            raise Exception(
                "Invalid task parent_batchid for task, taskid={} batchid={}"
                .format(task.taskid, task.batchid))

        query = (f"SELECT ClientId FROM {self._tbname} "
                 "WHERE Platform=? and TaskId=? and BatchId=?")
        lookup_key = (
            task._platform,
            task.parenttaskid,
            task.parentbatchid,
        )

        # Search every database until one holds the parent task.
        for conn in self.connect_all(5):
            try:
                cursor = conn.cursor
                cursor.execute(query, lookup_key)
                rows = cursor.fetchall()
                has_value = (rows is not None and len(rows) >= 1
                             and len(rows[0]) >= 1)
                if has_value:
                    client_id = rows[0][0]
                    break
            except Exception:
                self._logger.error(
                    "get_parent_client_of_task error: {}".format(
                        traceback.format_exc()))
            finally:
                if conn is not None:
                    conn.close()
    except Exception:
        self._logger.error("get_parent_client_of_task error: %s" %
                           traceback.format_exc())
    return client_id
def __init__(self, ipdir: dict):
    """Store a validated copy of the ip -> directory mapping.

    :param ipdir: non-empty dict mapping an ip to a directory.
    :raises Exception: when ``ipdir`` is not a non-empty dict, or when any
        key or value is none-or-empty.
    """
    if not (isinstance(ipdir, dict) and len(ipdir) >= 1):
        raise Exception("Invalid param 'ipdir' for TaskDeliverConfig")

    self._ipdir: dict = {}
    for address, directory in ipdir.items():
        if helper_str.is_none_or_empty(
                address) or helper_str.is_none_or_empty(directory):
            raise Exception("Invalid ip->dir key value pair: {}->{}".format(
                address, directory))
        self._ipdir[address] = directory
def __read_msg(self, retmsg: str) -> (bool, str):
    """Read the expected text ``retmsg`` from the server.

    As many bytes as ``retmsg`` occupies in utf-8 are read, decoded, and
    compared with ``retmsg`` ignoring case.

    Changes from the previous version:

    * the comparison lowercases both sides; before, only the received
      text was lowercased, so an expected message containing upper-case
      characters could never match;
    * the number of bytes requested is the utf-8 byte length of
      ``retmsg`` rather than its character count (identical for ASCII).

    :param retmsg: the text expected from the server.
    :return: ``(succ, text)``.  ``succ`` is True only when the decoded
        reply equals ``retmsg`` case-insensitively.  ``text`` is the
        decoded reply, ``None`` when ``retmsg`` is empty, or an error
        description when reading fails.
    """
    succ: bool = False
    readmsg: str = None
    try:
        if helper_str.is_none_or_empty(retmsg):
            self.__log("Read msg param 'retmsg' is empty")
            return (succ, readmsg)

        expected_len = len(retmsg.encode('utf-8'))
        retdata: bytes = self.__read_bytes(expected_len)
        if retdata is None or len(retdata) != expected_len:
            return (succ, 'read bytes from server failed')

        readmsg = retdata.decode('utf-8')
        if readmsg.lower() == retmsg.lower():
            succ = True
    except Exception:
        succ = False
        readmsg = 'Read msg from server error: {} {}'.format(
            self._curr_phone, traceback.format_exc())
        self.__log(readmsg)
    return (succ, readmsg)
def _get_contact_docid(self) -> bool:
    """Find the doc id needed for Facebook contact queries.

    Returns True immediately when ``self.docid_contact`` is already set.
    Otherwise the scripts from ``self._get_contact_docid_js()`` are
    scanned for ``ProfileCometTopAppSectionQuery``; the numeric ``id``
    inside the first match is stored in ``self.docid_contact``.

    Changes from the previous version:

    * a script that matches the outer pattern but has no ``id: "<digits>"``
      inside is skipped explicitly instead of raising AttributeError on a
      ``None`` match;
    * scanning stops at the first doc id found;
    * the redundant second containment test was removed.

    :return: True when a doc id is available, False otherwise.  Errors
        are logged, not raised.
    """
    if self.docid_contact is not None:
        return True

    res: bool = False
    try:
        for js in self._get_contact_docid_js():
            try:
                if helper_str.is_none_or_empty(
                        js) or 'ProfileCometTopAppSectionQuery' not in js:
                    continue

                m = MessengerContact._re_docid_ProfileCometTopAppSectionQuery.search(
                    js)
                if m is None:
                    continue

                m_id = re.search(r'id:\s*?"(\d+)"', m.group(1))
                if m_id is None:
                    continue

                self.docid_contact = m_id.group(1)
                res = True
                break
            except Exception:
                self._logger.debug(
                    "Parse init message docid error: {}".format(
                        traceback.format_exc()))
    except Exception:
        self._logger.error(
            "Get docid for init message error: {} {}".format(
                self.uname_str, traceback.format_exc()))
    return res
def _get_docid_profile(self):
    """Find the doc id used to query the user's own profile.

    The scripts from ``self._get_js_resources()`` are scanned for the
    ``FBStoriesBucketsQueryWebGraphQLQuery`` module; inside it the value
    returned by ``__getDocID`` is extracted (two known spellings of the
    function are tried) and stored in ``self.docid_profile``.

    Changes from the previous version: ``self.docid_profile`` is assigned
    only when a non-empty id was extracted, and the return value is True
    only in that case.  Before, every failed lookup overwrote the
    attribute, and a lookup that reported success with an empty value
    could leave the result True with no usable id stored.

    :return: True when a doc id was found and stored, False otherwise.
        Errors are logged, not raised.
    """
    found: bool = False
    module_marker = '"FBStoriesBucketsQueryWebGraphQLQuery",["WebGraphQLQueryBase"]'
    # e.g. c.__getDocID=function(){"use strict";return"2392075840832371"};
    docid_prefixes = (
        '.__getDocID=function(){"use strict";return"',
        '__getDocID=function(){return"',
    )
    try:
        for _urljs, js in self._get_js_resources():
            try:
                if helper_str.is_none_or_empty(
                        js) or module_marker not in js:
                    continue

                # Only look at the text from the module definition onward.
                js = js[js.index(
                    '__d("FBStoriesBucketsQueryWebGraphQLQuery"'):]

                for prefix in docid_prefixes:
                    ok, docid = helper_str.substringif(js, prefix, '"')
                    if ok and docid is not None and docid != "":
                        self.docid_profile = docid
                        found = True
                        break
                if found:
                    break
            except Exception:
                self._logger.error("Access urldocid error: {}".format(
                    traceback.format_exc()))

        if not found:
            self._logger.error("Get docid for profile failed: {}".format(
                self.uname_str))
    except Exception:
        self._logger.error("Get docid for profile error:%s" %
                           traceback.format_exc())
    return found
def get_write_lines(self) -> iter:
    """Yield this record serialised as utf-8 encoded byte chunks.

    Without inner data, a single chunk is produced: the common field
    lines followed by this object's own lines.

    With inner data, each inner item contributes the common field lines
    plus its own lines; items are batched and a chunk is yielded every
    ``muti_seg_count`` items, with a final chunk for any remainder.  The
    inner-data lock is held while iterating.

    Every segment is made to end with a blank line (``\\r\\n\\r\\n``).

    Change from the previous version: the stream-backed and plain
    branches were byte-for-byte identical, so they are merged into one;
    ``self.io_stream`` is therefore no longer consulted here.
    """

    def _terminated(text: str) -> str:
        # Ensure the buffer ends with the blank-line segment separator.
        if text.endswith('\r\n\r\n'):
            return text
        return text.strip() + '\r\n\r\n'

    if not any(self.__innerdatas):
        single = self._get_common_fields_lines() + self._get_write_lines()
        yield _terminated(single).encode('utf-8')
        return

    lines: str = ''
    with self.__innerdata_locker:
        segcount: int = 0
        for innerdata in self.__innerdatas:
            lines += self._get_common_fields_lines()
            lines += innerdata.get_write_lines()
            lines = _terminated(lines)
            segcount += 1
            if segcount >= muti_seg_count:
                yield lines.encode('utf-8')
                lines = ''
                segcount = 0
        # Flush whatever did not fill a complete batch.
        if not helper_str.is_none_or_empty(lines):
            yield lines.encode('utf-8')
def append_resource(self, rsc: Resource):
    """Attach a resource to this data item's ``_resources`` list.

    The stored entry is a dict with ``url`` and ``type``, plus ``sign``
    when the resource carries a sign other than ``ESign.Null`` and
    ``name`` when it has a filename.

    :param rsc: the resource to attach.
    :raises Exception: when ``rsc`` is not a ``Resource``, has an empty
        url, or its resource type is not an ``EResourceType``.
    """
    acceptable = (issubclass(type(rsc), Resource)
                  and not helper_str.is_none_or_empty(rsc._url)
                  and isinstance(rsc._resourcetype, EResourceType))
    if not acceptable:
        raise Exception(
            "Invalid param 'rsc' Resource for append_resource: {}".format(
                rsc))

    entry = dict(url=rsc._url, type=rsc._resourcetype.value)
    if isinstance(rsc.sign, ESign) and rsc.sign != ESign.Null:
        entry['sign'] = ResourceData._sign_map[rsc.sign]
    if not helper_str.is_none_or_empty(rsc.filename):
        entry['name'] = rsc.filename
    self._resources.append(entry)
def _get_uname_task(self):
    """Pick a usable user name from the task (original note: for Gmail).

    The account is preferred.  Otherwise the phone number is used,
    formatted as ``+<globaltelcode><phone>``; this needs both the phone
    and the global tel code.

    :return: the user name, or ``None`` when the task is missing, holds
        nothing usable, or an error occurs.
    """
    try:
        task = self.task
        if task is None:
            return None
        if not helper_str.is_none_or_empty(task.account):
            return task.account
        if helper_str.is_none_or_empty(
                task.phone) or helper_str.is_none_or_empty(
                    task.globaltelcode):
            return None
        country_code = task.globaltelcode.strip().strip('+')
        return "+{}{}".format(country_code, task.phone.strip())
    except Exception:
        return None
def _check_registration(self) -> iter:
    """Check whether the task's target account is registered with Gmail.

    The account comes from ``self._get_uname_task()``.  Two requests are
    sent: a GET of the mail home page, then a POST to the accounts
    ``lookup`` endpoint with the URL-quoted account substituted into a
    pre-recorded form body.  When the lookup response contains
    ``"5001":[]`` a ``PROFILE`` is built for the task.

    Original note: Chinese mobile numbers need the +86 prefix.

    :return: the ``PROFILE`` on a hit, otherwise ``None`` (also on any
        error, which is logged).
        NOTE(review): the ``iter`` return annotation does not match the
        body, which returns a value and never yields - confirm intent.
    """
    # Original note: this function should now return the PROFILE and the
    # RESOURCE data attached to it.
    res: PROFILE = None
    try:
        targetacc: str = self._get_uname_task()
        if helper_str.is_none_or_empty(targetacc):
            self._logger.error(
                "Target acount is empty while checking registration")
            return res

        # Visit the home page first to pick up a cookie; the returned
        # page itself is not used.
        url = "https://mail.google.com"
        html = self._ha.getstring(url,
                                  headers="""
            accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            cache-control: no-cache
            pragma: no-cache
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
                                  )

        url = "https://accounts.google.com/_/signin/sl/lookup?hl=zh-CN&_reqid=238058&rt=j"
        # Pre-recorded form body; only {account} is substituted.
        # NOTE(review): "rm=false<mpl=default" looks like a mis-decoded
        # "rm=false&ltmpl=default" (the nested URL further on spells the
        # parameter "ltmpl") - confirm against the version-controlled
        # original before changing it.
        postdata = """continue=https%3A%2F%2Fmail.google.com%2Fmail%2F&service=mail&rm=false<mpl=default&ss=1&osid=1&f.req=%5B%22{account}%22%2C%22AEThLlwQTFRaZdGqR1ySvZpLCNWaG8r9DmMMnVXufvJis2SJJsqoidT0i9UrtAWrSFQpRCLNeiqj0L_XDWnfDK4gXbewbaCoxux2T2wqYutk7KfIW-BMDBov_woG3AtFesU-mmUAAkaw8sLDLabW9CRQUJEPeKzl-Jdj3hNiK9VvNgkFTeg-MJk%22%2C%5B%5D%2Cnull%2C%22KR%22%2Cnull%2Cnull%2C2%2Cfalse%2Ctrue%2C%5Bnull%2Cnull%2C%5B2%2C1%2Cnull%2C1%2C%22https%3A%2F%2Faccounts.google.com%2FServiceLogin%3Fcontinue%3Dhttps%253A%252F%252Fmail.google.com%252Fmail%252F%26osid%3D1%26service%3Dmail%26ss%3D1%26ltmpl%3Ddefault%26rm%3Dfalse%22%2Cnull%2C%5B%5D%2C4%2C%5B%5D%2C%22GlifWebSignIn%22%5D%2C10%2C%5Bnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C%5B5%2C%2277185425430.apps.googleusercontent.com%22%2C%5B%22https%3A%2F%2Fwww.google.com%2Faccounts%2FOAuthLogin%22%5D%2Cnull%2Cnull%2C%2255f38eed-e452-4296-b42c-6b386d00a5a2%22%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C5%2Cnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2C%5B%5D%5D%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2C%5B%5D%5D%2Cnull%2Cnull%2Cnull%2Ctrue%5D%2C%22{account}%22%5D&bgRequest=%5B%22identifier%22%2C%22!IiGlIQBCh9g7QSu0Yu9EB9GEMHhDBNACAAAAlVIAAAAYmQGcbYCxJolG33I0aLTL3UDDeXekvvZ2D_ORjLR-jl96QQHU3r4uu1DaX9oqM3NRssfbOOAU5upc9lrn76TBv8Z4exkrfw-We-T3TDbi-4xf2n6uFlzf8CmwU7-FuYCgL_sscdc8HwiP03ADn_5wtaoowXB6wawSjW8_U4p1gkiaGtn9uM61X2-BUR8r3kIKgziDpNHj731K279e3u9YL0Wrye3BEKuydAv5FIsxD0mKpNXz5Ury2e8yX9Wdn1pf72vc8LvzZon7kf-yQePKquVP3MsiMzWyeBvglTxw2NU51U7Y0ZfS7uKn6BR6DvUZ9ZdomFVns0dak5xpmIpRKfEkBURZtD9WnD8hg2kVXgTzMXbQJYPDzAPLtjQDB5Uv5NmbTVSJVXPQeBwDmjeznvdrceOfae7zHUE8L7LWcNDe8iI8Jdpb-tQm6EafnFLTIzhIeYJcWrNupBDPE1q6sb2mzxZqJzFIdlsuAfKyUTRhRxxKt6zFHjGPZ64D384w5mi7ouRTXrNwM_XuhbPnPwCUNoWMwj14_BVwa6RbnQ%22%5D&azt=AFoagUVegOGq9-AhBRoAcTa3AQVNLlzmFw%3A1542793455384&cookiesDisabled=false&deviceinfo=%5Bnull%2Cnull%2Cnull%2C%5B%5D%2Cnull%2C%22KR%22%2Cnull%2Cnull%2C%5B%5D%2C%22GlifWebSignIn%22%2Cnull%2C%5Bnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C%5B5%2C%2277185425430.apps.googleusercontent.com%22%2C%5B%22https%3A%2F%2Fwww.google.com%2Faccounts%2FOAuthLogin%22%5D%2Cnull%2Cnull%2C%2255f38eed-e452-4296-b42c-6b386d00a5a2%22%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C5%2Cnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2C%5B%5D%5D%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2Cnull%2C%5B%5D%2Cnull%2Cnull%2Cnull%2C%5B%5D%5D%5D&gmscoreversion=undefined&checkConnection=youtube%3A248%3A1&checkedDomains=youtube&pstMsg=1&""".format(
            account=parse.quote_plus(targetacc))
        html = self._ha.getstring(url,
                                  req_data=postdata,
                                  headers="""
            accept: */*
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded;charset=UTF-8
            google-accounts-xsrf: 1
            origin: https://accounts.google.com
            referer: https://accounts.google.com/signin/v2/identifier?flowName=GlifWebSignIn&flowEntry=ServiceLogin
            pragma: no-cache
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
                                  )

        # This marker in the lookup response is treated as "registered".
        if isinstance(html, str) and '"5001":[]' in html:
            res = PROFILE(self._clientid, self.task, self.task.apptype,
                          self._userid)

    except Exception:
        self._logger.error("Check gmail registration error: {}".format(
            traceback.format_exc()))
    return res
def __check_real_token(self) -> ETokenType:
    """Work out the token type the task's credentials actually support.

    When the declared token type is neither ``Unknown`` nor ``NotNeed``
    and ``forcetokentype`` is set, the declared type is returned as is.
    Otherwise the first match in this priority order wins:

    1. cookie present                       -> ``Cookie``
    2. account and password present         -> ``Pwd``, or ``SmsPwd`` when
       a global tel code is also present
    3. global tel code and phone present    -> ``Sms``

    :return: the detected type, or ``ETokenType.Unknown`` when nothing
        matches.
    """
    declared_is_forced = (self.tokentype != ETokenType.Unknown
                          and self.tokentype != ETokenType.NotNeed
                          and self.forcetokentype)
    if declared_is_forced:
        return self.tokentype

    empty = helper_str.is_none_or_empty

    # Cookie
    if not empty(self.cookie):
        return ETokenType.Cookie

    # Account + password login
    if not empty(self.account) and not empty(self.password):
        if empty(self.globaltelcode):
            return ETokenType.Pwd
        return ETokenType.SmsPwd

    # SMS
    if not empty(self.globaltelcode) and not empty(self.phone):
        return ETokenType.Sms

    return ETokenType.Unknown
def _parse_otherfields(self, ojs):
    """Merge the entries of a JSON document into ``self._allfields``.

    Nothing happens when ``ojs`` is none-or-empty or decodes to ``None``.
    Any error while decoding or copying is printed and swallowed.

    :param ojs: JSON text whose entries are copied key by key.
    """
    if helper_str.is_none_or_empty(ojs):
        return
    try:
        parsed = json.loads(ojs)
        if parsed is not None:
            for field in parsed:
                self._allfields[field] = parsed[field]
    except Exception as ex:
        print(ex)
def _get_folders(self) -> iter:
    """Yield a ``Folder`` for each mail folder linked from the home page.

    ``self._homepage`` is parsed as HTML; the first ``<td width="120">``
    is taken as the folder list and each ``<tr>`` in it as one entry.
    For every entry the first link supplies the href, the folder key and
    the folder name.  Utility links (compose, all mail, contacts, edit
    labels) are skipped, and each folder key is yielded only once.

    Change from the previous version: the folder key is reset for every
    row.  Before, a row whose link had neither a usable ``accesskey``
    attribute nor ``s=`` in its href silently reused the key of the
    previous row (or raised UnboundLocalError on the first row).

    Errors are logged, not raised; a failing row does not stop the scan.
    """
    try:
        # Parse straight from the home page.
        hdoc = etree.HTML(self._homepage, etree.HTMLParser())
        if hdoc is None:
            self._logger.error(
                "Parse html document for mail folders from homepage failed"
            )
            return

        xtd: list = hdoc.xpath('.//td[@width="120"]')
        if xtd is None or len(xtd) < 1:
            self._logger.error("Get folder list failed")
            return
        xtd = xtd[0]

        xfolders = xtd.xpath('.//tr')
        if xfolders is None or len(xfolders) < 1:
            self._logger.error("Get folder list fialed2")
            return

        folderkeys: dict = {}
        for xfolder in xfolders:
            try:
                if xfolder is None:
                    continue

                xas: list = xfolder.xpath('.//a')
                if xas is None or len(xas) < 1:
                    continue
                xa: etree._Element = xas[0]

                xhrefs = xa.xpath('.//@href')
                if xhrefs is None or len(xhrefs) < 1:
                    continue
                href: str = str(xhrefs[0])
                if helper_str.is_none_or_empty(href):
                    continue

                # Reset per row so a stale key from the previous row can
                # never leak into this one.
                accesskey: str = None
                xaccsskeys = xa.xpath('.//@accesskey')
                # NOTE(review): "> 1" means a link with exactly one
                # accesskey attribute falls through to the href branch;
                # "> 0" may have been intended. Left unchanged because it
                # determines the folder ids handed to callers - confirm.
                if xaccsskeys is not None and len(xaccsskeys) > 1:
                    accesskey = str(xaccsskeys[0])
                elif "s=" in href:
                    accesskey = href[href.index('s=') + 2:]
                if helper_str.is_none_or_empty(accesskey):
                    continue

                foldername = xa.xpath('./text()')
                if foldername is None or len(foldername) < 1:
                    continue
                foldername = str(foldername[0])
                if helper_str.is_none_or_empty(foldername):
                    continue

                # Skip links that are not mail folders: compose, all
                # mail, contacts, edit labels.
                if accesskey == 'c' or foldername == '写邮件' or \
                        accesskey == '?&s=a' or foldername == '所有邮件' or \
                        accesskey == '?&v=cl' or foldername == '通讯录' or \
                        accesskey == '?&v=prl' or foldername == '修改标签':
                    continue

                if accesskey not in folderkeys:
                    folderkeys[accesskey] = href
                    folder = Folder()
                    folder.name = foldername
                    folder.folderid = accesskey
                    folder.folderurl = "{}/{}".format(
                        self._hpurlbase, href.lstrip('/'))
                    yield folder

            except Exception:
                self._logger.error("Get one folder error: {}".format(
                    traceback.format_exc()))

    except Exception:
        self._logger.error("Get folders error: {}".format(
            traceback.format_exc()))
def get_display_name(self):
    """Return the subject when it is set, otherwise the mail id."""
    if helper_str.is_none_or_empty(self.subject):
        return self._mailid
    return self.subject
def _get_profile_(self) -> iter:
    """Yield the profile picture resource (if any) and then the profile.

    The profile doc id is resolved first when missing, via
    ``self._get_docid_profile()``.  The graphql batch endpoint is then
    queried and the ``o0.data.me`` object of the response is used to fill
    a ``PROFILE`` (nickname, gender).  When a large profile picture uri
    is present, the picture is fetched as a stream, attached to the
    profile as a ``RESOURCES`` item and yielded before the profile.

    Change from the previous version: the ``__user=`` value in the form
    body had been replaced by a ``******`` redaction artifact, which is
    not valid Python.  It is restored with the URL-quoted
    ``self._userid``.
    NOTE(review): ``self._userid`` is the most plausible operand (it is
    the user id passed to ``PROFILE`` below) - confirm against the
    version-controlled original.

    Errors are logged, not raised; nothing is yielded on failure.
    """
    try:
        if helper_str.is_none_or_empty(self.docid_profile):
            if not self._get_docid_profile():
                return

        url = "https://www.facebook.com/api/graphqlbatch/"
        postdata = ('__user=' + parse.quote_plus(str(self._userid)) +
                    '&__a=1&__req=' + self._req.get_next() +
                    '&__be=1&__pc=PHASED%3ADEFAULT&__rev=' +
                    parse.quote_plus(self._rev) + '&fb_dtsg=' +
                    parse.quote_plus(self.fb_dtsg) + '&jazoest=' +
                    parse.quote_plus(self.jazoest) + '&__spin_r=' +
                    parse.quote_plus(self._rev) + '&__spin_b=' +
                    parse.quote_plus(self._spin_b) + '&__spin_t=' +
                    parse.quote_plus(self._spin_t) +
                    r'&queries=%7B%22o0%22%3A%7B%22doc_id%22%3A%22' +
                    parse.quote_plus(self.docid_profile) + r'%22%7D%7D')
        html = self._ha.getstring(url,
                                  req_data=postdata,
                                  headers="""
            accept: */*
            accept-encoding: gzip, deflate
            accept-language: zh-CN,zh;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://www.facebook.com
            pragma: no-cache
            referer: https://www.facebook.com/
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"""
                                  )
        if html is None or html == '':
            self._logger.error("Get profile failed")
            return

        # Strip backslashes and spaces before parsing. Kept exactly as
        # before; note that this also removes spaces inside values.
        html = html.strip().replace('\\', '').replace('\\r', '').replace(
            '\\n', '').replace(' ', '')
        sj = self._parse_js(html)
        if sj is None:
            html = html.strip().replace(' ', '')
            sj = self._parse_js(html)
        if sj is None:
            raise Exception("Parse detailed profile json failed.")

        # Personal information
        if 'o0' not in sj or 'data' not in sj['o0'] or \
                'me' not in sj['o0']['data']:
            raise Exception("Parse from detailed profile json failed.")
        sjme = sj['o0']['data']['me']

        profile = PROFILE(self._clientid, self.task, self.task.apptype,
                          self._userid)
        profile.phone = self.phone
        profile.account = self.uname_str

        # Only take the name from the response when none is known yet.
        if self._username is None or self._username == "":
            if 'name' in sjme:
                self._username = sjme['name']
        profile.nickname = self._username

        if 'gender' in sjme:
            gender = sjme['gender']
            if gender == 'FEMALE':
                profile.gender = EGender.Female
            if gender == 'MALE':
                profile.gender = EGender.Male

        # Profile picture
        if 'large_profile_picture' in sjme and \
                'uri' in sjme['large_profile_picture']:
            profilepicurl = sjme['large_profile_picture']['uri']
            resp: ResponseIO = self._ha.get_response_stream(profilepicurl,
                                                            headers="""
                accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
                accept-encoding: gzip, deflate, br
                accept-language: zh-CN,zh;q=0.9
                cache-control: no-cache
                pragma: no-cache
                upgrade-insecure-requests: 1""")
            profilepic = RESOURCES(self._clientid, self.task, profilepicurl,
                                   EResourceType.Picture, self.task.apptype)
            profilepic.io_stream = resp
            profilepic.sign = ESign.PicUrl
            profile.append_resource(profilepic)
            yield profilepic

        yield profile

    except Exception:
        self._logger.error("Get profile errorex:%s" %
                           traceback.format_exc())
def _dispatch_by_apptype(self, task: Task,
                         clients: dict) -> Tuple[bool, Client, str]:
    """Pick one collector client for ``task`` based on its apptype.

    Steps, in order:

    1. Decide whether the task needs a "crosswall" client.  For an
       apptype found in ``ALL_APPS`` this comes from the app config.  For
       any other apptype the task must be an imap/pop3 mail download with
       a mail service configured (including the matching host); the flag
       then comes from that mail service.
    2. When crosswall is needed, filter ``clients`` accordingly.
    3. If ``DbManager.get_client_by_search_token`` finds a client already
       holding the same token resources, reuse it.
    4. Otherwise score the clients with ``self._get_scores`` and take the
       next one from ``self._get_polling_next``.

    :param task: the task to dispatch.
    :param clients: dict whose keys are ``Client`` objects; the values
        are compared with the crosswall flag (presumably a per-client
        crosswall marker - verify against the caller).
    :return: ``(succ, client, msg)``; ``client`` is ``None`` and ``msg``
        describes the reason when ``succ`` is False.
    """
    succ: bool = False
    res: Client = None
    msg: str = None
    try:
        need_crosswall = True
        # First check whether this is a supported apptype.
        if helper_str.is_none_or_empty(
                task.apptype) or not ALL_APPS.__contains__(task.apptype):
            # An unsupported apptype must be an imap/pop mail download;
            # anything else is rejected.
            # An unsupported apptype cannot go through webmail.
            if task.cmd is None or \
                    task.cmd.stratagymail is None or \
                    task.cmd.stratagymail.eml_priority_protocol is None or \
                    task.cmd.stratagymail.eml_priority_protocol == 'webmail':
                self._logger.error(
                    "Unknown apptype from task:\ntaskid:%s\napptype:%s" %
                    (task.taskid, task.apptype))
                msg = '不支持的应用类型和协议'
                return (succ, res, msg)
            # The protocol is set; check that a mail server config was
            # delivered with the task.
            if not task.cmd.stratagymail.eml_priority_protocol in ["imap","pop3"] or \
                    task.cmd.stratagymail.mail_service is None:
                self._logger.error(
                    "Require mailservice for unknown apptype:\ntaskid:%s\napptype:%s"
                    % (task.taskid, task.apptype))
                msg = '未知的应用类型必须为IMAP/POP,且需配置邮服'
                return (succ, res, msg)
            elif task.cmd.stratagymail.eml_priority_protocol == "imap" and \
                    (task.cmd.stratagymail.mail_service.imap_host is None or \
                    task.cmd.stratagymail.mail_service.imap_host ==""):
                self._logger.error(
                    "Require imap server config for imap download:\ntaskid:%s\napptype:%s"
                    % (task.taskid, task.apptype))
                msg = '缺少IMAP邮服配置'
                return (succ, res, msg)
            elif task.cmd.stratagymail.eml_priority_protocol == "pop3" and \
                    (task.cmd.stratagymail.mail_service.pop3_host is None or \
                    task.cmd.stratagymail.mail_service.pop3_host ==""):
                self._logger.error(
                    "Require pop3 server config for pop3 download:\ntaskid:%s\napptype:%s"
                    % (task.taskid, task.apptype))
                msg = '缺少POP3邮服配置'
                return (succ, res, msg)
            # No error: take the crosswall requirement from the mail
            # service config.
            need_crosswall = task.cmd.stratagymail.mail_service.crosswall
        else:
            # Supported apptype
            appcfg: AppConfig = ALL_APPS[task.apptype]
            need_crosswall = appcfg._crosswall

        # Filter clients by the crosswall requirement; a task that does
        # not need crosswall can run anywhere.
        if need_crosswall:
            crosswall_clients = {}
            for c, cross in clients.items():
                c: Client = c
                # NOTE(review): this drops the clients whose value EQUALS
                # need_crosswall and keeps the others, which looks
                # inverted for a dict named crosswall_clients - confirm
                # what the dict values mean before changing it.
                if cross == need_crosswall:
                    continue
                crosswall_clients[c] = cross
            if len(crosswall_clients) < 1:
                raise Exception(
                    "No Crosswall client found, iscout task need crosswall."
                )
            clients = crosswall_clients

        # Next, check whether the database already holds exactly the same
        # token resources; if so, dispatch to the same collector client.
        # Returned data never updates the resource fields in the
        # database, so what the front end sends here is the original,
        # identical value and can be used directly.
        res = DbManager.get_client_by_search_token(
            platform=task._platform,
            apptype=task.apptype,
            tokentype=task.tokentype,
            input_=task.input,
            preglobaltelcode=task.preglobaltelcode,
            preaccount=task.preaccount,
            globaltelcode=task.globaltelcode,
            phone=task.phone,
            account=task.account,
            password=task.password,
            url=task.url,
            host=task.host,
            cookie=task.cookie,
        )
        # A hit means a task with the same token resources was already
        # dispatched; otherwise a new assignment is needed.
        if isinstance(res, Client):
            succ = True
            return (succ, res, msg)

        # Then run every strategy to compute the scores.
        clients = self._get_scores(task, clients, self.all_stgs)
        if not isinstance(clients, dict) or len(clients) < 1:
            msg = '内部计算出错'
            return (succ, res, msg)

        # highest = self._get_highest_score(clients)
        # The dispatch strategy was changed to polling (round robin).
        highest = self._get_polling_next(clients)
        if not isinstance(highest, Client):
            msg = '内部选择采集端出错'
            return (succ, res, msg)
        else:
            res = highest
            succ = True

    except Exception as ex:
        succ = False
        self._logger.info(
            "No client suites task:\nplatform={}\ntaskid={}\ntasktype={}\napptype={}\nerr:{}"
            .format(task._platform, task.taskid, task.tasktype,
                    task.apptype, traceback.format_exc()))
        msg = '内部分发任务出错:{}'.format(ex.args)
    return (succ, res, msg)
def _check_registration(self):
    """Check whether the task's target account is registered with Outlook.

    The account comes from ``self._get_uname_task()``.  The login page is
    fetched to obtain ``uaid`` and the flow token, then the
    ``GetCredentialType`` endpoint is queried for the account.  The
    outcome is reported through ``self._write_task_back``: registered
    when the response contains ``"IfExistsResult":0,``, not registered
    otherwise, and failed on any exception.

    Original note: Chinese mobile numbers need the +86 prefix.

    Changes from the previous version:

    * the timestamp ``t`` is taken before the ``try`` block.  It used to
      be assigned inside it, after the first call, so an early failure
      made the ``except`` handler itself fail with UnboundLocalError
      instead of writing the task back;
    * the ``username`` value in the JSON body had been replaced by a
      ``******`` redaction artifact; it is restored with ``targetacc``.
      NOTE(review): ``targetacc`` is the account under test and the only
      plausible operand - confirm against the version-controlled
      original.

    :return: always ``None``; when the account, ``uaid`` or flow token
        cannot be obtained the method returns early without writing the
        task back.
    """
    res: PROFILE = None
    # Taken up front so the except handler can always report it.
    t = time.strftime('%Y-%m-%d %H:%M:%S')
    try:
        targetacc: str = self._get_uname_task()
        if helper_str.is_none_or_empty(targetacc):
            self._logger.error(
                "Target acount is empty while checking registration")
            return res

        url = 'https://login.live.com/login.srf'
        headers = """
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            Accept-Encoding: gzip, deflate, br
            Accept-Language: zh-CN,zh;q=0.9
            Cache-Control: no-cache
            Connection: keep-alive
            Host: login.live.com
            Pragma: no-cache
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"""
        response = self._ha.getstring(url, headers=headers)

        succ, uaid = helper_str.substringif(response, 'uaid=', '"')
        if not succ or not isinstance(uaid, str) or uaid == "":
            self._logger.error(
                "Get field 'uaid' failed for checking registration: {}".
                format(targetacc))
            return res

        succ, flowToken = helper_str.substringif(response,
                                                 'id="i0327" value="', '"')
        if not succ or not isinstance(flowToken, str) or flowToken == "":
            self._logger.error(
                "Get field 'flowToken' failed for checking registration: {}"
                .format(targetacc))
            return res

        url = f'https://login.live.com/GetCredentialType.srf?vv=1600&mkt=ZH-CN&lc=2052&uaid={uaid}'
        headers = f"""
            Accept: application/json
            client-request-id: {uaid}
            Content-type: application/json; charset=UTF-8
            hpgact: 0
            hpgid: 33
            Origin: https://login.live.com
            Referer: https://login.live.com/login.srf
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36"""
        postdata = (
            '{"username":"' + targetacc + '","uaid":"' + uaid +
            '","isOtherIdpSupported":false,"checkPhones":true,"isRemoteNGCSupported":true,"isCookieBannerShown":false,"isFidoSupported":false,"forceotclogin":false,"otclogindisallowed":true,"flowToken":"'
            + flowToken + '"}')
        html = self._ha.getstring(url, headers=headers, req_data=postdata)

        if '"IfExistsResult":0,' in html:
            self._write_task_back(ECommandStatus.Succeed, 'Registered', t,
                                  EBackResult.Registerd)
        else:
            self._write_task_back(ECommandStatus.Succeed, 'Not Registered',
                                  t, EBackResult.UnRegisterd)

    except Exception:
        self._logger.error('Check registration fail: {}'.format(
            traceback.format_exc()))
        self._write_task_back(ECommandStatus.Failed,
                              'Check registration fail.', t,
                              EBackResult.CheckRegisterdFail)
    return
def __log(self, msg: str, lvl: MsLogLevel = MsLogLevels.INFO):
    """Forward ``msg`` to ``self._logger`` at level ``lvl``.

    Nothing is logged when ``msg`` is none-or-empty or when
    ``self._logger`` is not an ``MsLogger``.
    """
    if not helper_str.is_none_or_empty(msg) and isinstance(
            self._logger, MsLogger):
        self._logger.log(msg, lvl)
def _cookie_login(self) -> bool:
    """Log in to Gmail with the cookie carried by the task.

    Steps, in order:

    1. Add the task cookie (when present) under ``.google.com``.
    2. Fetch the mail home page and read the ``GLOBALS=[`` array from it;
       element 9 is taken as ``ik`` and element 10 as the user name.
    3. Obtain the ``GMAIL_AT`` token: from the ``mail.google.com``
       cookies, else from the hidden ``at`` input of the page, else from
       any cookie named ``GMAIL_AT``.
    4. POST the token to switch to the basic HTML view, and keep the
       redirect target (without its query string) as
       ``self._hpurlbase`` and the returned page as ``self._homepage``.

    :return: True when every step succeeded, False otherwise.  Errors
        are logged, not raised.
    """
    res: bool = False
    try:
        if helper_str.is_none_or_empty(self.task.cookie):
            self._logger.warn("Cookie from task is empty")
        else:
            # Put the cookie straight under the google.com root domain.
            self._ha._managedCookie.add_cookies(".google.com",
                                                self.task.cookie)

        # Request the home page and let it redirect automatically.
        url = "https://mail.google.com/mail/"
        html = self._ha.getstring(url,
                                  headers="""
            accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            cache-control: no-cache
            pragma: no-cache
            referer: https://accounts.google.com/CheckCookie
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
                                  )

        succ, globals_ = helper_str.substringif(html, "GLOBALS=[", ",[")
        if not succ or helper_str.is_none_or_empty(globals_):
            self._logger.error("Get profile information failed")
            return res

        # The GLOBALS array is split on commas and read by position.
        globalparams: list = globals_.split(',')
        if globalparams is None or len(globalparams) < 10:
            self._logger.error("Get field 'ik' failed")
            return res
        ik = globalparams[9].strip('"')
        if helper_str.is_none_or_empty(ik):
            self._logger.error("Feld 'ik' is empty")
            return res

        if len(globalparams) < 11:
            self._logger.error("Get username failed")
            return res
        self._username = globalparams[10].strip('"')
        if helper_str.is_none_or_empty(self._username):
            self._logger.info('Get username failed2')
            return res
        # self._logger.info("Got username: {}".format(self._username))

        # Try to switch to the basic HTML version of the page.
        gmail_at = self._ha._managedCookie.get_cookie_value_in_domain(
            'mail.google.com', 'GMAIL_AT')
        if helper_str.is_none_or_empty(gmail_at):
            # Fall back to the hidden form field, then to any cookie
            # named GMAIL_AT regardless of domain.
            succ, gmail_at = helper_str.substringif(
                html, 'input type="hidden" name="at" value="', '"')
            if not succ or helper_str.is_none_or_empty(gmail_at):
                gmail_at = self._ha._managedCookie.get_cookie_value(
                    'GMAIL_AT')
                if helper_str.is_none_or_empty(gmail_at):
                    self._logger.error("Get field 'GMAIL_AT' failed")
                    return res

        self._ha._managedCookie.add_cookies('google.com',
                                            'GMAIL_AT={}'.format(gmail_at))

        url = 'https://mail.google.com/mail/u/0/?ui=html&zy=e'
        postdata = 'at={}'.format(gmail_at)
        html, redir = self._ha.getstring_with_reurl(url,
                                                    req_data=postdata,
                                                    headers="""
            accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            accept-encoding: gzip, deflate, br
            accept-language: zh-CN,zh;q=0.9
            cache-control: no-cache
            content-type: application/x-www-form-urlencoded
            origin: https://mail.google.com
            pragma: no-cache
            referer: https://mail.google.com/mail/u/0/
            upgrade-insecure-requests: 1
            user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
                                                    )
        if helper_str.is_none_or_empty(redir):
            self._logger.error("Jump to basic page failed")
            return res

        # Everything before the query string is the base URL of the
        # basic-view pages.
        redirs: list = redir.split('?')
        if redirs is None or len(redirs) < 1:
            self._logger.error("Get url root of basic page failed")
            return res
        self._hpurlbase = redirs[0]
        if helper_str.is_none_or_empty(self._hpurlbase):
            self._logger.error("Get url root of basic page failed2")
            return res
        self._hpurlbase = self._hpurlbase.strip().strip('/')

        self._homepage = html

        res = True

    except Exception:
        self._logger.error(traceback.format_exc())
    return res
def _get_task_update_sql(self, src: dict, new: Task):
    """Build the UPDATE statement and parameter list for a task row.

    Columns written:

    * ``taskstatus`` - only when ``self.__modify_task_state`` allows it;
    * ``cmdid`` - only when it differs from the stored value;
    * ``tasktype`` - always;
    * ``cookie`` - when the new task carries one;
    * ``url``, ``host``, ``account``, ``password``, ``phone``,
      ``globaltelcode`` - only when the stored value is empty and the new
      task carries one;
    * ``forcedownload`` - when set on the new task (stored as int).

    Fixes over the previous version:

    * the ``cmdid`` fragment was emitted as ``cmdid`` without ``=?, ``,
      producing invalid SQL (``cmdidtasktype=?``) with one parameter too
      many;
    * the ``phone`` column was guarded by, and filled with,
      ``new.password`` instead of ``new.phone``.

    :param src: the stored task row, indexed by column name.
    :param new: the task whose values are written.
    :return: ``(sql, params)`` - the statement with ``?`` placeholders and
        the values in placeholder order; the row is addressed by
        ``batchid`` and ``taskid``.
    """
    assignments: list = []
    params: list = []

    def _set(column: str, value) -> None:
        # Keep the SQL fragment and its parameter strictly in step.
        assignments.append(column + "=?")
        params.append(value)

    # Update the download status only under specific conditions.
    if self.__modify_task_state(src["taskstatus"], new):
        _set("taskstatus", new.taskstatus.value)

    # A different cmdid means the command must be refreshed too.
    if src["cmdid"] != new.cmd_id:
        _set("cmdid", new.cmd_id)

    _set("tasktype", new.tasktype.value)

    if not helper_str.is_none_or_empty(new.cookie):
        _set("cookie", new.cookie)

    # These columns are only filled in, never overwritten.
    fill_only_columns = ("url", "host", "account", "password", "phone",
                         "globaltelcode")
    for column in fill_only_columns:
        if not helper_str.is_none_or_empty(src[column]):
            continue
        value = getattr(new, column)
        if not helper_str.is_none_or_empty(value):
            _set(column, value)

    if not helper_str.is_none_or_empty(new.forcedownload):
        _set("forcedownload", int(new.forcedownload))

    sql = "UPDATE task SET {} WHERE batchid=? AND taskid=?".format(
        ", ".join(assignments))
    params.append(new.batchid)
    params.append(new.taskid)
    return (sql, params)
def get_display_name(self):
    """Return the url prefixed by a space, or '' when no url is set."""
    if helper_str.is_none_or_empty(self.url):
        return ''
    return " {}".format(self.url)
def _get_mails(self, folder: Folder) -> iter:
    """Yield the mails found in the given folder, page by page.

    For every row of each listing page this resolves the mail id and
    subject, loads the "show original" page to read the send time and the
    download link, then yields an ``EML`` whose ``owner``, ``sendtime``,
    ``downloadurl``, ``stream_length`` and ``io_stream`` are set.

    Errors are logged rather than raised: a failure on one mail skips that
    mail, and a failure on a listing page stops the paging.

    Args:
        folder: The folder to enumerate; ``folderurl`` and ``name`` are read.

    Yields:
        One ``EML`` per mail that could be resolved.
    """
    # Headers for listing pages.
    list_headers = """
        accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
        accept-encoding: gzip, deflate, br
        accept-language: zh-CN,zh;q=0.9
        cache-control: no-cache
        pragma: no-cache
        upgrade-insecure-requests: 1
        user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
    # Same headers plus a referer; "{}" is filled in per request.
    referer_headers = """
        accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
        accept-encoding: gzip, deflate, br
        accept-language: zh-CN,zh;q=0.9
        cache-control: no-cache
        pragma: no-cache
        referer: {}
        upgrade-insecure-requests: 1
        user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"""
    try:
        if folder is None:
            self._logger.error("Given folder is None")
            return

        urlnext: str = folder.folderurl
        next_: bool = True
        maillastidx: int = 0
        mailidx: int = 0
        while next_:
            # Assume this is the last page until _get_next_page_url says
            # otherwise. Resetting the flag before the request means an
            # exception while fetching or parsing the page ends the loop
            # instead of retrying the same URL forever.
            next_ = False
            try:
                html = self._ha.getstring(urlnext, headers=list_headers)
                hdoc = etree.HTML(html, etree.HTMLParser())
                if hdoc is None:
                    self._logger.info("No mail find in folder '{}'".format(
                        folder.name))
                    return

                # Is there a next page?
                next_, urlnext, mailidx, maillastidx = self._get_next_page_url(
                    hdoc, folder, mailidx, maillastidx)
                self._logger.info("Enter folder '{}' {}-{}".format(
                    folder.name, maillastidx,
                    str(mailidx) if mailidx > maillastidx else ""))

                # Parse the mail list.
                mailnodes = hdoc.xpath('.//tr[@bgcolor]')
                if not mailnodes:
                    self._logger.info(
                        "No mail found in folder '{}'".format(folder.name))
                    continue

                for mailnode in mailnodes:
                    try:
                        if mailnode is None:
                            continue

                        # Read/unread state, taken from the row colour.
                        # NOTE(review): isread is computed but never stored
                        # on the mail - confirm whether EML should carry it.
                        isread: bool = False
                        abgcolor = mailnode.xpath('./@bgcolor')
                        if abgcolor:
                            strbgcolor = str(abgcolor[0]).strip()
                            if not helper_str.is_none_or_empty(
                                    strbgcolor) and strbgcolor == '#E8EEF7':
                                isread = True

                        # Link to the mail detail page,
                        # e.g. href="?&th=1671b647a79e227b&v=c"
                        xas = mailnode.xpath('.//td/a[@href]')
                        if not xas:
                            self._logger.error(
                                "Get mail content url failed, skip this mail")
                            continue
                        xa = xas[0]
                        xhref = xa.xpath('./@href')
                        if not xhref:
                            self._logger.error(
                                "Get mail content url failed, skip this mail1")
                            continue
                        strhref = str(xhref[0]).strip()
                        if helper_str.is_none_or_empty(strhref):
                            self._logger.error(
                                "Get mail content url failed, skip this mail2")
                            continue

                        mailurl = "{}/{}".format(self._hpurlbase, strhref)
                        succ, mailid = helper_str.substringif(
                            mailurl, 'th=', '&')
                        if not succ or helper_str.is_none_or_empty(mailid):
                            self._logger.error(
                                "Get mail id failed, skip this mail")
                            continue

                        xsubjs = xa.xpath('.//text()')
                        if not xsubjs:
                            self._logger.error(
                                "Get mail subject failed: {}".format(mailid))
                            continue
                        subj = ''.join(str(xs) for xs in xsubjs)
                        if '-' in subj:
                            # Keep the text before the first '-'. Slicing
                            # at idx and stripping avoids the old
                            # subj[0:idx - 1], which became subj[0:-1] when
                            # '-' was the first character.
                            subj = subj[:subj.find('-')].strip()

                        # Open the "show original" view of the mail.
                        urlsrc = "{}/?&th={}&v=om".format(
                            self._hpurlbase, mailid)
                        html = self._ha.getstring(
                            urlsrc, headers=referer_headers.format(mailurl))
                        if helper_str.is_none_or_empty(html):
                            self._logger.error(
                                "Get source mail page failed: {}".format(
                                    urlsrc))
                            continue
                        hsrc = etree.HTML(html, etree.HTMLParser())
                        if hsrc is None:
                            self._logger.error(
                                "Parse mail source document failed: {}".format(
                                    urlsrc))
                            continue

                        # Send time; stays at the epoch when it cannot be
                        # determined.
                        sendtime = datetime.datetime(1970, 1, 1, 0, 0, 0)
                        strsendtime = None
                        m = self._resendtime.search(html)
                        if m is None:
                            self._logger.warn(
                                "Get mail sendtime failed: {} {}".format(
                                    mailid, subj))
                        else:
                            strsendtime = m.group('date').strip()
                        if not helper_str.is_none_or_empty(strsendtime):
                            parsed = None
                            try:
                                # dateparser.parse returns None (it does not
                                # raise) when it cannot parse the text.
                                parsed = dateparser.parse(strsendtime)
                            except Exception:
                                parsed = None
                            if parsed is None:
                                try:
                                    # Keep the datetime itself; the original
                                    # chained .strftime() and stored a str.
                                    parsed = datetime.datetime.strptime(
                                        strsendtime,
                                        '%a, %d %b %Y %H:%M:%S %z')
                                except Exception:
                                    parsed = None
                            if parsed is None:
                                self._logger.warn(
                                    "Get mail sendtime failed: {} {}".format(
                                        mailid, subj))
                            else:
                                sendtime = parsed

                        # Find the "download mail" button.
                        xbtns = hsrc.xpath('.//a[@class="download-buttons"]')
                        if not xbtns:
                            self._logger.error(
                                "Get mail download url failed: {} {}".format(
                                    mailid, subj))
                            continue
                        xdurls = xbtns[0].xpath('.//@href')
                        if not xdurls:
                            self._logger.error(
                                "Get mail download url failed: {} {}".format(
                                    mailid, subj))
                            continue
                        downurl = "{}/{}".format(
                            self._hpurlbase,
                            str(xdurls[0]).strip().lstrip('/'))

                        # Original author's note: using uname_str here is
                        # problematic; a fixed value should be used instead.
                        mail = EML(self._clientid, self.task, self.uname_str,
                                   mailid, folder, self.task.apptype)
                        mail.owner = self.uname_str
                        mail.sendtime = sendtime
                        mail.downloadurl = downurl

                        headers = referer_headers.format(urlsrc)
                        resp = self._ha.get_response(mail.downloadurl,
                                                     headers=headers)
                        mail.stream_length = resp.headers.get(
                            'Content-Length', 0)
                        mail.io_stream = ResponseIO(resp)
                        yield mail
                    except Exception:
                        self._logger.error("Parse one mail error: {}".format(
                            traceback.format_exc()))
            except Exception:
                self._logger.error(
                    "Get mails of folder '{}' error: {}".format(
                        folder.name, traceback.format_exc()))
    except Exception:
        self._logger.error("Get mails of folder '{}' error: {}".format(
            folder.name, traceback.format_exc()))