def get_uniqueid(self):
    """Return this object's unique id.

    The id is whatever ``helper_crypto.get_md5_from_str`` produces for the
    value returned by ``self._get_write_lines()``.
    """
    payload = self._get_write_lines()
    return helper_crypto.get_md5_from_str(payload)
def get_all_country(self):
    """Download the GeoNames ``allCountries.zip`` dump, unpack it and emit new records.

    Steps:
      1. Stream the archive to ``self.tmpfile / 'allinfo.zip'``, logging
         progress once per 1 MiB chunk.
      2. Extract it into ``self.tmpfile``.
      3. Iterate the records produced by ``self.__get_geoname`` for the first
         archive member, serialise each record's ``__dict__`` to JSON and
         hash it. Records for which ``self.is_geodata_unique(md5)`` is truthy
         are skipped (treated as already seen); the rest are written with
         ``self.write_text`` and remembered via ``self.store_geodata_unique``.
      4. Delete the extracted file and the archive.

    :return: None
    :raises requests.HTTPError: when the download answers with an error status.
    """
    filename = self.tmpfile / 'allinfo.zip'
    url = 'http://download.geonames.org/export/dump/allCountries.zip'
    headers = {
        'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        'Accept-Encoding': "gzip, deflate",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'Cache-Control': "no-cache",
        'Connection': "keep-alive",
        'Host': "download.geonames.org",
        'Pragma': "no-cache",
        'Referer': "http://download.geonames.org/export/dump/",
        'Upgrade-Insecure-Requests': "1",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
    }

    # --- download ---------------------------------------------------------
    count = 0
    with requests.get(url, headers=headers, stream=True) as r:
        r.raise_for_status()
        # Content-Length is optional (e.g. chunked responses); the total is
        # only used in the progress message, so fall back to '?' instead of
        # crashing on int(None).
        content_length = r.headers.get('content-length')
        if content_length and content_length.strip().isdigit():
            total_length = math.ceil(int(content_length) / (1024 * 1024))
        else:
            total_length = '?'
        with filename.open('wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                count += 1
                self._logger.info(
                    f'Downloading allCountries.zip: {count} Mb/ {total_length} Mb'
                )
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    self._logger.info('Download all city info zip success')

    # --- unzip ------------------------------------------------------------
    self._logger.debug('Start unzip')
    # Context manager guarantees the archive handle is released even when
    # extraction fails.
    with zipfile.ZipFile(filename, 'r') as zip_file:
        zip_file.extractall(self.tmpfile)
        zipinfo = zip_file.namelist()
    self._logger.debug(f'Get unzip file, name:{zipinfo[0]}')

    # --- parse ------------------------------------------------------------
    info_path = self.tmpfile / zipinfo[0]
    for g_data in self.__get_geoname(info_path):
        geo_dict = g_data.__dict__
        # Incremental download: hash the serialised record so that records
        # already emitted can be recognised.
        geostr = json.dumps(geo_dict, ensure_ascii=False)
        geo_md5 = helper_crypto.get_md5_from_str(geostr)
        # Skip records that were already output; doing the duplicate check on
        # the client side offloads that work from the server.
        if self.is_geodata_unique(geo_md5):
            continue
        # New record: output it and remember its unique identifier.
        self.write_text(geostr, self.suffix)
        self.store_geodata_unique(geo_md5)

    # --- cleanup ----------------------------------------------------------
    # The upstream file seems to be refreshed roughly every month, so the
    # downloaded copy is not kept; it is fetched again on the next run.
    # NOTE(review): the 5 s pause before deleting is kept from the original;
    # presumably it gives the OS time to release file handles - confirm.
    time.sleep(5)
    info_path.unlink()
    filename.unlink()
    self._logger.debug('Delete source zip file')
def get_uniqueid(self):
    """Return this object's unique id.

    All items yielded by ``self._get_write_lines()`` are concatenated in
    order and the result is hashed with ``helper_crypto.get_md5_from_str``.
    """
    # str.join builds the text in one pass instead of repeated `+=` copies.
    return helper_crypto.get_md5_from_str("".join(self._get_write_lines()))
def _parse_one_ipwhois(self, ip: str, jcontent: dict, reason) -> IPWhoisData:
    """Parse one ipwhois JSON object (same layout as ipwhois history content).

    :param ip: the IP address the record belongs to; used for the country
        fallback lookup and in log messages.
    :param jcontent: decoded JSON object of the whois record.
    :param reason: passed through unchanged to ``IPWhoisData``.
    :return: the populated ``IPWhoisData``, or ``None`` when ``jcontent`` is
        not a dict or the record could not be constructed. If an error occurs
        after construction, the partially filled object is returned.
    """
    res: IPWhoisData = None
    try:
        if not isinstance(jcontent, dict):
            return res

        handle = jcontent.get("handle")
        ip_ver = jcontent.get("ipVersion")
        allocate_type = jcontent.get("type")
        netname = jcontent.get("name")
        country_code = jcontent.get("country")
        if country_code is None:
            # The mmdb database was modified, so the lookup also returns the
            # organisation and ISP; only the geo part is needed here.
            geo, _org, _isp = self._dbip.get_ip_mmdbinfo(1, ip)
            country_code = geo._country_code

        raw: str = json.dumps(jcontent)
        md5 = helper_crypto.get_md5_from_str(raw)

        # construct obj
        res = IPWhoisData(reason, md5, raw, handle, allocate_type, netname,
                          country_code, ip_ver)

        # last_modified / applicable_from
        for je in jcontent.get("events") or ():
            if "eventAction" not in je or "eventDate" not in je:
                continue
            jea = je["eventAction"]
            jval = je["eventDate"]
            if jea == "last changed":
                res.last_modified = jval
            elif jea == "registration":
                res.applicable_from = jval
            else:
                self._logger.warn(
                    "Unknown eventAction for ipwhois: ip={}, action={}, val={}"
                    .format(ip, jea, jval))

        # remarks: every description line, each terminated by CRLF
        remark_lines = []
        for jr in jcontent.get("remarks") or ():
            jdes = jr.get("description")
            if jdes is None or len(jdes) < 1:
                continue
            remark_lines.extend(jdes)
        remarks = "".join(jd + "\r\n" for jd in remark_lines)
        if remarks != "":
            res.remarks = remarks

        # cidrs
        for jc in jcontent.get("cidr0_cidrs") or ():
            prefix = None
            if "v4prefix" in jc:
                prefix = jc["v4prefix"]
            elif "v6prefix" in jc:
                prefix = jc["v6prefix"]
            length = jc.get("length")
            # Without both parts there is no valid CIDR; skipping avoids
            # storing a bogus "None/<length>" entry.
            if prefix is None or length is None:
                continue
            res.set_cidrs("{}/{}".format(prefix, length))

        # entities
        for jen in jcontent.get("entities") or ():
            en = self._parse_entity(ip, jen)
            if en is None:
                continue
            res.set_entity(en)

    except Exception:
        self._logger.debug(
            "Parse one ipwhois error: ip:{}, error: {}".format(
                ip, traceback.format_exc()))
    return res
def _check_registration(self):
    """Check whether the task's phone number is registered on Baidu Tieba.

    Flow: open the Baidu passport registration page, request an API token,
    then call the ``regphonecheck`` endpoint for ``self.task.phone``. The
    outcome is reported through ``self._write_task_back`` (Registered,
    Not Registered, or a failure when any step raises).

    :return: None
    """
    t = time.strftime('%Y-%m-%d %H:%M:%S')
    ti = int(
        datetime.now(pytz.timezone('Asia/Shanghai')).timestamp() * 1000)
    try:
        # The page body is not used afterwards.
        # NOTE(review): presumably this request is only made to prime the
        # session (cookies) - confirm.
        # The original also assigned `.encoding` on the returned value; that
        # is dropped because the encoding is already passed here and the
        # assignment fails if `getstring` returns a plain str.
        # Header blocks are written one header per line.
        # NOTE(review): presumably the HTTP helper splits on newlines and
        # trims each line - confirm.
        self._ha.getstring(
            'https://passport.baidu.com/v2/?reg&tpl=tb&u=//tieba.baidu.com',
            headers="""
            Host: passport.baidu.com
            Connection: keep-alive
            Pragma: no-cache
            Cache-Control: no-cache
            Upgrade-Insecure-Requests: 1
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36
            Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
            Accept-Encoding: gzip, deflate, br
            Accept-Language: zh-CN,zh;q=0.9
            """,
            encoding='utf-8')

        # Headers shared by the two API calls below.
        api_headers = """
            Host: passport.baidu.com
            Connection: keep-alive
            Pragma: no-cache
            Cache-Control: no-cache
            User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36
            Accept: */*
            Referer: https://passport.baidu.com/v2/?reg&tpl=tb&u=//tieba.baidu.com
            Accept-Encoding: gzip, deflate, br
            Accept-Language: zh-CN,zh;q=0.9
            """

        # gid: upper-cased uuid1 without its first character, with the
        # character at index 13 forced to '4'.
        gid = str(uuid.uuid1()).upper()[1:]
        gid = gid[:13] + '4' + gid[14:]

        # JSONP callback name: fixed prefix plus a random base-36 suffix.
        js = """
            getUniqueId = function(e) {
                return e + Math.floor(2147483648 * Math.random()).toString(36)
            }"""
        ctx = execjs.compile(js)
        callback = ctx.call('getUniqueId', 'bd__cbs__')

        html = self._ha.getstring(
            f'https://passport.baidu.com/v2/api/?getapi&tpl=tb&apiver=v3&tt={ti}&class=regPhone&gid={gid}&app=&traceid=&callback={callback}',
            headers=api_headers)
        token = substring(html, '"token" : "', '"')

        # moonshad: hash of phone + "Moonshadow", then one letter is inserted
        # after the first occurrence of each of o, d, a, h, s, n, m
        # (o->ow, d->do, a->ad, h->ha, s->sh, n->ns, m->mo). Placeholders are
        # used first so an inserted letter is never matched by a later step.
        moonshad = helper_crypto.get_md5_from_str(self.task.phone +
                                                  "Moonshadow")
        for letter, marker in (('o', '~'), ('d', '!'), ('a', '@'),
                               ('h', '#'), ('s', '$'), ('n', '%'),
                               ('m', '^')):
            moonshad = moonshad.replace(letter, letter + marker, 1)
        for marker, inserted in (('~', 'w'), ('!', 'o'), ('@', 'd'),
                                 ('#', 'a'), ('$', 'h'), ('%', 's'),
                                 ('^', 'n')):
            moonshad = moonshad.replace(marker, inserted)

        callback = ctx.call('getUniqueId', 'bd__cbs__')
        url = f"https://passport.baidu.com/v2/?regphonecheck&token={token}&tpl=tb&apiver=v3&tt={ti}&phone={self.task.phone}&moonshad={moonshad}&countrycode=&gid={gid}&exchange=0&isexchangeable=1&action=reg&traceid=&callback={callback}"
        response = self._ha.get_response(url, headers=api_headers)
        response.encoding = 'utf-8'

        # The code "400001" in the reply is treated as "already registered".
        if '"400001"' in response.text:
            self._write_task_back(ECommandStatus.Succeed, 'Registered', t,
                                  EBackResult.Registerd)
        else:
            self._write_task_back(ECommandStatus.Succeed, 'Not Registered',
                                  t, EBackResult.UnRegisterd)

    except Exception:
        # Message previously named the wrong service ("Uber").
        self._logger.error('Baidu Tieba check registration fail: {}'.format(
            traceback.format_exc()))
        self._write_task_back(ECommandStatus.Failed,
                              'Check registration fail', t,
                              EBackResult.CheckRegisterdFail)
def get_uniqueid(self):
    """Return this object's unique id.

    The id is ``helper_crypto.get_md5_from_str`` applied to the app type,
    user id and order id formatted back to back.
    """
    identity = "{}{}{}".format(self._apptype, self._userid, self._orderid)
    return helper_crypto.get_md5_from_str(identity)
def get_uniqueid(self):
    """Return this object's unique id.

    The id is ``helper_crypto.get_md5_from_str`` applied to the resource id,
    the task's platform and the url formatted back to back.
    """
    identity = "{}{}{}".format(self.resourceid, self._task.platform,
                               self._url)
    return helper_crypto.get_md5_from_str(identity)
def get_uniqueid(self):
    """Return this object's unique id.

    The id is ``helper_crypto.get_md5_from_str`` applied to the parent
    object, the task's platform and the url formatted back to back.
    """
    identity = "{}{}{}".format(self._parentobj, self._task.platform,
                               self.url)
    return helper_crypto.get_md5_from_str(identity)
def get_uniqueid(self):
    """Return this object's unique id.

    The id is ``helper_crypto.get_md5_from_str`` applied to the user id and
    message id formatted back to back.
    """
    identity = "{}{}".format(self._userid, self._messageid)
    return helper_crypto.get_md5_from_str(identity)
def _parse_chatlog(self, msg: dict, threadtype: str, ownerid: str) -> iter:
    """Turn one JSON-decoded chat message into an ICHATLOG_ONE plus resources.

    This is a generator. Every resource produced by ``self._fetch_resources``
    for the message's sticker and attachments is attached to the chat log and
    yielded as soon as it is available; the ICHATLOG_ONE itself is yielded
    last. Nothing is yielded when the message lacks a sender id, a message id
    or a ``__typename``. Any exception is logged and ends the generator.

    :param msg: the message mapping decoded from JSON.
    :param threadtype: a field decoded from the same JSON, presumably the
        conversation type; anything other than ``"ONE_TO_ONE"`` (and not
        ``None``) is treated as a group chat.
    :param ownerid: passed through to ``ICHATLOG_ONE``.
    """

    def _flag(value) -> str:
        """Normalise a flag given as bool or as 'true'/'false' text."""
        return str(value).strip().lower()

    try:
        if msg is None:
            self._logger.error(
                "Invalid msg map object for parseing chat log: {}".format(
                    msg))
            return

        if ('message_sender' not in msg
                or 'id' not in msg['message_sender']
                or 'message_id' not in msg):
            return

        # 0 = private chat, 1 = group chat
        chattype: int = 0
        if threadtype is not None and threadtype != "ONE_TO_ONE":
            chattype = 1

        # message type (picture, video, ...)
        typename = msg.get('__typename')
        if typename is None or typename == '':
            return
        msgtype: str = self._judge_message_type(typename)

        # send time; fall back to "now" when the timestamp cannot be used
        sendtime: str = None
        timestamp_precise = None
        if 'timestamp_precise' in msg:
            try:
                timestamp_precise = int(msg['timestamp_precise'])
                sendtime = helper_time.timespan_to_datestr(timestamp_precise)
            except Exception:
                sendtime = helper_time.timespan_to_datestr(
                    helper_time.ts_since_1970())

        # build the chat log object
        ctg = ICHATLOG_ONE(self.task, self._appcfg._apptype, self._userid,
                           msgtype, ownerid, chattype, msg['message_id'],
                           msg['message_sender']['id'], sendtime)
        ctg.remarks = timestamp_precise

        # read / unread. The flag may arrive as text or as a JSON boolean;
        # calling .strip() on a bool would abort the whole message.
        if 'unread' in msg:
            ctg.isread = 0 if _flag(msg['unread']) == 'true' else 1

        # sticker resource
        sticker = msg.get('sticker')
        if sticker is not None and 'url' in sticker and 'label' in sticker:
            url = sticker['url'].replace('\\', '').rstrip()
            if 'id' in sticker:
                rscid = sticker['id']
            else:
                rscid = helper_crypto.get_md5_from_str(url)
            for rsc in self._fetch_resources(url, EResourceType.Picture,
                                             rscid):
                ctg.append_resource(rsc)
                yield rsc

        # snippet (system message description)
        if 'snippet' in msg:
            ctg.content += msg['snippet']

        # answered: whether the other side responded
        if 'answered' in msg:
            ctg.answered = 0 if _flag(msg['answered']) == 'false' else 1

        # blob_attachments
        for blob in msg.get('blob_attachments') or ():
            if '__typename' not in blob:
                continue
            # attachment url and resource type
            url, rsctype = self._get_attachments_type_and_url(blob)
            if not isinstance(url, str) or url == "":
                self._logger.warn(
                    "Get attachment url failed: {}".format(blob))
                continue

            rscid: str = None
            if 'legacy_attachment_id' in blob:
                rscid = blob['legacy_attachment_id']
            elif 'message_file_fbid' in blob:
                rscid = blob['message_file_fbid']
            if not isinstance(rscid, str) or rscid == "":
                # no usable id in the blob: derive one from the url
                rscid = helper_crypto.get_md5_from_str(url)

            # attachment file name (may be absent)
            finame = blob.get('filename')

            # download
            for rsc in self._fetch_resources(url, rsctype, rscid, finame):
                ctg.append_resource(rsc)
                yield rsc

        # extensible_attachment: optional playable video and/or image
        ext = msg.get('extensible_attachment')
        if ext is not None and 'legacy_attachment_id' in ext:
            resourceid = ext['legacy_attachment_id']
            # `or {}` tolerates JSON nulls at any nesting level
            story = ext.get('story_attachment') or {}
            jmedia = story.get('media') or {}
            if (_flag(jmedia.get('is_playable')) == 'true'
                    and 'playable_url' in jmedia):
                url = jmedia['playable_url'].replace('\\', '').rstrip()
                for rsc in self._fetch_resources(url, EResourceType.Video,
                                                 resourceid):
                    ctg.append_resource(rsc)
                    yield rsc
            image = jmedia.get('image') or {}
            if 'uri' in image:
                url = image['uri'].replace('\\', '').rstrip()
                for rsc in self._fetch_resources(url, EResourceType.Picture,
                                                 resourceid):
                    ctg.append_resource(rsc)
                    yield rsc

        # message text
        message = msg.get('message')
        if message is not None and 'text' in message:
            text = message['text']
            if text is not None and text != '':
                ctg.content += text

        yield ctg

    except Exception:
        self._logger.error(
            "Parse one chatlog msg error:\nmsg:{}\nerror:{}".format(
                msg, traceback.format_exc()))
def get_uniqueid(self):
    """Return the unique identifier of the current data.

    Subclass implementations return the id used for de-duplicating data and
    for incremental download. Here it is ``helper_crypto.get_md5_from_str``
    applied to ``self.get_write_lines()``.
    """
    payload = self.get_write_lines()
    return helper_crypto.get_md5_from_str(payload)
def get_uniqueid(self) -> str:
    """Return this object's unique id.

    The id is ``helper_crypto.get_md5_from_str`` applied to the user id,
    contact id and app type formatted back to back.
    """
    identity = "{}{}{}".format(self._userid, self._contactid, self._apptype)
    return helper_crypto.get_md5_from_str(identity)