def httpExists(url):
    import httplib
    import urlparse
    host, path = urlparse.urlsplit(url)[1:3]
    if ":" in host:
        host, port = host.split(":", 1)
        try:
            port = int(port)
        except ValueError:
            return False, url
    else:
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            return True, url
        elif resp.status == 302:
            new_url = urlparse.urljoin(url, resp.getheader("location", ""))
            return httpExists(new_url)
        else:
            return False, url
    except Exception:
        return False, url
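# Usage sketch for httpExists above ("example.com" is illustrative, not
# from the original): the function returns a (reachable, final_url)
# pair and follows 302 redirects recursively via urljoin.
if __name__ == "__main__":
    ok, final_url = httpExists("http://example.com/")
    print ok, final_url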
def feature_extract(url_input):
    Feature = {}
    tokens_words = re.split(r'\W+', url_input)
    host = urlparse.urljoin(url_input, '/')
    path = urlparse.urlparse(url_input).path
    Feature['URL'] = url_input
    Feature['rank_host'], Feature['rank_country'] = sitepopularity(host)
    Feature['host'] = host
    Feature['Length_of_url'] = len(url_input)
    Feature['Length_of_host'] = len(host)
    Feature['No_of_dots'] = url_input.count('.')
    Feature['sec_sen_word_cnt'] = Security_sensitive(tokens_words)
    Feature['IPaddress_presence'] = Check_IPaddress(tokens_words)
    Feature['avg_token_length'], Feature['token_count'], Feature['largest_token'] = Tokenise(url_input)
    Feature['avg_domain_token_length'], Feature['domain_token_count'], Feature['largest_domain'] = Tokenise(host)
    Feature['avg_path_token'], Feature['path_token_count'], Feature['largest_path'] = Tokenise(path)
    Feature['ASNno'] = getASN(host)
    Feature['safebrowsing'] = safebrowsing(url_input)
    Feature['numTld'] = numTld(url_input)
    Feature['numPunctuation'] = numPunctuation(url_input)
    return Feature
def _extracturls(self):
    #print "Extract URLs"
    urls = []
    htmlsrc, charset, parenturl = self.htmlSrcTuple
    if htmlsrc is not None:
        resulturls = []
        urlExtractor = ExtractLinks(resulturls)
        try:
            if charset is None:
                urlExtractor.feed(htmlsrc)
            else:
                urlExtractor.feed(htmlsrc.decode(charset))
        except HTMLParser.HTMLParseError:
            pass
        try:
            urlExtractor.reset()  # I think close needs special treatment .close()
        except HTMLParser.HTMLParseError:
            urlExtractor.reset()
        # This piece of code turns the URIs into full URLs by joining the
        # parenturl with the network-location-free URLs extracted.
        for i in xrange(len(resulturls)):  # replacing range() for performance reasons
            urlres = urlparse.urlparse(resulturls[i], "http")
            if urlres.netloc == "":
                resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
        urls.extend(resulturls)
    return urls
def handle_starttag(self, tag, attrs):
    # print(tag)
    if tag == 'a':
        for (attribute, value) in attrs:
            if attribute == 'href':
                # Whenever we come across a relative URL, the base
                # (home page) URL needs to be stuck onto it.
                url = urlparse.urljoin(self.base_url, value)
                self.links.add(url)
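# Hedged sketch of driving a link-extracting parser like the handler
# above. LinkParser is a hypothetical class name; it assumes the handler
# keeps a base_url and a links set, as handle_starttag expects.
import HTMLParser
import urlparse

class LinkParser(HTMLParser.HTMLParser):
    def __init__(self, base_url):
        HTMLParser.HTMLParser.__init__(self)
        self.base_url = base_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for (attribute, value) in attrs:
                if attribute == 'href':
                    self.links.add(urlparse.urljoin(self.base_url, value))

parser = LinkParser('http://example.com/a/')
parser.feed('<a href="b.html">b</a> <a href="/c.html">c</a>')
print parser.links  # {'http://example.com/a/b.html', 'http://example.com/c.html'} (order may vary)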
def convt2compurl(self, base_url, urls):
    import re
    import urlparse
    if isinstance(urls, (tuple, list)):
        newurllist = []
        for url in urls:
            url = re.sub(r'^\s+', '', url)
            url = re.sub(r'\s+$', '', url)
            newurl = urlparse.urljoin(base_url, url)
            newurl = self.normalize_url(newurl)
            newurllist.append(newurl)
        return newurllist
    else:
        # Bug fix: the original referenced an undefined `url` here and
        # then joined the unstripped `urls`; strip the single URL string
        # that was passed in and join that.
        url = re.sub(r'^\s+', '', urls)
        url = re.sub(r'\s+$', '', url)
        newurl = urlparse.urljoin(base_url, url)
        return self.normalize_url(newurl)
def standardize_url(self, base_url, raw_url):
    if raw_url is None or raw_url == "":
        return raw_url
    import urlparse
    url = urlparse.urljoin(base_url, raw_url)
    #if url.startswith("http://www") or url.startswith("wwww") or url.startswith("https://www"):
    #    pass
    #else:
    #    url = url.replace("//", "//www.", 1)
    return url
def getPicInfo(file):
    res = []
    # Read the photo's metadata.
    path = os.path.join(extract_path, file)
    fd = open(path, 'rb')
    # Get the photo's EXIF tag set.
    tags = exifread.process_file(fd)
    fd.close()
    # Get the creation time from the photo's tags.
    if 'Image DateTime' in tags:
        date = str(tags['Image DateTime'])
        # Latitude: after stripping the brackets and splitting the "/"
        # out, the list holds [degrees, minutes, sec_numerator, sec_denominator].
        LatRef = tags["GPS GPSLatitudeRef"].printable
        Lat = tags["GPS GPSLatitude"].printable[1:-1].replace(" ", "").replace("/", ",").split(",")
        Lat = float(Lat[0]) + float(Lat[1]) / 60 + float(Lat[2]) / float(Lat[3]) / 3600
        if LatRef != "N":
            Lat = Lat * (-1)
        # Longitude.
        LonRef = tags["GPS GPSLongitudeRef"].printable
        Lon = tags["GPS GPSLongitude"].printable[1:-1].replace(" ", "").replace("/", ",").split(",")
        Lon = float(Lon[0]) + float(Lon[1]) / 60 + float(Lon[2]) / float(Lon[3]) / 3600
        if LonRef != "E":
            Lon = Lon * (-1)
        date1 = date.split(' ')
        date1[0] = date1[0].replace(':', '-')
        date = ' '.join(date1)
        # Rename the file after its timestamp, suffixing a counter on collisions.
        new_name = date.replace(':', '').replace(' ', '_') + os.path.splitext(file)[1]
        tot = 1
        new_upload_path = os.path.join(extract_path, new_name)
        while os.path.exists(new_upload_path):
            new_name = date.replace(':', '').replace(' ', '_') + '_' + str(tot) + os.path.splitext(file)[1]
            tot += 1
            new_upload_path = os.path.join(extract_path, new_name)
        # os.rename needs full paths, e.g. ../../1.png -> ../../2018_0912.png.
        os.rename(path, new_upload_path)
        # Build the URL under which the image is served.
        pathName = urllib.pathname2url(new_upload_path)
        path = urlparse.urljoin(request.url[:pos], pathName)
        # File name without its extension.
        index = file.rfind('.')
        # Decode so Chinese file names survive.
        describertion = file[:index].decode('GB2312')
        res.append({
            'time': date,
            'describertion': describertion,
            'path': path,
            'pos': [Lat, Lon]
        })
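# Hedged helper sketch of the DMS-to-decimal conversion used in
# getPicInfo above: an exifread GPS tag prints as e.g. "[39, 54, 123/10]",
# which the code splits into [deg, min, sec_numerator, sec_denominator].
def dms_to_decimal(parts, ref, positive_ref):
    deg, minutes, num, den = [float(p) for p in parts]
    value = deg + minutes / 60 + (num / den) / 3600
    return value if ref == positive_ref else -value

print dms_to_decimal(["39", "54", "123", "10"], "N", "N")  # ~39.9034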
def absoluteUrl(baseUrl, url):
    if (not url or url[:7].lower() == "http://"
            or url[:8].lower() == "https://"):
        return url
    i = url.find('#')
    if i >= 0:
        url = url[:i]
    if not url:
        return baseUrl
    if urljoin:
        return urljoin(baseUrl, url)
    return urlparse.urljoin(baseUrl, url)
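# Quick illustration of absoluteUrl above (example.com is illustrative):
# absolute URLs pass through untouched, fragments are stripped before
# joining, and a pure-fragment reference resolves to the base itself.
print absoluteUrl("http://example.com/a/", "b.html#sec2")    # http://example.com/a/b.html
print absoluteUrl("http://example.com/a/", "http://other/")  # http://other/
print absoluteUrl("http://example.com/a/", "#sec2")          # http://example.com/a/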
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute('href')
        except StaleElementReferenceException:
            continue
        if href is None:
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith('http'):
            continue
        # Keep only links whose public suffix + 1 matches the page's,
        # i.e. links that stay on the same registered domain.
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
def handle_starttag(self, tag, attrs):
    # We are looking for the beginning of a link. Links normally look
    # like <a href="www.someurl.com"></a>
    if tag == 'a':
        for (key, value) in attrs:
            if key == 'href':
                # We are grabbing the new URL and adding the base URL
                # to it. For example: www.netinstructions.com is the
                # base and somepage.html is the new (relative) URL.
                # Combining the two yields an absolute URL like:
                # www.netinstructions.com/somepage.html
                newUrl = urlparse.urljoin(self.baseUrl, value)
                # And add it to our collection of links:
                self.links = self.links + [newUrl]
def _extract_urls(self, node_list):
    final_nodes = []
    final_urls = []
    in_web_page = is_web_url(self.url)
    if in_web_page:
        root_scheme = urlparse.urlparse(self.url).scheme
    else:
        root_scheme = None
    for node in node_list:
        url = node.get('src')
        if in_web_page:
            url = urlparse.urljoin(self.url, url)
        else:
            folder = os.path.dirname(self.url)
            url = os.path.abspath(os.path.join(folder, url))
        final_nodes.append(node)
        final_urls.append(url)
    return zip(final_nodes, final_urls)
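# Hedged standalone sketch of the resolution rule _extract_urls applies:
# join src against the page URL when the root is a web URL, otherwise
# resolve it against the containing folder on disk. The startswith
# check stands in for the is_web_url() helper, which is not shown here.
import os
import urlparse

def resolve_src(root, src):
    if root.startswith(("http://", "https://")):
        return urlparse.urljoin(root, src)
    return os.path.abspath(os.path.join(os.path.dirname(root), src))

print resolve_src("http://example.com/page.html", "img/a.png")  # http://example.com/img/a.png
print resolve_src("/home/user/page.html", "img/a.png")          # /home/user/img/a.png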
def extract_next_links(rawDataObj):
    outputLinks = []
    '''
    rawDataObj is an object of type UrlResponse declared at L20-30
    datamodel/search/server_datamodel.py
    the return of this function should be a list of urls in their absolute form
    Validation of link via is_valid function is done later (see line 42).
    It is not required to remove duplicates that have already been downloaded.
    The frontier takes care of that.

    Suggested library: lxml
    '''
    url = rawDataObj.url
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "lxml")
    for link in soup.find_all('a'):
        obtained_link = link.get('href')
        # Check for invalid characters and None type.
        if (obtained_link is not None) and is_ascii(obtained_link):
            # Skip mailto links and calendar pages (a crawler trap).
            if ('mailto' not in str(obtained_link)) and ("calendar" not in obtained_link):
                http_url = get_absolute(url, obtained_link)
                obtained_link = urlparse.urljoin(http_url, obtained_link)
                # Strip query parameters from the link.
                if '?' in obtained_link:
                    position = obtained_link.rfind('?')
                    obtained_link = obtained_link[:position]
                outputLinks.append(obtained_link)
    count_subdomain(outputLinks)
    links_count = len(outputLinks)
    check_max_outlinks(links_count, url)
    return outputLinks
def urijoin(base_uri, path):
    if is_url(base_uri):
        return urlparse.urljoin(base_uri, path)
    else:
        return os.path.normpath(os.path.join(base_uri, path.strip('/')))
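# Illustration of urijoin above; is_url (not shown) is assumed to report
# whether the base is a URL rather than a filesystem path.
#   urijoin("http://example.com/data/", "a/b")  -> "http://example.com/data/a/b"
#   urijoin("/var/data", "/a/b/")               -> "/var/data/a/b"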
def handle_starttag(self, tag, attrs):
    if tag == 'a':
        for (attribute, value) in attrs:
            if attribute == 'href':
                url = urlparse.urljoin(self.base_URL, value)
                self.links.add(url)
def o(self, data, puredata=0, force=0):
    if self.abbr_data is not None:
        self.abbr_data += data
    if not self.quiet:
        if options.google_doc:
            # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
            lstripped_data = data.lstrip()
            if self.drop_white_space and not (self.pre or self.code):
                data = lstripped_data
            if lstripped_data != '':
                self.drop_white_space = 0
        if puredata and not self.pre:
            # data = re.sub('\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force:
            return
        if self.startpre:
            #self.out(" :")  #TODO: not output when already one there
            self.startpre = 0
        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "
        if self.pre:
            bq += " "
            data = data.replace("\n", "\n" + bq)
        if self.start:
            self.space = 0
            self.p_p = 0
            self.start = 0
        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0
        if self.p_p:
            self.out((self.br_toggle + '\n' + bq) * self.p_p)
            self.space = 0
            self.br_toggle = ''
        if self.space:
            if not self.lastWasNL:
                self.out(' ')
            self.space = 0
        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end":
                self.out("\n")
            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out(" [" + str(link['count']) + "]: "
                             + urlparse.urljoin(self.baseurl, link['href']))
                    if has_key(link, 'title'):
                        self.out(" (" + link['title'] + ")")
                    self.out("\n")
                else:
                    newa.append(link)
            if self.a != newa:
                self.out("\n")  # Don't need an extra line when nothing was done.
            self.a = newa
        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")
        self.p_p = 0
        self.out(data)
        self.outcount += 1
def __real_update_pkg_filter(self, pkg_id, fm):
    import sqlite3
    import urllib2
    import urlparse
    import bz2

    if pkg_id not in fm.pkg_filters_conf.keys():
        return

    try:
        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_DOWNLOADING
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        url = fm.pkg_filters_conf[pkg_id]["update_url"]
        pkg_info = json.load(urllib2.urlopen(url))
        orig_t = fm.pkg_filters_conf[pkg_id]["pkg_info"]["metadata"]["orig-timestamp"]
        release_n = fm.pkg_filters_conf[pkg_id]["pkg_info"]["metadata"]["release-number"]
        on_server_orig_t = pkg_info["metadata"]["orig-timestamp"]
        on_server_release_n = pkg_info["metadata"]["release-number"]

        # If the base snapshot on the server changed, re-download the
        # whole package instead of patching.
        if orig_t != on_server_orig_t:
            reactor.callInThread(self.__download_new_pkg, pkg_id, url, self)
            return

        # Make sure every intermediate diff is available; otherwise
        # fall back to a full download.
        force_download = False
        for x in range(int(release_n) + 1, int(on_server_release_n) + 1):
            if "diff-%s-%s.bz2" % (orig_t, x) not in pkg_info["diffs"]:
                force_download = True
                break

        if force_download:
            reactor.callInThread(self.__download_new_pkg, pkg_id, url, self)
            return

        patches = []
        for x in range(int(release_n) + 1, int(on_server_release_n) + 1):
            patches.append(["diff-%s-%s.bz2" % (orig_t, x),
                            urlparse.urljoin(url, "diff-%s-%s.bz2" % (orig_t, x))])

        dest_patch = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                  "%s.update-patch" % (pkg_id))
        if os.path.exists(dest_patch):
            os.unlink(dest_patch)

        dest_patch_fd = open(dest_patch, "w")
        lines_counted = 0
        total_diffs = len(patches)
        downl_diffs = 0

        # Download each diff, decompress it and concatenate the SQL
        # statements into a single patch file.
        for diff_filename, diff_url in patches:
            dest_file = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                     "%s-%s" % (pkg_id, diff_filename))
            if os.path.exists(dest_file):
                os.unlink(dest_file)
            df = open(dest_file, "wb")
            url_x = urllib2.urlopen(diff_url)
            while True:
                x = url_x.read(1024)
                if x != '':
                    df.write(x)
                else:
                    break
            df.close()

            df_uc = bz2.BZ2File(dest_file, "r")
            for line in df_uc.readlines():
                if not line.startswith("#"):
                    dest_patch_fd.write(line)
                    lines_counted += 1
            df_uc.close()
            os.unlink(dest_file)
            downl_diffs += 1
            fm.pkg_filters_conf[pkg_id]["progress"] = (downl_diffs * 100) / total_diffs

        dest_patch_fd.close()
        dest_patch_fd = open(dest_patch, "r")

        if pkg_id in fm.db_pools.keys():
            db = fm.db_pools.pop(pkg_id)
            db.close()

        dest_db = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR, "%s.db" % (pkg_id))
        db_conn = sqlite3.connect(dest_db)
        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_UPDATING
        fm.pkg_filters_conf[pkg_id]["progress"] = 0

        # Apply the patch statement by statement, rolling back on the
        # first failure.
        lines_inserted = 0
        sql = ''
        update_ok = True
        for line in dest_patch_fd.readlines():
            lines_inserted += 1
            sql = sql + line
            if sqlite3.complete_statement(sql):
                c = db_conn.cursor()
                try:
                    c.execute(sql)
                except:
                    db_conn.rollback()
                    update_ok = False
                    break
                sql = ''
            fm.pkg_filters_conf[pkg_id]["progress"] = (lines_inserted * 100) / lines_counted

        if update_ok:
            c = db_conn.cursor()
            c.execute("UPDATE metadata SET value='%s' WHERE key='release-number'"
                      % on_server_release_n)
            db_conn.commit()
            print "UPDATED pkg:%s to version:%s" % (pkg_id, on_server_release_n)

        db_conn.close()
        dest_patch_fd.close()
        os.unlink(dest_patch)

        if update_ok:
            fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_READY
            fm.pkg_filters_conf[pkg_id]["pkg_info"] = pkg_info
            fm.pkg_filters_conf[pkg_id]["progress"] = 0
        else:
            fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_READY_UPDATE_AVAILABLE
            fm.pkg_filters_conf[pkg_id]["progress"] = 0

        fm.db_pools[pkg_id] = adbapi.ConnectionPool('sqlite3', dest_db,
                                                    check_same_thread=False,
                                                    cp_openfun=on_db_connect)
        print "Added to db pool -> %s" % pkg_id
        threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
    except:
        print "Something went wrong updating pkg: %s" % pkg_id
        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_READY_UPDATE_AVAILABLE
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
def __download_new_pkg(self, pkg_id, url, fm):
    import sqlite3
    import urllib2
    import urlparse
    import bz2

    try:
        if pkg_id in fm.db_pools.keys():
            db = fm.db_pools.pop(pkg_id)
            db.close()

        try:
            pkg_info = json.load(urllib2.urlopen(url))
        except:
            fm.pkg_filters_conf.pop(pkg_id)
            threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
            return

        fm.pkg_filters_conf[pkg_id]["pkg_info"] = pkg_info
        base_filename = pkg_info["base"]
        base_url = urlparse.urljoin(url, base_filename)
        dest_file = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                 "%s-%s" % (pkg_id, base_filename))
        dest_db = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR, "%s.db" % (pkg_id))

        if os.path.exists(dest_file):
            os.unlink(dest_file)
        if os.path.exists(dest_db):
            os.unlink(dest_db)

        # Download the compressed base package, tracking progress
        # against the Content-Length header.
        df = open(dest_file, "wb")
        url_x = urllib2.urlopen(base_url)
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        total_len = int(url_x.info().getheaders("Content-Length")[0])
        downl_len = 0
        while True:
            x = url_x.read(1024)
            if x != '':
                df.write(x)
                downl_len += len(x)
                fm.pkg_filters_conf[pkg_id]["progress"] = (downl_len * 100) / total_len
            else:
                break
        df.close()

        # First pass: count the lines so progress can be reported while
        # the SQL statements are executed.
        df_uc_c = bz2.BZ2File(dest_file, "r")
        lines_counted = 0
        for line in df_uc_c.readlines():
            lines_counted += 1
        df_uc_c.close()

        df_uc = bz2.BZ2File(dest_file, "r")
        db_conn = sqlite3.connect(dest_db)
        sql = ''
        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_INSTALLING
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        lines_inserted = 0
        for line in df_uc.readlines():
            lines_inserted += 1
            sql = sql + line
            if sqlite3.complete_statement(sql):
                c = db_conn.cursor()
                try:
                    c.execute(sql)
                except:
                    pass
                sql = ''
            fm.pkg_filters_conf[pkg_id]["progress"] = (lines_inserted * 100) / lines_counted
        db_conn.commit()
        db_conn.close()
        df_uc.close()
        os.unlink(dest_file)

        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_READY
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        fm.db_pools[pkg_id] = adbapi.ConnectionPool('sqlite3', dest_db,
                                                    check_same_thread=False,
                                                    cp_openfun=on_db_connect)
        print "Added to db pool -> %s" % pkg_id
        threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
    except:
        if os.path.exists(dest_file):
            os.unlink(dest_file)
        if os.path.exists(dest_db):
            os.unlink(dest_db)
        fm.pkg_filters_conf[pkg_id]["pkg_info"] = {}
        fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_ERROR_INSTALLING_NEW_BL
        fm.pkg_filters_conf[pkg_id]["progress"] = 0
        threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
# -*- coding: UTF-8 -*-
import sys
import requests
from bs4 import BeautifulSoup
import re
from urlparse import urlparse
from urlparse import urljoin

reload(sys)
sys.setdefaultencoding('utf8')

url = 'http://www.bjkufang.cn/kusou.aspx'
r = requests.get(url)
soup = BeautifulSoup(r.text)
g_data = soup.find_all('div', {'class': 'btnr'})
for links in g_data:
    for link in links.findAll('a', {'target': '_blank'}):
        # urljoin is imported directly above; the original called
        # urlparse.urljoin, which fails once urlparse names the function.
        href = urljoin(url, link.get('href'))
'''
try:
    print item.contents[1].find_all('li','class':'primary'})[0].text
except:
    pass
'''
def onMessage(self, type, args):
    # Assumes `from urlparse import urlparse, urljoin`; the original
    # mixed function-style urlparse(frag) with module-style
    # urlparse.urljoin, which cannot both work with one import.
    if type == 'public':
        args = args.split(' ')
        for frag in args:
            o = urlparse(frag)
            if o.scheme == 'http' or o.scheme == 'https':
                uri = frag
                if self.DEBUG:
                    print('parsing url "' + uri + '"\n')
                localhost = [
                    'http://localhost/', 'http://localhost:80/',
                    'http://localhost:8080/', 'http://127.0.0.1/',
                    'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
                    'https://localhost/', 'https://localhost:80/',
                    'https://localhost:8080/', 'https://127.0.0.1/',
                    'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
                ]
                for s in localhost:
                    if uri.startswith(s):
                        if self.DEBUG:
                            print(uri + ": access for localhost denied")
                        return
                try:
                    redirects = 0
                    while True:
                        if self.DEBUG:
                            print(uri + ": fetching ...")
                        headers = {
                            'Accept': 'text/html',
                            'User-Agent': 'Mozilla/5.0 (fsiBot)'
                        }
                        req = urllib2.Request(uri, headers=headers)
                        u = urllib2.urlopen(req)
                        info = u.info()
                        u.close()
                        if not isinstance(info, list):
                            status = '200'
                        else:
                            status = str(info[1])
                            info = info[0]
                        if status.startswith('3'):
                            # Follow the redirect target relative to the current URI.
                            uri = urljoin(uri, info['Location'])
                        else:
                            break
                        redirects += 1
                        if redirects >= 10:
                            if self.DEBUG:
                                print(uri + ": Too many redirects")
                            return
                    try:
                        mtype = info['content-type']
                    except:
                        if self.DEBUG:
                            print(uri + ": Couldn't get the Content-Type")
                        return
                    if not (('/html' in mtype) or ('/xhtml' in mtype)):
                        if self.DEBUG:
                            print(uri + ": Document isn't HTML")
                        return
                    if self.DEBUG:
                        print(uri + ": opening ...")
                    u = urllib2.urlopen(req)
                    bytes = u.read(262144)
                    if self.DEBUG:
                        print("read: " + bytes)
                    u.close()
                except IOError:
                    if self.DEBUG:
                        print(uri + ": Can't connect")
                    return
                r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
                m = r_title.search(bytes)
                if m:
                    title = m.group(1)
                    if self.DEBUG:
                        print("parsed html, title is: " + title)
                    if len(title) > 200:
                        title = title[:200] + "[...]"

                    def e(m):
                        entity = m.group(0)
                        if entity.startswith('&#x'):
                            cp = int(entity[3:-1], 16)
                            return unichr(cp).encode('utf-8')
                        elif entity.startswith('&#'):
                            cp = int(entity[2:-1])
                            return unichr(cp).encode('utf-8')
                        else:
                            char = name2codepoint[entity[1:-1]]
                            return unichr(char).encode('utf-8')

                    r_entity = re.compile(r'&[A-Za-z0-9#]+;')
                    title = r_entity.sub(e, title)
                    if title:
                        try:
                            title.decode('utf-8')
                        except:
                            try:
                                title = title.decode('iso-8859-1').encode('utf-8')
                            except:
                                title = title.decode('cp1252').encode('utf-8')
                    else:
                        title = '[Title is empty.]'
                        if self.DEBUG:
                            print(uri + ": Title is empty")
                    answer = re.sub(r'\s+', ' ', '[' + o.hostname + '] ' + title)
                    self.sendPublicMessage(answer)
                else:
                    if self.DEBUG:
                        print(uri + ": No title found")
def _create_published_file(tk, context, path, name, version_number, task, comment,
                           published_file_type, created_by_user, created_at,
                           version_entity, sg_fields=None, dry_run=False):
    """
    Creates a publish entity in shotgun given some standard fields.

    :param tk: :class:`~sgtk.Sgtk` instance
    :param context: A :class:`~sgtk.Context` to associate with the publish. This will
        populate the ``task`` and ``entity`` link in Shotgun.
    :param path: The path to the file or sequence we want to publish. If the
        path is a sequence path it will be abstracted so that any sequence
        keys are replaced with their default values.
    :param name: A name, without version number, which helps distinguish
        this publish from other publishes. This is typically used for grouping
        inside of Shotgun so that all the versions of the same "file" can be
        grouped into a cluster. For example, for a Maya publish, where we
        track only the scene name, the name would simply be that: the scene
        name. For something like a render, it could be the scene name, the
        name of the AOV and the name of the render layer.
    :param version_number: The version number of the item we are publishing.
    :param task: Shotgun Task dictionary to associate with publish or ``None``
    :param comment: Comments string to associate with publish
    :param published_file_type: Shotgun publish type dictionary to associate
        with publish
    :param created_by_user: User entity to associate with publish or ``None``
        if current user (via :meth:`sgtk.util.get_current_user`) should be used.
    :param created_at: Timestamp to associate with publish or None for default.
    :param version_entity: Version dictionary to associate with publish or ``None``.
    :param sg_fields: Dictionary of additional data to add to publish.
    :param dry_run: Don't actually create the published file entry. Simply return
        the data dictionary that would be supplied.

    :returns: The result of the shotgun API create method.
    """
    data = {
        "description": comment,
        "name": name,
        "task": task,
        "version_number": version_number,
    }

    # we set the optional additional fields first so we don't allow overwriting the standard parameters
    if sg_fields is None:
        sg_fields = {}
    data.update(sg_fields)

    if created_by_user:
        data["created_by"] = created_by_user
    else:
        # use current user
        sg_user = login.get_current_user(tk)
        if sg_user:
            data["created_by"] = sg_user

    if created_at:
        data["created_at"] = created_at

    published_file_entity_type = get_published_file_entity_type(tk)

    if published_file_type:
        if published_file_entity_type == "PublishedFile":
            data["published_file_type"] = published_file_type
        else:
            # using legacy type TankPublishedFile
            data["tank_type"] = published_file_type

    if version_entity:
        data["version"] = version_entity

    # Determine the value of the link field based on the given context
    if context.project is None:
        # when running toolkit as a standalone plugin, the context may be
        # empty and not contain a project. Publishes are project entities
        # in Shotgun, so we cannot proceed without a project.
        raise DskError("Your context needs to at least have a project set in order to publish.")
    elif context.entity is None:
        # If the context does not have an entity, link it up to the project.
        # This happens for project specific workflows such as editorial
        # workflows, ingest and when running zero config toolkit plugins in
        # a generic project mode.
        data["entity"] = context.project
    else:
        data["entity"] = context.entity

    # set the associated project
    data["project"] = context.project

    # Check if path is a url or a straight file path. Path
    # is assumed to be a url if it has a scheme:
    #
    #     scheme://netloc/path
    #
    path_is_url = False
    res = urlparse(path)
    if res.scheme:
        # handle Windows drive letters - note this adds a limitation
        # but one that is not likely to be a problem as single-character
        # schemes are unlikely!
        if len(res.scheme) > 1 or not res.scheme.isalpha():
            path_is_url = True

    # naming and path logic is different depending on url
    if path_is_url:
        # extract name from url:
        #
        # scheme://hostname.com/path/to/file.ext -> file.ext
        # scheme://hostname.com -> hostname.com
        if res.path:
            # scheme://hostname.com/path/to/file.ext -> file.ext
            data["code"] = res.path.split("/")[-1]
        else:
            # scheme://hostname.com -> hostname.com
            data["code"] = res.netloc

        # make sure that the url is escaped properly, otherwise
        # shotgun might not accept it.
        #
        # for quoting logic, see bugfix here:
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        #
        # note: by applying a safe pattern like this, we guarantee that already quoted paths
        # are not touched, e.g. quote('foo bar') == quote('foo%20bar')
        data["path"] = {
            "url": quote(path, safe="%/:=&?~#+!$,;'@()*[]"),
            "name": data["code"],  # same as publish name
        }
    else:
        # normalize the path to native slashes
        norm_path = ShotgunPath.normalize(path)
        if norm_path != path:
            log.debug("Normalized input path '%s' -> '%s'" % (path, norm_path))
            path = norm_path

        # convert the abstract fields to their defaults
        path = _translate_abstract_fields(tk, path)

        # name of publish is the filename
        data["code"] = os.path.basename(path)

        # Make path platform agnostic and determine if it belongs
        # to a storage that is associated with this toolkit config.
        root_name, path_cache = _calc_path_cache(tk, path)

        if path_cache:
            # there is a toolkit storage mapping defined for this storage
            log.debug("The path '%s' is associated with config root '%s'."
                      % (path, root_name))

            # check if the shotgun server supports the storage and relative_path parameters
            # which allows us to specify exactly which storage to bind a publish to rather
            # than relying on Shotgun to compute this.
            supports_specific_storage_syntax = (
                hasattr(tk.shotgun, "server_caps")
                and tk.shotgun.server_caps.version
                and tk.shotgun.server_caps.version >= (7, 0, 1)
            )

            if supports_specific_storage_syntax:
                # get corresponding SG local storage for the matching root name
                storage = tk.pipeline_configuration.get_local_storage_for_root(root_name)

                if storage is None:
                    # there is no storage in Shotgun that matches the one toolkit expects.
                    # this *may* be ok because there may be another storage in Shotgun that
                    # magically picks up the publishes and associates with them. In this case,
                    # issue a warning and fall back on the server-side functionality
                    log.warning(
                        "Could not find the expected storage for required root "
                        "'%s' in Shotgun to associate publish '%s' with. "
                        "Falling back to Shotgun's built-in storage resolution "
                        "logic. It is recommended that you explicitly map a "
                        "local storage to required root '%s'." % (root_name, path, root_name))
                    data["path"] = {"local_path": path}
                else:
                    data["path"] = {"relative_path": path_cache, "local_storage": storage}
            else:
                # use previous syntax where we pass the whole path to Shotgun
                # and shotgun will do the storage/relative path split server side.
                # This operation may do unexpected things if you have multiple
                # storages that are identical or overlapping
                data["path"] = {"local_path": path}

            # fill in the path cache field which is used for filtering in Shotgun
            # (because SG does not support
            data["path_cache"] = path_cache

        else:
            # path does not map to any configured root - fall back gracefully:
            # 1. look for storages in Shotgun and see if we can create a local path
            # 2. failing that, just register the entry as a file:// resource.
            log.debug("Path '%s' does not have an associated config root." % path)
            log.debug("Will check shotgun local storages to see if there is a match.")

            matching_local_storage = False
            for storage in get_cached_local_storages(tk):
                local_storage_path = ShotgunPath.from_shotgun_dict(storage).current_os
                # assume case preserving file systems rather than case sensitive
                if local_storage_path and path.lower().startswith(local_storage_path.lower()):
                    log.debug("Path matches Shotgun local storage '%s'" % storage["code"])
                    matching_local_storage = True
                    break

            if matching_local_storage:
                # there is a local storage matching this path
                # so use that when publishing
                data["path"] = {"local_path": path}
            else:
                # no local storage defined so publish as a file:// url
                log.debug(
                    "No local storage matching path '%s' - path will be "
                    "registered as a file:// url." % (path,)
                )

                # (see http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url)
                file_url = urlparse.urljoin("file:", pathname2url(path))
                log.debug("Converting '%s' -> '%s'" % (path, file_url))
                data["path"] = {
                    "url": file_url,
                    "name": data["code"],  # same as publish name
                }

    # now call out to hook just before publishing
    data = tk.execute_core_hook(dsk_constants.PUBLISH_HOOK_NAME,
                                shotgun_data=data, context=context)

    if dry_run:
        # add the publish type to be as consistent as possible
        data["type"] = published_file_entity_type
        log.debug("Dry run. Simply returning the data that would be sent to SG: %s"
                  % pprint.pformat(data))
        return data
    else:
        log.debug("Registering publish in Shotgun: %s" % pprint.pformat(data))
        return tk.shotgun.create(published_file_entity_type, data)
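# Standalone illustration of the file:// fallback used above; the path
# is illustrative. urljoin("file:", pathname2url(p)) is the recipe the
# stackoverflow link in the comment describes.
import urllib
import urlparse
print urlparse.urljoin("file:", urllib.pathname2url("/tmp/render.exr"))
# file:///tmp/render.exr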
pp.pprint(r)

# Remove the anchor (fragment) part of a URL.
u = 'http://docs.python.org/library/urlparse.html#item22'
udfrag = urldefrag(u)  # the return type is a tuple
# ('http://docs.python.org/library/urlparse.html', 'item22')
print "URL defrag :"
length = len(udfrag)  # get the number of items in the udfrag tuple
print "Tuple Length :", length
print udfrag     # ('http://docs.python.org/library/urlparse.html', 'item22')
print udfrag[0]  # http://docs.python.org/library/urlparse.html
print udfrag[1]  # 'item22'
print "slice", udfrag[0:length]
print "this is a tuple: %s" % (udfrag, )  # another way to print a tuple

# Build a URL by calling its geturl() method.
# When combined with the urlencode() function, which knows how to build
# query strings, this can be used to construct new URLs:
import urllib, urlparse

query = urllib.urlencode({'company': 'Nord/LB', 'report': 'sales'})
p = urlparse.ParseResult('https', 'example.com', 'data', None, query, None)
url = p.geturl()
print url

# Relative URLs
path = 'grants'
url = urlparse.urljoin('http://www.python.org/psf/', path)
print url
print "URL + Relative path : %s path= %s " % (url, path)
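# One more urljoin subtlety worth knowing: the result depends on whether
# the base path ends with a slash (values below are what Python 2's
# urlparse.urljoin returns).
print urlparse.urljoin('http://www.python.org/psf/', 'grants')   # http://www.python.org/psf/grants
print urlparse.urljoin('http://www.python.org/psf', 'grants')    # http://www.python.org/grants
print urlparse.urljoin('http://www.python.org/psf/', '/grants')  # http://www.python.org/grants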
def indexDocs(self, root, writer):
    # t1 = FieldType()
    # t1.setIndexed(True)
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
    # t2 = FieldType()
    # t2.setIndexed(True)
    # t2.setStored(False)
    # t2.setTokenized(True)
    # t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    if not root.endswith('.txt'):
        print "Please give an index file ending with .txt!"
        return
    index_file = open(root)
    for line in index_file.readlines():
        url_and_name = line.split()
        url = url_and_name[0]
        filename = url_and_name[1]
        print "adding", filename
        try:
            path = os.path.join("html", filename)
            file = open(path)
            contents = file.read()
            soup = BeautifulSoup(contents, features="html.parser")
            imgs = soup.find_all('img')
            title = soup.head.title.string
            file.close()
            for img in imgs:
                imgurl = img.get('src')
                imgurl = urlparse.urljoin(url, imgurl)
                contents = getinfo(img)
                if imgurl not in crawled_imgurls:
                    crawled_imgurls.append(imgurl)
                    if contents:
                        contents_list = jieba.cut(contents)
                        contents = ' '.join(contents_list)
                    doc = Document()
                    doc.add(Field("imgurl", imgurl, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                    doc.add(Field("title", title, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                    doc.add(Field("url", url, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                    if contents and len(contents) > 0:
                        doc.add(Field("contents", contents, Field.Store.NO,
                                      Field.Index.ANALYZED))
                    else:
                        print "warning: no content in %s" % filename
                    writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e