Example 1
def httpExists(url):
    # Check whether a URL is reachable by issuing an HTTP HEAD request
    # (Python 2: httplib and urlparse).
    import httplib, urlparse

    host, path = urlparse.urlsplit(url)[1:3]
    if ":" in host:
        host, port = host.split(":", 1)
        try:
            port = int(port)
        except ValueError:
            return False, url
    else:
        port = None
    try:
        connection = httplib.HTTPConnection(host, port=port)
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            return True, url
        elif resp.status == 302:
            new_url = urlparse.urljoin(url, resp.getheader("location", ""))
            return httpExists(new_url)
        else:
            return False, url
    except:
        return False, url
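For comparison, a minimal sketch of the same HEAD-based check on Python 3, where httplib and urlparse become http.client and urllib.parse. This is a sketch only, not part of the original example, and it keeps the plain-HTTP limitation of the code above:

# Python 3 sketch: test whether a URL answers an HTTP HEAD request,
# following one level of redirect via urljoin.
from http.client import HTTPConnection
from urllib.parse import urlsplit, urljoin

def http_exists(url):
    parts = urlsplit(url)
    host, path = parts.netloc, parts.path or "/"
    try:
        connection = HTTPConnection(host)  # netloc may already include ":port"
        connection.request("HEAD", path)
        resp = connection.getresponse()
        if resp.status == 200:
            return True, url
        if resp.status in (301, 302):
            return http_exists(urljoin(url, resp.getheader("Location", "")))
        return False, url
    except Exception:
        return False, url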
Example 2
def feature_extract(url_input):
    # re and urlparse are assumed to be imported at module level; the helpers
    # used below (sitepopularity, Security_sensitive, Tokenise, getASN, ...)
    # are defined elsewhere in the original project.

    Feature = {}
    tokens_words = re.split('\W+', url_input)

    host = urlparse.urljoin(url_input, '/')
    path = urlparse.urlparse(url_input).path

    Feature['URL'] = url_input

    Feature['rank_host'], Feature['rank_country'] = sitepopularity(host)

    Feature['host'] = host

    Feature['Length_of_url'] = len(url_input)
    Feature['Length_of_host'] = len(host)
    Feature['No_of_dots'] = url_input.count('.')

    Feature['sec_sen_word_cnt'] = Security_sensitive(tokens_words)
    Feature['IPaddress_presence'] = Check_IPaddress(tokens_words)
    Feature['avg_token_length'], Feature['token_count'], Feature[
        'largest_token'] = Tokenise(url_input)
    Feature['avg_domain_token_length'], Feature['domain_token_count'], Feature[
        'largest_domain'] = Tokenise(host)
    Feature['avg_path_token'], Feature['path_token_count'], Feature[
        'largest_path'] = Tokenise(path)

    Feature['ASNno'] = getASN(host)
    Feature['safebrowsing'] = safebrowsing(url_input)
    Feature['numTld'] = numTld(url_input)
    Feature['numPunctuation'] = numPunctuation(url_input)
    return Feature
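The helper functions used above are not included in this snippet. Purely as an illustration, a minimal Tokenise returning the three values the code unpacks (average token length, token count, length of the longest token) could look like the following; the real implementation may differ:

import re

def Tokenise(text):
    # split on non-word characters and drop empty fragments
    tokens = [t for t in re.split(r'\W+', text) if t]
    if not tokens:
        return 0, 0, 0
    lengths = [len(t) for t in tokens]
    return sum(lengths) / float(len(tokens)), len(tokens), max(lengths)

print(Tokenise("http://example.com/login/index.html"))  # (about 4.67, 6, 7)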
Example 3
 def _extracturls(self):
     #print "Extract URLs"
     urls = []
     htmlsrc, charset, parenturl = self.htmlSrcTuple
     if htmlsrc != None:
         resulturls = []
         urlExtractor = ExtractLinks(resulturls)
         try:
             if charset == None:
                 urlExtractor.feed(htmlsrc)
             else:
                 urlExtractor.feed(htmlsrc.decode(charset))
         except HTMLParser.HTMLParseError:
             pass
         try:
             urlExtractor.reset() # I think close needs special treatment .close()
         except HTMLParser.HTMLParseError:
             urlExtractor.reset()
         #this piece of code forms the URIs to full URLs by joining the
         #parenturl with the network location free URLs extracted
         for i in xrange(len(resulturls)): #replacing range() for performance reasons
             urlres = urlparse.urlparse(resulturls[i], "http")
             if urlres.netloc == "":
                 resulturls[i] = urlparse.urljoin(parenturl, resulturls[i])
         urls.extend(resulturls)
     return urls
Example 4
 def handle_starttag(self, tag, attrs):
     # print(tag)
     if tag == 'a':
         for (attribute, value) in attrs:
             if attribute == 'href':
                 url = urlparse.urljoin(self.base_url, value)
                 # whenever we come across a relative URL, the base (home page) URL is joined onto it
                 self.links.add(url)
Example 5
 def convt2compurl(self, base_url, urls):
     import re
     import urlparse
     if type(urls) is tuple or type(urls) is list:
         newurllist = []
         for url in urls:
             url = re.sub(r'^\s+', '', url)
             url = re.sub(r'\s+$', '', url)
             newurl = urlparse.urljoin(base_url, url)
             newurl = self.normalize_url(newurl)
             newurllist.append(newurl)
         return newurllist
     else:
         url = re.sub(r'^\s+', '', urls)
         url = re.sub(r'\s+$', '', url)
         newurl = urlparse.urljoin(base_url, url)
         return self.normalize_url(newurl)
Example 6
 def standardize_url(self, base_url, raw_url):
     if raw_url == None or raw_url == "":
         return raw_url
     import urlparse
     url = urlparse.urljoin(base_url, raw_url)
     #if url.startswith("http://www") or url.startswith("wwww") or url.startswith("https://www"):
     #pass
     #else:
     #url = url.replace("//", "//www.", 1)
     return url
Example 7
def getPicInfo(file):
    res = []
    # read the photo's metadata
    path = os.path.join(extract_path, file)
    fd = open(path, 'rb')
    # get the photo's EXIF tags
    tags = exifread.process_file(fd)
    fd.close()
    # get the creation time from the photo's attributes
    if 'Image DateTime' in tags:
        date = str(tags['Image DateTime'])
        LatRef = tags["GPS GPSLatitudeRef"].printable
        Lat = tags["GPS GPSLatitude"].printable[1:-1].replace(" ", "").replace(
            "/", ",").split(",")
        Lat = float(
            Lat[0]) + float(Lat[1]) / 60 + float(Lat[2]) / float(Lat[3]) / 3600
        if LatRef != "N":
            Lat = Lat * (-1)
        # longitude
        LonRef = tags["GPS GPSLongitudeRef"].printable
        Lon = tags["GPS GPSLongitude"].printable[1:-1].replace(
            " ", "").replace("/", ",").split(",")
        Lon = float(
            Lon[0]) + float(Lon[1]) / 60 + float(Lon[2]) / float(Lon[3]) / 3600
        if LonRef != "E":
            Lon = Lon * (-1)
        date1 = date.split(' ')
        date1[0] = date1[0].replace(':', '-')
        date = ' '.join(date1)
        # rename the file
        new_name = date.replace(':', '').replace(
            ' ', '_') + os.path.splitext(file)[1]
        tot = 1
        new_upload_path = os.path.join(extract_path, new_name)
        while os.path.exists(new_upload_path):
            new_name = date.replace(':', '').replace(
                ' ', '_') + '_' + str(tot) + os.path.splitext(file)[1]
            tot += 1
            new_upload_path = os.path.join(extract_path, new_name)
        # os.rename takes the path plus the file name, e.g. ../../1.png, ../../2018_0912.png
        os.rename(path, new_upload_path)
        # build the URL where the image is stored
        pathName = urllib.pathname2url(new_upload_path)
        path = urlparse.urljoin(request.url[:pos], pathName)
        # file name without the extension
        index = file.rfind('.')
        # handle Chinese file names (decode from GB2312)
        describertion = file[:index].decode('GB2312')
        res.append({
            'time': date,
            'describertion': describertion,
            'path': path,
            'pos': [Lat, Lon]
        })
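The arithmetic above converts EXIF GPS degrees/minutes/seconds into decimal degrees and flips the sign for the southern/western hemispheres. A small standalone illustration of that conversion (the values are made up):

def dms_to_decimal(degrees, minutes, seconds, ref):
    # degrees + minutes/60 + seconds/3600, negated for the S and W hemispheres
    value = degrees + minutes / 60.0 + seconds / 3600.0
    return value if ref in ("N", "E") else -value

print(dms_to_decimal(39, 54, 12.34, "N"))  # about 39.9034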
Example 8
def absoluteUrl(baseUrl, url):
    if (not url or url[:7].lower() == "http://"
            or url[:8].lower() == "https://"):
        return url
    i = url.find('#')
    if (i >= 0):
        url = url[:i]
    if (not url):
        return baseUrl
    if (urljoin):
        return urljoin(baseUrl, url)
    return urlparse.urljoin(baseUrl, url)
Example 9
def get_intra_links(webdriver, url):
    ps1 = du.get_ps_plus_1(url)
    links = list()
    for elem in webdriver.find_elements_by_tag_name("a"):
        try:
            href = elem.get_attribute('href')
        except StaleElementReferenceException:
            continue
        if href is None:
            continue
        full_href = urlparse.urljoin(url, href)
        if not full_href.startswith('http'):
            continue
        if du.get_ps_plus_1(full_href) == ps1:
            links.append(elem)
    return links
Example 10
 def handle_starttag(self, tag, attrs):
     # We are looking for the beginning of a link. Links normally look
     # like <a href="www.someurl.com"></a>
     if tag == 'a':
         for (key, value) in attrs:
             if key == 'href':
                 # We are grabbing the new URL. We are also adding the
                 # base URL to it. For example:
                 # www.netinstructions.com is the base and
                 # somepage.html is the new URL (a relative URL)
                 #
                 # We combine a relative URL with the base URL to create
                 # an absolute URL like:
                 # www.netinstructions.com/somepage.html
                 newUrl = urlparse.urljoin(self.baseUrl, value)
                 # And add it to our collection of links:
                 self.links = self.links + [newUrl]
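Several examples in this collection follow the same HTMLParser pattern: subclass it, override handle_starttag, and resolve each href against a base URL. A self-contained Python 3 sketch of that pattern (the class name and URLs below are illustrative):

from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkCollector(HTMLParser):
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.links = []

    def handle_starttag(self, tag, attrs):
        # resolve every href against the base URL and collect it
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(urljoin(self.base_url, value))

collector = LinkCollector("http://www.netinstructions.com/")
collector.feed('<a href="somepage.html">a page</a>')
print(collector.links)  # ['http://www.netinstructions.com/somepage.html']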
Example 11
 def _extract_urls(self, node_list):
     final_nodes = []
     final_urls = []
     in_web_page = is_web_url(self.url)
     if in_web_page:
         root_scheme = urlparse.urlparse(self.url).scheme
     else:
         root_scheme = None
     for node in node_list:
         url = node.get('src')
         if in_web_page:
             url = urlparse.urljoin(self.url, url)
         else:
             folder = os.path.dirname(self.url)
             url = os.path.abspath(os.path.join(folder, url))
         final_nodes.append(node)
         final_urls.append(url)
     return zip(final_nodes, final_urls)
Example 12
def extract_next_links(rawDataObj):
    outputLinks = []
    '''
    rawDataObj is an object of type UrlResponse declared at L20-30
    datamodel/search/server_datamodel.py
    the return of this function should be a list of urls in their absolute form
    Validation of link via is_valid function is done later (see line 42).
    It is not required to remove duplicates that have already been downloaded. 
    The frontier takes care of that.
    
    Suggested library: lxml
    '''

    url = rawDataObj.url

    r = requests.get(url)

    data = r.text

    soup = BeautifulSoup(data, "lxml")

    for link in soup.find_all('a'):
        obtained_link = link.get('href')

        if (obtained_link is not None) and is_ascii(
                obtained_link):  # Check for invalid characters and None type
            if ('mailto' not in str(obtained_link)) and (
                    "calendar" not in obtained_link
            ):  # Check for presence of mailto links and calendar (for crawler trap)
                http_url = get_absolute(url, obtained_link)
                obtained_link = urlparse.urljoin(http_url, obtained_link)
                if '?' in obtained_link:  # Identifying links with query parameters
                    position = obtained_link.rfind('?')
                    obtained_link = obtained_link[:position]
                outputLinks.append(obtained_link)
            count_subdomain(outputLinks)

    links_count = len(outputLinks)
    check_max_outlinks(links_count, url)
    return outputLinks
Example 13
def urijoin(base_uri, path):
    if is_url(base_uri):
        return urlparse.urljoin(base_uri, path)
    else:
        return os.path.normpath(os.path.join(base_uri, path.strip('/')))
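A rough illustration of the two branches, assuming is_url() returns True only for URL-like strings:

# URL branch: plain urljoin semantics
#   urijoin('http://example.com/data/', 'latest/index.json')
#   -> 'http://example.com/data/latest/index.json'
# Filesystem branch: strip('/') drops the leading slash, then os.path joins
#   urijoin('/srv/data', '/latest/index.json')
#   -> '/srv/data/latest/index.json'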
Example 14
 def handle_starttag(self, tag, attrs):
     if tag == 'a':
         for (attribute, value) in attrs:
             if attribute == 'href':
                 url = urlparse.urljoin(self.base_URL, value)
                 self.links.add(url)
Example 15
    def o(self, data, puredata=0, force=0):
        if self.abbr_data is not None: self.abbr_data += data

        if not self.quiet:
            if options.google_doc:
                # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != '':
                    self.drop_white_space = 0

            if puredata and not self.pre:
#                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += "    "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0

            if self.p_p:
                self.out((self.br_toggle+'\n'+bq)*self.p_p)
                self.space = 0
                self.br_toggle = ''

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out("   ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
                        if has_key(link, 'title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out("  *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
Example 16
    def __real_update_pkg_filter(self, pkg_id, fm):
        import sqlite3
        import urllib2
        import urlparse
        import bz2

        if pkg_id not in fm.pkg_filters_conf.keys():
            return

        try:
            fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_DOWNLOADING
            fm.pkg_filters_conf[pkg_id]["progress"] = 0
            url = fm.pkg_filters_conf[pkg_id]["update_url"]
            pkg_info = json.load(urllib2.urlopen(url))

            orig_t = fm.pkg_filters_conf[pkg_id]["pkg_info"]["metadata"][
                "orig-timestamp"]
            release_n = fm.pkg_filters_conf[pkg_id]["pkg_info"]["metadata"][
                "release-number"]

            on_server_orig_t = pkg_info["metadata"]["orig-timestamp"]
            on_server_release_n = pkg_info["metadata"]["release-number"]

            if orig_t != on_server_orig_t:
                reactor.callInThread(self.__download_new_pkg, pkg_id, url,
                                     self)
                return
            else:
                force_download = False

                for x in range(
                        int(release_n) + 1,
                        int(on_server_release_n) + 1):
                    if "diff-%s-%s.bz2" % (orig_t, x) not in pkg_info["diffs"]:
                        force_download = True
                        break

                if force_download == True:
                    reactor.callInThread(self.__download_new_pkg, pkg_id, url,
                                         self)
                    return
                else:
                    patches = []
                    for x in range(
                            int(release_n) + 1,
                            int(on_server_release_n) + 1):
                        patches.append([
                            "diff-%s-%s.bz2" % (orig_t, x),
                            urlparse.urljoin(url,
                                             "diff-%s-%s.bz2" % (orig_t, x))
                        ])

                    dest_patch = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                              "%s.update-patch" % (pkg_id))

                    if os.path.exists(dest_patch):
                        os.unlink(dest_patch)

                    dest_patch_fd = open(dest_patch, "w")
                    lines_counted = 0

                    total_diffs = len(patches)
                    downl_diffs = 0

                    for diff_filename, diff_url in patches:
                        dest_file = os.path.join(
                            NANNY_DAEMON_BLACKLISTS_DIR,
                            "%s-%s" % (pkg_id, diff_filename))

                        if os.path.exists(dest_file):
                            os.unlink(dest_file)

                        df = open(dest_file, "wb")
                        url_x = urllib2.urlopen(diff_url)

                        while True:
                            x = url_x.read(1024)
                            if x != '':
                                df.write(x)
                            else:
                                break

                        df.close()

                        df_uc = bz2.BZ2File(dest_file, "r")
                        for line in df_uc.readlines():
                            if not line.startswith("#"):
                                dest_patch_fd.write(line)
                                lines_counted += 1

                        df_uc.close()
                        os.unlink(dest_file)

                        downl_diffs += 1
                        fm.pkg_filters_conf[pkg_id]["progress"] = (
                            downl_diffs * 100) / total_diffs

                    dest_patch_fd.close()

                    dest_patch_fd = open(dest_patch, "r")

                    if pkg_id in fm.db_pools.keys():
                        db = fm.db_pools.pop(pkg_id)
                        db.close()

                    dest_db = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                           "%s.db" % (pkg_id))
                    db_conn = sqlite3.connect(dest_db)

                    fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_UPDATING
                    fm.pkg_filters_conf[pkg_id]["progress"] = 0

                    lines_inserted = 0

                    sql = ''
                    update_ok = True
                    for line in dest_patch_fd.readlines():
                        lines_inserted += 1
                        sql = sql + line
                        if sqlite3.complete_statement(sql):
                            c = db_conn.cursor()
                            try:
                                c.execute(sql)
                            except:
                                db_conn.rollback()
                                update_ok = False
                                break

                            sql = ''
                        fm.pkg_filters_conf[pkg_id]["progress"] = (
                            lines_inserted * 100) / lines_counted

                    if update_ok == True:
                        c = db_conn.cursor()
                        c.execute(
                            "UPDATE metadata SET value='%s' WHERE key='release-number'"
                            % on_server_release_n)
                        db_conn.commit()
                        print "UPDATED pkg:%s to version:%s" % (
                            pkg_id, on_server_release_n)

                    db_conn.close()
                    dest_patch_fd.close()
                    os.unlink(dest_patch)

                    if update_ok == True:
                        fm.pkg_filters_conf[pkg_id][
                            "status"] = PKG_STATUS_READY
                        fm.pkg_filters_conf[pkg_id]["pkg_info"] = pkg_info
                        fm.pkg_filters_conf[pkg_id]["progress"] = 0
                    else:
                        fm.pkg_filters_conf[pkg_id][
                            "status"] = PKG_STATUS_READY_UPDATE_AVAILABLE
                        fm.pkg_filters_conf[pkg_id]["progress"] = 0

                    fm.db_pools[pkg_id] = adbapi.ConnectionPool(
                        'sqlite3',
                        dest_db,
                        check_same_thread=False,
                        cp_openfun=on_db_connect)
                    print "Added to db pool -> %s" % pkg_id
                    threads.blockingCallFromThread(reactor,
                                                   fm._save_pkg_filters_conf)
        except:
            print "Something wrong updating pkg : %s" % pkg_id
            fm.pkg_filters_conf[pkg_id][
                "status"] = PKG_STATUS_READY_UPDATE_AVAILABLE
            fm.pkg_filters_conf[pkg_id]["progress"] = 0
            threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
Example 17
    def __download_new_pkg(self, pkg_id, url, fm):
        import sqlite3
        import urllib2
        import urlparse
        import bz2

        try:
            if pkg_id in fm.db_pools.keys():
                db = fm.db_pools.pop(pkg_id)
                db.close()

            try:
                pkg_info = json.load(urllib2.urlopen(url))
            except:
                fm.pkg_filters_conf.pop(pkg_id)
                threads.blockingCallFromThread(reactor,
                                               fm._save_pkg_filters_conf)
                return

            fm.pkg_filters_conf[pkg_id]["pkg_info"] = pkg_info

            base_filename = pkg_info["base"]
            base_url = urlparse.urljoin(url, base_filename)
            dest_file = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                     "%s-%s" % (pkg_id, base_filename))
            dest_db = os.path.join(NANNY_DAEMON_BLACKLISTS_DIR,
                                   "%s.db" % (pkg_id))

            if os.path.exists(dest_file):
                os.unlink(dest_file)

            if os.path.exists(dest_db):
                os.unlink(dest_db)

            df = open(dest_file, "wb")
            url_x = urllib2.urlopen(base_url)
            fm.pkg_filters_conf[pkg_id]["progress"] = 0

            total_len = int(url_x.info().getheaders("Content-Length")[0])
            downl_len = 0

            while True:
                x = url_x.read(1024)
                if x != '':
                    df.write(x)
                    downl_len += len(x)
                    fm.pkg_filters_conf[pkg_id]["progress"] = (downl_len *
                                                               100) / total_len
                else:
                    break

            df.close()

            df_uc_c = bz2.BZ2File(dest_file, "r")
            lines_counted = 0
            for line in df_uc_c.readlines():
                lines_counted += 1
            df_uc_c.close()

            df_uc = bz2.BZ2File(dest_file, "r")
            db_conn = sqlite3.connect(dest_db)

            sql = ''

            fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_INSTALLING
            fm.pkg_filters_conf[pkg_id]["progress"] = 0

            lines_inserted = 0

            for line in df_uc.readlines():
                lines_inserted += 1
                sql = sql + line
                if sqlite3.complete_statement(sql):
                    c = db_conn.cursor()
                    try:
                        c.execute(sql)
                    except:
                        pass
                    sql = ''
                fm.pkg_filters_conf[pkg_id]["progress"] = (lines_inserted *
                                                           100) / lines_counted

            db_conn.commit()
            db_conn.close()
            df_uc.close()

            os.unlink(dest_file)

            fm.pkg_filters_conf[pkg_id]["status"] = PKG_STATUS_READY
            fm.pkg_filters_conf[pkg_id]["progress"] = 0
            fm.db_pools[pkg_id] = adbapi.ConnectionPool(
                'sqlite3',
                dest_db,
                check_same_thread=False,
                cp_openfun=on_db_connect)
            print "Added to db pool -> %s" % pkg_id
            threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)

        except:
            if os.path.exists(dest_file):
                os.unlink(dest_file)

            if os.path.exists(dest_db):
                os.unlink(dest_db)

            fm.pkg_filters_conf[pkg_id]["pkg_info"] = {}
            fm.pkg_filters_conf[pkg_id][
                "status"] = PKG_STATUS_ERROR_INSTALLING_NEW_BL
            fm.pkg_filters_conf[pkg_id]["progress"] = 0
            threads.blockingCallFromThread(reactor, fm._save_pkg_filters_conf)
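The core pattern in the two functions above is: resolve a file name against the package-info URL with urljoin, then stream the download in 1 KiB chunks while updating a progress value. A standalone Python 3 sketch of that pattern (URLs and names are illustrative):

from urllib.parse import urljoin
from urllib.request import urlopen

def download_with_progress(info_url, filename, dest_path):
    # e.g. info_url=".../pkg.json", filename="base-20120101.bz2" (hypothetical)
    file_url = urljoin(info_url, filename)
    resp = urlopen(file_url)
    total = int(resp.headers.get("Content-Length", 0))
    done = 0
    with open(dest_path, "wb") as out:
        while True:
            chunk = resp.read(1024)
            if not chunk:
                break
            out.write(chunk)
            done += len(chunk)
            if total:
                print("progress: %d%%" % (done * 100 // total))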
Example 18
# -*- coding: UTF-8 -*-
import sys
import requests
from bs4 import BeautifulSoup
import re
from urlparse import urlparse
from urlparse import urljoin
reload(sys)
sys.setdefaultencoding('utf8')

url = 'http://www.bjkufang.cn/kusou.aspx'
r = requests.get(url)

soup = BeautifulSoup(r.text)

g_data = soup.find_all('div', {'class': 'btnr'})
for links in g_data:
    for link in links.findAll('a', {'target': '_blank'}):
        href = urljoin(url, link.get('href'))
'''
    try:
        print item.contents[1].find_all('li','class':'primary'})[0].text
    except:
        pass
'''
Example 19
	def onMessage(self, type, args):
		if type == 'public':
			args = args.split(' ')
			for frag in args:
				o = urlparse(frag)
				if o.scheme == 'http' or o.scheme == 'https':
					uri = frag
					if self.DEBUG:
						print('parsing url "' + uri + '"\n')
				
					localhost = [
						'http://localhost/', 'http://localhost:80/',
						'http://localhost:8080/', 'http://127.0.0.1/',
						'http://127.0.0.1:80/', 'http://127.0.0.1:8080/',
						'https://localhost/', 'https://localhost:80/',
						'https://localhost:8080/', 'https://127.0.0.1/',
						'https://127.0.0.1:80/', 'https://127.0.0.1:8080/',
					]

					for s in localhost:
						if uri.startswith(s):
							if self.DEBUG:
								print(uri + ": access to localhost denied")
							return

					try:
						redirects = 0
						while True:
							if self.DEBUG:
								print(uri + ": fetching ...")
							headers = {
								'Accept': 'text/html',
								'User-Agent': 'Mozilla/5.0 (fsiBot)'
								}
							req = urllib2.Request(uri, headers = headers)
							u = urllib2.urlopen(req)
							info = u.info()
							u.close()

							if not isinstance(info, list):
								status = '200'
							else:
								status = str(info[1])
								info = info[0]
							if status.startswith('3'):
								uri = urlparse.urljoin(uri, info['Location'])
							else:
								break

							redirects += 1
							if redirects >= 10:
								if self.DEBUG:
									print(uri + ": Too many redirects")
								return

						try:
							mtype = info['content-type']
						except:
							if self.DEBUG:
								print(uri + ": Couldn't get the Content-Type")
							return

						if not (('/html' in mtype) or ('/xhtml' in mtype)):
							if self.DEBUG:
								print(uri + ": Document isn't HTML")
							return


						if self.DEBUG:
							print(uri + ": opening ...")
						u = urllib2.urlopen(req)
						bytes = u.read(262144)
						if self.DEBUG:
							print("read: " + bytes)
						u.close()

					except IOError:
						if self.DEBUG:
							print(uri + ": Can't connect")
						return

					r_title = re.compile(r'(?ims)<title[^>]*>(.*?)</title\s*>')
					m = r_title.search(bytes)

					if m:
						title = m.group(1)
						if self.DEBUG:
							print("parsed html, title is: " + title)

						if (len(title) > 200):
							title = title[:200] + "[...]"

						def e(m):
							entity = m.group(0)
							if entity.startswith('&#x'):
								cp = int(entity[3:-1], 16)
								return unichr(cp).encode('utf-8')
							elif entity.startswith('&#'):
								cp = int(entity[2:-1])
								return unichr(cp).encode('utf-8')
							else:
								char = name2codepoint[entity[1:-1]]
								return unichr(char).encode('utf-8')
						r_entity = re.compile(r'&[A-Za-z0-9#]+;')
						title = r_entity.sub(e, title)

						if title:
							try: title.decode('utf-8')
							except:
								try: title = title.decode('iso-8859-1').encode('utf-8')
								except: title = title.decode('cp1252').encode('utf-8')
							else: pass
						else: title = '[Title is empty.]'

						answer = re.sub(r'\s+', ' ', '[' + o.hostname + '] ' + title)
						self.sendPublicMessage(answer)
					else:
						if self.DEBUG:
							print(uri + ": No title found")
				else:
					if self.DEBUG:
						print(frag + ": not an HTTP(S) URL")
Example 21
def _create_published_file(tk, context, path, name, version_number, task, comment, published_file_type,
                           created_by_user, created_at, version_entity, sg_fields=None, dry_run=False):
    """
    Creates a publish entity in shotgun given some standard fields.

    :param tk: :class:`~sgtk.Sgtk` instance
    :param context: A :class:`~sgtk.Context` to associate with the publish. This will
                    populate the ``task`` and ``entity`` link in Shotgun.
    :param path: The path to the file or sequence we want to publish. If the
                 path is a sequence path it will be abstracted so that
                 any sequence keys are replaced with their default values.
    :param name: A name, without version number, which helps distinguish
               this publish from other publishes. This is typically
               used for grouping inside of Shotgun so that all the
               versions of the same "file" can be grouped into a cluster.
               For example, for a Maya publish, where we track only
               the scene name, the name would simply be that: the scene
               name. For something like a render, it could be the scene
               name, the name of the AOV and the name of the render layer.
    :param version_number: The version number of the item we are publishing.
    :param task: Shotgun Task dictionary to associate with publish or ``None``
    :param comment: Comments string to associate with publish
    :param published_file_type: Shotgun publish type dictionary to
                associate with publish
    :param created_by_user: User entity to associate with publish or ``None``
                if current user (via :meth:`sgtk.util.get_current_user`)
                should be used.
    :param created_at: Timestamp to associate with publish or None for default.
    :param version_entity: Version dictionary to associate with publish or ``None``.
    :param sg_fields: Dictionary of additional data to add to publish.
    :param dry_run: Don't actually create the published file entry. Simply
                    return the data dictionary that would be supplied.

    :returns: The result of the shotgun API create method.
    """

    data = {
        "description": comment,
        "name": name,
        "task": task,
        "version_number": version_number,
        }

    # we set the optional additional fields first so we don't allow overwriting the standard parameters
    if sg_fields is None:
        sg_fields = {}
    data.update(sg_fields)

    if created_by_user:
        data["created_by"] = created_by_user
    else:
        # use current user
        sg_user = login.get_current_user(tk)
        if sg_user:
            data["created_by"] = sg_user

    if created_at:
        data["created_at"] = created_at

    published_file_entity_type = get_published_file_entity_type(tk)

    if published_file_type:
        if published_file_entity_type == "PublishedFile":
            data["published_file_type"] = published_file_type
        else:
            # using legacy type TankPublishedFile
            data["tank_type"] = published_file_type

    if version_entity:
        data["version"] = version_entity


    # Determine the value of the link field based on the given context
    if context.project is None:
        # when running toolkit as a standalone plugin, the context may be
        # empty and not contain a project. Publishes are project entities
        # in Shotgun, so we cannot proceed without a project.
        raise DskError("Your context needs to at least have a project set in order to publish.")

    elif context.entity is None:
        # If the context does not have an entity, link it up to the project.
        # This happens for project specific workflows such as editorial
        # workflows, ingest and when running zero config toolkit plugins in
        # a generic project mode.
        data["entity"] = context.project

    else:
        data["entity"] = context.entity

    # set the associated project
    data["project"] = context.project

    # Check if path is a url or a straight file path.  Path
    # is assumed to be a url if it has a scheme:
    #
    #     scheme://netloc/path
    #
    path_is_url = False
    res = urlparse(path)
    if res.scheme:
        # handle Windows drive letters - note this adds a limitation
        # but one that is not likely to be a problem as single-character
        # schemes are unlikely!
        if len(res.scheme) > 1 or not res.scheme.isalpha():
            path_is_url = True

    # naming and path logic is different depending on url
    if path_is_url:

        # extract name from url:
        #
        # scheme://hostname.com/path/to/file.ext -> file.ext
        # scheme://hostname.com -> hostname.com
        if res.path:
            # scheme://hostname.com/path/to/file.ext -> file.ext
            data["code"] = res.path.split("/")[-1]
        else:
            # scheme://hostname.com -> hostname.com
            data["code"] = res.netloc

        # make sure that the url is escaped properly, otherwise
        # shotgun might not accept it.
        #
        # for quoting logic, see bugfix here:
        # http://svn.python.org/view/python/trunk/Lib/urllib.py?r1=71780&r2=71779&pathrev=71780
        #
        # note: by applying a safe pattern like this, we guarantee that already quoted paths
        #       are not touched, e.g. quote('foo bar') == quote('foo%20bar')
        data["path"] = {
            "url": quote(path, safe="%/:=&?~#+!$,;'@()*[]"),
            "name": data["code"]  # same as publish name
        }

    else:

        # normalize the path to native slashes
        norm_path = ShotgunPath.normalize(path)
        if norm_path != path:
            log.debug("Normalized input path '%s' -> '%s'" % (path, norm_path))
            path = norm_path

        # convert the abstract fields to their defaults
        path = _translate_abstract_fields(tk, path)

        # name of publish is the filename
        data["code"] = os.path.basename(path)

        # Make path platform agnostic and determine if it belongs
        # to a storage that is associated with this toolkit config.
        root_name, path_cache = _calc_path_cache(tk, path)

        if path_cache:
            # there is a toolkit storage mapping defined for this storage
            log.debug(
                "The path '%s' is associated with config root '%s'." % (path, root_name)
            )

            # check if the shotgun server supports the storage and relative_path parameters which
            # allows us to specify exactly which storage to bind a publish to rather than relying on
            # Shotgun to compute this.
            supports_specific_storage_syntax = (
                hasattr(tk.shotgun, "server_caps") and
                tk.shotgun.server_caps.version and
                tk.shotgun.server_caps.version >= (7, 0, 1)
            )

            if supports_specific_storage_syntax:

                # get corresponding SG local storage for the matching root name
                storage = tk.pipeline_configuration.get_local_storage_for_root(root_name)

                if storage is None:
                    # there is no storage in Shotgun that matches the one toolkit expects.
                    # this *may* be ok because there may be another storage in Shotgun that
                    # magically picks up the publishes and associates with them. In this case,
                    # issue a warning and fall back on the server-side functionality
                    log.warning(
                        "Could not find the expected storage for required root "
                        "'%s' in Shotgun to associate publish '%s' with. "
                        "Falling back to Shotgun's built-in storage resolution "
                        "logic. It is recommended that you explicitly map a "
                        "local storage to required root '%s'." %
                        (root_name, path, root_name))
                    data["path"] = {"local_path": path}

                else:
                    data["path"] = {"relative_path": path_cache, "local_storage": storage}

            else:
                # use previous syntax where we pass the whole path to Shotgun
                # and shotgun will do the storage/relative path split server side.
                # This operation may do unexpected things if you have multiple
                # storages that are identical or overlapping
                data["path"] = {"local_path": path}

            # fill in the path cache field which is used for filtering in Shotgun
            # (because SG does not support
            data["path_cache"] = path_cache

        else:

            # path does not map to any configured root - fall back gracefully:
            # 1. look for storages in Shotgun and see if we can create a local path
            # 2. failing that, just register the entry as a file:// resource.
            log.debug("Path '%s' does not have an associated config root." % path)
            log.debug("Will check shotgun local storages to see if there is a match.")

            matching_local_storage = False
            for storage in get_cached_local_storages(tk):
                local_storage_path = ShotgunPath.from_shotgun_dict(storage).current_os
                # assume case preserving file systems rather than case sensitive
                if local_storage_path and path.lower().startswith(local_storage_path.lower()):
                    log.debug("Path matches Shotgun local storage '%s'" % storage["code"])
                    matching_local_storage = True
                    break

            if matching_local_storage:
                # there is a local storage matching this path
                # so use that when publishing
                data["path"] = {"local_path": path}

            else:
                # no local storage defined so publish as a file:// url
                log.debug(
                    "No local storage matching path '%s' - path will be "
                    "registered as a file:// url." % (path, )
                )

                # (see http://stackoverflow.com/questions/11687478/convert-a-filename-to-a-file-url)
                file_url = urlparse.urljoin("file:", pathname2url(path))
                log.debug("Converting '%s' -> '%s'" % (path, file_url))
                data["path"] = {
                    "url": file_url,
                    "name": data["code"]  # same as publish name
                }


    # now call out to hook just before publishing
    data = tk.execute_core_hook(dsk_constants.PUBLISH_HOOK_NAME, shotgun_data=data, context=context)

    if dry_run:
        # add the publish type to be as consistent as possible
        data["type"] = published_file_entity_type
        log.debug("Dry run. Simply returning the data that would be sent to SG: %s" % pprint.pformat(data))
        return data
    else:
        log.debug("Registering publish in Shotgun: %s" % pprint.pformat(data))
        return tk.shotgun.create(published_file_entity_type, data)
Example 23
pp.pprint(r)
# Remove the anchor (#) from a URL.
from urlparse import urldefrag
u = 'http://docs.python.org/library/urlparse.html#item22'
udfrag = urldefrag(u)  # the return type is a tuple
#('http://docs.python.org/library/urlparse.html', 'item22')

print("URL defrag :")

length = len(udfrag)  # Get the number of items in a udfrag Tuple
print("Tuple Length :", length)
print(udfrag)  # ('http://docs.python.org/library/urlparse.html', 'item22')
print(udfrag[0])  # http://docs.python.org/library/urlparse.html
print(udfrag[1])  # 'item22'

print("slice", udfrag[0:length])
print "this is a tuple: %s" % (udfrag, )  # Another way to print a tuple

# Build a URL by calling its geturl() method.
#  When combined with the urlencode() function, which knows how to build
#  query strings, this can be used to construct new URLs:
import urllib, urlparse
query = urllib.urlencode({'company': 'Nord/LB', 'report': 'sales'})
p = urlparse.ParseResult('https', 'example.com', 'data', None, query, None)
url = p.geturl()
print(url)

#Relative URLs
path = 'grants'
url = urlparse.urljoin('http://www.python.org/psf/', path)
print(url)
print "URL + Relative path : %s path= %s " % (url, path)
Example 24
    def indexDocs(self, root, writer):

        # t1 = FieldType()
        # t1.setIndexed(True)
        # t1.setStored(True)
        # t1.setTokenized(False)
        # t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

        # t2 = FieldType()
        # t2.setIndexed(True)
        # t2.setStored(False)
        # t2.setTokenized(True)
        # t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        if not root.endswith('.txt'):
            print "Please give the index file end with .txt !"
            return

        index_file = open(root)
        for line in index_file.readlines():
            url_and_name = line.split()
            url = url_and_name[0]
            filename = url_and_name[1]
            print "adding", filename
            try:
                path = os.path.join("html", filename)
                file = open(path)
                contents = file.read()
                soup = BeautifulSoup(contents, features="html.parser")
                imgs = soup.find_all('img')
                title = soup.head.title.string
                file.close()
                for img in imgs:
                    imgurl = img.get('src')
                    imgurl = urlparse.urljoin(url, imgurl)
                    contents = getinfo(img)
                    if imgurl not in crawled_imgurls:
                        crawled_imgurls.append(imgurl)
                        if contents:
                            contents_list = jieba.cut(contents)
                            contents = ' '.join(contents_list)
                        doc = Document()
                        doc.add(
                            Field("imgurl", imgurl, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                        doc.add(
                            Field("title", title, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                        doc.add(
                            Field("url", url, Field.Store.YES,
                                  Field.Index.NOT_ANALYZED))
                        if contents:
                            if len(contents) > 0:
                                doc.add(
                                    Field("contents", contents, Field.Store.NO,
                                          Field.Index.ANALYZED))
                        else:
                            print "warning: no content in %s" % filename
                        writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e