def login(opener):
    params = {
        "_flowId": "legacy-login-flow",
        "username": config["rhn-username"],
        "password": config["rhn-password"]
    }
    common.retrieve_m("https://www.redhat.com/wapps/sso/login.html",
                      urllib.urlencode(params), opener = opener)

def login(opener):
    url = config["pt-root"] + "register/"
    html = common.retrieve_m(url, opener = opener, tries = 10).read()
    html = lxml.html.fromstring(html)

    params = {}
    for i in html.xpath("//form[@name = 'login']//input"):
        params[i.get("name")] = i.get("value")
    params["email"] = config["pt-username"]
    params["password"] = config["pt-password"]

    common.retrieve_m(url, urllib.urlencode(params), opener = opener, tries = 10)

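# Both login helpers above rely on common.retrieve_m(), which is defined
# elsewhere in the repository.  The sketch below is only an assumption of what
# such a helper might look like (written for Python 3), judging by how it is
# called here: a URL, optional POST data, an optional opener and a retry
# count, returning a file-like response object.
import time
import urllib.request

def retrieve_m(url, data=None, opener=None, tries=1):
    # Retry transient failures a fixed number of times before giving up.
    for attempt in range(tries):
        try:
            if opener is not None:
                return opener.open(url, data)
            return urllib.request.urlopen(url, data)
        except IOError:
            if attempt == tries - 1:
                raise
            time.sleep(2 ** attempt)  # crude exponential backoff between attempts
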
def getjnlpurl(url):
    f = common.retrieve_m(url)
    m = re.search("LaunchURL = \"(.*?)\"", f.read())
    f.close()

    t = urllib.splittype(url)
    return t[0] + "://" + urllib.splithost(t[1])[0] + m.group(1)

def getjnlpurl(url):
    f = common.retrieve_m(url)
    m = re.search("LaunchURL = \"(.*?)\"", f.read().decode("utf-8"))
    f.close()

    t = urllib.parse.splittype(url)
    return t[0] + "://" + urllib.parse.splithost(t[1])[0] + m.group(1)

def getjnlpurl(url):
    f = common.retrieve_m(url)
    m = re.search('LaunchURL = "(.*?)"', f.read().decode("utf-8"))
    f.close()

    t = urllib.parse.splittype(url)
    return t[0] + "://" + urllib.parse.splithost(t[1])[0] + m.group(1)

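# A usage sketch for getjnlpurl(): resolve the JNLP launch URL embedded in a
# course launch page and fetch it.  The "elearning-root" config key and the
# launch page path are placeholders for illustration, not values taken from
# the actual configuration.
launch_page = config["elearning-root"] + "launch.html"
jnlp = common.retrieve_m(getjnlpurl(launch_page)).read()
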
def download_item_page(item, tries=1):
    try:
        html = common.retrieve_m(item.pageurl, tries=tries)
    except urllib.error.HTTPError as e:
        warn("can't load item page %s (#%u, %s, %s) (%s), continuing..." % \
             (item.pageurl, item.number, item.title, item.type_, e))
        return

    xml = lxml.html.parse(html)
    try:
        if item.type_ == "Videos":
            item.set_dlurl(xml.xpath("//a[text()='OGG']/@href")[0])
            extension = ".ogg"
        else:
            item.set_dlurl(xml.xpath("//a[@class='ResourceFile']/@href")[0])
            extension = ".pdf"
    except IndexError:
        if item.type_ != "Other Resources":
            # "Other Resources" often contain a text article with no link
            warn("can't find item download link at %s (#%u, %s, %s), continuing..." %
                 (item.pageurl, item.number, item.title, item.type_))
        return

    download_item(item, extension, tries=tries)

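# download_item() is not shown in this snippet.  A hedged sketch of what it
# might do, inferred only from how download_item_page() calls it: fetch
# item.dlurl and store it under a name derived from the item, then mark the
# file read-only.  The filename scheme and the readable item.dlurl attribute
# are assumptions for illustration.
def download_item(item, extension, tries=1):
    filename = "%04u-%s%s" % (item.number, item.title.replace("/", "_"), extension)
    common.retrieve(item.dlurl, filename, tries=tries)
    common.mkro(filename)
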
def download(item, db, tries):
    if item["href"] in db:
        path = db.get(item["href"])
    else:
        f = common.retrieve_m(config["clearspace-root"] + item["href"], tries=tries)
        doc = WikiDoc(f.read())
        f.close()
        path = doc.path + "/" + doc.filename

        if want(path):
            skip = False
            if os.path.exists(path):
                st = os.stat(path)
                if st.st_mtime == doc.mtime:
                    skip = True

            if not skip:
                common.mkdirs(doc.path)
                common.retrieve(config["clearspace-root"] + doc.filehref, path,
                                force=True, tries=tries)
                common.mkro(path)
                os.utime(path, (doc.mtime, doc.mtime))

    updatedbs(db, keep, item["href"], path)

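# updatedbs() is defined elsewhere in the script.  Based solely on the call
# above, it presumably records the href-to-path mapping in the local database
# and marks the path as still wanted, roughly along these lines (an assumption,
# not the real implementation):
def updatedbs(db, keep, href, path):
    db[href] = path
    keep.add(path)
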
def download_item_page(item, tries = 1):
    try:
        html = common.retrieve_m(item.pageurl, tries = tries)
    except urllib2.HTTPError, e:
        warn("can't load item page %s (#%u, %s, %s) (%s), continuing..." % \
             (item.pageurl, item.number, item.title, item.type_, e))
        return

def read_project_list(opener):
    url = config["pt-root"] + "projects-emea/project-list"
    html = common.retrieve_m(url, opener = opener, tries = 10).read()
    html = lxml.html.fromstring(html)

    for _url in html.xpath("//a[text()='events']/@href"):
        _url = urlparse.urljoin(url, _url)
        q.put((read_events, opener, _url))

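# read_project_list() only queues work; the worker() target started elsewhere
# presumably pops (callable, *args) tuples off q and runs them.  A minimal
# sketch of that pattern, assuming q is a queue.Queue (the actual worker is not
# shown in these snippets):
def worker():
    while True:
        work = q.get()
        try:
            work[0](*work[1:])  # first element is the callable, the rest its arguments
        finally:
            q.task_done()
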
def read_events(opener, url):
    try:
        html = common.retrieve_m(url, opener = opener, tries = 10).read()
    except urllib2.HTTPError, e:
        if e.code == 403:
            print >>sys.stderr, "WARNING: %s, continuing..." % e
            return
        else:
            raise

def sync(query, keep):
    xml = common.retrieve_m(config["gsa-url"] +
                            "?client=internal&output=xml&num=1000&filter=0&q=" + query,
                            tries = 10)
    xml = lxml.etree.parse(xml)

    if int(xml.xpath("//M/text()")[0]) == 1000:
        raise Exception("search returned too many results")

    for result in xml.xpath("//U/text()"):
        dest = result.split("//")[1]
        dest = dest.replace("~", "")

        common.mkdirs(os.path.split(dest)[0])
        common.retrieve(result, dest, tries = 10)
        common.mkro(dest)
        keep.add(dest)

def sync(query, keep):
    xml = common.retrieve_m(
        config["gsa-url"] + "?client=internal&output=xml&num=1000&filter=0&q=" + query,
        tries=10)
    xml = lxml.etree.parse(xml)

    if int(xml.xpath("//M/text()")[0]) == 1000:
        raise Exception("search returned too many results")

    for result in xml.xpath("//U/text()"):
        dest = result.split("//")[1]
        dest = dest.replace("~", "")

        common.mkdirs(os.path.split(dest)[0])
        common.retrieve(result, dest, tries=10)
        common.mkro(dest)
        keep.add(dest)

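# Example invocation of sync() (a sketch; the real queries come from
# configuration elsewhere).  The query is appended verbatim to the GSA search
# URL, so it should already be URL-encoded.
import urllib.parse

keep = set()
sync(urllib.parse.quote("inurl:docs"), keep)
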
def urls():
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    login(opener)

    html = common.retrieve_m("https://rhn.redhat.com/rhn/software/channel/downloads/Download.do?cid=15989",
                             opener = opener).read()
    html = lxml.html.fromstring(html)

    urls = html.xpath("//div[@class='bases']/div//a/@href")
    urls = [(url.split("?")[0].split("/")[-1], url) for url in urls]
    for url in sorted(urls, key = lambda url: url[0]):
        yield url

    urls = html.xpath("//div[@class='incrementals']/div//a/@href")
    urls = [(url.split("?")[0].split("/")[-1], url) for url in urls]
    for url in sorted(urls, key = lambda url: url[0]):
        yield url

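# A sketch of how the urls() generator might be consumed: each yielded
# (filename, url) pair is fetched into the current directory and made
# read-only.  The real consumer may differ (for instance it may need to reuse
# the authenticated opener); the retry count is an arbitrary illustrative value.
for filename, url in urls():
    common.retrieve(url, filename, tries=10)
    common.mkro(filename)
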
def download_item_page(item, tries = 1):
    try:
        html = common.retrieve_m(item.pageurl, tries = tries)
    except urllib.error.HTTPError as e:
        warn("can't load item page %s (#%u, %s, %s) (%s), continuing..." % \
             (item.pageurl, item.number, item.title, item.type_, e))
        return

    xml = lxml.html.parse(html)
    try:
        if item.type_ == "Videos":
            item.set_dlurl(xml.xpath("//a[text()='OGG']/@href")[0])
            extension = ".ogg"
        else:
            item.set_dlurl(xml.xpath("//a[@class='ResourceFile']/@href")[0])
            extension = ".pdf"
    except IndexError:
        if item.type_ != "Other Resources":
            # "Other Resources" often contain a text article with no link
            warn("can't find item download link at %s (#%u, %s, %s), continuing..." %
                 (item.pageurl, item.number, item.title, item.type_))
        return

    download_item(item, extension, tries = tries)

db = common.DB(".sync-db")
now = time.gmtime()

for line in config["lists-sync"]:
    line = line.split(" ")
    url = line[0].rstrip("/")
    _list = url.split("/")[-1]

    credentials = None
    if len(line) == 3:
        credentials = urllib.urlencode(dict(zip(("username", "password"), line[1:3])))

    index = common.retrieve_m(url, credentials)
    index_xml = lxml.html.parse(index).getroot()
    index.close()

    for href in index_xml.xpath("//a[substring-after(@href, '.') = 'txt.gz']/@href"):
        tm = time.strptime(href, "%Y-%B.txt.gz")
        path = "%s/%04u/%02u" % (_list, tm.tm_year, tm.tm_mon)

        if tm.tm_year < int(config["lists-start-year"]):
            break

        if not path in db or not os.path.isfile(path):
            common.mkdirs(os.path.split(path)[0])

            try:
                f = common.retrieve_tmpfile(url + "/" + href, credentials)
            except urllib2.HTTPError, e:

opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)

# Permit write of UTF-8 characters to stderr (required when piping output)
if sys.stderr.encoding == None:
    sys.stderr = codecs.getwriter("UTF-8")(sys.stderr)

threads = int(config["resourcelibrary-threads"])

# When multi-threaded, silence progress meter
if threads > 1:
    common.progress = lambda x, y: None
    common.progress_finish = lambda: None

tries = 10

# Initialise cookie store
common.retrieve_m("http://www.redhat.com/resourcelibrary/results", tries = tries)

i = 0
indexurl = "http://www.redhat.com/resourcelibrary/results?portal:componentId=bf73926d-2aa3-4b8b-bf8d-a1f6f56b8469&portal:type=action&actionType=orderBy&orderBy=Date-Desc&resultPerPage=100"

while True:
    html = common.retrieve_m(indexurl, tries = tries)
    xml = lxml.html.parse(html)

    for indexitem in xml.xpath("//div[@id='sidebar-left-events']"):
        indexitem = copy.deepcopy(indexitem)
        i += 1

        item = Item()
        item.number = i
        item.type_ = indexitem.xpath("//b[text()='Type:']")[0].tail.strip()

    urllib.request.HTTPCookieProcessor(cj))
urllib.request.install_opener(opener)

# Permit write of UTF-8 characters to stderr (required when piping output)
if sys.stderr.encoding == None:
    sys.stderr = codecs.getwriter("UTF-8")(sys.stderr)

threads = int(config["resourcelibrary-threads"])

# When multi-threaded, silence progress meter
if threads > 1:
    common.progress = lambda x, y: None
    common.progress_finish = lambda: None

tries = 10

# Initialise cookie store
common.retrieve_m("http://www.redhat.com/resourcelibrary/results", tries=tries)

i = 0
indexurl = "http://www.redhat.com/resourcelibrary/results?portal:componentId=bf73926d-2aa3-4b8b-bf8d-a1f6f56b8469&portal:type=action&actionType=orderBy&orderBy=Date-Desc&resultPerPage=100"

while True:
    html = common.retrieve_m(indexurl, tries=tries)
    xml = lxml.html.parse(html)

    for indexitem in xml.xpath("//div[@id='sidebar-left-events']"):
        indexitem = copy.deepcopy(indexitem)
        i += 1

        item = Item()
        item.number = i
        item.type_ = indexitem.xpath("//b[text()='Type:']")[0].tail.strip()

for line in config["lists-sync"]:
    line = line.split(" ")
    url = line[0].rstrip("/")
    _list = url.split("/")[-1]

    if args["list"] and _list != args["list"]:
        continue

    credentials = None
    if len(line) == 3:
        credentials = urllib.parse.urlencode(
            dict(zip(("username", "password"), line[1:3]))).encode("utf-8")

    index = common.retrieve_m(url, credentials)
    index_xml = lxml.html.parse(index).getroot()
    index.close()

    for href in index_xml.xpath(
            "//a[substring-after(@href, '.') = 'txt.gz']/@href"):
        tm = time.strptime(href, "%Y-%B.txt.gz")
        path = "%s/%04u/%02u" % (_list, tm.tm_year, tm.tm_mon)

        if tm.tm_year < int(config["lists-start-year"]):
            break

        if not path in db or not os.path.isfile(path):
            common.mkdirs(os.path.split(path)[0])

            req = urllib.request.Request(url + "/" + href, credentials,
                                         {"Accept-Encoding": "gzip"})

threads = int(config["clearspace-threads"])
if threads > 1:
    common.progress = lambda x, y: None
    common.progress_finish = lambda: None

for i in range(threads):
    t = threading.Thread(target=worker, name=i)
    t.daemon = True
    t.start()

keep = set()
step = 50
tries = 10

for i in itertools.count(step=step):
    f = common.retrieve_m(
        config["clearspace-root"] +
        "/view-documents.jspa?start=%u&numResults=%u&filter=presentations" % (i, step),
        tries=tries,
    )
    index = DocIndex(f.read())
    f.close()

    for item in index.items:
        q.put((download, item, db, tries))

    if len(index.items) != step:
        break

q.join()

for dirpath, dirnames, filenames in os.walk(".", topdown=False):
    # remove local files which are no longer found in clearspace
    for f in filenames: