def get_for_keyword(self, keyword):
    logger.info(lambda: "Fetching quotes from Goodreads for keyword=%s" % keyword)
    url = iri2uri("https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s" % keyword)
    soup = Util.html_soup(url)
    page_links = list(Util.safe_map(
        int,
        [pagelink.contents[0]
         for pagelink in soup.find_all(href=re.compile('quotes/tag.*page='))]))
    if page_links:
        page = random.randint(1, max(page_links))
        url = iri2uri("https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s&page=%d" % (keyword, page))
        soup = Util.html_soup(url)
    return self.get_from_soup(url, soup)
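# A minimal illustration (not part of the original code) of what iri2uri does for
# the Goodreads URLs above: the query string embeds U+2713 (a check mark), and
# httplib2.iri2uri percent-encodes any non-ASCII characters so the result is a
# plain ASCII URL that urllib/httplib will accept. The keyword "love" is just an
# example value.
from httplib2 import iri2uri

example = iri2uri(u"https://www.goodreads.com/quotes/tag?utf8=\u2713&id=love")
# example == u"https://www.goodreads.com/quotes/tag?utf8=%E2%9C%93&id=love"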
def get_for_author(self, author):
    logger.info(lambda: "Fetching quotes from Goodreads for author=%s" % author)
    url = iri2uri(u"https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s" % author)
    soup = Util.html_soup(url)
    page_links = list(Util.safe_map(
        int,
        [pagelink.contents[0]
         for pagelink in soup.find_all(href=re.compile('quotes/search.*page='))]))
    if page_links:
        page = random.randint(1, max(page_links))
        url = iri2uri(u"https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s&page=%d" % (author, page))
        soup = Util.html_soup(url)
    return self.get_from_soup(url, soup)
def download_image(self, image_info, image_path, retries=5):
    """Download an image and verify its MD5 sum and size, retrying on mismatch."""
    count = retries
    image_url = httplib2.iri2uri(image_info['original_url'])
    image_path_temp = image_path + "_temp"
    while count > 0:
        count -= 1
        # Doing the actual downloading
        urllib.urlretrieve(image_url, image_path_temp)
        # Checking the image
        image_data = SmugMug.load_image(image_path_temp)
        image_md5sum = hashlib.md5(image_data).hexdigest()
        image_size = str(len(image_data))
        if image_md5sum != image_info['md5_sum']:
            print "MD5 sum doesn't match."
        elif image_size != str(image_info['size']):
            print "Image size doesn't match."
        else:
            # Download is good: move it into place and stop retrying.
            os.rename(image_path_temp, image_path)
            break
        if count > 0:
            print "Retrying..."
        else:
            raise Exception("Error: Too many retries.")
def request(self, uri, method="GET", body=None, headers=None, max_redirects=None, connection_type=None): """Start an HTTP request. @param uri: The uri to retrieve @param method: (optional) The HTTP method to use. Default is 'GET' @param body: (optional) The request body. Default is no body. @param headers: (optional) Additional headers to send. Defaults include C{connection: keep-alive}, C{user-agent} and C{content-type}. @param max_redirects: (optional) The maximum number of redirects to use for this request. The class instance's max_redirects is default @param connection_type: (optional) see L{httplib2.Http.request} @return: (response, content) tuple """ if max_redirects is None: max_redirects = self.max_redirects if headers is None: headers = {} # Prepare headers headers.pop('cookie', None) req = DummyRequest(uri, headers) self.cookiejar.lock.acquire() try: self.cookiejar.add_cookie_header(req) finally: self.cookiejar.lock.release() headers = req.headers # Wikimedia squids: add connection: keep-alive to request headers # unless overridden headers['connection'] = headers.pop('connection', 'keep-alive') # determine connection pool key and fetch connection (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm( httplib2.iri2uri(uri)) conn_key = scheme+":"+authority connection = self.connection_pool.pop_connection(conn_key) if connection is not None: self.connections[conn_key] = connection # Redirect hack: we want to regulate redirects follow_redirects = self.follow_redirects self.follow_redirects = False pywikibot.debug(u"%r" % ( (uri.replace("%7C","|"), method, body, headers, max_redirects, connection_type),), _logger) try: (response, content) = httplib2.Http.request( self, uri, method, body, headers, max_redirects, connection_type) except Exception, e: # what types? # return exception instance to be retrieved by the calling thread return e
def __init__(self, url):
    if not url.startswith("http"):
        url = "http://" + url
    url = httplib2.iri2uri(url)
    # certain urls are self-explicable
    if re_wikiurl.match(url):
        try:
            assert urllib2.unquote(url.encode("ascii")).decode('utf8') != url
        except:
            raise MeaninglessTitle("wikipedia title is within the url")
    try:
        resp = opener_en.open(url.encode("utf-8"), timeout=5)
    except urllib2.URLError as e:
        raise CantGetContents(e)
    info = resp.info()
    if info.type not in ("text/html", "text/xhtml"):
        raise ThisIsNotHTML("this doesn't look like html")
    data = resp.read(262144)
    if info.get('Content-Encoding') == 'gzip':
        data = gzip.GzipFile(fileobj=StringIO(data)).read()
    encoding = info.getparam("charset") or getcharset(data)
    title = x_title(html.fromstring(data, parser=html.HTMLParser(encoding=encoding)))
    if not title:
        raise ThereIsNoTitle(u"there's no title in the first 4⁹ bytes")
    title = title[0].text
    if title is None:
        raise MeaninglessTitle(u"title is present but empty")
    title = clean(title)
    if title == "imgur: the simple image sharer":
        raise MeaninglessTitle("who needs the default imgur title?")
    if title == "Photos" and "core.org.ua" in url:
        raise MeaninglessTitle(u"ramok is showing off his pics again, eh?")
    elif title.lower() in url.lower():
        raise MeaninglessTitle("title text is contained within the url")
    self.shortargs = self.longargs = (title,)
def InitRequestHead(self):
    """Initializes curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers if this
    is a valid HTML file. If it is not a valid HTML file, then we do not
    initiate a GET request, saving unnecessary downloads.
    """
    self._curl_object = pycurl.Curl()
    # Handles sites with unicode URLs.
    if isinstance(self._url, unicode):
        self._url = str(iri2uri(self._url))
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.HEADERFUNCTION, self._GetHeaders)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False)
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)
def _process(self, item):
    url = item["url"]
    log.debug(u"Crawling: %s", url)
    uri = httplib2.iri2uri(url)
    report = {"url": url, "result": None, "status_code": None,
              "visited": None}
    total_start_time = time.time()

    (scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(uri)
    if scheme is None or authority is None:
        report["result"] = u"Invalid URI"
        return report

    try:
        # this line is copied from robotsparser.py:can_fetch
        urllib.quote(urlparse.urlparse(urllib.unquote(url))[2])
    except KeyError:
        report["result"] = u"Malformed URL quoting."
        return report

    try:
        robot_check_result = self.ask_robots(uri, scheme, authority)
        # Graceful stop thing.
        if robot_check_result is None:
            raise Stop()
    except CrawlError, e:
        report["result"] = unicode(e)
        return report
def _load_result(self, result):
    data = None
    if result.rest_url is not None and len(result.rest_url) > 0:
        full_url = result.rest_url + '/legend?f=pjson'
        try:
            f = urllib.request.urlopen(httplib2.iri2uri(full_url), None, self.TIMEOUT)
            data = json.load(f)
        except Exception as e:
            log.error(full_url)
            log.exception(e)
    if data is not None:
        for l in data['layers']:
            if str(l['layerId']) in result.layers.split(','):
                attribute = l['layerName']
                self._insert_attribute(
                    result.layer + ' : ' + attribute,
                    ("ogc_server_id:%s URL:%s Layer" % (result.ogc_server_id, result.rest_url),
                     result.layer),
                )
                for leg in l['legend']:
                    attribute = leg['label']
                    self._insert_attribute(
                        result.layer + ' / ' + l['layerName'] + ' : ' + attribute,
                        ("ogc_server_id:%s Layer:%s Sublayer" % (result.ogc_server_id, result.layer),
                         l['layerName']),
                    )
def extractRealSupportedURI(uri):
    """Returns the "real" URI if it survives redirects and returns a 200.
    Returns None otherwise.
    """
    realURI = None
    try:
        # this function follows the URI, resolving all redirects,
        # and detects redirect loops
        # iri2uri is needed for IRIs
        request = urllib.request.urlopen(httplib2.iri2uri(uri))
        if request.getcode() == 200:
            realURI = request.geturl()
    except urllib.error.HTTPError as e:
        # something went wrong, we don't care what
        realURI = None
    except urllib.error.URLError as e:
        # something went wrong, we don't care what
        realURI = None
    except UnicodeError as e:
        # something went very wrong with the IRI decoding
        realURI = None
    return realURI
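# Illustrative usage of extractRealSupportedURI above (the URL is an arbitrary
# example, not taken from the original module): the helper resolves redirects
# and returns the final URL on a 200 response, or None on any HTTP, URL or
# Unicode error.
final_uri = extractRealSupportedURI(u"https://en.wikipedia.org/wiki/Köln")
if final_uri is not None:
    print("resolved to", final_uri)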
def parse(word):
    all_news = []
    url = httplib2.iri2uri(
        'https://smi2.ru/api/search?limit=100&offset=0&order=date&query={}'.format(word))
    isError = True
    for q in range(5):
        try:
            jsonurl = urllib.request.urlopen(url)
        except urllib.error.HTTPError as error:
            print(error)
            print('Trying again')
        else:
            isError = False
            break
    if not isError:
        obj = json.load(jsonurl)
        articles = obj['articles']
        for art in articles:
            if int(art['create_date']) < time.time() - 86400:
                continue
            news = {'title': '', 'article': '', 'link': ''}
            news['title'] = art['title_original']
            news['article'] = art['announce_original']
            news['link'] = art['share_url']
            all_news.append(news)
        return all_news
    else:
        return []
def _get_links(self, url, page):
    soup = BeautifulSoup(page)
    links = []
    a = soup.findAll('a')
    for tag in a:
        link = tag.get('href', None)
        print '1', link
        link1 = urlparse.urljoin(url, link)
        print '2', link1
        link2 = httplib2.iri2uri(link)
        print '3', link2
        pattern = self._site_pattern[self.chosen_site]
        if pattern.match(link):
            print link
            links.append(link)  # collect matching links so the method returns them
    # path = urlparse.urlparse(link)[2]
    ## print path,
    # if pattern.match(path):
    #     is_required_pattern = True
    # link = urlparse.urljoin(index_url, link)
    # link = httplib2.iri2uri(link)
    # if link and self._is_required_pattern(index_url, link):
    #     links.append(link)
    # for tag in a:
    #     link = tag.get('href', None)
    #     link = urlparse.urljoin(index_url, link)
    #     link = httplib2.iri2uri(link)
    #     if link and self._is_required_pattern(index_url, link):
    #         links.append(link)
    # links_set = set(links)
    # links = list(links_set)
    return links
def search(text):
    url = u'https://www.youtube.com/results?search_query={}'.format(u'+'.join(text))
    url = iri2uri(url)
    print(url)
    yt = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(yt, 'html.parser')
    a = soup.find_all('div')
    links = []
    titles = []
    for i in a:
        if i.get("data-context-item-id"):
            video = i.h3.a
            link = video.get('href')
            title = video.get('title')
            if not link.startswith('/watch'):
                continue
            links.append("http://www.youtube.com" + link)
            titles.append(title)
    return links, titles
def process_user(username, fullname):
    filename = 'github/{}.csv'.format(username)
    filename_tmp = '{}.tmp'.format(filename)
    with open(filename_tmp, 'a'):
        os.utime(filename_tmp, None)
    uri_param = httplib2.iri2uri(fullname.replace(' ', '+'))
    url = u'{}/search?q={}&type=Users'.format(GITHUB_URL, uri_param)
    text = read_page(url)
    soup = BeautifulSoup(text)
    user_info = soup.find(class_='user-list-info')
    if not user_info:
        os.rename(filename_tmp, filename)
        soup.decompose()
        return
    a = user_info.find('a')
    github_username = a['href'][1:]
    with open(filename_tmp, 'w') as f:
        f.write(github_username + '\n')
    print "link stackoverflow '{}' to github '{}'".format(username, github_username)
    soup.decompose()
    commits = process_days(github_username, filename_tmp)
    os.rename(filename_tmp, filename)
    if github_username in CACHE:
        del CACHE[github_username]
def is_product(self, url): soup = self.get_soup(httplib2.iri2uri(url)) try: isproduct = len(soup.find("a", {"class": "next i-next"})) > 0 return (isproduct) except: return (False)
def get_links(self, url=None, opt=None):
    '''
    Return a list of links found on a webpage; opt is the tag you want to
    filter on. Example: opt='a' returns the links from all the 'a' tags.
    '''
    links = {}
    url = iri2uri(url)
    html_stream = urllib.urlopen(url)
    html_string = html_stream.read()
    html_stream.close()
    lxml_web_page = lxml.html.fromstring(html_string)
    # transform all the urls in absolute urls
    for elem, attr, link, pos in lxml_web_page.iterlinks():
        absolute = urlparse.urljoin(url, link.strip())
        if elem.tag in links:
            links[elem.tag].append(absolute)
        else:
            links[elem.tag] = [absolute]
    if opt is None:
        list_links = []
        for tag, tag_links in links.iteritems():
            for tag_link in tag_links:
                list_links.append(tag_link)
        return list(set(list_links))
    else:
        links_opt = []
        try:
            links_opt = list(set(links[opt]))
        except KeyError:
            # No links with the tag 'opt'
            pass
        return links_opt
def is_product(self, url): soup = self.get_soup(httplib2.iri2uri(url)) self.logger.info(url) cstr = soup.find('div', {"class": "product-count"}).text.strip() end = int(cstr.split("sur")[1].replace(".", "").strip()) start = int(cstr.split("sur")[0].split("-")[1].strip()) return (start < end)
def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
        link: the url that is inserted to the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
        # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
        link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
            all(link not in x for x in link_lists)):
        for clue in LINK_CLUES:
            if clue in link.lower():
                if link_parsed[0].startswith('https'):
                    self._clues_secure_links.append(link)
                    return
                else:
                    self._clues_general_links.append(link)
                    return
        if link_parsed[0].startswith('https'):  # No clues found in the link.
            self._secure_links.append(link)
        else:
            self._general_links.append(link)
def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects): """ Internal function to follow a redirect recieved by L{request} """ (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri)) if self.cache: cachekey = defrag_uri else: cachekey = None # Pick out the location header and basically start from the beginning # remembering first to strip the ETag header and decrement our 'depth' if not response.has_key('location') and response.status != 300: raise httplib2.RedirectMissingLocation("Redirected but the response is missing a Location: header.", response, content) # Fix-up relative redirects (which violate an RFC 2616 MUST) if response.has_key('location'): location = response['location'] (scheme, authority, path, query, fragment) = httplib2.parse_uri(location) if authority == None: response['location'] = httplib2.urlparse.urljoin(uri, location) logging.debug('Relative redirect: changed [%s] to [%s]' % (location, response['location'])) if response.status == 301 and method in ["GET", "HEAD"]: response['-x-permanent-redirect-url'] = response['location'] if not response.has_key('content-location'): response['content-location'] = absolute_uri httplib2._updateCache(headers, response, content, self.cache, cachekey) headers.pop('if-none-match', None) headers.pop('if-modified-since', None) if response.has_key('location'): location = response['location'] redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method return self.request(location, redirect_method, body=body, headers = headers, max_redirects = max_redirects - 1) else: raise httplib2.RedirectLimit("Redirected more times than redirection_limit allows.", response, content)
def get_url(self):
    url = "https://www.google.com/search?num=20&q=-youtube.com+"
    url = url + self.title.replace(" ", "+")
    url = url + "+"
    url = url + self.artist.replace(" ", "+")
    url = url + "+lyrics"
    url = str(iri2uri(url))
    return url
def is_product(self, url): soup = self.get_soup(httplib2.iri2uri(url)) try: isnotdisabled = not (soup.find("span", {"class": "disabled"}) == None) return (isnotdisabled) except: return (False)
def is_product(self, url):
    soup = self.get_soup(httplib2.iri2uri(url))
    try:
        isproduct = len(soup.find_all("div", {"class": "product-container fr"})) > 0
        return isproduct
    except:
        return False
def CheckSubmissions(subreddit):
    """
    Given a PRAW subreddit, marks expired links and returns a list of the
    submissions that were marked. It also returns a list of submissions we were
    unable to process (either because we don't know how to find the price or
    because we were unable to get the price).
    """
    modified_submissions = []
    needs_review_submissions = []
    needs_review_cache = LoadCacheFromFile(NEEDS_REVIEW_CACHE_FILE)
    already_expired_cache = LoadCacheFromFile(ALREADY_EXPIRED_CACHE_FILE)

    for rank, submission in enumerate(subreddit.get_hot(limit=MAX_SUBMISSIONS)):
        submission.rank = rank  # Used when creating digests for the mods

        # Both urllib2.urlopen() and the file writer to save the cache have trouble
        # when a submission's URL contains Unicode characters. Consequently, we
        # encode any stray Unicode characters right away so we don't need to worry
        # about it later.
        submission.url = httplib2.iri2uri(submission.url)

        # Skip anything already marked as expired, unless it's test data.
        if (submission.link_flair_css_class == EXPIRED_CSS_CLASS or
                submission.url in already_expired_cache) and not TEST_DATA:
            continue

        price = GetPrice(submission.url)
        # The price might be the empty string if we're unable to get the real price.
        if not price:
            if IsKnownFree(submission.url):
                # No human review needed!
                continue
            if submission.url not in needs_review_cache:
                needs_review_submissions.append(submission)  # Send it to the mods!
            # Regardless of whether we need to tell the mods, move this submission to
            # the front of the cache.
            needs_review_cache[submission.url] = True  # Dummy value
            continue

        # This next line is a little hard for non-Python people to read. It's
        # asking whether any nonzero digit is contained in the price.
        if not any(digit in price for digit in "123456789"):
            continue  # It's still free!

        # If we get here, this submission is no longer free. Make a comment
        # explaining this and set the flair to expired.
        if not DRY_RUN:
            submission.add_comment(EXPIRED_MESSAGE % (price, submission.permalink))
            subreddit.set_flair(submission, EXPIRED_FLAIR, EXPIRED_CSS_CLASS)
        # Add it to the cache, so that if we have made a mistake and this
        # submission is later un-expired, we don't re-expire it the next day.
        already_expired_cache[submission.url] = True  # Dummy value
        submission.list_price = price  # Store this to put in the digest later.
        modified_submissions.append(submission)

    if not DRY_RUN and not TEST_DATA:  # Don't change the next run's cache if this is just a test
        StoreCacheToFile(already_expired_cache, ALREADY_EXPIRED_CACHE_FILE)
        StoreCacheToFile(needs_review_cache, NEEDS_REVIEW_CACHE_FILE)
    return modified_submissions, needs_review_submissions
def run(self):
    config = self.config
    url = config['urls']
    soup = self.get_soup(httplib2.iri2uri(url))
    # get segments
    try:
        catlist = self.get_catgorylinks(soup)
    except:
        catlist = []
    if len(catlist) > 0:
        for cat in catlist:
            config['Category'] = list(cat.keys())[0]
            url = cat[config['Category']]
            soup = self.get_soup(httplib2.iri2uri(url))
            try:
                allseg = self.get_allseg(soup)
            except:
                allseg = dict()
            if len(allseg) > 0:
                for seg in list(allseg.keys()):
                    url = allseg[seg]
                    config['segment'] = seg
                    soup = self.get_soup(httplib2.iri2uri(url))
                    try:
                        allsubseg = self.get_allseg(soup)
                    except:
                        allsubseg = dict()
                    if len(allsubseg) > 0:
                        for subseg in list(allsubseg.keys()):
                            url = allsubseg[subseg]
                            config['Sub-segment'] = subseg
                            self.get_proddata(url)
                    else:
                        config['Sub-segment'] = "None"
                        self.get_proddata(url)
            else:
                config['segment'] = "None"
                config['Sub-segment'] = "None"
                self.get_proddata(url)
    else:
        config['Category'] = "None"
        config['segment'] = "None"
        config['Sub-segment'] = "None"
        self.get_proddata(url)
def is_product(self, url): soup = self.get_soup(httplib2.iri2uri(url)) try: isnotdisabled = not (soup.find( "li", {"id": "pagination_next"})["class"].strip() == "disabled") return (isnotdisabled) except: return (False)
def main():
    parser = argparse.ArgumentParser(
        description="Organise music folders in <artist> - <album> [<tags>...] "
                    "looking for tags at LastFM.")
    parser.add_argument("sourcedir",
                        help="source dir where the music folders to modify are present")
    parser.add_argument("targetdir",
                        help="target dir where to place the converted music directories")
    args = parser.parse_args()

    subdirs = [x[0] for x in os.walk(args.sourcedir) if x[0] != "Cover"]
    ok_entries = []
    failed_files = []
    bad_tags = ["metal", "electronic", "electronica"]

    for folder in subdirs:
        files = [f for f in os.listdir(folder) if f.endswith("mp3")]
        isleaf = 1
        for f in os.listdir(folder):
            if os.path.isdir(f):
                isleaf = 0
        if isleaf == 1 and len(files) > 0:
            mp3file = files[0]
            artist = ""
            try:
                mp3info = EasyID3(folder + "/" + mp3file)
                artist = mp3info["artist"][0]
                album = mp3info["album"][0]
                # extract tags from lastfm
                tags = []
                url = "http://www.last.fm/es/music/" + artist.replace(" ", "+")
                page = html.fromstring(urllib.urlopen(httplib2.iri2uri(url)).read())
                for element in page.xpath("//li[@class='tag']/a"):
                    tags.append(element.text)
                tags = filter((lambda x: x not in bad_tags), tags)
                if len(tags) == 0:
                    raise Exception("no tags")
                else:
                    print(tags)
                    ok_entries.append([folder, artist, album, mp3file, tags])
                    new_folder_name = args.targetdir + artist + " - " + album
                    for tag in tags:
                        new_folder_name += " [" + tag + "]"
                    call(["mv", folder, new_folder_name])
            except (Exception, mutagen._id3util.ID3NoHeaderError) as e:
                print(e)
                traceback.print_exc()
                failed_files.append([folder, mp3file])
def parseHouse(self, response): if self.close_down: raise CloseSpider(reason='Duplicate house') hxs = HtmlXPathSelector(response) item = HouseItem() item['currency'] = "€" item['title'] = hxs.select('//h1[contains(@class, "long_subject")]/text()').extract() Concelho = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Concelho")]').select('../text()').extract()[1].strip() Freguesia = "" try: Freguesia = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Freguesia")]').select('../text()').extract()[1].strip() except: log.msg("No Freguesia", level=log.INFO) item['address'] = Concelho + ' ' + Freguesia item['link'] = response.url item['size'] = int(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Tipologia")]').select('../text()').extract()[1].strip().replace("T","")[0]) item['desc'] = hxs.select('//div[contains(@class, "body_text")]/text()').extract() locale.setlocale(locale.LC_ALL, 'pt_PT') item['publication'] = hxs.select('//p[contains(@class,"right")]/text()').extract()[1].strip() if re.match("Ontem|Hoje", item['publication']) is None: log.msg("date is older, analyse...", level=log.INFO) computed_date = datetime.datetime.strptime(item['publication'], "%d %b %H:%M") if datetime.date.today().month is 1 and computed_date.month is 12: year = (datetime.date.today() - datetime.timedelta(days=365)).year else: year = datetime.date.today().year computed_date = datetime.datetime.strptime(item['publication'] + ' %d' % year, "%d %b %H:%M %Y") item['publication'] = computed_date.strftime("%d-%m-%Y") one_month_ago = datetime.datetime.today() - datetime.timedelta(days=30) if computed_date < one_month_ago: log.msg("Too old...", level=log.INFO) raise CloseSpider('Houses are too old') image_from_script = hxs.select('//div[contains(@id, "slider")]/script/text()').extract() images_urls = image_from_script[0].split('[')[1].split(',') images_urls[-1] = images_urls[-1].split(']')[0] try: item['lat'] = float(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Ver mapa")]').select('../a/@onclick').extract()[0].split(',')[0].split('(')[1]) item['lng'] = float(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Ver mapa")]').select('../a/@onclick').extract()[0].split(',')[1]) except: iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true" result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0] item['lat'] = result['geometry']['location']['lat'] item['lng'] = result['geometry']['location']['lng'] item['image_urls'] = [] for image_url in images_urls: item['image_urls'].append(image_url.replace("'","")) try: item['price'] = hxs.select('//span[contains(@class, "coolprice")]/text()').extract()[0].strip() yield item except: log.msg("no prices, no houses", level=log.INFO)
def is_product(self, url):
    soup = self.get_soup(httplib2.iri2uri(url))
    try:
        isnotlast = soup.find(
            "li", {"class": "ais-pagination--item ais-pagination--item__last"}).find("a") is not None
        return isnotlast
    except:
        return False
def shorten(url):
    url = httplib2.iri2uri(url)
    if BITLY_USERNAME and BITLY_KEY:
        if len(url) > 40 or ':' in url:
            try:
                api = bitly.Api(login=BITLY_USERNAME, apikey=BITLY_KEY)
                url = api.shorten(url)
            except:
                pass
    return url
def request(self, method, uri, body=None, headers=None):
    if not self.handle:
        self.connect()
    handle = self.fcurl.curl_handle
    if headers is None:
        headers = {}
    if method == 'GET':
        handle.setopt(pycurl.HTTPGET, 1)
    elif method == 'HEAD':
        handle.setopt(pycurl.NOBODY, 1)
    elif method == 'POST':
        handle.setopt(pycurl.POST, 1)
        if body:
            headers['Content-Length'] = len(body)
            body_IO = StringIO(body)
            handle.setopt(pycurl.READFUNCTION, body_IO.read)
    elif method == 'PUT':
        handle.setopt(pycurl.UPLOAD, 1)
        if body:
            headers['Content-Length'] = len(body)
            body_IO = StringIO(body)
            handle.setopt(pycurl.READFUNCTION, body_IO.read)
    elif method == 'PATCH':
        handle.setopt(pycurl.UPLOAD, 1)
        handle.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
        if body:
            headers['Transfer-Encoding'] = ''
            headers['Content-Length'] = len(body)
            body_IO = StringIO(body)
            handle.setopt(pycurl.READFUNCTION, body_IO.read)
    elif body is not None:
        # Custom method and body provided, error.
        raise Exception("body not supported with custom method %s." % method)
    else:
        # Custom method and no body provided, pretend to do a GET.
        handle.setopt(pycurl.CUSTOMREQUEST, method)
    if self.port:
        netloc = '%s:%s' % (self.host, self.port)
    else:
        netloc = self.host
    url = urlparse.urlunparse((self.scheme, netloc, uri, '', '', ''))
    self.url = str(iri2uri(url))
    handle.setopt(pycurl.URL, self.url)
    if headers:
        handle.setopt(pycurl.HTTPHEADER,
                      ['%s: %s' % (header, str(value))
                       for header, value in headers.iteritems()])
    handle.setopt(pycurl.SSL_VERIFYPEER, 0)
    handle.setopt(pycurl.NOSIGNAL, 1)
    if self.key_file:
        handle.setopt(pycurl.SSLKEY, self.key_file)
    if self.cert_file:
        handle.setopt(pycurl.SSLCERT, self.cert_file)
    if self.timeout:
        handle.setopt(pycurl.TIMEOUT, self.timeout)
def curl_fetch_binary(url, data=None):
    url = iri2uri(url)
    req = Request(url, data=data, headers={
        'User-Agent': userAgent,
        "Accept-Language": "en"
    })
    with urlopen(req) as f:
        data = f.read()
    return data
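# Hypothetical call to curl_fetch_binary above (assumes the module defines
# userAgent and imports Request/urlopen from urllib.request, as the function
# body implies); the non-ASCII file name is only there to show that iri2uri
# percent-encodes it before the request is made.
png_bytes = curl_fetch_binary(u"https://example.org/images/café.png")
with open("cafe.png", "wb") as out:
    out.write(png_bytes)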
def process_prop_query_results(self, url_req, results):
    try:
        uri = httplib2.iri2uri(unicode(url_req))
        req_result = json.loads(urllib.urlopen(uri).read())
        # req_result = json.loads(self.__get_url(url_req).read())
        if 'query-continue' in req_result.keys():
            raise MwQueryError("continue not supported for prop query")
        r = req_result['query']['pages']
        for p in r:
            results[p] = r[p]
    except KeyError:
        print "Empty result for --> %s" % (url_req)
def _common_perform(self, url, headers, accept_self_signed_SSL=False,
                    follow_location=True, body_buffer=None, debug=False):
    """Perform activities common to all FriendlyCURL operations.

    Several parameters are passed through and processed identically for all of
    the \*_url functions, and all produce the same return type.

    :param url: The URL to access. If a unicode string, it will be treated
        as an IRI and converted to a URI.
    :type url: str or unicode
    :param headers: Additional headers to add to the request.
    :type headers: dict
    :param accept_self_signed_SSL: Whether to accept self-signed SSL certs.
    :type accept_self_signed_SSL: bool
    :param follow_location: If True, FriendlyCURL will follow location
        headers on HTTP redirects. If False, the redirect will be returned.
    :type follow_location: bool
    :param body_buffer: A buffer to write body content into.
    :type body_buffer: ``.write(str)``-able file-like object
    :param debug: Turn on debug logging for this request.
    :type debug: bool
    :returns: A tuple containing a dictionary of response headers, including
        the HTTP status as an int in 'status', and a buffer containing the
        body of the response.
    """
    self.curl_handle.setopt(
        pycurl.HTTPHEADER,
        ['%s: %s' % (header, str(value)) for header, value in headers.iteritems()])
    if isinstance(url, unicode):
        url = str(iri2uri(url))
    self.curl_handle.setopt(pycurl.URL, url)
    if body_buffer:
        body = body_buffer
    else:
        body = StringIO()
    self.curl_handle.setopt(pycurl.FORBID_REUSE, 1)
    self.curl_handle.setopt(pycurl.WRITEFUNCTION, body.write)
    header = StringIO()
    self.curl_handle.setopt(pycurl.HEADERFUNCTION, header.write)
    if accept_self_signed_SSL == True:
        self.curl_handle.setopt(pycurl.SSL_VERIFYPEER, 0)
    if follow_location == True:
        self.curl_handle.setopt(pycurl.FOLLOWLOCATION, 1)
    if debug:
        self.curl_handle.setopt(pycurl.VERBOSE, 1)
        self.curl_handle.setopt(pycurl.DEBUGFUNCTION, debugfunction)
    self.curl_handle.perform()
    body.seek(0)
    headers = [hdr.split(': ') for hdr in header.getvalue().strip().split('\r\n')
               if hdr and not hdr.startswith('HTTP/')]
    response = dict((header[0].lower(), header[1]) for header in headers)
    response['status'] = self.curl_handle.getinfo(pycurl.HTTP_CODE)
    return (response, body)
def is_product(self, url): soup = self.get_soup(httplib2.iri2uri(url)) try: currpage = soup.find("li", {"id": "pagination_next"}) if (currpage == None): return False elif (currpage['class'][0] == "disabled"): return False else: return True except: return False
def process_prop_query_results(self, url_req, results): """Process the result of a prop query.""" try: uri = httplib2.iri2uri(unicode(url_req)) req_result = json.loads(urllib.urlopen(uri).read()) # req_result = json.loads(self.__get_url(url_req).read()) if 'query-continue' in req_result: raise MwQueryError("continue not supported for prop query") r = req_result['query']['pages'] for p in r: results[p] = r[p] except KeyError: print "Empty result for --> %s" % (url_req)
def _get_links(self, index_url, page):
    soup = BeautifulSoup(page)
    links = []
    for tag in soup.findAll('a'):
        link = tag.get('href', None)
        link = urlparse.urljoin(index_url, link)
        link = httplib2.iri2uri(link)
        if link and self._is_required_pattern(index_url, link):
            links.append(link)
    links_set = set(links)
    links = list(links_set)
    return links
def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects): """Internal function to follow a redirect recieved by L{request}""" (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri)) if self.cache: cachekey = defrag_uri else: cachekey = None # Pick out the location header and basically start from the beginning # remembering first to strip the ETag header and decrement our 'depth' if "location" not in response and response.status != 300: raise httplib2.RedirectMissingLocation( "Redirected but the response is missing a Location: header.", response, content) # Fix-up relative redirects (which violate an RFC 2616 MUST) if "location" in response: location = response['location'] (scheme, authority, path, query, fragment) = httplib2.parse_uri(location) if authority is None: response['location'] = httplib2.urlparse.urljoin(uri, location) pywikibot.debug( u"Relative redirect: changed [%s] to [%s]" % (location, response['location']), _logger) if response.status == 301 and method in ["GET", "HEAD"]: response['-x-permanent-redirect-url'] = response['location'] if "content-location" not in response: response['content-location'] = absolute_uri httplib2._updateCache(headers, response, content, self.cache, cachekey) headers.pop('if-none-match', None) headers.pop('if-modified-since', None) if "location" in response: location = response['location'] redirect_method = ( (response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method return self.request(location, redirect_method, body=body, headers=headers, max_redirects=max_redirects - 1) else: raise httplib2.RedirectLimit( "Redirected more times than redirection_limit allows.", response, content)
def open(self, url, encoding=None):
    if isinstance(url, (str, unicode)):
        if isinstance(url, unicode):
            url = url.encode('utf-8')
        log.info('Retrieving "{url}"'.format(**locals()))
        try:
            return self.opener.opener.open(url, encoding)
        except UnicodeEncodeError:
            uri = iri2uri(url)
            return self.opener.opener.open(uri, encoding)
    else:
        req = url
        log.info('Retrieving "{url}"'.format(url=req.get_full_url()))
        return self.opener.opener.open(req, encoding)
def run(self):
    config = self.config
    url = config['urls']
    soup = self.get_soup(url)
    # get segments
    catlist = self.get_catgorylinks(soup)
    if len(catlist) > 0:
        for cat in catlist:
            config['Category'] = list(cat.keys())[0]
            url = cat[config['Category']]
            soup = self.get_soup(httplib2.iri2uri(url))
            allseg = self.get_allseg(soup)
            if len(allseg) > 0:
                for seg in allseg:
                    config['segment'] = list(seg.keys())[0]
                    url = seg[config['segment']]
                    soup = self.get_soup(httplib2.iri2uri(url))
                    allsubseg = self.get_allsubseg(soup)
                    if len(allsubseg) > 0:
                        for subseg in allsubseg:
                            config['Sub-segment'] = list(subseg.keys())[0]
                            url = subseg[config['Sub-segment']]
                            self.get_proddata(url)
                    else:
                        config['Sub-segment'] = "None"
                        self.get_proddata(url)
            else:
                config['segment'] = "None"
                config['Sub-segment'] = "None"
                self.get_proddata(url)
    else:
        config['Category'] = "None"
        config['segment'] = "None"
        config['Sub-segment'] = "None"
        self.get_proddata(url)
def downloadHtmlImages(self, html):
    parser = ImgParser()
    parser.feed(html)
    urls = parser.image_urls
    imagesPath = os.path.join(self.rootDir, 'OEBPS', 'images')
    if not os.path.exists(imagesPath):
        os.makedirs(imagesPath)
    for url in urls:
        if 'http://' in url.lower() or 'https://' in url.lower():
            print "downloading " + url
            src = os.path.join(self.rootDir, os.path.split(url)[1])
            dest = os.path.join("images", os.path.split(url)[1])
            urllib.urlretrieve(httplib2.iri2uri(url), src)
            self.addImage(src, dest)
    return urls
def prepare_wiki_resp(name, article, url):
    name = unicode(name, 'utf-8')
    article = article.replace('_', ' ').strip()
    if not article:
        return ""
    if '#' in url:
        url_hash = url.split('#', 1)[-1]
        url_hash = httplib2.iri2uri(url_hash)
        url_hash = url_hash.replace(' ', '_').replace('%', '.')
        print url_hash
        url = url.split('#', 1)[0] + '#' + url_hash
    url = url.replace(' ', '%20')
    url = shorten(url)
    resp = name + ': ' + article + ' ' + url
    return '/me ' + resp
def send_to_api(self, request, debug=False): # add action to url url_req = "%s?action=%s" % (self.url, request.action) # add each property for k in request.prop.keys(): #url_req += "&%s=%s" % (k, urlEncodeNonAscii(request.prop[k])) url_req += "&%s=%s" % (k, self.__encode_param(request.prop[k])) # add the format url_req += "&format=%s" % (request.format) # print url_req if not debug: uri = httplib2.iri2uri(unicode(url_req)) return urllib.urlopen(uri).read() else: return url_req
def is_product(self, url):
    soup = self.get_soup(httplib2.iri2uri(url))
    currpageid = url.split("=")[-1]
    try:
        cstr = soup.find('div', {"class": "pager"}).find("a")['href']
        newpageid = cstr.split("=")[-1]
        return currpageid < newpageid
    except:
        try:
            cstr = soup.find('div', {"class": "pager loaded"}).find("a")['href']
            newpageid = cstr.split("=")[-1]
            return currpageid < newpageid
        except:
            return False
def is_product(self, url):
    soup = self.get_soup(httplib2.iri2uri(url))
    self.logger.info(url)
    try:
        cpg = int(url.split("=")[-1])
        numli = len(soup.find('ul', {"class": "pages"}).find_all("li", recursive=False))
        npg = int(soup.find('ul', {"class": "pages"})
                  .find_all("li", recursive=False)[numli - 2]
                  .find("a")["data-value"])
        return cpg < npg
    except:
        return False
def parseHouse(self, response):
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['title'] = hxs.select('//h1[contains(@class, "long_subject")]/text()').extract()
    Concelho = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Concelho")]').select('../text()').extract()[1].strip()
    Freguesia = ""
    try:
        Freguesia = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Freguesia")]').select('../text()').extract()[1].strip()
    except:
        print "No Freguesia"
    item['address'] = Concelho + ' ' + Freguesia
    item['link'] = response.url
    item['size'] = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Tipologia")]').select('../text()').extract()[1].strip()
    item['desc'] = hxs.select('//div[contains(@class, "body_text")]/text()').extract()
    item['price'] = hxs.select('//span[contains(@class, "coolprice")]/text()').extract()[0].strip()
    item['publication'] = hxs.select('//p[contains(@class,"right")]/text()').extract()[1].strip()
    image_from_script = hxs.select('//div[contains(@id, "slider")]/script/text()').extract()
    images_urls = image_from_script[0].split('[')[1].split(',')
    try:
        if "googleapis" in images_urls[-3]:
            item['lng'] = images_urls[-1].split("'")[0]
            item['lat'] = images_urls[-2].split('C')[-1]
            images_urls.remove(images_urls[-1])
            images_urls.remove(images_urls[-1])
            images_urls.remove(images_urls[-1])
        else:
            raise RunExceptCode
    except:
        iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
        result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ", "%20")))['results'][0]
        item['lat'] = result['geometry']['location']['lat']
        item['lng'] = result['geometry']['location']['lng']
    item['image_urls'] = []
    for image_url in images_urls:
        item['image_urls'].append(image_url.replace("'", ""))
    yield item
def __init__(self, *args, **kwargs):
    super(LaconicModel, self).__init__(*args, **kwargs)
    graph = Graph()
    graph.bind('dbpedia', 'http://dbpedia.org/resource/')
    graph.bind('rdfs', 'http://www.w3.org/2000/01/rdf-schema#')
    graph.bind('schema', 'http://schema.org/')
    home = hyperspace.jump('http://dyli-thingy.herokuapp.com/', client=http_client)
    thing = home.queries['lookup'][0].build({'iri': self.iri}).submit()
    graph = graph + thing.data
    if len(graph) == 0:
        raise LaconicModel.DoesNotExist('No data found for: ' + self.iri)
    self._graph = graph
    factory = laconia.ThingFactory(graph)
    self._entity = factory(iri2uri(self.iri))
def send_http_request(method, request_url, body=None, request_headers={}):
    uri = httplib2.iri2uri(request_url)
    (scheme, authority, request_uri) = httplib2.urlnorm(uri)[:3]
    address = _get_hostport(authority)
    http_client = httplib2.HTTPConnectionWithTimeout(address[0], port=address[1])
    if http_client.sock is None:
        http_client.connect()
    http_client.putrequest(method,
                           request_uri.encode(DEFAULT_HTTP_URI_CHARSET),
                           skip_host=1, skip_accept_encoding=1)
    for key, value in request_headers.items():
        http_client.putheader(key, value.encode(DEFAULT_HTTP_HEADER_CHARSET))
    http_client.endheaders()
    if body:
        http_client.send(body)
    return http_client.getresponse()
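# Sketch of how send_http_request above might be called (host, path and headers
# are placeholders; the module-level helpers and charset constants it uses are
# assumed to be defined as in the original module). Because putrequest() is
# invoked with skip_host, the Host header must be supplied by the caller, and
# iri2uri takes care of percent-encoding the non-ASCII path segment.
response = send_http_request(
    "GET",
    u"http://example.org/статья",
    request_headers={"Host": "example.org", "Accept": "text/html"})
print(response.status, response.reason)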
def get_html(self): lang = self.request.params.get("lang") name = self.request.params.get("name") if lang == 'lb': lang = 'lu' url = \ "https://wiki.geoportail.lu/doku.php?" \ "id=%s:legend:%s&do=export_html" % \ (lang, name) f = urllib2.urlopen(httplib2.iri2uri(url), None, 15) data = f.read() data = data.replace( "/lib/exe/fetch.php", "https://wiki.geoportail.lu/lib/exe/fetch.php") data = data.replace( "src=\"img/", "src=\"https://wiki.geoportail.lu/img/") data = data.replace( "/lib/exe/detail.php", "https://wiki.geoportail.lu/lib/exe/detail.php") soup = BeautifulSoup(data, "lxml") a_tags = soup.find_all("a") for a_tag in a_tags: if a_tag.get('class') is not None and\ 'media' in a_tag.get('class'): a_tag['target'] = '_blank' img_tags = soup.find_all("img") for img_tag in img_tags: if img_tag.get('style') is None: img_tag['style'] = 'max-width:290px;' res = soup.find("div", {"class": "dokuwiki export"}) if res is not None: data = res.encode_contents() else: data = "" headers = {"Content-Type": f.info()['Content-Type']} return Response(data, headers=headers)
def send_to_api(self, request, debug=False): """Send a request to mediawiki API. Args: request (MwApi): Request to send. debug (bool): if true, then just only return the string of the API request, otherwise return the result. """ # add action to url url_req = "%s?action=%s" % (self.url, request.action) # add each property for k in request.prop: url_req += "&%s=%s" % (k, self.__encode_param(request.prop[k])) # add the format url_req += "&format=%s" % (request.format) # print url_req if not debug: uri = httplib2.iri2uri(unicode(url_req)) return urllib.urlopen(uri).read() else: return url_req
def parseHouse(self, response): if self.close_down: raise CloseSpider(reason='Duplicate house') hxs = HtmlXPathSelector(response) item = HouseItem() item['currency'] = "€" item['title'] = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract() item['address'] = hxs.select('//div[contains(@class, "detaiHeaderLocation")]/text()')[0].extract() item['link'] = response.url item['desc'] = hxs.select('//div[contains(@class, "detailDescription")]/h2/text()').extract() item['price'] = hxs.select('//div[contains(@class, "detailHeaderPriceValue")]/text()')[0].extract().strip() item['state'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Estado")]').select('../span/text()').extract() size_description = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract().split(",")[0].strip() if size_description == "Quarto": item['size'] = 0 elif size_description == "Apartamento": item['size'] = 1 else: item['size'] = int(size_description.split()[-1].replace("T","").strip('+')[0]) item['publication'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Publicado")]').select('../span/text()').extract()[0] computed_date = datetime.datetime.strptime(item['publication'], "%d-%m-%Y") one_month_ago = datetime.datetime.today() - datetime.timedelta(days=30) if computed_date < one_month_ago: log.msg("Too old...", level=log.INFO) raise CloseSpider('Houses are too old') image_urls = hxs.select('//a[contains(@id, "SmallFotos")]/@onclick').extract() item['image_urls'] = [] for image_url in image_urls: item['image_urls'].append(re.findall(r'\'(.+?)\'',image_url)[0]) iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true" result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0] item['lat'] = result['geometry']['location']['lat'] item['lng'] = result['geometry']['location']['lng'] yield item
def get_html(self): lang = self.request.params.get("lang") name = self.request.params.get("name") if lang == 'lb': lang = 'lu' url = \ "https://wiki.geoportail.lu/doku.php?" \ "id=%s:legend:%s&do=export_html" % \ (lang, name) f = urllib2.urlopen(httplib2.iri2uri(url), None, 15) data = f.read() data = data.replace("/lib/exe/fetch.php", "https://wiki.geoportail.lu/lib/exe/fetch.php") data = data.replace("src=\"img/", "src=\"https://wiki.geoportail.lu/img/") data = data.replace("/lib/exe/detail.php", "https://wiki.geoportail.lu/lib/exe/detail.php") soup = BeautifulSoup(data, "lxml") a_tags = soup.find_all("a") for a_tag in a_tags: if a_tag.get('class') is not None and\ 'media' in a_tag.get('class'): a_tag['target'] = '_blank' img_tags = soup.find_all("img") for img_tag in img_tags: if img_tag.get('style') is None: img_tag['style'] = 'max-width:290px;' res = soup.find("div", {"class": "dokuwiki export"}) if res is not None: data = res.encode_contents() else: data = "" headers = {"Content-Type": f.info()['Content-Type']} return Response(data, headers=headers)
def parseHouse(self, response):
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['title'] = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract()
    item['address'] = hxs.select('//div[contains(@class, "detaiHeaderLocation")]/text()')[0].extract()
    item['link'] = response.url
    item['desc'] = hxs.select('//div[contains(@class, "detailDescription")]/h2/text()').extract()
    item['price'] = hxs.select('//div[contains(@class, "detailHeaderPriceValue")]/text()')[0].extract().strip()
    item['state'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Estado")]').select('../span/text()').extract()
    item['publication'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Publicado")]').select('../span/text()').extract()
    item['size'] = item['title'].split(",")[0].split()[-1]
    image_urls = hxs.select('//a[contains(@id, "SmallFotos")]/@onclick').extract()
    item['image_urls'] = []
    for image_url in image_urls:
        item['image_urls'].append(re.findall(r'\'(.+?)\'', image_url)[0])
    iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
    result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ", "%20")))['results'][0]
    item['lat'] = result['geometry']['location']['lat']
    item['lng'] = result['geometry']['location']['lng']
    yield item