Example #1
    def get_for_keyword(self, keyword):
        logger.info(lambda: "Fetching quotes from Goodreads for keyword=%s" % keyword)

        url = iri2uri("https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s" % keyword)
        soup = Util.html_soup(url)
        page_links = list(Util.safe_map(int,
                                        [pagelink.contents[0] for pagelink in
                                         soup.find_all(href=re.compile('quotes/tag.*page='))]))
        if page_links:
            page = random.randint(1, max(page_links))
            url = iri2uri("https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s&page=%d" % (keyword, page))
            soup = Util.html_soup(url)

        return self.get_from_soup(url, soup)
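Note: every example on this page leans on httplib2's iri2uri to turn a Unicode IRI into a plain ASCII URI before fetching it. Below is a minimal standalone sketch of that conversion, assuming only that httplib2 is installed; the keyword value is illustrative.

from httplib2 import iri2uri

iri = u"https://www.goodreads.com/quotes/tag?utf8=\u2713&id=amour"
print(iri2uri(iri))
# Non-ASCII characters are UTF-8 encoded and percent-escaped:
#   https://www.goodreads.com/quotes/tag?utf8=%E2%9C%93&id=amour
# Under Python 2, iri2uri only transforms unicode objects; plain byte strings
# are returned unchanged, which is why several examples use the u"" prefix.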
Example #2
    def get_for_keyword(self, keyword):
        logger.info(lambda: "Fetching quotes from Goodreads for keyword=%s" % keyword)

        url = iri2uri(u"https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s" % keyword)
        soup = Util.html_soup(url)
        page_links = list(Util.safe_map(int,
                                        [pagelink.contents[0] for pagelink in
                                         soup.find_all(href=re.compile('quotes/tag.*page='))]))
        if page_links:
            page = random.randint(1, max(page_links))
            url = iri2uri(u"https://www.goodreads.com/quotes/tag?utf8=\u2713&id=%s&page=%d" % (keyword, page))
            soup = Util.html_soup(url)

        return self.get_from_soup(url, soup)
Example #3
    def get_for_author(self, author):
        logger.info(lambda: "Fetching quotes from Goodreads for author=%s" % author)

        url = iri2uri(u"https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s" % author)
        soup = Util.html_soup(url)
        page_links = list(Util.safe_map(int,
                                        [pagelink.contents[0] for pagelink in
                                         soup.find_all(href=re.compile('quotes/search.*page='))]))
        if page_links:
            page = random.randint(1, max(page_links))
            url = iri2uri(u"https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s&page=%d" % (author, page))
            soup = Util.html_soup(url)

        return self.get_from_soup(url, soup)
Example #4
    def get_for_author(self, author):
        logger.info(lambda: "Fetching quotes from Goodreads for author=%s" % author)

        url = iri2uri("https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s" % author)
        soup = Util.html_soup(url)
        page_links = list(Util.safe_map(int,
                                        [pagelink.contents[0] for pagelink in
                                         soup.find_all(href=re.compile('quotes/search.*page='))]))
        if page_links:
            page = random.randint(1, max(page_links))
            url = iri2uri("https://www.goodreads.com/quotes/search?utf8=\u2713&q=%s&page=%d" % (author, page))
            soup = Util.html_soup(url)

        return self.get_from_soup(url, soup)
Example #5
	def download_image(self, image_info, image_path, retries=5):
		"""Download an image"""
		count = retries
		image_url = image_info['original_url']
		image_url = httplib2.iri2uri(image_url)
		image_path_temp = image_path + "_temp"
		while count > 0:
			count -= 1
			# Doing the actual downloading
			urllib.urlretrieve (image_url, image_path_temp)
			
			# Checking the image			
			image_data = SmugMug.load_image(image_path_temp)
			image_md5sum = hashlib.md5(image_data).hexdigest()
			image_size = str(len(image_data))
			# String literals cannot be raised; record the problem and retry,
			# failing for good only once the retries are exhausted.
			if image_md5sum != image_info['md5_sum']:
				error = "MD5 sum doesn't match."
			elif image_size != str(image_info['size']):
				error = "Image size doesn't match."
			else:
				os.rename(image_path_temp, image_path)
				break

			if count > 0:
				print "%s Retrying..." % error
			else:
				raise Exception("%s Too many retries." % error)
Example #6
    def request(self, uri, method="GET", body=None, headers=None,
                max_redirects=None, connection_type=None):
        """Start an HTTP request.

        @param uri: The uri to retrieve
        @param method: (optional) The HTTP method to use. Default is 'GET'
        @param body: (optional) The request body. Default is no body.
        @param headers: (optional) Additional headers to send. Defaults
               include C{connection: keep-alive}, C{user-agent} and
               C{content-type}.
        @param max_redirects: (optional) The maximum number of redirects to
               use for this request. The class instance's max_redirects is
               default
        @param connection_type: (optional) see L{httplib2.Http.request}

        @return: (response, content) tuple

        """
        if max_redirects is None:
            max_redirects = self.max_redirects
        if headers is None:
            headers = {}
        # Prepare headers
        headers.pop('cookie', None)
        req = DummyRequest(uri, headers)
        self.cookiejar.lock.acquire()
        try:
            self.cookiejar.add_cookie_header(req)
        finally:
            self.cookiejar.lock.release()
        headers = req.headers

        # Wikimedia squids: add connection: keep-alive to request headers
        # unless overridden
        headers['connection'] = headers.pop('connection', 'keep-alive')

        # determine connection pool key and fetch connection
        (scheme, authority, request_uri, defrag_uri) = httplib2.urlnorm(
                                                        httplib2.iri2uri(uri))
        conn_key = scheme+":"+authority

        connection = self.connection_pool.pop_connection(conn_key)
        if connection is not None:
            self.connections[conn_key] = connection

        # Redirect hack: we want to regulate redirects
        follow_redirects = self.follow_redirects
        self.follow_redirects = False
        pywikibot.debug(u"%r" % (
                            (uri.replace("%7C","|"), method, body,
                            headers, max_redirects,
                            connection_type),),
                        _logger)
        try:
            (response, content) = httplib2.Http.request(
                                    self, uri, method, body, headers,
                                    max_redirects, connection_type)
        except Exception, e: # what types?
            # return exception instance to be retrieved by the calling thread
            return e
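For orientation, a standalone sketch of how the connection pool key above is derived from an IRI; it assumes only httplib2 and its urlnorm/iri2uri helpers, and the Wikipedia URL is illustrative.

import httplib2

uri = u"https://de.wikipedia.org/wiki/K\u00f6ln?action=raw"
scheme, authority, request_uri, defrag_uri = httplib2.urlnorm(httplib2.iri2uri(uri))
conn_key = scheme + ":" + authority
# scheme      -> "https"
# authority   -> "de.wikipedia.org"
# request_uri -> "/wiki/K%C3%B6ln?action=raw"
# conn_key    -> "https:de.wikipedia.org", i.e. one pooled connection per scheme/host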
Example #7
    def __init__(self, url):
        if not url.startswith("http"):
            url = "http://" + url
        url = httplib2.iri2uri(url)

        # certain urls are self-explicable
        if re_wikiurl.match(url):
            try: assert urllib2.unquote(url.encode("ascii")).decode('utf8') != url
            except: raise MeaninglessTitle("wikipedia title is within the url")

        try: resp = opener_en.open(url.encode("utf-8"), timeout=5)
        except urllib2.URLError as e: raise CantGetContents(e)
        info = resp.info()
        if info.type not in ("text/html", "text/xhtml"):
            raise ThisIsNotHTML("this doesn't look like html")

        data = resp.read(262144)
        if info.get('Content-Encoding') == 'gzip':
            data = gzip.GzipFile(fileobj=StringIO(data)).read()

        encoding = info.getparam("charset") or getcharset(data)

        title = x_title(html.fromstring(data, parser=html.HTMLParser(encoding=encoding)))
        if not title:
            raise ThereIsNoTitle(u"there's no title in the first 4⁹ bytes")
        title = title[0].text
        if title is None:
            raise MeaninglessTitle(u"title is present but empty")
        title = clean(title)

        if title == "imgur: the simple image sharer": raise MeaninglessTitle("who needs the default imgur title?")
        if title == "Photos" and "core.org.ua" in url: raise MeaninglessTitle(u"рамок снова фотачками хвастается, да?")
        elif title.lower() in url.lower(): raise MeaninglessTitle("title text is contained within the url")
        self.shortargs = self.longargs = (title,)
Example #8
  def InitRequestHead(self):
    """Initializes curl object for a HEAD request.

    A HEAD request is initiated so that we can check from the headers if this is
    a valid HTML file. If it is not a valid HTML file, then we do not initiate a
    GET request, saving an unnecessary download.
    """
    self._curl_object = pycurl.Curl()
    # Handles sites with unicode URLs.
    if isinstance(self._url, unicode):
      self._url = str(iri2uri(self._url))
    self._curl_object.setopt(pycurl.URL, self._url)
    # The following line fixes the GnuTLS package error that pycurl depends
    # on for getting https pages.
    self._curl_object.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)
    self._curl_object.setopt(pycurl.HEADERFUNCTION, self._GetHeaders)
    self._curl_object.setopt(pycurl.FOLLOWLOCATION, True)
    self._curl_object.setopt(pycurl.NOBODY, True)
    self._curl_object.setopt(pycurl.SSL_VERIFYPEER, False);
    self._curl_object.setopt(pycurl.MAXREDIRS, MAX_REDIRECTIONS)
    self._curl_object.setopt(pycurl.FAILONERROR, False)
    self._curl_object.setopt(pycurl.COOKIEFILE, self._cookie_file)
    self._curl_object.setopt(pycurl.COOKIEJAR, self._cookie_file)
    self._curl_object.setopt(pycurl.CONNECTTIMEOUT, 30)
    self._curl_object.setopt(pycurl.TIMEOUT, 300)
    self._curl_object.setopt(pycurl.NOSIGNAL, 1)
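The same HEAD-before-GET idea as a standalone sketch, assuming only that pycurl is installed; the URL is illustrative and the SSLv3/GnuTLS workaround above is omitted because it targets an outdated TLS stack.

import pycurl
from io import BytesIO

header_buf = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, "https://example.com/")
curl.setopt(pycurl.NOBODY, True)                  # HEAD: fetch headers only, no body
curl.setopt(pycurl.HEADERFUNCTION, header_buf.write)
curl.setopt(pycurl.FOLLOWLOCATION, True)
curl.setopt(pycurl.CONNECTTIMEOUT, 30)
curl.perform()
status = curl.getinfo(pycurl.RESPONSE_CODE)
content_type = curl.getinfo(pycurl.CONTENT_TYPE)  # e.g. "text/html; charset=UTF-8"
curl.close()
# A full GET would only follow if status == 200 and content_type indicates HTML.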
Example #9
    def _process(self, item):
        url = item["url"]
        log.debug(u"Crawling: %s", url)
        uri = httplib2.iri2uri(url)
        report = {"url": url, "result": None, "status_code": None, "visited": None}

        total_start_time = time.time()

        (scheme, authority, _path, _query, _fragment) = httplib2.parse_uri(uri)
        if scheme is None or authority is None:
            report["result"] = u"Invalid URI"
            return report

        try:
            # this line is copied from robotsparser.py:can_fetch
            urllib.quote(urlparse.urlparse(urllib.unquote(url))[2])
        except KeyError:
            report["result"] = u"Malformed URL quoting."
            return report

        try:
            robot_check_result = self.ask_robots(uri, scheme, authority)
            # Graceful stop thing.
            if robot_check_result is None:
                raise Stop()
        except CrawlError, e:
            report["result"] = unicode(e)
            return report
Example #10
 def _load_result(self, result):
     data = None
     if result.rest_url is not None and len(result.rest_url) > 0:
         full_url = result.rest_url + '/legend?f=pjson'
         try:
             f = urllib.request.urlopen(httplib2.iri2uri(full_url), None,
                                        self.TIMEOUT)
             data = json.load(f)
         except Exception as e:
             log.error(full_url)
             log.exception(e)
     if data is not None:
         for l in data['layers']:
             if str(l['layerId']) in result.layers.split(','):
                 attribute = l['layerName']
                 self._insert_attribute(
                     result.layer + ' : ' + attribute,
                     ("ogc_server_id:%s URL:%s Layer" %
                      (result.ogc_server_id, result.rest_url),
                      result.layer),
                 )
                 for leg in l['legend']:
                     attribute = leg['label']
                     self._insert_attribute(
                         result.layer + ' / ' + l['layerName'] + ' : ' +
                         attribute,
                         ("ogc_server_id:%s Layer:%s Sublayer" %
                          (result.ogc_server_id, result.layer),
                          l['layerName']),
                     )
Example #11
def extractRealSupportedURI(uri):
    """
        Returns "real" URI if it survives redirects and returns a 200.

        Returns None otherwise.
    """

    realURI = None

    try:
        # this function follows the URI, resolving all redirects,
        # and detects redirect loops
        # iri2uri is needed for IRIs
        request = urllib.request.urlopen(httplib2.iri2uri(uri))
        
        if request.getcode() == 200:
            realURI = request.geturl()

    except urllib.error.HTTPError as e:
        # something went wrong, we don't care what
        realURI = None

    except urllib.error.URLError as e:
        # something went wrong, we don't care what
        realURI = None

    except UnicodeError as e:
        # something went very wrong with the IRI decoding
        realURI = None

    return realURI
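A hedged usage sketch of the helper above; the IRI is illustrative, and the function plus its urllib/httplib2 imports are assumed to be in scope.

canonical = extractRealSupportedURI(u"https://en.wikipedia.org/wiki/Z\u00fcrich")
if canonical is None:
    print("IRI did not resolve to a 200 response")
else:
    print("resolved to " + canonical)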
Example #12
def parse(word):
    all_news = []
    url = httplib2.iri2uri(
        'https://smi2.ru/api/search?limit=100&offset=0&order=date&query={}'.
        format(word))
    isError = True
    for q in range(5):
        try:
            jsonurl = urllib.request.urlopen(url)
        except urllib.error.HTTPError as error:
            print(error)
            print('Trying again')
        else:
            isError = False
            break
    if not (isError):
        obj = json.load(jsonurl)
        articles = obj['articles']
        for art in articles:
            if (int(art['create_date']) < time.time() - 86400):
                continue
            news = {'title': '', 'article': '', 'link': ''}
            news['title'] = art['title_original']
            news['article'] = art['announce_original']
            news['link'] = art['share_url']
            all_news.append(news)
        return all_news
    else:
        return []
Example #13
    def _get_links(self, url, page):
        soup = BeautifulSoup(page)
        links = []
        a = soup.findAll('a')
        for tag in a:
            link = tag.get('href', None)
            print '1', link
            link1 = urlparse.urljoin(url, link)
            print '2', link1
            link2 = httplib2.iri2uri(link)
            print '3', link2
            
            pattern = self._site_pattern[self.chosen_site]
            if pattern.match(link):
                print link
#            path = urlparse.urlparse(link)[2]
##            print path,
#            if pattern.match(path):
#                is_required_pattern = True
            
#            link = urlparse.urljoin(index_url, link)
#            link = httplib2.iri2uri(link)
#            if link and self._is_required_pattern(index_url, link):
#                links.append(link)
        
#        for tag in a:
#            link = tag.get('href', None)
#            link = urlparse.urljoin(index_url, link)
#            link = httplib2.iri2uri(link)
#            if link and self._is_required_pattern(index_url, link):
#                links.append(link)
#        links_set = set(links)
#        links = list(links_set)
                
        return links
Example #14
def search(text):
    url = u'https://www.youtube.com/results?search_query={}'.format(
        (u'+'.join(text)))
    url = iri2uri(url)
    print(url)
    yt = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(yt, 'html.parser')
    a = soup.find_all('div')
    links = []
    titles = []

    for i in a:

        if i.get("data-context-item-id"):

            video = i.h3.a
            link = video.get('href')
            title = video.get('title')
            if not link.startswith('/watch'):
                continue

            links.append("http://www.youtube.com" + link)
            titles.append(title)

    return links, titles
Example #15
def process_user(username, fullname):
    filename = 'github/{}.csv'.format(username)
    filename_tmp = '{}.tmp'.format(filename)
    with open(filename_tmp, 'a'):
        os.utime(filename_tmp, None)
    uri_param = httplib2.iri2uri(fullname.replace(' ', '+'))
    url = u'{}/search?q={}&type=Users'.format(GITHUB_URL, uri_param)
    text = read_page(url)
    soup = BeautifulSoup(text)
    user_info = soup.find(class_='user-list-info')
    if not user_info:
        os.rename(filename_tmp, filename)
        soup.decompose()
        return
    a = user_info.find('a')
    github_username = a['href'][1:]
    with open(filename_tmp, 'w') as f:
        f.write(github_username + '\n')
        f.close()
    print "link stackoverflow '{}' to github '{}'".format(
        username, github_username)
    soup.decompose()
    commits = process_days(github_username, filename_tmp)
    os.rename(filename_tmp, filename)
    if github_username in CACHE:
        del CACHE[github_username]
Example #16
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         isproduct = len(soup.find("a", {"class": "next i-next"})) > 0
         return (isproduct)
     except:
         return (False)
Example #17
 def get_links(self, url=None, opt=None):
     '''
     Return a dict mapping each tag name to the links found on a webpage.
     opt is the tag you want to filter on; for example, opt='a' will
     return only the links from 'a' tags.
     '''
     links = {}
     url = iri2uri(url)
     html_stream = urllib.urlopen(url)
     html_string = html_stream.read()
     html_stream.close()
     lxml_web_page = lxml.html.fromstring(html_string)
     # transform all the urls in absolute urls
     for elem, attr, link, pos in lxml_web_page.iterlinks():
         absolute = urlparse.urljoin(url, link.strip())
         if elem.tag in links:
             links[elem.tag].append(absolute)
         else:
             links[elem.tag] = [absolute]
     if opt is None:
         list_links = []
         for tag, tag_links in links.iteritems():
             for tag_link in tag_links:
                 list_links.append(tag_link)
         return list(set(list_links))
     else:
         links_opt = []
         try:
             links_opt = list(set(links[opt]))
         # No links with the tag 'opt'
         except KeyError:
             pass
         return links_opt
Example #18
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     self.logger.info(url)
     cstr = soup.find('div', {"class": "product-count"}).text.strip()
     end = int(cstr.split("sur")[1].replace(".", "").strip())
     start = int(cstr.split("sur")[0].split("-")[1].strip())
     return (start < end)
Example #19
  def _AddLink(self, link):
    """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted to the appropriate links list.
    """
    # Handles sites with unicode URLs.
    if isinstance(link, unicode):
      # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
      link = httplib2.iri2uri(link).encode('utf-8')
    link_parsed = urlparse.urlparse(link)
    link_lists = [self._clues_secure_links, self._secure_links,
                  self._clues_general_links, self._general_links]
    # Checks that the registration page is within the domain.
    if (self._domain in link_parsed[1] and
        all(link not in x for x in link_lists)):
      for clue in LINK_CLUES:
        if clue in link.lower():
          if link_parsed[0].startswith('https'):
            self._clues_secure_links.append(link)
            return
          else:
            self._clues_general_links.append(link)
            return
      if link_parsed[0].startswith('https'):  # No clues found in the link.
        self._secure_links.append(link)
      else:
        self._general_links.append(link)
Example #20
    def _follow_redirect(self, uri, method, body, headers, response, content, max_redirects):
        """ Internal function to follow a redirect recieved by L{request} """
        (scheme, authority, absolute_uri, defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
        if self.cache:
            cachekey = defrag_uri
        else:
            cachekey = None

        # Pick out the location header and basically start from the beginning
        # remembering first to strip the ETag header and decrement our 'depth'
        if not response.has_key('location') and response.status != 300:
            raise httplib2.RedirectMissingLocation("Redirected but the response is missing a Location: header.", response, content)
        # Fix-up relative redirects (which violate an RFC 2616 MUST)
        if response.has_key('location'):
            location = response['location']
            (scheme, authority, path, query, fragment) = httplib2.parse_uri(location)
            if authority == None:
                response['location'] = httplib2.urlparse.urljoin(uri, location)
                logging.debug('Relative redirect: changed [%s] to [%s]' % (location, response['location']))
        if response.status == 301 and method in ["GET", "HEAD"]:
            response['-x-permanent-redirect-url'] = response['location']
            if not response.has_key('content-location'):
                response['content-location'] = absolute_uri 
            httplib2._updateCache(headers, response, content, self.cache, cachekey)
        
        headers.pop('if-none-match', None)
        headers.pop('if-modified-since', None)
        
        if response.has_key('location'):
            location = response['location']
            redirect_method = ((response.status == 303) and (method not in ["GET", "HEAD"])) and "GET" or method
            return self.request(location, redirect_method, body=body, headers = headers, max_redirects = max_redirects - 1)
        else:
            raise httplib2.RedirectLimit("Redirected more times than redirection_limit allows.", response, content)
Example #21
    def _AddLink(self, link):
        """Adds url |link|, if not already present, to the appropriate list.

    The link only gets added to the single list that is appropriate for it:
    _secure_links, _general_links, _clues_secure_links or _clues_general_links.

    Args:
      link: the url that is inserted to the appropriate links list.
    """
        # Handles sites with unicode URLs.
        if isinstance(link, unicode):
            # Encode in 'utf-8' to avoid the UnicodeEncodeError exception.
            link = httplib2.iri2uri(link).encode('utf-8')
        link_parsed = urlparse.urlparse(link)
        link_lists = [
            self._clues_secure_links, self._secure_links,
            self._clues_general_links, self._general_links
        ]
        # Checks that the registration page is within the domain.
        if (self._domain in link_parsed[1]
                and all(link not in x for x in link_lists)):
            for clue in LINK_CLUES:
                if clue in link.lower():
                    if link_parsed[0].startswith('https'):
                        self._clues_secure_links.append(link)
                        return
                    else:
                        self._clues_general_links.append(link)
                        return
            if link_parsed[0].startswith(
                    'https'):  # No clues found in the link.
                self._secure_links.append(link)
            else:
                self._general_links.append(link)
Example #22
 def get_url(self):
     url = "https://www.google.com/search?num=20&q=-youtube.com+"
     url = url + self.title.replace(" ", "+")
     url = url + "+"
     url = url + self.artist.replace(" ", "+")
     url = url + "+lyrics"
     url = str(iri2uri(url))
     return url
Example #23
 def get_url(self):
     url = "https://www.google.com/search?num=20&q=-youtube.com+"
     url = url + self.title.replace(" ", "+")
     url = url + "+"
     url = url + self.artist.replace(" ", "+")
     url = url + "+lyrics"
     url = str(iri2uri(url))
     return url
Example #24
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         isnotdisabled = not (soup.find("span", {"class": "disabled"})
                              == None)
         return (isnotdisabled)
     except:
         return (False)
Example #25
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         isproduct = len(
             soup.find_all("div", {"class": "product-container fr"})) > 0
         return (isproduct)
     except:
         return (False)
Example #26
def CheckSubmissions(subreddit):
  """
  Given a PRAW subreddit, marks expired links and returns a list of the
  submissions that were marked. It also returns a list of submissions we were
  unable to process (either because we don't know how to find the price or
  because we were unable to get the price).
  """
  modified_submissions = []
  needs_review_submissions = []
  needs_review_cache = LoadCacheFromFile(NEEDS_REVIEW_CACHE_FILE)
  already_expired_cache = LoadCacheFromFile(ALREADY_EXPIRED_CACHE_FILE)

  for rank, submission in enumerate(subreddit.get_hot(limit=MAX_SUBMISSIONS)):
    submission.rank = rank  # Used when creating digests for the mods
    # Both urllib2.urlopen() and the file writer to save the cache have trouble
    # when a submission's URL contains Unicode characters. Consequently, we
    # encode any stray Unicode characters right away so we don't need to worry
    # about it later.
    submission.url = httplib2.iri2uri(submission.url)

    # Skip anything already marked as expired, unless it's test data.
    if (submission.link_flair_css_class == EXPIRED_CSS_CLASS or
        submission.url in already_expired_cache) and not TEST_DATA:
      continue

    price = GetPrice(submission.url)
    # The price might be the empty string if we're unable to get the real price.
    if not price:
      if IsKnownFree(submission.url):  # No human review needed!
        continue

      if submission.url not in needs_review_cache:
        needs_review_submissions.append(submission)  # Send it to the mods!
      # Regardless of whether we need to tell the mods, move this submission to
      # the front of the cache.
      needs_review_cache[submission.url] = True  # Dummy value
      continue

    # This next line is a little hard for non-Python people to read. It's
    # asking whether any nonzero digit is contained in the price.
    if not any(digit in price for digit in "123456789"):
      continue  # It's still free!

    # If we get here, this submission is no longer free. Make a comment
    # explaining this and set the flair to expired.
    if not DRY_RUN:
      submission.add_comment(EXPIRED_MESSAGE % (price, submission.permalink))
      subreddit.set_flair(submission, EXPIRED_FLAIR, EXPIRED_CSS_CLASS)
      # Add it to the cache, so that if we have made a mistake and this
      # submission is later un-expired, we don't re-expire it the next day.
      already_expired_cache[submission.url] = True  # Dummy value
    submission.list_price = price  # Store this to put in the digest later.
    modified_submissions.append(submission)
  if not DRY_RUN and not TEST_DATA:
    # Don't change the next run's cache if this is just a test
    StoreCacheToFile(already_expired_cache, ALREADY_EXPIRED_CACHE_FILE)
    StoreCacheToFile(needs_review_cache, NEEDS_REVIEW_CACHE_FILE)
  return modified_submissions, needs_review_submissions
Example #27
 def run(self):
     config=self.config
     url = config['urls']
     soup = self.get_soup(httplib2.iri2uri(url))
     #get segments
     try:
         catlist=self.get_catgorylinks(soup)
     except:
         catlist=[]
     if (len(catlist)>0):
         for cat in catlist:
             config['Category']=list(cat.keys())[0]
             url=cat[config['Category']]
             soup = self.get_soup(httplib2.iri2uri(url))
             try:
                 allseg= self.get_allseg(soup)
             except:
                 allseg= dict()
             if (len(allseg)>0):
                 for seg in list(allseg.keys()):
                     url = allseg[seg]
                     config['segment']=seg
                     soup = self.get_soup(httplib2.iri2uri(url))
                     try:
                         allsubseg= self.get_allseg(soup)
                     except:
                         allsubseg= dict()      
                     if (len(allsubseg)>0):
                         for subseg in list(allsubseg.keys()):
                             url = allsubseg[subseg]
                             config['Sub-segment']=subseg
                             self.get_proddata(url)
                     else:
                         config['Sub-segment']="None"
                         self.get_proddata(url)
             else:
                 config['segment']="None"
                 config['Sub-segment']="None"
                 self.get_proddata(url)
     else:
         config['Category']="None"
         config['segment']="None"
         config['Sub-segment']="None"
         self.get_proddata(url)   
     pass
Example #28
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         isnotdisabled = not (soup.find(
             "li", {"id": "pagination_next"})["class"].strip()
                              == "disabled")
         return (isnotdisabled)
     except:
         return (False)
Example #29
def main():

    parser = argparse.ArgumentParser(
        description=
        "Organise music folders in <artist> - <album> [<tags>...] looking for tags at LastFM."
    )
    parser.add_argument(
        "sourcedir",
        help="source dir where the music folders to modify are present")
    parser.add_argument(
        "targetdir",
        help="target dir where to place the converted music directories")
    args = parser.parse_args()

    subdirs = [x[0] for x in os.walk(args.sourcedir) if x[0] != "Cover"]

    ok_entries = []
    failed_files = []

    bad_tags = ["metal", "electronic", "electronica"]
    for folder in subdirs:
        files = [f for f in os.listdir(folder) if f.endswith("mp3")]
        isleaf = 1
        for f in os.listdir(folder):
            if os.path.isdir(f):
                isleaf = 0
        if isleaf == 1 and len(files) > 0:
            mp3file = files[0]
            artist = ""
            try:
                mp3info = EasyID3(folder + "/" + mp3file)
                artist = mp3info["artist"][0]
                album = mp3info["album"][0]

                # extract tags from lastfm
                tags = []
                url = "http://www.last.fm/es/music/" + artist.replace(" ", "+")
                page = html.fromstring(
                    urllib.urlopen(httplib2.iri2uri(url)).read())
                for element in page.xpath("//li[@class='tag']/a"):
                    tags.append(element.text)
                tags = filter((lambda x: x not in bad_tags), tags)
                if len(tags) == 0:
                    raise Exception("no tags")
                else:
                    print(tags)

                ok_entries.append([folder, artist, album, mp3file, tags])
                new_folder_name = args.targetdir + artist + " - " + album
                for tag in tags:
                    new_folder_name += " [" + tag + "]"
                call(["mv", folder, new_folder_name])

            except (Exception, mutagen._id3util.ID3NoHeaderError) as e:
                print(e)
                traceback.print_exc()
                failed_files.append([folder, mp3file])
Example #30
  def parseHouse(self, response):
    if self.close_down:
      raise CloseSpider(reason='Duplicate house')
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['currency'] = "€"
    item['title'] = hxs.select('//h1[contains(@class, "long_subject")]/text()').extract()
    Concelho = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Concelho")]').select('../text()').extract()[1].strip()
    Freguesia = ""
    try:
      Freguesia = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Freguesia")]').select('../text()').extract()[1].strip()
    except:
      log.msg("No Freguesia", level=log.INFO)

    item['address'] = Concelho + ' ' + Freguesia
    item['link'] = response.url
    item['size'] = int(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Tipologia")]').select('../text()').extract()[1].strip().replace("T","")[0])

    item['desc'] = hxs.select('//div[contains(@class, "body_text")]/text()').extract()

    locale.setlocale(locale.LC_ALL, 'pt_PT')
    item['publication'] = hxs.select('//p[contains(@class,"right")]/text()').extract()[1].strip()
    if re.match("Ontem|Hoje", item['publication']) is None:
      log.msg("date is older, analyse...", level=log.INFO)
      computed_date = datetime.datetime.strptime(item['publication'], "%d %b %H:%M")
      if datetime.date.today().month == 1 and computed_date.month == 12:
        year = (datetime.date.today() - datetime.timedelta(days=365)).year
      else:
        year = datetime.date.today().year

      computed_date = datetime.datetime.strptime(item['publication'] + ' %d' % year, "%d %b %H:%M %Y")
      item['publication'] = computed_date.strftime("%d-%m-%Y")
      one_month_ago = datetime.datetime.today() - datetime.timedelta(days=30)
      if computed_date < one_month_ago:
        log.msg("Too old...", level=log.INFO)
        raise CloseSpider('Houses are too old')

    image_from_script = hxs.select('//div[contains(@id, "slider")]/script/text()').extract()
    images_urls = image_from_script[0].split('[')[1].split(',')
    images_urls[-1] = images_urls[-1].split(']')[0]
    try:
      item['lat'] = float(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Ver mapa")]').select('../a/@onclick').extract()[0].split(',')[0].split('(')[1])
      item['lng'] = float(hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Ver mapa")]').select('../a/@onclick').extract()[0].split(',')[1])
    except:
      iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
      result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0]
      item['lat'] = result['geometry']['location']['lat']
      item['lng'] = result['geometry']['location']['lng']

    item['image_urls'] = []
    for image_url in images_urls:
      item['image_urls'].append(image_url.replace("'",""))
    try:
      item['price'] = hxs.select('//span[contains(@class, "coolprice")]/text()').extract()[0].strip()
      yield item
    except:
      log.msg("no prices, no houses", level=log.INFO)
Example #31
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         isnotlast = soup.find(
             "li", {
                 "class": "ais-pagination--item ais-pagination--item__last"
             }).find("a") != None
         return (isnotlast)
     except:
         return (False)
Example #32
def shorten(url):
    url = httplib2.iri2uri(url)
    if BITLY_USERNAME and BITLY_KEY:
        if len(url) > 40 or ':' in url:
            try:
                api = bitly.Api(login=BITLY_USERNAME, apikey=BITLY_KEY)
                url = api.shorten(url)
            except:
                pass
    return url
Example #33
 def request(self, method, uri, body=None, headers=None):
     if not self.handle:
         self.connect()
     handle = self.fcurl.curl_handle
     if headers is None:
         headers = {}
     if method == 'GET':
         handle.setopt(pycurl.HTTPGET, 1)
     elif method == 'HEAD':
         handle.setopt(pycurl.NOBODY, 1)
     elif method == 'POST':
         handle.setopt(pycurl.POST, 1)
         if body:
             headers['Content-Length'] = len(body)
             body_IO = StringIO(body)
             handle.setopt(pycurl.READFUNCTION, body_IO.read)
     elif method == 'PUT':
         handle.setopt(pycurl.UPLOAD, 1)
         if body:
             headers['Content-Length'] = len(body)
             body_IO = StringIO(body)
             handle.setopt(pycurl.READFUNCTION, body_IO.read)
     elif method == 'PATCH':
         handle.setopt(pycurl.UPLOAD, 1)
         handle.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
         if body:
             headers['Transfer-Encoding'] = ''
             headers['Content-Length'] = len(body)
             body_IO = StringIO(body)
             handle.setopt(pycurl.READFUNCTION, body_IO.read)
     elif body is not None:
         # Custom method and body provided, error.
         raise Exception("body not supported with custom method %s." % method)
     else:
         # Custom method and no body provided, pretend to do a GET.
         handle.setopt(pycurl.CUSTOMREQUEST, method)
     if self.port:
         netloc = '%s:%s' % (self.host, self.port)
     else:
         netloc = self.host
     url = urlparse.urlunparse((self.scheme, netloc, uri, '', '', ''))
     self.url = str(iri2uri(url))
     handle.setopt(pycurl.URL, self.url)
     if headers:
         handle.setopt(pycurl.HTTPHEADER, ['%s: %s' % (header, str(value)) for
                                             header, value in
                                             headers.iteritems()])
     handle.setopt(pycurl.SSL_VERIFYPEER, 0)
     handle.setopt(pycurl.NOSIGNAL, 1)
     if self.key_file:
         handle.setopt(pycurl.SSLKEY, self.key_file)
     if self.cert_file:
         handle.setopt(pycurl.SSLCERT, self.cert_file)
     if self.timeout:
         handle.setopt(pycurl.TIMEOUT, self.timeout)
Example #34
def curl_fetch_binary(url, data=None):
    url = iri2uri(url)
    req = Request(url,
                  data=data,
                  headers={
                      'User-Agent': userAgent,
                      "Accept-Language": "en"
                  })
    with urlopen(req) as f:
        data = f.read()
    return data
Example #35
 def process_prop_query_results(self, url_req, results):
     try:
         uri = httplib2.iri2uri(unicode(url_req))
         req_result = json.loads(urllib.urlopen(uri).read())
         # req_result = json.loads(self.__get_url(url_req).read())
         if 'query-continue' in req_result.keys():
             raise MwQueryError("continue not supported for prop query")
         r = req_result['query']['pages']
         for p in r:
             results[p] = r[p]
     except KeyError:
         print "Empty result for --> %s" % (url_req)
Example #36
 def _common_perform(self, url, headers,
                     accept_self_signed_SSL=False,
                     follow_location=True,
                     body_buffer=None, debug=False):
     """Perform activities common to all FriendlyCURL operations. Several
     parameters are passed through and processed identically for all of the
     \*_url functions, and all produce the same return type.
     
     :param url: The URL to access. If a unicode string, it will be treated\
     as an IRI and converted to a URI.
     :type url: str or unicode
     :param headers: Additional headers to add to the request.
     :type headers: dict
     :param accept_self_signed_SSL: Whether to accept self-signed SSL certs.
     :type accept_self_signed_SSL: bool
     :param follow_location: If True, FriendlyCURL will follow location\
     headers on HTTP redirects. If False, the redirect will be returned.
     :type follow_location: bool
     :param body_buffer: A buffer to write body content into.
     :type body_buffer: ``.write(str)``-able file-like object
     :param debug: Turn on debug logging for this request.
     :type debug: bool
     :returns: A tuple containing a dictionary of response headers, including\
     the HTTP status as an int in 'status' and a buffer containing the body\
     of the response."""
     self.curl_handle.setopt(
         pycurl.HTTPHEADER,
         ['%s: %s' % (header, str(value)) for header, value in headers.iteritems()])
     if isinstance(url, unicode):
         url = str(iri2uri(url))
     self.curl_handle.setopt(pycurl.URL, url)
     if body_buffer:
         body = body_buffer
     else:
         body = StringIO()
     self.curl_handle.setopt(pycurl.FORBID_REUSE, 1)
     self.curl_handle.setopt(pycurl.WRITEFUNCTION, body.write)
     header = StringIO()
     self.curl_handle.setopt(pycurl.HEADERFUNCTION, header.write)
     if accept_self_signed_SSL == True:
         self.curl_handle.setopt(pycurl.SSL_VERIFYPEER, 0)
     if follow_location == True:
         self.curl_handle.setopt(pycurl.FOLLOWLOCATION, 1)
     if debug:
         self.curl_handle.setopt(pycurl.VERBOSE, 1)
         self.curl_handle.setopt(pycurl.DEBUGFUNCTION, debugfunction)
     self.curl_handle.perform()
     body.seek(0)
     headers = [hdr.split(': ') for hdr in header.getvalue().strip().split('\r\n') if
                hdr and not hdr.startswith('HTTP/')]
     response = dict((header[0].lower(), header[1]) for header in headers)
     response['status'] = self.curl_handle.getinfo(pycurl.HTTP_CODE)
     return (response, body)
Example #37
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     try:
         currpage = soup.find("li", {"id": "pagination_next"})
         if (currpage == None):
             return False
         elif (currpage['class'][0] == "disabled"):
             return False
         else:
             return True
     except:
         return False
Example #38
 def process_prop_query_results(self, url_req, results):
     """Process the result of a prop query."""
     try:
         uri = httplib2.iri2uri(unicode(url_req))
         req_result = json.loads(urllib.urlopen(uri).read())
         # req_result = json.loads(self.__get_url(url_req).read())
         if 'query-continue' in req_result:
             raise MwQueryError("continue not supported for prop query")
         r = req_result['query']['pages']
         for p in r:
             results[p] = r[p]
     except KeyError:
         print "Empty result for --> %s" % (url_req)
Example #39
 def _get_links(self, index_url, page):
     soup = BeautifulSoup(page)
     links = []
     for tag in soup.findAll('a'):
         link = tag.get('href', None)
         link = urlparse.urljoin(index_url, link)
         link = httplib2.iri2uri(link)
         if link and self._is_required_pattern(index_url, link):
             links.append(link)
     links_set = set(links)
     links = list(links_set)
             
     return links
Example #40
    def _follow_redirect(self, uri, method, body, headers, response, content,
                         max_redirects):
        """Internal function to follow a redirect recieved by L{request}"""
        (scheme, authority, absolute_uri,
         defrag_uri) = httplib2.urlnorm(httplib2.iri2uri(uri))
        if self.cache:
            cachekey = defrag_uri
        else:
            cachekey = None

        # Pick out the location header and basically start from the beginning
        # remembering first to strip the ETag header and decrement our 'depth'
        if "location" not in response and response.status != 300:
            raise httplib2.RedirectMissingLocation(
                "Redirected but the response is missing a Location: header.",
                response, content)
        # Fix-up relative redirects (which violate an RFC 2616 MUST)
        if "location" in response:
            location = response['location']
            (scheme, authority, path, query,
             fragment) = httplib2.parse_uri(location)
            if authority is None:
                response['location'] = httplib2.urlparse.urljoin(uri, location)
                pywikibot.debug(
                    u"Relative redirect: changed [%s] to [%s]" %
                    (location, response['location']), _logger)
        if response.status == 301 and method in ["GET", "HEAD"]:
            response['-x-permanent-redirect-url'] = response['location']
            if "content-location" not in response:
                response['content-location'] = absolute_uri
            httplib2._updateCache(headers, response, content, self.cache,
                                  cachekey)

        headers.pop('if-none-match', None)
        headers.pop('if-modified-since', None)

        if "location" in response:
            location = response['location']
            redirect_method = (
                (response.status == 303) and
                (method not in ["GET", "HEAD"])) and "GET" or method
            return self.request(location,
                                redirect_method,
                                body=body,
                                headers=headers,
                                max_redirects=max_redirects - 1)
        else:
            raise httplib2.RedirectLimit(
                "Redirected more times than redirection_limit allows.",
                response, content)
Example #41
 def open(self, url,  encoding=None):
     if isinstance(url, (str, unicode)):
         if isinstance(url, unicode):
             url = url.encode('utf-8')
         log.info('Retrieving "{url}"'.format(**locals()))
         try:
             return self.opener.opener.open(url, encoding)
         except UnicodeEncodeError:
             uri = iri2uri(url)
             return self.opener.opener.open(uri, encoding)
     else:
         req = url
         log.info('Retrieving "{url}"'.format(url = req.get_full_url()))
         return self.opener.opener.open(req, encoding)
Example #42
 def run(self):
     config = self.config
     url = config['urls']
     soup = self.get_soup(url)
     #get segments
     catlist = self.get_catgorylinks(soup)
     if (len(catlist) > 0):
         for cat in catlist:
             config['Category'] = list(cat.keys())[0]
             url = cat[config['Category']]
             soup = self.get_soup(httplib2.iri2uri(url))
             allseg = self.get_allseg(soup)
             if (len(allseg) > 0):
                 for seg in allseg:
                     config['segment'] = list(seg.keys())[0]
                     url = seg[config['segment']]
                     soup = self.get_soup(httplib2.iri2uri(url))
                     allsubseg = self.get_allsubseg(soup)
                     if (len(allsubseg) > 0):
                         for subseg in allsubseg:
                             config['Sub-segment'] = list(subseg.keys())[0]
                             url = subseg[config['Sub-segment']]
                             self.get_proddata(url)
                     else:
                         config['Sub-segment'] = "None"
                         self.get_proddata(url)
             else:
                 config['segment'] = "None"
                 config['Sub-segment'] = "None"
                 self.get_proddata(url)
     else:
         config['Category'] = "None"
         config['segment'] = "None"
         config['Sub-segment'] = "None"
         self.get_proddata(url)
     pass
Example #43
 def downloadHtmlImages(self, html):
     parser = ImgParser()
     parser.feed(html)
     urls = parser.image_urls
     imagesPath = os.path.join(self.rootDir, 'OEBPS', 'images')
     if not os.path.exists(imagesPath):
         os.makedirs(imagesPath)
     for url in urls:
         if 'http://' in url.lower() or 'https://' in url.lower():
             print "downloading " + url
             src = os.path.join(self.rootDir, os.path.split(url)[1])
             dest = os.path.join("images", os.path.split(url)[1])
             urllib.urlretrieve(httplib2.iri2uri(url), src)
             self.addImage(src, dest)
     return urls
Example #44
def prepare_wiki_resp(name, article, url):
    name = unicode(name, 'utf-8')
    article = article.replace('_', ' ').strip()
    if not article:
        return ""
    if '#' in url:
        url_hash = url.split('#', 1)[-1]
        url_hash = httplib2.iri2uri(url_hash)
        url_hash = url_hash.replace(' ', '_').replace('%', '.')
        print url_hash
        url = url.split('#', 1)[0] + '#' + url_hash
    url = url.replace(' ', '%20')
    url = shorten(url)
    resp = name + ': ' + article + ' ' + url
    return '/me ' + resp
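The fragment rewrite above appears to mirror MediaWiki's legacy anchor encoding, where percent-escapes use '.' instead of '%'. A standalone sketch, assuming the fragment arrives as a unicode object (under Python 2, byte strings pass through iri2uri unchanged):

import httplib2

fragment = u"Geschichte K\u00f6lns"
anchor = httplib2.iri2uri(fragment).replace(' ', '_').replace('%', '.')
# anchor -> u"Geschichte_K.C3.B6lns"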
Example #45
 def downloadHtmlImages(self, html):
     parser = ImgParser()
     parser.feed(html)
     urls = parser.image_urls
     imagesPath = os.path.join(self.rootDir, 'OEBPS', 'images')
     if not os.path.exists(imagesPath):
         os.makedirs(imagesPath)
     for url in urls:
         if 'http://' in url.lower() or 'https://' in url.lower():
             print "downloading " + url
             src = os.path.join(self.rootDir, os.path.split(url)[1])
             dest = os.path.join("images", os.path.split(url)[1])
             urllib.urlretrieve(httplib2.iri2uri(url), src)
             self.addImage(src, dest)
     return urls
Example #46
 def send_to_api(self, request, debug=False):
     # add action to url
     url_req = "%s?action=%s" % (self.url, request.action)
     # add each property
     for k in request.prop.keys():
         #url_req += "&%s=%s" % (k, urlEncodeNonAscii(request.prop[k]))
         url_req += "&%s=%s" % (k, self.__encode_param(request.prop[k]))
     # add the format
     url_req += "&format=%s" % (request.format)
     # print url_req
     if not debug:
         uri = httplib2.iri2uri(unicode(url_req))
         return urllib.urlopen(uri).read()
     else:
         return url_req
Example #47
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     currpageid = url.split("=")[len(url.split("=")) - 1]
     try:
         cstr = soup.find('div', {"class": "pager"}).find("a")['href']
         newpageid = cstr.split("=")[len(cstr.split("=")) - 1]
         return (currpageid < newpageid)
     except:
         try:
             cstr = soup.find('div', {
                 "class": "pager loaded"
             }).find("a")['href']
             newpageid = cstr.split("=")[len(cstr.split("=")) - 1]
             return (currpageid < newpageid)
         except:
             return (False)
Example #48
 def is_product(self, url):
     soup = self.get_soup(httplib2.iri2uri(url))
     self.logger.info(url)
     try:
         cpg = int(url.split("=")[-1:][0])
         numli = len(
             soup.find('ul', {
                 "class": "pages"
             }).find_all("li", recursive=False))
         npg = int(
             soup.find('ul', {
                 "class": "pages"
             }).find_all("li", recursive=False)[numli -
                                                2].find("a")["data-value"])
         return (cpg < npg)
     except:
         return False
Example #49
  def parseHouse(self, response):
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['title'] =hxs.select('//h1[contains(@class, "long_subject")]/text()').extract()
    Concelho = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Concelho")]').select('../text()').extract()[1].strip()
    Freguesia = ""
    try:
      Freguesia = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Freguesia")]').select('../text()').extract()[1].strip()
    except:
      print "No Freguesia"

    item['address'] = Concelho + ' ' + Freguesia
    item['link'] = response.url
    item['size'] = hxs.select('//div[contains(@class, "info right")]/ul/li/*[contains(text(), "Tipologia")]').select('../text()').extract()[1].strip()

    item['desc'] = hxs.select('//div[contains(@class, "body_text")]/text()').extract()

    item['price'] = hxs.select('//span[contains(@class, "coolprice")]/text()').extract()[0].strip()
    item['publication'] = hxs.select('//p[contains(@class,"right")]/text()').extract()[1].strip()

    image_from_script = hxs.select('//div[contains(@id, "slider")]/script/text()').extract()
    images_urls = image_from_script[0].split('[')[1].split(',')

    try:
      if "googleapis" in images_urls[-3]:
        item['lng'] = images_urls[-1].split("'")[0]
        item['lat'] = images_urls[-2].split('C')[-1]
        images_urls.remove(images_urls[-1])
        images_urls.remove(images_urls[-1])
        images_urls.remove(images_urls[-1])
      else:
        raise RunExceptCode
    except:
      iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
      result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0]
      item['lat'] = result['geometry']['location']['lat']
      item['lng'] = result['geometry']['location']['lng']


    item['image_urls'] = []
    for image_url in images_urls:
     item['image_urls'].append(image_url.replace("'",""))

    yield item
Example #50
    def __init__(self, *args, **kwargs):
        super(LaconicModel, self).__init__(*args, **kwargs)

        graph = Graph()
        graph.bind('dbpedia', 'http://dbpedia.org/resource/')
        graph.bind('rdfs', 'http://www.w3.org/2000/01/rdf-schema#')
        graph.bind('schema', 'http://schema.org/')

        home = hyperspace.jump('http://dyli-thingy.herokuapp.com/',
                               client=http_client)
        thing = home.queries['lookup'][0].build({'iri': self.iri}).submit()
        graph = graph + thing.data

        if len(graph) == 0:
            raise LaconicModel.DoesNotExist('No data found for: ' + self.iri)

        self._graph = graph
        factory = laconia.ThingFactory(graph)
        self._entity = factory(iri2uri(self.iri))
Example #51
def send_http_request(method, request_url, body=None, request_headers={}):

    uri = httplib2.iri2uri(request_url)
    (scheme, authority, request_uri) = httplib2.urlnorm(uri)[:3]
    address = _get_hostport(authority)
    http_client = httplib2.HTTPConnectionWithTimeout(address[0], port=address[1])
    if http_client.sock is None:
        http_client.connect()

    http_client.putrequest(method,
                           request_uri.encode(DEFAULT_HTTP_URI_CHARSET),
                           {'skip_host': 1, 'skip_accept_encoding': 1})

    for key, value in request_headers.items():
        http_client.putheader(key, value.encode(DEFAULT_HTTP_HEADER_CHARSET))
    http_client.endheaders()
    if body:
        http_client.send(body)
    return http_client.getresponse()
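A hedged usage sketch of send_http_request above; the URL and headers are illustrative, and the module-level helpers and charset constants it references are assumed to be defined as in the original source. Because the request is built with skip_host=1, the Host header is supplied explicitly.

resp = send_http_request(
    "GET",
    u"http://example.com/caf\u00e9",
    request_headers={"Host": "example.com", "Accept": "text/html"},
)
print(resp.status, resp.reason)
body = resp.read()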
Example #52
    def get_html(self):
        lang = self.request.params.get("lang")
        name = self.request.params.get("name")
        if lang == 'lb':
            lang = 'lu'
        url = \
            "https://wiki.geoportail.lu/doku.php?" \
            "id=%s:legend:%s&do=export_html" % \
            (lang, name)

        f = urllib2.urlopen(httplib2.iri2uri(url), None, 15)
        data = f.read()
        data = data.replace(
            "/lib/exe/fetch.php",
            "https://wiki.geoportail.lu/lib/exe/fetch.php")
        data = data.replace(
            "src=\"img/", "src=\"https://wiki.geoportail.lu/img/")
        data = data.replace(
            "/lib/exe/detail.php",
            "https://wiki.geoportail.lu/lib/exe/detail.php")

        soup = BeautifulSoup(data, "lxml")
        a_tags = soup.find_all("a")
        for a_tag in a_tags:
            if a_tag.get('class') is not None and\
               'media' in a_tag.get('class'):
                a_tag['target'] = '_blank'
        img_tags = soup.find_all("img")
        for img_tag in img_tags:
            if img_tag.get('style') is None:
                img_tag['style'] = 'max-width:290px;'

        res = soup.find("div", {"class": "dokuwiki export"})

        if res is not None:
            data = res.encode_contents()
        else:
            data = ""

        headers = {"Content-Type": f.info()['Content-Type']}

        return Response(data, headers=headers)
Example #53
    def send_to_api(self, request, debug=False):
        """Send a request to mediawiki API.

        Args:
            request (MwApi): Request to send.
            debug (bool): if true, then just only return the string of the
                API request, otherwise return the result.
        """
        # add action to url
        url_req = "%s?action=%s" % (self.url, request.action)
        # add each property
        for k in request.prop:
            url_req += "&%s=%s" % (k, self.__encode_param(request.prop[k]))
        # add the format
        url_req += "&format=%s" % (request.format)
        # print url_req
        if not debug:
            uri = httplib2.iri2uri(unicode(url_req))
            return urllib.urlopen(uri).read()
        else:
            return url_req
Example #54
  def parseHouse(self, response):
    if self.close_down:
      raise CloseSpider(reason='Duplicate house')
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['currency'] = "€"
    item['title'] = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract()
    item['address'] = hxs.select('//div[contains(@class, "detaiHeaderLocation")]/text()')[0].extract()
    item['link'] = response.url
    item['desc'] = hxs.select('//div[contains(@class, "detailDescription")]/h2/text()').extract()
    item['price'] = hxs.select('//div[contains(@class, "detailHeaderPriceValue")]/text()')[0].extract().strip()
    item['state'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Estado")]').select('../span/text()').extract()    
    size_description = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract().split(",")[0].strip()
    if size_description == "Quarto":
      item['size'] = 0
    elif size_description == "Apartamento":
      item['size'] = 1
    else:
      item['size'] = int(size_description.split()[-1].replace("T","").strip('+')[0])

    item['publication'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Publicado")]').select('../span/text()').extract()[0]

    computed_date = datetime.datetime.strptime(item['publication'], "%d-%m-%Y")
    one_month_ago = datetime.datetime.today() - datetime.timedelta(days=30)
    if computed_date < one_month_ago:
      log.msg("Too old...", level=log.INFO)
      raise CloseSpider('Houses are too old')

    image_urls = hxs.select('//a[contains(@id, "SmallFotos")]/@onclick').extract()

    item['image_urls'] = []
    for image_url in image_urls:
      item['image_urls'].append(re.findall(r'\'(.+?)\'',image_url)[0])

    iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
    result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0]
    item['lat'] = result['geometry']['location']['lat']
    item['lng'] = result['geometry']['location']['lng']
    
    yield item
Example #55
    def get_html(self):
        lang = self.request.params.get("lang")
        name = self.request.params.get("name")
        if lang == 'lb':
            lang = 'lu'
        url = \
            "https://wiki.geoportail.lu/doku.php?" \
            "id=%s:legend:%s&do=export_html" % \
            (lang, name)

        f = urllib2.urlopen(httplib2.iri2uri(url), None, 15)
        data = f.read()
        data = data.replace("/lib/exe/fetch.php",
                            "https://wiki.geoportail.lu/lib/exe/fetch.php")
        data = data.replace("src=\"img/",
                            "src=\"https://wiki.geoportail.lu/img/")
        data = data.replace("/lib/exe/detail.php",
                            "https://wiki.geoportail.lu/lib/exe/detail.php")

        soup = BeautifulSoup(data, "lxml")
        a_tags = soup.find_all("a")
        for a_tag in a_tags:
            if a_tag.get('class') is not None and\
               'media' in a_tag.get('class'):
                a_tag['target'] = '_blank'
        img_tags = soup.find_all("img")
        for img_tag in img_tags:
            if img_tag.get('style') is None:
                img_tag['style'] = 'max-width:290px;'

        res = soup.find("div", {"class": "dokuwiki export"})

        if res is not None:
            data = res.encode_contents()
        else:
            data = ""

        headers = {"Content-Type": f.info()['Content-Type']}

        return Response(data, headers=headers)
Example #56
  def parseHouse(self, response):
    hxs = HtmlXPathSelector(response)
    item = HouseItem()
    item['title'] = hxs.select('//div[contains(@class, "detaiHeaderProperty")]/text()')[0].extract()
    item['address'] = hxs.select('//div[contains(@class, "detaiHeaderLocation")]/text()')[0].extract()
    item['link'] = response.url
    item['desc'] = hxs.select('//div[contains(@class, "detailDescription")]/h2/text()').extract()
    item['price'] = hxs.select('//div[contains(@class, "detailHeaderPriceValue")]/text()')[0].extract().strip()
    item['state'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Estado")]').select('../span/text()').extract()
    item['publication'] = hxs.select('//div[contains(@class, "detailInfo")]/p/*[contains(text(),"Publicado")]').select('../span/text()').extract()
    item['size'] = item['title'].split(",")[0].split()[-1]

    image_urls = hxs.select('//a[contains(@id, "SmallFotos")]/@onclick').extract()

    item['image_urls'] = []
    for image_url in image_urls:
      item['image_urls'].append(re.findall(r'\'(.+?)\'',image_url)[0])

    iri = "http://maps.googleapis.com/maps/api/geocode/json?address=" + item['address'] + "&sensor=true"
    result = json.load(urllib2.urlopen(httplib2.iri2uri(iri).replace(" ","%20")))['results'][0]
    item['lat'] = result['geometry']['location']['lat']
    item['lng'] = result['geometry']['location']['lng']
    
    yield item