Example #1
 def _set_query_parameter(self, url, param_name, param_value):
     scheme, netloc, path, query_string, fragment = urlparse.urlsplit(url)
     query_params = urlparse.parse_qs(query_string)
     query_params[param_name] = [param_value]
     new_query_string = urllib.urlencode(query_params, doseq=True)
     return urlparse.urlunsplit(
         (scheme, netloc, path, new_query_string, fragment))
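For comparison, here is a minimal standalone sketch of the same query-rewriting flow against Python 3's urllib.parse (the example above targets the Python 2 urlparse/urllib modules); the function name and sample URL are illustrative, not part of the original project:

from urllib.parse import urlsplit, urlunsplit, parse_qs, urlencode

def set_query_parameter(url, param_name, param_value):
    # Split the URL, replace (or add) a single query parameter, and reassemble it.
    scheme, netloc, path, query_string, fragment = urlsplit(url)
    query_params = parse_qs(query_string)
    query_params[param_name] = [param_value]
    new_query_string = urlencode(query_params, doseq=True)
    return urlunsplit((scheme, netloc, path, new_query_string, fragment))

print(set_query_parameter("https://example.com/search?q=python&page=1", "page", "2"))
# https://example.com/search?q=python&page=2  (on Python 3.7+, which preserves dict order)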
Example #2
def _build_api_url(url, query):
    scheme, netloc, path, base_query, fragment = urlparse.urlsplit(url)

    if base_query:
        query = '%s&%s' % (base_query, query)

    return urlparse.urlunsplit((scheme, netloc, path, query, fragment))
Example #3
def url_fix(s, charset='utf-8'):
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')
    qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
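A rough Python 3 rendering of url_fix with a usage sketch; the `url_fix_py3` name and the sample URL are illustrative assumptions, since the originals above rely on the Python 2 urlparse/urllib modules and the `unicode` type:

from urllib.parse import urlsplit, urlunsplit, quote, quote_plus

def url_fix_py3(s, charset='utf-8'):
    # Percent-encode the path and query while leaving the URL delimiters intact.
    scheme, netloc, path, qs, anchor = urlsplit(s)
    path = quote(path, '/%')
    qs = quote_plus(qs, ':&=')
    return urlunsplit((scheme, netloc, path, qs, anchor))

print(url_fix_py3('http://example.com/some path?q=a b'))
# http://example.com/some%20path?q=a+b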
Example #4
 def get_user_details(self, response):
     """Generate username from identity url"""
     values = super(LiveJournalBackend, self).get_user_details(response)
     values['username'] = values.get('username') or \
                          urlparse.urlsplit(response.identity_url)\
                                .netloc.split('.', 1)[0]
     return values
Example #5
def url_fix(s, charset='utf-8'):
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')
    qs = urllib.quote_plus(qs, ':&=')
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
Example #6
def parse_uri(uri):
    """Parse absolute URI then return host, port and resource."""

    parsed = urlparse.urlsplit(uri)
    if parsed.scheme != 'wss' and parsed.scheme != 'ws':
        # |uri| must be a relative URI.
        # TODO(toyoshim): Should validate |uri|.
        return None, None, uri

    if parsed.hostname is None:
        return None, None, None

    port = None
    try:
        port = parsed.port
    except ValueError:
        # The port property raises ValueError on an invalid or empty port,
        # e.g. 'ws://host:/path'.
        return None, None, None

    if port is None:
        if parsed.scheme == 'ws':
            port = 80
        else:
            port = 443

    path = parsed.path
    if not path:
        path += '/'
    if parsed.query:
        path += '?' + parsed.query
    if parsed.fragment:
        path += '#' + parsed.fragment

    return parsed.hostname, port, path
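A few illustrative calls (assuming the module imports `urlparse`, e.g. `import urllib.parse as urlparse` on Python 3, so parse_uri runs as written); the URLs are made up:

print(parse_uri('ws://example.com/chat'))            # ('example.com', 80, '/chat')
print(parse_uri('wss://example.com:9443/chat?x=1'))  # ('example.com', 9443, '/chat?x=1')
print(parse_uri('/relative/path'))                   # (None, None, '/relative/path')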
Example #7
def YATSServer():
    if hasattr(settings, 'SSO_SERVER'):
        parts = list(urlparse.urlsplit(settings.SSO_SERVER))
        parts[2] = ''
        return urlparse.urlunsplit(parts)
    else:
        return ''
Example #8
def url_fix(s, charset='utf-8'):
    # UTF-8 is one of the most commonly used encodings. UTF stands for
    # "Unicode Transformation Format", and the '8' means that 8-bit code units
    # are used in the encoding.
    if isinstance(s, unicode):
        s = s.encode(charset, 'ignore')
    scheme, netloc, path, qs, anchor = urlparse.urlsplit(s)
    path = urllib.quote(path, '/%')  # keep '/' and '%' as path delimiters
    qs = urllib.quote_plus(qs, ':&=')  # keep the query's key=value&... delimiters
    return urlparse.urlunsplit((scheme, netloc, path, qs, anchor))
Example #9
def Shortining_Service(url):
    parts = urlparse.urlsplit(url)
    if not parts.scheme and not parts.hostname:
        # couldn't parse anything sensible, try again with a scheme.
        parts = urlparse.urlsplit("http://" + url)

    if parts.hostname in services and parts.path:
        return 1
    else:
        return -1
Example #10
def referrer_path(meta, default=None):
    referrer = meta.get("HTTP_REFERER")
    if not referrer:
        return default
    parsed = urlparse.urlsplit(referrer)
    next_domain = drop_subdomain(parsed.netloc)
    cur_domain = drop_subdomain(meta.get("HTTP_HOST", ""))
    if next_domain != cur_domain:
        return default
    return urlparse.urlunsplit(('', '') + parsed[2:])
Example #11
def redirect(request, prefix, tiny, converter=default_converter):
    """
    Redirect to a given object from a short URL.
    """
    # Resolve the prefix and encoded ID into a model object and decoded ID.
    # Many things here could go wrong -- bad prefix, bad value in 
    # SHORTEN_MODELS, no such model, bad encoding -- so just return a 404 if
    # any of that stuff goes wrong.
    try:
        app_label, model_name = settings.SHORTEN_MODELS[prefix].split('.')
    except KeyError:
        raise Http404('Bad prefix.')
    try:
        model = models.get_model(app_label, model_name)
    except LookupError:
        model = False
    if not model:
        raise Http404('Bad model specified in SHORTEN_MODELS.')
    try:
        id = converter.to_decimal(tiny)
    except ValueError:
        raise Http404('Bad encoded ID.')
    
    # Try to look up the object. If it's not a valid object, or if it doesn't
    # have an absolute url, bail again.
    obj = get_object_or_404(model, pk=id)
    try:
        url = obj.get_absolute_url()
    except AttributeError:
        raise Http404("'%s' models don't have a get_absolute_url() method." % model.__name__)
    
    # We might have to translate the URL -- the badly-named get_absolute_url
    # actually returns a domain-relative URL -- into a fully qualified one.
    
    # If we got a fully-qualified URL, sweet.
    if urlparse.urlsplit(url)[0]:
        return HttpResponsePermanentRedirect(url)
    
    # Otherwise, we need to make a full URL by prepending a base URL.
    # First, look for an explicit setting.
    if hasattr(settings, 'SHORTEN_FULL_BASE_URL') and settings.SHORTEN_FULL_BASE_URL:
        base = settings.SHORTEN_FULL_BASE_URL
        
    # Next, if the sites app is enabled, redirect to the current site.
    elif Site._meta.installed:
        base = 'http://%s/' % Site.objects.get_current().domain
        
    # Finally, fall back on the current request.
    else:
        base = 'http://%s/' % RequestSite(request).domain
        
    return HttpResponsePermanentRedirect(urlparse.urljoin(base, url))
Example #12
def get_host_credentials(config, hostname):
    """Get login information for a host `hostip` (ipv4) from marvin's `config`

    @return the tuple username, password for the host else raise keyerror"""
    for zone in config.get('zones', []):
        for pod in zone.get('pods', []):
            for cluster in pod.get('clusters', []):
                for host in cluster.get('hosts', []):
                    url = host.get('url')
                    if str(url).startswith('http'):
                        hostname_marvin = urlparse.urlsplit(str(url)).netloc
                    else:
                        hostname_marvin = str(url)
                    if hostname == hostname_marvin:
                        return host.get('username'), host.get('password')
    raise KeyError("Please provide the marvin configuration file with credentials to your hosts")
Example #13
def goto(self, href, method='get', **args):
    """
    Monkeypatch the TestResponse.goto method so that it doesn't wipe out the
    scheme and host.
    """
    scheme, host, path, query, fragment = urlparse.urlsplit(href)
    # Strip the fragment; it should not be sent to the server.
    fragment = ''
    href = urlparse.urlunsplit((scheme, host, path, query, fragment))
    href = urlparse.urljoin(self.request.url, href)
    method = method.lower()
    assert method in ('get', 'post'), (
        'Only "get" or "post" are allowed for method (you gave %r)' % method)
    if method == 'get':
        method = self.test_app.get
    else:
        method = self.test_app.post
    return method(href, **args)
Example #14
    def remove_sensitive(cleartext):
        redactedtext = cleartext
        text_index = 0
        while True:
            match = UriCleaner.SENSITIVE_URI_PATTERN.search(
                redactedtext, text_index)
            if not match:
                break
            o = urlparse.urlsplit(match.group(1))
            if not o.username and not o.password:
                if o.netloc and ":" in o.netloc:
                    # Handle the special case of a url like http://username:password,
                    # where urlsplit leaves the credentials in netloc
                    (username, password) = o.netloc.split(':')
                else:
                    text_index += len(match.group(1))
                    continue
            else:
                username = o.username
                password = o.password

            # Given a python MatchObject, with respect to redactedtext, find and
            # replace the first occurrence of username and the first and second
            # occurrence of password

            uri_str = redactedtext[match.start():match.end()]
            if username:
                uri_str = uri_str.replace(username, UriCleaner.REPLACE_STR, 1)
            # 2, just in case the password is $encrypted$
            if password:
                uri_str = uri_str.replace(password, UriCleaner.REPLACE_STR, 2)

            t = redactedtext[:match.start()] + uri_str
            text_index = len(t)
            if (match.end() < len(redactedtext)):
                t += redactedtext[match.end():]

            redactedtext = t
            if text_index >= len(redactedtext):
                text_index = len(redactedtext) - 1

        return redactedtext
Example #15
 def run_validators(self, value):
     if self.allow_plain_hostname:
         try:
             url_parts = urlparse.urlsplit(value)
             if url_parts.hostname and '.' not in url_parts.hostname:
                 netloc = '{}.local'.format(url_parts.hostname)
                 if url_parts.port:
                     netloc = '{}:{}'.format(netloc, url_parts.port)
                 if url_parts.username:
                     if url_parts.password:
                         netloc = '{}:{}@{}'.format(url_parts.username,
                                                    url_parts.password, netloc)
                     else:
                         netloc = '{}@{}'.format(url_parts.username, netloc)
                 value = urlparse.urlunsplit([
                     url_parts.scheme, netloc, url_parts.path,
                     url_parts.query, url_parts.fragment
                 ])
         except Exception:
             raise  # If something fails here, just fall through and let the validators check it.
     super(URLField, self).run_validators(value)
Example #16
    async def update_csvs(self):
        """ Download/update CSVs if so much time has passed since last update """

        self.timestamp = self.settings['timestamp']
        self.maxDataAgeMinutes = self.settings['maxDataAgeMinutes']

        age_delta = epoch_now() - self.timestamp

        if (age_delta < self.maxDataAgeMinutes):
            return True

        print('Updating MCP data file extracts...')

        for url in self.data_files:
            fileName = os.path.basename(urlparse.urlsplit(url)[2])
            tempName = fileName + ".tmp"
            await download(url, tempName)
            self.remove_file(fileName)
            os.rename(tempName, fileName)

        return True
Example #17
def fixurl(url):
    if not isinstance(url, unicode):
        url = url.decode('utf8')
    parsed = urlparse.urlsplit(url)
    userpass, at, hostport = parsed.netloc.rpartition('@')
    user, colon1, pass_ = userpass.partition(':')
    host, colon2, port = hostport.partition(':')
    scheme = parsed.scheme.encode('utf8')
    user = quote(user.encode('utf8'))
    colon1 = colon1.encode('utf8')
    pass_ = quote(pass_.encode('utf8'))
    at = at.encode('utf8')
    host = host.encode('idna')
    colon2 = colon2.encode('utf8')
    port = port.encode('utf8')
    path = '/'.join(  # could be encoded slashes!
        quote(unquote(pce).encode('utf8'), '')
        for pce in parsed.path.split('/'))
    query = quote(unquote(parsed.query).encode('utf8'), '=&?/')
    fragment = quote(unquote(parsed.fragment).encode('utf8'))
    netloc = ''.join((user, colon1, pass_, at, host, colon2, port))
    return urlunsplit((scheme, netloc, path, query, fragment))
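A usage sketch for fixurl (Python 2 only, since it depends on the `unicode` type and the byte-oriented quote/unquote/urlunsplit imports the snippet assumes); the sample URL is illustrative: the IDN host is punycoded and spaces in the path and query are percent-encoded:

print(fixurl(u'http://b\xfccher.example/some path?q=a b'))
# http://xn--bcher-kva.example/some%20path?q=a%20b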
Example #18
    def __init__(self, proxy=None, timeout=60, cert=""):
        self.timeout = timeout
        self.cert = cert
        self.connection = None
        self.host = None
        self.port = None
        self.tls = None

        if isinstance(proxy, str):
            proxy_sp = urlparse.urlsplit(proxy)

            self.proxy = {
                "type": proxy_sp.scheme,
                "host": proxy_sp.hostname,
                "port": proxy_sp.port,
                "user": proxy_sp.username,
                "pass": proxy_sp.password
            }
        elif isinstance(proxy, dict):
            self.proxy = proxy
        else:
            self.proxy = None
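For context, a small sketch of what urlsplit's convenience attributes return for a proxy URL before they are packed into the dict above; the URL and credentials are made up, and the Python 3 import spelling is used so the snippet runs on its own:

import urllib.parse as urlparse

proxy_sp = urlparse.urlsplit("socks5://user:secret@127.0.0.1:1080")
print(proxy_sp.scheme, proxy_sp.hostname, proxy_sp.port,
      proxy_sp.username, proxy_sp.password)
# socks5 127.0.0.1 1080 user secret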
Example #19
def thumb(url, **kwargs):
    """
    Inspired by:
        http://tech.yipit.com/2013/01/03/how-yipit-scales-thumbnailing-with-thumbor-and-cloudfront/

    returns a thumbor url for 'url' with **kwargs as thumbor options.

    Positional arguments:
    url -- the location of the original image

    Keyword arguments:
    For the complete list of thumbor options
    https://github.com/globocom/thumbor/wiki/Usage
    and the actual implementation for the url generation
    https://github.com/heynemann/libthumbor/blob/master/libthumbor/url.py

    """
    THUMBOR_BASE_URL = getattr(settings, "THUMBOR_BASE_URL", None)
    THUMBOR_KEY = getattr(settings, "THUMBOR_KEY", "MY_SECURE_KEY")

    if THUMBOR_BASE_URL:
        base = THUMBOR_BASE_URL
    else:
        # otherwise assume that thumbor is set up behind the same
        # CDN, under the `thumbor` path prefix.
        scheme, netloc = urlparse.urlsplit(url)[:2]
        base = f"{scheme}://{netloc}/thumbor"
    crypto = CryptoURL(key=THUMBOR_KEY)

    # just for code clarity
    thumbor_kwargs = kwargs
    if "fit_in" not in thumbor_kwargs:
        thumbor_kwargs["fit_in"] = True

    thumbor_kwargs["image_url"] = url
    path = crypto.generate(**thumbor_kwargs)

    return f"{base}{path}"
Example #20
 def postprocess_document(self, uri, document):
     if uri:
         match_head_begin = self.re_head_begin.search(document)
         if match_head_begin:
             match_base_ref = self.re_base_href.search(
                 document, match_head_begin.end(0))
             if not match_base_ref:
                 insert = match_head_begin.end(0)
                 pre = document[:insert]
                 info = urlparse.urlsplit(uri)
                 href = info.scheme + "://" + info.netloc
                 if href[-1] != "\t":
                     href += "/"
                 base = "<base href=\"%s\" />" % href
                 post = document[insert:]
                 document = "".join((pre, base.encode("utf8"), post))
     match_head_end = self.re_head_end.search(document)
     if match_head_end:
         insert = match_head_end.start(0)
         pre = document[:insert]
         post = document[insert:]
         document = ''.join((pre, self.script, post))
     return document
Example #21
 def process_item(self, item, spider):
     item['_id'] = urlparse.urlsplit(item['url']).path
     self.db[self.collection_name].update({'_id': item['_id']},
                                          {'$set': item},
                                          upsert=True)
     return item
Example #22
def _getSeoLibraryFiltered(startUrl, language, country, useProxy=False):
    
    import tldextract
    from data_mining.seo_document_downloader import SeoDocumentDownloader
    from config import settings

    '''
    Build the query and download the documents
    '''
    
    extracted = tldextract.extract(startUrl)
    domain = u'%s.%s' % (extracted.subdomain, extracted.domain) if extracted.subdomain else extracted.domain
    domain = u'%s.%s' % (domain, extracted.suffix)

    # check whether there is a path after the domain
    import urllib.parse as urlparse
    urlpath = [u for u in urlparse.urlsplit(startUrl).path.split('/') if u and u != domain]
    path=[]
    if urlpath:
        for index, subpath in enumerate(urlpath):
            if subpath:
                if index<len(urlpath)-1 or not u'.' in subpath:
                    path.append(subpath)
    if path:
        query = u'site:{}/{}'.format(domain, '/'.join(path))
    else:
        query = u'site:{}'.format(domain,)
    
    print(query)
    
    seoLibrary = SeoDocumentDownloader(
                                       query=query,
                                       language=language,
                                       country=country,
                                       searchEngine=settings.DOWNLOADER_SEARCH_ENGINE,
                                       downloadLimit=settings.SITE_AUDIT_DOWNLOAD_LIMIT,
                                       sameOrigin=True,
                                       useProxy=useProxy
                                       ).getSeoLibrary()    
    
    print(u'Number of original documents: %s' % (len(seoLibrary.seoDocuments)))

    if len(seoLibrary.seoDocuments) < 20:
        raise Exception(u'Error: Not enough documents to get stats')
    
    '''
    Remove links that look like pagination
    '''
    
    DIFFERENCE_LIMIT = 6
    
    paginator = {}
    links2Remove = []
    
    import re
    for seoDocument in seoLibrary.seoDocuments:
        applied = re.sub(u'\d', u'', seoDocument.link)
        difference = len(seoDocument.link) - len(applied)
        if difference != 0 and difference <= DIFFERENCE_LIMIT:
            if applied not in paginator:
                paginator[applied] = []
            paginator[applied].append(seoDocument.link)
    
    for _shortUrl, origins in paginator.items():
        if len(origins) > 1:
            links2Remove.extend(origins)
            #print u'-'*15
            #print u'%s' % _shortUrl
            #for origin in origins:
            #    print u'\t\t%s' % origin
    
    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments if seoDocument.link not in links2Remove]
    
    print(u'Number of documents after pagination filtering: %s' % (len(seoLibrary.seoDocuments)))
    
    '''
    Discard documents that do not reach a minimum length
    '''
    
    import numpy as np
    lengths = [seoDocument.getLenRawTokens() for seoDocument in seoLibrary.seoDocuments]
    percentilLengthText = np.percentile(lengths, 25)
    lowerLimit = max(percentilLengthText, settings.SITE_AUDIT_MIN_DOCUMENT_LENGTH)
    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments if seoDocument.getLenRawTokens() > lowerLimit]
    
    print(u'Number of documents after the FIRST length filter: %s' % (len(seoLibrary.seoDocuments)))
    
    allSentences = {}
    
    for seoDocument in seoLibrary.seoDocuments:
        sentences = seoDocument.getSentences()
        for sentence in sentences:
            if sentence not in allSentences:
                allSentences[sentence] = 0
            allSentences[sentence] += 1
    
    
    for seoDocument in seoLibrary.seoDocuments:
        sentences = seoDocument.getSentences()
        sentencesFiltered = []
        for sentence in sentences:
            if allSentences[sentence] < 2:
                sentencesFiltered.append(sentence)
                
        seoDocument.dataDocument.text = u' . '.join(sentencesFiltered)
        seoDocument.resetPreloads()
    
    
    lengths = [seoDocument.getLenRawTokens() for seoDocument in seoLibrary.seoDocuments]
    lowerLimit = np.percentile(lengths, 25)
    seoLibrary.seoDocuments = [seoDocument for seoDocument in seoLibrary.seoDocuments if seoDocument.getLenRawTokens() > lowerLimit]
    
    print(u'Number of documents after the SECOND length filter: %s' % (len(seoLibrary.seoDocuments)))

    return seoLibrary
Example #23
def post_url(url, fields, files=[]):
    urlparts = urlparse.urlsplit(url)
    return post_multipart(urlparts[1], urlparts[2], fields, files)
Example #24
    def down_html(self, url,dir_name):
        try:

            # extract the __biz parameter
            params =  urlparse.urlsplit(url).query
            params = urlparse.parse_qs(params,True)
            if '__biz' not in params:
                # it may be a Sogou link; convert it to a WeChat link first
                url = self.deal_get_real_url(url)

            url = url.replace('\\x26','&')
            url = url.replace('x26','&')

            print(url)
            h = http.client.Http(timeout=30)
            html = self._get_gzh_article_text(url)
            content = html

            # use regular expressions to pull the relevant variables out of the embedded javascript
            ct = re.findall('var ct = "(.*?)";', content)[0]
            msg_cdn_url = re.findall('var msg_cdn_url = "(.*?)";', content)[0]
            nickname = re.findall('var nickname = "(.*?)";', content)[0]
            if(nickname == ""):
                nickname = "not has name"
            if(ct == ""):
                ct = time.time()

            ctime = time.strftime("%Y%m%d%H%M%S", time.localtime(int(ct)))  # convert the epoch seconds into a date/time string
            # create the output folder
            # encoding conversion
            if isinstance(dir_name, str):
                dir_name = dir_name.encode('GB18030','ignore')
            else: 
                dir_name = dir_name.decode('utf-8','ignore').encode('GB18030','ignore')
            
            #print 
            if isinstance(nickname, str):
                nickname = nickname.encode('GB18030','ignore')
            else: 
                if chardet.detect(nickname)['encoding'] == 'KOI8-R' :
                    print("KOI8")
                    nickname = nickname.decode('KOI8-R','ignore').encode('GB18030','ignore')
                else:
                    print("GB18030")
                    nickname = nickname.decode('utf-8','ignore').encode('GB18030','ignore')

            dir = 'WeiXinGZH/' + nickname + '/' + ctime + '/' + dir_name + '/'
            #dir = 'WeiXinGZH/' + dir_name + '/'
            dir = dir.decode('gb2312','ignore')
            dir = dir.replace("?", "")
            dir = dir.replace("\\", "")
            dir = dir.replace("*", "")
            dir = dir.replace(":", "")
            dir = dir.replace('\"', "")
            dir = dir.replace("<", "")
            dir = dir.replace(">", "")
            dir = dir.replace("|", "")


            try:
                os.makedirs(dir)  # create the directory tree
            except:
                # the directory may already exist; ignore errors
                errormsg = 'none'

            # download the cover image
            url = msg_cdn_url
            print(u'Downloading article: ' + url)
            resp, contentface = h.request(url)
            
            file_name = dir + 'cover.jpg'
            codecs.open(file_name,mode='wb').write(contentface)

            # download the remaining images
            soup = BeautifulSoup(content, 'html.parser')
            count = 0
            #logger.error(html)
            err_count = 0
            for link in soup.find_all('img') :
                try:
                    err_count += 1
                    if err_count > 200:
                        break  # guard against pathological pages

                    if link.get('data-src') is not None:
                        count = count + 1
                        orurl = link.get('data-src')
                        url = orurl.split('?')[0]  # rebuild the url; part of the original url cannot be downloaded
                        #print u'Downloading: ' + url
                        resp, content = h.request(url)

                        matchurlvalue = re.search(r'wx_fmt=(?P<wx_fmt>[^&]*)', orurl)  # urls without the parameter may be gif or jpg
                        if matchurlvalue is not None:
                            wx_fmt = matchurlvalue.group('wx_fmt')  # prefer the wx_fmt parameter to determine the file type
                        else:
                            wx_fmt = binascii.b2a_hex(content[0:4])  # read the first 4 bytes as a hex string

                        #print wx_fmt
                        phototype = { 'jpeg': '.jpg', 'gif' : '.gif', 'png' : '.png', 'jpg' : '.jpg', '47494638' : '.gif', 'ffd8ffe0' : '.jpg', 'ffd8ffe1' : '.jpg', 'ffd8ffdb' : '.jpg', 'ffd8fffe' : '.jpg', 'other' : '.jpg', '89504e47' : '.png' }  # map format markers to file extensions
                        file_name = 'Picture' + str(count) + phototype[wx_fmt]
                        file_path = dir + file_name
                        open(file_path, 'wb').write(content)

                        # replace the image url with the local path
                        re_url = 'data-src="%s(.+?)"' % (url[:-5])
                        re_pic = 'src="%s"' % (file_name)
                        html = re.sub(re_url, re_pic, html)
                except:
                    continue

            with open("%sindex.html" % (dir), "wb") as code :
                code.write(html)

            print(u'Article download complete')
            ret_path = os.path.abspath('.')
            ret_path = ret_path.replace('\\', "/")
            ret_path = "%s/%sindex.html" %(ret_path.decode('GB18030').encode('utf-8'),dir)
            #print(ret_path)
        #except:
        except WechatSogouHistoryMsgException:
            print(u'The article content has abnormal encoding and cannot be downloaded')
            return ""
        return ret_path
Example #25
    def parse_recipe(self, response):
        hxs = HtmlXPathSelector(response)
        recipe = CookpadRecipe()

        # id
        recipe['id'] = int(re.findall(r'recipe/(\d+)', response.url)[0])

        # name
        recipe['name'] = hxs.select("//div[@id='recipe-title']/h1/text()")[0] \
                            .extract().strip()

        # author
        recipe['author'] = int(
            hxs.select("//a[@id='recipe_author_name']/@href").re('(/d+)')[0])

        # description
        recipe['description'] = ''.join(hxs.select("//div[@id='description']/text()") \
                                           .extract()).strip()

        # ingredients
        ingredients = []
        ingredient_basepath = (
            "//div[@id='ingredients']/div[@id='ingredients_list']/"
            "div[@class='ingredient ingredient_row']")
        ingredient_nodes = hxs.select(ingredient_basepath)
        for ingredient_node in ingredient_nodes:
            try:
                if ingredient_node.select('div/span/a'):
                    # keyword ingredient
                    name = ingredient_node.select(
                        'div[1]/span/a/text()').extract()[0]
                else:
                    # normal ingredient
                    name = ingredient_node.select(
                        'div[1]/span/text()').extract()[0]
                quantity = ingredient_node.select('div[2]/text()').extract()[0]
            except:
                continue

            ingredient = Ingredient()
            ingredient['name'] = name
            ingredient['quantity'] = quantity
            ingredients.append(ingredient)
        recipe['ingredients'] = ingredients

        # instructions
        recipe['instructions'] = hxs.select(
            "//dd[@class='instruction']/p/text()").extract()

        # leaf category
        referer = response.request.headers.get('Referer')
        recipe['category'] = int(
            os.path.basename(urlparse.urlsplit(referer).path))

        # all categories (including leaf, internal, and root nodes)
        categories = hxs.select("//div[@id='category_list']/ul/li/a/@href").re(
            r'\d+')
        recipe['categories'] = map(lambda category: int(category), categories)

        # report count
        try:
            recipe['report_count'] = int(''.join(
                hxs.select("//li[@id='tsukurepo_tab']/a/span/text()").re(
                    '(\d+)')))
        except:
            recipe['report_count'] = 0

        # comment count
        try:
            recipe['comment_count'] = int(''.join(
                hxs.select("//li[@id='comment_tab']/a/span/text()").re(
                    '(\d+)')))
        except:
            recipe['comment_count'] = 0

        # advice and history
        for text in ('advice', 'history'):
            recipe[text] = ''.join(
                hxs.select(
                    "//div[@id='{0}']/text()".format(text)).extract()).strip()

        # related keywords
        recipe['related_keywords'] = hxs.select("//div[@class='related_keywords']/a/text()") \
                                        .extract()

        # main image
        image_main = hxs.select("//div[@id='main-photo']/img/@src").extract()
        recipe['image_main'] = image_main[0] if image_main else []

        # instruction images
        recipe['images_instruction'] = hxs.select(
            "//dd[@class='instruction']/div/div[@class='image']/img/@src"
        ).extract()

        # published date
        recipe['published_date'] = hxs.select(
            "//div[@id='recipe_id_and_published_date']/span[2]/text()").re(
                '\d{2}/\d{2}/\d{2}')[0]

        # updated date
        recipe['updated_date'] = hxs.select(
            "//div[@id='recipe_id_and_published_date']/span[3]/text()").re(
                '\d{2}/\d{2}/\d{2}')[0]

        return recipe