Beispiel #1
0
def response(resp):
    """Turn the JSON payload embedded in ``resp.text`` into result dicts.

    The payload is located with ``regex_json``; its leading and trailing
    wrapper text is removed with ``regex_json_remove_start`` and
    ``regex_json_remove_end`` before it is decoded as UTF-8 and parsed.

    Returns a list of dicts. Image entries carry ``url``, ``title``,
    ``content``, ``img_src`` and ``template``; all other entries carry
    ``url``, ``title`` and ``content``. An empty list is returned when the
    payload cannot be found in the response.
    """
    found = regex_json.search(resp.text)

    # nothing to parse when the embedded payload is missing
    if not found:
        return []

    without_head = regex_json_remove_start.sub(b'', found.group())
    raw_payload = regex_json_remove_end.sub(b'', without_head)
    payload = loads(raw_payload.decode('utf-8'))

    def strip_marks(text):
        # drop the private-use characters U+E000 / U+E001 from the text
        return text.replace(u'\uE000', '').replace(u'\uE001', '')

    def image_entry(item):
        # the image location is percent-encoded behind a prefix that
        # regex_img_url_remove_start strips off
        encoded = item['Url'].encode('utf-8')
        img_url = unquote(regex_img_url_remove_start.sub(b'', encoded).decode('utf-8'))
        return {'url': item['SourceUrl'],
                'title': item['Title'],
                'content': '',
                'img_src': img_url,
                'template': 'images.html'}

    results = []

    # entries of the main result list: either images or general hits
    for item in payload['Results'].get('items', []):
        title = strip_marks(item['Title'])

        if item.get('ContentType', '').startswith('image'):
            results.append(image_entry(item))
            continue

        results.append({'url': strip_marks(item['Url']),
                        'title': title,
                        'content': strip_marks(item['Description'])})

    # entries of the separate image list
    for item in payload.get('Images', []):
        results.append(image_entry(item))

    return results
Beispiel #2
0
def compare_urls(url_a, url_b):
    """Tell whether two parsed URLs refer to the same resource.

    Both arguments are objects exposing ``netloc``, ``query``, ``fragment``
    and ``path`` attributes. The comparison ignores a leading ``www.`` in
    the host, a single trailing ``/`` in the path and percent-encoding
    differences in the path. The scheme is not compared.

    Returns ``True`` when the URLs match under those rules, else ``False``.
    """
    def bare_host(netloc):
        # 'www.' is four characters long; strip it only as a prefix
        return netloc[4:] if netloc.startswith('www.') else netloc

    def trimmed(path):
        # remove exactly one trailing slash, if there is one
        return path[:-1] if path.endswith('/') else path

    if bare_host(url_a.netloc) != bare_host(url_b.netloc):
        return False
    if url_a.query != url_b.query:
        return False
    if url_a.fragment != url_b.fragment:
        return False

    return unquote(trimmed(url_a.path)) == unquote(trimmed(url_b.path))
Beispiel #3
0
def parse_url(url_string):
    """Extract and percent-decode the URL embedded after a ``/RU=`` marker.

    The embedded URL starts at the first ``'http'`` found after ``/RU=``
    and ends at the earliest of the last ``/RS`` and the last ``/RK``
    occurrence in the string.

    The input is returned unchanged when there is nothing to extract:
    no ``/RS`` or ``/RK`` terminator is present, the string already starts
    with ``'http'`` and has no ``/RU=`` marker, or no ``'http'`` can be
    found at all.
    """
    endings = ['/RS', '/RK']

    # When '/RU=' is absent, find() gives -1 and the search starts at 0.
    start = url_string.find('http', url_string.find('/RU=') + 1)

    # Last occurrence of every terminator that is actually present.
    endpositions = [
        pos for pos in (url_string.rfind(ending) for ending in endings)
        if pos > -1
    ]

    # start == 0  -> plain URL without a '/RU=' wrapper.
    # start == -1 -> no 'http' found; slicing from -1 would return garbage.
    if start <= 0 or not endpositions:
        return url_string

    return unquote(url_string[start:min(endpositions)])
Beispiel #4
0
def parse_url(url_string):
    """Extract and percent-decode the URL embedded after a ``/RU=`` marker.

    The embedded URL starts at the first ``'http'`` found after ``/RU=``
    and ends at the earliest of the last ``/RS`` and the last ``/RK``
    occurrence in the string.

    The input is returned unchanged when there is nothing to extract:
    no ``/RS`` or ``/RK`` terminator is present, the string already starts
    with ``'http'`` and has no ``/RU=`` marker, or no ``'http'`` can be
    found at all.
    """
    endings = ['/RS', '/RK']

    # When '/RU=' is absent, find() gives -1 and the search starts at 0.
    start = url_string.find('http', url_string.find('/RU=') + 1)

    # Last occurrence of every terminator that is actually present.
    endpositions = [
        pos for pos in (url_string.rfind(ending) for ending in endings)
        if pos > -1
    ]

    # start == 0  -> plain URL without a '/RU=' wrapper.
    # start == -1 -> no 'http' found; slicing from -1 would return garbage.
    if start <= 0 or not endpositions:
        return url_string

    return unquote(url_string[start:min(endpositions)])
Beispiel #5
0
def normalize_url(url):
    """Return a normalized form of ``url``.

    * A URL without a path gets a ``/`` appended.
    * A ``search.yahoo.com`` URL whose path starts with ``/r`` and contains
      ``/**`` is replaced by the percent-decoded text following ``/**``.

    Raises:
        Exception: when ``url`` has no network location.
    """
    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise Exception('Cannot parse url')

    # add a / at the end of the url if there is no path
    # NOTE(review): the slash is appended to the whole string, so it lands
    # after any query or fragment - confirm this is intended.
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        path = parsed_url.path
        mark = path.find('/**')
        if mark != -1:
            target = unquote(path[mark + 3:])
            # unquote() yields text on Python 3, which has no .decode();
            # only byte strings still need decoding.
            if isinstance(target, bytes):
                target = target.decode('utf-8')
            return target

    return url
Beispiel #6
0
def normalize_url(url):
    """Return a normalized form of ``url``.

    * A URL without a path gets a ``/`` appended.
    * A ``search.yahoo.com`` URL whose path starts with ``/r`` and contains
      ``/**`` is replaced by the percent-decoded text following ``/**``.

    Raises:
        Exception: when ``url`` has no network location.
    """
    parsed_url = urlparse(url)

    if not parsed_url.netloc:
        raise Exception('Cannot parse url')

    # add a / at the end of the url if there is no path
    # NOTE(review): the slash is appended to the whole string, so it lands
    # after any query or fragment - confirm this is intended.
    if not parsed_url.path:
        url += '/'

    # FIXME : hack for yahoo
    if parsed_url.hostname == 'search.yahoo.com'\
       and parsed_url.path.startswith('/r'):
        path = parsed_url.path
        mark = path.find('/**')
        if mark != -1:
            target = unquote(path[mark + 3:])
            # unquote() yields text on Python 3, which has no .decode();
            # only byte strings still need decoding.
            if isinstance(target, bytes):
                target = target.decode('utf-8')
            return target

    return url
Beispiel #7
0
def response(resp):
    """Build image result dicts from the model export found in ``resp.text``.

    The export is located with ``modelexport_re`` (capture group 1) and
    parsed as JSON. Every entry of its ``legend`` is a five-part index into
    ``model_export['main']`` that selects one photo record.

    Returns a list of dicts with the keys ``url``, ``title``, ``img_src``,
    ``thumbnail_src``, ``content``, ``author``, ``source``, ``img_format``
    and ``template``. The list is empty when the export is missing, has no
    ``legend``, or the legend is empty. Photos offering none of the sizes
    listed in ``image_sizes`` are logged and skipped.
    """
    found = modelexport_re.search(resp.text)
    if found is None:
        return []

    model_export = loads(found.group(1))
    if 'legend' not in model_export:
        return []

    legend = model_export['legend']

    # an empty legend (or an empty first entry) means an empty page
    if not legend or not legend[0]:
        return []

    results = []
    for index in legend:
        photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]

        author = unquote(photo.get('realname', ''))
        source = unquote(photo.get('username', '')) + ' @ Flickr'
        title = unquote(photo.get('title', ''))
        content = unquote(photo.get('description', ''))

        # first size from image_sizes that this photo actually provides
        chosen = next((size for size in image_sizes if size in photo['sizes']), None)

        img_src = None
        if chosen is not None:
            img_src = photo['sizes'][chosen]['url']
            width = str(photo['sizes'][chosen]['width'])
            height = str(photo['sizes'][chosen]['height'])
            img_format = 'jpg ' + width + 'x' + height

        if not img_src:
            logger.debug('cannot find valid image size: {0}'.format(repr(photo)))
            continue

        # thumbnail: size 'n' when present, otherwise 'z', otherwise the
        # full image itself
        for candidate in ('n', 'z'):
            if candidate in photo['sizes']:
                thumbnail_src = photo['sizes'][candidate]['url']
                break
        else:
            thumbnail_src = img_src

        if 'ownerNsid' in photo:
            url = build_flickr_url(photo['ownerNsid'], photo['id'])
        else:
            # should not happen, disowned photo? Show it anyway
            url = img_src

        results.append({'url': url,
                        'title': title,
                        'img_src': img_src,
                        'thumbnail_src': thumbnail_src,
                        'content': content,
                        'author': author,
                        'source': source,
                        'img_format': img_format,
                        'template': 'images.html'})

    return results