def response(resp): results = [] dom = html.fromstring(resp.text) for result in dom.xpath('//div[@class="dg_u"]'): # try to extract the url url_container = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload') if len(url_container) > 0: url = loads(url_container[0])['purl'] else: url = result.xpath('./a/@href')[0] # discard results that do not return an external url # very recent results sometimes don't return the video's url if url.startswith('/videos/search?'): continue title = extract_text(result.xpath('./a//div[@class="tl"]')) content = extract_text(result.xpath('.//div[@class="pubInfo"]')) thumbnail = result.xpath('.//div[@class="vthumb"]/img/@src')[0] results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail, 'template': 'videos.html'}) # first page ignores requested number of results if len(results) >= number_of_results: break return results
def response(resp):
    results = []

    # we get html in a JSON container...
    response = loads(resp.text)
    if "content" not in response:
        return []
    dom = html.fromstring(response["content"])
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        if thumbnail[0] == '/':
            thumbnail = base_url + thumbnail
        d = extract_text(result.xpath(publishedDate_xpath)[0])
        d = d.split('/')
        # force ISO date to avoid wrong parsing
        d = "%s-%s-%s" % (d[2], d[1], d[0])
        publishedDate = parser.parse(d)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'template': 'videos.html',
            'publishedDate': publishedDate,
            'thumbnail': thumbnail
        })

    # return results
    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden"
        # property="dc:title")'s content property for this...
        title = escape(extract_text(link))

        thumbnail_tags = result.xpath(thumbnail_xpath)
        thumbnail = None
        if len(thumbnail_tags) > 0:
            thumbnail = extract_text(thumbnail_tags[0])
            if thumbnail[0] == '/':
                thumbnail = base_url + thumbnail

        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href, 'title': title, 'img_src': thumbnail, 'content': content})

    # return results
    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        url = base_url + result.xpath(url_xpath)[0]
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(content_xpath)[0])
        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))

        # append result
        results.append(
            {
                "url": url,
                "title": title,
                "content": "",
                "template": "videos.html",
                "publishedDate": publishedDate,
                "thumbnail": thumbnail,
            }
        )

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.content) # parse results for result in dom.xpath(results_xpath): links = result.xpath(link_xpath) if not links: continue link = links[0] url = link.attrib.get('href') # block google-ad url's if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url): continue title = escape(extract_text(link)) if result.xpath('./p[@class="desc"]'): content = escape(extract_text(result.xpath('./p[@class="desc"]'))) else: content = '' # append result results.append({'url': url, 'title': title, 'content': content}) # return results return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        title = extract_text(result.xpath(title_xpath)[0])
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            parsed_url = urlparse(url)
            if parsed_url.netloc == google_hostname and parsed_url.path == search_path:
                # remove the link to google news
                continue

            if parsed_url.netloc == google_hostname and parsed_url.path == images_path:
                # images result
                results = results + parse_images(result)
            else:
                # normal result
                content = extract_text(result.xpath(content_xpath)[0])
                # append result
                results.append({'url': url, 'title': title, 'content': content})
        except:
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.content) # parse results for result in dom.xpath('//div[@class="sa_cc"]'): link = result.xpath('.//h3/a')[0] url = link.attrib.get('href') title = extract_text(link) content = escape(extract_text(result.xpath('.//p'))) # append result results.append({'url': url, 'title': title, 'content': content}) # return results if something is found if results: return results # parse results again if nothing is found yet for result in dom.xpath('//li[@class="b_algo"]'): link = result.xpath('.//h2/a')[0] url = link.attrib.get('href') title = extract_text(link) content = escape(extract_text(result.xpath('.//p'))) # append result results.append({'url': url, 'title': title, 'content': content}) # return results return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    p = HTMLParser()

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath(url_xpath)[0]
        url = base_url + videoid
        title = p.unescape(extract_text(result.xpath(title_xpath)))
        thumbnail = extract_text(result.xpath(thumbnail_xpath)[0])
        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': '',
            'template': 'videos.html',
            'publishedDate': publishedDate,
            'embedded': embedded,
            'thumbnail': thumbnail
        })

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) try: results.append( {"number_of_results": int(dom.xpath('//span[@class="sb_count"]/text()')[0].split()[0].replace(",", ""))} ) except: pass # parse results for result in dom.xpath('//div[@class="sa_cc"]'): link = result.xpath(".//h3/a")[0] url = link.attrib.get("href") title = extract_text(link) content = extract_text(result.xpath(".//p")) # append result results.append({"url": url, "title": title, "content": content}) # parse results again if nothing is found yet for result in dom.xpath('//li[@class="b_algo"]'): link = result.xpath(".//h2/a")[0] url = link.attrib.get("href") title = extract_text(link) content = extract_text(result.xpath(".//p")) # append result results.append({"url": url, "title": title, "content": content}) # return results return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for tweet in dom.xpath(results_xpath):
        try:
            link = tweet.xpath(link_xpath)[0]
            content = extract_text(tweet.xpath(content_xpath)[0])
        except Exception:
            continue

        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(tweet.xpath(title_xpath))

        pubdate = tweet.xpath(timestamp_xpath)
        if len(pubdate) > 0:
            timestamp = float(pubdate[0].attrib.get('data-time'))
            publishedDate = datetime.fromtimestamp(timestamp, None)
            # append result
            results.append({'url': url, 'title': title, 'content': content, 'publishedDate': publishedDate})
        else:
            # append result
            results.append({'url': url, 'title': title, 'content': content})

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) # parse results for result in dom.xpath('//div[@class="g"]'): title = extract_text(result.xpath('.//h3')) url = result.xpath('.//div[@class="r"]/a/@href')[0] content = extract_text(result.xpath('.//span[@class="st"]')) # get thumbnails script = str(dom.xpath('//script[contains(., "_setImagesSrc")]')[0].text) id = result.xpath('.//div[@class="s"]//img/@id')[0] thumbnails_data = re.findall('s=\'(.*?)(?:\\\\[a-z,1-9,\\\\]+\'|\')\;var ii=\[(?:|[\'vidthumb\d+\',]+)\'' + id, script) tmp = [] if len(thumbnails_data) != 0: tmp = re.findall('(data:image/jpeg;base64,[a-z,A-Z,0-9,/,\+]+)', thumbnails_data[0]) thumbnail = '' if len(tmp) != 0: thumbnail = tmp[-1] # append result results.append({'url': url, 'title': title, 'content': content, 'thumbnail': thumbnail, 'template': 'videos.html'}) return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
        except:
            continue

        content = extract_text(result.xpath(content_xpath)[0])

        # append result
        results.append({'url': url, 'title': title, 'content': content})

    # if no suggestion found, return results
    if not suggestion_xpath:
        return results

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results
def response(resp): dom = html.fromstring(resp.content) search_res = dom.xpath('.//td[@class="x-item"]') if not search_res: return list() results = list() for result in search_res: url = urljoin(URL, result.xpath('.//a[@title]/@href')[0]) title = result.xpath('.//a[@title]/text()')[0] content = extract_text(result.xpath('.//div[@class="files"]')) files_data = extract_text(result.xpath('.//div[@class="tail"]')).split() filesize = get_torrent_size(files_data[FILESIZE], files_data[FILESIZE_MULTIPLIER]) magnetlink = result.xpath('.//div[@class="tail"]//a[@class="title"]/@href')[0] results.append({'url': url, 'title': title, 'content': content, 'filesize': filesize, 'magnetlink': magnetlink, 'seed': 'N/A', 'leech': 'N/A', 'template': 'torrent.html'}) return results
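# Note: several of the torrent parsers in this listing call a get_torrent_size(size, multiplier)
# helper that is imported from the project's shared utilities and is not shown here. The sketch
# below is only an illustration of what such a helper might do; the function name, the accepted
# multiplier set and the error handling are assumptions, not the actual implementation.
def get_torrent_size_sketch(filesize, filesize_multiplier):
    # map common decimal and binary unit suffixes to their byte factor (assumed set)
    multipliers = {
        'KB': 1000, 'MB': 1000 ** 2, 'GB': 1000 ** 3, 'TB': 1000 ** 4,
        'KiB': 1024, 'MiB': 1024 ** 2, 'GiB': 1024 ** 3, 'TiB': 1024 ** 4,
    }
    try:
        # e.g. ('1.5', 'GiB') -> 1610612736 bytes
        return int(float(filesize) * multipliers.get(filesize_multiplier, 1))
    except ValueError:
        return None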
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    for r in doc.xpath(result_xpath):
        try:
            res_url = r.xpath(url_xpath)[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(r.xpath(title_xpath))
        content = extract_text(r.xpath(content_xpath))

        # append result
        results.append({'title': title, 'content': content, 'url': res_url})

    # return results
    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    regex = re.compile('3\.jpg.*$')

    # parse results
    for result in dom.xpath('//div[@class="photo"]'):
        link = result.xpath('.//a')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title = extract_text(result.xpath('.//div[@class="title"]'))
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        # To have a bigger thumbnail, uncomment the next line
        # thumbnail_src = regex.sub('4.jpg', thumbnail_src)
        content = extract_text(result.xpath('.//div[@class="info"]'))
        img_src = regex.sub('2048.jpg', thumbnail_src)

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': img_src,
            'content': content,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = result.xpath('@data-context-item-id')[0]

        url = base_youtube_url + videoid
        thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'

        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        embedded = embedded_url.format(videoid=videoid)

        # append result
        results.append({
            'url': url,
            'title': title,
            'content': content,
            'template': 'videos.html',
            'embedded': embedded,
            'thumbnail': thumbnail
        })

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//div[@id="search_res"]/table/tr') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res: link = result.xpath('.//td[@class="torrent_name"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = extract_text(link) content = extract_text(result.xpath('.//pre[@class="snippet"]')[0]) content = "<br />".join(content.split("\n")) filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0] filesize_multiplier = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[1] files = result.xpath('.//span[@class="attr_val"]/text()')[1] seed = result.xpath('.//span[@class="attr_val"]/text()')[2] # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 leech = 0 # convert filesize to byte if possible filesize = get_torrent_size(filesize, filesize_multiplier) # convert files to int if possible if files.isdigit(): files = int(files) else: files = None magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href'] # append result results.append({'url': href, 'title': title, 'content': content, 'seed': seed, 'leech': leech, 'filesize': filesize, 'files': files, 'magnetlink': magnetlink, 'template': 'torrent.html'}) # return results sorted by seeder return sorted(results, key=itemgetter('seed'), reverse=True)
def parse_images(result, google_hostname):
    results = []
    for image in result.xpath(images_xpath):
        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]), google_hostname)
        img_src = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        results.append({"url": url, "title": "", "content": "", "img_src": img_src, "template": "images.html"})

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//table[@id="searchResult"]//tr') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res[1:]: link = result.xpath('.//div[@class="detName"]//a')[0] href = urljoin(url, link.attrib.get("href")) title = extract_text(link) content = escape(extract_text(result.xpath(content_xpath))) seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 # convert leech to int if possible if leech.isdigit(): leech = int(leech) else: leech = 0 magnetlink = result.xpath(magnet_xpath)[0] torrentfile_links = result.xpath(torrent_xpath) if torrentfile_links: torrentfile_link = torrentfile_links[0].attrib.get("href") else: torrentfile_link = None # append result results.append( { "url": href, "title": title, "content": content, "seed": seed, "leech": leech, "magnetlink": magnetlink.attrib.get("href"), "torrentfile": torrentfile_link, "template": "torrent.html", } ) # return results sorted by seeder return sorted(results, key=itemgetter("seed"), reverse=True)
def response(resp): results = [] dom = html.fromstring(resp.content) # parse results for result in dom.xpath('//div[@class="sn_r"]'): link = result.xpath('.//div[@class="newstitle"]/a')[0] url = link.attrib.get('href') title = extract_text(link) contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]') content = escape(extract_text(contentXPath)) # parse publishedDate publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div' '//div[contains(@class,"sn_ST")]' '//span[contains(@class,"sn_tm")]') publishedDate = escape(extract_text(publishedDateXPath)) if re.match("^[0-9]+ minute(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0])) elif re.match("^[0-9]+ hour(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0])) elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) publishedDate = datetime.now()\ - timedelta(hours=int(timeNumbers[0]))\ - timedelta(minutes=int(timeNumbers[1])) elif re.match("^[0-9]+ day(s|) ago$", publishedDate): timeNumbers = re.findall(r'\d+', publishedDate) publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0])) else: try: publishedDate = parser.parse(publishedDate, dayfirst=False) except TypeError: publishedDate = datetime.now() except ValueError: publishedDate = datetime.now() # append result results.append({'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}) # return results return results
def parse_images(result):
    results = []
    for image in result.xpath(images_xpath):
        url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
        img_src = extract_text(image.xpath(image_img_src_xpath)[0])

        # append result
        results.append({
            'url': url,
            'title': '',
            'content': '',
            'img_src': img_src,
            'template': 'images.html'
        })

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//table[@id="searchResult"]//tr') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res[1:]: link = result.xpath('.//div[@class="detName"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = extract_text(link) content = extract_text(result.xpath(content_xpath)) seed, leech = result.xpath('.//td[@align="right"]/text()')[:2] # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 # convert leech to int if possible if leech.isdigit(): leech = int(leech) else: leech = 0 magnetlink = result.xpath(magnet_xpath)[0] torrentfile_links = result.xpath(torrent_xpath) if torrentfile_links: torrentfile_link = torrentfile_links[0].attrib.get('href') else: torrentfile_link = None # append result results.append({'url': href, 'title': title, 'content': content, 'seed': seed, 'leech': leech, 'magnetlink': magnetlink.attrib.get('href'), 'torrentfile': torrentfile_link, 'template': 'torrent.html'}) # return results sorted by seeder return sorted(results, key=itemgetter('seed'), reverse=True)
def response(resp): results = [] dom = html.fromstring(resp.text) search_lang = "" # dirty fix for languages named differenly in their site if resp.search_params['language'][:2] == 'fa': search_lang = 'Farsi' elif resp.search_params['language'] == 'pt-BR': search_lang = 'Brazilian' else: search_lang = [lc[3] for lc in language_codes if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]] search_lang = search_lang[0].split(' (')[0] # parse results for result in dom.xpath(results_xpath): link = result.xpath(".//a")[0] href = link.attrib.get('href') if language is not "": href = href + language + '/' elif search_lang: href = href + search_lang + '/' title = extract_text(link) content = extract_text(result.xpath('.//div[contains(@class,"red")]')) content = content + " - " text = extract_text(result.xpath('.//div[contains(@class,"grey-web")]')[0]) content = content + text if result.xpath(".//span") != []: content = content +\ " - (" +\ extract_text(result.xpath(".//span")) +\ ")" # append result results.append({'url': href, 'title': title, 'content': content}) # return results return results
def response(resp):
    '''post-response callback
    resp: requests response object
    '''
    results = []

    dom = html.fromstring(resp.text)

    try:
        number_of_results_string = re.sub('[^0-9]', '', dom.xpath(
            '//a[@class="active" and contains(@href,"/suchen/dudenonline")]/span/text()')[0]
        )

        results.append({'number_of_results': int(number_of_results_string)})

    except:
        logger.debug("Couldn't read number of results.")
        pass

    for result in dom.xpath('//section[@class="wide" and not(contains(@style,"overflow:hidden"))]'):
        try:
            logger.debug("running for %s" % str(result))
            link = result.xpath('.//h2/a')[0]
            url = link.attrib.get('href')
            title = result.xpath('string(.//h2/a)')
            content = extract_text(result.xpath('.//p'))
            # append result
            results.append({'url': url, 'title': title, 'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    return results
def response(resp):
    results = []

    # return empty array if a redirection code is returned
    if resp.status_code == 302:
        return []

    dom = html.fromstring(resp.text)

    regex = re.compile('\/200H\/')

    # parse results
    for result in dom.xpath('//div[contains(@class, "tt-a tt-fh")]'):
        link = result.xpath('.//a[contains(@class, "thumb")]')[0]
        url = urljoin(base_url, link.attrib.get('href'))
        title_links = result.xpath('.//span[@class="details"]//a[contains(@class, "t")]')
        title = extract_text(title_links[0])
        thumbnail_src = link.xpath('.//img')[0].attrib.get('src')
        img_src = regex.sub('/', thumbnail_src)

        # http to https, remove domain sharding
        thumbnail_src = re.sub(r"https?://(th|fc)\d+.", "https://th01.", thumbnail_src)
        thumbnail_src = re.sub(r"http://", "https://", thumbnail_src)

        url = re.sub(r"http://(.*)\.deviantart\.com/", "https://\\1.deviantart.com/", url)

        # append result
        results.append({
            'url': url,
            'title': title,
            'img_src': img_src,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.text) for result in dom.xpath('//div[@class="results"]/dl'): name_cell = result.xpath('./dt')[0] title = extract_text(name_cell) # skip rows that do not contain a link to a torrent links = name_cell.xpath('./a') if len(links) != 1: continue # extract url and remove a slash in the beginning link = links[0].attrib.get('href').lstrip('/') seed = 0 leech = 0 try: seed = int(result.xpath('./dd/span[4]/text()')[0].replace(',', '')) leech = int(result.xpath('./dd/span[5]/text()')[0].replace(',', '')) except: pass params = { 'url': base_url + link, 'title': title, 'seed': seed, 'leech': leech, 'template': 'torrent.html' } # let's try to calculate the torrent size try: filesize_info = result.xpath('./dd/span[3]/text()')[0] filesize, filesize_multiplier = filesize_info.split() filesize = get_torrent_size(filesize, filesize_multiplier) params['filesize'] = filesize except: pass # does our link contain a valid SHA1 sum? if re.compile('[0-9a-fA-F]{40}').match(link): # add a magnet link to the result params['magnetlink'] = 'magnet:?xt=urn:btih:' + link # extract and convert creation date try: date_ts = result.xpath('./dd/span[2]')[0].attrib.get('title') date = datetime.fromtimestamp(float(date_ts)) params['publishedDate'] = date except: pass results.append(params) return results
def result_to_text(url, text, htmlResult):
    # TODO : remove result ending with "Meaning" or "Category"
    dom = html.fromstring(htmlResult)
    a = dom.xpath('//a')
    if len(a) >= 1:
        return extract_text(a[0])
    else:
        return text
def response(resp):
    results = []
    dom = html.fromstring(resp.text)

    for result in dom.xpath(results_xpath):
        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])
        results.append({'url': url, 'title': title, 'content': content})

    if not suggestion_xpath:
        return results

    for suggestion in dom.xpath(suggestion_xpath):
        results.append({'suggestion': extract_text(suggestion)})

    return results
def response(resp):
    results = []

    doc = fromstring(resp.text)

    # parse results
    # Quickhits
    for r in doc.xpath('//div[@class="search_quickresult"]/ul/li'):
        try:
            res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
        except:
            continue

        if not res_url:
            continue

        title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))

        # append result
        results.append({'title': title, 'content': "", 'url': base_url + res_url})

    # Search results
    for r in doc.xpath('//dl[@class="search_results"]/*'):
        try:
            if r.tag == "dt":
                res_url = r.xpath('.//a[@class="wikilink1"]/@href')[-1]
                title = extract_text(r.xpath('.//a[@class="wikilink1"]/@title'))
            elif r.tag == "dd":
                content = extract_text(r.xpath('.'))

                # append result
                results.append({'title': title, 'content': content, 'url': base_url + res_url})
        except:
            continue

        if not res_url:
            continue

    # return results
    return results
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        urls = result.xpath(url_xpath)
        if len(urls) != 1:
            continue
        url = sanitize_url(parse_url(extract_url(urls, search_url)))
        title = extract_text(result.xpath(title_xpath)[0])
        content = extract_text(result.xpath(content_xpath)[0])

        # parse publishedDate
        publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

        # still useful ?
        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ days? ago$", publishedDate):
            publishedDate = datetime.now() - timedelta(days=int(re.match(r'\d+', publishedDate).group()))
        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
            timeNumbers = re.findall(r'\d+', publishedDate)
            publishedDate = datetime.now()\
                - timedelta(hours=int(timeNumbers[0]))\
                - timedelta(minutes=int(timeNumbers[1]))
        else:
            try:
                publishedDate = parser.parse(publishedDate)
            except:
                publishedDate = datetime.now()

        if publishedDate.year == 1900:
            publishedDate = publishedDate.replace(year=datetime.now().year)

        # append result
        results.append({'url': url, 'title': title, 'content': content, 'publishedDate': publishedDate})

    # return results
    return results
def response(resp): """Get response from google's search request""" results = [] # detect google sorry resp_url = urlparse(resp.url) if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': raise RuntimeWarning('sorry.google.com') if resp_url.path.startswith('/sorry'): raise RuntimeWarning(gettext('CAPTCHA required')) # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) img_bas64_map = scrap_out_thumbs(dom) img_src_script = eval_xpath( dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text # parse results # # root element:: # <div id="islmp" ..> # result div per image:: # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." # The data-id matches to a item in a json-data structure in:: # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... # In this structure the link to the origin PNG, JPG or whatever is given # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: # <img class="rg_i Q4LuWd" data-iid="0" # second link per image-div is the target link:: # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> # the second link also contains two div tags with the *description* and *publisher*:: # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> # <div class="fxgdke">en.wikipedia.org</div> root = eval_xpath(dom, '//div[@id="islmp"]') if not root: logger.error("did not find root element id='islmp'") return results root = root[0] for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'): try: img_alt = eval_xpath(img_node, '@alt')[0] img_base64_id = eval_xpath(img_node, '@data-iid') if img_base64_id: img_base64_id = img_base64_id[0] thumbnail_src = img_bas64_map[img_base64_id] else: thumbnail_src = eval_xpath(img_node, '@src') if not thumbnail_src: thumbnail_src = eval_xpath(img_node, '@data-src') if thumbnail_src: thumbnail_src = thumbnail_src[0] else: thumbnail_src = '' link_node = eval_xpath(img_node, '../../../a[2]')[0] url = eval_xpath(link_node, '@href')[0] pub_nodes = eval_xpath(link_node, './div/div') pub_descr = img_alt pub_source = '' if pub_nodes: pub_descr = extract_text(pub_nodes[0]) pub_source = extract_text(pub_nodes[1]) img_src_id = eval_xpath(img_node, '../../../@data-id')[0] src_url = scrap_img_by_id(img_src_script, img_src_id) if not src_url: src_url = thumbnail_src results.append({ 'url': url, 'title': img_alt, 'content': pub_descr, 'source': pub_source, 'img_src': src_url, # 'img_format': img_format, 'thumbnail_src': thumbnail_src, 'template': 'images.html' }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(img_node, pretty_print=True)) # import pdb # pdb.set_trace() continue return results
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//section[@id="#torrents"]/div/table/tbody/tr') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res: link = result.xpath('.//a[@id="torrent_name"]')[0] href = link.attrib.get('href') title = extract_text(link) seed = result.xpath('.//td[8]/text()')[0] leech = result.xpath('.//td[9]/text()')[0] # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 # convert leech to int if possible if leech.isdigit(): leech = int(leech) else: leech = 0 params = { 'url': href, 'title': title, 'seed': seed, 'leech': leech, 'template': 'torrent.html' } # let's try to calculate the torrent size try: filesize_info = result.xpath('.//td[6]/text()')[0] filesize = filesize_info[:-2] filesize_multiplier = filesize_info[-2:].lower() multiplier_french_to_english = { 'to': 'TiB', 'go': 'GiB', 'mo': 'MiB', 'ko': 'KiB' } filesize = get_torrent_size( filesize, multiplier_french_to_english[filesize_multiplier]) params['filesize'] = filesize except: pass # extract and convert creation date try: date_ts = result.xpath('.//td[5]/div/text()')[0] date = datetime.fromtimestamp(float(date_ts)) params['publishedDate'] = date except: pass # append result results.append(params) # return results sorted by seeder return sorted(results, key=itemgetter('seed'), reverse=True)
def response(resp):
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath(xpath_results):
        # defaults
        filesize = 0
        magnet_link = ""
        torrent_link = ""

        # category in which our torrent belongs
        try:
            category = result.xpath(xpath_category)[0].attrib.get('title')
        except:
            pass

        # torrent title
        page_a = result.xpath(xpath_title)[0]
        title = extract_text(page_a)

        # link to the page
        href = base_url + page_a.attrib.get('href')

        for link in result.xpath(xpath_torrent_links):
            url = link.attrib.get('href')
            if 'magnet' in url:
                # link to the magnet
                magnet_link = url
            else:
                # link to the torrent file
                torrent_link = url

        # seed count
        seed = int_or_zero(result.xpath(xpath_seeds))

        # leech count
        leech = int_or_zero(result.xpath(xpath_leeches))

        # torrent downloads count
        downloads = int_or_zero(result.xpath(xpath_downloads))

        # let's try to calculate the torrent size
        try:
            filesize_info = result.xpath(xpath_filesize)[0]
            filesize, filesize_multiplier = filesize_info.split()
            filesize = get_torrent_size(filesize, filesize_multiplier)
        except:
            pass

        # content string contains all information not included into template
        content = 'Category: "{category}". Downloaded {downloads} times.'
        content = content.format(category=category, downloads=downloads)

        results.append({
            'url': href,
            'title': title,
            'content': content,
            'seed': seed,
            'leech': leech,
            'filesize': filesize,
            'torrentfile': torrent_link,
            'magnetlink': magnet_link,
            'template': 'torrent.html'
        })

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) rows = dom.xpath( '//table[@class="listing"]//tr[contains(@class, "category_0")]') # check if there are no results or page layout was changed so we cannot parse it # currently there are two rows for each result, so total count must be even if len(rows) == 0 or len(rows) % 2 != 0: return [] # regular expression for parsing torrent size strings size_re = re.compile('Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) # processing the results, two rows at a time for i in xrange(0, len(rows), 2): # parse the first row name_row = rows[i] links = name_row.xpath('./td[@class="desc-top"]/a') params = { 'template': 'torrent.html', 'url': links[-1].attrib.get('href'), 'title': extract_text(links[-1]) } # I have not yet seen any torrents without magnet links, but # it's better to be prepared to stumble upon one some day if len(links) == 2: magnet = links[0].attrib.get('href') if magnet.startswith('magnet'): # okay, we have a valid magnet link, let's add it to the result params['magnetlink'] = magnet # no more info in the first row, start parsing the second one info_row = rows[i + 1] desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0]) for item in desc.split('|'): item = item.strip() if item.startswith('Size:'): try: # ('1.228', 'GB') groups = size_re.match(item).groups() multiplier = get_filesize_mul(groups[1]) params['filesize'] = int(multiplier * float(groups[0])) except Exception as e: pass elif item.startswith('Date:'): try: # Date: 2016-02-21 21:44 UTC date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC') params['publishedDate'] = date except Exception as e: pass elif item.startswith('Comment:'): params['content'] = item stats = info_row.xpath('./td[@class="stats"]/span') # has the layout not changed yet? if len(stats) == 3: params['seed'] = int_or_zero(extract_text(stats[0])) params['leech'] = int_or_zero(extract_text(stats[1])) results.append(params) return results
def response(resp):
    from searx.webapp import sentry
    results = []

    dom = html.fromstring(resp.text)

    try:
        results.append({'number_of_results': int(dom.xpath('//p[@class="num-tips"]/text()')[0]
                                                 .split(u'\u7ea6')[1].split(u'\u6761')[0].replace(',', ''))})
    except Exception:
        sentry.captureException()

    # parse results
    try:
        for result in dom.xpath('//div[@class="vrwrap"]'):
            try:
                url = result.xpath('.//a')[0].attrib.get('href') if result.xpath('.//a')[0].attrib.get(
                    'href').startswith("http") else "https://sogou.com" + result.xpath('.//a')[0].attrib.get('href')

                # parse weixin.sogou html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath('.//div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX, extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                # append result
                results.append({'url': url, 'showurl': showurl, 'title': title, 'content': content})
            except Exception:
                sentry.captureException()
                continue
    except Exception as e:
        sentry.captureException()

    try:
        for result in dom.xpath('//div[@class="rb"]'):
            try:
                url = result.xpath('.//a')[0].attrib.get('href') if result.xpath('.//a')[0].attrib.get(
                    'href').startswith("http") else "https://sogou.com" + result.xpath('.//a')[0].attrib.get('href')

                # to parse sogou weixin html
                if "http://weixin.sogou.com/" == url.strip():
                    url = result.xpath('.//div[@class="str-pd-box str-pd-none"]//a')[0].attrib.get('href')
                    title = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a')[0])
                    content = extract_text(
                        result.xpath('.//div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]')[0])
                else:
                    title = extract_text(result.xpath('.//h3/a')[0])
                    content = extract_text(result.xpath('.//div')[0])

                if 'sogou.com/link?url' in url:
                    url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                        url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                    showurl = re.findall(WEB_URL_REGEX, extract_text(result.xpath('.//div[@class="fb"]')))[0]
                    showurl = showurl.lstrip('.')
                else:
                    showurl = url

                results.append({'url': url, 'showurl': showurl, 'title': title, 'content': content})
            except Exception as e:
                sentry.captureException()
                continue
    except Exception as e:
        sentry.captureException()

    # return results
    return results
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path)) or
                    (parsed_url.netloc.startswith(map_hostname_start))):
                x = result.xpath(map_near)
                if len(x) > 0:
                    # map : near the location
                    results = results + parse_map_near(parsed_url, x, google_hostname)
                else:
                    # map : detail about a location
                    results = results + parse_map_detail(parsed_url, result, google_hostname)

            # google news
            elif (parsed_url.netloc == google_hostname and parsed_url.path == search_path):
                # skipping news results
                pass

            # images result
            elif (parsed_url.netloc == google_hostname and parsed_url.path == images_path):
                # only thumbnail image provided,
                # so skipping image results
                # results = results + parse_images(result, google_hostname)
                pass

            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url, 'title': title, 'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': escape(extract_text(suggestion))})

    # return results
    return results
def response(resp): results = [] dom = html.fromstring(resp.content) # parse results for result in dom.xpath(results_xpath): links = result.xpath(link_xpath) if not links: continue link = links[0] url = link.attrib.get('href') # block google-ad url's if re.match(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url): continue # block startpage search url's if re.match(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url): continue # block ixquick search url's if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url): continue title = extract_text(link) if result.xpath('./p[@class="desc clk"]'): content = extract_text(result.xpath('./p[@class="desc clk"]')) else: content = '' published_date = None # check if search result starts with something like: "2 Sep 2014 ... " if re.match( r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content): date_pos = content.find('...') + 4 date_string = content[0:date_pos - 5] published_date = parser.parse(date_string, dayfirst=True) # fix content string content = content[date_pos:] # check if search result starts with something like: "5 days ago ... " elif re.match(r"^[0-9]+ days? ago \.\.\. ", content): date_pos = content.find('...') + 4 date_string = content[0:date_pos - 5] # calculate datetime published_date = datetime.now() - timedelta( days=int(re.match(r'\d+', date_string).group())) # fix content string content = content[date_pos:] if published_date: # append result results.append({ 'url': url, 'title': title, 'content': content, 'publishedDate': published_date }) else: # append result results.append({'url': url, 'title': title, 'content': content}) # return results return results
def extract_text_from_dom(result, xpath):
    r = result.xpath(xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None
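# Note: extract_text() is used throughout this listing but defined elsewhere in the project.
# As a rough, assumed approximation, it flattens an lxml element (or a list of elements/strings
# returned by xpath()) into whitespace-normalized text. A minimal stand-in for experimenting
# with the parsers above could look like this sketch; it is not the project's implementation.
def extract_text_sketch(xpath_result):
    if isinstance(xpath_result, list):
        # join the text of every matched node
        return ' '.join(extract_text_sketch(e) for e in xpath_result).strip()
    if isinstance(xpath_result, str):
        # attribute values and text() results arrive as plain strings
        return xpath_result.strip()
    # an lxml element: take its full text content and normalize whitespace
    return ' '.join(xpath_result.text_content().split())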
def response(resp):
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect':
        raise RuntimeWarning('sorry.google.com')

    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which hostname ?
    google_hostname = resp.search_params.get('google_hostname')
    google_url = "https://" + google_hostname

    # convert the text to dom
    dom = html.fromstring(resp.text)

    instant_answer = dom.xpath('//div[@id="_vBb"]//text()')
    if instant_answer:
        results.append({'answer': u' '.join(instant_answer)})
    try:
        results_num = int(dom.xpath('//div[@id="resultStats"]//text()')[0]
                          .split()[1].replace(',', ''))
        results.append({'number_of_results': results_num})
    except:
        pass

    # parse results
    for result in dom.xpath(results_xpath):
        try:
            title = extract_text(result.xpath(title_xpath)[0])
            url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
            parsed_url = urlparse(url, google_hostname)

            # map result
            if parsed_url.netloc == google_hostname:
                # TODO fix inside links
                continue
                # if parsed_url.path.startswith(maps_path) or parsed_url.netloc.startswith(map_hostname_start):
                #     print "yooooo"*30
                #     x = result.xpath(map_near)
                #     if len(x) > 0:
                #         # map : near the location
                #         results = results + parse_map_near(parsed_url, x, google_hostname)
                #     else:
                #         # map : detail about a location
                #         results = results + parse_map_detail(parsed_url, result, google_hostname)
                # # google news
                # elif parsed_url.path == search_path:
                #     # skipping news results
                #     pass
                # # images result
                # elif parsed_url.path == images_path:
                #     # only thumbnail image provided,
                #     # so skipping image results
                #     # results = results + parse_images(result, google_hostname)
                #     pass
            else:
                # normal result
                content = extract_text_from_dom(result, content_xpath)
                if content is None:
                    continue
                content_misc = extract_text_from_dom(result, content_misc_xpath)
                if content_misc is not None:
                    content = content_misc + "<br />" + content
                # append result
                results.append({'url': url, 'title': title, 'content': content})
        except:
            logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True))
            continue

    # parse suggestion
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    for correction in dom.xpath(spelling_suggestion_xpath):
        results.append({'correction': extract_text(correction)})

    # return results
    return results
def getDetail(jsonresponse, wikidata_id, language, locale):
    results = []
    urls = []
    attributes = []

    title = jsonresponse.get('parse', {}).get('displaytitle', {})
    result = jsonresponse.get('parse', {}).get('text', {})

    if not title or not result:
        return results

    title = fromstring(title)
    for elem in title.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)
    title = extract_text(title.xpath(title_xpath))

    result = fromstring(result)
    for elem in result.xpath(language_fallback_xpath):
        elem.getparent().remove(elem)
    description = extract_text(result.xpath(description_xpath))

    # URLS
    # official website
    add_url(urls, result, 'P856', results=results)

    # wikipedia
    wikipedia_link_count = 0
    wikipedia_link = get_wikilink(result, language + 'wiki')
    if wikipedia_link:
        wikipedia_link_count += 1
        urls.append({'title': 'Wikipedia (' + language + ')', 'url': wikipedia_link})

    if language != 'en':
        wikipedia_en_link = get_wikilink(result, 'enwiki')
        if wikipedia_en_link:
            wikipedia_link_count += 1
            urls.append({'title': 'Wikipedia (en)', 'url': wikipedia_en_link})

    # TODO: get_wiki_firstlanguage
    # if wikipedia_link_count == 0:

    # more wikis
    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')

    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')

    # musicbrainz
    add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
    add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
    add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
    add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')

    # IMDb
    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
    # source code repository
    add_url(urls, result, 'P1324')
    # blog
    add_url(urls, result, 'P1581')
    # social media links
    add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
    add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
    add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/')
    add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/')
    add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/')

    urls.append({'title': 'Wikidata',
                 'url': 'https://www.wikidata.org/wiki/' + wikidata_id + '?uselang=' + language})

    # INFOBOX ATTRIBUTES (ROWS)

    # DATES
    # inception date
    add_attribute(attributes, result, 'P571', date=True)
    # dissolution date
    add_attribute(attributes, result, 'P576', date=True)
    # start date
    add_attribute(attributes, result, 'P580', date=True)
    # end date
    add_attribute(attributes, result, 'P582', date=True)
    # date of birth
    add_attribute(attributes, result, 'P569', date=True)
    # date of death
    add_attribute(attributes, result, 'P570', date=True)
    # date of spacecraft launch
    add_attribute(attributes, result, 'P619', date=True)
    # date of spacecraft landing
    add_attribute(attributes, result, 'P620', date=True)

    # nationality
    add_attribute(attributes, result, 'P27')
    # country of origin
    add_attribute(attributes, result, 'P495')
    # country
    add_attribute(attributes, result, 'P17')
    # headquarters
    add_attribute(attributes, result, 'Q180')

    # PLACES
    # capital
    add_attribute(attributes, result, 'P36', trim=True)
    # head of state
    add_attribute(attributes, result, 'P35', trim=True)
    # head of government
    add_attribute(attributes, result, 'P6', trim=True)
    # type of government
    add_attribute(attributes, result, 'P122')
    # official language
    add_attribute(attributes, result, 'P37')
    # population
    add_attribute(attributes, result, 'P1082', trim=True)
    # area
    add_attribute(attributes, result, 'P2046')
    # currency
    add_attribute(attributes, result, 'P38', trim=True)
    # height (building)
    add_attribute(attributes, result, 'P2048')

    # MEDIA
    # platform (videogames)
    add_attribute(attributes, result, 'P400')
    # author
    add_attribute(attributes, result, 'P50')
    # creator
    add_attribute(attributes, result, 'P170')
    # director
    add_attribute(attributes, result, 'P57')
    # performer
    add_attribute(attributes, result, 'P175')
    # developer
    add_attribute(attributes, result, 'P178')
    # producer
    add_attribute(attributes, result, 'P162')
    # manufacturer
    add_attribute(attributes, result, 'P176')
    # screenwriter
    add_attribute(attributes, result, 'P58')
    # production company
    add_attribute(attributes, result, 'P272')
    # record label
    add_attribute(attributes, result, 'P264')
    # publisher
    add_attribute(attributes, result, 'P123')
    # original network
    add_attribute(attributes, result, 'P449')
    # distributor
    add_attribute(attributes, result, 'P750')
    # composer
    add_attribute(attributes, result, 'P86')
    # publication date
    add_attribute(attributes, result, 'P577', date=True)
    # genre
    add_attribute(attributes, result, 'P136')
    # original language
    add_attribute(attributes, result, 'P364')
    # isbn
    add_attribute(attributes, result, 'Q33057')
    # software license
    add_attribute(attributes, result, 'P275')
    # programming language
    add_attribute(attributes, result, 'P277')
    # version
    add_attribute(attributes, result, 'P348', trim=True)
    # narrative location
    add_attribute(attributes, result, 'P840')

    # LANGUAGES
    # number of speakers
    add_attribute(attributes, result, 'P1098')
    # writing system
    add_attribute(attributes, result, 'P282')
    # regulatory body
    add_attribute(attributes, result, 'P1018')
    # language code
    add_attribute(attributes, result, 'P218')

    # OTHER
    # ceo
    add_attribute(attributes, result, 'P169', trim=True)
    # founder
    add_attribute(attributes, result, 'P112')
    # legal form (company/organization)
    add_attribute(attributes, result, 'P1454')
    # operator
    add_attribute(attributes, result, 'P137')
    # crew members (tripulation)
    add_attribute(attributes, result, 'P1029')
    # taxon
    add_attribute(attributes, result, 'P225')
    # chemical formula
    add_attribute(attributes, result, 'P274')
    # winner (sports/contests)
    add_attribute(attributes, result, 'P1346')
    # number of deaths
    add_attribute(attributes, result, 'P1120')
    # currency code
    add_attribute(attributes, result, 'P498')

    image = add_image(result)

    if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
        results.append({
            'url': urls[0]['url'],
            'title': title,
            'content': description
        })
    else:
        results.append({
            'infobox': title,
            'id': wikipedia_link,
            'content': description,
            'img_src': image,
            'attributes': attributes,
            'urls': urls
        })

    return results
def response(resp): results = [] dom = html.fromstring(resp.text) search_res = dom.xpath('//div[@id="search_res"]/table/tr') # return empty array if nothing is found if not search_res: return [] # parse results for result in search_res: link = result.xpath('.//td[@class="torrent_name"]//a')[0] href = urljoin(url, link.attrib.get('href')) title = escape(extract_text(link)) content = escape( extract_text(result.xpath('.//pre[@class="snippet"]')[0])) content = "<br />".join(content.split("\n")) filesize = result.xpath( './/span[@class="attr_val"]/text()')[0].split()[0] filesize_multiplier = result.xpath( './/span[@class="attr_val"]/text()')[0].split()[1] files = result.xpath('.//span[@class="attr_val"]/text()')[1] seed = result.xpath('.//span[@class="attr_val"]/text()')[2] # convert seed to int if possible if seed.isdigit(): seed = int(seed) else: seed = 0 leech = 0 # convert filesize to byte if possible try: filesize = float(filesize) # convert filesize to byte if filesize_multiplier == 'TB': filesize = int(filesize * 1024 * 1024 * 1024 * 1024) elif filesize_multiplier == 'GB': filesize = int(filesize * 1024 * 1024 * 1024) elif filesize_multiplier == 'MB': filesize = int(filesize * 1024 * 1024) elif filesize_multiplier == 'KB': filesize = int(filesize * 1024) except: filesize = None # convert files to int if possible if files.isdigit(): files = int(files) else: files = None magnetlink = result.xpath('.//td[@class="ttth"]//a')[0].attrib['href'] # append result results.append({ 'url': href, 'title': title, 'content': content, 'seed': seed, 'leech': leech, 'filesize': filesize, 'files': files, 'magnetlink': magnetlink, 'template': 'torrent.html' }) # return results sorted by seeder return sorted(results, key=itemgetter('seed'), reverse=True)