Example 1
Crawls the btdepot search pages for a keyword, follows each result to its detail page, and stores the info-hash, title plus file list, and magnet link in a CSV.
# Imports inferred from this snippet's usage (my_csv_storer is the project's own CSV helper):
import re
import time
import traceback

import requests
from bs4 import BeautifulSoup

import my_csv_storer
	def craw_single_keyword(self, keyword):
		self.cur_key = keyword
		self.cur_key_seachcount = 0
		search_url = self._btdepot_url + "/search/" + keyword
		r = requests.get(search_url, headers = self._headers, timeout=10)
		#print r.cookies
		soup = BeautifulSoup(r.content, "html.parser")
		ret = re.search(r'totalPages: \d+', r.content)
		print "match result ", ret
		if ret is None:
			# Dump the page for debugging and bail out instead of crashing on ret.group below.
			print r.content
			return
		totalPages = int(ret.group(0).split(':')[1].strip())
		csv_name = self._csv_dir + "/breadsearch_" + time.strftime("%Y%m%d") + "_" + self.pid + ".csv"
		storer = my_csv_storer.my_csv_storer(csv_name)
		print 'totalPages:',totalPages

		for page in range(1,totalPages + 1):
			try:
				search_url = self._btdepot_url + "/search/" + keyword +"/" + str(page)
				r = requests.get(search_url, headers = self._headers, timeout=10)
				#print r.cookies
				soup = BeautifulSoup(r.content, "html.parser")
				item_list = soup.find_all("div", class_ = "search-item")
				for i in range(len(item_list)):
					temp = item_list[i].find_all('a')[0]["href"]
					info_url = self._btdepot_url + temp
					# Keep the detail response in its own variable so the search-page
					# response r (and its cookies) is not overwritten inside the loop.
					detail_r = requests.get(info_url, headers = self._headers1, cookies = r.cookies, timeout=10)
					child_soup = BeautifulSoup(detail_r.content, "html.parser")
					magnet_url = child_soup.find_all('ul', class_="prop-list" )[0].find_all('a')[0]['href']
					print magnet_url
					# "magnet:?xt=urn:btih:" is 20 characters, so [20:60] slices out the 40-char hex hash.
					hash_info = magnet_url[20:60]
					title = child_soup.find_all('h1', class_ = 'detail-title')[0].string
					print "==> title", title

					detailfiles = child_soup.find_all('ul',class_="file-list")[0].find_all('li')
					files = [ '-'.join(e.find_all('span')[0].strings) + " " + e.find_all('span')[1].string for e in detailfiles]
					content = title + "\n" + "\n".join(files)
					#print '===============>'
					#print (hash_info)
					#print (content.encode('utf8'))
					#print (magnet_url)
					#print '<==============='
					storer.store(unicode(hash_info).encode('utf8'), unicode(content).encode('utf8'), unicode(magnet_url).encode('utf8'))
					self.cur_key_seachcount += 1
			except Exception as e:
				print "found exception", e
				traceback.print_exc()
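
Note: the slice magnet_url[20:60] assumes the link always starts with the literal 20-character prefix "magnet:?xt=urn:btih:" followed by a 40-character hex hash. A more defensive extraction is sketched below; extract_info_hash is an illustrative helper, not part of the original crawler, and it also tolerates the 32-character base32 hash form:

import re

# Match the BitTorrent info-hash inside a magnet URI: either the
# 40-character hex form or the 32-character base32 form.
_BTIH_RE = re.compile(r'urn:btih:([0-9A-Fa-f]{40}|[A-Z2-7]{32})')

def extract_info_hash(magnet_url):
	m = _BTIH_RE.search(magnet_url)
	return m.group(1) if m else None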
Example 2
The same crawler against a different page layout: results live in "item_container" divs, the magnet link sits in a textarea, and size, file count, index date, and hash are read from labeled spans on the detail page.
# This variant needs the same imports as Example 1 (re, time, traceback, requests, bs4, my_csv_storer).
    def craw_single_keyword(self, keyword):
        self.cur_key = keyword
        self.cur_key_seachcount = 0
        search_url = self._btdepot_url + "/search/" + keyword
        r0 = requests.get(search_url, headers=self._headers, timeout=10)
        # print r0.cookies
        soup = BeautifulSoup(r0.content, "html.parser")
        ret = re.search(r"totalPages: \d+", r0.content)
        print "match result ", ret
        if ret is None:
            # Dump the page for debugging and bail out instead of crashing on ret.group below.
            print r0.content
            return

        totalPages = int(ret.group(0).split(":")[1].strip())
        csv_name = self._csv_dir + "/btdepot_" + time.strftime("%Y%m%d") + "_" + self.pid + ".csv"
        storer = my_csv_storer.my_csv_storer(csv_name)
        print "totalPages:", totalPages
        time.sleep(0.5)

        for page in range(1, totalPages + 1):
            try:
                search_url = self._btdepot_url + "/search/" + keyword + "/" + str(page)
                r = requests.get(search_url, headers=self._headers, timeout=10)
                # print r.cookies
                soup = BeautifulSoup(r.content, "html.parser")
                item_list = soup.find_all("div", class_="item_container")

                time.sleep(0.5)
                for i in range(len(item_list)):
                    if i == 0:
                        # Skip the first container, which apparently is not a result entry.
                        continue
                    temp = item_list[i].a["href"]
                    # print "===>",i, "  " , temp
                    info_url = self._btdepot_url + temp
                    # Fetch the detail page into its own variable rather than reusing r.
                    r_detail = requests.get(info_url, headers=self._headers1, cookies=r0.cookies, timeout=10)
                    child_soup = BeautifulSoup(r_detail.content, "html.parser")
                    magnet_url = child_soup.find_all("textarea")[0].string
                    print magnet_url
                    size = child_soup.find_all("span", string="Size: ")[0].next_sibling.string
                    # File count from the summary row; the detailed file list is built below.
                    file_count = child_soup.find_all("span", string="Files: ")[0].next_sibling.string
                    index_date = child_soup.find_all("span", string="Index Date: ")[0].next_sibling.string
                    hash_info = child_soup.find_all("span", string="Hash: ")[0].next_sibling.string
                    title = child_soup.find_all("h1", class_="torrent_title")[0].string

                    detailfiles = child_soup.find_all("div")
                    files = []
                    for d in detailfiles:
                        if d.has_attr("style") and d["style"] == "margin-bottom: 50px;":
                            fnn = d.find_all("div")
                            files = [
                                "-".join(e.find_all("span")[0].strings) + " " + e.find_all("span")[1].string
                                for e in fnn
                            ]
                    content = title + "\n" + "\n".join(files)
                    # print '===============>'
                    # print type(hash_info)
                    # print type(content.encode('utf8'))
                    # print type(magnet_url)
                    # print '<==============='
                    storer.store(
                        unicode(hash_info).encode("utf8"),
                        unicode(content).encode("utf8"),
                        unicode(magnet_url).encode("utf8"),
                    )
                    self.cur_key_seachcount += 1
            except Exception as e:
                print "found exception", e
                traceback.print_exc()
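
Both variants issue every GET through a bare requests.get with no retry, so one transient network error aborts a whole results page. A fetch helper with simple retries could wrap those calls; fetch_with_retry below is an illustrative sketch, not part of the original code, and its parameter defaults are assumptions:

import time

import requests

def fetch_with_retry(url, headers=None, cookies=None, retries=3, backoff=1.0, timeout=10):
    # Retry the GET with linear backoff; re-raise only after the last attempt fails.
    for attempt in range(retries):
        try:
            r = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
            r.raise_for_status()
            return r
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(backoff * (attempt + 1))

Each requests.get(...) call in the loops above could then go through fetch_with_retry without touching the parsing logic.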