def test_multi_threaded():
    auth_data = {HS_BASIC_HOST: (HS_BASIC_AUTH, AuthType.Basic)} if HS_BASIC_HOST and HS_BASIC_AUTH else None
    with TorRequests(auth_data=auth_data) as tor_requests:
        links = [
            'https://httpbin.org/headers',
            'https://google.com',
            'https://ifconfig.me',
            'http://facebookcorewwwi.onion',
        ]
        if HS_BASIC_HOST:
            links.append('http://' + HS_BASIC_HOST)
        links = links * 10

        with tor_requests.get_session(retries=RETRIES) as sess:
            def process(link):
                try:
                    logger.debug('get link: %s', link)
                    r = sess.get(link, timeout=30)
                    logger.warning('get link %s finish: %s', link, r)
                    return r
                except BaseException:
                    logger.exception('get link %s error', link)

            pool = ThreadPool(10)
            for i, w in enumerate(pool._pool):
                w.name = 'Worker{}'.format(i)
            results = pool.map(process, links)
            pool.close()
            pool.join()
            logger.debug('test_multi_threaded ends: %r', results)
def scrape_url(url: str):
    with TorRequests() as tor_requests:
        logger.debug("build circuit")
        with tor_requests.get_session() as sess:
            logger.info(sess.get("http://httpbin.org/ip").json())
            content = sess.get(url).content
            logger.debug(content)
            return content
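# Example call for the snippet above; assumes the module-level `logger` it uses
# is already configured, and the target URL is only a placeholder.
import logging

if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    page = scrape_url('https://check.torproject.org')
    print('{} bytes fetched over a fresh Tor circuit'.format(len(page)))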
def g():
    with TorRequests() as tr:
        tr.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; it-IT; rv:1.9.0.2) Gecko/2008092313 Ubuntu/9.04 (jaunty) Firefox/3.5')
        with tr.get_session() as s:
            while True:
                c = s.get('http://cvmun.000webhostapp.com/cred.txt')
                cs = c.text.split(' ')
                if cs == ['127.0.0.1', '0001']:
                    continue
                return cs
def main():
    subprocess.call('cls' if os.name == 'nt' else 'clear', shell=True)
    print(banner)
    print(' Made by @Akex64')
    keyword = input(' [~] Keyword => ')
    time.sleep(1)
    checked = 0
    while True:
        ctypes.windll.kernel32.SetConsoleTitleW(f" Tor Search | Checked {checked}")
        url = randomurl()
        try:
            with TorRequests() as tr:
                with tr.get_session() as sess:
                    ret = sess.get(url).text
                    checked += 1
                    if keyword in ret:
                        print(f'{Fore.WHITE}[{Fore.GREEN}+{Fore.WHITE}] {Fore.GREEN}Found web-server {url}')
        except Exception:
            checked += 1
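# randomurl() is not defined in this snippet. A plausible sketch, assuming the
# scanner probes random version-2 .onion addresses (16 base32 characters);
# the real generator may look different.
import random
import string

def randomurl():
    # Hypothetical helper: build a random v2 .onion URL from the
    # base32 alphabet (a-z and 2-7).
    alphabet = string.ascii_lowercase + '234567'
    name = ''.join(random.choice(alphabet) for _ in range(16))
    return 'http://{}.onion/'.format(name)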
def torsession():
    global last_cursor, this_cursor, post_list, run_number, total_posts, ploc
    # catch last cursor if empty
    if last_cursor == "Last_Cursor_empty":
        print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
        return "no_more_page"
    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess, tqdm(total=0) as pbar:
            print("Circuit built.")  # connection works
            i = 0
            while i < max_requests:  # enter main loop
                print("Start iteration {}: {}".format(i, datetime.datetime.now()))
                if last_cursor == "Last_Cursor_empty":
                    print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
                    return "no_more_page"
                try:
                    ireq = sess.get(ilink(cursor=last_cursor), headers=headers)  # fire request
                    idata = ireq.json()  # get data from page as json
                except Exception:
                    try:
                        print("Tor end node blocked. Last response: {}".format(ireq))
                    except Exception:
                        print("Tor end node blocked.")
                    return  # go back to the main loop and get a new session
                if idata["data"][location_or_hashtag] is None:
                    print("No posts available!")
                    return "no_more_page"
                # access response json
                edge_to_media = idata["data"][location_or_hashtag]["edge_{}_to_media".format(location_or_hashtag)]
                # if new posts appear while scraping, they are included
                total_posts = edge_to_media["count"]
                pbar.total = total_posts
                pbar.refresh()
                ipage = edge_to_media["edges"]  # get posts
                # append location information for location scraping
                if location_or_hashtag == "location":
                    ploc = idata["data"][location_or_hashtag]
                    ipage = add_locations_data_to_cleaned_node(ipage)
                else:
                    ipage = add_locations_data_to_cleaned_node(ipage, just_clean=True)
                post_list.extend(ipage)  # extend list with all posts (50 every time)
                pbar.update(len(ipage))
                # start a new thread to download media
                if save_media:
                    img_df = pd.json_normalize(ipage)
                    img_name = img_df["shortcode"]  # name
                    img_link = img_df["display_url"]  # link
                    img_dict = dict(zip(img_name, img_link))
                    start_thread(download_images, args=[img_dict])
                # csv saving logic as pandas df
                if save_as == "csv":
                    pf = pd.json_normalize(post_list)
                    file_name = "{}{}{}.csv".format(out_dir, object_id_or_string, run_number)
                    pf.to_csv(file_name, index=False, encoding=out_encoding)
                # json saving logic
                elif save_as == "json":
                    # change to some index number or just keep the timestamp, but watch out for duplicates!
                    if run_number != "":
                        run_number_loop = "_" + str(float(datetime.datetime.now().timestamp())).replace(".", "") + "_" + run_number
                    else:
                        run_number_loop = "_" + str(float(datetime.datetime.now().timestamp())).replace(".", "")
                    file_name = "{}{}{}.json".format(out_dir, object_id_or_string, run_number_loop)
                    with open(file_name, 'w', encoding=out_encoding) as f:
                        ensure_ascii_true = False
                        if out_encoding == "ascii":
                            ensure_ascii_true = True
                        json.dump(idata, f, ensure_ascii=ensure_ascii_true)
                else:
                    raise RuntimeError('--save_as flag must be "csv" or "json"')
                print("File saved: iteration: {}".format(i))
                this_cursor = edge_to_media["page_info"]["end_cursor"]  # this_cursor is the next, still unscraped, page cursor
                # catch all versions of "no more pages available"
                # compare this and last cursor, just in case
                if this_cursor == last_cursor:
                    print("Last two cursors are the same ({}), finishing.".format(this_cursor))
                    return "no_more_page"
                # no next page and the cursor is None
                if not edge_to_media["page_info"]["has_next_page"] and this_cursor is None:
                    print("Successfully scraped until last page for {}".format(object_id_or_string))
                    open("{}{}_last_cursor.txt".format(out_dir, object_id_or_string), 'a').write("Last_Cursor_empty" + "\n")
                    return "no_more_page"
                # there is a next page but the cursor is empty
                if edge_to_media["page_info"]["has_next_page"] and this_cursor == "":
                    print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
                    open("{}{}_last_cursor.txt".format(out_dir, object_id_or_string), 'a').write("Last_Cursor_empty" + "\n")
                    return "no_more_page"
                # for --last_cursor, long pauses or a Jupyter shutdown: save only the last cursor
                open("{}{}_last_cursor.txt".format(out_dir, object_id_or_string), 'a').write(this_cursor + "\n")
                # alternatively just print last_cursor for every iteration
                # print(this_cursor)
                if len(post_list) > max_posts:
                    print("Maximum number of posts scraped: {}".format(len(post_list)))
                    return "no_more_page"
                # return completely if no more pages are available (has_next_page: False)
                if not edge_to_media["page_info"]["has_next_page"]:
                    if len(post_list) < max_posts:
                        print("Maybe you scraped too fast. Try setting a higher wait_between_requests.")
                        return "no_more_page"
                    else:
                        return "no_more_page"
                last_cursor = this_cursor
                i += 1
                time.sleep(wait_between_requests)  # take a nap
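# torsession() above returns None when a Tor exit is blocked and "no_more_page"
# once scraping is finished. A minimal sketch of the outer driver loop implied
# by those return values; the real script around torsession() may differ.
def run_scrape():
    while True:
        status = torsession()  # builds a fresh circuit on every call
        if status == "no_more_page":
            break  # finished, hit the post limit, or cannot page further
        # otherwise the end node was blocked: loop again to get a new circuit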
def tor_img_download_loop(i_img_dict):
    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess:
            print("Image download tor circuit built.")
            for i, (k, v) in tqdm(enumerate(i_img_dict.items())):
                download_img(sess, k, v)
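# download_img() is not shown in this snippet. A plausible sketch, assuming it
# fetches each display_url over the Tor session and names the file after the
# post shortcode; the real helper may differ.
def download_img(sess, shortcode, url):
    resp = sess.get(url, timeout=30)
    if resp.status_code == 200:
        with open("{}.jpg".format(shortcode), 'wb') as f:
            f.write(resp.content)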
# 1. Install the Tor browser
# 2. torpy: a pure-Python package for making requests over the Tor network
from torpy.http.requests import TorRequests

url = 'https://youtube.com'
with TorRequests() as tor_requests:
    with tor_requests.get_session() as sess:
        response = sess.get(url)
        # Caveat: response.text is not identical to what plain requests returns,
        # and it is noticeably slower.
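# To confirm traffic really leaves through a Tor exit node, compare the IP
# reported for a plain requests call with the one seen through the torpy
# session. A minimal sketch; httpbin.org/ip simply echoes the caller's IP.
import requests
from torpy.http.requests import TorRequests

direct_ip = requests.get('https://httpbin.org/ip').json()['origin']
with TorRequests() as tor_requests:
    with tor_requests.get_session() as sess:
        tor_ip = sess.get('https://httpbin.org/ip').json()['origin']
print('direct:', direct_ip, 'via tor:', tor_ip)  # the two addresses should differ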