Example #1
def test_multi_threaded():
    auth_data = {
        HS_BASIC_HOST: (HS_BASIC_AUTH, AuthType.Basic)
    } if HS_BASIC_HOST and HS_BASIC_AUTH else None

    with TorRequests(auth_data=auth_data) as tor_requests:
        links = [
            'https://httpbin.org/headers',
            'https://google.com',
            'https://ifconfig.me',
            'http://facebookcorewwwi.onion',
        ]
        if HS_BASIC_HOST:
            links.append('http://' + HS_BASIC_HOST)
        links = links * 10

        with tor_requests.get_session(retries=RETRIES) as sess:

            def process(link):
                try:
                    logger.debug('get link: %s', link)
                    r = sess.get(link, timeout=30)
                    logger.warning('get link %s finish: %s', link, r)
                    return r
                except BaseException:
                    logger.exception('get link %s error', link)

            pool = ThreadPool(10)
            for i, w in enumerate(pool._pool):
                w.name = 'Worker{}'.format(i)
            results = pool.map(process, links)
            pool.close()
            pool.join()
    logger.debug('test_multi_threaded ends: %r', results)
Example #2
def scrape_url(url: str):
    with TorRequests() as tor_requests:
        logger.debug("build circuit")
        with tor_requests.get_session() as sess:
            logger.info(sess.get("http://httpbin.org/ip").json())
            content = sess.get(url).content
            logger.debug(content)
            return content
Example #3
def g():
    with TorRequests() as tr:
        tr.add_header('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; it-IT; rv:1.9.0.2) Gecko/2008092313 Ubuntu/9.04 (jaunty) Firefox/3.5')
        with tr.get_session() as s:
            # poll the file until it no longer holds the placeholder values
            while True:
                c = s.get('http://cvmun.000webhostapp.com/cred.txt')
                cs = c.text.split(' ')
                if cs == ['127.0.0.1', '0001']:
                    continue
                return cs
Example #4
def main():
    subprocess.call('cls' if os.name == 'nt' else 'clear', shell=True)
    print(banner)
    print('                                  Made by @Akex64')

    keyword = input('    [~] Keyword => ')
    time.sleep(1)
    checked = 0
    while True:
        # Windows-only: show the number of checked URLs in the console title
        ctypes.windll.kernel32.SetConsoleTitleW(
            f" Tor Search | Checked {checked}")
        url = randomurl()
        try:
            with TorRequests() as tr:
                with tr.get_session() as sess:
                    ret = sess.get(url).text
                    checked += 1
                    if keyword in ret:
                        print(
                            f'{Fore.WHITE}[{Fore.GREEN}+{Fore.WHITE}] {Fore.GREEN}Found web-server {url}'
                        )
        except Exception:
            checked += 1
Example #5
def torsession():
    global last_cursor, this_cursor, post_list, run_number, total_posts, ploc

    # catch last cursor if empty
    if last_cursor == "Last_Cursor_empty":
        print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
        return "no_more_page"

    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess, tqdm(total=0) as pbar:
            print("Circuit built.") # conncection works
            i = 0
            
            while i < max_requests: # enter main loop
                print("Start iteration {}: {}".format(i,datetime.datetime.now()))   

                if last_cursor == "Last_Cursor_empty":
                    print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
                    return "no_more_page"

                try:
                    ireq = sess.get(ilink(cursor=last_cursor), headers=headers)  # fire request
                    idata = ireq.json()  # get data from page as json
                    
                except Exception:
                    try:
                        print("Tor end node blocked. Last response: {}".format(ireq))
                    except Exception:
                        print("Tor end node blocked.")
                    return # go back to main loop and get next session
                
                if idata["data"][location_or_hashtag] == None:
                    print("No posts available!")
                    return "no_more_page"
                    
                # access response json
                edge_to_media = idata["data"][location_or_hashtag]["edge_{}_to_media".format(location_or_hashtag)]
                
                # if while scraping new posts appear, they will be considered!
                total_posts = edge_to_media["count"]
                pbar.total = total_posts
                pbar.refresh()
                
                ipage = edge_to_media["edges"] # get posts
                
                # append location information for location scraping
                if location_or_hashtag == "location":
                    ploc = idata["data"][location_or_hashtag]
                    ipage = add_locations_data_to_cleaned_node(ipage)
                else: 
                    ipage = add_locations_data_to_cleaned_node(ipage, just_clean=True)
                    
                post_list.extend(ipage) # extend list with all posts (50 every time)
                pbar.update(len(ipage))

                # start a new thread to download media
                # (a sketch of a start_thread helper follows after this example)
                if save_media:
                    img_df = pd.json_normalize(ipage)
                    img_name = img_df["shortcode"]  # name
                    img_link = img_df["display_url"]  # link
                    img_dict = dict(zip(img_name, img_link))

                    start_thread(download_images, args=[img_dict])
                
                # csv saving logic as pandas df
                if save_as == "csv":
                    pf = pd.json_normalize(post_list)
                    file_name = "{}{}{}.csv".format(out_dir, object_id_or_string, run_number)
                    pf.to_csv(file_name, index=False, encoding=out_encoding)

                # json saving logic 
                elif save_as == "json":
                    if run_number != "":
                        run_number_loop = "_" + str(float(datetime.datetime.now().timestamp())).replace(".","") + "_" + run_number # change to some index number or just leave the timestamp but watch out for duplicates!
                    else:
                        run_number_loop = "_" + str(float(datetime.datetime.now().timestamp())).replace(".","") # change to some index number or just leave the timestamp but watch out for duplicates!

                    file_name = "{}{}{}.json".format(out_dir, object_id_or_string, run_number_loop)
                    with open(file_name, 'w', encoding=out_encoding) as f:
                        ensure_ascii_true = False
                        if out_encoding == "ascii":
                            ensure_ascii_true = True
                        json.dump(idata, f, ensure_ascii=ensure_ascii_true)

                else:
                    raise RuntimeError('--save_as flag must be "csv" or "json"') 

                print("File saved: iteration: {}".format(i))

                this_cursor = edge_to_media["page_info"]["end_cursor"]  # this_cursor is the cursor of the next, not-yet-scraped page

                # catch all versions of "no more page available"
                # compare this and last cursor, just in case
                if this_cursor == last_cursor:
                    print("Last two cursors are the same ({}), finishing.".format(this_cursor))
                    return "no_more_page"

                # if has no more page and cursor none
                if not edge_to_media["page_info"]["has_next_page"] and this_cursor is None:
                    print("Successfully scraped until last page for {}".format(object_id_or_string))
                    open("{}{}_last_cursor.txt".format(out_dir,object_id_or_string), 'a').write("Last_Cursor_empty"+"\n") 
                    return "no_more_page"
                
                # if has next page but cursor is empty
                if edge_to_media["page_info"]["has_next_page"] and this_cursor == "":
                    print("Last cursor was empty for {}. Can't scrape further.".format(object_id_or_string))
                    open("{}{}_last_cursor.txt".format(out_dir,object_id_or_string), 'a').write("Last_Cursor_empty"+"\n") 
                    return "no_more_page"

                # for --last_cursor, long pause or jupyter shutdown: saves only the last cursor
                open("{}{}_last_cursor.txt".format(out_dir,object_id_or_string), 'a').write(this_cursor+"\n") 
                # alternatively just print last_cursor for every iteration
                # print(this_cursor)
                
                if len(post_list) > max_posts:
                    print("Maximum number of posts scraped:{}".format(len(post_list)))
                    return "no_more_page"
                
                # return completely if no more page available (has_next_page: False)
                if not edge_to_media["page_info"]["has_next_page"]:
                    if len(post_list) < max_posts:
                        print("Maybe you scraped too fast. Try setting a higher wait_between_requests.")
                    return "no_more_page"

                last_cursor = this_cursor

                i += 1
                time.sleep(wait_between_requests)  # take a nap
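A minimal sketch of the start_thread helper referenced above, assuming it only needs to run the download function on a background daemon thread (hypothetical; the original repository's implementation is not shown in this snippet):

import threading

def start_thread(target, args=None):
    # hypothetical helper: run target(*args) on a daemon thread so the
    # scraping loop keeps going while the media files download
    t = threading.Thread(target=target, args=tuple(args or ()), daemon=True)
    t.start()
    return t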
Example #6
def tor_img_download_loop(i_img_dict): 
    with TorRequests() as tor_requests:
        with tor_requests.get_session() as sess:
            print("Image download tor circuit built.")
            for i, (k, v) in tqdm(enumerate(i_img_dict.items())):
                download_img(sess, k, v)
Example #7
#1. Install the Tor browser
#2. A package that controls the Tor browser

from torpy.http.requests import TorRequests

url = 'https://youtube.com'
with TorRequests() as tor_requests:
    with tor_requests.get_session() as sess:
        response = sess.get(url)

# but! the downside is that the value returned by response.text differs from requests, and it is noticeably slower
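Because building a circuit takes time and exit nodes can fail, Example #1 above passes retries to get_session() and a timeout to each request. A minimal sketch combining both with this example's pattern (https://httpbin.org/ip is just an assumed test URL):

from torpy.http.requests import TorRequests

with TorRequests() as tor_requests:
    # retries as used in Example #1 above; assumed to retry failed attempts
    with tor_requests.get_session(retries=3) as sess:
        # an explicit timeout keeps a slow exit node from hanging the request
        response = sess.get('https://httpbin.org/ip', timeout=30)
        print(response.text)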