def __init__(self):
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    self.proxies = get_proxies()

def get_page_from_url(self, url, selenium=False):
    if selenium:
        self.driver.get(url)
        return self.driver.page_source

    # Rotate through the proxy pool, retrying until one request succeeds.
    import utils
    proxies = utils.get_proxies()
    proxy_pool = cycle(proxies)
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    page = None
    for _ in range(10):
        proxy = next(proxy_pool)
        print(proxy)
        try:
            response = requests.get(url, headers=headers,
                                    proxies={"http": proxy, "https": proxy})
            page = response.text
            break
        except requests.exceptions.RequestException:
            print("Skipping. Connection error")
    return page

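# `get_proxies` itself is not shown in these snippets. A minimal sketch,
# assuming it scrapes a public proxy list into "host:port" strings (the real
# helper may instead read from a paid provider or a local file):
import requests
from lxml import html

def get_proxies(limit=10):
    # free-proxy-list.net publishes an HTML table of open proxies.
    page = html.fromstring(requests.get('https://free-proxy-list.net/').text)
    proxies = set()
    for row in page.xpath('//tbody/tr')[:limit]:
        # keep only HTTPS-capable entries (column 7 reads "yes")
        if row.xpath('.//td[7][contains(text(),"yes")]'):
            ip = row.xpath('.//td[1]/text()')[0]
            port = row.xpath('.//td[2]/text()')[0]
            proxies.add(f'{ip}:{port}')
    return list(proxies)
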
async def get_detail_by_mertial_id(id):
    params = {'id': id, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/channel/material?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.snssdk.com'
    headers['Referer'] = ('https://haohuo.snssdk.com/views/channel/material'
                          '?id=%s&origin_type=3030005&origin_id=0&new_source_type=5'
                          '&new_source_id=1&source_type=5&source_id=1&come_from=0') % id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

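# The coroutines above and below lean on a few helpers from `utils` that are
# not shown here. A minimal sketch of what `utils.aiohttp_get(url, headers,
# proxy)` could look like, assuming it returns the decoded JSON body (the
# actual helper in this codebase may differ):
import aiohttp

async def aiohttp_get(url, headers, proxy):
    # One-shot GET through an optional "http://host:port" proxy string.
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url, proxy=proxy) as response:
            return await response.json(content_type=None)
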
async def get_activity_by_id(activity_id, _):
    params = {'id': activity_id, '_': _, 'b_type_new': 0}
    url = "https://bolt.jinritemai.com/api/activity/detail?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Referer'] = 'https://bolt.jinritemai.com/h5/activity?id=%s' % activity_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_goods_by_id(goods_id):
    params = {'id': goods_id, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/product/fxgajaxstaticitem?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/product/item2?id=%s' % goods_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_kills():
    params = {'b_type_new': 0}
    url = "https://haohuo.snssdk.com/seckill/seckillMultiSessionList?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = ('https://haohuo.jinritemai.com/views/channel/seckill'
                          '?a=1&origin_type=3030005&origin_id=0&new_source_type=5'
                          '&new_source_id=1&source_type=5&source_id=1&come_from=0')
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_activity_goods(id, page):
    params = {'id': id, 'page': page, 'size': 10, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/channel/ajaxActivityGoods?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = ('https://haohuo.jinritemai.com/views/channel/flash'
                          '?a=1&origin_type=3030005&origin_id=0&new_source_type=5'
                          '&new_source_id=1&source_type=5&source_id=1&come_from=0')
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

def main():
    proxy_list = utils.get_proxies("proxies.txt")
    with open("config.json", "r") as f2:
        settings = json.loads(f2.read())
    delay = settings["delay"]
    webhook = settings["webhook"]
    threads = []
    for user in settings["users"]:
        t = threading.Thread(target=ig.instagram,
                             args=(user, delay, webhook, proxy_list))
        threads.append(t)
        t.start()
    # Wait for every monitor thread to finish.
    for t in threads:
        t.join()

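# This variant of get_proxies takes a file path, unlike the no-argument calls
# elsewhere. A minimal sketch, assuming proxies.txt holds one "host:port"
# entry per line (the real helper may parse a richer format):
def get_proxies(path="proxies.txt"):
    # Read the proxy list, skipping blank lines.
    with open(path, "r") as f:
        return [line.strip() for line in f if line.strip()]
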
async def get_first_goods_by_shop(shop_id, page):
    params = {
        'shop_id': shop_id,
        'page': page,
        'pageSize': 20,
        'b_type_new': 0,
    }
    url = 'https://haohuo.snssdk.com/shop/goodsList?' + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/shop/index?id=%s' % shop_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_goods_by_material_id(activity_id, _, material_id, page):
    params = {
        'material_id': material_id,
        'page': page,
        'size': 10,
        '_': _,
        'b_type_new': 0
    }
    url = "https://luban.snssdk.com/bolt/productlist?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://bolt.jinritemai.com'
    headers['Referer'] = 'https://bolt.jinritemai.com/h5/activity?id=%s' % activity_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_goods_by_campaign_id(campaign_id, page):
    params = {
        'campaign_id': campaign_id,
        'req_id': 1,
        'page': page,
        'pageSize': 10,
        'b_type_new': 0
    }
    url = "https://haohuo.snssdk.com/seckill/seckillCampaignGoodsList?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = ('https://haohuo.jinritemai.com/views/channel/seckill'
                          '?a=1&origin_type=3030005&origin_id=0&new_source_type=5'
                          '&new_source_id=1&source_type=5&source_id=1&come_from=0')
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_goods_by_category(cids, id, parentid, page):
    params = {
        'second_cid': cids,
        'type': 5,
        'sort': 1,  # sort by sales volume
        'page': page,
        'pageSize': 10
    }
    url = "https://haohuo.snssdk.com/productcategory/getList?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = ('https://haohuo.jinritemai.com/views/channel/categorychoose'
                          '?cids=%s&parent_id=%s&id=%s&fresh_come=undefined'
                          '&origin_type=3030005&origin_id=0&new_source_type=100'
                          '&new_source_id=0&source_type=100&source_id=0&come_from=0'
                          % (cids, parentid, id))
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

async def get_recommend_goods(page):
    params = {
        'cids': '',
        'page': page,
        'size': 10,
        'addActivity': 1,
        'app_id': 'undefined',
        'area_type': 5,
        'area_id': 0,
        'origin_type': 303,
        'b_type_new': 0
    }
    url = "https://haohuo.snssdk.com/channel/ajaxGetGoods?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/channel/list?origin_type=303'
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)

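# A hedged usage sketch: any of the coroutines above can be driven with
# asyncio.run. The goods id below is a placeholder, not a real product:
import asyncio

async def demo():
    # Fetch one product detail and the first page of recommendations.
    detail = await get_goods_by_id('1234567890')  # hypothetical id
    recommended = await get_recommend_goods(1)
    return detail, recommended

if __name__ == '__main__':
    print(asyncio.run(demo()))
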
def activate():
    if sys.argv[-1] == "reloaded":
        reload(os)
        reload(sys)
        return
    sys.argv.append("reloaded")
    from splunk import Intersplunk
    settings = dict()
    Intersplunk.readResults(settings=settings)
    session_key = settings['sessionKey']
    proxies = get_proxies(session_key)
    bin_dir = os.path.dirname(py_exec)
    path = bin_dir + os.pathsep + os.environ["PATH"]
    passed_envs = {
        "PATH": path,
        "SPLUNK_HOME": os.environ['SPLUNK_HOME']
    }
    if proxies:
        passed_envs['HTTP_PROXY'] = proxies['http']
        passed_envs['HTTPS_PROXY'] = proxies['https']
    os.execve(py_exec, ['python'] + sys.argv, passed_envs)

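# Passing HTTP_PROXY/HTTPS_PROXY through os.execve works because requests
# (via urllib's proxy detection) reads those environment variables by default.
# A minimal sketch of what the re-exec'd child can rely on (the URL is
# illustrative only):
import requests

def child_request():
    # No explicit proxies= argument needed; requests picks up
    # HTTP_PROXY/HTTPS_PROXY from the environment set by activate().
    return requests.get("https://example.com")
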
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[0]
    if SECRET_KEY:
        data_dir = Path('/data/')
        dump_dir = Path('/data/dump/')
    else:
        data_dir = root_dir / 'data'
        dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)

    # load book_download_urls
    book_download_urls = read(data_dir / 'book_download_urls.txt').splitlines()

    # remove any books that have already been downloaded
    book_download_urls = [
        'https://www.smashwords.com' + url for url in book_download_urls
        if not (data_dir / f'{get_book_id(url)}.txt').exists()
    ]

    if book_download_urls:
        # optionally keep only the first 500 (smashwords blocks the IP address
        # after 500 requests)
        # book_download_urls = book_download_urls[:500]

        # get headers (user-agents)
        headers = get_headers(root_dir / 'user-agents.txt')

        # initialize cache-controlled session
        session = CacheControl(Session())

        # get the books (concurrently)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            for nb_retry in count(1):
                # break if all book_download_urls successful
                if not book_download_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    LOGGER.warning(f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.')
                    break

                # maintain a list of failed downloads (for future retries)
                failed_book_download_urls = []

                proxies = get_proxies()

                # get the book_responses
                book_responses = list(
                    tqdm(executor.map(get, book_download_urls, repeat(session),
                                      cycle(headers), cycle(proxies)),
                         total=len(book_download_urls),
                         desc='Getting books'))

                # dump the book_responses
                dump(book_responses, 'book_responses.pkl', dump_dir=dump_dir)

                for book_url, book_r in zip(book_download_urls, book_responses):
                    if book_r is not None:
                        if book_r.status_code == 200:
                            book_r.encoding = 'utf-8'
                            # write the content to disk
                            write(book_r.content, data_dir / f'{get_book_id(book_url)}.txt')
                        else:
                            failed_book_download_urls.append(book_url)
                            LOGGER.warning(f'Request failed for {book_url}: status code [{book_r.status_code}]')
                    else:
                        LOGGER.warning(f"The request for the book_url '{book_url}' was None.")

                book_download_urls = failed_book_download_urls

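# The `get` callable mapped over the executor is not shown. A minimal sketch,
# assuming the signature implied by executor.map(get, urls, sessions, headers,
# proxies) and that failures surface as None (the real helper may differ):
def get(url, session, headers, proxy):
    # Fetch one book through a rotating proxy; None signals a failed attempt.
    try:
        return session.get(url, headers=headers, timeout=30,
                           proxies={'http': proxy, 'https': proxy})
    except Exception:
        return None
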
from splunk.rest import simpleRequest
from splunk import Intersplunk
import requests
import re
from splunk_logger import setup_logging
from utils import get_proxies

if __name__ == "__main__":
    logger = setup_logging()
    settings = dict()
    Intersplunk.readResults(settings=settings)
    session_key = settings['sessionKey']
    proxies = get_proxies(session_key)
    download_url = simpleRequest(
        "/servicesNS/nobody/pyden-manager/properties/pyden/download/url",
        sessionKey=session_key)[1]
    r = requests.get(download_url, proxies=proxies)
    version_pattern = r"""<a href\=\"\d(?:\.\d{1,2}){1,2}\/\"\>(?P<version>\d(?:\.\d{1,2}){1,2})"""
    all_versions = re.findall(version_pattern, r.text)
    compatible_versions = [
        version for version in all_versions
        if (version.startswith('2') and version > '2.7')
        or (version.startswith('3') and version > '3.5')
    ]
    # sometimes there are only pre-releases or release candidates,
    # so check each compatible version for a final release
    for version in compatible_versions:
        url = download_url.rstrip() + "%s/" % version
        logger.debug(url)
        r = requests.get(url, proxies=proxies)

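# `get_proxies(session_key)` is shared by this script and activate() above but
# is not shown. A minimal sketch, assuming the proxy URL lives in the app's
# conf file reachable over Splunk REST (the stanza and key names here are
# guesses, not the app's actual configuration):
from splunk.rest import simpleRequest

def get_proxies(session_key):
    # Read an optional proxy URL from the app's configuration.
    _, proxy_url = simpleRequest(
        "/servicesNS/nobody/pyden-manager/properties/pyden/proxy/url",  # hypothetical key
        sessionKey=session_key)
    if not proxy_url:
        return {}
    return {'http': proxy_url, 'https': proxy_url}
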
FULLY_PROCESSED_PATH = "../data/fully_processed.json"
MAX_PROD_NO = 50000
MAX_PAGES_PER_STORE = 1000
products_no = 0
VERBOSE = True
should_stop = False

if __name__ == "__main__":
    start = time()
    with open("product_sitemaps.json", "r") as json_file:
        data = json_file.read()
    json_data = json.loads(data)
    arr = [json_data["regular"], json_data["medium"], json_data["gigants"]]
    proxies = get_proxies("./proxies.txt")
    number_of_cpus = cpu_count() - 2
    print(f"The number of used CPUs is {number_of_cpus}.")
    for option in arr:
        res = Parallel(n_jobs=number_of_cpus)(
            delayed(process_one_store)(
                store_url,
                details,
                proxies,
                HEADERS,
                HEADERS2,
                MAX_PAGES_PER_STORE,
                FULLY_PROCESSED_PATH,
                VERBOSE,
            )
            # the source is truncated here; iterating the store mapping is the
            # likely continuation
            for store_url, details in option.items()
        )

import asyncio
import aiohttp
import datetime
import pandas as pd
import sys
from itertools import cycle
from utils import search_headers, nifty_headers, get_proxies, get_proxy_string

search_url = 'https://host-vdgrw7.api.swiftype.com/api/as/v1/engines/nifties-search/search'
ranked_stats_url = 'https://api.niftygateway.com//market/ranked-stats/'
proxies = get_proxies()
proxy_pool = cycle(proxies)
worker_count = 30
nifty_responses = []
final_nifties = []


async def fetch(nifty):
    """
    Get stats for given nifty object
    :param nifty: Nifty json object
    :return:
    """
    data = (
        f'{{"query":"","page":{{"current":1,"size":2}},'
        f'"filters":{{"all":[{{"contract_address":'
        f'["{nifty["unminted_nifty_obj"]["contractObj"]["contractAddress"]}"]}},'
        f'{{"nifty_type_that_created":"1"}},{{"currently_on_sale":"true"}}]}},'
        f'"sort":{{"price_in_cents":"asc"}}}}'
    )
    proxy = next(proxy_pool)
    async with aiohttp.ClientSession(headers=search_headers) as session:
        async with session.get(search_url, data=data,
                               proxy=get_proxy_string(proxy)) as response:
            # the snippet is truncated here; collecting the JSON body is the
            # likely continuation
            nifty_responses.append(await response.json())

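# `get_proxy_string` is imported from utils but not shown. A minimal sketch,
# assuming each proxy record is a dict with host/port and optional credentials
# (the real helper may use a different proxy record layout):
def get_proxy_string(proxy):
    # aiohttp expects a single "http://[user:pass@]host:port" URL.
    auth = ''
    if proxy.get('username'):
        auth = f"{proxy['username']}:{proxy['password']}@"
    return f"http://{auth}{proxy['host']}:{proxy['port']}"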