def __init__(self):
    self.headers = {
        'user-agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) '
            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
        )
    }
    self.proxies = get_proxies()
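These attributes are normally passed straight to the HTTP client. A minimal sketch of that usage, assuming `get_proxies()` (not shown here, taken to come from a `utils` module) returns a requests-style mapping such as `{'http': ..., 'https': ...}`; the `fetch` helper below is hypothetical:

import requests

from utils import get_proxies  # assumed source of the helper; assumed to return {'http': ..., 'https': ...}

HEADERS = {'User-Agent': 'Mozilla/5.0'}

def fetch(url):
    # Hypothetical usage of the headers/proxies pair set up in __init__ above.
    response = requests.get(url, headers=HEADERS, proxies=get_proxies(), timeout=10)
    response.raise_for_status()
    return response.text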
Example #2
    def get_page_from_url(self, url, selenium=False):
        if selenium:
            self.driver.get(url)
            page = self.driver.page_source
            return page

        headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
        req = urllib2.Request(url, headers=headers)
        page = urllib2.urlopen(req).read()
        print(page)

        # retry the request through rotating proxies
        import utils
        proxies = utils.get_proxies()
        proxy_pool = cycle(proxies)
        for i in range(1, 11):
            # get the next proxy from the pool
            proxy = next(proxy_pool)
            print(proxy)
            try:
                response = requests.get(url, proxies={"http": proxy, "https": proxy})
                print(response.json())
                page = response.text
                break
            except (requests.exceptions.RequestException, ValueError):
                print("Skipping. Connection error")
        return page
Example #3
async def get_detail_by_mertial_id(id):
    params = {'id': id, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/channel/material?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.snssdk.com'
    headers['Referer'] = 'https://haohuo.snssdk.com/views/channel/material?id=%s&origin_type=3030005&origin_id=0&new_source_type=5&new_source_id=1&source_type=5&source_id=1&come_from=0' % id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
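`utils.aiohttp_get` is not shown in these listings; the sketch below is a minimal, assumed implementation using aiohttp, where `proxy` is taken to be a single proxy URL string (or None) and the response body is returned as text:

import aiohttp

async def aiohttp_get(url, headers, proxy=None):
    # Hypothetical helper matching the calls above: one GET per invocation,
    # sent with the caller-supplied headers and an optional proxy URL.
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url, proxy=proxy) as response:
            return await response.text()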
Example #4
async def get_activity_by_id(activity_id, _):
    params = {'id': activity_id, '_': _, 'b_type_new': 0}
    url = "https://bolt.jinritemai.com/api/activity/detail?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Referer'] = 'https://bolt.jinritemai.com/h5/activity?id=%s' % activity_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #5
async def get_goods_by_id(goods_id):
    params = {'id': goods_id, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/product/fxgajaxstaticitem?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/product/item2?id=%s' % goods_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #6
async def get_kills():
    params = {'b_type_new': 0}
    url = "https://haohuo.snssdk.com/seckill/seckillMultiSessionList?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/channel/seckill?a=1&origin_type=3030005&origin_id=0&new_source_type=5&new_source_id=1&source_type=5&source_id=1&come_from=0'
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #7
async def get_activity_goods(id, page):
    params = {'id': id, 'page': page, 'size': 10, 'b_type_new': 0}
    url = "https://haohuo.snssdk.com/channel/ajaxActivityGoods?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/channel/flash?a=1&origin_type=3030005&origin_id=0&new_source_type=5&new_source_id=1&source_type=5&source_id=1&come_from=0'
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #8
def main():
    proxy_list = utils.get_proxies("proxies.txt")
    with open("config.json", "r") as f2:
        settings = json.loads(f2.read())
        delay = settings["delay"]
        webhook = settings["webhook"]
        threads = []
        for user in settings["users"]:
            t = threading.Thread(target=ig.instagram,
                                 args=(user, delay, webhook, proxy_list))
            threads.append(t)
            t.start()
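`get_proxies` itself is not included in these listings. A minimal file-backed sketch consistent with the `utils.get_proxies("proxies.txt")` call above, assuming one `host:port` entry per line:

def get_proxies(path="proxies.txt"):
    # Hypothetical implementation: read one "host:port" proxy per line, skipping blanks.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]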
Example #9
async def get_first_goods_by_shop(shop_id, page):
    params = {
        'shop_id': shop_id,
        'page': page,
        'pageSize': 20,
        'b_type_new': 0,
    }
    url = 'https://haohuo.snssdk.com/shop/goodsList?' + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/shop/index?id=%s' % shop_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #10
async def get_goods_by_material_id(activity_id, _, material_id, page):
    params = {
        'material_id': material_id,
        'page': page,
        'size': 10,
        '_': _,
        'b_type_new': 0
    }
    url = "https://luban.snssdk.com/bolt/productlist?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://bolt.jinritemai.com'
    headers['Referer'] = 'https://bolt.jinritemai.com/h5/activity?id=%s' % activity_id
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #11
async def get_goods_by_campaign_id(campaign_id, page):
    params = {
        'campaign_id': campaign_id,
        'req_id': 1,
        'page': page,
        'pageSize': 10,
        'b_type_new': 0
    }
    url = "https://haohuo.snssdk.com/seckill/seckillCampaignGoodsList?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/channel/seckill?a=1&origin_type=3030005&origin_id=0&new_source_type=5&new_source_id=1&source_type=5&source_id=1&come_from=0'
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #12
async def get_goods_by_category(cids, id, parentid, page):
    params = {
        'second_cid': cids,
        'type': 5,
        'sort': 1,  # sort by sales volume
        'page': page,
        'pageSize': 10
    }
    url = "https://haohuo.snssdk.com/productcategory/getList?" + urlencode(
        params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/views/channel/categorychoose?cids=%s&parent_id=%s&id=%s&fresh_come=undefined&origin_type=3030005&origin_id=0&new_source_type=100&new_source_id=0&source_type=100&source_id=0&come_from=0' % (cids, parentid, id)
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #13
async def get_recommend_goods(page):
    params = {
        'cids': '',
        'page': page,
        'size': 10,
        'addActivity': 1,
        'app_id': 'undefined',
        'area_type': 5,
        'area_id': 0,
        'origin_type': 303,
        'b_type_new': 0
    }
    url = "https://haohuo.snssdk.com/channel/ajaxGetGoods?" + urlencode(params)
    headers = utils.get_defind_headers()
    headers['User-Agent'] = utils.random_agent()
    headers['Origin'] = 'https://haohuo.jinritemai.com'
    headers['Referer'] = 'https://haohuo.jinritemai.com/channel/list?origin_type=303'
    proxy = utils.get_proxies()
    return await utils.aiohttp_get(url, headers, proxy)
Example #14
def activate():
    if sys.argv[-1] == "reloaded":
        reload(os)
        reload(sys)
        return

    sys.argv.append("reloaded")
    from splunk import Intersplunk
    settings = dict()
    Intersplunk.readResults(settings=settings)
    session_key = settings['sessionKey']
    proxies = get_proxies(session_key)
    bin_dir = os.path.dirname(py_exec)
    path = bin_dir + os.pathsep + os.environ["PATH"]
    passed_envs = {
        "PATH": path,
        "SPLUNK_HOME": os.environ['SPLUNK_HOME']
    }
    if proxies:
        passed_envs['HTTP_PROXY'] = proxies['http']
        passed_envs['HTTPS_PROXY'] = proxies['https']
    os.execve(py_exec, ['python'] + sys.argv, passed_envs)
Example #15
def main():
    # create dirs
    root_dir = Path(__file__).resolve().parents[0]
    if SECRET_KEY:
        data_dir = Path('/data/')
        dump_dir = Path('/data/dump/')
    else:
        data_dir = root_dir / 'data'
        dump_dir = root_dir / 'dump'
    mkdirs(data_dir, dump_dir)

    # load book_download_urls
    book_download_urls = read(data_dir / 'book_download_urls.txt').splitlines()

    # remove any books that have already been downloaded
    book_download_urls = [
        'https://www.smashwords.com' + url for url in book_download_urls
        if not (data_dir / f'{get_book_id(url)}.txt').exists()
    ]

    if book_download_urls:
        # optionally cap at the first 500 (smashwords blocks the IP address after
        # 500 requests); the [:500] slice is left disabled here
        book_download_urls = book_download_urls  # [:500]

        # get headers (user-agents)
        headers = get_headers(root_dir / 'user-agents.txt')

        # initialize cache-controlled session
        session = CacheControl(Session())

        # get the books (concurrently)
        with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
            for nb_retry in count(1):
                # break if all book_download_urls successful
                if not book_download_urls:
                    break

                # break if max number of retries exceeded
                if nb_retry > NB_RETRIES:
                    LOGGER.warning(
                        f'Could not download {len(book_download_urls)} books after {NB_RETRIES} retries.'
                    )
                    break

                # maintain a list of failed downloads (for future retries)
                failed_book_download_urls = []

                proxies = get_proxies()

                # get the book_responses
                book_responses = list(
                    tqdm(executor.map(get, book_download_urls, repeat(session),
                                      cycle(headers), cycle(proxies)),
                         total=len(book_download_urls),
                         desc='Getting books'))

                # dump the book_responses
                dump(book_responses, 'book_responses.pkl', dump_dir=dump_dir)

                for book_url, book_r in zip(book_download_urls,
                                            book_responses):
                    #print("Book content: {}".format(book_r.content))
                    if book_r is not None:
                        if book_r.status_code == 200:
                            book_r.encoding = 'utf-8'

                            # write the content to disk
                            write(book_r.content,
                                  data_dir / f'{get_book_id(book_url)}.txt')
                        else:
                            failed_book_download_urls.append(book_url)
                            LOGGER.warning(
                                f'Request failed for {book_url}: status code [{book_r.status_code}]'
                            )
                    else:
                        LOGGER.warning(
                            f"The request for the book_url '{book_url}' was None."
                        )

                book_download_urls = failed_book_download_urls
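The `get` worker mapped over the URLs above is not shown. A minimal sketch, assuming its parameters follow the order passed to `executor.map` (url, session, headers, proxy), that each proxy is a `host:port` string, and that it returns the response object or `None` on failure, as the loop above expects:

import requests

def get(url, session, headers, proxy):
    # Hypothetical worker: one request per book URL through the shared cached session,
    # with a rotating user-agent header and proxy; None signals the caller to retry later.
    try:
        return session.get(url,
                           headers=headers,
                           proxies={'http': proxy, 'https': proxy},
                           timeout=30)
    except requests.exceptions.RequestException:
        return None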
Example #16
from splunk.rest import simpleRequest
from splunk import Intersplunk
import requests
import re
from splunk_logger import setup_logging
from utils import get_proxies

if __name__ == "__main__":
    logger = setup_logging()
    settings = dict()
    Intersplunk.readResults(settings=settings)
    session_key = settings['sessionKey']
    proxies = get_proxies(session_key)
    download_url = simpleRequest(
        "/servicesNS/nobody/pyden-manager/properties/pyden/download/url",
        sessionKey=session_key)[1]
    r = requests.get(download_url, proxies=proxies)
    version_pattern = r"""<a href\=\"\d(?:\.\d{1,2}){1,2}\/\"\>(?P<version>\d(?:\.\d{1,2}){1,2})"""
    all_versions = re.findall(version_pattern, r.text)
    # logger.debug(all_versions)
    # compare versions numerically; a plain string comparison breaks for
    # two-digit minor versions such as 3.10
    compatible_versions = [
        version for version in all_versions
        if (version.startswith('2') and tuple(map(int, version.split('.'))) > (2, 7))
        or (version.startswith('3') and tuple(map(int, version.split('.'))) > (3, 5))
    ]
    # logger.debug(compatible_versions)
    # sometimes there are only pre-releases or release candidates, so check each compatible version for a final release
    for version in compatible_versions:
        url = download_url.rstrip() + "%s/" % version
        logger.debug(url)
        r = requests.get(url,
Example #17
FULLY_PROCESSED_PATH = "../data/fully_processed.json"

MAX_PROD_NO = 50000
MAX_PAGES_PER_STORE = 1000
products_no = 0
VERBOSE = True
should_stop = False


if __name__ == "__main__":
    start = time()
    with open("product_sitemaps.json", "r") as json_file:
        data = json_file.read()
    json_data = json.loads(data)
    arr = [json_data["regular"], json_data["medium"], json_data["gigants"]]
    proxies = get_proxies("./proxies.txt")
    number_of_cpus = cpu_count() - 2
    print(f"The number of used CPUs is {number_of_cpus}.")

    for option in arr:
        res = Parallel(n_jobs=number_of_cpus)(
            delayed(process_one_store)(
                store_url,
                details,
                proxies,
                HEADERS,
                HEADERS2,
                MAX_PAGES_PER_STORE,
                FULLY_PROCESSED_PATH,
                VERBOSE,
            )
Example #18
import asyncio
import aiohttp
import datetime
import pandas as pd
import sys
from itertools import cycle

from utils import search_headers, nifty_headers, get_proxies, get_proxy_string

search_url = 'https://host-vdgrw7.api.swiftype.com/api/as/v1/engines/nifties-search/search'
ranked_stats_url = 'https://api.niftygateway.com//market/ranked-stats/'
proxies = get_proxies()
proxy_pool = cycle(proxies)
worker_count = 30
nifty_responses = []
final_nifties = []


async def fetch(nifty):
    """
    Get stats for given nifty object
    :param nifty: Nifty json object
    :return:
    """
    data = f'{{"query":"","page":{{"current":1,"size":2}},"filters":{{"all":[{{"contract_address":["{nifty["unminted_nifty_obj"]["contractObj"]["contractAddress"]}"]}},{{"nifty_type_that_created":"1"}},{{"currently_on_sale":"true"}}]}},"sort":{{"price_in_cents":"asc"}}}}'
    proxy = next(proxy_pool)

    async with aiohttp.ClientSession(headers=search_headers) as session:
        async with session.get(search_url,
                               data=data,
                               proxy=get_proxy_string(proxy)) as response: