Code Example #1
File: urlfeeder.py  Project: F3DS/f3ds
def main():
    minnap = 200
    maxnap = 800
    scale = 1000.0
    prefix = ''

    if len(sys.argv) < 2:
        with open(path.join(thisdir, 'args.txt'), 'rU') as argslist:
            lines = argslist.readlines()
            lines[:] = [line.replace('\r', ' ').replace('\n', ' ') for line in lines]
            argline = ''.join(lines)
            sys.argv = [sys.argv[0]] + argline.split()

    optlist, args = getopt.getopt(sys.argv[1:], 'u:d:m:M:s:i')
    options = dict(optlist)

    if '-u' not in options:
        print_usage()
        return
    nap = False
    if '-m' in options or '-M' in options or '-s' in options:
        nap = True
    UserAgent.retrieve(options['-u'], nap=nap, minnap=minnap, maxnap=maxnap,
                       napscale=scale, interactive='-i' in options)
    dbutils.export_csv(dbutils.dump(), description=options['-d'])
Code Example #2
File: Core.py  Project: gray-zeng/Python
def SendReq(start_url):
    request = urllib2.Request(
        url = start_url,
        headers = {'User-Agent':UserAgent.getAgent()}
    )
    response = urllib2.urlopen(request)
    return response.read()
Code Example #3
async def _action_combo_get_page_content(url, cookies_dir='data/cookies/'):
    try:
        # work out which domain the url belongs to
        parsed_uri = urlparse(url)
        cookies_file = "".join([cookies_dir, parsed_uri.netloc, "cookie"])
        my_cookie_file = DataFile.read_file_intostr(cookies_file)
        browser = await launch({
            "executablePath": "chromium-browser",
            "args": ["--no-sandbox"]
        })
        page = await browser.newPage()
        # load saved cookies, if any
        if (len(my_cookie_file) > 0):
            my_cookie_object = json.loads(my_cookie_file)
            print("".join(
                ["Load ",
                 str(len(my_cookie_object)), " cookie item(s)."]))
            for row in my_cookie_object:
                await page.setCookie(row)
        # set the User-Agent
        ua_box = UserAgent.UserAgentBox()
        await page.setUserAgent(ua_box.wap_normal_user)
        await page.goto(url)
        new_cookie = await page.cookies()
        json_cookie = json.dumps(new_cookie)
        res = await action_get_page_content(page)
        DataFile.write_full_file(cookies_file, json_cookie)
        await browser.close()
        return res
    except Exception as e:
        traceback.print_exc()
        return ""
Code Example #4
File: Check_Proxy.py  Project: MOHQ-TM/InstaCrack
def Check():
    try:
        for i in proxy:
            i = i.strip()
            i = i.split(":")
            ip = i[0]
            port = i[1]
            addr = dict(http=f"http://{ip}:{port}",
                        https=f"https://{ip}:{port}")
            head = {"User-Agent": users.UserAgent()}
            try:
                req = r.get(url, proxies=addr, headers=head, timeout=6)
            except Exception as e:
                if "HTTPTunel" in str(e):
                    continue
                else:
                    print(f"{c.R}Error{c.C}!!!{c.W} {e}\n")
                    exit()
            if req.status_code == 200:
                print(
                    f"{c.C}Hit{c.R}_{c.C}Proxy {c.R}=> {c.C}{ip}:{port}{c.W}\n"
                )
                with open("Hit.txt", "a+") as files:
                    files.write(f"{ip}:{port}\n")
            else:
                print(
                    f"{c.C}Not{c.R}_{c.C}Found {c.R}=> {c.C}{ip}:{port}{c.W}\n"
                )
    except Exception as e:
        print(f"{c.R}Error{c.C}!!!{c.W} {e}\n")
        exit()
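Check() relies on several module-level names that this snippet does not define: r (requests), c (a colour-code module), users (the project's UserAgent module), url (the probe target) and proxy (a list of ip:port strings). A rough sketch of the setup it assumes, modelled on the Crack.py snippet from the same project (Code Example #20); the proxy-list file name and probe URL are assumptions:

import requests as r
import Color as c          # project-local colour constants
import UserAgent as users  # project-local User-Agent helper

url = "https://www.instagram.com/"               # probe target (assumption)
proxy = open("Proxy_List.txt", "r").readlines()  # one ip:port entry per line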
Code Example #5
def screen(url, select):
    headers = UserAgent.get_headers()  # randomly pick a headers dict
    html = requests.get(url=url, headers=headers)
    html.encoding = 'gbk'  # the site's encoding
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)
Code Example #6
def request(req_url):
    #time.sleep(2)
    useragent = {'user-agent': UserAgent.randomUserAgent()}
    req = requests.get(req_url, headers=useragent)
    req.raise_for_status()  # raise on HTTP error status codes
    req.encoding = req.apparent_encoding
    return req
Code Example #7
def post_pan(form_action, form_post):  # submit the form data and return an html page
    post_data = {'e_secret_key': form_post}
    ua = {'user-agent': UserAgent.randomUserAgent()}
    post_html = requests.post(form_action, headers=ua, data=post_data)
    post_html.raise_for_status()  # raise on HTTP error status codes
    post_html.encoding = post_html.apparent_encoding
    return post_html
Code Example #8
File: spider.py  Project: zbwbb/StreetPat_Ajax
def get_page_detail(url):
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
Code Example #9
def spider_home():
    """
    获取物品的url以及标题价格
    :return: 返回物品编码
    """
    shop = input("请输入你要搜索的商品:")

    global headers

    headers = UserAgent.get_headers()  # randomly pick a headers dict

    url = 'https://search.jd.com/Search?keyword={shop}&enc=utf-8&wq=%E5%B0%8F&pvid=469d5d51a3184cc9a053124dc020b31f'.format(
        shop=shop)

    try:
        r = requests.get(url=url, headers=headers).content

        content = le.HTML(r)

        href = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div[1]/a/@href')

        price = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/strong/i/text()')

        title_01 = content.xpath(
            '//*[@id="J_goodsList"]/ul/li[1]/div/div/a/em/text()')

        title = [x.strip() for x in title_01
                 if x.strip() != '']  # extract the title, dropping extra spaces and newlines

        re_01 = re.compile(r'\d+')

        number = re_01.findall(str(href))

        shop_price_01 = "".join(price)
        print("Product price: " + shop_price_01)
        # for shop_price in price:
        #     print("Product price: " + shop_price)

        global shop_title  # module-level product title, used later to rename the output file
        shop_title_01 = "".join(title)
        print("Product title: " + shop_title_01)
        # for shop_title in title:
        #     print("Product title: " + shop_title)

        for index in href:
            global href_shop
            href_shop = 'http:' + index
            print(href_shop)

        for num in number:
            # print(num)
            return num

    except Exception:
        print('Scraping failed')
Code Example #10
File: spider.py  Project: zbwbb/Request_re_File
def get_one_page(url):
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException as e:
        print(e.response)
        return None
Code Example #11
File: spider.py  Project: zbwbb/StreetPat_Ajax
def download_image(url):
    print("正在下载。。。" + url)
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            # binary content
            save_image(response.content)
        return None
    except RequestException:
        return None
Code Example #12
def creatTopic(data):
    ua = UserAgent()
    user_agent = {
        'User-agent': ua.random,
        'Referer': 'https://dealbub.com/',
        'Content-type': 'content_type_value'
    }
    session = Session()
    session.head('https://dealbub.com/')
    data = topicContect()
    response = session.post(url='https://alisdeals.com/posts/',
                            headers=user_agent,
                            data=data)
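The UserAgent class here behaves like the one in the fake_useragent package, whose random property returns a different browser UA string on each access, while Session comes from requests. A minimal, self-contained sketch of that pattern (the target URL is a placeholder):

from fake_useragent import UserAgent
from requests import Session

ua = UserAgent()
session = Session()
response = session.get("https://example.com/",              # placeholder URL
                       headers={"User-Agent": ua.random})   # fresh UA string per access
print(response.status_code)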
Code Example #13
File: spider.py  Project: zbwbb/StreetPat_Ajax
def get_page(offset, keywords):
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keywords,
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'from': 'gallery'
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, headers=UserAgent.get_user_agent())
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
Code Example #14
def screenPage(url, select):
    html = requests.get(url=url, headers=UserAgent.get_headers())
    html.encoding = 'gbk'
    html = html.text
    soup = BeautifulSoup(html, 'lxml')
    return soup.select(select)[0].next_sibling.text
Code Example #15
File: gt.py  Project: ShallWeiZeng/Code
            res = res + str(i) + "=" + str(cookie[i]) + ";"
    res = res[0:-1]
    return res


times = 0
success = 0
ipSets = []
dupCount = 0
while True:
    # print(getTrace10(125))
    # exit(0)
    try:
        session = requests.Session()
        cookie = {}
        userAgent = UserAgent.getPCUserAgent()
        p = getProxy()
        p = json.loads(p)
        print(p[0])
        if p[0] in ipSets:
            dupCount += 1
            print("duplicated IP " + str(p[0]))
            print("dupCount : " + str(dupCount))
        else:
            ipSets.append(p[0])
        proxy = {"http": +str(p[0]), "https": "" + str(p[0])}

        header = getHeader()
        header['User-Agent'] = userAgent
        header['Cookie'] = getCookie()
        timeStamp = int(round(time.time() * 1000))
Code Example #16
File: WebProxy.py  Project: zhoulinfei/ScrapyDemo
def get_user_agent():
    """ 获取头协议 """
    user_agent = {'User-Agent': userAgent.get_user_agent()}
    return user_agent
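The project-local userAgent helper module itself is not shown in these snippets. A minimal sketch of what a get_user_agent() function like the one used here and in Code Example #18 might look like, picking at random from a small hard-coded pool (the UA strings are only illustrative):

import random

# Illustrative pool; a real helper would keep a larger, regularly updated list.
_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
]

def get_user_agent():
    """Return a randomly chosen User-Agent string."""
    return random.choice(_USER_AGENTS)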
Code Example #17
File: jd_home.py  Project: luobodage/-
import scrapy
import json
import os
import time
import random
import jd_home1
import jd_main
import UserAgent

number = jd_home1.spider_home()  # get the product id
comment_file_path = '../../../id.txt'  # file storage path
headers = UserAgent.get_headers()  # randomly pick request headers


class JdHomeSpider(scrapy.Spider):
    name = 'jd_home'

    # allowed_domains = [' ']
    # start_urls = ['http:// /']

    def start_requests(self):
        number_page = 10
        try:
            for page in range(1, number_page):
                url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId={number}' \
                      '&score=0&sortType=5&page={page}&pageSize=10&isShadowSku=0&fold=1'.format(
                    page=page,
                    number=number
                )
                yield scrapy.Request(
                    url=url,
Code Example #18
File: WebProxy.py  Project: lanxinxichen/ScrapyDemo
def get_user_agent():
    """ 获取头协议 """
    user_agent = {"User-Agent": userAgent.get_user_agent()}
    return user_agent
Code Example #19
File: crawler.py  Project: amansachdev/fix
import telegram
import sys
import proxies
from scraper_api import ScraperAPIClient
from proxies import random_proxy
from copy import copy
from lxml import html
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from datetime import date, datetime, timedelta
client = ScraperAPIClient('1911aafa4321c1afdbff5112a4d6685e')
#iplist = ['162.208.48.84:8118','165.138.4.41:8080']
#proxies = {'https':random.choice(iplist) }
#print(proxies)

ua = UserAgent.UserAgent()
#imported_proxy = random_proxy.random_proxies()
#print(imported_proxy)
intervalTimeBetweenCheck = 0
dateIndex = datetime.now()
emailinfo = {}

IFTTT_Key = ""
IFTTT_EventName = ""



# msg_content format
# msg_content['Subject'] = 'Subject'
# msg_content['Content'] = 'This is a content'
def isbotalive():
Code Example #20
File: Crack.py  Project: MOHQ-TM/InstaCrack
#Models
import requests as r, Color as c, UserAgent as usr, Logo
#Headers
headers = {
    'Host': 'www.instagram.com',
    'User-Agent': usr.UserAgent(),
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Referer': 'https://www.instagram.com/',
    'X-CSRFToken': '',
    'X-Instagram-AJAX': '1',
    'Content-Type': 'application/x-www-form-urlencoded',
    'X-Requested-With': 'XMLHttpRequest',
    'Content-Length': '',
    'Cookie': '',
    'Connection': 'keep-alive'
}
#####....

#Globals Values
cambo = open(input(f"{c.C}Cambo{c.R}_{c.C}List {c.R}=> {c.W}"))
########
proxy = open(input(f"{c.C}Proxy{c.R}_{c.C}List {c.R}=> {c.W}"),
             "r").readlines()


#Proxies
def Proxies(num):
    pr = proxy[num].strip("\n")
    return pr