Example #1
 def __init__(self):
     self.UA = UserAgent()
Example #2
 def __init__(self,crawler):
     super(RandomUserAgentMiddlware,self).__init__()
     self.ua = UserAgent()
Example #3
from random import shuffle
from fake_useragent import UserAgent
from livestreamer import Livestreamer
import linecache
import sys

from threading import Thread

channel_url = ""
proxies_file = "Proxies_txt/good_proxy.txt"
processes = []
max_nb_of_threads = 1000

all_proxies = []
nb_of_proxies = 0

# Create the Livestreamer session used for requests
ua = UserAgent()
session = Livestreamer()
session.set_option("http-headers", {
    'User-Agent': ua.random,
    "Client-ID": "ewvlchtxgqq88ru9gmfp1gmyt6h2b93"
})


def print_exception():
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, f.f_globals)
    print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno,
                                                       line.strip(), exc_obj))
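
Not part of the original snippet: a minimal sketch of how the Livestreamer session configured above is typically queried for a channel's streams; the helper name, channel handling, and "worst"-quality choice are illustrative assumptions.

def open_worst_stream(channel_name):
    # Resolve the available stream qualities for a Twitch channel and open the
    # lowest-bandwidth one; returns a file-like stream object, or None if offline.
    streams = session.streams("twitch.tv/" + channel_name)
    if "worst" in streams:
        return streams["worst"].open()
    return None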
Example #4
    else:
        proxies = re.findall(
            re.compile('<td>([\d.]+)</td>'),
            str(requests.get('https://www.sslproxies.org/').content))
        proxies = [
            '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))
        ]
    print('%d proxies successfully loaded!' % len(proxies))
    proxy = Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    if args.user_agent:
        if path.isfile(args.user_agent):
            user_agents = list(
                filter(None,
                       open(args.user_agent, 'r').read().split('\n')))
        else:
            user_agents = [args.user_agent]
    else:
        user_agents = UserAgent()
    for i in range(args.threads):
        t = Thread(target=bot, args=(i, args.url))
        t.daemon = True
        t.start()
        sleep(uniform(2.0, 4.0))
    stdin.read(1)
    exit(0)
except KeyboardInterrupt:
    exit(0)
except:
    exit(1)
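Example #5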
def get_header():
    ua = UserAgent(verify_ssl=False)
    return {'User-Agent': ua.random}
Example #6
    if args.proxies:
        proxies = open(args.proxies, 'r').read().split('\n')
    else:
        proxies = re.findall(
            re.compile('<td>([\d.]+)</td>'),
            str(requests.get('https://free-proxy-list.net/').content))
        proxies = [
            '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))
        ]
    print('%d proxies successfully loaded!' % len(proxies))
    proxy = Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    if args.user_agents:
        user_agents = open(args.user_agents, 'r').read().split('\n')
    else:
        agent = UserAgent()
    if args.driver == 'chrome':
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--mute-audio')
    else:
        firefox_profile = webdriver.FirefoxProfile()
        firefox_profile.set_preference('media.volume_scale', '0.0')
    for i in range(args.threads):
        t = Thread(target=bot, args=(args.url, ))
        t.daemon = True
        t.start()
        sleep(uniform(2.0, 4.0))
except KeyboardInterrupt:
    _exit(0)
except Exception:
    print_exc()
Example #7
    def get_df(self):
        print('get data from services...')
        sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id')
        serv_df = sfb_df[sfb_df['type'] == 'services']

        list_url = serv_df['URL'].values
        final_df = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        #mgts
        n = 0
        url = list_url[n]
        print(url)
        html = requests.get(url, headers={
            'User-Agent': UserAgent().chrome
        }).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_list = soup.findAll('div', {'class': 'slider_slide'})  # replace the 0
        for price_elem in price_list:
            if price_elem.findAll('div',
                                  {'class': 'texts'})[0].text == 'Безлимитный':
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'services'
                id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
                price_dict['category_id'] = id_n
                price_dict['category_title'] = serv_df['cat_title'].loc[
                    price_dict['category_id']]
                price_dict['type'] = 'services'
                price_dict['site_title'] = price_elem.findAll(
                    'div', {'class': 'texts'})[0].text
                price_dict['price_new'] = int(
                    price_elem.findAll('div',
                                       {'class': 'slider_price_val'})[0].text)
                price_dict['price_old'] = ''
                price_dict['site_unit'] = price_elem.findAll(
                    'div', {'class': 'slider_price_rub1'
                            })[0].text + '/' + price_elem.findAll(
                                'div', {'class': 'slider_price_rub2'})[0].text
                price_dict['site_link'] = url
                final_df = final_df.append(price_dict, ignore_index=True)
                break

        # Bathhouse wash, shared section, single ticket  http://legkiipar.ru/menraz.html
        try:
            n = 1
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')  # weekdays from 08:00 to 22:00
            pattern = re.compile(r'Будние дни')
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'services'
            price_dict['type'] = 'services'
            price_dict['site_title'] = soup(text=pattern)[0]
            price_1 = soup.findAll('span', {'class': 'стиль6'})
            price_dict['price_new'] = re.findall('\d+', price_1[1].text)[0]
            price_dict['price_old'] = ''
            price_dict['site_unit'] = re.findall('\d+ часа',
                                                 price_1[4].text[:-1])[0]
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(url)].index[0])
            price_dict['category_title'] = serv_df['cat_title'].loc[
                price_dict['category_id']].values[0]
            price_dict['site_link'] = url
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # Bathhouse wash, shared section, single ticket  http://banya-lefortovo.ru/price.html
        n = 2
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        pattern = re.compile(r'Русская общая баня')
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['type'] = 'services'
        price_dict['price_new'] = int(
            re.findall('\d+',
                       re.findall('\d+ рублей',
                                  soup(text=pattern)[0])[0])[0])
        price_dict['price_old'] = ''
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             soup(text=pattern)[0])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Bathhouse wash, shared section, single ticket  https://rzhevskie-bani.ru/rb/bani.html
        n = 3
        price_dict = dict()
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['price_new'] = int(
            re.findall('\d+',
                       soup.findAll('td', {'class': 'price'})[0].text)[0])
        pattern = re.compile(r'Стоимость')
        soup.findAll('td')
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['type'] = 'services'
        price_dict['site_unit'] = re.findall('(\d+.*\d часа)',
                                             soup(text=pattern)[0][-9:])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Bathhouse wash, shared section, single ticket  http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad
        n = 4
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        try:
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                timeout=10).content
        except:
            proxy = get_proxy(url)
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                proxies=proxy).content
        soup = BeautifulSoup(html, 'lxml')
        price_div = soup.findAll('div', {'class': 'price-head'})[0]
        price_dict['price_new'] = int(
            re.findall('\d+',
                       price_div.findAll('span',
                                         {'class': 'price'})[0].text)[0])
        price_dict['price_old'] = ''
        price_dict['site_title'] = price_div.find('p').text.replace(
            '\xa0', ' ')
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             price_dict['site_title'])[0]
        price_dict['type'] = 'services'
        price_dict['site_link'] = url
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        final_df = final_df.append(price_dict, ignore_index=True)

        # Heel tip replacement, per pair  https://masterskaya-obuvi.ru/tseny
        '''
        n=5
        price_dict=dict()
        price_dict['date']=Global().date
        price_dict['site_code']='services'
        url=list_url[n]
        print(url)
        html=requests.get(url).content#, headers={'User-Agent': UserAgent().chrome}
        soup=BeautifulSoup(html, 'lxml')
        price_dict['category_id']=int(serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('износоустойчивой резины',elem.text)!=[]:
                price_div=elem
                price_dict['site_title']=re.findall('[А-Яа-яёз(). ]+',elem.text)[0]
                price_dict['site_unit']=re.findall('[А-Яа-яёз(). ]+',elem.text)[1]
                price_dict['price_new']=int(price_div.findAll('td',{'width':"356"})[0].text)
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_link']=url
                break

        final_df=final_df.append(price_dict,ignore_index=True)
        '''

        # Heel tip replacement, per pair  https://masterskaya-obuvi.ru/tseny
        n = 6
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('эконом', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = self.wspex_space(
                    re.findall(
                        '[А-Яа-яёз(). ]+',
                        price_div.findAll('td', {'align': 'left'})[0].text)[0])
                price_text = price_div.findAll('strong')[0].text
                price_dict['price_new'] = int(re.findall('\d+', price_text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = re.findall(
                    '\([А-Яа-я]*\)', price_dict['site_title'])[0][1:-1]
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Ticket for 1 ride - Mosgortrans
        n = 7
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        #soup.findAll('td')#,{'class':'text-center'})[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title']
        for elem in soup.findAll('td'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                site_title = price_div.text
                break

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # haircut
        try:
            n = 8
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  # , headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            # soup.findAll('td')#,{'class':'text-center'})[0]
            for elem in soup.findAll('tr'):
                if re.findall('(любой длины)', elem.text) != []:
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[-1])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_text = elem.text
                    price_dict['site_title'] = re.findall(
                        '[А-Яа-я ()]+', price_text)[0]
                    price_dict['price_new'] = re.findall('\d+', price_text)[0]
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # haircut
        try:
            n = 9
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            for elem in soup.findAll('tr'):
                if re.findall('Женская', elem.text) != []:
                    price_div = elem
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[0])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_dict['site_title'] = price_div.find(
                        'td', {
                            'class': 'services-table__name'
                        }).text
                    price_dict['price_new'] = int(
                        self.wspex(
                            price_div.find(
                                'td', {
                                    'class':
                                    'services-table__price services-table__price-small'
                                }).text))
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # haircut
        n = 10
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('лопаток', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(list_url[n -
                                                                 1])].index[0])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_dict['site_title'] = price_div.find(
                    'td', {
                        'height': '17'
                    }).text
                price_dict['price_new'] = int(
                    self.wspex(price_div.find('td', {
                        'width': '157'
                    }).text))
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Ticket for 1 ride - Mosgortrans
        n = 11
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[-1])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title']
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
        final_df = final_df[final_df.site_title.notna()]
        print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!')
        return final_df
Example #8
 def __init__(self):
     self.proxy = None
     self.ua = UserAgent()
     self.headers = self.get_header()
Example #9
 def create_useragent(self):
     while True:
         try:
             return UserAgent().random
         except:
             pass
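Example #10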
# -*- coding: utf-8 -*-
"""
    requests_shimo.py
    Use requests to simulate logging in to Shimo Docs (shimo.im)
"""
import requests
from fake_useragent import UserAgent

ua = UserAgent(verify_ssl=False, use_cache_server=False)

headers = {
    'User-Agent': ua.random,
    'Referer': 'https://shimo.im/login?from=home',
    'origin': 'https://shimo.im',
    'pragma': 'no-cache',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'x-requested-with': 'XmlHttpRequest',
    'x-source': 'lizard-desktop',
}

s = requests.Session()

login_url = 'https://shimo.im/lizard-api/auth/password/login'

form_data = {
    # credentials redacted
    'email': 'xxx',
    'mobile': '+86undefined',
    'password': '******'
Example #11
 def __init__(self, crawler):
     self.ua = UserAgent()
     self.ua_type = crawler.settings.get('USER_AGENT_TYPE','random')
     pass
Example #12
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from pprint import pprint
from typing import List
import csv
import re
from time import sleep
from tqdm import tqdm
from multiprocessing.dummy import Pool

Use_Agent = UserAgent()


def woaiwojia_spyder(page: int) -> List[list]:
    '''
    Scrape second-hand housing data from 5i5j (woaiwojia)
    :param page: (int) page number
    :return: (list) second-hand housing data
    '''
    info_list = []
    url = 'https://bj.5i5j.com/ershoufang/n{}/'.format(page)
    headers = {'User-Agent': Use_Agent.random}
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html5lib')
    # pprint(soup)
    # makes it easier to read with regular expressions
    all_house_information = [
Example #13
from fake_useragent import UserAgent

ua = UserAgent(verify_ssl=False)
# emulate different browsers
print(ua.chrome)
print(ua.safari)

# return random header information
print(ua.random)
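
Not from the original example: a short sketch of the most common use of these values, sending one as the User-Agent header of an HTTP request (the target URL is illustrative).

import requests
from fake_useragent import UserAgent

ua = UserAgent(verify_ssl=False)
# httpbin echoes back the User-Agent header it received, which makes verification easy
response = requests.get('https://httpbin.org/user-agent',
                        headers={'User-Agent': ua.random})
print(response.text)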
Example #14
import requests
import re
from fake_useragent import UserAgent
import parsel
import mysql_inport_data
import pipline

url = 'http://www.hnzzj.com/'
headers = {'User-Agent': UserAgent().chrome}
response = requests.get(url, headers=headers).text
print(response)
print('----------------------------------------------------------')
# parse the response
html = parsel.Selector(response)
print(html)

# srcs=html.xpath('//div[@class="container clearfix"]//div//ul//li//a/@href').getall()
# print(srcs)
titles = html.xpath(
    '//div[@class="container clearfix"]//div//ul//li//a/text()').getall()
print(titles)
print(
    '--------------------------------------------------------------------------'
)
hrefs = html.xpath('//div[@class="main pagebox"]//@href').getall()
print(hrefs)
# for href in hrefs:
#     print(href)

furl = 'http://www.hnzzj.com/'
Example #15
 def change_web_scraping_info(self):
     print('Changing user-agent and the proxy...')
     ua = UserAgent()
     self.user_agent = str(ua.random)
Example #16
    def run(self):

        try:
            ua = UserAgent()
            header = {'User-Agent': str(ua.chrome)}
            profileDirName = self.data[0]
            photoName = self.data[1].split('+')[-1]

            filePath = Path(downloadDir + profileDirName + "/" + photoName)

            if filePath.is_file():
                newConn = sqlite3.connect(dbDir, timeout=15)
                nCur = newConn.cursor()
                print(' already downloaded --> ',
                      downloadDir + profileDirName + "/" + photoName)
                Q = 'update zimbo_fine_image_links set is_done =1 where link="' + self.data[
                    1] + '"'
                nCur.execute(Q)
                newConn.commit()
                newConn.close()
                return

            createDirOnce(downloadDir, profileDirName)
            url = self.data[1]

            print(url)

            # htmlContent = requests.get(url, headers=header)
            # regex = '<textarea id=\"share_link1\" onclick=\"this.select\(\)\;\" style=\'.*\' wrap="on" rows=".*" cols=".*"><img src="(.*?)" alt=\".*\"><br><a href=\".*\" target=\"_blank">.*<\/a>'
            # # another regex needed for pictures link.
            # photoLink = re.findall(regex, htmlContent.content.decode('latin-1'))

            # # print(photoLink)

            # if not len(photoLink):
            #     picRegex = '<a href=\".*\"><img width=\".*\" height=\".*\" src=\"(.*)\" alt=\".*\" \/><\/a>'
            #     photoLink = re.findall(picRegex, htmlContent.content.decode('latin-1'))

            # if(len(photoLink)):
            #     photoLink =photoLink [0]

            # else:
            #     print('--------------------------------------------- image not found--------------------------')
            #     print(url)
            #     return
            # htmlContent.connection.close()
            # photoName = ((photoLink.split('/').pop() ).split('full-').pop() ).replace('.' , '__'+self.photoId+'.')
            # userName = ((photoLink.split('/').pop() ).split('full-').pop()).split('.')[0]
            # print(  photoName)

            hackHeaders = ['User-Agent:' + str(ua.chrome) + '']

            print(downloadDir + profileDirName + "/" + photoName, url)
            fp = open(downloadDir + profileDirName + "/" + photoName, "wb")
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, url)
            curl.setopt(pycurl.WRITEDATA, fp)
            curl.setopt(pycurl.HTTPHEADER, hackHeaders)
            curl.perform()
            curl.close()
            fp.close()

            # # update db
            newConn = sqlite3.connect(dbDir, timeout=15)
            nCur = newConn.cursor()
            nCur.execute(
                'update zimbo_fine_image_links set is_done =1 where link="' +
                self.data[1] + '"')
            newConn.commit()
            newConn.close()

        except Exception as e:
            print(
                "------------------------------------errrr------------------------->>>>>>>>>>>>"
            )
            print(e, url)
Example #17
 def __init__(self):
     self.ua = UserAgent(use_cache_server=False)
Example #18
 def process_request(self, request, spider):
     ua_class = UserAgent()
     ua = ua_class.random
     logger.debug(ua)
     if ua:
         request.headers.setdefault(b'User-Agent', ua)
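
For a downloader middleware like the one above to run, it must be enabled in the Scrapy project's settings; a sketch, where the module and class path 'myproject.middlewares.RandomUserAgentMiddleware' is an assumption about the project layout.

# settings.py (module path below is illustrative)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 400,
    # disable Scrapy's built-in User-Agent middleware so it does not overwrite ours
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}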
Example #19
import requests
import re
import sys
import time
import os
import argparse
from bs4 import BeautifulSoup
from functools import partial
from multiprocessing import Pool, TimeoutError, cpu_count
from fake_useragent import UserAgent

ua = UserAgent().random

parser = argparse.ArgumentParser(
    description='Argument parser for dork-scanner')
parser.add_argument('-S',
                    '--search',
                    help='String to be searched for',
                    default='1')
parser.add_argument('-E',
                    '--engine',
                    help='Search engine to be used',
                    default='google')
parser.add_argument('-P',
                    '--page',
                    help='Number of pages to search in',
                    default='1')
parser.add_argument('-Pr',
                    '--process',
                    help='Number of parallel processes',
                    default='1')
Example #20
 def get_header(self):
     ua = UserAgent(path=self.UAfilePath)
     return ua.random
Example #21
 def __init__(self):
     self.headers = dict()
     self.headers['User-Agent'] = UserAgent().random
     self.my_session = requests.session()
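Example #22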
    def parse(self,pass_url):
        # parse all users who commented under the current Weibo post
        first_req=requests.get(pass_url+str(1),cookies=self.new_cookies()).content
        if 'not exist' in str(first_req):
            return None
        html = etree.HTML(first_req)
        # get the page where a previous run was interrupted (resume point)
        try:
            with open('page_num.txt','r') as f:
                broken_page_num=int(f.readlines()[0])+1
        except:
            broken_page_num=1
        # total number of comment pages
        try:
            page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
        except:
            #print('[-----] page request error')
            return self.parse(pass_url=pass_url)
        for page in range(broken_page_num,int(page_num)+1):
            print(page)
            if page % 5 == 0:
                with open('page_num.txt','w') as f:
                    f.write(str(page))
            fi=set()
            # save the current run state
            cookies=self.new_cookies()
            #print('[++++++++] current cookies:', str(cookies))
            try:
                req=requests.get(pass_url+str(page),cookies=cookies,headers={"User-Agent":UserAgent().random}).content
                html=etree.HTML(req)
                fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                fans_name=html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
            except:
                while True:
                    #print('[!!!!!] error occurred, no content fetched:')
                    time.sleep(5)
                    try:
                        req = requests.get(pass_url + str(page),headers={"User-Agent":UserAgent().random},cookies=cookies).content
                        html = etree.HTML(req)
                        fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                        fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
                        break
                    except:
                        pass

            for i,j in enumerate(fans):
                # skip the "back" link at the bottom of the page
                if '5644764907' in j:
                    continue
                fans_url='https://weibo.cn/'+j.split('/u/')[1]+'/info'
                fans_weibo='https://weibo.cn'+j
                m_url="https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}".format(j.split('/u/')[1],j.split('/u/')[1])
                name=fans_name[i]
                if name in fi:
                    pass
                else:
                    fi.add(name)
                    self.db.lpush(fans_url)
                    self.db1.lpush(fans_weibo)
                    self.db2.lpush(m_url)
                    print('[+++][+++][+++]',name)
                # the most efficient wait time for coping with IP-rate-limit anti-scraping measures
                time.sleep(0.35)
        # after all comments of this post have been crawled
        time.sleep(1)
        with open('page_num.txt','w') as f:
            f.write('0')
Example #23
 def __init__(self, crawler):
     super(RandomUserAgentMiddleware, self).__init__()
     self.ua = UserAgent()
     self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
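
An __init__ like the one above is usually paired with a from_crawler hook and a process_request method; a minimal sketch of such a completion, where everything beyond the __init__ shown in the example is an assumption based on Scrapy's downloader-middleware API.

from fake_useragent import UserAgent


class RandomUserAgentMiddleware(object):
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds the middleware through this hook, passing in the crawler
        return cls(crawler)

    def process_request(self, request, spider):
        # pick ua.random, ua.chrome, ... depending on the RANDOM_UA_TYPE setting
        request.headers.setdefault(b'User-Agent', getattr(self.ua, self.ua_type))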
Example #24
from bs4 import BeautifulSoup
import requests as re
import pandas as pd
from datetime import datetime
import numpy as np
import time
import tqdm
url = "https://www.hltv.org/results?offset="
pd.options.mode.chained_assignment = None  # default='warn'

from fake_useragent import UserAgent
ua = UserAgent(cache=False)


def team_matches(team, df):
    return df.loc[(df['team1'] == team) |
                  (df['team2'] == team)].reset_index(drop=True)


class hltv(object):
    """docstring for ClassName"""
    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.dataframe = pd.DataFrame()

    def results(self, num):
        df = pd.DataFrame(
            columns=['team1', 'score1', 'score2', 'team2', 'url', 'date'])
        session = re.Session()
        session.headers.update({'User-Agent': self.user_agent})
        for index_of_page in range(num):
Example #25
def main():

    START = datetime.now()
    bad = 'Our systems have detected unusual traffic from your computer network.  This page checks to see if it&#39;s really you sending the requests, and not a robot.'
    parser = argparse.ArgumentParser(description="Get URLS from Bing")
    parser.add_argument(
        'mot',
        nargs="?",
        help=
        "The phrase you want to look up. Put a '+' between the terms. eg: paris+the+city+of+love",
        type=str)
    parser.add_argument('-n',
                        '--number',
                        default=50,
                        help="Minimum number of links you want (default=50)",
                        type=int)
    args = parser.parse_args()
    nb_links = args.number

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    url_lis, ignored_link, total = [], 0, 0
    ua = UserAgent()
    header = {'User-Agent': str(ua.random)}
    print(colored(header, 'yellow'))

    page = requests.get("https://www.bing.com/search?q=" + args.mot.lower(),
                        headers=header)
    soup = BeautifulSoup(page.content, 'html.parser')

    out = open("out.txt", 'w')

    while len(url_lis) < nb_links:

        time.sleep(round(random.uniform(3, 7),
                         2))  # take it easy, don't get banned...

        # we get the h2 links of the search page
        h2links = soup.findAll("h2")
        good_link = re.findall('<a h="ID=SERP.{7}".href="(http.*?)"',
                               str(h2links))
        for link in good_link:
            total += 1
            if isValid(link) and link not in url_lis:
                out.write(link + '\n')
                print(link)
                url_lis.append(link)
            else:
                ignored_link += 1
        print(colored('{} links gotten'.format(len(url_lis)), 'red'))
        next_url = str(page.content)
        if re.findall(bad, str(next_url)):
            print(
                colored("they're coming after you, run !", 'red', 'on_yellow'))
            sys.exit(0)

        #we get here the link of the "Next" button

        # If you're not searching from francophone areas, you need to change the title of the link eg: Página siguiente, Volgende pagina, Nächste Seite...
        next_url = re.findall('title="Page suivante" href="(.*?)"', next_url)
        try:
            next_url = "https://www.bing.com" + html.unescape(next_url[0])
        except IndexError:
            print(colored('No more results, sorry', 'yellow'))
            sys.exit(0)
        print('\n', colored(next_url, 'green'), sep="")
        page = requests.get(next_url, headers=header)
        soup = BeautifulSoup(page.content, 'html.parser')
    out.close()
    print('\n\nOutput file : out.txt')
    print(
        colored('links ignored : ' + str(ignored_link) + ' of ' + str(total),
                'blue'))
    END = (datetime.now() - START).total_seconds()
    print(colored("Done in {} secs".format(round(END, 2)), 'yellow'))
Example #26
    def __init__(self,
                 login,
                 password,
                 like_per_day=1000,
                 media_max_like=150,
                 media_min_like=0,
                 follow_per_day=0,
                 follow_time=5 * 60 * 60, #Cannot be zero
                 unfollow_per_day=0,
                 start_at_h=0,
                 start_at_m=0,
                 end_at_h=23,
                 end_at_m=59,
                 database_name='follows_db.db',
                 comment_list=[["this", "the", "your"],
                               ["photo", "picture", "pic", "shot", "snapshot"],
                               ["is", "looks", "feels", "is really"],
                               ["great", "super", "good", "very good", "good",
                                "wow", "WOW", "cool", "GREAT", "magnificent",
                                "magical", "very cool", "stylish", "beautiful",
                                "so beautiful", "so stylish", "so professional",
                                "lovely", "so lovely", "very lovely", "glorious",
                                "so glorious", "very glorious", "adorable",
                                "excellent", "amazing"],
                               [".", "..", "...", "!", "!!", "!!!"]],
                 comments_per_day=0,
                 tag_list=['cat', 'car', 'dog'],
                 max_like_for_one_tag=5,
                 unfollow_break_min=15,
                 unfollow_break_max=30,
                 log_mod=0,
                 proxy="",
                 user_blacklist={},
                 tag_blacklist=[],
                 unwanted_username_list=[],
                 unfollow_whitelist=[]):

        self.database_name = database_name
        self.follows_db = sqlite3.connect(database_name, timeout=0, isolation_level=None)
        self.follows_db_c = self.follows_db.cursor()
        check_and_update(self)
        fake_ua = UserAgent()
        self.user_agent = check_and_insert_user_agent(self, str(fake_ua.random))
        self.bot_start = datetime.datetime.now()
        self.bot_start_ts = time.time()
        self.start_at_h = start_at_h
        self.start_at_m = start_at_m
        self.end_at_h = end_at_h
        self.end_at_m = end_at_m
        self.unfollow_break_min = unfollow_break_min
        self.unfollow_break_max = unfollow_break_max
        self.user_blacklist = user_blacklist
        self.tag_blacklist = tag_blacklist
        self.unfollow_whitelist = unfollow_whitelist
        self.comment_list = comment_list
        self.instaloader = instaloader.Instaloader()

        self.time_in_day = 24 * 60 * 60
        # Like
        self.like_per_day = like_per_day
        if self.like_per_day != 0:
            self.like_delay = self.time_in_day / self.like_per_day

        # Follow
        self.follow_time = follow_time #Cannot be zero
        self.follow_per_day = follow_per_day
        if self.follow_per_day != 0:
            self.follow_delay = self.time_in_day / self.follow_per_day

        # Unfollow
        self.unfollow_per_day = unfollow_per_day
        if self.unfollow_per_day != 0:
            self.unfollow_delay = self.time_in_day / self.unfollow_per_day

        # Comment
        self.comments_per_day = comments_per_day
        if self.comments_per_day != 0:
            self.comments_delay = self.time_in_day / self.comments_per_day

        # Don't like if the media has more than n likes.
        self.media_max_like = media_max_like
        # Don't like if the media has fewer than n likes.
        self.media_min_like = media_min_like
        # Auto mode settings:
        # Default list of tags.
        self.tag_list = tag_list
        # Pick a random tag from tag_list and like it 1 to n times.
        self.max_like_for_one_tag = max_like_for_one_tag
        # log_mod: 0 to console, 1 to file
        self.log_mod = log_mod
        self.s = requests.Session()
        self.c = requests.Session()
        # if you need proxy make something like this:
        # self.s.proxies = {"https" : "http://proxyip:proxyport"}
        # by @ageorgios
        if proxy != "":
            proxies = {
                'http': 'http://' + proxy,
                'https': 'http://' + proxy,
            }
            self.s.proxies.update(proxies)
        # convert login to lower
        self.user_login = login.lower()
        self.user_password = password
        self.bot_mode = 0
        self.media_by_tag = []
        self.media_on_feed = []
        self.media_by_user = []
        self.current_user_info = ''
        self.unwanted_username_list = unwanted_username_list
        now_time = datetime.datetime.now()
        self.check_for_bot_update()
        log_string = 'Instabot v1.2.0/1 started at %s:' % \
                     (now_time.strftime("%d.%m.%Y %H:%M"))
        self.write_log(log_string)
        self.login()
        self.populate_user_blacklist()
        signal.signal(signal.SIGTERM, self.cleanup)
        atexit.register(self.cleanup)
Example #27
 def __init__(self):
     self.ua = UserAgent()
Example #28
def genUA():
    """returns a fake random user-agent"""
    return str(UserAgent().random)
Example #29
import time
import json
import hashlib
import requests
from fake_useragent import UserAgent

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": UserAgent().random}


def main():
    def get_salt():
        """获取13位时间戳"""
        salt = str(int(round(time.time() * 1000)))
        return salt

    def get_sign():
        """Compute the sign parameter (md5 hex digest of the concatenated string)"""
        sign = "fanyideskweb" + keywords + get_salt() + "6x(ZHw]mwzX#u0V7@yfwK"
        hl = hashlib.md5()
        hl.update(sign.encode(encoding='utf-8'))
        # return the md5 hex digest rather than the raw string
        return hl.hexdigest()

    data = {
        'i': keywords,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': get_salt(),
        'sign': get_sign(),
Example #30
    def ret_augst(self):
        ua = UserAgent()

        return {'User-Agent': ua.random}