from bs4 import BeautifulSoup
import requests as re
import pandas as pd
from datetime import datetime
import numpy as np
import time
import tqdm

url = "https://www.hltv.org/results?offset="
pd.options.mode.chained_assignment = None  # default='warn'

from fake_useragent import UserAgent

ua = UserAgent(cache=False)


def team_matches(team, df):
    return df.loc[(df['team1'] == team) | (df['team2'] == team)].reset_index(drop=True)


class hltv(object):
    """docstring for ClassName"""

    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.dataframe = pd.DataFrame()

    def results(self, num):
        df = pd.DataFrame(
            columns=['team1', 'score1', 'score2', 'team2', 'url', 'date'])
        session = re.Session()
        session.headers.update({'User-Agent': self.user_agent})
        for index_of_page in range(num):
def __init__(self,
             login,
             password,
             like_per_day=1000,
             media_max_like=150,
             media_min_like=0,
             follow_per_day=0,
             follow_time=5 * 60 * 60,  # Cannot be zero
             unfollow_per_day=0,
             start_at_h=0,
             start_at_m=0,
             end_at_h=23,
             end_at_m=59,
             database_name='follows_db.db',
             comment_list=[["this", "the", "your"],
                           ["photo", "picture", "pic", "shot", "snapshot"],
                           ["is", "looks", "feels", "is really"],
                           ["great", "super", "good", "very good", "good",
                            "wow", "WOW", "cool", "GREAT", "magnificent",
                            "magical", "very cool", "stylish", "beautiful",
                            "so beautiful", "so stylish", "so professional",
                            "lovely", "so lovely", "very lovely", "glorious",
                            "so glorious", "very glorious", "adorable",
                            "excellent", "amazing"],
                           [".", "..", "...", "!", "!!", "!!!"]],
             comments_per_day=0,
             tag_list=['cat', 'car', 'dog'],
             max_like_for_one_tag=5,
             unfollow_break_min=15,
             unfollow_break_max=30,
             log_mod=0,
             proxy="",
             user_blacklist={},
             tag_blacklist=[],
             unwanted_username_list=[],
             unfollow_whitelist=[]):
    self.database_name = database_name
    self.follows_db = sqlite3.connect(database_name, timeout=0, isolation_level=None)
    self.follows_db_c = self.follows_db.cursor()
    check_and_update(self)
    fake_ua = UserAgent()
    self.user_agent = check_and_insert_user_agent(self, str(fake_ua.random))
    self.bot_start = datetime.datetime.now()
    self.bot_start_ts = time.time()
    self.start_at_h = start_at_h
    self.start_at_m = start_at_m
    self.end_at_h = end_at_h
    self.end_at_m = end_at_m
    self.unfollow_break_min = unfollow_break_min
    self.unfollow_break_max = unfollow_break_max
    self.user_blacklist = user_blacklist
    self.tag_blacklist = tag_blacklist
    self.unfollow_whitelist = unfollow_whitelist
    self.comment_list = comment_list
    self.instaloader = instaloader.Instaloader()

    self.time_in_day = 24 * 60 * 60
    # Like
    self.like_per_day = like_per_day
    if self.like_per_day != 0:
        self.like_delay = self.time_in_day / self.like_per_day

    # Follow
    self.follow_time = follow_time  # Cannot be zero
    self.follow_per_day = follow_per_day
    if self.follow_per_day != 0:
        self.follow_delay = self.time_in_day / self.follow_per_day

    # Unfollow
    self.unfollow_per_day = unfollow_per_day
    if self.unfollow_per_day != 0:
        self.unfollow_delay = self.time_in_day / self.unfollow_per_day

    # Comment
    self.comments_per_day = comments_per_day
    if self.comments_per_day != 0:
        self.comments_delay = self.time_in_day / self.comments_per_day

    # Don't like if media has more than n likes.
    self.media_max_like = media_max_like
    # Don't like if media has less than n likes.
    self.media_min_like = media_min_like
    # Auto mod settings:
    # Default list of tags.
    self.tag_list = tag_list
    # Get a random tag from tag_list and like (1 to n) times.
    self.max_like_for_one_tag = max_like_for_one_tag
    # log_mod 0 to console, 1 to file
    self.log_mod = log_mod
    self.s = requests.Session()
    self.c = requests.Session()
    # if you need a proxy, do something like this:
    # self.s.proxies = {"https" : "http://proxyip:proxyport"}
    # by @ageorgios
    if proxy != "":
        proxies = {
            'http': 'http://' + proxy,
            'https': 'http://' + proxy,
        }
        self.s.proxies.update(proxies)
    # convert login to lower case
    self.user_login = login.lower()
    self.user_password = password
    self.bot_mode = 0
    self.media_by_tag = []
    self.media_on_feed = []
    self.media_by_user = []
    self.current_user_info = ''
    self.unwanted_username_list = unwanted_username_list
    now_time = datetime.datetime.now()
    self.check_for_bot_update()
    log_string = 'Instabot v1.2.0/1 started at %s:' % \
        (now_time.strftime("%d.%m.%Y %H:%M"))
    self.write_log(log_string)
    self.login()
    self.populate_user_blacklist()
    signal.signal(signal.SIGTERM, self.cleanup)
    atexit.register(self.cleanup)
def get_header():
    ua = UserAgent(verify_ssl=False)
    return {'User-Agent': ua.random}
def __init__(self):
    self.ua = UserAgent(use_cache_server=False)
def change_web_scraping_info(self):
    print('Changing user-agent and the proxy...')
    ua = UserAgent()
    self.user_agent = str(ua.random)
def __init__(self):
    self.UA = UserAgent()
from fake_useragent import UserAgent

ua = UserAgent(verify_ssl=False)
# emulate specific browsers
print(ua.chrome)
print(ua.safari)
# return a random User-Agent header value
print(ua.random)
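# Illustrative sketch, not taken from any of the collected snippets: how the random
# User-Agent value above is typically attached to an outgoing request. The URL is a
# placeholder, not an endpoint used by the original code.
import requests
from fake_useragent import UserAgent

ua = UserAgent()
resp = requests.get("https://example.com", headers={"User-Agent": ua.random})
print(resp.status_code)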
def get_df(self):
    print('get data from services...')
    sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id')
    serv_df = sfb_df[sfb_df['type'] == 'services']
    list_url = serv_df['URL'].values
    final_df = pd.DataFrame(columns=[
        'date', 'type', 'category_id', 'category_title', 'site_title',
        'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
    ])

    # mgts
    n = 0
    url = list_url[n]
    print(url)
    html = requests.get(url, headers={
        'User-Agent': UserAgent().chrome
    }).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    price_list = soup.findAll('div', {'class': 'slider_slide'})  # 0 - replace
    for price_elem in price_list:
        if price_elem.findAll('div', {'class': 'texts'})[0].text == 'Безлимитный':
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'services'
            id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
            price_dict['category_id'] = id_n
            price_dict['category_title'] = serv_df['cat_title'].loc[
                price_dict['category_id']]
            price_dict['type'] = 'services'
            price_dict['site_title'] = price_elem.findAll(
                'div', {'class': 'texts'})[0].text
            price_dict['price_new'] = int(
                price_elem.findAll('div', {'class': 'slider_price_val'})[0].text)
            price_dict['price_old'] = ''
            price_dict['site_unit'] = price_elem.findAll(
                'div', {'class': 'slider_price_rub1'})[0].text + '/' + price_elem.findAll(
                'div', {'class': 'slider_price_rub2'})[0].text
            price_dict['site_link'] = url
            final_df = final_df.append(price_dict, ignore_index=True)
            break

    # Bathhouse visit, common section, one ticket http://legkiipar.ru/menraz.html
    try:
        n = 1
        url = list_url[n]
        print(url)
        html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        # Weekdays from 08:00 to 22:00
        pattern = re.compile(r'Будние дни')
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        price_dict['type'] = 'services'
        price_dict['site_title'] = soup(text=pattern)[0]
        price_1 = soup.findAll('span', {'class': 'стиль6'})
        price_dict['price_new'] = re.findall('\d+', price_1[1].text)[0]
        price_dict['price_old'] = ''
        price_dict['site_unit'] = re.findall('\d+ часа', price_1[4].text[:-1])[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df['cat_title'].loc[
            price_dict['category_id']].values[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)
    except:
        print('DAMN! {} can not be parsed'.format(url))

    # Bathhouse visit, common section, one ticket http://banya-lefortovo.ru/price.html
    n = 2
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    pattern = re.compile(r'Русская общая баня')
    price_dict['site_title'] = soup(text=pattern)[0]
    price_dict['category_id'] = int(
        serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[
        price_dict['category_id']]['cat_title'].values[0]
    price_dict['type'] = 'services'
    price_dict['price_new'] = int(
        re.findall('\d+', re.findall('\d+ рублей', soup(text=pattern)[0])[0])[0])
    price_dict['price_old'] = ''
    price_dict['site_unit'] = re.findall('\d+ часа', soup(text=pattern)[0])[0]
    price_dict['site_link'] = url
    final_df = final_df.append(price_dict, ignore_index=True)

    # Bathhouse visit, common section, one ticket https://rzhevskie-bani.ru/rb/bani.html
    n = 3
    price_dict = dict()
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    price_dict['price_new'] = int(
        re.findall('\d+', soup.findAll('td', {'class': 'price'})[0].text)[0])
    pattern = re.compile(r'Стоимость')
    soup.findAll('td')
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    price_dict['category_id'] = int(
        serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[
        price_dict['category_id']]['cat_title'].values[0]
    price_dict['site_title'] = soup(text=pattern)[0]
    price_dict['type'] = 'services'
    price_dict['site_unit'] = re.findall('(\d+.*\d часа)', soup(text=pattern)[0][-9:])[0]
    price_dict['site_link'] = url
    final_df = final_df.append(price_dict, ignore_index=True)

    # Bathhouse visit, common section, one ticket
    # http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad
    n = 4
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    price_dict['category_id'] = int(
        serv_df[serv_df['URL'].str.contains(url)].index[0])
    try:
        html = requests.get(url, headers={
            'User-Agent': UserAgent().chrome
        }, timeout=10).content
    except:
        proxy = get_proxy(url)
        html = requests.get(url, headers={
            'User-Agent': UserAgent().chrome
        }, proxies=proxy).content
    soup = BeautifulSoup(html, 'lxml')
    price_div = soup.findAll('div', {'class': 'price-head'})[0]
    price_dict['price_new'] = int(
        re.findall('\d+', price_div.findAll('span', {'class': 'price'})[0].text)[0])
    price_dict['price_old'] = ''
    price_dict['site_title'] = price_div.find('p').text.replace('\xa0', ' ')
    price_dict['site_unit'] = re.findall('\d+ часа', price_dict['site_title'])[0]
    price_dict['type'] = 'services'
    price_dict['site_link'] = url
    price_dict['category_title'] = serv_df.loc[
        price_dict['category_id']]['cat_title'].values[0]
    final_df = final_df.append(price_dict, ignore_index=True)

    # Heel tip replacement, per pair https://masterskaya-obuvi.ru/tseny
    '''
    n=5
    price_dict=dict()
    price_dict['date']=Global().date
    price_dict['site_code']='services'
    url=list_url[n]
    print(url)
    html=requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup=BeautifulSoup(html, 'lxml')
    price_dict['category_id']=int(serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
    for elem in soup.findAll('tr'):
        if re.findall('износоустойчивой резины', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = re.findall('[А-Яа-яёз(). ]+', elem.text)[0]
            price_dict['site_unit'] = re.findall('[А-Яа-яёз(). ]+', elem.text)[1]
            price_dict['price_new'] = int(price_div.findAll('td', {'width': "356"})[0].text)
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)
    '''

    # Heel tip replacement, per pair https://masterskaya-obuvi.ru/tseny
    n = 6
    price_dict = dict()
    price_dict['date'] = Global().date
    price_dict['site_code'] = 'services'
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    price_dict['category_id'] = int(
        serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[
        price_dict['category_id']]['cat_title'].values[0]
    for elem in soup.findAll('tr'):
        if re.findall('эконом', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = self.wspex_space(
                re.findall(
                    '[А-Яа-яёз(). ]+',
                    price_div.findAll('td', {'align': 'left'})[0].text)[0])
            price_text = price_div.findAll('strong')[0].text
            price_dict['price_new'] = int(re.findall('\d+', price_text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = re.findall(
                '\([А-Яа-я]*\)', price_dict['site_title'])[0][1:-1]
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # Ticket for one ride - Mosgortrans
    n = 7
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    # soup.findAll('td')  # ,{'class':'text-center'})[0]
    price_dict['category_id'] = int(
        serv_df[serv_df['URL'].str.contains(url)].index[0])
    price_dict['category_title'] = serv_df.loc[
        price_dict['category_id']]['cat_title']
    for elem in soup.findAll('td'):
        if re.findall('не более', elem.text) != []:
            price_div = elem
            site_title = price_div.text
            break
    for elem in soup.findAll('tr'):
        if re.findall('не более', elem.text) != []:
            price_div = elem
            price_dict['site_title'] = price_div.find('td').text
            price_dict['price_new'] = int(
                re.findall('\d{2,3}', price_div.text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'поездка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # haircut
    try:
        n = 8
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        # soup.findAll('td')  # ,{'class':'text-center'})[0]
        for elem in soup.findAll('tr'):
            if re.findall('(любой длины)', elem.text) != []:
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[-1])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_text = elem.text
                price_dict['site_title'] = re.findall('[А-Яа-я ()]+', price_text)[0]
                price_dict['price_new'] = re.findall('\d+', price_text)[0]
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
    except:
        print('DAMN! {} can not be parsed'.format(url))

    # haircut
    try:
        n = 9
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('Женская', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[0])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_dict['site_title'] = price_div.find(
                    'td', {'class': 'services-table__name'}).text
                price_dict['price_new'] = int(
                    self.wspex(
                        price_div.find(
                            'td',
                            {'class': 'services-table__price services-table__price-small'}
                        ).text))
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
    except:
        print('DAMN! {} can not be parsed'.format(url))

    # haircut
    n = 10
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.findAll('tr'):
        if re.findall('лопаток', elem.text) != []:
            price_div = elem
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(list_url[n - 1])].index[0])
            price_dict['category_title'] = serv_df.loc[
                price_dict['category_id']]['cat_title'].values[0]
            price_dict['site_title'] = price_div.find('td', {'height': '17'}).text
            price_dict['price_new'] = int(
                self.wspex(price_div.find('td', {'width': '157'}).text))
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'стрижка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    # Ticket for one ride - Mosgortrans
    n = 11
    price_dict = dict()
    price_dict['site_code'] = 'services'
    price_dict['date'] = Global().date
    url = list_url[n]
    print(url)
    html = requests.get(url).content  # , headers={'User-Agent': UserAgent().chrome}
    soup = BeautifulSoup(html, 'lxml')
    for elem in soup.findAll('tr'):
        if re.findall('не более', elem.text) != []:
            price_div = elem
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(url)].index[-1])
            price_dict['category_title'] = serv_df.loc[
                price_dict['category_id']]['cat_title']
            price_dict['site_title'] = price_div.find('td').text
            price_dict['price_new'] = int(
                re.findall('\d{2,3}', price_div.text)[0])
            price_dict['price_old'] = ''
            price_dict['type'] = 'services'
            price_dict['site_unit'] = 'поездка'
            price_dict['site_link'] = url
            break
    final_df = final_df.append(price_dict, ignore_index=True)

    final_df = final_df[final_df.site_title.notna()]
    print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!')
    return final_df
else:
    proxies = re.findall(
        re.compile('<td>([\d.]+)</td>'),
        str(requests.get('https://www.sslproxies.org/').content))
    proxies = [
        '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))
    ]
    print('%d proxies successfully loaded!' % len(proxies))
proxy = Proxy()
proxy.proxy_type = ProxyType.MANUAL
if args.user_agent:
    if path.isfile(args.user_agent):
        user_agents = list(
            filter(None, open(args.user_agent, 'r').read().split('\n')))
    else:
        user_agents = [args.user_agent]
else:
    user_agents = UserAgent()
for i in range(args.threads):
    t = Thread(target=bot, args=(i, args.url))
    t.daemon = True
    t.start()
    sleep(uniform(2.0, 4.0))
stdin.read(1)
exit(0)
except KeyboardInterrupt:
    exit(0)
except:
    exit(1)
def run(self):
    try:
        ua = UserAgent()
        header = {'User-Agent': str(ua.chrome)}
        profileDirName = self.data[0]
        photoName = self.data[1].split('+')[-1]
        filePath = Path(downloadDir + profileDirName + "/" + photoName)
        if filePath.is_file():
            newConn = sqlite3.connect(dbDir, timeout=15)
            nCur = newConn.cursor()
            print(' already downloaded --> ',
                  downloadDir + profileDirName + "/" + photoName)
            Q = 'update zimbo_fine_image_links set is_done =1 where link="' + self.data[1] + '"'
            nCur.execute(Q)
            newConn.commit()
            newConn.close()
            return
        createDirOnce(downloadDir, profileDirName)
        url = self.data[1]
        print(url)
        # htmlContent = requests.get(url, headers=header)
        # regex = '<textarea id=\"share_link1\" onclick=\"this.select\(\)\;\" style=\'.*\' wrap="on" rows=".*" cols=".*"><img src="(.*?)" alt=\".*\"><br><a href=\".*\" target=\"_blank">.*<\/a>'
        # # another regex needed for pictures link.
        # photoLink = re.findall(regex, htmlContent.content.decode('latin-1'))
        # # print(photoLink)
        # if not len(photoLink):
        #     picRegex = '<a href=\".*\"><img width=\".*\" height=\".*\" src=\"(.*)\" alt=\".*\" \/><\/a>'
        #     photoLink = re.findall(picRegex, htmlContent.content.decode('latin-1'))
        #     if len(photoLink):
        #         photoLink = photoLink[0]
        #     else:
        #         print('--------------------------------------------- image not found--------------------------')
        #         print(url)
        #         return
        # htmlContent.connection.close()
        # photoName = ((photoLink.split('/').pop()).split('full-').pop()).replace('.', '__' + self.photoId + '.')
        # userName = ((photoLink.split('/').pop()).split('full-').pop()).split('.')[0]
        # print(photoName)
        hackHeaders = ['User-Agent:' + str(ua.chrome) + '']
        print(downloadDir + profileDirName + "/" + photoName, url)
        fp = open(downloadDir + profileDirName + "/" + photoName, "wb")
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, url)
        curl.setopt(pycurl.WRITEDATA, fp)
        curl.setopt(pycurl.HTTPHEADER, hackHeaders)
        curl.perform()
        curl.close()
        fp.close()
        # update db
        newConn = sqlite3.connect(dbDir, timeout=15)
        nCur = newConn.cursor()
        nCur.execute(
            'update zimbo_fine_image_links set is_done =1 where link="' + self.data[1] + '"')
        newConn.commit()
        newConn.close()
    except Exception as e:
        print("------------------------------------errrr------------------------->>>>>>>>>>>>")
        print(e, url)
def process_request(self, request, spider):
    ua_class = UserAgent()
    ua = ua_class.random
    logger.debug(ua)
    if ua:
        request.headers.setdefault(b'User-Agent', ua)
def __init__(self):
    self.proxy = None
    self.ua = UserAgent()
    self.headers = self.get_header()
def create_useragent(self):
    while True:
        try:
            return UserAgent().random
        except:
            pass
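# Hedged alternative sketch (not part of the original code, and it assumes a
# fake_useragent release that supports the fallback argument): passing a fallback
# string lets the library degrade gracefully instead of relying on the retry loop above.
def create_useragent_with_fallback(self):
    return UserAgent(fallback='Mozilla/5.0').random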
def __init__(self, crawler):
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get('USER_AGENT_TYPE', 'random')
    pass
# -*- coding: utf-8 -*-
"""
requests_shimo.py
Use requests to simulate logging in to Shimo Docs (shimo.im)
"""
import requests
from fake_useragent import UserAgent

ua = UserAgent(verify_ssl=False, use_cache_server=False)

headers = {
    'User-Agent': ua.random,
    'Referer': 'https://shimo.im/login?from=home',
    'origin': 'https://shimo.im',
    'pragma': 'no-cache',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'x-requested-with': 'XmlHttpRequest',
    'x-source': 'lizard-desktop',
}

s = requests.Session()
login_url = 'https://shimo.im/lizard-api/auth/password/login'
form_data = {
    # credentials redacted
    'email': 'xxx',
    'mobile': '+86undefined',
    'password': '******'
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import requests
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from pprint import pprint
from typing import List
import csv
import re
from time import sleep
from tqdm import tqdm
from multiprocessing.dummy import Pool

Use_Agent = UserAgent()


def woaiwojia_spyder(page: int) -> List[list]:
    '''
    Scrape second-hand housing listings from 5i5j.com ("woaiwojia")
    :param page: (int) page number
    :return: (list) second-hand housing data
    '''
    info_list = []
    url = 'https://bj.5i5j.com/ershoufang/n{}/'.format(page)
    headers = {'User-Agent': Use_Agent.random}
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'html5lib')
    # pprint(soup)
    # make the markup easier to pick apart with regular expressions
    all_house_information = [
if args.proxies:
    proxies = open(args.proxies, 'r').read().split('\n')
else:
    proxies = re.findall(
        re.compile('<td>([\d.]+)</td>'),
        str(requests.get('https://free-proxy-list.net/').content))
    proxies = [
        '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))
    ]
    print('%d proxies successfully loaded!' % len(proxies))
proxy = Proxy()
proxy.proxy_type = ProxyType.MANUAL
if args.user_agents:
    user_agents = open(args.user_agents, 'r').read().split('\n')
else:
    agent = UserAgent()
if args.driver == 'chrome':
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--mute-audio')
else:
    firefox_profile = webdriver.FirefoxProfile()
    firefox_profile.set_preference('media.volum_scale', '0.0')
for i in range(args.threads):
    t = Thread(target=bot, args=(args.url, ))
    t.daemon = True
    t.start()
    sleep(uniform(2.0, 4.0))
except KeyboardInterrupt:
    _exit(0)
except Exception:
    print_exc()
def get_header(self):
    ua = UserAgent(path=self.UAfilePath)
    return ua.random
import requests
import re
import sys
import time
import os
import argparse
from bs4 import BeautifulSoup
from functools import partial
from multiprocessing import Pool, TimeoutError, cpu_count
from fake_useragent import UserAgent

ua = UserAgent().random

parser = argparse.ArgumentParser(
    description='Argument parser for dork-scanner')
parser.add_argument('-S', '--search', help='String to be searched for', default='1')
parser.add_argument('-E', '--engine', help='Search engine to be used', default='google')
parser.add_argument('-P', '--page', help='Number of pages to search in', default='1')
parser.add_argument('-Pr', '--process', help='Number of parallel processes', default='1')
def parse(self, pass_url):
    # parse all commenting users under the current Weibo post
    first_req = requests.get(pass_url + str(1), cookies=self.new_cookies()).content
    if 'not exist' in str(first_req):
        return None
    html = etree.HTML(first_req)
    # recover the page where a previous run was interrupted
    try:
        with open('page_num.txt', 'r') as f:
            broken_page_num = int(f.readlines()[0]) + 1
    except:
        broken_page_num = 1
    # total number of comment pages
    try:
        page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
    except:
        # print('[-----] page request error')
        return self.parse(pass_url=pass_url)
    for page in range(broken_page_num, int(page_num) + 1):
        print(page)
        if page % 5 == 0:
            with open('page_num.txt', 'w') as f:
                f.write(str(page))
        fi = set()
        # save the current run state
        cookies = self.new_cookies()
        # print('[++++++++] current cookies:', str(cookies))
        try:
            req = requests.get(pass_url + str(page), cookies=cookies,
                               headers={"User-Agent": UserAgent().random}).content
            html = etree.HTML(req)
            fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
            fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
        except:
            while True:
                # print('[!!!!!] error, no content fetched:')
                time.sleep(5)
                try:
                    req = requests.get(pass_url + str(page),
                                       headers={"User-Agent": UserAgent().random},
                                       cookies=cookies).content
                    html = etree.HTML(req)
                    fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                    fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
                    break
                except:
                    pass
        for i, j in enumerate(fans):
            # skip the "back to top" link at the bottom of the page
            if '5644764907' in j:
                continue
            fans_url = 'https://weibo.cn/' + j.split('/u/')[1] + '/info'
            fans_weibo = 'https://weibo.cn' + j
            m_url = "https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}".format(j.split('/u/')[1], j.split('/u/')[1])
            name = fans_name[i]
            if name in fi:
                pass
            else:
                fi.add(name)
                self.db.lpush(fans_url)
                self.db1.lpush(fans_weibo)
                self.db2.lpush(m_url)
                print('[+++][+++][+++]', name)
            # the most efficient wait time found when dealing with IP-based rate limiting
            time.sleep(0.35)
    # after all comments of this post have been crawled
    time.sleep(1)
    with open('page_num.txt', 'w') as f:
        f.write('0')
def __init__(self):
    self.headers = dict()
    self.headers['User-Agent'] = UserAgent().random
    self.my_session = requests.session()
def __init__(self, crawler):
    super(RandomUserAgentMiddlware, self).__init__()
    self.ua = UserAgent()
def __init__(self, crawler):
    super(RandomUserAgentMiddleware, self).__init__()
    self.ua = UserAgent()
    self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
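# Companion sketch (assumed, not taken from the original middleware): classes that
# store RANDOM_UA_TYPE this way usually pair it with a process_request that looks up
# the matching attribute on UserAgent (e.g. "random", "chrome", "firefox").
def process_request(self, request, spider):
    request.headers.setdefault('User-Agent', getattr(self.ua, self.ua_type))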
from random import shuffle
from fake_useragent import UserAgent
import linecache
from threading import Thread

channel_url = ""
proxies_file = "Proxies_txt/good_proxy.txt"
processes = []
max_nb_of_threads = 1000
all_proxies = []
nb_of_proxies = 0

# Session setup for requests
ua = UserAgent()
session = Livestreamer()
session.set_option("http-headers", {
    'User-Agent': ua.random,
    "Client-ID": "ewvlchtxgqq88ru9gmfp1gmyt6h2b93"
})


def print_exception():
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    linecache.checkcache(filename)
    line = linecache.getline(filename, lineno, f.f_globals)
    print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno,
def main():
    START = datetime.now()
    bad = ("Our systems have detected unusual traffic from your computer network. "
           "This page checks to see if it's really you sending the requests, and not a robot.")
    parser = argparse.ArgumentParser(description="Get URLS from Bing")
    parser.add_argument(
        'mot',
        nargs="?",
        help="The phrase you want to look up. Put a '+' between the terms. eg: paris+the+city+of+love",
        type=str)
    parser.add_argument('-n',
                        '--number',
                        default=50,
                        help="Minimum number of links you want (default=50)",
                        type=int)
    args = parser.parse_args()
    nb_links = args.number
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    url_lis, ignored_link, total = [], 0, 0
    ua = UserAgent()
    header = {'User-Agent': str(ua.random)}
    print(colored(header, 'yellow'))
    page = requests.get("https://www.bing.com/search?q=" + args.mot.lower(),
                        headers=header)
    soup = BeautifulSoup(page.content, 'html.parser')
    out = open("out.txt", 'w')
    while len(url_lis) < nb_links:
        time.sleep(round(random.uniform(3, 7), 2))  # take it easy, don't get banned...
        # we get the h2 links of the search page
        h2links = soup.findAll("h2")
        good_link = re.findall('<a h="ID=SERP.{7}".href="(http.*?)"', str(h2links))
        for link in good_link:
            total += 1
            if isValid(link) and link not in url_lis:
                out.write(link + '\n')
                print(link)
                url_lis.append(link)
            else:
                ignored_link += 1
        print(colored('{} links gotten'.format(len(url_lis)), 'red'))
        next_url = str(page.content)
        if re.findall(bad, str(next_url)):
            print(colored("they're coming after you, run !", 'red', 'on_yellow'))
            sys.exit(0)
        # we get here the link of the "Next" button
        # If you're not searching from francophone areas, you need to change the title
        # of the link eg: Página siguiente, Volgende pagina, Nächste Seite...
        next_url = re.findall('title="Page suivante" href="(.*?)"', next_url)
        try:
            next_url = "https://www.bing.com" + html.unescape(next_url[0])
        except IndexError:
            print(colored('No more results, sorry', 'yellow'))
            sys.exit(0)
        print('\n', colored(next_url, 'green'), sep="")
        page = requests.get(next_url, headers=header)
        soup = BeautifulSoup(page.content, 'html.parser')
    out.close()
    print('\n\nOutput file : out.txt')
    print(colored('links ignored : ' + str(ignored_link) + ' of ' + str(total), 'blue'))
    END = (datetime.now() - START).total_seconds()
    print(colored("Done in {} secs".format(round(END, 2)), 'yellow'))
import requests
import re
from fake_useragent import UserAgent
import parsel
import mysql_inport_data
import pipline

url = 'http://www.hnzzj.com/'
headers = {'User-Agent': UserAgent().chrome}
response = requests.get(url, headers=headers).text
print(response)
print('----------------------------------------------------------')
# parse the page
html = parsel.Selector(response)
print(html)
# srcs = html.xpath('//div[@class="container clearfix"]//div//ul//li//a/@href').getall()
# print(srcs)
titles = html.xpath(
    '//div[@class="container clearfix"]//div//ul//li//a/text()').getall()
print(titles)
print('--------------------------------------------------------------------------')
hrefs = html.xpath('//div[@class="main pagebox"]//@href').getall()
print(hrefs)
# for href in hrefs:
#     print(href)
furl = 'http://www.hnzzj.com/'
def __init__(self):
    self.ua = UserAgent()
def genUA():
    """returns a fake random user-agent"""
    return str(UserAgent().random)
import time
import json
import hashlib
import requests
from fake_useragent import UserAgent

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": UserAgent().random}


def main():
    def get_salt():
        """return a 13-digit millisecond timestamp"""
        salt = str(int(round(time.time() * 1000)))
        return salt

    def get_sign():
        """build the request sign (md5 of the concatenated string)"""
        sign = "fanyideskweb" + keywords + get_salt() + "6x(ZHw]mwzX#u0V7@yfwK"
        hl = hashlib.md5()
        hl.update(sign.encode(encoding='utf-8'))
        return hl.hexdigest()

    data = {
        'i': keywords,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': get_salt(),
        'sign': get_sign(),
def ret_augst(self):
    ua = UserAgent()
    return {'User-Agent': ua.random}