Example no. 1
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from fake_useragent import UserAgent

BOT_NAME = 'guazi'

SPIDER_MODULES = ['guazi.spiders']
NEWSPIDER_MODULE = 'guazi.spiders'
LOG_LEVEL = 'WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = UserAgent().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
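Note that assigning USER_AGENT = UserAgent().random here draws a single user agent when the settings module is imported and keeps it for the whole crawl. If a fresh value per request is wanted, the usual route is a downloader middleware (see Example no. 11, no. 26 and no. 36 below) enabled from this same file; a minimal sketch, where 'guazi.middlewares.RandomUserAgentMiddleware' is a hypothetical module path, not part of the original project:

# settings.py (sketch): route User-Agent selection through a custom middleware
# and disable Scrapy's built-in one so it does not overwrite the header.
DOWNLOADER_MIDDLEWARES = {
    'guazi.middlewares.RandomUserAgentMiddleware': 400,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}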
Example no. 2
from bs4 import BeautifulSoup
import requests as re
import pandas as pd
from datetime import datetime
import numpy as np
import time
import tqdm
url = "https://www.hltv.org/results?offset="
pd.options.mode.chained_assignment = None  # default='warn'

from fake_useragent import UserAgent
ua = UserAgent(cache=False)


def team_matches(team, df):
    return df.loc[(df['team1'] == team) |
                  (df['team2'] == team)].reset_index(drop=True)


class hltv(object):
    """Scraper for HLTV.org match results pages."""
    def __init__(self, user_agent):
        self.user_agent = user_agent
        self.dataframe = pd.DataFrame()

    def results(self, num):
        df = pd.DataFrame(
            columns=['team1', 'score1', 'score2', 'team2', 'url', 'date'])
        session = re.Session()
        session.headers.update({'User-Agent': self.user_agent})
        for index_of_page in range(num):
Example no. 3
    def __init__(self,
                 login,
                 password,
                 like_per_day=1000,
                 media_max_like=150,
                 media_min_like=0,
                 follow_per_day=0,
                 follow_time=5 * 60 * 60, #Cannot be zero
                 unfollow_per_day=0,
                 start_at_h=0,
                 start_at_m=0,
                 end_at_h=23,
                 end_at_m=59,
                 database_name='follows_db.db',
                 comment_list=[["this", "the", "your"],
                               ["photo", "picture", "pic", "shot", "snapshot"],
                               ["is", "looks", "feels", "is really"],
                               ["great", "super", "good", "very good", "good",
                                "wow", "WOW", "cool", "GREAT", "magnificent",
                                "magical", "very cool", "stylish", "beautiful",
                                "so beautiful", "so stylish", "so professional",
                                "lovely", "so lovely", "very lovely", "glorious",
                                "so glorious", "very glorious", "adorable",
                                "excellent", "amazing"],
                               [".", "..", "...", "!", "!!", "!!!"]],
                 comments_per_day=0,
                 tag_list=['cat', 'car', 'dog'],
                 max_like_for_one_tag=5,
                 unfollow_break_min=15,
                 unfollow_break_max=30,
                 log_mod=0,
                 proxy="",
                 user_blacklist={},
                 tag_blacklist=[],
                 unwanted_username_list=[],
                 unfollow_whitelist=[]):

        self.database_name = database_name
        self.follows_db = sqlite3.connect(database_name, timeout=0, isolation_level=None)
        self.follows_db_c = self.follows_db.cursor()
        check_and_update(self)
        fake_ua = UserAgent()
        self.user_agent = check_and_insert_user_agent(self, str(fake_ua.random))
        self.bot_start = datetime.datetime.now()
        self.bot_start_ts = time.time()
        self.start_at_h = start_at_h
        self.start_at_m = start_at_m
        self.end_at_h = end_at_h
        self.end_at_m = end_at_m
        self.unfollow_break_min = unfollow_break_min
        self.unfollow_break_max = unfollow_break_max
        self.user_blacklist = user_blacklist
        self.tag_blacklist = tag_blacklist
        self.unfollow_whitelist = unfollow_whitelist
        self.comment_list = comment_list
        self.instaloader = instaloader.Instaloader()

        self.time_in_day = 24 * 60 * 60
        # Like
        self.like_per_day = like_per_day
        if self.like_per_day != 0:
            self.like_delay = self.time_in_day / self.like_per_day

        # Follow
        self.follow_time = follow_time #Cannot be zero
        self.follow_per_day = follow_per_day
        if self.follow_per_day != 0:
            self.follow_delay = self.time_in_day / self.follow_per_day

        # Unfollow
        self.unfollow_per_day = unfollow_per_day
        if self.unfollow_per_day != 0:
            self.unfollow_delay = self.time_in_day / self.unfollow_per_day

        # Comment
        self.comments_per_day = comments_per_day
        if self.comments_per_day != 0:
            self.comments_delay = self.time_in_day / self.comments_per_day

        # Don't like if the media has more than n likes.
        self.media_max_like = media_max_like
        # Don't like if the media has fewer than n likes.
        self.media_min_like = media_min_like
        # Auto mod settings:
        # Default list of tags.
        self.tag_list = tag_list
        # Pick a random tag from tag_list and like it (1 to n) times.
        self.max_like_for_one_tag = max_like_for_one_tag
        # log_mod: 0 logs to console, 1 logs to file
        self.log_mod = log_mod
        self.s = requests.Session()
        self.c = requests.Session()
        # If you need a proxy, do something like this:
        # self.s.proxies = {"https" : "http://proxyip:proxyport"}
        # by @ageorgios
        if proxy != "":
            proxies = {
                'http': 'http://' + proxy,
                'https': 'http://' + proxy,
            }
            self.s.proxies.update(proxies)
        # convert login to lower
        self.user_login = login.lower()
        self.user_password = password
        self.bot_mode = 0
        self.media_by_tag = []
        self.media_on_feed = []
        self.media_by_user = []
        self.current_user_info = ''
        self.unwanted_username_list = unwanted_username_list
        now_time = datetime.datetime.now()
        self.check_for_bot_update()
        log_string = 'Instabot v1.2.0/1 started at %s:' % \
                     (now_time.strftime("%d.%m.%Y %H:%M"))
        self.write_log(log_string)
        self.login()
        self.populate_user_blacklist()
        signal.signal(signal.SIGTERM, self.cleanup)
        atexit.register(self.cleanup)
Example no. 4
def get_header():
    ua = UserAgent(verify_ssl=False)
    return {'User-Agent': ua.random}
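A typical way to use a helper like get_header() is to hand the returned dict straight to requests; a minimal usage sketch (the target URL is only an illustration, not from the original):

import requests

# hypothetical usage of get_header() from the example above
resp = requests.get('https://httpbin.org/headers', headers=get_header(), timeout=10)
print(resp.status_code, resp.json()['headers'].get('User-Agent'))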
Example no. 5
def test_fake_update_use_cache_server():
    ua = UserAgent(cache=False, use_cache_server=True)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        ua.update()

        _probe(ua)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
        settings.CACHE_SERVER,
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        with pytest.raises(FakeUserAgentError):
            ua.update()
Example no. 6
def test_fake_update_cache(path):
    assert not os.path.isfile(path)

    ua = UserAgent(path=path, cache=False, use_cache_server=False)

    assert not os.path.isfile(path)

    with pytest.raises(AssertionError):
        ua.update(cache='y')

    ua.update(cache=True)

    assert os.path.isfile(path)

    _probe(ua)
Example no. 7
def resolver(mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set():
        done = 0
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        for tweet in todo:
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            if exit_event.is_set():
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                try:
                    linkscoll.save({'_id': link, 'real': good})
                except Exception as e:
                    log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e))
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        if debug and done:
            left = tweetscoll.count({"links_to_resolve": True})
            log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left))
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
    log("INFO", "FINISHED resolver")
Example no. 8
def test_custom_path():
    location = os.path.join(
        tempfile.gettempdir(),
        'fake_useragent' + uuid.uuid1().hex + '.json',
    )

    ua = UserAgent(path=location)

    assert utils.exist(location)

    check_dict(ua.data)

    mtime = os.path.getmtime(location)

    ua.update()

    assert os.path.getmtime(location) != mtime

    clear(location)
Example no. 9
def test_fake_user_agent_browsers():
    ua = UserAgent(cache=False, use_cache_server=False)

    _probe(ua)

    with pytest.raises(FakeUserAgentError):
        ua.non_existing

    with pytest.raises(FakeUserAgentError):
        ua['non_existing']

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2

    assert data1 is not data2
Example no. 10
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear()
    del ua

    ua = UserAgent()

    assert utils.exist()

    data1 = ua.data

    clear()

    ua.update()

    assert utils.exist()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
Example no. 11
 def __init__(self, crawler):
     super(RandomUserAgentMiddlware, self).__init__()
     # self.user_agent_list = crawler.settings.get("user_agent_list", [])
     self.ua = UserAgent()
     self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")
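The constructor above is only part of the middleware; Scrapy also needs a from_crawler factory and a process_request hook. The completion below is a sketch of the common pattern for this RANDOM_UA_TYPE setting, not the original project's code:

from fake_useragent import UserAgent

class RandomUserAgentMiddlware(object):
    """Sketch: full shape of the middleware whose __init__ is shown above."""

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy builds downloader middlewares through from_crawler
        return cls(crawler)

    def process_request(self, request, spider):
        # RANDOM_UA_TYPE may be "random", "chrome", "firefox", ...;
        # getattr maps it onto the matching fake_useragent property
        request.headers['User-Agent'] = getattr(self.ua, self.ua_type)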
Example no. 12
from fake_useragent import UserAgent

us = UserAgent()
print(us.random)
Example no. 13
#coding=utf-8
!pip install -U fake-useragent
!pip install -U func_timeout
from bs4 import BeautifulSoup
from lxml import html as h
from fake_useragent import UserAgent
from google.colab import drive
from math import ceil
from posixpath import normpath
from urllib.parse import urlencode, urljoin, urlparse, urlunparse
from datetime import date, datetime, timedelta
import pandas as pd
import csv, func_timeout, html, os.path, pickle, re, requests, string, time

#drive.mount('/content/drive')
print(str(UserAgent().random))

@func_timeout.func_set_timeout(30)  # func_set_timeout decorator: the argument is the timeout in seconds (30 here)
def askChoice(slogan):  # wrap the prompt we need to ask into a single function
    inputs = input(f'{slogan}\n')
    return inputs
# When the program runs, askChoice is called first and the timer starts.
# If the user answers within the time limit, the input is returned as usual.
# If the user times out, func_timeout.exceptions.FunctionTimedOut is raised; catch it with try...except and continue from there.
 
def getHTMLText(url, code = 'utf-8'):
    Headers = {'User-Agent':str(UserAgent().random)}
    r = requests.get(url, headers = Headers, timeout = 30)
    r.raise_for_status()
    r.encoding = code
    return r
Example no. 14
def getHTMLText(url, code = 'utf-8'):
    Headers = {'User-Agent':str(UserAgent().random)}
    r = requests.get(url, headers = Headers, timeout = 30)
    r.raise_for_status()
    r.encoding = code
    return r
Example no. 15
import requests
import re
import sys
import time
import os
import argparse
from bs4 import BeautifulSoup
from functools import partial
from multiprocessing import Pool, TimeoutError, cpu_count
from fake_useragent import UserAgent

ua = UserAgent().random

parser = argparse.ArgumentParser(
    description='Argument parser for dork-scanner')
parser.add_argument('-S',
                    '--search',
                    help='String to be searched for',
                    default='1')
parser.add_argument('-E',
                    '--engine',
                    help='Search engine to be used',
                    default='google')
parser.add_argument('-P',
                    '--page',
                    help='Number of pages to search in',
                    default='1')
parser.add_argument('-Pr',
                    '--process',
                    help='Number of parallel processes',
                    default='1')
Example no. 16
 def change_web_scraping_info(self):
     print('Changing user-agent and the proxy...')
     ua = UserAgent()
     self.user_agent = str(ua.random)
Example no. 17
def test_user_agent():
    clear(settings.DB)
    assert not utils.exist(settings.DB)

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    try:
        ua.non_existing
    except FakeUserAgentError:
        pass
    else:
        assert False

    try:
        assert ua['non_existing']
    except FakeUserAgentError:
        pass
    else:
        assert False

    data1 = ua.data

    ua.update(settings.DB)

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
    del ua

    ua = UserAgent()

    assert utils.exist(settings.DB)

    data1 = ua.data

    clear(settings.DB)

    ua.update(settings.DB)

    assert utils.exist(settings.DB)

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
Example no. 18
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver import DesiredCapabilities
# Skip SSL certificate verification
import ssl
# Ignore SSL verification errors
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.proxy import ProxyType
from fake_useragent import UserAgent

from config import phantomjs_driver, generate_cookie_url, chrome_driver, generate_cookie_url, cookie_max_num, \
    cookie_timeout, cookie_retry_num, cookie_interval_time, log_dir, crawl_interval_mintime, crawl_interval_maxtime, \
    base_url1
from proxy_ip import _proxy

ssl._create_default_https_context = ssl._create_unverified_context
ua = UserAgent(verify_ssl=False)
is_clear = False

# Request header setup
user_agent = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER) ",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E) ",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0) ",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
Example no. 19
 def __init__(self, crawler):
     super(RandomUserAgentMiddleware, self).__init__()
     self.ua = UserAgent()
     self.ua_type = crawler.settings.get('RANDOM_UA_TYPE', 'random')
Example no. 20
    def parse(self,pass_url):
        # Parse all commenting users under the current Weibo post
        first_req=requests.get(pass_url+str(1),cookies=self.new_cookies()).content
        if 'not exist' in str(first_req):
            return None
        html = etree.HTML(first_req)
        # Get the page where the previous run was interrupted
        try:
            with open('page_num.txt','r') as f:
                broken_page_num=int(f.readlines()[0])+1
        except:
            broken_page_num=1
        # Total number of comment pages
        try:
            page_num = (html.xpath('//*[@id="pagelist"]/form/div/text()')[1].split('/')[1])[:-1]
        except:
            #print('[-----] page request error')
            return self.parse(pass_url=pass_url)
        for page in range(broken_page_num,int(page_num)+1):
            print(page)
            if page % 5 == 0:
                with open('page_num.txt','w') as f:
                    f.write(str(page))
            fi=set()
            # Save the current progress
            cookies=self.new_cookies()
            #print('[++++++++] current cookies:', str(cookies))
            try:
                req=requests.get(pass_url+str(page),cookies=cookies,headers={"User-Agent":UserAgent().random}).content
                html=etree.HTML(req)
                fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                fans_name=html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
            except:
                while True:
                    #print('[!!!!!] error, no content retrieved:')
                    time.sleep(5)
                    try:
                        req = requests.get(pass_url + str(page),headers={"User-Agent":UserAgent().random},cookies=cookies).content
                        html = etree.HTML(req)
                        fans = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/@href')
                        fans_name = html.xpath('//div[@class="c"]/a[contains(@href,"/u/")]/text()')
                        break
                    except:
                        pass

            for i,j in enumerate(fans):
                # Skip the interfering return link at the bottom of the page
                if '5644764907' in j:
                    continue
                fans_url='https://weibo.cn/'+j.split('/u/')[1]+'/info'
                fans_weibo='https://weibo.cn'+j
                m_url="https://m.weibo.cn/api/container/getIndex?containerid=230283{}_-_INFO&title=%E5%9F%BA%E6%9C%AC%E8%B5%84%E6%96%99&luicode=10000011&lfid=230283{}".format(j.split('/u/')[1],j.split('/u/')[1])
                name=fans_name[i]
                if name in fi:
                    pass
                else:
                    fi.add(name)
                    self.db.lpush(fans_url)
                    self.db1.lpush(fans_weibo)
                    self.db2.lpush(m_url)
                    print('[+++][+++][+++]',name)
                # The most efficient wait time when dealing with IP-based rate limiting
                time.sleep(0.35)
        # After all comments of this Weibo post have been crawled
        time.sleep(1)
        with open('page_num.txt','w') as f:
            f.write('0')
Example no. 21
    else:
        proxies = re.findall(
            re.compile('<td>([\d.]+)</td>'),
            str(requests.get('https://www.sslproxies.org/').content))
        proxies = [
            '%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))
        ]
    print('%d proxies successfully loaded!' % len(proxies))
    proxy = Proxy()
    proxy.proxy_type = ProxyType.MANUAL
    if args.user_agent:
        if path.isfile(args.user_agent):
            user_agents = list(
                filter(None,
                       open(args.user_agent, 'r').read().split('\n')))
        else:
            user_agents = [args.user_agent]
    else:
        user_agents = UserAgent()
    for i in range(args.threads):
        t = Thread(target=bot, args=(i, args.url))
        t.daemon = True
        t.start()
        sleep(uniform(2.0, 4.0))
    stdin.read(1)
    exit(0)
except KeyboardInterrupt:
    exit(0)
except:
    exit(1)
Example no. 22
 def __init__(self):
     self.headers = dict()
     self.headers['User-Agent'] = UserAgent().random
     self.my_session = requests.session()
Example no. 23
    def get_df(self):
        print('get data from services...')
        sfb_df = pd.read_csv(self.path_sfb, sep=';', index_col='id')
        serv_df = sfb_df[sfb_df['type'] == 'services']

        list_url = serv_df['URL'].values
        final_df = pd.DataFrame(columns=[
            'date', 'type', 'category_id', 'category_title', 'site_title',
            'price_new', 'price_old', 'site_unit', 'site_link', 'site_code'
        ])

        #mgts
        n = 0
        url = list_url[n]
        print(url)
        html = requests.get(url, headers={
            'User-Agent': UserAgent().chrome
        }).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_list = soup.findAll('div',
                                  {'class': 'slider_slide'})  # 0 — to be replaced
        for price_elem in price_list:
            if price_elem.findAll('div',
                                  {'class': 'texts'})[0].text == 'Безлимитный':
                price_dict = dict()
                price_dict['date'] = Global().date
                price_dict['site_code'] = 'services'
                id_n = int(serv_df[serv_df['URL'].str.contains(url)].index[0])
                price_dict['category_id'] = id_n
                price_dict['category_title'] = serv_df['cat_title'].loc[
                    price_dict['category_id']]
                price_dict['type'] = 'services'
                price_dict['site_title'] = price_elem.findAll(
                    'div', {'class': 'texts'})[0].text
                price_dict['price_new'] = int(
                    price_elem.findAll('div',
                                       {'class': 'slider_price_val'})[0].text)
                price_dict['price_old'] = ''
                price_dict['site_unit'] = price_elem.findAll(
                    'div', {'class': 'slider_price_rub1'
                            })[0].text + '/' + price_elem.findAll(
                                'div', {'class': 'slider_price_rub2'})[0].text
                price_dict['site_link'] = url
                final_df = final_df.append(price_dict, ignore_index=True)
                break

        # Wash in a public bathhouse, common section, single ticket	http://legkiipar.ru/menraz.html
        try:
            n = 1
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')  # Weekdays from 08:00 to 22:00
            pattern = re.compile(r'Будние дни')
            price_dict = dict()
            price_dict['date'] = Global().date
            price_dict['site_code'] = 'services'
            price_dict['type'] = 'services'
            price_dict['site_title'] = soup(text=pattern)[0]
            price_1 = soup.findAll('span', {'class': 'стиль6'})
            price_dict['price_new'] = re.findall('\d+', price_1[1].text)[0]
            price_dict['price_old'] = ''
            price_dict['site_unit'] = re.findall('\d+ часа',
                                                 price_1[4].text[:-1])[0]
            price_dict['category_id'] = int(
                serv_df[serv_df['URL'].str.contains(url)].index[0])
            price_dict['category_title'] = serv_df['cat_title'].loc[
                price_dict['category_id']].values[0]
            price_dict['site_link'] = url
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # Wash in a public bathhouse, common section, single ticket	http://banya-lefortovo.ru/price.html
        n = 2
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        pattern = re.compile(r'Русская общая баня')
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['type'] = 'services'
        price_dict['price_new'] = int(
            re.findall('\d+',
                       re.findall('\d+ рублей',
                                  soup(text=pattern)[0])[0])[0])
        price_dict['price_old'] = ''
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             soup(text=pattern)[0])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Wash in a public bathhouse, common section, single ticket	https://rzhevskie-bani.ru/rb/bani.html
        n = 3
        price_dict = dict()
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['price_new'] = int(
            re.findall('\d+',
                       soup.findAll('td', {'class': 'price'})[0].text)[0])
        pattern = re.compile(r'Стоимость')
        soup.findAll('td')
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        price_dict['site_title'] = soup(text=pattern)[0]
        price_dict['type'] = 'services'
        price_dict['site_unit'] = re.findall('(\d+.*\d часа)',
                                             soup(text=pattern)[0][-9:])[0]
        price_dict['site_link'] = url
        final_df = final_df.append(price_dict, ignore_index=True)

        # Wash in a public bathhouse, common section, single ticket	http://vorontsovskie-bani.ru/obshchestvennye-bani/muzhskoj-zal-pervyj-razryad
        n = 4
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        try:
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                timeout=10).content
        except:
            proxy = get_proxy(url)
            html = requests.get(url,
                                headers={
                                    'User-Agent': UserAgent().chrome
                                },
                                proxies=proxy).content
        soup = BeautifulSoup(html, 'lxml')
        price_div = soup.findAll('div', {'class': 'price-head'})[0]
        price_dict['price_new'] = int(
            re.findall('\d+',
                       price_div.findAll('span',
                                         {'class': 'price'})[0].text)[0])
        price_dict['price_old'] = ''
        price_dict['site_title'] = price_div.find('p').text.replace(
            '\xa0', ' ')
        price_dict['site_unit'] = re.findall('\d+ часа',
                                             price_dict['site_title'])[0]
        price_dict['type'] = 'services'
        price_dict['site_link'] = url
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        final_df = final_df.append(price_dict, ignore_index=True)

        # Heel-tap fitting, per pair	https://masterskaya-obuvi.ru/tseny
        '''
        n=5
        price_dict=dict()
        price_dict['date']=Global().date
        price_dict['site_code']='services'
        url=list_url[n]
        print(url)
        html=requests.get(url).content#, headers={'User-Agent': UserAgent().chrome}
        soup=BeautifulSoup(html, 'lxml')
        price_dict['category_id']=int(serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('износоустойчивой резины',elem.text)!=[]:
                price_div=elem
                price_dict['site_title']=re.findall('[А-Яа-яёз(). ]+',elem.text)[0]
                price_dict['site_unit']=re.findall('[А-Яа-яёз(). ]+',elem.text)[1]
                price_dict['price_new']=int(price_div.findAll('td',{'width':"356"})[0].text)
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_link']=url
                break

        final_df=final_df.append(price_dict,ignore_index=True)
        '''

        # Heel-tap fitting, per pair	https://masterskaya-obuvi.ru/tseny
        n = 6
        price_dict = dict()
        price_dict['date'] = Global().date
        price_dict['site_code'] = 'services'
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title'].values[0]
        for elem in soup.findAll('tr'):
            if re.findall('эконом', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = self.wspex_space(
                    re.findall(
                        '[А-Яа-яёз(). ]+',
                        price_div.findAll('td', {'align': 'left'})[0].text)[0])
                price_text = price_div.findAll('strong')[0].text
                price_dict['price_new'] = int(re.findall('\d+', price_text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = re.findall(
                    '\([А-Яа-я]*\)', price_dict['site_title'])[0][1:-1]
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Ticket for 1 ride - Mosgortrans
        n = 7
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        #soup.findAll('td')#,{'class':'text-center'})[0]
        price_dict['category_id'] = int(
            serv_df[serv_df['URL'].str.contains(url)].index[0])
        price_dict['category_title'] = serv_df.loc[
            price_dict['category_id']]['cat_title']
        for elem in soup.findAll('td'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                site_title = price_div.text
                break

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # # Haircut
        try:
            n = 8
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  # , headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            # soup.findAll('td')#,{'class':'text-center'})[0]
            for elem in soup.findAll('tr'):
                if re.findall('(любой длины)', elem.text) != []:
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[-1])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_text = elem.text
                    price_dict['site_title'] = re.findall(
                        '[А-Яа-я ()]+', price_text)[0]
                    price_dict['price_new'] = re.findall('\d+', price_text)[0]
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # Haircut
        try:
            n = 9
            price_dict = dict()
            price_dict['site_code'] = 'services'
            price_dict['date'] = Global().date
            url = list_url[n]
            print(url)
            html = requests.get(
                url).content  #, headers={'User-Agent': UserAgent().chrome}
            soup = BeautifulSoup(html, 'lxml')

            for elem in soup.findAll('tr'):
                if re.findall('Женская', elem.text) != []:
                    price_div = elem
                    price_dict['category_id'] = int(
                        serv_df[serv_df['URL'].str.contains(url)].index[0])
                    price_dict['category_title'] = serv_df.loc[
                        price_dict['category_id']]['cat_title'].values[0]
                    price_dict['site_title'] = price_div.find(
                        'td', {
                            'class': 'services-table__name'
                        }).text
                    price_dict['price_new'] = int(
                        self.wspex(
                            price_div.find(
                                'td', {
                                    'class':
                                    'services-table__price services-table__price-small'
                                }).text))
                    price_dict['price_old'] = ''
                    price_dict['type'] = 'services'
                    price_dict['site_unit'] = 'стрижка'
                    price_dict['site_link'] = url
                    break
            final_df = final_df.append(price_dict, ignore_index=True)
        except:
            print('DAMN! {} can not be parsed'.format(url))

        # Haircut
        n = 10
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')
        for elem in soup.findAll('tr'):
            if re.findall('лопаток', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(list_url[n -
                                                                 1])].index[0])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title'].values[0]
                price_dict['site_title'] = price_div.find(
                    'td', {
                        'height': '17'
                    }).text
                price_dict['price_new'] = int(
                    self.wspex(price_div.find('td', {
                        'width': '157'
                    }).text))
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'стрижка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)

        # Ticket for 1 ride - Mosgortrans
        n = 11
        price_dict = dict()
        price_dict['site_code'] = 'services'
        price_dict['date'] = Global().date
        url = list_url[n]
        print(url)
        html = requests.get(
            url).content  #, headers={'User-Agent': UserAgent().chrome}
        soup = BeautifulSoup(html, 'lxml')

        for elem in soup.findAll('tr'):
            if re.findall('не более', elem.text) != []:
                price_div = elem
                price_dict['category_id'] = int(
                    serv_df[serv_df['URL'].str.contains(url)].index[-1])
                price_dict['category_title'] = serv_df.loc[
                    price_dict['category_id']]['cat_title']
                price_dict['site_title'] = price_div.find('td').text
                price_dict['price_new'] = int(
                    re.findall('\d{2,3}', price_div.text)[0])
                price_dict['price_old'] = ''
                price_dict['type'] = 'services'
                price_dict['site_unit'] = 'поездка'
                price_dict['site_link'] = url
                break
        final_df = final_df.append(price_dict, ignore_index=True)
        final_df = final_df[final_df.site_title.notna()]
        print('ALL SERVICES HAVE BEEN SUCCESSFULLY PARSED!')
        return final_df
Example no. 24
from bs4 import BeautifulSoup
import os
import urllib.parse as rep
import urllib.request as req
from fake_useragent import UserAgent
opener = req.build_opener()
opener.addheaders = [('User-agent', UserAgent().chrome)]
req.install_opener(opener)

base = "https://search.naver.com/search.naver?sm=tab_hty.top&where=image&query="
quote = req.quote('카구야')
url = base + quote

print('Request URL : {}'.format(url))
res = req.urlopen(url)
savePath = "C:/Users/hsm01/Desktop/web/crawl/result2"
try:
    if not (os.path.isdir(savePath)):
        os.makedirs(os.path.join(savePath))
except OSError as e:
    print("folder creation failed")
    print(f"folder name : {e.filename}")
    raise RuntimeError("system exit!")
else:
    print("folder created")

soup = BeautifulSoup(res, "html.parser")
# print(soup.prettify())

img_list = soup.select('div.img_area> a.thumb._thumb > img')
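The example stops right after selecting the image tags; a hedged continuation under the assumption that each selected tag exposes its image URL in a src-like attribute (the attribute name and the file naming are guesses, not taken from the original):

# Hypothetical continuation: download each selected image into savePath.
for i, img in enumerate(img_list, start=1):
    src = img.get('data-source') or img.get('src')  # attribute name is an assumption
    if not src:
        continue
    target = os.path.join(savePath, 'img{:03d}.jpg'.format(i))
    req.urlretrieve(src, target)  # urllib.request was imported as req above
    print('saved:', target)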
Example no. 25
            print(proxy.http_proxy)
            chrome_options.add_argument('user-agent="{}"'.format(agent.random))
            capabilities = webdriver.DesiredCapabilities.CHROME
            proxy.add_to_capabilities(capabilities)
            driver = webdriver.Chrome(options=chrome_options,
                                      desired_capabilities=capabilities)
            driver.get(args.url)
            sleep(args.duration)
            driver.close()
    except:
        _exit(0)


if args.proxies:
    proxies = open(args.proxies, 'r').read().split('\n')
else:
    proxies = re.findall(
        re.compile('<td>([\d.]+)</td>'),
        str(requests.get('https://free-proxy-list.net/').content))
    proxies = ['%s:%s' % x for x in list(zip(proxies[0::2], proxies[1::2]))]
print('%d proxies successfully loaded!' % len(proxies))
proxy = Proxy()
proxy.proxy_type = ProxyType.MANUAL
agent = UserAgent()
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--mute-audio')
for i in range(args.threads):
    t = Thread(target=bot, args=(args.url, ))
    t.daemon = True
    t.start()
    sleep(uniform(1.5, 3.0))
Example no. 26
 def process_request(self, request, spider):
     ua = UserAgent()
     request.headers['User-Agent'] = ua.random
Example no. 27
def proxy_chrome(PROXY_HOST,PROXY_PORT,PROXY_USER,PROXY_PASS):
    manifest_json = """
            {
                "version": "1.0.0",
                "manifest_version": 2,
                "name": "Chrome Proxy",
                "permissions": [
                    "proxy",
                    "tabs",
                    "unlimitedStorage",
                    "storage",
                    "<all_urls>",
                    "webRequest",
                    "webRequestBlocking"
                ],
                "background": {
                    "scripts": ["background.js"]
                },
                "minimum_chrome_version":"22.0.0"
            }
            """

    background_js = """
    var config = {
            mode: "fixed_servers",
            rules: {
              singleProxy: {
                scheme: "http",
                host: "%(host)s",
                port: parseInt(%(port)d)
              },
              bypassList: ["foobar.com"]
            }
          };
    chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
    function callbackFn(details) {
        return {
            authCredentials: {
                username: "******",
                password: "******"
            }
        };
    }
    chrome.webRequest.onAuthRequired.addListener(
                callbackFn,
                {urls: ["<all_urls>"]},
                ['blocking']
    );
        """ % {
            "host": PROXY_HOST,
            "port": PROXY_PORT,
            "user": PROXY_USER,
            "pass": PROXY_PASS,
        }


    pluginfile = 'extension\\proxy_auth_plugin.zip'

    with zipfile.ZipFile(pluginfile, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    co = Options()
    # extension support is not possible in incognito mode for now
    # co.add_argument('--incognito')
    co.add_argument('--disable-gpu')
    # disable infobars
    co.add_argument('--disable-infobars')
    co.add_argument("--start-maximized")
    ua = UserAgent()
    userAgent = ua.random
    co.add_argument('--user-agent="' + str(userAgent) + '"')
    co.add_experimental_option("excludeSwitches",["ignore-certificate-errors"])
    # location of chromedriver, please change it according to your project.
    chromedriver = os.getcwd()+'\\Chromedriver\\chromedriver.exe'
    co.add_extension(pluginfile)
    driver = webdriver.Chrome(chromedriver,chrome_options=co)
    # return the driver with added proxy configuration.
    return driver
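For reference, calling the function might look like this; the host, port and credentials below are placeholders, not values from the original project:

# Hypothetical usage of proxy_chrome() with a documentation-range IP.
driver = proxy_chrome('203.0.113.10', 3128, 'proxy_user', 'proxy_pass')
driver.get('https://httpbin.org/ip')   # should report the proxy's address
print(driver.page_source)
driver.quit()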
Example no. 28
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'movie'

SPIDER_MODULES = ['movie.spiders']
NEWSPIDER_MODULE = 'movie.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
from fake_useragent import UserAgent
ua = UserAgent()
USER_AGENT = ua.random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
Example no. 29
 def __init__(self):
     self.ua = UserAgent(use_cache_server=False)
Example no. 30
from requests import Session as _Session
from requests.exceptions import ConnectionError, ChunkedEncodingError, Timeout, HTTPError
from requests.adapters import HTTPAdapter
import logging
import time
from .cookiejar import ClientCookieJar

try:
    from fake_useragent import UserAgent
except ImportError:
    UserAgent = None
    ua = None
    ua_str = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
else:
    ua = UserAgent(
        fallback=
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'
    )
    ua_str = ua.chrome

session_logger = logging.getLogger('showroom.session')


class ClientSession(_Session):
    """
    Wrapper for requests.Session.

    Mainly used to catch temporary errors and set a Timeout

    Overrides requests.Session.get() and increases max pool size

    Raises:
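The docstring above is cut off, but it already states the intent: a bigger connection pool, a default timeout, and retries on temporary errors. A minimal sketch of how such a wrapper is commonly written with the imports from this example (class name, pool size and retry behaviour are assumptions, not the project's actual values):

class PooledSession(_Session):
    """Illustrative sketch only, not the original ClientSession implementation."""

    def __init__(self, pool_maxsize=100, default_timeout=10):
        super().__init__()
        self.default_timeout = default_timeout
        adapter = HTTPAdapter(pool_connections=pool_maxsize, pool_maxsize=pool_maxsize)
        self.mount('http://', adapter)
        self.mount('https://', adapter)
        self.headers['User-Agent'] = ua_str  # falls back to the hard-coded Chrome UA

    def get(self, url, **kwargs):
        kwargs.setdefault('timeout', self.default_timeout)
        try:
            return super().get(url, **kwargs)
        except (ConnectionError, ChunkedEncodingError, Timeout) as e:
            session_logger.warning('retrying %s after %s', url, e)
            time.sleep(1)
            return super().get(url, **kwargs)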
Example no. 31
import time
import json
import hashlib
import requests
from fake_useragent import UserAgent

url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule"
headers = {"User-Agent": UserAgent().random}


def main():
    def get_salt():
        """获取13位时间戳"""
        salt = str(int(round(time.time() * 1000)))
        return salt

    def get_sign():
        """Return the sign: md5 hexdigest of client + keywords + salt + secret."""
        sign = "fanyideskweb" + keywords + get_salt() + "6x(ZHw]mwzX#u0V7@yfwK"
        hl = hashlib.md5()
        hl.update(sign.encode(encoding='utf-8'))
        return hl.hexdigest()

    data = {
        'i': keywords,
        'from': 'AUTO',
        'to': 'AUTO',
        'smartresult': 'dict',
        'client': 'fanyideskweb',
        'salt': get_salt(),
        'sign': get_sign(),
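The data dict is cut off here (more form fields follow in the original). Once it is complete, the call itself is ordinarily a plain form POST using the module-level url and headers; the response handling below is an assumption, since the original does not show it:

    # Hypothetical final step inside main(), once the data dict is complete:
    response = requests.post(url, data=data, headers=headers, timeout=10)
    print(json.loads(response.text))  # json and requests are imported at the top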
Example no. 32
# Test ttk
# from tkinter import *
# from tkinter import ttk
# help(ttk.Button)
# root=Tk()
# ttk.Style().configure("TButton",padding=15,relief=FLAT,background="green",foreground="blue")
# # On macOS the Button still seems unable to change its bg
# ttk.Button(root,text="撒放放假啊阱n").pack()
#
# Button(root,text="NO.2",relief=SUNKEN,fg="purple").pack()
# # The tk button still doesn't display its text either
# mainloop()

from fake_useragent import UserAgent

headers = {'User-Agent': UserAgent().random}
print(headers)
print('==================================')

str1 = "sasas, sajfqn http:http\n llsaajghttps"
a = str1.rfind("https")
print(a)

print('==================================')
with open("Burning.txt", 'a+') as f:
    # f.write("sssak\n")
    f.seek(0, 0)
    # print(f.read())
    print(f.readlines())
print('==================================')
Example no. 33
def genUA():
    """returns a fake random user-agent"""
    return str(UserAgent().random)
Example no. 34
from urllib.request import Request, build_opener
from urllib.request import HTTPHandler
from fake_useragent import UserAgent

url = 'http://httpbin.org/get'

headers = {
    'User-Agent':UserAgent().random
}

request = Request(url, headers=headers)
handler = HTTPHandler(debuglevel=1)
opener = build_opener(handler)
response = opener.open(request)
# print(response.read().decode())

Example no. 35
def main():

    START = datetime.now()
    bad = 'Our systems have detected unusual traffic from your computer network.  This page checks to see if it&#39;s really you sending the requests, and not a robot.'
    parser = argparse.ArgumentParser(description="Get URLS from Bing")
    parser.add_argument(
        'mot',
        nargs="?",
        help=
        "The phrase you want to look up. Put a '+' between the terms. eg: paris+the+city+of+love",
        type=str)
    parser.add_argument('-n',
                        '--number',
                        default=50,
                        help="Minimum number of links you want (default=50)",
                        type=int)
    args = parser.parse_args()
    nb_links = args.number

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    url_lis, ignored_link, total = [], 0, 0
    ua = UserAgent()
    header = {'User-Agent': str(ua.random)}
    print(colored(header, 'yellow'))

    page = requests.get("https://www.bing.com/search?q=" + args.mot.lower(),
                        headers=header)
    soup = BeautifulSoup(page.content, 'html.parser')

    out = open("out.txt", 'w')

    while len(url_lis) < nb_links:

        time.sleep(round(random.uniform(3, 7),
                         2))  # take it easy, don't get banned...

        # we get the h2 links of the search page
        h2links = soup.findAll("h2")
        good_link = re.findall('<a h="ID=SERP.{7}".href="(http.*?)"',
                               str(h2links))
        for link in good_link:
            total += 1
            if isValid(link) and link not in url_lis:
                out.write(link + '\n')
                print(link)
                url_lis.append(link)
            else:
                ignored_link += 1
        print(colored('{} links gotten'.format(len(url_lis)), 'red'))
        next_url = str(page.content)
        if re.findall(bad, str(next_url)):
            print(
                colored("they're coming after you, run !", 'red', 'on_yellow'))
            sys.exit(0)

        # Here we get the link of the "Next" button.

        # If you're not searching from a francophone area, you need to change the link title accordingly, e.g.: Página siguiente, Volgende pagina, Nächste Seite...
        next_url = re.findall('title="Page suivante" href="(.*?)"', next_url)
        try:
            next_url = "https://www.bing.com" + html.unescape(next_url[0])
        except IndexError:
            print(colored('No more results, sorry', 'yellow'))
            sys.exit(0)
        print('\n', colored(next_url, 'green'), sep="")
        page = requests.get(next_url, headers=header)
        soup = BeautifulSoup(page.content, 'html.parser')
    out.close()
    print('\n\nOutput file : out.txt')
    print(
        colored('links ignored : ' + str(ignored_link) + ' of ' + str(total),
                'blue'))
    END = (datetime.now() - START).total_seconds()
    print(colored("Done in {} secs".format(round(END, 2)), 'yellow'))
Example no. 36
 def __init__(self,crawler):
     super(RandomUserAgentMiddlware,self).__init__()
     self.ua = UserAgent()
Example no. 37
 def __init__(self):
     self.ua = UserAgent()
Example no. 38
            api_args['max_id'] = min(api_args.get('max_id', tw['id']), tw['id']-1)
            metas = prepare_tweet(tw)
            metas.pop('_id')
            tw.update(metas)
            for po in ['user', 'entities', 'extended_entities']:
                if po in tw:
                    tw.pop(po)
            db.tweets.update({'_id': tw['id']}, {"$set": tw}, upsert=True)
        print "...collected %s new tweets" % len(tweets)
        tweets = api.call('statuses.user_timeline', api_args)
    db.users.update({'_id': user['twitter']}, {"$set": {"done": True}})


# TODO: refacto all of this with gazouilloire/run.py

ua = UserAgent()
ua.update()
todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
left = tweetscoll.count({"links_to_resolve": True})
print "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left
while todo:
    done = 0
    urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
    alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
    tweetsdone = []
    batchidsdone = set()
    for tweet in todo:
        if tweet.get("proper_links", []):
            tweetsdone.append(tweet["_id"])
            continue
        tweetid = tweet.get('retweet_id') or tweet['_id']
Example no. 39
def test_fake_update():
    ua = UserAgent(cache=False, use_cache_server=False)

    ua.update()

    _probe(ua)