Example #1
def test_fake_update_use_cache_server():
    ua = UserAgent(cache=False, use_cache_server=True)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
    ]

    with mock.patch(
            'fake_useragent.utils.Request',
            side_effect=partial(_request, denied_urls=denied_urls),
    ):
        ua.update()

        _probe(ua)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
        settings.CACHE_SERVER,
    ]

    with mock.patch(
            'fake_useragent.utils.Request',
            side_effect=partial(_request, denied_urls=denied_urls),
    ):
        with pytest.raises(FakeUserAgentError):
            ua.update()
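
The _request and _probe helpers patched in above are test fixtures that are not shown in this listing; a minimal sketch of what they might look like (names and behaviour are assumptions, not the library's actual test code):

from urllib.error import URLError
from urllib.request import Request


def _request(*args, denied_urls=(), **kwargs):
    # Refuse any URL on the deny list to simulate an unreachable data source.
    url = args[0] if args else kwargs.get('url', '')
    for denied in denied_urls:
        if url.startswith(denied):
            raise URLError('denied: %s' % url)
    return Request(*args, **kwargs)


def _probe(ua):
    # Touch a few attributes to confirm browser data was actually loaded.
    assert ua.chrome and ua.firefox and ua.safari and ua.random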
Example #2
    def __init__(self):
        super().__init__()
        proxy = QNetworkProxy()
        proxy.setType(QNetworkProxy.HttpProxy)
        proxy.setHostName("109.173.124.250")
        proxy.setPort(7793)
        QNetworkProxy.setApplicationProxy(proxy)
        try:
            print("1")
            ua = UserAgent()
            ua.update()
            useragent = ua.random
        except FakeUserAgentError:
            print("2")
            useragent = "Mozilla / 5.0 (Windows NT 10.0; Win64; x64) AppleWebKit / 537.36 (KHTML, как Gecko) Chrome / " \
                        "72.0.3626.121 Safari / 537.36"

        profile = QWebEngineProfile()
        profile.setHttpUserAgent(useragent)
        page = QWebEnginePage(profile)
        page.setUrl(QUrl("https://www.instagram.com/"))
        self.setPage(page)
        self.page().proxyAuthenticationRequired.connect(
            self.handle_proxy_auth_req)

        self.imit_peop = threading.Thread(target=self.imitation_people,
                                          args=())
        self._timer = QTimer()
        self.loadFinished.connect(self.startTimer)
        self._timer.timeout.connect(self.start_thread_imitation)
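
The handle_proxy_auth_req slot connected above is not shown; a plausible sketch (the credentials are placeholders, not values from the original example):

    def handle_proxy_auth_req(self, url, authenticator, proxy_host):
        # Fill in the proxy credentials requested by QWebEnginePage.
        authenticator.setUser("proxy_user")          # assumed username
        authenticator.setPassword("proxy_password")  # assumed password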
Example #3
def resolver(pile_links, mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set() or not pile_links.empty():
        todo = []
        while not pile_links.empty() and len(todo) < 10:
            todo.append(pile_links.get())
        if not todo:
            if not exit_event.is_set():
                time.sleep(1)
            continue
        done = 0
        for tweet in todo:
            gdlinks = []
            for link in tweet["links"]:
                good = linkscoll.find_one({'_id': link})
                if good:
                    gdlinks.append(good['real'])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                linkscoll.save({'_id': link, 'real': good})
                if link != good:
                    done += 1
            tweetscoll.update({'_id': tweet['_id']}, {'$set': {'proper_links': gdlinks}}, upsert=False)
        if debug and done:
            log("DEBUG", "[links] +%s links resolved (out of %s/%s)" % (done, len(todo), pile_links.qsize()))
    log("INFO", "FINISHED resolver")
Example #4
class RotateUserAgentMiddleware(UserAgentMiddleware):
    def __init__(self, user_agent=''):
        self.user_agent = user_agent
        self.ua = UserAgent(fallback=self.user_agent)
        cache_file = settings.DB

        ONEDAY_DELTA = 86400
        now = time.time()

        cache_age = 0
        try:
            cache_age = getmtime(cache_file)
        except OSError:
            pass

        if cache_age <= now - ONEDAY_DELTA:
            self.ua.update()

        with open(cache_file, mode='rb') as fp:
            browser_data = json.load(fp)
            test = browser_data.get('browsers', {}).get('chrome', [])
            if not test:
                d = dirname(sys.modules["alascrapy"].__file__)
                backup_filename = 'fake_useragent_%s.json' % settings.__version__
                copyfile(join(d, backup_filename), cache_file)

    def process_request(self, request, spider):
        manual_user_agent = request.meta.get('User-Agent', None)
        if manual_user_agent:
            request.headers['User-Agent'] = manual_user_agent
        else:
            new_user_agent = self.ua.random
            if new_user_agent:
                request.headers['User-Agent'] = new_user_agent
Example #5
    def process_request(self, request, spider):
        # set the User-Agent value
        try:
            ua = UserAgent()
            ua.update() 
            request.headers.setdefault('User-Agent', ua.random)
        except FakeUserAgentError as e:
            print(e)
            

# # Generate random proxies to get around IP-based anti-scraping checks
# class RandomProxy(object):
#     def __init__(self, iplist):
#         self.iplist = iplist
#     @classmethod
#     def from_crawler(cls, crawler):
#         return cls(crawler.settings.getlist('IPLIST'))
#     def process_request(self, request, spider):
#         '''
#         Attach a proxy to the request
#         :param request:
#         :param spider:
#         :return:
#         '''
#         proxy = random.choice(self.iplist)
#         request.meta['proxy'] = proxy
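
If the commented-out RandomProxy were enabled, it would need Scrapy settings along these lines (module path, priority, and proxy addresses are illustrative assumptions):

DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomProxy': 750,  # hypothetical module path
}
IPLIST = [
    'http://10.0.0.1:8080',  # placeholder proxy endpoints
    'http://10.0.0.2:8080',
]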
Example #6
def test_fake_update_use_cache_server():
    ua = UserAgent(cache=False, use_cache_server=True)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        ua.update()

        _probe(ua)

    denied_urls = [
        'https://www.w3schools.com/browsers/browsers_stats.asp',
        'http://useragentstring.com/pages/useragentstring.php',
        settings.CACHE_SERVER,
    ]

    with mock.patch(
        'fake_useragent.utils.Request',
        side_effect=partial(_request, denied_urls=denied_urls),
    ):
        with pytest.raises(FakeUserAgentError):
            ua.update()
Example #7
def main():

    logging.basicConfig(
        level=logging.INFO,
        format=
        '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
    )

    log_file_handler = RotatingFileHandler('/tmp/fake_useragent.log',
                                           maxBytes=1024**2,
                                           encoding='utf-8')
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s'
    )
    log_file_handler.setFormatter(formatter)
    logger = logging.getLogger()
    logger.addHandler(log_file_handler)
    logging.info('fake_useragent fetching data now...')

    try:
        ua = UserAgent()
        ua.update()
    except FakeUserAgentError as e:
        logging.error('fake_useragent failed to fetch data, exception occurred: %s' % e,
                      exc_info=True)
Example #8
 def __init__(self, path="./phantomjs"):
     useragent = UserAgent()
     useragent.update()
     dcap = dict(DesiredCapabilities.PHANTOMJS)
     dcap['phantomjs.page.settings.userAgent'] = useragent.random
     self.processEngine = webdriver.PhantomJS(executable_path=path,
                                              desired_capabilities=dcap)
Example #9
    def _get_headers() -> Dict:
        """
        Return header dict with random User-Agent to support request
        and to avoid being blocked by the server
        """

        ua = UserAgent()
        ua.update()

        return {"User-Agent": ua.random}
Example #10
def get_user_agents_generator(verbose=False, verify_ssl=False):
    if verbose:
        print("retriving updated user-agent list...")

    ua = UserAgent(verify_ssl=verify_ssl)
    ua.update()

    if verbose:
        print("Done.")

    return ua
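
A usage sketch (the calling code is an assumption):

ua = get_user_agents_generator(verbose=True)
for _ in range(3):
    print(ua.random)  # a different random User-Agent on each access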
Example #11
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear()
    del ua

    ua = UserAgent()

    assert utils.exist()

    data1 = ua.data

    clear()

    ua.update()

    assert utils.exist()

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2
Example #12
def test_user_agent():
    clear()
    assert not utils.exist()

    ua = UserAgent(cache=False)

    assert not ua.ie is None
    assert not ua.msie is None
    assert not ua.internetexplorer is None
    assert not ua.internet_explorer is None
    assert not ua['internet explorer'] is None
    assert not ua.google is None
    assert not ua.chrome is None
    assert not ua.googlechrome is None
    assert not ua.google_chrome is None
    assert not ua['google chrome'] is None
    assert not ua.firefox is None
    assert not ua.ff is None
    assert not ua.ie is None
    assert not ua.safari is None
    assert not ua.random is None
    assert not ua['random'] is None

    assert ua.non_existing is None
    assert ua['non_existing'] is None

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2
    assert not data1 is data2

    clear()
    del ua

    ua = UserAgent()

    assert utils.exist()

    data1 = ua.data

    clear()

    ua.update()

    assert utils.exist()

    data2 = ua.data

    assert data1 == data2
    assert not data1 is data2
Example #13
def post(CPM, rawCPM, wrong, words, ip, keys, char):
    ua = UserAgent()
    ua.update()
    header = {
        'Origin': 'https://typing-speed-test.aoeu.eu',
        'User-Agent': str(ua.chrome)
    }
    return requests.post(STATS_URL,
                         createParameters(CPM, rawCPM, wrong, words, ip, keys,
                                          char, 0, 0, 0, 60),
                         headers=header)
Example #14
class FakeUA:
    """
    Provides user agents as a singleton: use fake_useragent when it works, otherwise fall back to the built-in UA list; get a random UA via FakeUA.random
    """
    def __new__(cls, *args, **kwargs):
        if not hasattr(cls, 'instance'):
            cls.instance = super(FakeUA, cls).__new__(cls)
        return cls.instance

    def __init__(self, if_update_fake_ua=False):
        self.fake_ua = None
        try:
            if if_update_fake_ua:
                self.fake_ua = UserAgent(path='fake_useragent%s.json' %
                                         VERSION)
                self.fake_ua.update()
            else:
                raise FakeUserAgentError()
        except FakeUserAgentError:
            self.some = [
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60',
                'Opera/8.0 (Windows NT 5.1; U; en)',
                'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
                'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36',
                'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
                'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)"',
                'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
                'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
                'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
                'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
                'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
            ]

    def __getattr__(self, item):
        try:
            if item == 'random':
                if self.fake_ua is not None:
                    return self.fake_ua.random
                else:
                    return random.choice(self.some)
        except KeyError:
            raise AttributeError(r"Object does'n has attribute '%s'" % item)
Example #15
def resolve_links(db):
    tweetscoll = db['tweets']
    linkscoll = db['links']
    ua = UserAgent()
    ua.update()
    todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
    left = tweetscoll.count({"links_to_resolve": True})
    print >> sys.stderr, "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left
    while todo:
        done = 0
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        ct = 0
        for tweet in todo:
            ct += 1
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                print >> sys.stderr, "    %s / %s  " % (ct, left), link
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                alreadydone[link] = good
                try:
                    linkscoll.save({'_id': link, 'real': good})
                    if good != link:
                        print >> sys.stderr, "              ->", good
                except Exception as e:
                    print >> sys.stderr, "- WARNING: Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e)
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
        if done:
            left = tweetscoll.count({"links_to_resolve": True})
            print >> sys.stderr, "- [LINKS RESOLVING] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left)
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
Example #16
def test_fake_update_cache(path):
    assert not os.path.isfile(path)

    ua = UserAgent(path=path, cache=False, use_cache_server=False)

    assert not os.path.isfile(path)

    with pytest.raises(AssertionError):
        ua.update(cache='y')

    ua.update(cache=True)

    assert os.path.isfile(path)

    _probe(ua)
Example #18
def user_agent(browser) -> str:
    result = BROWSER.get(browser)
    if result is None:
        global _ua
        if _ua is None:
            _ua = UserAgent(fallback=BAIDUSPIDER_USER_AGENT)
        else:
            try:
                delta = now() - fromtimestamp(getmtime(_ua.path))
                if delta.days >= 7:
                    _ua.update()
            except FileNotFoundError:
                pass

        result = _ua[browser]
    return result
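
The example above relies on module-level names that are not shown; a plausible sketch (every value here is an assumption):

from datetime import datetime
from os.path import getmtime

from fake_useragent import UserAgent

now = datetime.now
fromtimestamp = datetime.fromtimestamp

BAIDUSPIDER_USER_AGENT = (
    'Mozilla/5.0 (compatible; Baiduspider/2.0; '
    '+http://www.baidu.com/search/spider.html)'
)
BROWSER = {}  # optional mapping of browser aliases to fixed User-Agent strings
_ua = None    # lazily created, shared UserAgent instance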
Example #19
class RandomUserAgentMiddleware(object):
    # Rotate the User-Agent randomly on each request
    def __init__(self, crawler):
        super(RandomUserAgentMiddleware, self).__init__()
        self.ua = UserAgent()
        self.ua.update()
        self.ua_type = crawler.settings.get('RANDOM_TYPE', 'random')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            return getattr(self.ua, self.ua_type)

        request.headers.setdefault('User-Agent', get_ua())
Example #20
def resolver(mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set():
        done = 0
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        for tweet in todo:
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            if exit_event.is_set():
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                alreadydone[link] = good
                try:
                    linkscoll.save({'_id': link, 'real': good})
                except Exception as e:
                    log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e))
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        if debug and done:
            left = tweetscoll.count({"links_to_resolve": True})
            log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left))
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
    log("INFO", "FINISHED resolver")
Example #21
def resolver(mongoconf, exit_event, debug=False):
    ua = UserAgent()
    ua.update()
    db = MongoClient(mongoconf['host'], mongoconf['port'])[mongoconf['db']]
    linkscoll = db['links']
    tweetscoll = db['tweets']
    while not exit_event.is_set():
        done = 0
        todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
        urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
        alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
        tweetsdone = []
        batchidsdone = set()
        for tweet in todo:
            if tweet.get("proper_links", []):
                tweetsdone.append(tweet["_id"])
                continue
            tweetid = tweet.get('retweet_id') or tweet['_id']
            if tweetid in batchidsdone:
                continue
            if exit_event.is_set():
                continue
            gdlinks = []
            for link in tweet.get("links", []):
                if link in alreadydone:
                    gdlinks.append(alreadydone[link])
                    continue
                good = resolve_url(link, user_agent=ua)
                gdlinks.append(good)
                try:
                    linkscoll.save({'_id': link, 'real': good})
                except Exception as e:
                    log("WARNING", "Could not store resolved link %s -> %s because %s: %s" % (link, good, type(e), e))
                if link != good:
                    done += 1
            tweetscoll.update({'$or': [{'_id': tweetid}, {'retweet_id': tweetid}]}, {'$set': {'proper_links': gdlinks, 'links_to_resolve': False}}, upsert=False, multi=True)
            batchidsdone.add(tweetid)
        if debug and done:
            left = tweetscoll.count({"links_to_resolve": True})
            log("DEBUG", "[links] +%s new redirection resolved out of %s links (%s waiting)" % (done, len(todo), left))
        # clear tweets potentially rediscovered
        if tweetsdone:
            tweetscoll.update({"_id": {"$in": tweetsdone}}, {"$set": {"links_to_resolve": False}}, upsert=False, multi=True)
    log("INFO", "FINISHED resolver")
Example #22
def test_custom_path():
    location = os.path.join(
        tempfile.gettempdir(),
        'fake_useragent' + uuid.uuid1().hex + '.json',
    )

    ua = UserAgent(path=location)

    assert utils.exist(location)

    check_dict(ua.data)

    mtime = os.path.getmtime(location)

    ua.update()

    assert os.path.getmtime(location) != mtime

    clear(location)
Example #23
def test_fake_user_agent_browsers():
    ua = UserAgent(cache=False, use_cache_server=False)

    _probe(ua)

    with pytest.raises(FakeUserAgentError):
        ua.non_existing

    with pytest.raises(FakeUserAgentError):
        ua['non_existing']

    data1 = ua.data

    ua.update()

    data2 = ua.data

    assert data1 == data2

    assert data1 is not data2
Example #25
def main(argv):
    ua = UserAgent()
    ua.update()
    base_url = argv[1]
    url = 'http://web.archive.org/__wb/calendarcaptures?url=http://' + base_url + '&selected_year=2017'
    headers = {'User-Agent': ua.random}
    response = requests.get(url, headers=headers)
    json_obj = json.loads(response.text)
    for month in json_obj:
        if month:
            for week in month:
                if week:
                    for day in week:
                        if day:
                            for ts in day['ts']:
                                time_stamps.add(ts)
    for ts in time_stamps:
        urls.append('http://web.archive.org/web/' + str(ts) + '/http://' +
                    base_url)
    print urls
Example #26
def random_header(logger):
    # Create a dict of accept headers for each user-agent.
    accepts = {
        "Firefox":
        "text/html, application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Safari, Chrome":
        "application/xhtml+xml,application/xml, text/html;q=0.9, text/plainlq=0.8,image/png,*/*,q=0.5"
    }

    # Get a random user-agent. We used Chrome and Firefox user agents.
    # More at: https://pypi.org/project/fake-useragent/
    try:
        # Getting a user agent using the fake_useragent package
        ua = UserAgent(cache=True)
        ua.update()
        if random.random() > 0.5:
            random_user_agent = ua.chrome
        else:
            random_user_agent = ua.firefox

    # In case there's a problem with the fake-useragent package, we still want the scraper to function
    # so there's a list of ua's (https://developers.whatismybrowser.com/) that we created and swap to another ua.
    # Be aware of a need to update the list periodically.
    except FakeUserAgentError as error:
        # Save a message to the logs file.
        logger.error(
            "FakeUserAgent didn't work. Generating headers from the pre-defined list of headers. error: {}"
            .format(error))
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
            "Mozilla/5.0 (Windows NT 5.1; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
        ]
        random_user_agent = random.choice(user_agents)

        # Create the headers dictionary; the Accept header must match the chosen user agent.
    finally:
        valid_accept = accepts["Firefox"] if random_user_agent.find(
            "Firefox") > 0 else accepts["Safari, Chrome"]
        headers = {"User-Agent": random_user_agent, "Accept": valid_accept}
    return headers
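
A usage sketch (the logger setup is illustrative):

import logging

logging.basicConfig(level=logging.INFO)
headers = random_header(logging.getLogger(__name__))
print(headers["User-Agent"])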
Example #27
    def __init__(
        self,
        username: str,
        password: str,
        region: RegionChoice = RegionChoice.US,
        quality: QualitySize = QualitySize.LARGE_256k,
        user_agent: Optional[str] = None,
        update_handler: Optional[Callable[[dict], None]] = None,
    ):

        self._log = logging.getLogger(__file__)

        if user_agent is None:
            try:
                ua = UserAgent(use_cache_server=False)
                ua.update()
                user_agent = ua.chrome
            except Exception:
                user_agent = FALLBACK_UA
        self._ua = user_agent_parser.Parse(user_agent)

        self.reset_session()

        self.username = username
        self.password = password
        self.region = region
        self.stream_quality = quality

        self._playlists = {}
        self._channels = None
        self._favorite_channels = None
        self._use_primary = True

        # vars to manage session cache
        self.last_renew = None
        self.update_interval = 30

        # hook function to call whenever the playlist updates
        self.update_handler = update_handler
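
FALLBACK_UA is referenced above but not defined in the snippet; a plausible placeholder definition (the value is an assumption):

FALLBACK_UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36"
)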
Example #28
def test_user_agent():
    clear(settings.DB)
    assert not utils.exist(settings.DB)

    ua = UserAgent(cache=False)

    assert ua.ie is not None
    assert ua.msie is not None
    assert ua.internetexplorer is not None
    assert ua.internet_explorer is not None
    assert ua['internet explorer'] is not None
    assert ua.google is not None
    assert ua.chrome is not None
    assert ua.googlechrome is not None
    assert ua.google_chrome is not None
    assert ua['google chrome'] is not None
    assert ua.firefox is not None
    assert ua.ff is not None
    assert ua.ie is not None
    assert ua.safari is not None
    assert ua.random is not None
    assert ua['random'] is not None

    try:
        ua.non_existing
    except FakeUserAgentError:
        pass
    else:
        assert False

    try:
        assert ua['non_existing']
    except FakeUserAgentError:
        pass
    else:
        assert False

    data1 = ua.data

    ua.update(settings.DB)

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
    del ua

    ua = UserAgent()

    assert utils.exist(settings.DB)

    data1 = ua.data

    clear(settings.DB)

    ua.update(settings.DB)

    assert utils.exist(settings.DB)

    data2 = ua.data

    assert data1 == data2
    assert data1 is not data2

    clear(settings.DB)
Example #29
    def pull_items_search(self):
        """Pulls items down from amazon for the given pages."""
        print("Retrieving Items...")

        connection = sqlite3.connect(self._database_name)
        cursor = connection.cursor()
        ua = UserAgent()
        ua.update()

        for page in self._pages:
            headers = {'User-Agent': '{}'.format(ua.random)}
            r = requests.get(page, headers=headers)
            asoup = BeautifulSoup(r.text, 'lxml')

            if asoup.head.title.string == "Robot Check":
                print(
                    "You've been discovered as a bot. Take a break and come back tomorrow."
                )
                print(headers['User-Agent'])
                return 'bot'

            # s-result-item is the tag that contains all the data on an item.
            items_list = asoup.find_all('li', class_="s-result-item")

            if len(items_list) > 0:
                first_page_num = ceil(int(self._range[0]) / len(items_list))
                # Getting one extra page because the item count can change between here and when the items are found.
                last_page_num = ceil(int(self._range[1]) / len(items_list)) + 1
            else:
                print("No items found.")
                with open("./output/no_items.html", 'w') as f:
                    f.write(asoup.prettify())
                    print("Item less html written to ./output/no_items.html")
                break

            # Getting the category
            category_chain = asoup.find('h2',
                                        id="s-result-count").span.contents
            categorystr = ''
            for category in category_chain:
                if category.string is not None:
                    categorystr += category.string

            # If there are new lines in the category this was a search and the last piece is pulled from the search box.
            if '\n' in categorystr:
                categorystr = categorystr.replace('\n', '') + asoup.find(
                    'input', id="twotabsearchtextbox")["value"]

            categorystr = categorystr.replace(':', '-')
            print(categorystr)

            # Fast forwarding to the first page in the range
            next_page = "https://www.amazon.com" + asoup.find(
                'span', class_="pagnLink").contents[0]['href']
            first_valid_page = next_page.replace(
                "page=2", "page={}".format(first_page_num))
            r = requests.get(first_valid_page)

            # Going through all the pages and getting their items.
            for page_num in range(first_page_num, last_page_num + 1):
                asoup = BeautifulSoup(r.text, 'lxml')

                # s-result-item is the tag that contains all the data on an item.
                items = asoup.find_all('li', class_="s-result-item")

                # Scraping the item information and adding it to the database.
                for item in items:
                    linkstr = item.find(
                        'a', class_="a-link-normal a-text-normal")['href']
                    namestr = item.find('h2').string

                    reviewscorestr = ''
                    if item.find(
                            'i', class_=re.compile(
                                "a-icon a-icon-star a-star-.")) is not None:
                        reviewscorestr = item.find(
                            'i',
                            class_=re.compile(
                                "a-icon a-icon-star a-star-.")).string

                    reviewersstr = '0'
                    psbl_num_reviewers_tag = item.find(
                        'a', class_="a-size-small a-link-normal a-text-normal")
                    if psbl_num_reviewers_tag and "#customerReviews" in psbl_num_reviewers_tag[
                            'href']:
                        reviewersstr = item.find(
                            'a',
                            class_="a-size-small a-link-normal a-text-normal"
                        ).string

                    pricestr = '0.00'
                    # The price is broken up into a whole section and fractional section.
                    if item.find(
                            'span',
                            class_="sx-price-whole") is not None and item.find(
                                'sup',
                                class_="sx-price-fractional") is not None:
                        whole = item.find('span',
                                          class_="sx-price-whole").string
                        fract = item.find('sup',
                                          class_="sx-price-fractional").string
                        pricestr = "{}.{}".format(whole, fract)
                    # The price is just its own full string value.
                    elif pricestr == '0.00' and item.find(
                            'span', class_="a-size-small s-padding-right-micro"
                    ) is not None:
                        pricestr = item.find(
                            'span',
                            class_="a-size-small s-padding-right-micro").string

                    asinstr = item['data-asin']
                    # The ranks on amazon are zero based
                    ranknum = int(item['id'][len("result_"):]) + 1

                    sql_command = """INSERT  OR IGNORE INTO items (item_number, category, name, reviewscore, price, link, rank, asin, reviewers)
                    VALUES (NULL, ?, ?, ?, ?, ?, ?, ?, ?);"""
                    cursor.execute(
                        sql_command,
                        (categorystr, namestr, reviewscorestr, pricestr,
                         linkstr, ranknum, asinstr, reviewersstr))

                connection.commit()

                try:
                    next_page = "https://www.amazon.com" + asoup.find(
                        'a', id="pagnNextLink")['href']
                except TypeError:
                    if asoup.head.title.string != "Robot Check":
                        print(
                            "You've been discovered as a bot. Take a break and come back tomorrow."
                        )
                        print(headers['User-Agent'])
                        return 'bot'
                    elif asoup.head.title.string != "503 Service Unavailable Error":
                        print(
                            "503 Service Unavailable Error from Amazon. Try again."
                        )
                    else:
                        print(
                            "Error: No more pages, range higher than number of items."
                        )
                        print(next_page)
                        with open(
                                "./output/failed_{}.html".format(
                                    categorystr.replace(" ", "")), 'w') as f:
                            f.write(asoup.prettify())
                            print(
                                "Failed html written to ./output/failed_{}.html"
                                .format(categorystr.replace(" ", "")))
                    break

                if page_num != last_page_num:
                    time.sleep(5)
                    r = requests.get(next_page)

            if page is not self._pages[-1]:
                time.sleep(45)
Example #30
class MeituanSpider():
    def __init__(self):
        self.count = 0
        self.ip_lst = []
        self.user_agent = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
        ]
        self.ua = UserAgent(verify_ssl=False)  # generate random user agents via the library
        self.ua.update()
        # self.user_agent = self.ua.get_useragent_list()

    def get_ip(self, savefile):
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/83.0.4103.116 Safari/537.36'
        }
        for num in range(1, 60):
            url = 'https://www.kuaidaili.com/free/inha/%d/' % num
            res = requests.get(url, headers=headers, timeout=5)
            html = etree.HTML(res.text)
            ips = html.xpath('//*[@id="list"]/table//tr/td[1]/text()')
            ports = html.xpath('//*[@id="list"]/table//tr/td[2]/text()')
            [
                self.ip_lst.append('%s:%s' % (ip, port))
                for ip, port in zip(ips, ports)
            ]
            time.sleep(2)
            print('Fetched proxy IPs successfully, count: %d' % len(self.ip_lst))

        with open(savefile, 'w') as sf:
            json.dump(self.ip_lst, sf, ensure_ascii=False)

    # Fetch the page and convert the JSON object into a Python dict
    def _get_url(self, url, city):
        simulateBrowserData = {
            'Accept': '*/*',
            'Accept-Encoding': 'br, gzip, deflate',
            'Accept-Language': 'zh-cn',
            'Connection': 'keep-alive',
            'Host': city + '.meituan.com',
            'Referer': 'https://' + city + '.meituan.com/meishi/',
            # 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
            'User-Agent': self.ua.random,
            # 'User-Agent': random.choice(self.user_agent)
        }

        proxy = {'http': random.choice(self.ip_lst)}
        response = requests.get(url,
                                headers=simulateBrowserData,
                                proxies=proxy,
                                allow_redirects=False)

        soup = BeautifulSoup(response.text)
        # for script in soup.find_all('script'):
        #    print(script.contents)
        all_scripts = soup.find_all('script')
        text = None
        for number, script in enumerate(all_scripts):
            if 'window._appState' in script.text:
                text = script.text
                # print(number, text)
        if text is None:
            print("The server refused the request")
            # print(simulateBrowserData['User-Agent'])
            time.sleep(1 + 2 * random.random())
            self.count += 1
            if self.count > 6:
                print('sleep 600s...')
                time.sleep(600)  # after several consecutive failures, rest 10 minutes before retrying
                self.count = 0
            return self._get_url(url, city)  # the user agent was rejected, retry with another random one
        else:
            self.count = 0
            data = json.loads(text.split("=", 1)[1][:-1])
            print(simulateBrowserData['User-Agent'])
            return data

    # Fetch shop comment data
    def _get_comment(self, ):
        '''https://www.meituan.com/meishi/api/poi/getMerchantComment?uuid=763c7334407e4fb1aef8.1622721180.1.0.0&platform=1&partner=126&originUrl=https%3A%2F%2Fwww.meituan.com%2Fmeishi%2F913072238%2F&riskLevel=1&optimusCode=10&id=913072238&userId=147136762&offset=0&pageSize=10&sortType=1'''

    # Fetch shop info across multiple areas and categories of a city
    def get_city_shops_infos(self, city, categoryID, sf):
        '''Meituan shows at most 67 pages of shops, so crawl by each small sub-category to list as many shops as possible'''

        if not self.ip_lst:
            with open('ips.json', 'r') as f:
                self.ip_lst = json.load(f)
        success_shops = set()
        failed_shops = set()
        # city = 'bj'
        url = 'https://' + city + '.meituan.com/meishi/' + categoryID + '/'

        data = self._get_url(url, city)
        n = 0

        filters = data.get('filters', {})

        cates = filters.get('cates', [])  # categories
        areas = filters.get('areas', [])  # areas
        dinners = filters.get('dinnerCountsAttr', [])  # dining party sizes
        sorts = filters.get('sortTypesAttr', [])  # shop sort types

        # only 'areas' are considered here
        for _area in areas[:]:
            '''Level 1: major area'''
            area_id = _area['id']  # int
            area_name = _area['name']
            print('Major area:', area_name)

            area_url = _area['url']
            sub_areas = _area.get('subAreas', [])
            if not sub_areas: continue

            for _sub_area in sub_areas[1:]:  # the first element is the major area's "All" entry
                '''Level 2: sub-area'''
                sub_area_id = _sub_area['id']  # int
                sub_area_name = _sub_area['name']
                print('Sub-area:', sub_area_name)

                sub_area_url = _sub_area['url'].replace('http:', 'https:')
                print(sub_area_url)
                a_data = self._get_url(sub_area_url, city)

                shopCounts = a_data.get('poiLists').get(
                    'totalCounts')  # number of shops in the smallest area
                pageNum = math.ceil(shopCounts / 15)  # Meituan shows 15 shops per page

                for pn in range(1, pageNum + 1):
                    print('page:', pn)
                    try:
                        for eachShopInfo in a_data.get('poiLists').get(
                                'poiInfos'):
                            shopId = eachShopInfo.get('poiId')  # shop id
                            try:
                                if shopId in success_shops: continue

                                n += 1
                                # if n < 1215:  # todo: skip-ahead after an interruption
                                #     print('===== skipping =====')
                                #     continue
                                print(n)

                                shopName = eachShopInfo.get('title')  # shop name
                                print(shopName)
                                avgScore = eachShopInfo.get('avgScore')  # rating
                                allCommentNum = eachShopInfo.get('allCommentNum')  # number of comments
                                address = eachShopInfo.get('address')  # address
                                avgPrice = eachShopInfo.get('avgPrice')  # average price per person

                                # shop detail page
                                shop_url = 'https://www.meituan.com/meishi/%s/' % shopId
                                print(shop_url)
                                shop_data = self._get_url(shop_url, 'nc')
                                recommended = shop_data.get('recommended', [])  # recommended dishes
                                shopClass = [_d['title'] for _d in shop_data.get('crumbNav', [])]  # shop category hierarchy
                                phone = shop_data.get('detailInfo', {}).get('phone', '')  # phone number
                                address_detail = shop_data.get('detailInfo', {}).get('address', '')  # detailed address
                                openTime = shop_data.get('detailInfo', {}).get('openTime', '')  # opening hours
                                dealList = shop_data.get('dealList', {}).get('deals', [])  # deal/package list

                                success_shops.add(shopId)

                                out_data = [
                                    city, categoryID, area_name, sub_area_name,
                                    shopId, shopName, avgScore, allCommentNum,
                                    avgPrice, address_detail,
                                    '/'.join(shopClass), phone, openTime,
                                    shop_url,
                                    json.dumps(recommended,
                                               ensure_ascii=False),
                                    json.dumps(dealList, ensure_ascii=False)
                                ]

                                out_data = [
                                    str(_str).replace('\t', '|')
                                    for _str in out_data
                                ]
                                out_str = '\t'.join(out_data)
                                sf.write(out_str +
                                         '\n') if sf else print(out_str)

                                time.sleep(20 + 5 * random.random())
                            except Exception as e:
                                print("error:%s" % e)
                                print("shopId:%s" % shopId)
                                time.sleep(1 + 6 * random.random())
                    except Exception as e:
                        print("error:%s" % e)
                        print("page:%s" % pn)
                        time.sleep(1 + 6 * random.random())

                    pn += 1
                    next_page_url = sub_area_url + 'pn%s/' % pn
                    if pn > pageNum: break
                    a_data = self._get_url(next_page_url, city)
Example #31
def RequestsAndcheck(URL, Xpath, proxies, dict_rate2):
    dict_rate = GetRate()
    URL = covertURL(URL)
    htmls = {}
    regular_result = None
    for j in range(10):  # try to get the price over a regular connection
        try:
            regular_result = getPrice(URL, Xpath)
            time.sleep(2.6)
            if regular_result:
                print(regular_result)
                break
        except:
            pass
    # proxy
    UserAgents = [
        'Mozilla/5.0 (Linux; Android 5.0; Nexus 6 Build/LRX21D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 5.1.1; KFAUWI Build/LVY48F) AppleWebKit/537.36 (KHTML, like Gecko) Silk/68.2.6 like Chrome/68.0.3440.85 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 6P Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 7.0; SM-T715 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Safari/537.36',
        'Mozilla/5.0 (Linux; Android 8.0.0; Pixel Build/OPR3.170623.007) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 8.0.0; SM-G935F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 9; Pixel 2 XL Build/PPR1.180610.009) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36',
        'Mozilla/5.0 (Android 9; Tablet; rv:61.0) Gecko/61.0 Firefox/61.0',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13C75 Safari/601.1',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/546.10 (KHTML, like Gecko) Version/6.0 Mobile/7E18WD Safari/8536.25',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25',
        'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
        'Mozilla/5.0 (iPad; CPU OS 9_0 like Mac OS X) AppleWebKit/601.1.17 (KHTML, like Gecko) Version/8.0 Mobile/13A175 Safari/600.1.4',
        'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
        'Mozilla/5.0 (iPad; CPU OS 7_0_2 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/11A501',
        'Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25',
        'Mozilla/5.0 (iPad; CPU OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0;  Trident/5.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.64',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0'
    ]
    for i in range(0, len(proxies)):
        # Get a proxy from the pool
        proxy = proxies[i]
        print("Request #%d" % i)
        try:
            if (len(UserAgents) > 0):
                browser = UserAgents[random.randint(0, len(UserAgents) - 1)]
            else:
                try:
                    ua = UserAgent(cache=False, use_cache_server=False)
                    ua.update()
                    UserAgents = ([
                        str(ua.safari),
                        str(ua.ff),
                        str(ua.firefox),
                        str(ua.google),
                        str(ua.chrome),
                        str(ua.opera),
                        str(ua["Internet Explorer"]),
                        str(ua["google chrome"]),
                        str(ua.msie),
                        str(ua.ie),
                        str(ua.chrome)
                    ])
                    browser = ua.random
                except:
                    UserAgents = [
                        'Mozilla/5.0 (Linux; Android 5.0; Nexus 6 Build/LRX21D) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 5.1.1; KFAUWI Build/LVY48F) AppleWebKit/537.36 (KHTML, like Gecko) Silk/68.2.6 like Chrome/68.0.3440.85 Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 6.0; Nexus 6P Build/MDB08L) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 Mobile Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 7.0; SM-T715 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 8.0.0; Pixel Build/OPR3.170623.007) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.98 Mobile Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 8.0.0; SM-G935F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36',
                        'Mozilla/5.0 (Linux; Android 9; Pixel 2 XL Build/PPR1.180610.009) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.91 Mobile Safari/537.36',
                        'Mozilla/5.0 (Android 9; Tablet; rv:61.0) Gecko/61.0 Firefox/61.0',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Version/10.0 Mobile/14D27 Safari/602.1',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_2 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13C75 Safari/601.1',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 8_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12D508 Safari/600.1.4',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/546.10 (KHTML, like Gecko) Version/6.0 Mobile/7E18WD Safari/8536.25',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25',
                        'Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3',
                        'Mozilla/5.0 (iPad; CPU OS 9_0 like Mac OS X) AppleWebKit/601.1.17 (KHTML, like Gecko) Version/8.0 Mobile/13A175 Safari/600.1.4',
                        'Mozilla/5.0 (iPad; CPU OS 8_4_1 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H321 Safari/600.1.4',
                        'Mozilla/5.0 (iPad; CPU OS 7_0_2 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Mobile/11A501',
                        'Mozilla/5.0 (iPad; CPU OS 6_1_3 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10B329 Safari/8536.25',
                        'Mozilla/5.0 (iPad; CPU OS 5_1_1 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9B206 Safari/7534.48.3',
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
                        'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
                        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0; MDDCJS)',
                        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0;  Trident/5.0)',
                        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)',
                        'Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)',
                        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:53.0) Gecko/20100101 Firefox/53.0',
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36 OPR/52.0.2871.64',
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15',
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:58.0) Gecko/20100101 Firefox/58.0'
                    ]
                    browser = UserAgents[random.randint(
                        0,
                        len(UserAgents) - 1)]
            price_list = getPrice(URL, Xpath, proxy, browser)
            if not price_list:
                pass
            else:
                print(price_list)
                htmls[(proxy, browser)] = price_list
                UserAgents.remove(browser)
        except Exception:
            print(proxy + " : " + browser + " Skipping. Connection error")
    htmls = BastPrice(htmls, dict_rate, dict_rate2)
    if not regular_result and not htmls:
        return ('unsupported', False)
    if not regular_result:
        for Proxy_UA in htmls.keys():
            return (Proxy_UA, False)
    if not htmls:
        return ('only regular find', False)
    htmls["regular"] = regular_result
    htmls = BastPrice(htmls, dict_rate, dict_rate2)
    if not htmls:
        return ('Error', False)
    else:
        for Proxy_UA in htmls.keys():
            if Proxy_UA == "regular":
                return (Proxy_UA, True)
            return (Proxy_UA, False)
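The hard-coded UserAgents list above effectively serves as a static fallback pool of browser strings. A minimal sketch of combining that pool with fake_useragent, preferring a live random user agent and falling back to the static list when the remote browser data cannot be fetched (pick_browser is an illustrative name, not part of the original code):

import random

from fake_useragent import FakeUserAgentError, UserAgent


def pick_browser(fallback_pool):
    """Prefer a live fake_useragent string; fall back to the static pool."""
    try:
        return UserAgent().random
    except FakeUserAgentError:
        return random.choice(fallback_pool)

# browser = pick_browser(UserAgents)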
Beispiel #32
0
            metas = prepare_tweet(tw)
            metas.pop('_id')
            tw.update(metas)
            for po in ['user', 'entities', 'extended_entities']:
                if po in tw:
                    tw.pop(po)
            db.tweets.update({'_id': tw['id']}, {"$set": tw}, upsert=True)
        print "...collected %s new tweets" % len(tweets)
        tweets = api.call('statuses.user_timeline', api_args)
    db.users.update({'_id': user['twitter']}, {"$set": {"done": True}})


# TODO: refacto all of this with gazouilloire/run.py

ua = UserAgent()
ua.update()
todo = list(tweetscoll.find({"links_to_resolve": True}, projection={"links": 1, "proper_links": 1, "retweet_id": 1}, limit=600, sort=[("_id", 1)]))
left = tweetscoll.count({"links_to_resolve": True})
print "\n\n- STARTING LINKS RESOLVING: %s waiting\n\n" % left
while todo:
    done = 0
    urlstoclear = list(set([l for t in todo if not t.get("proper_links", []) for l in t.get('links', [])]))
    alreadydone = {l["_id"]: l["real"] for l in linkscoll.find({"_id": {"$in": urlstoclear}})}
    tweetsdone = []
    batchidsdone = set()
    for tweet in todo:
        if tweet.get("proper_links", []):
            tweetsdone.append(tweet["_id"])
            continue
        tweetid = tweet.get('retweet_id') or tweet['_id']
        if tweetid in batchidsdone:
Beispiel #33
0
    def get_random_header(self):
        '''Return a freshly updated UserAgent instance for drawing random user agents (headers).'''
        ua = UserAgent()
        ua.update()
        return ua
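Note that get_random_header returns the UserAgent object itself rather than a ready-made header dict, so the caller still has to draw a concrete string (for example via ua.random) before building request headers. A minimal, self-contained sketch of that step, assuming requests is used for the actual call and using a placeholder URL:

import requests
from fake_useragent import UserAgent

ua = UserAgent()
ua.update()                          # refresh the cached browser data
headers = {'User-Agent': ua.random}  # one concrete user agent string
resp = requests.get('https://example.com', headers=headers)
print(resp.status_code)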
Beispiel #34
0
node_list.sort()

headers = requests.utils.default_headers()
node_cat_map = {}
ua = UserAgent()
url = "https://snoopsnoo.com/r/"

index = sum(1 for line in open("node_cat_map.txt"))

f = open("node_cat_map.txt", "a+")
agent_counter = 0
headers.update({'User-Agent': ua.firefox})

for it in range(index, len(node_list)):
    if agent_counter >= 10:
        ua.update()
        headers.update({'User-Agent': ua.firefox})
        agent_counter = 0

    try:
        req = requests.get(url + node_list[it], headers=headers)
        agent_counter += 1  # count this request toward the user-agent rotation above
        soup = BeautifulSoup(req.content, 'html.parser')
        cat = soup.body.select(".breadcrumb")[0].find_all("a")[1].get_text(
            strip=True)
        f.write(node_list[it] + ":" + cat + "\r")
        print(node_list[it] + ": " + cat +
              " has been written to the file successfully [" + str(it) + "/" +
              str(len(node_list)) + "]")
    except requests.exceptions.RequestException as e:
        print(e)
        f.close()
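The loop above refreshes the Firefox user agent every ten requests by calling ua.update() and re-reading ua.firefox. That rotation logic can be isolated into a small helper so the scraping loop stays readable; the sketch below is illustrative (RotatingFirefoxUA is not a name from the original code):

from fake_useragent import UserAgent


class RotatingFirefoxUA:
    """Yield a Firefox user agent string, refreshing the pool every `limit` calls."""

    def __init__(self, limit=10):
        self.ua = UserAgent()
        self.limit = limit
        self.calls = 0

    def get(self):
        if self.calls >= self.limit:
            self.ua.update()      # re-download the browser statistics
            self.calls = 0
        self.calls += 1
        return self.ua.firefox    # a random Firefox user agent string

# usage, e.g.: headers.update({'User-Agent': RotatingFirefoxUA().get()})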
Beispiel #35
0
def test_fake_update():
    ua = UserAgent(cache=False, use_cache_server=False)

    ua.update()

    _probe(ua)
Beispiel #36
0
import http.client
import json
import urllib.parse

from fake_useragent import UserAgent


class Bot:
    def __init__(self, pair):
        self.region = pair[1]
        self.category = pair[0]
        self.key = 'af0deccbgcgidddjgnvljitntccdduijhdinfgjgfjir'
        self.conn = http.client.HTTPSConnection("m.avito.ru", timeout=10)
        self.ua = UserAgent()

    def get_id_location(self):
        payload = ''
        headers = {
            'Cookie':
            'u=2kfmrcai.1cy6s0w.wt2thhj49000; buyer_selected_search_radius4=0_general; '
            'buyer_local_priority_v2=0; buyer_selected_search_radius0=200; buyer_location_id=638920; '
            'sx=H4sIAAAAAAACAw3JwQqAIAwA0H%2FZucPMZcu%2FkZBFCyQsR0j%2FXu%2F6Osx8P0eSLV'
            '%2B1MpmZFissbBA7NIiQ93CmNoVxMWVSWpEI%2F6ciJkVhgAzRBWT06L173w9VOrM1VAAAAA%3D%3D; '
            'sessid=8bc4e5dea8b325ce05e21935d12b0464.1608066636; _mlocation=638920; v=1608070541; '
            'dfp_group=87',
            'user-agent':
            self.ua.random  # draw a random user agent string
        }
        self.ua.update()
        self.conn.request(
            "GET", "/api/1/slocations?key=" + self.key + "&q=" +
            urllib.parse.quote_plus(self.region), payload, headers)
        res = self.conn.getresponse()

        data = res.read().decode("utf-8")
        id = json.loads(data)
        return id["result"]["locations"][0]['id']

    def get_count(self, id_location, time, page=1, count=0):
        payload = ''
        headers = {
            'Cookie':
            'u=2kfmrcai.1cy6s0w.wt2thhj49000; buyer_selected_search_radius4=0_general; '
            'buyer_local_priority_v2=0; buyer_selected_search_radius0=200; buyer_location_id=638920; '
            'sx=H4sIAAAAAAACAw3JwQqAIAwA0H%2FZucPMZcu%2FkZBFCyQsR0j%2FXu%2F6Osx8P0eSLV'
            '%2B1MpmZFissbBA7NIiQ93CmNoVxMWVSWpEI%2F6ciJkVhgAzRBWT06L173w9VOrM1VAAAAA%3D%3D; '
            'sessid=8bc4e5dea8b325ce05e21935d12b0464.1608066636; _mlocation=638920; v=1608070541; '
            'dfp_group=87',
            'user-agent':
            self.ua.random  # draw a random user agent string
        }
        url = ("/api/10/items?key=" + self.key + "&locationId=" +
               str(id_location) + '&page=' + str(page) + "&query=" +
               urllib.parse.quote_plus(self.category) + '&sort=date')
        self.conn.request("GET", url, payload, headers)
        res = self.conn.getresponse()
        data = res.read().decode("utf-8")
        items = json.loads(data)['result']["items"]
        if len(items) > 0:
            positiv = 0
            for i in range(len(items)):
                item = items[i]['value']
                if item['time'] >= time:
                    positiv += 1
            if positiv != 0:
                return self.get_count(id_location,
                                      time=time,
                                      page=page + 1,
                                      count=count + positiv)
            else:
                self.conn.close()
                return count
        else:
            self.conn.close()
            return count
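A plausible way to drive the Bot class above. The (category, region) values are placeholders, and the assumption that the API's item['time'] field is a Unix timestamp is not confirmed by the original code:

import time

# Bot.__init__ expects pair = (category, region)
bot = Bot(("bicycle", "Moscow"))

location_id = bot.get_id_location()
one_hour_ago = int(time.time()) - 3600  # assumed to match the format of item['time']
fresh_count = bot.get_count(location_id, time=one_hour_ago)
print(fresh_count)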
Beispiel #37
0
def test_fake_update():
    ua = UserAgent(cache=False, use_cache_server=False)

    ua.update()

    _probe(ua)