Example 1
def get_proxy_generator(filename):
    if not filename:
        # Scrape fresh IPs from the web and save them to file
        proxy.get_proxies()
        # Read the results back and yield them one by one
        with open('proxies.txt', 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()
    else:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                yield line.strip()
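A minimal consumption sketch, assuming proxies.txt holds one "host:port" entry per line:

gen = get_proxy_generator(None)  # a falsy filename triggers a fresh scrape first
first = next(gen, None)          # generators are lazy; the file is only read on demand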
Example 2
def scrape_endpoint(endpoint):

    timeout = 5
    counter = 0
    proxies = proxy.get_proxies()

    URL = settings.PRODUCTION_URL + "/" + endpoint
    CLEAR_URL = settings.PRODUCTION_URL + "/clear_address"
    print("Scraping the %s endpoint with ip rotation: %s " % (endpoint, URL))

    try:
        while True:
            print("Request", counter)

            curr_proxy = proxy.create_proxy_dict(random.choice(proxies))
            print("Proxy: ", curr_proxy)
            response = requests.get(URL, proxies=curr_proxy, timeout=timeout)
            status = json.loads(response.content).get("status", 200)

            if status == 429:
                raise ValueError("Rate Limited after %s requests" % counter)
            if status == 403:
                raise ValueError("Blacklisted after %s requests" % counter)
            counter += 1

    except requests.Timeout:
        print("Timed out after", counter, "requests")

    except Exception as e:
        import traceback
        traceback.print_exc()
        print(e)

    requests.get(CLEAR_URL)
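proxy.create_proxy_dict is defined elsewhere in this project; a minimal sketch of what it plausibly does, assuming each pool entry is a bare "host:port" string and the same proxy serves both schemes:

def create_proxy_dict(address):
    # hypothetical helper: turn "host:port" into the mapping requests expects
    return {'http': 'http://' + address, 'https': 'http://' + address}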
Example 3
 def __init__(self, url, proxy_enabled=0, thread_no=1):
     self.url = url
     self.categories = []
     self.proxies = {}
     self.thread_no = thread_no  # must be at least 1; never set this to 0
     self.db_lock = Lock()
     if proxy_enabled:
         self.proxies = proxy.get_proxies()
Example 4
 def get_soups_helper(self, item_list):
     proxies = get_proxies()
     for item in item_list:
         status = item.fetch_soup(proxies=proxies)
         if status is None:
             self.item_list.remove(item)
             continue
         item.extract_info()
         with self.db_lock:
             ItemDb.create(url=item.url, name=item.name, price=item.price)
Example 5
def write_new_proxies():
    # Write the extracted proxies to the file, one per line
    try:
        proxies = get_proxies()
        with open('proxy_list.txt', 'w') as f:
            for proxy in proxies:
                f.write(proxy + '\n')
        print("DONE")
    except Exception as e:
        print("Failed to write proxy list:", e)
Example 6
 def check(self):
     """
     Check whether the number of pooled proxy IPs has fallen below the threshold;
     if it has, fetch new proxies and add them.
     :return:
     """
     if self.size() <= THRESHOLD:
         # below the threshold: top the pool up
         results = get_proxies()
         for proxy in results:
             #proxy = result['ip'] + ':' + result['port']
             self.add(proxy)
     else:
         print("At least three proxies are still available")
Example 7
def spider(page):
    data = {
        "bt": "",
        "fydw": "",
        "pageNum": page,
    }
    # try up to 5 times; the else clause below runs only when every attempt failed
    for _ in range(5):
        try:
            response = requests.post(url, headers=headers, data=data, proxies=get_proxies())
            json_data = response.json()
        except (json.JSONDecodeError, adapters.SSLError):
            continue
        else:
            break
    else:
        return {}

    return json_data
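Since the for/else path returns {} once all five attempts fail, callers can treat an empty dict as a sentinel; a usage sketch:

data = spider(1)
if not data:
    print("page 1 unavailable after 5 attempts")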
Example 8
def community_infor(url):
    while True:
        try:
            html=requests.get('http://hf.anjuke.com/'+url,headers=headers,proxies=get_proxies(),timeout=10).text
            if '请输入图片中的验证码' in html:
                continue
            break
        except Exception as e:
            print('[community_infor]%s failed'%(url))
    soup=BeautifulSoup(html,'lxml').find('div',{'class':'comm-basic-mod'})
    item={}
    try:
        detail=soup.find('dl',{'class':'basic-parms-mod'})
        dts=detail.find_all('dt')
        values=detail.find_all('dd')
        clean=lambda s:s.replace('\r','').replace('\n','').replace(' ','').replace(':','').replace('\xa0','')
        for index in range(len(dts)):
            item[clean(dts[index].get_text())]=clean(values[index].get_text())
    except:
        pass
    while True:
        try:
            html=requests.get('http://hf.anjuke.com/ajax/communityext/?commid=%s&useflg=onlyForAjax'%url.split('/')[-2],headers=headers,proxies=get_proxies(),timeout=10).text
            break
        except Exception as e:
            print('[community_infor-json]%s failed'%(url))
    data=json.loads(html)['comm_propnum']
    try:
        item['saleNum']=data['saleNum']
    except:
        item['saleNum']='-'
    try:
        item['rentNum']=data['rentNum']
    except:
        item['rentNum']='-'
    keys=['saleNum','rentNum','所在版块','地址','总建面','总户数','建造年代','容积率','停车位','绿化率','出租率']
    line=''
    for key in keys:
        try:
            line+=str(item[key])+'|'
        except:
            line+='-|'
    return line
Example 9
def _get_page_via_proxy(url, retry=3, proxies=None, fpfirst=False):
    '''
    Get the page via given proxy server.
    '''
    start_time = time()
    for i in range(retry):
        if proxies:
            wait_b4_try(i, factor=3)
            x = proxies
        else:
            x = get_proxies()
        if x:
            # each proxy server is tried once only
            # if anything goes wrong, we'll get another one
            # so it'd better grasp the only chance it'll have
            p = _get_page(url, retry=1, proxies=x, fpfirst=fpfirst)
            if p:
                return p
        else:
            LOG.warning('No valid proxy to get page. Continuing.')
    LOG.warning(
        'All %d attempt(s) to get page via proxy failed in %s. Returning nothing.'
        % (retry, time() - start_time))
    return None
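wait_b4_try is not included in the excerpt; a plausible sketch of a linear backoff helper (the exact schedule is an assumption):

from time import sleep

def wait_b4_try(i, factor=1):
    # wait longer before each successive retry; i=0 means no wait on the first try
    sleep(i * factor)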
Example 10
def get_community():
    page=1
    while True:
        try:
            html=requests.get('http://hf.anjuke.com/community/p%s'%page,headers=headers,proxies=get_proxies(),timeout=10).text
            if '请输入图片中的验证码' in html:
                continue
        except:
            continue
        try:
            table=BeautifulSoup(html,'lxml').find('div',id='list-content').find_all('div',{'class':'li-itemmod'})
        except:
            break
        if not table:
            break
        with open('urls.txt','a') as f:
            for item in table:
                try:
                    url=item.find('a').get('href')
                    name=item.find('a').get('title')
                except AttributeError:
                    continue
                try:
                    price=item.find('div',{'class':'li-side'}).find('strong').get_text()
                except AttributeError:
                    price='-'
                f.write(name+'|'+price+'|'+url+'\n')
        print(page)
        page+=1
        if page==51:
            break
Example 11
        return self.db.lrem(REDIS_KEY, 0, p)

    def delete_all(self):
        """
        Delete every value stored under the key in one call
        :return:
        """
        return self.db.delete(REDIS_KEY)

    def check_proxy(self, ip, port):
        """
        Check whether the proxy is still reachable
        :param ip:
        :param port:
        :return:
        """
        try:
            Telnet().open(ip, port, timeout=3)
            return True
        except Exception:
            return False


if __name__ == "__main__":
    db = REDISCLIENT()
    dict_ = get_proxies()
    for proxy in dict_:
        #proxy = result['ip'] + ":" + result['port']
        db.add(proxy)
    db.check()
Example 12
NAME = 0
token = open('token', 'r').read().strip()

bot = Bot(token=token)
updater = Updater(token=token, use_context=True)
dispatcher = updater.dispatcher
j = updater.job_queue

logging.basicConfig(
    filename="log",
    level=logging.ERROR,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

server = Server()
proxies = get_proxies()


def callback_alarm(context: CallbackContext):
    """ This is called every specified minutes to
    notify users if any item in their watchlist changes
    """
    logging.log(logging.ERROR, "Notifying Users")
    for user_id, user in server.users.items():
        updated_items = user.check_prices()
        if updated_items != "":
            context.bot.send_message(chat_id=user_id,
                                     text=updated_items,
                                     parse_mode=ParseMode.MARKDOWN)
        else:
            # DEBUG