Example #1
import pandas

# hu is assumed to be the project's HTML helper module (get_html plus link parsers).
def get_links():
    # Collect brand pages from the catalogue index, then each brand's model pages.
    brands = hu.get_brands_link(
        hu.get_html('http://www.profit-msk.ru/goods/zip/index.html'))
    for brand in brands:
        models = hu.get_models_link(hu.get_html(brand))

        # Append each brand's model rows to the shared 'models' CSV.
        df = pandas.DataFrame(models)
        df.to_csv('models', index=False, mode='a', header=False, sep=";")
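Example #2 below reads this file back and indexes model[0] and model[1], so each row presumably holds a (model name, model URL) pair. A minimal read-back sketch under that assumption:

import pandas

# Hypothetical read-back of the 'models' file written above; two columns
# (model name, model URL) are assumed, matching the indexing in Example #2.
rows = pandas.read_csv('models', sep=';', header=None).values.tolist()
for name, url in rows:
    print(name, url)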
Example #2
import os
import re

import pandas

def parser_site():
    # fu is assumed to be the project's file helper; it loads the rows
    # that get_links() saved to the 'models' CSV.
    data = fu.load_file('models')
    for model in data:
        if model:
            # get_proxy is expected to return a single proxy address and user agent.
            proxy = {'http': 'http://' + get_proxy.get_proxies_list()}
            useragent = {'User-Agent': get_proxy.get_useregent_list()}

            soup = hu.get_html(model[1], useragent, proxy)

            # hu.get_html signals a failed request with 404; skip that model.
            if soup == 404:
                continue

            brand_name, model_name, device_spec, device_data = hu.model_parser(
                soup, model[0])

            # Replace '/' so the model name is safe to use in a file name.
            model_name = re.sub('/', ' ', model_name)
            base_dir = os.path.join(os.path.dirname(__file__), 'parse', brand_name)
            # makedirs also creates the intermediate 'parse' directory if needed.
            os.makedirs(base_dir, exist_ok=True)

            # Write the parsed specifications and spare-part rows to per-model CSVs.
            df = pandas.DataFrame(device_spec)
            df.to_csv(os.path.join(base_dir, f'{model_name}_spec.csv'),
                      index=False,
                      header=False,
                      sep=";")
            df = pandas.DataFrame(device_data)
            df.to_csv(os.path.join(base_dir, f'{model_name}_parts.csv'),
                      index=False,
                      header=False,
                      sep=";")
Example #3
def get_topn_words_from_urls(urls, topn, save_reports=False):
    htmls = [html_utils.get_html(url) for url in urls]
    # Join the article text of every report into one corpus.
    summary_article = '\n'.join([parse_report_article(html) for html in htmls])
    if save_reports:
        with open('reports.txt', 'w', encoding='utf-8') as fout:
            fout.write(summary_article)
    return cut_text_utils.get_topn_words(summary_article, topn)
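A hedged usage sketch; the URLs are placeholders, and html_utils, cut_text_utils, and parse_report_article are assumed to be the project's own helpers:

# Hypothetical call: rank the top 10 words across two report pages
# and keep a local copy of the combined text in reports.txt.
urls = ['http://example.com/report2018', 'http://example.com/report2019']
print(get_topn_words_from_urls(urls, 10, save_reports=True))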
Example #4
def analyze_info(self, url):
    """
    Parse the listing data.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('.key-list .item-mod').items()
    for item in items:
        address = item.find('.address').text()
        # Locate the separator before collapsing whitespace, then normalize.
        index = address.find('\xa0', 2)
        address = ' '.join(address.split())
        # District
        city = ''
        if index >= 2:
            city = address[2:index]
        # Price
        price_desc = item.find('.price').text() or item.find(
            '.price-txt').text()
        house_info = {
            # City
            'city': city,
            # Name
            'name': item.find('.items-name').text(),
            # House type
            'house_type': ' '.join(item.find('.huxing').text().split()),
            # Address
            'address': address,
            # Address link
            'address_link': item.find('.address').attr('href'),
            # Tags
            'tags': item.find('.tag-panel').text(),
            # Price
            'price': price_desc,
            'price_nu': analysis_price(price_desc),
            # Rank
            'rank': item.find('.group-mark').text(),
            # Image
            'pic': item.find('.pic img').attr('src'),
            # Image link
            'pic_link': item.children('.pic').attr('href'),
            'report_date': self.report_date
        }
        # Append to the list
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB
    self.collection.insert(house_list)
    # Get the next page; if there is one, keep crawling it
    next_url = doc('.list-page .next-page').attr('href')
    if next_url:
        # Send the current URL as the Referer of the next request
        self.referer = url
        time.sleep(2)
        self.new_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
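This example and Example #6 both convert a price string to a number with analysis_price, whose implementation is not shown. A minimal sketch of what such a helper might look like, purely as an assumption:

import re

def analysis_price(price_desc):
    # Hypothetical helper: pull the first number out of a price string
    # such as '12000元/㎡' or '300万'; returns 0.0 when none is found.
    match = re.search(r'\d+(?:\.\d+)?', price_desc or '')
    return float(match.group()) if match else 0.0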
Example #5
def get_report_urls(summary_url):
    html = html_utils.get_html(summary_url)
    soup = BS(html, 'html.parser')
    reports_table = soup.select('#UCAP-CONTENT table tbody')[0]
    # Collect (year, url) pairs from every link in the table.
    reports = [(atag.text, atag['href'])
               for trtag in reports_table.select('tr')
               for tdtag in trtag.select('td')
               if len(tdtag.select('a')) != 0
               for atag in tdtag.select('a')]

    # Drop the 2017 entry and re-add it with the known-good URL.
    report_urls = [x for x in reports if x[0] != '2017']
    report_urls.append(('2017', REPORT2017_URL))
    # Sort by year in ascending order.
    report_urls = sorted(report_urls, key=lambda item: item[0])
    return report_urls
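Chained with Example #3, this gives a complete pipeline from the summary page to word counts; the summary URL below is a placeholder, and both functions are assumed to be importable together:

# Hypothetical pipeline: collect the per-year report URLs, then rank the
# top 20 words over all of them (get_topn_words_from_urls is Example #3).
report_urls = get_report_urls('http://example.com/reports-index')
top_words = get_topn_words_from_urls([url for _, url in report_urls], 20)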
Example #6
def analyze_info(self, url):
    """
    Parse the listing data.
    :param url: page URL
    """
    house_list = []
    doc = pyQuery(get_html(url, self.referer))
    items = doc('#houselist-mod-new .list-item').items()
    for item in items:
        detail = ' '.join(
            item.find('.details-item').text().split()).split(' ')
        if len(detail) < 3:
            continue
        all_price_desc = item.find('.price-det').text()
        unit_price_desc = item.find('.unit-price').text()
        house_info = {
            # District
            'city': detail[2].split('-')[0],
            # Name
            'name': detail[1],
            # House type (everything up to and including '造', the build year)
            'house_type': detail[0][0:detail[0].find('造') + 1],
            # Address
            'address': detail[2],
            # Tags
            'tags': item.find('.tags-bottom').text(),
            # Total price
            'all_price': all_price_desc,
            'all_price_nu': analysis_price(all_price_desc),
            # Unit price
            'unit-price': unit_price_desc,
            'unit-price_nu': analysis_price(unit_price_desc),
            # Image
            'pic': item.find('.item-img img').attr('src'),
            # Listing authenticity
            'authenticity': item.find('.house-title .house-icon').text(),
            'report_date': self.report_date
        }
        # Append to the list
        house_list.append(house_info)
    self.total += len(house_list)
    # Bulk-insert this page's data into MongoDB
    self.collection.insert(house_list)
    # Get the next page; if there is one, keep crawling it
    next_url = doc('.multi-page .aNxt').attr('href')
    if next_url:
        # Send the current URL as the Referer of the next request
        self.referer = url
        time.sleep(2)
        self.sale_log.logger.info('next => %s' % next_url)
        self.analyze_info(next_url)
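Examples #4 and #6 call get_html(url, self.referer), so the helper evidently accepts the previously visited page as a Referer. Its source is not shown here; a minimal sketch of such a helper built on requests, offered only as an assumption:

import requests

def get_html(url, referer=None):
    # Hypothetical stand-in for the project's get_html helper: fetch a page
    # and optionally send the previously visited URL as the Referer header.
    headers = {'User-Agent': 'Mozilla/5.0'}
    if referer:
        headers['Referer'] = referer
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    response.encoding = response.apparent_encoding
    return response.text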
Example #7
def get_topn_words(url, topn):
    # Fetch one report page, extract its article text, and rank the words.
    html = html_utils.get_html(url)
    article = parse_report_article(html)
    return cut_text_utils.get_topn_words(article, topn)
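A one-line usage sketch; the URL is a placeholder for a real report page:

# Hypothetical call: top 10 words of a single report page.
print(get_topn_words('http://example.com/report2019', 10))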