Code Example #1
File: crawl.py Project: JH27/crawlers
def get_issues(url):
    r = requests.get(url)
    html = r.text
    if 'new_list2.jsp' in url:
        elems = get.webpage(html).xpath('//a/text()')
    elif 'new_list3.jsp' in url:
        elems = get.webpage(html).xpath('//td/@title')
    else:
        raise Exception('New DOM type.')
    return elems
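Every snippet on this page leans on a project-local get module that is never shown. A minimal sketch of what get.htmltree and get.webpage plausibly do, assuming urllib2 and lxml.html (the bodies below are a guess, not the projects' actual code):

# Hypothetical reconstruction of the project-local `get` helpers used
# throughout these examples; the real module may differ.
import urllib2
import lxml.html

def htmltree(url):
    # Fetch the raw HTML of a page (assumption: plain GET, default headers).
    return urllib2.urlopen(url).read()

def webpage(html):
    # Parse an HTML string into an lxml element tree, ready for .xpath().
    return lxml.html.fromstring(html)

Under this reading, get.webpage(get.htmltree(url)), as used in several examples below, fetches and parses a page in one expression.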
Code Example #2
File: ndocs.py Project: JH27/crawlers
def google(query):
    url = settings['base_url'] + urllib2.quote(query.encode('utf-8'))
    f = get.htmltree(url)
    p = get.webpage(f)
    x = get.text(p, settings['xpath'])[0]
    n = re.findall(r'[0-9]+', x)
    return int(''.join(n))
Code Example #3
File: ndocs.py Project: winnersky/crawlers
def google(query):
    url = settings['base_url'] + urllib2.quote(query.encode('utf-8'))
    f = get.htmltree(url)
    p = get.webpage(f)
    x = get.text(p, settings['xpath'])[0]
    n = re.findall(r'[0-9]+', x)
    return int(''.join(n))
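The last two lines of google turn a formatted result count into an integer by collecting every digit run and concatenating them, which also strips thousands separators. A worked illustration with a made-up result string:

import re

x = 'About 1,234,567 results'      # hypothetical scraped text
n = re.findall(r'[0-9]+', x)       # ['1', '234', '567']
print int(''.join(n))              # 1234567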
Code Example #4
File: crawl.py Project: e9t/encar
def get_car_ids(path=None):
    if path:
        with open(path, 'r') as f:
            return f.read().splitlines()
    else:
        f = get.htmltree(settings.LIST_URL[settings.BRAND])
        root = get.webpage(f)
        car_ids = root.xpath('//table[@class="car_list"]//tr/td[@class="inf"]/a[@class="newLink"]/@href')
        return [id.split('=')[1].split('&')[0] for id in car_ids]
Code Example #5
File: crawl.py Project: tobby2002/encar
def get_car_ids(path=None):
    if path:
        with open(path, 'r') as f:
            return f.read().splitlines()
    else:
        f = get.htmltree(settings.LIST_URL[settings.BRAND])
        root = get.webpage(f)
        car_ids = root.xpath(
            '//table[@class="car_list"]//tr/td[@class="inf"]/a[@class="newLink"]/@href'
        )
        return [id.split('=')[1].split('&')[0] for id in car_ids]
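The final comprehension in get_car_ids assumes every matched @href is a query-string URL and takes the value of its first parameter. With a hypothetical href (the real URL layout on the site may differ):

href = '/detail/view.do?carid=12345&listAdvType=normal'  # invented example
car_id = href.split('=')[1].split('&')[0]
print car_id   # '12345'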
Code Example #6
File: crawl.py Project: e9t/encar
def get_car_info(encar_id):
    info = dict(encar_id=encar_id)

    url = settings.car_baseurl % encar_id
    f = get.htmltree(url)
    root = get.webpage(f)

    # summary
    summary = root.xpath('//div[@class="section summary hproduct"]')[0]
    info['name'] = [i.strip()\
            for i in summary.xpath('.//h3[@class="car"]/span/text()')]
    info['transmission'] =\
            x(summary, './/div[@class="short"]//li[@class="trs"]/text()')
    info['fuel'] = x(summary, './/div[@class="short"]//li[@class="fue"]/i/text()')
    info['engine'] = x(summary, './/div[@class="short"]//li[@class="eng"]/text()')
    info['type'] = x(summary, './/div[@class="short"]//li[@class="typ"]/text()')
    try:
        info['tags'] = x(summary, './/div[@class="merit"]//span/text()')
    except IndexError:
        info['tags'] = []
    info['price'] = int(x(summary, './/div[@class="prc"]//strong/text()'))

    # detail
    car_detail = root.xpath('//div[@class="field detail"]')[0]
    info['car_id'] = x(car_detail, './/li[@class="cid"]/i/text()')
    info['birthday'] = x(car_detail, './/li[@class="yer"]/i/text()')
    info['mileage'] = x(car_detail, './/li[@class="dts"]/i/text()')
    info['color'] = x(car_detail, './/li[@class="clr"]/i/text()')

    # options
    car_options = root.xpath('//table[@class="option_table"]//span[@class="check"]')
    info['options'] = {x(option, './/a/text()'): x(option, './/sup/@class')\
                                                 for option in car_options}

    # seller
    seller = root.xpath('//div[@class="field seller"]//dd[not(contains(@class, "image") or contains(@class, "email"))]')
    info['seller'] = {x(s, './@class'): xte(s, './p/span/text()|./p/strong/text()|./p/strong/a/text()') for s in seller[:5]}

    # accidents
    accident_list = root.xpath('//ul[@class="acclist"]/li')
    info['accidents'] = {x(a, './b/text()'): x(a, './/strong/text()')
                                             for a in accident_list}

    # etc
    encar = {}
    encar['registration_date'] = x(root, '//div[@class="field etc"]//span[@class="date"]/text()').strip(' :').replace('/', '-')
    encar['page_hit'] = int(x(root, '//div[@class="field etc"]//span[@class="hit"]/text()').strip(': '))
    encar['page_favs'] = int(x(root, '//div[@class="field etc"]//span[@class="hot"]//i/text()'))
    info['encar'] = encar

    return info
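get_car_info relies on two more unshown helpers, x and xte. Judging from the except IndexError around the optional tags block, x evidently returns the first match of an XPath query; a minimal sketch under that assumption (the real helper may also strip or join text, and xte's exact behavior is not recoverable from this page):

# Hypothetical sketch of x(): first result of an XPath query. An empty
# result list makes [0] raise IndexError, which get_car_info catches for
# the optional "merit" tags section.
def x(elem, xpath):
    return elem.xpath(xpath)[0]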
Code Example #7
File: batch.py Project: JH27/crawlers
def get_sitting_urls(assembly_id, div_id, sessionurl):
    root = get.webpage(get.htmltree(sessionurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]

    params = match_name_codes(js_calls, filter='mainsearch2', type='sessions')
    nsittings = len(params)
    params['j'] = str(nsittings)

    urls = []
    for i in range(nsittings):
        params['SES_NUM'] = params['SES_NUM%s' % i]
        url = '%s&%s' % (sessionurl, urlencode(params))
        # TODO: generalize me
        url = url.replace('con_search2', 'con_search3')
        urls.append({'session_name': params['SES_NUM'], 'url': url})
    return urls
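The loop builds each sitting URL by appending the whole params dict as a query string. A small illustration of that joining step with invented values (note that urlencode serializes a dict in arbitrary key order):

from urllib import urlencode  # Python 2; urllib.parse.urlencode in Python 3

sessionurl = 'http://example.com/list.jsp?DAE_NUM=19'  # placeholder URL
params = {'SES_NUM': '337', 'j': '4'}
print '%s&%s' % (sessionurl, urlencode(params))
# -> http://example.com/list.jsp?DAE_NUM=19&SES_NUM=337&j=4 (order may vary)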
Code Example #8
File: crawl.py Project: JH27/crawlers
def parse_page(page_num, attrs):

    def save_pdf(data):
        filename = get_filename(data, 'pdf')
        urllib.urlretrieve(data['pdf'], filename)

    def save_json(data):
        filename = get_filename(data, 'json')
        with open(filename, 'w') as f:
            json.dump(data, f, indent=2)

    html = get_html(page_num)
    root = get.webpage(html)
    rows = root.xpath(\
            '//table[@background="../img/main_boxback2.gif"]//tr')[2:-1]
    for row in rows:
        data = parse_row(row, attrs)
        save_json(data)
        save_pdf(data)
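parse_page writes one JSON file and one PDF per table row through a get_filename helper that is not shown. A hypothetical stand-in, assuming the filename is derived from some identifying field of the parsed row (the field name and output directory are guesses):

import os

# Hypothetical get_filename(); the real helper presumably builds a path
# from a field of `data`, but neither the field nor the directory is shown.
def get_filename(data, ext):
    return os.path.join('data', '%s.%s' % (data['id'], ext))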
Code Example #9
File: batch.py Project: JH27/crawlers
def get_session_urls(assembly_id, div_id, listurl):
    def searchform(root, num=''):
        return root.xpath('.//form[@name="searchform%s"]/@action' % num)[0]

    root = get.webpage(get.htmltree(listurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]

    params = match_name_codes(js_calls, filter='mainsearch', type='committees')
    nsessions = len(params)/2
    params['i'] = str(nsessions)
    params['div'] = str(div_id)
    params['DAE_NUM'] = str(assembly_id)

    urls = []
    for i in range(nsessions):
        params['COMM_NAME'] = params['COMM_NAME%s' % i]
        params['COMM_CODE'] = params['COMM_CODE%s' % i]
        urls.append(\
            {'committee': params['COMM_NAME'],
             'url': '%s/content/%s?%s' %\
                    (BASEURL, searchform(root)[:-2], urlencode(params))})
    return urls
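The len(params)/2 line only makes sense if match_name_codes returns one name key and one code key per committee. An illustrative shape for that return value (keys and values invented):

params = {'COMM_NAME0': 'Legislation & Judiciary', 'COMM_CODE0': '09',
          'COMM_NAME1': 'National Policy',         'COMM_CODE1': '10'}
nsessions = len(params) / 2   # 2 committees (Python 2 integer division)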
Code Example #10
File: crawl.py Project: tobby2002/encar
def get_car_info(encar_id):
    info = dict(encar_id=encar_id)

    url = settings.car_baseurl % encar_id
    f = get.htmltree(url)
    root = get.webpage(f)

    # summary
    summary = root.xpath('//div[@class="section summary hproduct"]')[0]
    info['name'] = [i.strip()\
            for i in summary.xpath('.//h3[@class="car"]/span/text()')]
    info['transmission'] =\
            x(summary, './/div[@class="short"]//li[@class="trs"]/text()')
    info['fuel'] = x(summary,
                     './/div[@class="short"]//li[@class="fue"]/i/text()')
    info['engine'] = x(summary,
                       './/div[@class="short"]//li[@class="eng"]/text()')
    info['type'] = x(summary,
                     './/div[@class="short"]//li[@class="typ"]/text()')
    try:
        info['tags'] = x(summary, './/div[@class="merit"]//span/text()')
    except IndexError:
        info['tags'] = []
    info['price'] = int(x(summary, './/div[@class="prc"]//strong/text()'))

    # detail
    car_detail = root.xpath('//div[@class="field detail"]')[0]
    info['car_id'] = x(car_detail, './/li[@class="cid"]/i/text()')
    info['birthday'] = x(car_detail, './/li[@class="yer"]/i/text()')
    info['mileage'] = x(car_detail, './/li[@class="dts"]/i/text()')
    info['color'] = x(car_detail, './/li[@class="clr"]/i/text()')

    # options
    car_options = root.xpath(
        '//table[@class="option_table"]//span[@class="check"]')
    info['options'] = {x(option, './/a/text()'): x(option, './/sup/@class')\
                                                 for option in car_options}

    # seller
    seller = root.xpath(
        '//div[@class="field seller"]//dd[not(contains(@class, "image") or contains(@class, "email"))]'
    )
    info['seller'] = {
        x(s, './@class'):
        xte(s, './p/span/text()|./p/strong/text()|./p/strong/a/text()')
        for s in seller[:5]
    }

    # accidents
    accident_list = root.xpath('//ul[@class="acclist"]/li')
    info['accidents'] = {
        x(a, './b/text()'): x(a, './/strong/text()')
        for a in accident_list
    }

    # etc
    encar = {}
    encar['registration_date'] = x(
        root, '//div[@class="field etc"]//span[@class="date"]/text()').strip(
            ' :').replace('/', '-')
    encar['page_hit'] = int(
        x(root,
          '//div[@class="field etc"]//span[@class="hit"]/text()').strip(': '))
    encar['page_favs'] = int(
        x(root, '//div[@class="field etc"]//span[@class="hot"]//i/text()'))
    info['encar'] = encar

    return info
Code Example #11
File: batch.py Project: JH27/crawlers
def get_doc_ids(assembly_id, div_id, sittingurl):
    root = get.webpage(get.htmltree(sittingurl))
    js_calls = [parse_js_call(j) for j in root.xpath('.//a/@href')]
    return [{'sitting_name': c[1][0], 'docid': c[1][1]}
            for c in js_calls if c[0] == 'mainsearch4']
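The comprehension assumes parse_js_call yields (function_name, args) pairs extracted from javascript: hrefs, with c[0] the JS function name and c[1] its argument tuple. An illustrative run with invented values:

js_calls = [('mainsearch4', ('1st sitting', 'DOC0042')),   # invented
            ('gopage', ('2',))]
print [{'sitting_name': c[1][0], 'docid': c[1][1]}
       for c in js_calls if c[0] == 'mainsearch4']
# -> [{'sitting_name': '1st sitting', 'docid': 'DOC0042'}]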
Code Example #12
File: crawl.py Project: JH27/crawlers
def get_hidden_url(url):
    r = requests.get(url)
    html = r.text
    root = get.webpage(html)
    return '%s/%s' % (baseurl, root.xpath('//frame/@src')[1])
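get_hidden_url handles frameset pages: it fetches the outer page and joins the src of the second <frame> onto a module-level baseurl. A usage sketch with placeholder values (both URLs are assumptions):

baseurl = 'http://www.example.go.kr'   # placeholder; defined elsewhere in crawl.py
inner = get_hidden_url(baseurl + '/main/index.jsp')
# inner is the absolute URL of the frame that holds the actual content.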