Beispiel #1
0
def test_get_url_base():
    """Known area names map to their base URL; unknown ones raise ValueError."""
    from craigslist._search import get_url_base

    # (area name, expected base URL) pairs, checked in order.
    known_areas = (
        ('washingtondc', 'https://washingtondc.craigslist.org'),
        ('aberdeen', 'https://aberdeen.craigslist.co.uk'),
    )
    for area_name, expected_url in known_areas:
        assert get_url_base(area_name) == expected_url

    # A name that is not a known area must be rejected.
    with pytest.raises(ValueError):
        get_url_base('asdadf')
Beispiel #2
0
def get_arguments(section):
    """Scrape a section's search form and describe it as ``Argument`` specs.

    Downloads ``get_url_base(LOCATION) + '/' + section`` and walks the
    ``div.search-options`` markup in three passes:

    1. ``ul.list`` groups: one ``nargs='*'`` argument per list, named after
       the list's first ``input``.
    2. Stand-alone ``input`` tags: a ``store_const`` argument for
       checkboxes, a plain argument for everything else.
    3. ``select`` tags: one argument per select, with a default when exactly
       one option label contains ``"all"``.

    For names listed in ``IGNORE_CHOICES`` the argument is created with
    ``choices=None``.

    Args:
        section: Path segment appended to the location's base URL.

    Returns:
        A list of ``Argument`` objects, in the order of the passes above.
    """
    url = get_url_base(LOCATION) + '/' + section
    response = requests.get(url)
    doc = lxml.html.fromstring(response.content)
    arguments = []

    # Pass 1: groups of inputs rendered as <ul class="list">.
    for ultag in doc.cssselect('div.search-options div.searchgroup ul.list'):
        inputs = ultag.cssselect('li input')
        name = inputs[0].get('name')
        if name not in IGNORE_CHOICES:
            # The human-readable label is the text following each <input>.
            choices = tuple((slugify(tag.tail.strip()), tag.get('value'))
                            for tag in inputs
                            if tag.get('value'))
        else:
            # `choices` must be bound on this branch too; otherwise the
            # value from a previous list (or no value at all) would be
            # handed to Argument below.
            choices = None
        arguments.append(Argument(dest=name, nargs='*', choices=choices))

    # Pass 2: inputs that are not part of a ul.list group.
    for tag in doc.cssselect(
            'div.search-options div.searchgroup > input, '
            'div.search-options div.searchgroup > label > input, '
            'div.search-options div.searchgroup > ul:not(.js-only):not(.list) > li > label > input',
    ):
        name = tag.get('name')
        if not name:
            continue
        if tag.get('type') == "checkbox":
            # A checkbox contributes its fixed value when the flag is given.
            argument = Argument(
                dest=name, action='store_const', const=tag.get('value'))
        else:
            argument = Argument(dest=name)
        arguments.append(argument)

    # Pass 3: <select> drop-downs.
    for selecttag in doc.cssselect(
            'div.search-options div.searchgroup select'):
        name = selecttag.get('name')
        if not name:
            continue
        default = None
        if name not in IGNORE_CHOICES:
            choices = tuple((slugify(tag.text.strip()), tag.get('value'))
                            for tag in selecttag.cssselect('option')
                            if tag.get('value') != '')
            # Use the single option whose label contains "all" as the default.
            potential_defaults = [
                value for key, value in choices if "all" in key
            ]
            if len(potential_defaults) == 1:
                default = potential_defaults[0]
                # NOTE(review): this branch used to also compute nargs='?'
                # but never passed it to Argument; confirm whether selects
                # with a default should be declared with nargs='?'.
        else:
            choices = None
        arguments.append(
            Argument(dest=name, default=default, choices=choices))

    return arguments
Beispiel #3
0
def parse_post(post, craigslist_area_name):
    """Convert one search-result element into a ``RegularSearchPost``.

    Args:
        post: Result element; it is queried with ``.get(attribute)`` and
            ``.cssselect(selector)``.
        craigslist_area_name: Key into ``get_areas()``, used to look up the
            area's timezone and to build absolute URLs from relative links.

    Returns:
        A ``RegularSearchPost`` with ``id``, ``title``, ``url``,
        ``repost_id``, ``price``, ``bedrooms``, ``date`` and ``area``.
        ``repost_id``, ``price``, ``bedrooms`` and ``area`` are ``None``
        when the corresponding markup is absent.
    """
    areas = get_areas()
    area_timezone = areas[craigslist_area_name]['timezone']

    pid = int(post.get('data-pid'))
    repost_raw = post.get('data-repost-of')
    repost_pid = int(repost_raw) if repost_raw else None

    # Stamp the listing time with the area's timezone, then convert to UTC.
    date_orig = post.cssselect('time')[0].get('datetime')
    date = (arrow.get(date_orig)
            .replace(tzinfo=area_timezone)
            .to('utc')
            .isoformat())

    link = post.cssselect("p.result-info > a")[0]
    url_orig = link.get('href')
    if "http" in url_orig:
        url = url_orig
    else:
        # Relative link: prefix it with the area's base URL.
        url = get_url_base(craigslist_area_name) + url_orig
    url = http_to_https(url)
    title = link.text

    price_el = get_only_first_or_none(
        post.cssselect("span.result-meta > span.result-price"))
    price_raw = price_el.text if price_el is not None else None
    # Strip the currency sign and thousands separators ("$1,200" -> 1200);
    # int() would otherwise raise ValueError on the comma.
    price = (int(price_raw.replace("$", "").replace(",", ""))
             if price_raw else None)

    # The housing blurb is a "-\n"-separated list such as "2br" / "700ft".
    housing_el = get_only_first_or_none(
        post.cssselect("p.result-info > span > span.housing"))
    if housing_el is not None:
        housing = [part.strip() for part in housing_el.text.split("-\n")
                   if part.strip()]
    else:
        housing = []

    bedrooms_raw = get_only_first_or_none([x for x in housing if "br" in x])
    num_bedrooms = int(bedrooms_raw.replace("br", "")) if bedrooms_raw else None

    area_raw = get_only_first_or_none([x for x in housing if "ft" in x])
    area = int(area_raw.replace("ft", "")) if area_raw else None

    return RegularSearchPost(
        id=pid,
        title=title,
        url=url,
        repost_id=repost_pid,
        price=price,
        bedrooms=num_bedrooms,
        date=date,
        area=area,
    )