Beispiel #1
0
def parse_post(post):
    """Build a ``JSONSearchPost`` from one raw posting mapping.

    ``post`` is read by key: ``PostingID``, ``PostingTitle``, ``PostingURL``,
    ``CategoryID``, ``Longitude``, ``Latitude``, ``PostedDate`` and ``Ask``
    are looked up with ``[]`` (so a missing key raises ``KeyError`` on a
    plain dict), while ``ImageThumb`` and ``Bedrooms`` are optional and
    come back as ``None`` when absent.
    """
    return JSONSearchPost(
        id=int(post['PostingID']),
        title=post['PostingTitle'],
        url=cdn_url_to_http(post['PostingURL']),
        category_id=post['CategoryID'],
        thumbnail=post.get('ImageThumb'),
        longitude=post['Longitude'],
        latitude=post['Latitude'],
        # PostedDate is converted to float and treated as a POSIX timestamp;
        # the result is a UTC-aware ISO 8601 string.
        date=datetime.fromtimestamp(
            float(post['PostedDate']), timezone.utc
        ).isoformat(),
        price=post['Ask'],
        bedrooms=post.get('Bedrooms'),
    )
Beispiel #2
0
def parse_cluster_url_output(body):
    """Split a cluster-URL JSON response body into posts and clusters.

    ``body`` is decoded with ``json.loads`` and unpacked into
    ``(items, meta)``. Items with a truthy ``GeoCluster`` entry are handed
    to ``parse_cluster`` together with the base URL taken from
    ``meta['baseurl']``; every other item is handed to ``parse_post``.

    Returns:
        A ``(posts, clusters)`` tuple of two lists.

    Raises:
        CraigslistException: if decoding or unpacking ``body`` raises
            ``ValueError``, or if looking up the base URL raises
            ``KeyError``. The original exception is attached as the cause.
    """
    try:
        items, meta = json.loads(body)
    except ValueError as e:  # pragma: no cover
        raise CraigslistException("could not find items and meta "
                                  "in json response body: '{}'".format(body)
                                  ) from e
    try:
        baseurl = cdn_url_to_http(meta['baseurl'])
    except KeyError as e:  # pragma: no cover
        raise CraigslistException("could not find baseurl in meta: '{}' "
                                  "with body:'{}'. probably empty response. "
                                  "is your query too specific?".format(
                                      meta, body)) from e
    posts = [parse_post(x) for x in items if not x.get('GeoCluster')]

    clusters = [
        parse_cluster(x, baseurl) for x in items if x.get('GeoCluster')
    ]
    return posts, clusters
Beispiel #3
0
def get_tlds_mapping(areas_mapping):
    """
    Build a dict mapping each country to a tld (top level domain).

    Fetches the craigslist "sites" page, walks every link selected by
    ``div.box a`` and splits its URL into subdomain (the area) and suffix
    (the tld). The area is looked up in ``areas_mapping`` to find its
    ``'country'``; areas that cannot be resolved are reported on stderr
    and skipped. When several links resolve to the same country, the
    last one processed wins.
    """
    # Hostnames that should be looked up under a different area name.
    hostname_aliases = {'fortlauderdale': 'miami'}

    sites_page = requests.get("http://www.craigslist.org/about/sites")
    tree = lxml.html.fromstring(sites_page.content)

    country_to_tld = {}
    for anchor in tree.cssselect("div.box a"):
        parts = tldextract.extract(cdn_url_to_http(anchor.get('href')))
        subdomain = parts.subdomain
        suffix = parts.suffix
        area = hostname_aliases.get(subdomain, subdomain)
        try:
            country = areas_mapping[area]['country']
        except KeyError:
            print("could not found {} in areas_mapping".format(area),
                  file=sys.stderr)
        else:
            country_to_tld[country] = suffix
    return country_to_tld