Esempio n. 1
0
def scrape_ad(url, city):
    """Download one ad page and extract its fields into an ``AdInfo``.

    Args:
        url: Address of the ad page; fetched with ``requests.get``.
        city: Passed to ``get_lat_long`` together with the cleaned
            location text.

    Returns:
        An ``AdInfo`` built from (phone number, ad body, location,
        latitude as str, longitude as str, post id, ``datetime.now()``,
        photo URLs).  When the page has no ``postingBody`` div, a tuple of
        seven empty strings is returned instead.
        NOTE(review): the fallback tuple has seven items while ``AdInfo``
        receives eight arguments -- confirm callers handle both shapes.
    """
    r = requests.get(url)
    html = lxml.html.fromstring(clean_string(r.text))

    bodies = html.xpath("//div[@class='postingBody']")
    if not bodies:
        # Without an ad body there is little to no useful information on
        # the page, so only empty strings are returned.
        return '', '', '', '', '', '', ''
    ad_body = bodies[0].text_content().replace("\r", "")

    extra_info = html.xpath("//div[@style='padding-left:2em;']")

    # Pre-seed the location fields: if no "Location:" element exists the
    # lookup below fails before ``location`` is assigned, and the AdInfo
    # construction would otherwise hit an unbound local.
    location, latitude, longitude = '', '', ''
    try:
        location = [
            elem.text_content() for elem in extra_info
            if "Location:" in elem.text_content()
        ][0]
        location = clean_location_string(location)
        latitude, longitude = get_lat_long(location, city)
    except Exception:
        # Best effort: any failure while locating/geocoding leaves the
        # coordinates empty.  ``Exception`` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        latitude, longitude = '', ''

    try:
        post_id = [
            elem.text_content() for elem in extra_info
            if "Post ID:" in elem.text_content()
        ][0]
        post_id = strip_post_id(post_id)
    except IndexError:
        # No "Post ID:" element on the page.
        post_id = ''

    try:
        photo_urls = html.xpath("//ul[@id='viewAdPhotoLayout']//img/@src")
    except Exception:
        # Fallback value kept as an empty string for existing callers.
        photo_urls = ''

    phone_number = phone_number_parse(ad_body)
    ad_info = AdInfo(phone_number, ad_body, location, str(latitude),
                     str(longitude), post_id, datetime.now(), photo_urls)
    return ad_info
Esempio n. 2
0
def scrape_ad(url, city):
    """Fetch an ad page and collect its details in an ``AdInfo`` object.

    Args:
        url: Page address, requested via ``requests.get``.
        city: Forwarded to ``get_lat_long`` along with the cleaned
            location string.

    Returns:
        ``AdInfo(phone_number, ad_body, location, str(latitude),
        str(longitude), post_id, datetime.now(), photo_urls)``.
        If the page contains no ``postingBody`` div, a tuple of seven
        empty strings is returned instead.
        NOTE(review): the two return shapes differ (7-tuple vs. an AdInfo
        built from 8 values) -- verify callers cope with both.
    """
    r = requests.get(url)
    html = lxml.html.fromstring(clean_string(r.text))

    posting_bodies = html.xpath("//div[@class='postingBody']")
    if not posting_bodies:
        # No ad body means there is little to no useful information here,
        # so only empty strings are returned.
        return '', '', '', '', '', '', ''
    ad_body = posting_bodies[0].text_content().replace("\r", "")

    extra_info = html.xpath("//div[@style='padding-left:2em;']")

    # Defaults first: when no "Location:" element is present the indexing
    # below raises before ``location`` is bound, which previously made the
    # AdInfo call fail with an unbound local variable.
    location = ''
    latitude, longitude = '', ''
    try:
        location = [
            elem.text_content()
            for elem in extra_info
            if "Location:" in elem.text_content()
        ][0]
        location = clean_location_string(location)
        latitude, longitude = get_lat_long(location, city)
    except Exception:
        # Geocoding is best effort; catch Exception rather than everything
        # so interpreter-exit signals are not swallowed.
        latitude, longitude = '', ''

    try:
        post_id = [
            elem.text_content()
            for elem in extra_info
            if "Post ID:" in elem.text_content()
        ][0]
        post_id = strip_post_id(post_id)
    except IndexError:
        # The page carries no "Post ID:" element.
        post_id = ''

    try:
        photo_urls = html.xpath("//ul[@id='viewAdPhotoLayout']//img/@src")
    except Exception:
        # Empty-string fallback retained for compatibility with callers.
        photo_urls = ''

    phone_number = phone_number_parse(ad_body)
    ad_info = AdInfo(
        phone_number,
        ad_body,
        location,
        str(latitude),
        str(longitude),
        post_id,
        datetime.now(),
        photo_urls,
    )
    return ad_info
Esempio n. 3
0
def test_third_phone_number_parse():
    """Number mixing digits and mixed-case spelled-out words, preceded by
    an unrelated number ("23") earlier in the sentence."""
    expected = "5167734071"
    ad_text = """
    Hi there I'm brandi I'm a 23 yr old and I'm super hot. 516SevensEVENThree40SeVen1 is my number.  Give me a call!
    """
    # Echo the parsed value so it shows up in the captured test output.
    print(text_parser.phone_number_parse(ad_text))
    parsed = text_parser.phone_number_parse(ad_text)
    assert expected == parsed
Esempio n. 4
0
def test_second_phone_number_parse():
    """Number written as digits interleaved with mixed-case number words."""
    expected = "5167734071"
    ad_text = """
    Hi there 516SevensEVENThree40SeVen1 is my number.  Give me a call!
    """
    parsed = text_parser.phone_number_parse(ad_text)
    assert expected == parsed
Esempio n. 5
0
def test_first_phone_number_parse():
    """Digits buried inside ordinary words plus spelled-out numbers that
    are spread over two lines of text."""
    expected = "5167734071"
    ad_text = """
    Hi th5e1r6e sevEN sEvEn I'm brandi
    thRee and I'4m071 looking for a good time :)
    """
    parsed = text_parser.phone_number_parse(ad_text)
    assert expected == parsed
Esempio n. 6
0
def test_third_phone_number_parse():
    """The ad text contains an unrelated number ("23") before the phone
    number, which is written with digits and mixed-case number words."""
    wanted = "5167734071"
    sample = """
    Hi there I'm brandi I'm a 23 yr old and I'm super hot. 516SevensEVENThree40SeVen1 is my number.  Give me a call!
    """
    # Print the parser output so it is visible in the captured test log.
    print(text_parser.phone_number_parse(sample))
    found = text_parser.phone_number_parse(sample)
    assert wanted == found
Esempio n. 7
0
def test_second_phone_number_parse():
    """Phone number given as a run of digits and mixed-case number words."""
    wanted = "5167734071"
    sample = """
    Hi there 516SevensEVENThree40SeVen1 is my number.  Give me a call!
    """
    found = text_parser.phone_number_parse(sample)
    assert wanted == found
Esempio n. 8
0
def test_first_phone_number_parse():
    """Phone number scattered through the text: digits embedded in words
    and number words in mixed case, across two lines."""
    wanted = "5167734071"
    sample = """
    Hi th5e1r6e sevEN sEvEn I'm brandi
    thRee and I'4m071 looking for a good time :)
    """
    found = text_parser.phone_number_parse(sample)
    assert wanted == found