Example 1
def mejeej(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
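    # parse only the "photo-wrap" div (SoupStrainer keeps the tree small);
    # the first <a> inside it carries the image link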
    soupfilter = SoupStrainer("div", {"class": "photo-wrap"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.a['href']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    return (res_url)
Example 2
def stilettocouture(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "PhotoWrapper"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.a.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    return (res_url)
Example 3
def jjperfectlegs(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("section", {"class": "post"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 4
def closetheels(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "container"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 5
def addicttosex(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "photo-wrapper-inner"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 6
def therubik(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "post-content"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        if debug_app: logr('error')
        res_url = 'error'
    soup.decompose()
    return (res_url)
Example 7
def sexyonheels(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soup = BeautifulSoup(req.text, "lxml")
    tags = soup.find_all('div')
    # pull the src attribute out of the raw HTML of the sixth <div> with a regex
    pattern = re.compile('src="?\'?([^"\'>]*)')
    try:
        res_url = re.findall(pattern, str(tags[5]))[0]
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    # free the parse tree only after the tags have been read (decompose() destroys them)
    soup.decompose()
    return (res_url)
Example 8
def nicelegsandperspectives(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("ul", {"id": "posts"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("section", class_="top")
    try:
        res_url = tags[0].img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 9
def haawheels(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "autopagerize_page_element"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("img")
    try:
        res_url = tags[0]['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 10
def heelhunter(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("article")
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("a")
    try:
        res_url = tags[0].img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 11
def naughtylegs(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "posts"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    try:
        res_url = soup.img['src']
        # bypass images that are avatars, e.g. http://apollo:3080/crawler/crawler.2/peeptoeheels.tumblr.com/avatar_f570d9426951_16.png
        if debug_app: logr('img link ' + res_url)
    except:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 12
def get_url_archives(url_master):
    global site_errors
    url_array = []
    parsed = urlparse(url_master)
    url_parsed = parsed.scheme + '://' + parsed.netloc
    soupfilter = SoupStrainer('nav', {'class': 'months'})
    req = requests.get(url_master,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soup = BeautifulSoup(req.text, 'lxml', parse_only=soupfilter)
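    # rebuild absolute URLs by prepending scheme://host to each month link
    # found in the archive nav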
    url_array = [
        url_parsed + url_path['href'] for url_path in soup.find_all('a')
    ]
    if len(url_array) == 0:
        site_errors.append(url_master)
        logr('Site %s not working properly' % url_master)
    return (url_array)
Example 13
def get_url_archives_old(url_master):
    global site_errors
    br = mechanize.Browser()
    br.set_handle_robots(False)
    url_array = []
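    # legacy mechanize-based variant: walk the page links and keep those whose
    # anchor text is a month name (English, French or Turkish)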
    try:
        br.open(url_master)
        months = ['January','February','March','April','May','June','July','August','September','October','November','December', \
            'Janvier', 'Fevrier','mars', 'Avril','Mai','Juin','Juillet','Aout','Septembre','Octobre','Novembre','Decembre', \
            'Ocak', 'Subat', 'Mart', 'Nisan', 'Mayis', 'Harizan', 'Temmuz', 'Agustos', 'Eylul', 'Ekim', 'Kasim', 'Aralik']
        for item in br.links():
            if item.text in months:
                parsed = urlparse(item.base_url)
                url_parsed = parsed.scheme + '://' + parsed.netloc
                url = url_parsed + item.url
                url_array.append(url)
    except:
        site_errors.append(url_master)
        logr('Site %s not working properly' % url_master)
    return (url_array)
Example 14
def heelsland(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    # typical tumblr post - easy to grab pictures
    soupfilter = SoupStrainer("div", {"class": "post load"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    posts = [
        post['href'] for post in soup.find_all('a')
        if re.search('/image/', post['href'])
    ]
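    # follow each /image/ permalink and pull the full-size picture from its page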
    if len(posts) > 0:
        for post_url in posts:
            req = requests.get(post_url,
                               headers=my_headers,
                               timeout=my_timeout,
                               proxies=my_proxies)
            soup = BeautifulSoup(req.text, "lxml")
            img_tags = soup.find_all("div", {"id": "content-wrapper"})
            img_array = [img_url.img['data-src'] for img_url in img_tags]
            if debug_app:
                for item in img_array:
                    logr('img link ' + item)
            return (img_array)

    soup = BeautifulSoup(req.text, "lxml")
    # check if it's a photoset post
    if len(soup.find_all("div", {"class": "html_photoset"})):
        # photoset pages embed the image URLs as JSON inside a <script> tag;
        # parse the JS snippet and extract the URLs from it
        img_tags = soup.find_all('script')
        jscript_tag = str(
            img_tags[7]).split('\n')[-1].split('\t')[-1].strip('</script>')
        json_data = json.loads(jscript_tag)
        if debug_app:
            for item in json_data['image']['@list']:
                logr('img link ' + item)
        return (json_data['image']['@list'])
        # img_array = [img_url for img_url in json_data['image']['@list'] if re.match('^http://.+',img_url)]
        # return(img_array)

    # if the other strategies failed, fall back to the first image inside the "photo" div
    img_tag = soup.find("div", {"class": "photo"})
    if debug_app: logr('img link ' + img_tag.img['src'])
    return (img_tag.img['src'])
Example 15
def evil(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soup = BeautifulSoup(req.text, "lxml")
    tags = soup.find_all('meta')
    # theme-specific heuristic: pages from this blog expose exactly 63 <meta>
    # tags, and the 40th one carries the image URL in its content attribute
    if len(tags) == 63:
        res_url = tags[39]['content']
        if re.match(img_pattern, res_url):
            if debug_app: logr('img link ' + res_url)
        else:
            res_url = 'error'
            if debug_app: logr('error')
    else:
        res_url = 'error'
        if debug_app: logr('error')
    soup.decompose()
    return (res_url)
Example 16
def heelsfromhell(url):
    req = requests.get(url,
                       headers=my_headers,
                       timeout=my_timeout,
                       proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"id": "content"})
    soup = BeautifulSoup(req.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all('a')
    nxt_soup = BeautifulSoup(str(tags[0]), "lxml")
    try:
        res_url = nxt_soup.img['src']
        if debug_app: logr('img link ' + res_url)
    except:
        tags = soup.find_all('p')
        nxt_soup = BeautifulSoup(str(tags[0]), "lxml")
        try:
            res_url = nxt_soup.img['src']
            if debug_app: logr('img link ' + res_url)
        except:
            if debug_app: logr('error')
            res_url = 'error'
    soup.decompose()
    return (res_url)
Example 17
    parser = argparse.ArgumentParser(prog='tumblrbot.py',
                                     description='Tumblr Image Downloader')
    parser.add_argument('-x',
                        '--proxy',
                        nargs='?',
                        const='dynamic',
                        default='none')
    parser.add_argument('-d',
                        '--debug_app',
                        action="store_true",
                        help=("prints debug messages"))
    parser.add_argument('-u',
                        '--url',
                        nargs='?',
                        const='dynamic',
                        default='none')
    parser.add_argument('-f',
                        '--file',
                        nargs='?',
                        const='dynamic',
                        default='/root/xscripts/tumblrbot/urls')
    parser.add_argument("-r",
                        "--random",
                        action="store_true",
                        help=("randomizes  playlist"))
    args = parser.parse_args()
    if os.path.isfile(args.file) or re.match(r'^http://.+\.', args.url):
        bot(args.file, args.proxy, args.random, args.url, args.debug_app)
    else:
        logr('not a valid download_list file or url')
Example 18
def bot(url_list, opt_proxy, random_mode, target_url, debug):
    script_start = timer()
    global complete_path
    global tumblrblog
    # global image_link
    global dbconn_tumblr
    global dbconn_smp
    global debug_app
    global my_proxies
    global my_headers
    global my_timeout
    global site_errors
    global image_counter
    global tot_images
    global page_errors
    global tot_errors
    global imgskiped
    global parse_only
    global img_pattern  # used by evil() to validate image URLs
    debug_app = debug
    if opt_proxy == 'none':
        my_proxies = {}
    else:
        my_proxies = {"http": opt_proxy}
    start_path = '/mnt/vol1/crawler/crawler.4/'
    credentials_path = os.path.join(work_folder, 'api_settings', 'config')
    user_agent_string = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:43.0) Gecko/20100101 Firefox/43.0'
    my_headers = {
        'User-Agent': user_agent_string,
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': "http://www.tumblr.com"
    }

    my_timeout = 10
    image_counter = 0
    tot_images = 0
    tot_errors = 0
    imgskiped = 0
    max_pages_per_blog = 99
    max_images_per_page = 999999
    sleep_time = 5
    site_errors = []
    # Database Connection Setup
    db_server = 'prometheus'
    (db_user, db_pass) = load_config(credentials_path)['databases'][db_server]
    dbconn_tumblr = MySQLdb.connect(host=db_server,
                                    user=db_user,
                                    passwd=db_pass,
                                    db='tumblr')
    dbconn_smp = MySQLdb.connect(host=db_server,
                                 user=db_user,
                                 passwd=db_pass,
                                 db='samplesdb')
    dbconn_tumblr.autocommit(True)
    dbconn_smp.autocommit(True)
    ###########################
    img_pattern = re.compile(r'.+\.(jpg|png|gif)$')
    today = datetime.datetime.now()
    if target_url != 'none':
        sites_list = [target_url]
    else:
        sites_list = open(url_list).read()
        sites_list = [
            line for line in sites_list.split('\n')
            if re.match('^http://.+', line)
        ]
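    # main crawl loop: one pass per blog, walking its monthly archive pages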
    for tumblrblog in sites_list:
        page_counter = 1
        image_counter = 1
        complete_path = start_path + str(urlparse(tumblrblog).hostname)
        ensure_dir(complete_path)
        lista_paginas = get_url_archives(tumblrblog)
        if len(lista_paginas) == 0:
            continue
        for pagina in lista_paginas:  #url archive pages by month
            parsed_obj = urlparse(pagina)
            url_path = parsed_obj.path
            url_path_split = url_path.split('/')
            url_year = url_path_split[2]
            url_month = url_path_split[3]
            # always re-crawl the current month; older months are skipped once
            # they have already been recorded in the database
            if (str(today.month) != url_month) or (str(today.year) != url_year):
                rows_pages = check_db_page(pagina)
                if len(rows_pages) > 0:
                    logr('page match')
                    continue
            logr('Looking at Page %s ...' % pagina)
            page_errors = 0
            logr('Getting list of urls')
            list_image_links = find_image_urls(pagina)
            for this_img_link in list_image_links:
                if isinstance(this_img_link, list):
                    for this_instance in this_img_link:
                        download_images(this_instance, tumblrblog)
                    page_counter += len(this_img_link)
                else:
                    download_images(this_img_link, tumblrblog)
                    page_counter += 1
                if image_counter > max_images_per_page:
                    break
            if page_errors == 0:
                rec_page(pagina)
            time.sleep(sleep_time)
            if page_counter > max_pages_per_blog:
                page_counter -= 1
                logr('Reached max number of %s blog pages for %s' %
                     (page_counter, tumblrblog))
                break
    dbconn_tumblr.close()
    dbconn_smp.close()
    elapsed_time = timer() - script_start
    # logr("I've done my job!")
    bot_summary('tumblrbot', tot_images, tot_errors, imgskiped, elapsed_time,
                url_list)
Example 19
def download_images(url, tumblrblog):
    global image_counter
    global tot_images
    global page_errors
    global tot_errors
    global imgskiped
    filename = os.path.basename(url)
    sys.stdout.write('Checking ' + url)
    rows_image_urls = check_db_imageurl(url)
    if len(rows_image_urls) > 0:
        logr('url match')
        return
    try:
        raw_content = requests.get(url,
                                   headers=my_headers,
                                   timeout=my_timeout,
                                   proxies=my_proxies)
        # raw_content = urllib.urlopen(url).read()
    except:
        sys.stdout.write(' error while downloading \n')
        page_errors += 1
        tot_errors += 1
        return
    # sys.stdout.write('ok')
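    # hash the downloaded image so duplicates already stored in the db can be skipped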
    try:
        raw_conversion = StringIO(raw_content.content)
        resource_image = Image.open(raw_conversion)
        imghash = hashmem(resource_image)
        rows_images = check_imgdup(imghash)
    except:
        sys.stdout.write(' hash error, image skipped\n')
        page_errors += 1
        return

    if len(rows_images) > 0:
        imgskiped += 1
        logr('image hash match')
        # sys.stdout.write( 'image hash match\n' )
        return

    filename = os.path.basename(url)
    save_as = os.path.join(complete_path, filename)

    try:
        image_handle = open(save_as, "wb")  # binary mode: the payload is raw image bytes
        image_handle.write(raw_content.content)
        image_handle.close()
        sys.stdout.write(' saved')
    except:
        sys.stdout.write('\nerror while saving ' + filename + '\n')
        page_errors += 1
        tot_errors += 1
        return
    dbfd_uri = save_as.replace('/mnt/vol1/', '')
    dbfd_pstar = dbfd_uri.split('/')[2]
    img_height, img_width = get_img_size(save_as)
    is_portrait = is_img_portrait(img_height, img_width)
    sample_timestamp = str(os.path.getmtime(save_as)).split('.')[0]
    rec_img_indb(dbconn_smp, dbfd_uri, dbfd_pstar, img_height, img_width,
                 is_portrait, sample_timestamp)
    rec_link(url, urlparse(tumblrblog).netloc, imghash)
    sys.stdout.write(' inserted into db\n')
    image_counter += 1
    tot_images += 1
    return
Example 20
def find_image_urls(page_url):
    array_urls = []
    htmlsource = requests.get(page_url,
                              headers=my_headers,
                              timeout=my_timeout,
                              proxies=my_proxies)
    soupfilter = SoupStrainer("div", {"class": "l-content"})
    soup = BeautifulSoup(htmlsource.text, "lxml", parse_only=soupfilter)
    tags = soup.find_all("a")
    all_links_for_page = [link['href'] for link in tags]
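    # dispatch each post link to its blog-specific extractor based on the hostname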
    for every_link in all_links_for_page:
        if debug_app: logr('post link ' + every_link)
        url_parsed = urlparse(every_link)
        if url_parsed.netloc == 'sweet57334.tumblr.com':
            img_link = get_sweet(every_link)
        elif url_parsed.netloc == 'highheelsandshizzle.tumblr.com':
            img_link = hizzle(every_link)
        elif url_parsed.netloc == 'heelsfromhell.tumblr.com':
            img_link = heelsfromhell(every_link)
        elif url_parsed.netloc == 'therubik.tumblr.com':
            img_link = therubik(every_link)
        elif url_parsed.netloc == 'shoelvr67.tumblr.com':
            img_link = shoelvr67(every_link)
        elif url_parsed.netloc == 'hot-on-heels.com':
            img_link = hotonheels(every_link)
        elif url_parsed.netloc == 'legsandheels.tumblr.com':
            img_link = lgshls(every_link)
        elif url_parsed.netloc == 'jjperfectlegs.tumblr.com' or url_parsed.netloc == 'classysexypixs.tumblr.com':
            img_link = jjperfectlegs(every_link)
        elif url_parsed.netloc == 'naughtylegs.tumblr.com' or url_parsed.netloc == 'peeptoeheels.tumblr.com':
            img_link = naughtylegs(every_link)
        elif url_parsed.netloc == 'closetheels.tumblr.com':
            img_link = closetheels(every_link)
        elif url_parsed.netloc == 'high-heels-forever.tumblr.com':
            img_link = highheelsforever(every_link)
        elif url_parsed.netloc == 'tuneman86.tumblr.com' or url_parsed.netloc == 'sluttybimbogirl.tumblr.com' \
        or url_parsed.netloc == 'artandsexy.tumblr.com' :
            img_link = tuneman86(every_link)
        elif url_parsed.netloc == 'bestcelebritylegs.tumblr.com':
            img_link = bestcelebritylegs(every_link)
        elif url_parsed.netloc == 'e-v-i-l-f-u-c-k-e-r.tumblr.com':
            img_link = evil(every_link)
        elif url_parsed.netloc == 'sexy-on-heels.tumblr.com':
            img_link = sexyonheels(every_link)
        elif url_parsed.netloc == 'addicttosex.tumblr.com':
            img_link = addicttosex(every_link)
        elif url_parsed.netloc == 'icelegsandperspectives.tumblr.com':
            img_link = nicelegsandperspectives(every_link)
        elif url_parsed.netloc == 'haawheels.tumblr.com':
            img_link = haawheels(every_link)
        elif url_parsed.netloc == 'heelhunter.tumblr.com':
            img_link = heelhunter(every_link)
        elif url_parsed.netloc == 'mejeej.tumblr.com':
            img_link = mejeej(every_link)
        elif url_parsed.netloc == 'stilettocouture.tumblr.com':
            img_link = stilettocouture(every_link)
        elif url_parsed.netloc == 'goodmission.tumblr.com' or url_parsed.netloc == 'tejano78.tumblr.com':
            img_link = goodmission(every_link)
        elif url_parsed.netloc == 'www.heels-land.com':
            img_link = heelsland(every_link)
        else:
            logr(every_link)
            img_link = 'error'
        # first check whether the crawler returned a list of image urls
        if isinstance(img_link, list):
            print 'array : %s' % str(img_link)
            if img_link[0] != 'error': array_urls.append(img_link)
        else:
            if not re.match('^avatar_.+',
                            img_link.split('/')[-1]) and img_link != 'error':
                array_urls.append(img_link)
    return array_urls