Example 1
import os.path as osp
import shutil

from icrawler.builtin import GreedyImageCrawler


def test_greedy():
    # test_dir is a module-level variable in the original test suite
    img_dir = osp.join(test_dir, 'greedy')
    greedy_crawler = GreedyImageCrawler(
        parser_threads=2, storage={'root_dir': img_dir})
    greedy_crawler.crawl(
        'http://www.bbc.com/news', max_num=5, min_size=(100, 100))
    shutil.rmtree(img_dir)
Example 2
from icrawler.builtin import GreedyImageCrawler


def test_greedy():
    print('start testing GreedyImageCrawler')
    greedy_crawler = GreedyImageCrawler(parser_threads=4,
                                        storage={'root_dir': 'images/greedy'})
    greedy_crawler.crawl('http://www.bbc.com/news',
                         max_num=10,
                         min_size=(100, 100))
Example 3
if engine in ('Baidu', 'baidu'):
    # The snippet was truncated above this point; the opening line is
    # reconstructed from the crawl() call and the elif branch below.
    baidu_crawler = BaiduImageCrawler(downloader_cls=MyImageDownloader,
                                      feeder_threads=1,
                                      parser_threads=1,
                                      downloader_threads=4,
                                      storage={'root_dir': 'matches'})
    #        log_level=logging.INFO,
    #        extra_downloader_args={'log_file': 'meta.txt'})
    baidu_crawler.crawl(keyword=query,
                        offset=0,
                        max_num=num,
                        min_size=(200, 200),
                        max_size=None)

elif engine in ('Greedy', 'greedy'):
    greedy_crawler = GreedyImageCrawler(downloader_cls=MyImageDownloader,
                                        feeder_threads=1,
                                        parser_threads=1,
                                        downloader_threads=4,
                                        storage={'root_dir': 'matches'})
    greedy_crawler.crawl(domains=url,
                         max_num=num,
                         min_size=None,
                         max_size=None)

# For Flickr:
# from datetime import date
# from icrawler.builtin import FlickrImageCrawler

# flickr_crawler = FlickrImageCrawler('your_apikey',
#                                     storage={'root_dir': 'your_image_dir'})
# flickr_crawler.crawl(max_num=1000, tags='child,baby',
#                      group_id='68012010@N00', min_upload_date=date(2015, 5, 1))
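Both engine branches above pass downloader_cls=MyImageDownloader, a custom downloader defined elsewhere in the source project. A minimal sketch of what such a class could look like, following icrawler's standard subclassing pattern (the class body here is hypothetical):

from icrawler import ImageDownloader


class MyImageDownloader(ImageDownloader):
    """Hypothetical stand-in for the project's custom downloader."""

    def get_filename(self, task, default_ext):
        # Reuse the auto-numbered name, prefixed so batches are easy to spot
        filename = super().get_filename(task, default_ext)
        return 'match_' + filename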
Example 4
"""
BING CRAWLER WITH ICRAWLER PACKAGE.
"""

from icrawler.builtin import BingImageCrawler, GreedyImageCrawler

search_term = 'tsutsugamushi'
"""
bing_crawler = BingImageCrawler(downloader_threads=4,
                                storage={'root_dir': search_term + ' crawled images'})
bing_crawler.crawl(keyword=search_term, filters=None, offset=0, max_num=1000)
"""
search_url = 'https://bbc.com'
greedy_crawler = GreedyImageCrawler(
    storage={
        'root_dir': 'greedy_bing_eng_url ' + search_term + ' crawled images'
    })
greedy_crawler.crawl(domains=search_url,
                     max_num=1000,
                     min_size=None,
                     max_size=None)
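In the greedy call above, domains is a single URL string; icrawler's greedy crawler also accepts a list of start domains, so several sites can be covered in one run. A brief sketch (the second URL is purely illustrative, and list support should be checked against the installed icrawler version):

greedy_crawler.crawl(domains=['https://bbc.com', 'https://example.com'],
                     max_num=1000,
                     min_size=None,
                     max_size=None)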
Example 5
def image_crawler():
    table = dynamodb.Table('Images')
    target = request.form.get('target')
    num = request.form.get('num')
    num = int(num)
    radio = request.form.get('gridRadios')

    if radio == 'Greedy':
        url = str(target)
        greedy_crawler = GreedyImageCrawler(
            storage={'root_dir': 'downloaded_pictures'})
        greedy_crawler.crawl(domains=url,
                             max_num=num,
                             min_size=(200, 200),
                             max_size=None)
        # file_names was undefined in the original snippet; listing the
        # storage directory is one plausible way to recover it
        file_names = os.listdir('downloaded_pictures')
        print(file_names)
        for file_name in file_names:
            response = table.put_item(Item={
                'username': session['username'],
                'imagename': file_name,
            })

    if radio == 'Instagram':
        looter = InstaLooter(directory="/tmp/", profile=target)
        looter.download_pictures(media_count=num)
        counter = 0
        for media in looter.medias():
            print(media)
            if counter < num:
                if media['is_video']:
                    continue
                    # url = looter.get_post_info(media['code'])['video_url']
                else:
                    counter += 1
                    url = media['display_src']
                    s3 = boto3.client('s3')
                    fp = io.BytesIO(urlopen(url).read())
                    s3.upload_fileobj(fp, 'ece1779project',
                                      media['id'] + '.jpg')
                    response = table.put_item(
                        Item={
                            'username': session['username'],
                            'imagename': media['id'] + '.jpg',
                        })
            else:
                break

    if radio == 'Google':
        google_crawler = GoogleImageCrawler(
            parser_threads=2,
            downloader_threads=4,
            storage={'root_dir': 'downloaded_pictures'})
        google_crawler.crawl(keyword=target,
                             max_num=num,
                             date_min=None,
                             date_max=None,
                             min_size=(200, 200),
                             max_size=None)
        # As above, file_names was undefined; list the crawler's storage dir
        file_names = os.listdir('downloaded_pictures')
        for file_name in file_names:
            response = table.put_item(Item={
                'username': session['username'],
                'imagename': file_name,
            })

    return render_template("/imagecrawler/form.html")
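This handler is an excerpt from a Flask app and leans on module-level setup the snippet omits. A sketch of the assumed context (the table name and client usage come from the code above; the InstaLooter 1.x import path is a guess):

import io
import os
from urllib.request import urlopen

import boto3
from flask import Flask, render_template, request, session
from icrawler.builtin import GoogleImageCrawler, GreedyImageCrawler
from instaLooter import InstaLooter  # assumed 1.x import path

app = Flask(__name__)
dynamodb = boto3.resource('dynamodb')  # the 'Images' table exists already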
Example 6
    def getImagesFromDomain(self, query, domain_url, num_pics):
        greedy_crawler = GreedyImageCrawler()
        greedy_crawler.crawl(domains=domain_url,
                             max_num=self.num_of_images,
                             min_size=(self.min_width, self.min_height),
                             max_size=None)
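The method reads self.num_of_images, self.min_width, and self.min_height (its query and num_pics parameters go unused in this excerpt), so the enclosing class must set those attributes. A hypothetical constructor consistent with the attribute names:

class DomainImageFetcher:  # hypothetical name; the real class is not shown
    def __init__(self, num_of_images=100, min_width=200, min_height=200):
        # Attributes read by getImagesFromDomain
        self.num_of_images = num_of_images
        self.min_width = min_width
        self.min_height = min_height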
Example 7
# coding: utf-8

# In[ ]:

from icrawler.builtin import GreedyImageCrawler
greedy_crawler = GreedyImageCrawler(parser_threads=2, downloader_threads=2,
                                    storage={'root_dir': 'data'})
greedy_crawler.crawl(domains='www.***.com', max_num=1000,
                     min_size=None, max_size=None)


# In[ ]:

from icrawler.builtin import BaiduImageCrawler
baidu_crawler = BaiduImageCrawler(storage={'root_dir': 'data'})
baidu_crawler.crawl(keyword='猫', offset=0, max_num=1000,
                    min_size=None, max_size=None)


# In[ ]:

from icrawler.builtin import GoogleImageCrawler
google_crawler = GoogleImageCrawler(parser_threads=2, downloader_threads=2,
                                    storage={'root_dir': 'data'})
google_crawler.crawl(keyword='flower', max_num=1000, date_min=None,
                     date_max=None, min_size=(160, 160), max_size=None)
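The date_min/date_max arguments belong to older icrawler releases; from 0.6 on, the Google crawler takes a single filters dict instead. A sketch assuming a 0.6-series install (the date range shown is just an example):

from icrawler.builtin import GoogleImageCrawler

google_crawler = GoogleImageCrawler(storage={'root_dir': 'data'})
# In icrawler 0.6+, search restrictions are passed as one filters dict
google_crawler.crawl(keyword='flower',
                     max_num=1000,
                     filters={'date': ((2017, 1, 1), (2017, 11, 30))})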