def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each new url and store it as a Page.

    Urls for which a Page with the same url already exists are skipped.
    For every other url a line ``category.name,url,title,image_file_name``
    is appended to ``page_meta_data.txt`` (relative to the current working
    directory) and a Page object is created and saved.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, the image is produced with
        ``crop_screen_shot(image_file_name, 0, 0, 1000, 1000)`` instead of
        ``take_screen_shot(image_file_name)``
    :return: None
    """
    # NOTE: every print below is a single-argument print(...), which gives the
    # same output under the Python 2 print statement and the Python 3 function.

    # The with-block closes the metadata file even when something other than
    # ValueError escapes the loop (the old open()/close() pair leaked it).
    with open('page_meta_data.txt', 'a') as meta_file:
        for url in url_list:
            matches = Page.objects.filter(url=url)
            existing = matches[0] if matches else None
            if existing:
                print("Already added: {0}".format(existing.title))
                continue

            try:
                # create PageCapture object - the original comment states the
                # browser is meant to be 800 x 600.
                capture = PageCapture(url, 800, 600)
                url_file_name = convert_url_to_filename(url) + '.png'
                # TODO (from original): change to accommodate the new changes
                image_file_name = os.path.join(DATA_DIR, url_file_name)
                capture.load_url(url)

                # fetch the screen-shot
                if halved_screen_shot:
                    capture.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                else:
                    capture.take_screen_shot(image_file_name)

                # get the title
                title = capture.get_page_title()

                # Record the page in the metadata file as well as in the db,
                # so the capture results are available without the database.
                meta_file.write('%s,%s,%s,%s\n' % (
                    category.name, url, title, image_file_name))
                print("written {0} to file.".format(title))

                # The stored screenshot path is built from MEDIA_ROOT, not
                # DATA_DIR: the original author (Abdullah) noted that DATA_DIR
                # did not work because it puts the current working directory
                # into the url.
                page = Page(category=category,
                            title=title,
                            is_shown=True,
                            url=url,
                            screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
                page.save()
                print('Page title= ' + page.title + ' has been saved!')
            except ValueError as err:
                # Report the actual error instead of the fixed text
                # "ValueError"; repr() keeps the exception type visible.
                print('Page has ((NOT)) been saved!')
                print('ERROR IS {0!r}'.format(err))
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each url and store it as a Page.

    Unlike a lookup-first approach, no check for an existing Page is made:
    a Page object is created and saved for every url that is processed
    without raising ValueError.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, the image is produced with
        ``crop_screen_shot`` using one of two crop argument sets chosen at
        random (``200, 400, 700, 900`` or ``0, 0, 1000, 1000``); otherwise
        ``take_screen_shot`` is used
    :return: None
    """
    # NOTE: every print below is a single-argument print(...), which gives the
    # same output under the Python 2 print statement and the Python 3 function.
    for url in url_list:
        try:
            # create PageCapture object - the original comment states the
            # browser is meant to be 800 x 600.
            capture = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            # TODO (from original): change to accommodate the new changes
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            capture.load_url(url)

            # fetch the screen-shot
            if halved_screen_shot:
                # Roughly half of the pages get the smaller crop arguments.
                if random.random() > 0.5:
                    capture.crop_screen_shot(image_file_name, 200, 400, 700, 900)
                else:
                    capture.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
            else:
                capture.take_screen_shot(image_file_name)

            # get the title
            title = capture.get_page_title()

            # create page in models/db with category.
            # The stored screenshot path is built from MEDIA_ROOT, not
            # DATA_DIR: the original author (Abdullah) noted that DATA_DIR did
            # not work because it puts the current working directory into the
            # url.
            page = Page(category=category,
                        title=title,
                        is_shown=True,
                        url=url,
                        screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
            page.save()
            print('Page title= ' + page.title + ' has been saved!')
        except ValueError as err:
            # Print the caught exception itself; the previous code printed the
            # ValueError class, which carries no information about the failure.
            print('Page has ((NOT)) been saved!')
            print('ERROR IS')
            print(repr(err))
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each url and store it as a Page.

    No check for an existing Page is made: a Page object is created and
    saved for every url that is processed without raising ValueError.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, the image is produced with
        ``crop_screen_shot`` using one of two crop argument sets chosen at
        random (``200, 400, 700, 900`` or ``0, 0, 1000, 1000``); otherwise
        ``take_screen_shot`` is used
    :return: None
    """
    # NOTE: every print below is a single-argument print(...), which gives the
    # same output under the Python 2 print statement and the Python 3 function.
    for url in url_list:
        try:
            # create PageCapture object - the original comment states the
            # browser is meant to be 800 x 600.
            capture = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            # TODO (from original): change to accommodate the new changes
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            capture.load_url(url)

            # fetch the screen-shot
            if not halved_screen_shot:
                capture.take_screen_shot(image_file_name)
            elif random.random() > 0.5:
                # Roughly half of the pages get the smaller crop arguments.
                capture.crop_screen_shot(image_file_name, 200, 400, 700, 900)
            else:
                capture.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)

            # get the title
            title = capture.get_page_title()

            # create page in models/db with category.
            # The stored screenshot path is built from MEDIA_ROOT, not
            # DATA_DIR: the original author (Abdullah) noted that DATA_DIR did
            # not work because it puts the current working directory into the
            # url.
            screenshot_path = os.path.join('/', MEDIA_ROOT, url_file_name)
            page = Page(category=category,
                        title=title,
                        is_shown=True,
                        url=url,
                        screenshot=screenshot_path)
            page.save()
            print('Page title= ' + page.title + ' has been saved!')
        except ValueError as err:
            # Print the caught exception itself; the previous code printed the
            # ValueError class, which carries no information about the failure.
            print('Page has ((NOT)) been saved!')
            print('ERROR IS')
            print(repr(err))
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each new url and store it as a Page.

    Urls for which a Page with the same url already exists are skipped.
    For every other url a line ``category.name,url,title,image_file_name``
    is appended to ``page_meta_data.txt`` (relative to the current working
    directory) and a Page object is created and saved.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, the image is produced with
        ``crop_screen_shot(image_file_name, 0, 0, 1000, 1000)`` instead of
        ``take_screen_shot(image_file_name)``
    :return: None
    """
    # NOTE: every print below is a single-argument print(...), which gives the
    # same output under the Python 2 print statement and the Python 3 function.

    # The with-block closes the metadata file even when something other than
    # ValueError escapes the loop (the old open()/close() pair leaked it).
    with open('page_meta_data.txt', 'a') as meta_file:
        for url in url_list:
            matches = Page.objects.filter(url=url)
            existing = matches[0] if matches else None
            if existing:
                print("Already added: {0}".format(existing.title))
                continue

            try:
                # create PageCapture object - the original comment states the
                # browser is meant to be 800 x 600.
                capture = PageCapture(url, 800, 600)
                url_file_name = convert_url_to_filename(url) + '.png'
                # TODO (from original): change to accommodate the new changes
                image_file_name = os.path.join(DATA_DIR, url_file_name)
                capture.load_url(url)

                # fetch the screen-shot
                if halved_screen_shot:
                    capture.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                else:
                    capture.take_screen_shot(image_file_name)

                # get the title
                title = capture.get_page_title()

                # Record the page in the metadata file as well as in the db,
                # so the capture results are available without the database.
                meta_file.write('%s,%s,%s,%s\n' % (
                    category.name,
                    url,
                    title,
                    image_file_name,
                ))
                print("written {0} to file.".format(title))

                # The stored screenshot path is built from MEDIA_ROOT, not
                # DATA_DIR: the original author (Abdullah) noted that DATA_DIR
                # did not work because it puts the current working directory
                # into the url.
                page = Page(category=category,
                            title=title,
                            is_shown=True,
                            url=url,
                            screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
                page.save()
                print('Page title= ' + page.title + ' has been saved!')
            except ValueError as err:
                # Report the actual error instead of the fixed text
                # "ValueError"; repr() keeps the exception type visible.
                print('Page has ((NOT)) been saved!')
                print('ERROR IS {0!r}'.format(err))