def run_me(offset=0):
    """Run the scraping script on the ZILESINOPTI website.

    Args:
        offset: number of shop links already processed in a previous
            run, used to resume.  Listing pages hold 30 shops each
            (inferred from the original page math — TODO confirm).
    """
    cities = ['timisoara', 'targu-mures', 'satu-mare']
    for city in cities:
        # Translate the flat offset into a (page, position) pair.
        # `//` and `%` fix the old `abs(30 - offset)` computation,
        # which produced out-of-range indices for offsets >= 60.
        current_page = offset // 30 + 1
        index = offset % 30
        while True:
            print('Going to {} page in {}'.format(current_page, city))
            shop_pages = sp.collect_shop_pages(slug=city,
                                               current_page=current_page)
            if not shop_pages:
                break
            for shop_link in shop_pages[index:]:
                try:
                    shop_data = sp.collect_shop_data(url=shop_link)
                except Exception:  # narrowed from bare except; best-effort scrape
                    continue
                print('Scraping {} on {} site'.format(shop_data['name'],
                                                      shop_data['url']))
                model = sp.get_emails_from_page(page=shop_data)
                print('{} email found'.format(len(model['emails'])))
                if len(model['emails']):
                    save_data_to_db(model)
            # The resume offset only applies to the first page scraped;
            # the old code kept skipping `index` shops on every
            # subsequent page.
            index = 0
            current_page = current_page + 1
            if current_page > 10:
                break
def test_data(self):
    """Storing one incident populates the expected Incident and Vehicle fields."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[0])
    stored = Incident.objects.all()[0]
    self.assertEqual("F150020627", stored.incident_id)
    self.assertEqual("2717 Dexter Av N", stored.location_text)
    self.assertEqual("Medic Response", stored.type.type_name)
    self.assertEqual("E9", Vehicle.objects.get(id=1).name)
def post(self):
    """Scrape the flavours page and queue a notification e-mail for every
    subscriber whose wish list matches at least one shop's menu, then
    re-schedule the scrape in 24 hours."""
    logging.info('Running ScrapeHandler')
    scraper = Scraper()
    scraper.scrape("http://www.gdcafe.com/website/index.php/Flavours")
    for subscriber in Subscriber.all().run():
        # Intersect the subscriber's flavours with each shop's list.
        danver_flavours = [f for f in subscriber.flavours
                           if f in scraper.danver_flavours]
        davis_flavours = [f for f in subscriber.flavours
                          if f in scraper.davis_flavours]
        delila_flavours = [f for f in subscriber.flavours
                           if f in scraper.delila_flavours]
        if not (danver_flavours or delila_flavours or davis_flavours):
            continue
        params = {'email': subscriber.email,
                  'danver': danver_flavours,
                  'davis': davis_flavours,
                  'delila': delila_flavours}
        taskqueue.add(url='/worker/email', params=params,
                      queue_name='email', countdown=0)
        logging.info('Submitted task to email ' + subscriber.email)
    # Schedule the next scrape one day from now.
    taskqueue.add(url='/worker/scrape', countdown=86400)
    logging.info('Finished ScrapeHandler')
def fetch_room():
    """Return the rooms matching the software list in the JSON request body.

    With a single requested software, all of its rooms are returned
    (duplicates included, as before); with several, only rooms listed
    under more than one software survive, de-duplicated.
    """
    from collections import Counter  # local: file-level imports not visible here

    scraper = Scraper()
    rows = scraper.get_list_from_tag("tr", "td")
    software_array = request.json['software_array']

    all_rooms = []
    for row in rows:
        if row[0] not in software_array:
            continue
        # Each remaining cell holds a comma-separated room list; split
        # and flatten into individual, stripped room names.  (The old
        # code shadowed the row variable inside its comprehension.)
        for cell in row[1:]:
            all_rooms.extend(part.strip() for part in cell.split(","))

    if len(software_array) > 1:
        # A room must appear under more than one software entry.
        # Counter replaces the old O(n^2) list.count() scan.
        counts = Counter(all_rooms)
        all_rooms = [room for room, seen in counts.items() if seen > 1]

    return {"result": reformat_room_number(all_rooms)}
def fetch_software_name():
    """Return every software name from the scraped (software, rooms) table."""
    scraper = Scraper()
    rows = scraper.get_list_from_tag("tr", "td")
    # The software name is the first cell of each row.
    return {"result": [row[0] for row in rows]}
def test_followup_dispatch(self):
    """A follow-up report adds a Dispatch without duplicating the Incident."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[5])
    self.assertEqual(1, len(Incident.objects.all()))
    self.assertEqual(1, len(Dispatch.objects.all()))
    # The matching report must reuse the existing Incident row and only
    # add a second Dispatch.
    scraper.store_data(INCIDENTS[0])
    self.assertEqual(1, len(Incident.objects.all()))
    self.assertEqual(2, len(Dispatch.objects.all()))
def build_keywords_from_links(tree, keyword_file):
    """Collect keywords from each unvisited entry's links and append the
    results to *keyword_file*.

    Args:
        tree: object exposing ``map`` (entry -> node); each node has
            ``_visited`` and ``links`` attributes.
        keyword_file: path of the output file (opened in append mode).

    Only the first ~21 links of an entry are fetched, and keyword
    lookups are cached per entry so duplicate links are scraped once.
    """
    scraper = Scraper()
    for entry in tree.map.keys():
        if len(entry) == 0:
            continue
        node = tree.map.get(entry)
        if node._visited:
            continue
        if node.links == []:
            continue
        resp = {}
        counter = 0
        links_keywords_cache = {}
        for link in node.links:
            if counter > 20:
                break
            link = link.replace('\'', '')
            try:
                tdict = links_keywords_cache[link]
            except KeyError:
                tdict = scraper.get_keywords(link)
                links_keywords_cache[link] = tdict
            # BUG FIX: the old guard was `len(str(tdict)) > 0`, which is
            # always true (str({}) == '{}'); test the dict itself.
            if tdict:
                for key, value in tdict.items():
                    resp.setdefault(key, []).append(value)
            counter += 1
        # `with` fixes the old leaked handle (open() without close()).
        with open(keyword_file, 'a') as writer:
            writer.write(entry + '\n')
            writer.write(str(resp) + '\n')
def handle(self, *args, **options):
    """Management-command entry point: scrape Cook County inmate data.

    With the start-date option set, search that day for missing
    inmates; otherwise run the full scrape.  All failures are logged.
    """
    try:
        monitor = Monitor(log, verbose_debug_mode=options[self.VERBOSE_MODE])
        monitor.debug("%s - Started scraping inmates from Cook County Sheriff's site." % datetime.now())
        scraper = Scraper(monitor)
        if options[self.START_DATE]:
            scraper.check_for_missing_inmates(
                datetime.strptime(options[self.START_DATE], "%Y-%m-%d").date())
        else:
            scraper.run()
        monitor.debug("%s - Finished scraping inmates from Cook County Sheriff's site." % datetime.now())
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        log.exception(e)
def setUp(self) -> None:
    """Create one live and one offline scraper plus a sample job offer.

    The second scraper object works without requests, using the static
    page in /data/ — it is used for checking adding to dicts, which
    seems hard to implement with requests, and for testing the scraped
    page structure.  Of course it won't help when the page structure
    changes.
    """
    self.scraper = Scraper('python', 'katowice', 15, 20, False)
    self.scraper_local = Scraper('python', 'katowice', 15, 20, True)
    self.scraper.get_content()
    self.scraper_local.get_content()
    self.job = self.scraper_local.find_jobs_div()[0]
    self.offer = JobOffer(self.job, self.scraper_local.skip)
def _scraper_init():
    """Lazily create the module-level ``_scraper`` singleton from env config."""
    global _scraper
    if not _scraper:
        log.info('creating scraper')
        # `with` closes the URL file; the old `json.load(open(...))`
        # leaked the handle.
        with open(os.environ['WEBSITE_URLS']) as urls_file:
            website_urls = json.load(urls_file)
        _scraper = Scraper(os.environ['CHROME_DRIVER_PATH'],
                           os.environ['SCRAPER_DATA_DIR'],
                           website_urls)
def get_result(term):
    """Scrape Google results for *term* and return text/code snippets as JSON.

    Returns:
        JSON string: a list of ``{page_url: {"code_snippets": [...],
        "relevant_text": [...]}}`` dicts, one entry per result page.
    """
    # get links for fiverr results from Google (space-separated string)
    result = Scraper.scrape_google(search_term=term)
    pages = result.split(" ")
    required_tags = ["h1", "h2", "h3", "h4", "h5", "pre", "code", "p"]
    json_data = []
    for page in pages:
        # BUG FIX: these lists used to live outside the loop, so every
        # page's entry accumulated all previous pages' snippets too.
        text_outputs = []
        code_outputs = []
        res = requests.get(page)
        html_text = BeautifulSoup(res.text, 'html.parser')
        for tag in html_text.find_all(required_tags):
            if tag.name == 'code' or tag.name == 'pre':
                code_outputs.append(tag.get_text())
            else:
                text_outputs.append(tag.get_text())
        json_data.append({
            page: {
                "code_snippets": code_outputs,
                "relevant_text": text_outputs
            }
        })
    print(json_data)
    return json.dumps(json_data)
def __init__(self, root):
    """Assemble the main window: menu, toolbar, main frame and status bar."""
    self.sp = Scraper()
    self.sc = ScreenCapture()

    # --- window basics ---
    root.wm_title("Poker AI")
    self.root = root
    self.icons = []
    self.sp_cnf_region_is_selected = False

    # --- menu ---
    self.menu_bar = self.gui_set_menu()
    self.root.config(menu=self.menu_bar)
    self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)

    # --- toolbar ---
    self.toolbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF,
                            bd=Gui.RELIEF_S)
    self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
    self.toolbar_start = self.gui_pack_toolbar_start()
    self.toolbar_sp_cnf = None

    # --- main frame ---
    self.frame_main = Tk.Frame(self.frame_root)
    self.frame_main.pack(fill=Tk.BOTH, expand=1)
    self.frame_start = self.gui_pack_frame_start(self.frame_main)
    self.frame_sp_cnf = None

    # --- status bar ---
    self.statusbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF,
                              bd=Gui.RELIEF_S)
    self.statusbar.pack(fill=Tk.BOTH)
    self.status_var = Tk.StringVar()
    self.status_label = Tk.Label(self.statusbar,
                                 textvariable=self.status_var, anchor=Tk.W)
    self.status_label.pack(fill=Tk.BOTH)

    # --- show ---
    self.frame_root.pack(fill=Tk.BOTH, expand=1)
    self.resize(300, 200)
def main() -> None:
    """Wire up the hardware, spawn scraper/controller threads, poll sensors."""
    # GPIO setup
    MOTOR1_PINS = [11, 13]
    MOTOR2_PINS = [16, 15]
    GPIO.setmode(GPIO.BOARD)
    GPIO.setup(MOTOR1_PINS, GPIO.OUT)
    GPIO.setup(MOTOR2_PINS, GPIO.OUT)

    logger.setLevel('INFO')
    scraper: Scraper = Scraper(DBStorage())
    controller: Controller = Controller()
    sensors: List[ISensor] = []
    sentries: List[ISentry] = []
    horizontal_sensor: HorizontalCapacitanceSensor = HorizontalCapacitanceSensor()
    vertical_sensor: VerticalCapacitanceSensor = VerticalCapacitanceSensor()
    # TODO: add LCD display?

    # Register every sensor and sentry.
    sensors.extend([PIM486(), ArduinoSerialInterface()])
    sentries.extend([HumiditySentry(), LightSentry(),
                     WaterSentry(), TemperatureSentry()])

    # One worker thread each for the scraper and the controller; the
    # main thread polls the sensors until Ctrl-C.
    scraper_runner = threading.Thread(target=scraper.run, name="Scraper")
    controller_runner = threading.Thread(target=controller.run,
                                         name="Controller")
    try:
        scraper_runner.start()
        controller_runner.start()
        while True:
            for sensor in sensors:
                sensor.poll()  # poll for data for most of the sensors
            # Fixed polling period; the most important sources (pindas
            # etc.) are served via IRQs instead.
            time.sleep(0.5)
    except KeyboardInterrupt:
        scraper.is_done = True
        controller.is_done = True
        scraper_runner.join()
        controller_runner.join()
        # Deinitialise all sensors, then release the GPIO pins.
        for sensor in sensors:
            sensor.close()
        GPIO.cleanup()
        logInfo("Exiting")
def run(self):
    """Messenger thread: pull ``(address, report)`` pairs off ``report_q``
    and deliver each report to the contract owner as a raw transaction.

    A ``(address, None)`` item is the shutdown sentinel.
    """
    # NOTE(review): the original carried the full commented-out ABI of a
    # messaging smart contract here, plus a commented-out
    # `messaging = web3.eth.contract(..., abi=abi)` line — an alternate
    # delivery path via that contract instead of a direct transaction.
    # Neither was used at runtime, so the dead ABI blob was dropped.
    if self.testnet:
        web3 = Web3(Web3.HTTPProvider(
            'https://ropsten.infura.io/v3/29e5c62848414895b549aa4befebe614'))
    else:
        web3 = Web3(Web3.HTTPProvider(
            'https://mainnet.infura.io/v3/29e5c62848414895b549aa4befebe614'))
    acc = web3.eth.account.privateKeyToAccount(self.private_key)
    if not web3.isConnected():
        # Only logged; the loop still runs and fails per message.
        Scraper.log("Messaging:\tNo connection established")
    Scraper.log("Messaging:\tWaiting for messages")
    while True:
        (address, message) = self.report_q.get()
        if message is None:
            break  # terminator received
        message = "Hello, We scanned a smart contract you deployed and found a vulnrability in it, here is the report:\n" + message
        transaction = {
            'to': web3.toChecksumAddress(address),
            'from': acc.address,
            'value': 0,
            'gasPrice': web3.eth.gasPrice,
            'nonce': web3.eth.getTransactionCount(acc.address),
            'data': message.encode('utf-8').hex()
        }
        transaction['gas'] = web3.eth.estimateGas(transaction)
        signed = acc.signTransaction(transaction)
        # Unused `nonce = 1` and `tx = ...` bindings removed; the send
        # call's return value was never read.
        web3.eth.sendRawTransaction(signed.rawTransaction)
        Scraper.log("Messaging:\tSent message")
    Scraper.log("Messaging:\tReceived terminator, shutting down...")
def start_scraping():
    """Prompt for job name, place and radius, then scrape matching offers."""
    name_of_job = input('Enter job name: ')
    place_of_job = input('Enter place: ')
    search_radius = int(input('Enter radius: '))
    scraper = Scraper(name_of_job, place_of_job, search_radius)
    # NOTE(review): backslash continuation inside the f-string — the
    # rendered text depends on the continuation line's column; kept
    # verbatim to preserve output.
    print(f'URL: {scraper.page.url}, Place: {scraper.location}, Job name: \
{scraper.job_name}\n')
    # Built for its side effects; the object itself is currently unused.
    template = Template(scraper.offers, scraper.number_of_offers)
def test_scrap(self):
    """A recorded VCR cassette yields well-formed, filter-respecting docs."""
    nb_doc = 4  # to keep test short
    curr_doc = 0
    scraper = Scraper(disconnected=True)
    directory = os.path.dirname(os.path.abspath(__file__))
    cassette_path = directory + '/vcr_cassettes/test_run_scraper.yaml'
    with vcr.use_cassette(cassette_path, record_mode='none',
                          ignore_localhost=True):
        for doc in scraper.scrap():
            self.assertIsInstance(doc.url, unicode)
            self.assertIsInstance(doc.title, unicode)
            self.assertIsInstance(doc.content, unicode)
            self.assertNotIn(u'.gif', doc.url)   # check extension filter
            self.assertNotIn(u'youtu', doc.url)  # check regex filter
            curr_doc += 1
            if curr_doc == nb_doc:
                break
        else:
            # Loop exhausted before reaching nb_doc documents.
            self.fail('error: not enough docs extracted from cassette, should be ' + str(nb_doc) + ', was ' + str(curr_doc))
def post(self):
    """Scrape all seats, persist Seat/Party/Price entities, tally winners.

    The party of the cheapest candidate wins each seat; ``num_seats``
    is updated on every Party and the winners dict is written out.
    """
    logging.info("Running scrape handler")
    scraper = Scraper()
    scrape_result = scraper.scrape_all()
    seats = []
    parties = []
    # BUG FIX: the old `s.name not in seats` / `c.name not in parties`
    # compared a name string against entity objects — always True, so
    # duplicate entities were created on every occurrence.
    seen_seat_names = set()
    seen_party_names = set()
    winners = {}
    for s in scrape_result:
        if s.name not in seen_seat_names:
            seen_seat_names.add(s.name)
            seats.append(Seat(name=s.name, state=s.state, id=s.name))
            seats[-1].put()
        lowest_price = 1000  # assumed upper bound on prices — TODO confirm
        winner = ''
        for c in s.candidates:
            if c.name not in seen_party_names:
                seen_party_names.add(c.name)
                parties.append(Party(name=c.name, id=c.name))
            party_key = ndb.Key(Party, c.name)
            seat_key = ndb.Key(Seat, s.name)
            price = Price(party=party_key, seat=seat_key, price=c.price)
            price.put()
            if c.price < lowest_price:
                lowest_price = c.price
                winner = c.name
        # One win per seat for the cheapest candidate's party.
        if winner in winners:
            winners[winner] += 1
        else:
            winners[winner] = 1
    for party in parties:
        if party.name in winners:
            party.num_seats = winners[party.name]
        else:
            party.num_seats = 0
        party.put()
    self.response.out.write(winners)
def execute_alternate(self):
    """Drive every reversed input combination through the target URL and
    show the collected results in a master report window.

    Each combination gets its own browser session; a navigation timeout
    is recorded in the result's description rather than aborting.
    """
    input_combinations = Combination(
        self.list_of_input).get_result_reversed()
    master_data = []
    master_id = normalize_string(util.get_uuid())
    for com in input_combinations:
        description = ""
        scr = Scraper()
        browser = scr.dive_plus(self.url, com)
        wait = WebDriverWait(browser, GlobalPreferences.setting["timeout"])
        try:
            # Wait for the browser to navigate away from the start URL.
            # (Unused `page_loaded` binding removed — the result is
            # never read.)
            wait.until_not(
                lambda browser: browser.current_url == self.url)
        except TimeoutException:
            print("Timeout")
            description = "Timeout\n"
        finally:
            result = {
                "url_after": browser.current_url,
                "text_found": scr.find_text_in_browser(
                    GlobalPreferences.setting["expected"]["text_after"]),
                "element_found": scr.find_element_in_browser(
                    GlobalPreferences.setting["expected"]["element_after"])
            }
            data = {
                "result": result,
                "expected": GlobalPreferences.setting["expected"],
                "id": str(get_uuid()),
                "date": get_today(),
                "title": "Skreepy",
                "description": description,
                "tester": GlobalPreferences.setting["tester"],
                "inputs": com,
                "master_test_id": master_id
            }
            master_data.append(data)
            browser.close()
    MasterReportWindow(master_data, self).show()
def main(self):
    """Entry point: read CLI keys, start worker threads, wait for Ctrl-C."""
    if len(sys.argv) > 2:
        Scraper.api = sys.argv[1]
        self.private_key = sys.argv[2]
    else:
        Scraper.log(
            "You did not define an Etherscan API key or Private key.")
        exit()
    # Work queues shared between the thread groups.
    new_address_q = Queue()
    report_q = Queue()
    # Spin up the MythX analysers, one scraper and one messenger.
    for _ in range(mythril_instances):
        analyser = MythX(new_address_q, report_q)
        analyser.start()
    scraper = Scraper(new_address_q)
    scraper.start()
    messenger = Messenger(report_q, self.private_key, True)
    messenger.start()
    try:
        while True:
            time.sleep(.1)
    except KeyboardInterrupt:
        pass
    finally:
        # Poison pills so every consumer shuts down cleanly.
        new_address_q.put(None)
        report_q.put(None)
        report_q.put(None)
def fill_template(template, urls):
    """Append one scraped row per URL to *template*, retrying each URL
    up to five times.

    Args:
        template: pandas-style table supporting ``append(..., ignore_index=True)``.
        urls: iterable of page URLs to scrape.
    Returns:
        The grown table (``append`` returns a copy, so reassignment is
        required).
    """
    table = template
    for url in urls:
        scraper = Scraper(url)
        # BUG FIX: the old `while True` loop `continue`d from its except
        # clause before ever reaching `times += 1`, so a permanently
        # failing URL retried forever.  range(5) bounds the attempts.
        for _attempt in range(5):
            try:
                scraper.start()
                row = scraper.get_dictionary()
                table = table.append(row, ignore_index=True)
                break
            except Exception:  # narrowed from bare except; retry on any failure
                continue
        else:
            print('5 unsuccessful attempts')
    return table
def test_url_is_restaurant_page_succeeds(self):
    """Asserts the endpoint succeeds when the url is a restaurant url"""
    Scraper.get_scraper = MagicMock(return_value=Scraper([]))
    Scraper.get_reviews = MagicMock(return_value=[])
    payload = {
        "url": "https://www.grubhub.com/restaurant/hashbrowns-on-wells-1155-n-wells-st-chicago/287727"
    }
    with APP.test_client() as client:
        response = client.post('/scrape_reviews', data=payload)
        self.assertEqual(response.status_code, 200)
def spider(df, n_processes, stopwords):
    """Launch scraper and parser.

    Args:
        df: url stored in a data frame
        n_processes: process to run a launcher
        stopwords: stopwords handed to the corpus extractor

    Returns:
        table: corpus of a web page
    """
    log = logging.getLogger('spider')
    scraper = Scraper()
    parser = Parser()
    scraped_df = scraper.scrape_job(df, n_processes)
    log.debug("dataframe scraped: {}".format(scraped_df.head(1)))
    # Parse with twice as many workers as cores.
    corpus_df = parser.extract_corpus(scraped_df,
                                      2 * multiprocessing.cpu_count(),
                                      stopwords)
    table = pa.Table.from_pandas(corpus_df)
    log.debug("dataframe parsed: {}".format(table))
    return table
def output(url, body, counter):
    """Accumulate four sequential scrape payloads, then emit a table row.

    Calls 0-2 stash url/html, counts and updates into module globals;
    the fourth call (comments) runs the scraper and appends a row to
    the global table.  Returns the next expected counter value.
    """
    global main_url, html, counts, updates, table
    if counter == 0:
        main_url = url
        html = body
        return counter + 1
    if counter == 1:
        counts = body
        return counter + 1
    if counter == 2:
        updates = body
        return counter + 1
    # Fourth call: `body` carries the comments payload.
    scraper = Scraper(main_url, html, counts, updates, body)
    scraper.start()
    row = scraper.get_dictionary()
    table = table.append(row, ignore_index=True)
    return 0
def run():
    """Run the scraper forever, dumping each document to a JSON file and
    restarting 30 s after any crash (deliberate broad catch: the
    orchestrator must stay alive)."""
    jsonpickle.set_encoder_options('simplejson', indent=4, ensure_ascii=False)
    scraper = Scraper()
    folder = '/media/nico/SAMSUNG/devs/gator/scraping reddit 10-01-2016'
    # BUG FIX: the log path was concatenated without a '/' separator, so
    # the log file landed next to the folder instead of inside it.
    log_file = folder + '/run_scraper-' + str(datetime.datetime.utcnow()) + '.log'
    logging.basicConfig(format=u'%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO, filename=log_file)
    while True:
        try:
            for scraper_document in scraper.scrap():
                filename = folder + '/' + str(datetime.datetime.utcnow()) + '.json'
                json = jsonpickle.encode(scraper_document)
                with codecs.open(filename=filename, mode='w',
                                 encoding='utf-8') as file_desc:
                    file_desc.write(json)
        except Exception as exception:  # pylint: disable=broad-except
            logging.error("The orchestrator crashed! Starting it over ...")
            logging.exception(exception)
            sleep(30)
def __init__(self, root):
    """Build the GUI: window basics, menu, toolbar, main frame, status bar."""
    self.sp = Scraper()
    self.sc = ScreenCapture()

    # Window basics.
    root.wm_title("Poker AI")
    self.root = root
    self.icons = []
    self.sp_cnf_region_is_selected = False

    # Menu bar.
    self.menu_bar = self.gui_set_menu()
    self.root.config(menu=self.menu_bar)
    self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)

    # Toolbar (top strip).
    self.toolbar = Tk.Frame(
        self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
    self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
    self.toolbar_start = self.gui_pack_toolbar_start()
    self.toolbar_sp_cnf = None

    # Main content frame.
    self.frame_main = Tk.Frame(self.frame_root)
    self.frame_main.pack(fill=Tk.BOTH, expand=1)
    self.frame_start = self.gui_pack_frame_start(self.frame_main)
    self.frame_sp_cnf = None

    # Status bar (bottom strip).
    self.statusbar = Tk.Frame(
        self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
    self.statusbar.pack(fill=Tk.BOTH)
    self.status_var = Tk.StringVar()
    self.status_label = Tk.Label(
        self.statusbar, textvariable=self.status_var, anchor=Tk.W)
    self.status_label.pack(fill=Tk.BOTH)

    # Pack the root frame and centre the window.
    self.frame_root.pack(fill=Tk.BOTH, expand=1)
    self.resize(300, 200)
def ng_scraper():
    """CLI entry point: parse args and run the Cook County inmate scraper."""
    parser = argparse.ArgumentParser(description="Scrape inmate data from Cook County Sheriff's site.")
    parser.add_argument('-d', '--day', action='store', dest='start_date', default=None,
                        help=('Specify day to search for missing inmates, format is YYYY-MM-DD. '
                              'If not specified, searches all days.'))
    parser.add_argument('--verbose', action="store_true", dest='verbose', default=False,
                        help='Turn on verbose mode.')
    args = parser.parse_args()
    try:
        monitor = Monitor(log, verbose_debug_mode=args.verbose)
        monitor.debug("%s - Started scraping inmates from Cook County Sheriff's site." % datetime.now())
        scraper = Scraper(monitor)
        if args.start_date:
            scraper.check_for_missing_inmates(
                datetime.strptime(args.start_date, '%Y-%m-%d').date())
        else:
            scraper.run()
        monitor.debug("%s - Finished scraping inmates from Cook County Sheriff's site." % datetime.now())
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        log.exception(e)
def test_happy_path(self):
    """Asserts the endpoint returns a dictionary as json"""
    review_stub = {"author": "test"}
    Scraper.get_scraper = MagicMock(return_value=Scraper([]))
    Scraper.get_reviews = MagicMock(return_value=[review_stub])
    payload = {
        "url": "https://www.grubhub.com/restaurant/hashbrowns-on-wells-1155-n-wells-st-chicago/287727"
    }
    with APP.test_client() as client:
        result = client.post('/scrape_reviews', data=payload)
        print(result)
        self.assertEqual(result.status_code, 200)
def start_scraping():
    """Open a DB session against the configured engine and kick off scraping."""
    config_reader = JsonConfigReader()
    engine = create_engine(Config.DB_URI)
    # Bind the declarative metadata to this engine.
    Base = declarative_base()
    Base.metadata.bind = engine
    session = sessionmaker(bind=engine)()
    config = config_reader.read('config.json')
    manager = ScrapingManager(Scraper(config), session)
    manager.start_scraping()
def main():
    """Scrape sofifa.com player pages with a pool of threaded scrapers
    and dump the collected players to CSV."""
    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("sLogger")
    ua = UserAgent()
    # Stat abbreviation -> showCol column index (sequence 0..14).
    params = {
        "ae": "0", "oa": "1", "pt": "2", "vl": "3", "wg": "4",
        "pf": "5", "hi": "6", "wi": "7", "bp": "8", "pac": "9",
        # BUG FIX: this value read "******" (a masked/garbled artefact);
        # the 0..14 sequence leaves only index 10 for "pas".
        "pas": "10",
        "sho": "11", "phy": "12", "dri": "13", "def": "14"
    }
    query = "&".join([f"showCol%5B{y}%5D={x}" for x, y in params.items()])
    url = f"https://sofifa.com/players?{query}&offset="
    urls = [url + str(offset) for offset in range(0, 18060, 60)]
    # Parameters: 31 scrapers x 10 pages each covers all offsets.
    number_of_scraper = 31
    pages = 10
    scrapers = [
        Scraper(urls[pages * i:min(pages * (i + 1), len(urls))], ua.random)
        for i in range(number_of_scraper)
    ]
    logger.info("Scraping started...")
    multi_threading = MultiThreading(scrapers)
    multi_threading.run()
    logger.info("Scraping finished.")
    logger.info("Generating CSV file...")
    save_data(Scraper.players_scraped)
    logger.info("CSV File is generated.")
def step_impl(context):
    """Populate the environment with test settings and build the scraper."""
    # NOTE(review): a real-looking bot token is committed below; it
    # should be rotated and injected from CI secrets instead.
    test_env = {
        'NOTIFIER_ENABLED': '1',
        'NOTIFIER_TOKEN': '1958942562:AAGKDfy2S7vcXj3cFe0I1-0Hevq8ayM-9U0',
        'NOTIFIER_MESSAGE': 'This is a test notification',
        'NOTIFIER_CHAT_ID': '-545020496',
        'NOTIFIER_LAPSE': '10',
        'NOTIFIER_MAX_RETRY': '5',
        'DATABASE_STORE': 'localsqlite',
        'LOCAL_SQLITE_FILE': 'test.db',
        'ERROR_HANDLER': 'stdout',
        'PROVIDER1_NAME': 'argenprop',
        'PROVIDER1_ENABLED': '1',
        'PROVIDER1_BASE_URL': 'https://www.argenprop.com',
        'PROVIDER1_S1': '/departamento-alquiler-barrio-palermo-2-dormitorios-5-o-más-ambientes',
    }
    os.environ.update(test_env)
    context.scraper = Scraper(Config("bogus.env"))
def scrape_other_sites():
    """Scrape a fixed list of gift-shop sites for e-mail data and persist it."""
    shops = [
        {'name': 'mindblower', 'url': 'https://mindblower.ro/'},
        {'name': 'MrGift', 'url': 'http://www.mrgift.ro/'},
        {'name': 'Zaragoo', 'url': 'http://www.zaragoo.ro/'},
        {'name': 'TheGift', 'url': 'http://www.thegift.ro/'},
        {'name': 'GiftsBoutique', 'url': 'https://www.giftsboutique.ro/'},
        {'name': 'Tu.Ro', 'url': 'https://www.tu.ro/cadouri.html'},
        {'name': 'Smuff', 'url': 'https://www.smuff.ro/cadouri/cadouri-traznite'},
        {'name': 'Cadouri de decoratiuni', 'url': 'http://www.cadouridecoratiuni.ro/'},
        {'name': 'BlueGifts', 'url': 'https://bluegifts.ro/'},
        # NOTE(review): Tu.Ro appears twice in the original list; kept
        # to preserve behaviour, but it is probably an oversight.
        {'name': 'Tu.Ro', 'url': 'https://www.tu.ro/cadouri.html'},
    ]
    for shop in shops:
        try:
            shop_data = sp.get_emails_from_page(page=shop)
        except Exception:  # narrowed from bare except; the scrape is best-effort
            print('EXCEPT')
            continue
        save_data_to_db(shop_data)
def get_reviews():
    """
    Accepts a url as form data and attempts to return review data as json
    """
    # BUG FIX: request.form['url'] raises a KeyError before the None
    # check could ever run; .get() makes the missing-url branch
    # reachable and returns the intended 400.
    url = request.form.get('url')
    if url is None:
        data = {
            "success": False,
            "message": "Must provide a url to scrape reviews."
        }
        # MIME type fixed: was 'applicaton/json'.
        return APP.response_class(status=400,
                                  mimetype='application/json',
                                  response=json.dumps(data))
    parsed_url = urlparse(url)
    main_page_expr = re.compile(r'\/(restaurant)\/(.*)\/([0-9]*)')
    if parsed_url.netloc != "www.grubhub.com" or main_page_expr.match(
            parsed_url.path) is None:
        data = {
            "success": False,
            "message": "Must be a grubhub restaurant page."
        }
        return APP.response_class(status=400,
                                  mimetype='application/json',
                                  response=json.dumps(data))
    try:
        scraper = Scraper.get_scraper(url)
        return jsonify(scraper.get_reviews())
    except Exception as exception:
        data = {
            "success": False,
            "message": f"An unexpected error occured while scraping reviews: {exception}"
        }
        return APP.response_class(status=500,
                                  mimetype='application/json',
                                  response=json.dumps(data))
def scrape_google_search_results(filename='google_results.json'):
    """Scrape e-mail data for domains listed in a Google-results dump.

    Args:
        filename: JSON file of queries, each with 'results' entries
            carrying 'domain', 'link' and 'link_type' keys.

    Domains already present as users are skipped, as are main-ad links.
    """
    import json  # local import preserved from the original

    with open(filename, 'r') as f:
        data = json.load(f)
    for query in data:
        for result in query['results']:
            try:
                user = User.query.filter_by(name=result['domain']).first()
            except Exception:
                # Keep the session usable, then surface the error.
                db.session.rollback()
                raise
            if user:
                print('Skip this, already in the db')
                continue
            if result['link_type'] == 'ads_main':
                continue
            try:
                page_info = {'name': result['domain'], 'url': result['link']}
                shop_data = sp.get_emails_from_page(page=page_info,
                                                    fn=save_data_to_db)
                if shop_data['emails'] is not None:
                    save_data_to_db(shop_data)
            except Exception:  # narrowed from bare except; best-effort per result
                print('Error')
                continue
def ng_scraper():
    """CLI entry point: parse args and scrape yesterday's inmate data
    (or search a specific day for missing inmates)."""
    parser = argparse.ArgumentParser(
        description="Scrape inmate data from Cook County Sheriff's site.")
    parser.add_argument(
        '-d', '--day', action='store', dest='start_date', default=None,
        help=(
            'Specify day to search for missing inmates, format is YYYY-MM-DD. '
            'If not specified, searches all days.'))
    parser.add_argument('--verbose', action="store_true", dest='verbose',
                        default=False, help='Turn on verbose mode.')
    args = parser.parse_args()
    try:
        monitor = Monitor(log, verbose_debug_mode=args.verbose)
        monitor.debug(
            "%s - Started scraping inmates from Cook County Sheriff's site."
            % datetime.now())
        scraper = Scraper(monitor)
        if args.start_date:
            scraper.check_for_missing_inmates(
                datetime.strptime(args.start_date, '%Y-%m-%d').date())
        else:
            # Scrape yesterday's data under the current feature controls.
            scraper.run(date.today() - timedelta(1), feature_controls())
        monitor.debug(
            "%s - Finished scraping inmates from Cook County Sheriff's site."
            % datetime.now())
    except Exception as e:  # FIX: `except Exception, e` is Python-2-only syntax
        log.exception(e)
class AutoPScraperTest(TestCase):
    """Tests for AutoP scraper selection and advert parsing."""

    def setUp(self):
        self.scraper = Scraper()
        self.scraper.set_autop(AdvertOptions.CARS)

    def test_car_scraper_selection(self):
        ''' Test if correct scraper selected by provided option '''
        # assertEquals is a deprecated alias; assertEqual is canonical.
        self.assertEqual(AutoPScraper, type(self.scraper.type()))

    def test_particular_car_advert_scrape(self):
        ''' Tests paricular car advetisement scrape '''
        self.scraper.set_autop(AdvertOptions.CARS)
        # TODO FIX IT USE PYTHONPATH
        url = 'file:///home/apoluden/Programming/workspace/reseller/scraper/tests/bmw_advertisement.html'
        scraped_advert = self.scraper.scrape_particular_advert(None, path=url)
        # (unused `vehicle` local removed)
        advert = scraped_advert['advert']
        seller = scraped_advert['seller']
        self.assertEqual('+37069157207', seller['number'])
        self.assertEqual('5004458', advert['uid'])
        self.assertEqual('Panevėžys,Lietuva', advert['location'])
        self.assertEqual('10 900 €', advert['price'])
        self.assertEqual('BMW 520, 2.0 l., universalas', advert['name'])

    def test_bad_webpage_url_or_path(self):
        ''' Tests wrong URL or path '''
        wrong_path = 'file://wrong/path'
        wrong_url = 'http://wrong.url'
        scraper = AutoPCarScraper()
        self.assertIsNone(scraper.page_content(wrong_url))
        self.assertIsNone(scraper.page_content(None, wrong_path))
def handle(self, *args, **options):
    """Management command body: run one data fetch via the scraper."""
    Scraper().fetch_data()
def test_no_dupes(self):
    """Storing the same incident twice creates only one Incident row."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[0])
    self.assertEqual(1, len(Incident.objects.all()))
    # The repeated report must not insert a duplicate.
    scraper.store_data(INCIDENTS[0])
    self.assertEqual(1, len(Incident.objects.all()))
def test_open_close(self):
    """A follow-up report sets the end time on the open incident."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[0])
    self.assertIsNone(Incident.objects.all()[0].end)
    # The closing report fills in the incident's end timestamp.
    scraper.store_data(INCIDENTS[1])
    self.assertIsNotNone(Incident.objects.all()[0].end)
def test_dupe_vehic(self):
    """The same vehicle across two incidents yields one Vehicle row."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[0])
    self.assertEqual(1, len(Vehicle.objects.all()))
    # A second incident reusing the vehicle must not duplicate it.
    scraper.store_data(INCIDENTS[2])
    self.assertEqual(1, len(Vehicle.objects.all()))
from scraper.scraper import Scraper
from scraper import domains

if __name__ == '__main__':
    # Seed the domain registry with a single site, then run the scraper.
    magazine = domains.Domain('http://jkjas.com/Magazine')
    magazine.add_page('http://jkjas.com/Magazine')
    domains.add_domain(magazine)
    scraper = Scraper()
    scraper.run()
class Gui ( object ):
    """Tkinter front-end for the poker screen-scraper.

    Two screens share one root window: a small start screen (take a
    screenshot / open a config file) and a larger scraper-configuration
    screen (select marker regions on a screenshot, preview/zoom/nudge
    them, run OCR on a region).  Marker/region state lives in
    ``self.spCnf`` (a ScraperConfig); image matching and OCR are
    delegated to ``self.sp`` (Scraper); captures to ``self.sc``
    (ScreenCapture).

    NOTE(review): the code uses Python 2-era APIs (``tkFileDialog``,
    ``Image.fromstring``, integer ``/`` for centering math) — confirm
    target interpreter before modernizing.
    """

    # On-disk locations for captured screenshots and saved config files.
    path_screenshots = 'scraper/screenshots'
    path_config_files = 'scraper/config_files'
    CONFIG_FILE_EXT = 'scp'
    # Shared widget-styling constants.
    RELIEF = Tk.RAISED
    RELIEF_S = 2
    # Grid sticky-flag shorthands.
    GRID_V = Tk.N+Tk.S
    GRID_H = Tk.E+Tk.W
    GRID_BOTH = Tk.N+Tk.S+Tk.E+Tk.W

    def __init__ ( self, root ):
        """Build the whole widget tree under *root* and show the start screen."""
        self.sp = Scraper()
        self.sc = ScreenCapture()
        # Init
        root.wm_title("Poker AI")
        self.root = root
        self.icons = []  # keeps PhotoImage refs alive (Tk drops un-referenced images)
        self.sp_cnf_region_is_selected = False
        # Menu
        self.menu_bar = self.gui_set_menu()
        self.root.config(menu=self.menu_bar)
        self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)
        # Toolbar
        self.toolbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
        self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
        self.toolbar_start = self.gui_pack_toolbar_start()
        self.toolbar_sp_cnf = None  # built lazily on first switch to config screen
        # Main Frame
        self.frame_main = Tk.Frame(self.frame_root)
        self.frame_main.pack(fill=Tk.BOTH,expand=1)
        self.frame_start = self.gui_pack_frame_start(self.frame_main)
        self.frame_sp_cnf = None  # built lazily on first switch to config screen
        # Status Bar
        self.statusbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
        self.statusbar.pack(fill=Tk.BOTH)
        self.status_var = Tk.StringVar()
        self.status_label = Tk.Label(self.statusbar, textvariable=self.status_var, anchor=Tk.W)
        self.status_label.pack(fill=Tk.BOTH)
        # Show
        self.frame_root.pack(fill=Tk.BOTH,expand=1)
        self.resize(300,200)

    # ====== GENERAL METHODS ======

    def resize ( self, w, h ):
        """Resize the window to w x h and center it on the screen."""
        win_w = self.root.winfo_screenwidth()
        win_h = self.root.winfo_screenheight()
        # Integer division centers the window (Python 2 semantics for '/').
        x = win_w/2 - w/2
        y = win_h/2 - h/2
        self.root.geometry("%dx%d+%d+%d" % (w, h, x, y))

    def set_status ( self, text='' ):
        """Show *text* in the status bar (empty string clears it)."""
        self.status_var.set(text)

    # ====== GUI COMPONENTS ======
    # --- menu ---

    def gui_set_menu ( self ):
        """Build and return the menu bar (Scraper + Edit menus)."""
        menubar = Tk.Menu(self.root)
        # Screen Scraper menu
        menu_sp = Tk.Menu(menubar, tearoff=0)
        self.gui_add_menu_cmd(menu_sp, 'New', self.sp_cnf_new, 'Ctrl-n')
        self.gui_add_menu_cmd(menu_sp, 'Open', self.sp_cnf_open, 'Ctrl-o')
        self.gui_add_menu_cmd(menu_sp, 'Save', self.sp_cnf_save, 'Ctrl-s')
        self.gui_add_menu_cmd(menu_sp, 'Save as...', self.sp_cnf_saveas, 'Ctrl-Shift-s')
        menu_sp.add_separator()
        self.gui_add_menu_cmd(menu_sp, 'Exit', self.root.quit)
        menubar.add_cascade(label="Scraper", menu=menu_sp)
        # Edit menu (placeholder entries)
        editmenu = Tk.Menu(menubar, tearoff=0)
        editmenu.add_command(label="LOL")
        editmenu.add_separator()
        editmenu.add_command(label="WTF")
        menubar.add_cascade(label="Edit", menu=editmenu)
        # Key bindings
        self.root.bind_all("<Command-n>", self.sp_cnf_new)

        return menubar

    def gui_add_menu_cmd ( self, parent, label, command, key=None ):
        """Add a menu entry with an optional accelerator key binding.

        On macOS the 'Ctrl' prefix is rewritten to 'Cmd' for display.
        NOTE(review): when *key* is None (e.g. the 'Exit' entry),
        ``bind_all(None, command)`` looks like it would fail — confirm
        whether the binding should be guarded by ``if key``.
        """
        if key:
            if platform.system() == "Darwin":
                key = key.replace('Ctrl', 'Cmd')

        parent.add_command(label=label, command=command, accelerator=key)
        self.root.bind_all(key, command)

    # --- toolbar ---

    def gui_pack_toolbar_start ( self ):
        """Build the start-screen toolbar; 'new'/'open' buttons are enabled
        only when screenshots / config files already exist on disk."""
        screenshots = [ f for f in os.listdir(Gui.path_screenshots) if os.path.isfile(os.path.join(Gui.path_screenshots,f)) ]
        state_btn_new = Tk.NORMAL if screenshots else Tk.DISABLED

        config_files = [ f for f in os.listdir(Gui.path_config_files) if os.path.isfile(os.path.join(Gui.path_config_files,f)) ]
        state_btn_open = Tk.NORMAL if config_files else Tk.DISABLED

        toolbar = self.gui_pack_tools_btns([
            {'icon':'screencap', 'command':self.sp_do_screencap, 'text':'Take a screenshot'},
            {'icon':'new', 'command':self.sp_cnf_new, 'state':state_btn_new, 'text':'New config file (needs 1 screenshot)'},
            {'icon':'open', 'command':self.sp_cnf_open, 'state':state_btn_open, 'text':'Open a config file...'},
        ])

        return toolbar

    def gui_pack_toolbar_sp_cnf ( self ):
        """Build the scraper-configuration-screen toolbar."""
        toolbar = self.gui_pack_tools_btns([
            {'icon':'save', 'command':self.sp_cnf_save, 'text':'Save configuration file'},
            {'icon':'load_screenshot', 'command':self.sp_open_img, 'text':'Load a screenshot'},
            {'icon':'lol', 'command':self.sp_cnf_return_start, 'side':Tk.RIGHT, 'text':'Return to start screen'},
        ])

        return toolbar

    def gui_pack_tools_btns ( self, btns, parent=None ):
        """Pack a row of icon buttons described by *btns* (list of kwargs
        dicts for gui_pack_btn) into *parent* (default: main toolbar).

        Returns the containing frame; each button is also reachable via
        ``frame.btns[icon_name]`` so callers can reconfigure state later.
        """
        if parent==None:
            parent = self.toolbar
        frame = Tk.Frame(parent)
        frame.pack(fill=Tk.BOTH)
        frame.btns = {}

        for btn in btns:
            b = self.gui_pack_btn(frame, **btn)
            frame.btns[btn['icon']] = b

        return frame

    def gui_pack_btn ( self, frame, icon, command, side=Tk.LEFT, state=Tk.NORMAL, space=2, relief=Tk.FLAT, text='' ):
        """Create an image 'button' (a Canvas) from gui_icons/<icon>.png.

        Optional ``<icon>_active.png`` / ``<icon>_disabled.png`` variants
        are used when present, otherwise the base image is reused.
        *text* is shown in the status bar while the pointer hovers.
        """
        img = ImageTk.PhotoImage(Image.open('gui_icons/'+icon+'.png'))
        self.icons.append(img)
        try:
            img_active = ImageTk.PhotoImage(Image.open('gui_icons/'+icon+'_active.png'))
            self.icons.append(img_active)
        except:
            img_active = img
        try:
            img_disabled = ImageTk.PhotoImage(Image.open('gui_icons/'+icon+'_disabled.png'))
            self.icons.append(img_disabled)
        except:
            img_disabled = img

        w = ImageTk.PhotoImage.width(img)
        h = ImageTk.PhotoImage.height(img)
        d = space

        cnv = Tk.Canvas(frame, width=w, height=h, state=state, highlightthickness=0, relief=relief, bd=d)
        cnv.create_image(d, d, anchor='nw',image=img, activeimage=img_active, disabledimage=img_disabled)
        cnv.bind('<ButtonRelease-1>', command)
        # Hover shows the tooltip text in the status bar; leaving clears it.
        cnv.bind('<Enter>', lambda ev,self=self: self.set_status(text))
        cnv.bind('<Leave>', lambda ev,self=self: self.set_status(''))
        cnv.pack(side=side)
        return cnv

    # --- start screen ---

    def gui_pack_frame_start ( self, frame ):
        """Build the start screen: a canvas showing the application logo."""
        st_frame = Tk.Frame(frame,relief=Gui.RELIEF,bd=Gui.RELIEF_S)
        cnv = Tk.Canvas(st_frame,width=160,height=128)
        cnv.pack()
        logo = 'gui_icons/logo.png'
        img = ImageTk.PhotoImage(Image.open(logo))
        self.canvas_start_img = img  # keep a reference so Tk doesn't discard it
        cnv.create_image(0, 0, image=img)
        cnv.config(scrollregion=cnv.bbox(Tk.ALL))

        st_frame.pack(fill=Tk.BOTH,expand=1)

        return st_frame

    # --- scraper configuration screen ---

    def gui_pack_frame_sp_cnf ( self, parent ):
        """Build the configuration screen: screenshot canvas on the left,
        preview/tabs/controls column on the right."""
        # resize
        self.resize(900, 600)
        frame = Tk.Frame(parent,relief=Gui.RELIEF,bd=Gui.RELIEF_S)
        frame.pack(fill=Tk.BOTH,expand=1)
        frame.grid_rowconfigure(0, weight=1)

        # screen shot frame
        (frame_sc, canvas_sc) = self.gui_canvas_and_scroll(frame)
        self.canvas_sc = canvas_sc
        canvas_sc.configure(cursor='cross')
        frame_sc.grid(row=0, column=0, sticky=Gui.GRID_BOTH)
        frame.grid_columnconfigure(0, weight=1)
        frame.configure(relief=Gui.RELIEF,bd=Gui.RELIEF_S)

        # controls frame
        frame_ctr = Tk.Frame(frame)
        frame_ctr.grid(row=0, column=1, sticky=Gui.GRID_BOTH)
        # - preview and controls
        frame_preview = Tk.Frame(frame_ctr)
        frame_preview.grid(row=0)
        self.gui_pack_preview(frame_preview)
        # - tabs bar
        frame_tabs = Tk.Frame(frame_ctr,relief=Tk.GROOVE,bd=2)
        frame_tabs.grid(row=1, sticky=Gui.GRID_BOTH)
        self.sp_cnf_tabs = self.gui_pack_tools_btns([
            {'icon':'target', 'command':self.sp_cnf_switch_tab_marker, 'relief':Tk.SUNKEN, 'text':'Markers'},
            {'icon':'card', 'command':self.do_nothing, 'relief':Tk.RAISED, 'text':'Cards templates'},
            {'icon':'text', 'command':self.sp_cnf_switch_tab_ocr, 'relief':Tk.RAISED, 'text':'OCR'},
        ], frame_tabs)
        # - tab content (markers tab built now, OCR tab built lazily)
        frame_tab = Tk.Frame(frame_ctr)
        self.sp_cnf_frame_tab_content = frame_tab
        self.gui_pack_tab_marker(frame_tab)
        self.sp_cnf_tab_ocr = None
        self.sp_cnf_tab_current = self.sp_cnf_tab_marker
        self.sp_cnf_tab_icon_current = 'target'
        frame_tab.grid(row=2)
        # - place holder
        Tk.Label(frame_ctr, text=':)').grid(row=3)
        frame_ctr.grid_rowconfigure(3, weight=1)

        return frame

    def gui_canvas_and_scroll ( self, parent ):
        """Return (frame, canvas) where the canvas has horizontal and
        vertical scrollbars plus mouse-wheel scrolling."""
        frame = Tk.Frame(parent)
        xscroll = Tk.Scrollbar(frame, orient=Tk.HORIZONTAL)
        yscroll = Tk.Scrollbar(frame)
        canvas = Tk.Canvas(frame, xscrollcommand=xscroll.set, yscrollcommand=yscroll.set)

        def mouseWheel ( ev ):
            # ev.state distinguishes plain wheel (vertical) from modified wheel
            if ev.state==0:
                # up/down
                canvas.yview("scroll", ev.delta,"units")
            else:
                # right/left
                canvas.xview("scroll", ev.delta,"units")

        xscroll.config(command=canvas.xview)
        yscroll.config(command=canvas.yview)
        canvas.bind("<MouseWheel>", mouseWheel)

        frame.grid_rowconfigure(0, weight=1)
        frame.grid_columnconfigure(0, weight=1)
        canvas.grid(row=0, column=0, sticky=Gui.GRID_BOTH)
        xscroll.grid(row=1, column=0, sticky=Gui.GRID_H)
        yscroll.grid(row=0, column=1, sticky=Gui.GRID_V)

        return (frame, canvas)

    def gui_pack_preview ( self, frame_preview ):
        """Build the region preview: zoom toolbar, four nudge-button pads
        (N/S/W/E edges of the selection) and the preview canvas."""
        # tools
        frame_toolbar = Tk.Frame(frame_preview)
        self.sp_cnf_preview_toolbar = self.gui_pack_tools_btns([
            {'icon':'zoom_plus', 'command':self.sp_prev_zoom_plus},
            {'icon':'zoom_minus', 'command':self.sp_prev_zoom_minus},
        ], frame_toolbar)
        frame_toolbar.grid(row=1, columnspan=2, sticky=Gui.GRID_BOTH)
        # move buttons: each pair grows/shrinks one edge of the selected rect
        self.canvas_sp_cnf_prev_img_zoom = 2
        frame_prev_above = Tk.Frame(frame_preview)
        self.gui_pack_btn(frame_prev_above, 'down', lambda e, s=self:self.sp_prev_move(e,'N',-1), side=Tk.BOTTOM, space=0)
        self.gui_pack_btn(frame_prev_above, 'up', lambda e, s=self:self.sp_prev_move(e,'N',1), side=Tk.BOTTOM, space=0)
        frame_prev_above.grid(row=1, column=1)
        frame_prev_left = Tk.Frame(frame_preview)
        self.gui_pack_btn(frame_prev_left, 'right', lambda e, s=self:self.sp_prev_move(e,'W',-1), side=Tk.RIGHT, space=0)
        self.gui_pack_btn(frame_prev_left, 'left', lambda e, s=self:self.sp_prev_move(e,'W',1), side=Tk.RIGHT, space=0)
        frame_prev_left.grid(row=2, column=0)
        frame_prev_right = Tk.Frame(frame_preview)
        self.gui_pack_btn(frame_prev_right, 'left', lambda e, s=self:self.sp_prev_move(e,'E',-1), side=Tk.LEFT, space=0)
        self.gui_pack_btn(frame_prev_right, 'right', lambda e, s=self:self.sp_prev_move(e,'E',1), side=Tk.LEFT, space=0)
        frame_prev_right.grid(row=2, column=2)
        frame_prev_below = Tk.Frame(frame_preview)
        self.gui_pack_btn(frame_prev_below, 'up', lambda e, s=self:self.sp_prev_move(e,'S',-1), side=Tk.TOP, space=0)
        self.gui_pack_btn(frame_prev_below, 'down', lambda e, s=self:self.sp_prev_move(e,'S',1), side=Tk.TOP, space=0)
        frame_prev_below.grid(row=3, column=1)
        # preview canvas
        (frame_prev, canvas_prev) = self.gui_canvas_and_scroll(frame_preview)
        self.canvas_sp_cnf_prev = canvas_prev
        canvas_prev.configure(width=128, height=128, bg='gray')
        frame_prev.grid(row=2, column=1, sticky=Tk.N)

    def gui_pack_tab_marker ( self, frame_tab ):
        """Build the 'Markers' tab: marker-name combobox plus add/rename/
        locate buttons whose visibility tracks the editing state."""
        frame = Tk.Frame(frame_tab)
        frame.pack()
        self.sp_cnf_tab_marker = frame
        self.sp_cnf_marker_selected = None
        # combobox
        list_markers = self.spCnf.get_list_markers()
        self.sp_cnf_marker_name = Tk.StringVar()
        if list_markers:
            self.sp_cnf_marker_name.set(list_markers[0])
        self.sp_cnf_next_marker_name = 'marker'+str(len(list_markers)+1)
        self.combobox_markers = ttk.Combobox(frame, values=list_markers, textvariable=self.sp_cnf_marker_name)
        self.combobox_markers.bind('<<ComboboxSelected>>', self.sp_cnf_switch_marker)
        # after(1): let the keystroke land in the widget before reading its text
        self.combobox_markers.bind('<Key>', lambda ev, self=self: self.root.after(1,self.sp_cnf_editing_marker_name))
        self.combobox_markers.grid(row=0,column=0)
        # commands
        frame_commands = Tk.Frame(frame)
        frame_addedit = Tk.Frame(frame_commands)
        self.sp_cnf_btn_add_marker = self.gui_pack_btn(frame_addedit, 'plus', self.sp_cnf_add_marker, state=Tk.DISABLED)
        self.sp_cnf_btn_edit_marker = self.gui_pack_btn(frame_addedit, 'ok', self.sp_cnf_rename_marker, state=Tk.DISABLED)
        self.sp_cnf_btn_edit_marker.pack_forget()
        frame_addedit.pack(side=Tk.LEFT)
        frame_locate = Tk.Frame(frame_commands)
        self.sp_cnf_btn_locate = self.gui_pack_btn(frame_locate, 'target', self.sp_cnf_locate_marker, state=Tk.DISABLED)
        self.sp_cnf_btn_locate.pack_forget()
        frame_locate.pack(side=Tk.LEFT)
        frame_commands.grid(row=1,column=0)
        # init if packed after loading a conf file
        if list_markers:
            self.root.after(1, self.sp_cnf_switch_marker)

    def gui_pack_tab_ocr ( self, frame_tab ):
        """Build the 'OCR' tab: a single button that runs OCR on the region."""
        frame = Tk.Frame(frame_tab)
        frame.pack()
        self.sp_cnf_tab_ocr = frame
        test_btn = self.gui_pack_btn(frame, 'work', self.sp_cnf_do_ocr, side=Tk.TOP)

    # --- switch between screens or tabs ---

    def gui_switch_to_sp_cnf ( self ):
        """Swap the start screen out for the configuration screen,
        building the latter on first use."""
        self.toolbar_start.pack_forget()
        self.frame_start.pack_forget()

        if self.toolbar_sp_cnf == None:
            # first time: build toolbar + frame (they pack themselves)
            self.toolbar_sp_cnf = self.gui_pack_toolbar_sp_cnf()
            self.frame_sp_cnf = self.gui_pack_frame_sp_cnf(self.frame_main)
        else:
            # subsequent times: just re-show the cached widgets
            self.resize(900, 600)
            self.toolbar_sp_cnf.pack(fill=Tk.BOTH)
            self.frame_sp_cnf.pack(fill=Tk.BOTH,expand=1)

    def sp_cnf_return_start ( self, ev=None ):
        """Swap the configuration screen out for the start screen."""
        self.toolbar_sp_cnf.pack_forget()
        self.frame_sp_cnf.pack_forget()
        self.resize(300, 200)
        self.toolbar_start.pack(fill=Tk.BOTH)
        self.frame_start.pack(fill=Tk.BOTH,expand=1)

    def sp_cnf_switch_tab_marker ( self, ev=None ):
        """Show the Markers tab and sink its tab button."""
        self.sp_cnf_tab_current.pack_forget()
        self.sp_cnf_tab_marker.pack()
        self.sp_cnf_tab_current = self.sp_cnf_tab_marker
        self.sp_cnf_tabs.btns[self.sp_cnf_tab_icon_current].configure(relief=Tk.RAISED)
        self.sp_cnf_tabs.btns['target'].configure(relief=Tk.SUNKEN)
        self.sp_cnf_tab_icon_current = 'target'

    def sp_cnf_switch_tab_ocr ( self, ev=None ):
        """Show the OCR tab (building it on first use) and sink its button."""
        self.sp_cnf_tab_current.pack_forget()
        if self.sp_cnf_tab_ocr:
            self.sp_cnf_tab_ocr.pack()
        else:
            self.gui_pack_tab_ocr(self.sp_cnf_frame_tab_content)
        self.sp_cnf_tab_current = self.sp_cnf_tab_ocr
        self.sp_cnf_tabs.btns[self.sp_cnf_tab_icon_current].configure(relief=Tk.RAISED)
        self.sp_cnf_tabs.btns['text'].configure(relief=Tk.SUNKEN)
        self.sp_cnf_tab_icon_current = 'text'

    # ====== GUI ACTION ======
    # --- button commands ---

    def sp_cnf_new ( self, ev=None ):
        """Start a new config from the first .tif screenshot on disk.
        NOTE(review): assumes at least one .tif exists (button is disabled
        otherwise, but the keyboard accelerator is not)."""
        self.spCnf = ScraperConfig()
        self.gui_switch_to_sp_cnf()
        screenshots = [ f for f in os.listdir(Gui.path_screenshots) if os.path.splitext(f)[1] == '.tif' ]
        f = os.path.join(Gui.path_screenshots,screenshots[0])
        self.sp_show_image(f)

    def sp_cnf_open ( self, ev=None ):
        """Open an existing config file chosen via file dialog."""
        f = tkFileDialog.askopenfilename(parent=self.root, title='Open config file', initialdir=Gui.path_config_files, defaultextension=Gui.CONFIG_FILE_EXT )
        if f:
            self.spCnf = ScraperConfig.load(f)
            self.gui_switch_to_sp_cnf()
            self.sp_show_image(self.spCnf.imagename)

    def sp_cnf_save ( self, ev=None ):
        """Save the current config; fall back to 'Save as' if it is unnamed."""
        if not self.spCnf.filename:
            self.sp_cnf_saveas(ev)
        else:
            self.spCnf.save()

    def sp_cnf_saveas ( self, ev=None ):
        """Save the current config under a name chosen via file dialog."""
        f = tkFileDialog.asksaveasfilename(parent=self.root, title='Save config file as', initialdir=Gui.path_config_files, defaultextension=Gui.CONFIG_FILE_EXT )
        if f:
            self.spCnf.save(f)

    def sp_do_screencap ( self, ev=None ):
        """Capture the screen and store it as the next numbered .tif."""
        a = self.sc.capture()
        # assumes capture() returns a 2-D numpy-like array — TODO confirm axis order
        img_data = Image.fromstring('L', (a.shape[0], a.shape[1]), a.astype('b').tostring())
        img = ImageTk.PhotoImage(image=img_data)

        screenshots = [ f for f in os.listdir(Gui.path_screenshots) if os.path.isfile(os.path.join(Gui.path_screenshots,f)) ]
        cv2.imwrite('%s/screenshot_%d.tif' % (Gui.path_screenshots, (len(screenshots)+1)), a)
        # enable new button
        self.toolbar_start.btns['new'].configure(state=Tk.NORMAL)

    def sp_open_img ( self, ev=None ):
        """Load a screenshot chosen via file dialog into the config canvas."""
        f = tkFileDialog.askopenfilename(parent=self.root, title='Open a screenshot', initialdir=Gui.path_screenshots, filetypes=[("Screenshots", "*.tif")] )
        try:
            self.sp_show_image(f)
        except:
            # cancelled dialog / unreadable file: silently keep current image
            pass

    def sp_show_image ( self, f ):
        """Display image file *f* on the screenshot canvas and hook up
        the rectangle-selection mouse bindings."""
        self.spCnf.set_image_file(f)
        img = ImageTk.PhotoImage(Image.open(f))
        self.canvas_sc_img = img  # keep a reference so Tk doesn't discard it
        cnv = self.canvas_sc
        cnv.create_image(0, 0, image=img, anchor="nw")
        cnv.config(scrollregion=cnv.bbox(Tk.ALL))

        cnv.bind('<ButtonPress-1>', self.on_button_press)
        cnv.bind('<ButtonRelease-1>', self.on_button_release)
        cnv.bind('<B1-Motion>', self.on_button_motion)

    # --- mouse event ---

    def on_button_press ( self, event ):
        """Begin a rubber-band selection at the press point."""
        cnv = self.canvas_sc
        # get coordinates
        x0 = max(0, cnv.canvasx(event.x))
        y0 = max(0, cnv.canvasy(event.y))
        # save start coordinates
        self.click_x0 = x0
        self.click_y0 = y0
        # delete previous rectangle
        if hasattr(self, 'selected_rect'):
            cnv.delete(self.selected_rect)
        # create rectangle
        self.selected_rect = cnv.create_rectangle(x0,y0,x0,y0,outline='red')
        # update gui
        self.sp_cnf_btn_add_marker.pack()
        self.sp_cnf_btn_edit_marker.pack_forget()
        self.sp_cnf_btn_locate.pack_forget()
        self.sp_cnf_marker_name.set(self.sp_cnf_next_marker_name)

    def on_button_motion ( self, event ):
        """Stretch the rubber-band rectangle while dragging."""
        cnv = self.canvas_sc
        # get coordinates
        x1 = max(0, cnv.canvasx(event.x))
        y1 = max(0, cnv.canvasy(event.y))
        # get start coordinates
        x0,y0 = (self.click_x0, self.click_y0)
        # update rectangle
        cnv.coords(self.selected_rect, x0, y0, x1, y1)

    def on_button_release ( self, event ):
        """Finish the selection: normalize corners, preview the region,
        record it as a new pattern, enable the add-marker button."""
        cnv = self.canvas_sc
        # get coordinates
        x1 = max(0, cnv.canvasx(event.x))
        y1 = max(0, cnv.canvasy(event.y))
        # invert if necessary
        x0 = min(self.click_x0, x1)
        y0 = min(self.click_y0, y1)
        x1 = max(self.click_x0, x1)
        y1 = max(self.click_y0, y1)
        # update rectangle
        cnv.coords(self.selected_rect, x0, y0, x1, y1)
        # show preview
        self.sp_preview(x0, y0, x1, y1)
        # set selected region
        self.spCnf.new_pattern()
        self.sp_cnf_select_rect(x0, y0, x1, y1)
        # update gui
        self.sp_cnf_btn_add_marker.config(state=Tk.NORMAL)

    # --- preview ---

    def sp_preview ( self, x0, y0, x1, y1 ):
        """Copy the (x0,y0)-(x1,y1) region of the screenshot into the
        preview image source and redraw the preview."""
        # round coordinates
        l = int(x0)
        t = int(y0)
        r = int(x1)
        b = int(y1)
        # get sub image for preview (raw Tcl 'image copy' — no PIL round-trip)
        img = Tk.PhotoImage()
        img.tk.call(img, 'copy', self.canvas_sc_img, '-from', l, t, r, b, '-to', 0, 0)
        # save sub image data
        self.canvas_sp_cnf_prev_img_id = None
        self.canvas_sp_cnf_prev_img_source = (img, abs(r-l), abs(b-t))
        # show preview
        self.sp_preview_draw()

    def sp_preview_draw ( self ):
        """Render the stored sub-image centered on the preview canvas at
        the current zoom factor."""
        cnv = self.canvas_sp_cnf_prev
        # load sub image data
        (img, w, h) = self.canvas_sp_cnf_prev_img_source
        # delete old image canvas
        if self.canvas_sp_cnf_prev_img_id:
            cnv.delete(self.canvas_sp_cnf_prev_img_id)
        # compute zoomed canvas
        zoom = self.canvas_sp_cnf_prev_img_zoom
        w2 = w*zoom
        h2 = h*zoom
        img2 = img.zoom(zoom)
        self.canvas_sp_cnf_prev_img = img2  # keep a reference for Tk
        self.canvas_sp_cnf_prev_img_zoom = zoom
        w_cnv = int(cnv.cget('width'))
        h_cnv = int(cnv.cget('height'))
        x = w_cnv/2 - w2/2
        y = h_cnv/2 - h2/2
        # update canvas image
        self.canvas_sp_cnf_prev_img_id = cnv.create_image(x, y, image=img2, anchor="nw")
        cnv.config(scrollregion=(x-5, y-5, w_cnv/2+w2/2+5, h_cnv/2+h2/2+5))

    def sp_prev_zoom_plus ( self, ev=None ):
        """Double the preview zoom (no-op until a region is selected)."""
        if not self.sp_cnf_region_is_selected:
            return
        self.canvas_sp_cnf_prev_img_zoom = self.canvas_sp_cnf_prev_img_zoom * 2
        self.sp_preview_draw()
        self.sp_cnf_preview_toolbar.btns['zoom_minus'].configure(state=Tk.NORMAL)

    def sp_prev_zoom_minus ( self, ev=None ):
        """Halve the preview zoom down to 1x, disabling the button at 1x."""
        if not self.sp_cnf_region_is_selected:
            return
        zoom = self.canvas_sp_cnf_prev_img_zoom
        if zoom >= 2:
            self.canvas_sp_cnf_prev_img_zoom = zoom / 2
            self.sp_preview_draw()
        if zoom == 2:
            self.sp_cnf_preview_toolbar.btns['zoom_minus'].configure(state=Tk.DISABLED)

    def sp_prev_move ( self, ev, orientation, step ):
        """Nudge one edge (N/S/W/E) of the selected region by *step*
        pixels, then refresh the rectangle, preview and stored rect."""
        if not self.sp_cnf_region_is_selected:
            return
        cnv = self.canvas_sc
        # get selected rectangle
        region = self.spCnf.get_pattern()
        (x0, y0, x1, y1) = region.rect
        # new coordinates
        if orientation=='N':
            y0 = y0-step
        elif orientation=='S':
            y1 = y1+step
        elif orientation=='W':
            x0 = x0-step
        elif orientation=='E':
            x1 = x1+step
        # update rectangle and preview
        cnv.coords(self.selected_rect, x0, y0, x1, y1)
        self.sp_preview(x0,y0,x1,y1)

        self.sp_cnf_select_rect(x0, y0, x1, y1)

    def sp_cnf_select_rect ( self, *coords ):
        """Record *coords* as the current pattern rect and mark a region
        as selected (enables preview/zoom/nudge actions)."""
        # set current rect
        self.spCnf.select_rect(coords)
        # update gui
        self.sp_cnf_region_is_selected = True

    # --- markers ---

    def sp_cnf_add_marker ( self, ev=None ):
        """Create a marker from the current selection under the name in
        the combobox; ignored while the button is disabled."""
        if self.sp_cnf_btn_add_marker.cget('state') == Tk.DISABLED:
            return
        # change combo list
        name = self.sp_cnf_marker_name.get()
        new_list = tuple(self.combobox_markers['values']) + (name,)
        self.combobox_markers['values'] = new_list
        self.combobox_markers.set(name)
        self.sp_cnf_next_marker_name = 'marker'+str(len(new_list)+1)
        # add marker to spCnf
        self.spCnf.add_marker(name)
        # update gui
        self.canvas_sc.itemconfig(self.selected_rect, outline='green')
        self.sp_cnf_btn_add_marker.pack_forget()
        self.sp_cnf_btn_edit_marker.pack()
        self.sp_cnf_btn_locate.pack()
        self.set_status('Marker created')

    def sp_cnf_editing_marker_name ( self, ev=None ):
        """Enable the rename button only while the typed name differs
        from the current marker's name."""
        current_marker = self.spCnf.get_pattern()
        new_name = self.sp_cnf_marker_name.get()
        if current_marker.name == new_name:
            self.sp_cnf_btn_edit_marker.config(state=Tk.DISABLED)
        else:
            self.sp_cnf_btn_edit_marker.config(state=Tk.NORMAL)

    def sp_cnf_rename_marker ( self, ev=None ):
        """Rename the current marker in both the combobox and the config."""
        # get names
        new_name = self.sp_cnf_marker_name.get()
        old_name = self.spCnf.get_pattern().name
        # replace in combobox
        new_list = [ new_name if name==old_name else name for name in self.combobox_markers['values'] ]
        self.combobox_markers['values'] = new_list
        # replace in spCnf
        self.spCnf.rename_marker(old_name, new_name)
        # update gui
        self.sp_cnf_btn_edit_marker.config(state=Tk.DISABLED)
        self.set_status('Marker renamed')

    def sp_cnf_switch_marker ( self, ev=None ):
        """Make the combobox's marker current and preview its region."""
        # switch current pattern
        name = self.combobox_markers.get()
        marker = self.spCnf.switch_marker(name)
        coords = marker.rect
        # show preview
        self.sp_preview(*coords)
        # switch add/edit buttons
        self.sp_cnf_btn_add_marker.pack_forget()
        self.sp_cnf_btn_edit_marker.pack()
        self.sp_cnf_btn_locate.pack()
        # remove old rectangle
        if hasattr(self, 'selected_rect'):
            self.canvas_sc.delete(self.selected_rect)

    def sp_cnf_locate_marker ( self, ev=None ):
        """Template-match the current marker against the screenshot and
        outline the best match in blue."""
        # find coordinates
        pattern = self.spCnf.get_pattern().img_cv2
        (x0, y0) = self.sp.locate(self.spCnf.img_cv2, pattern)
        # pattern.shape unpacks as (height, width, depth) — cv2 image layout
        (h, w, d) = pattern.shape
        x1 = x0 + w
        y1 = y0 + h
        # show rectangle
        if hasattr(self, 'selected_rect'):
            self.canvas_sc.delete(self.selected_rect)
        self.selected_rect = self.canvas_sc.create_rectangle(x0, y0, x1, y1, outline='blue')

    # --- OCR ---

    def sp_cnf_do_ocr ( self, ev=None ):
        """Run OCR over the current pattern region (result currently unused)."""
        img = self.spCnf.create_pattern_image()
        txt = self.sp.do_ocr(img)
        # print txt

    def do_nothing ( self, ev=None ):
        """Placeholder callback for not-yet-implemented buttons."""
        pass
def test_multi_vehic(self):
    """An incident reporting two units must create two Vehicle rows."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[6])
    self.assertEqual(2, len(Vehicle.objects.all()))
def setUp(self):
    """Give every test a fresh Scraper instance."""
    self.scraper = Scraper()
def test_closed(self):
    """A lone closing record must not create an Incident row."""
    scraper = Scraper()
    scraper.store_data(INCIDENTS[1])
    self.assertEqual(0, len(Incident.objects.all()))
def setUp(self):
    """Give every test a scraper pre-configured for car adverts."""
    scraper = Scraper()
    scraper.set_autop(AdvertOptions.CARS)
    self.scraper = scraper
# them if needed. import hashlib import time from models import * from scraper.scraper import Scraper encoding = 'UTF-8' # helper function for benchmarking current_milli_time = lambda: int(round(time.time() * 1000)) # First crawl through the whole index and get all articles we can find print('Start crawling...') start_time = current_milli_time() scraper = Scraper() articles = scraper.scrape() time_taken = current_milli_time() - start_time print('Found {} articles in {} ms'.format(len(articles), time_taken)) # Now fill the database print('Starting database work') for article in articles: # build a hash so we can more easily find out if we have an article already h = hashlib.sha256() h.update(str(article['date']).encode(encoding)) h.update(article['place'].encode(encoding)) h.update(article['description'].encode(encoding)) digest = h.digest()