Example #1
0
def run_me(offset=0):
	'''
	Run the scraping script on the ZILESINOPTI website.

	For every city the shop listing pages are walked in order; each shop
	is collected, its page is searched for e-mail addresses, and the
	result is saved to the database when at least one address was found.

	offset -- number of shops to skip before scraping starts. The
	arithmetic below treats a listing page as holding 30 shops, so an
	offset of 30 or more also skips whole pages. The same offset is
	applied to every city.
	'''
	shops_per_page = 30
	last_page = 10
	cities = ['timisoara', 'targu-mures', 'satu-mare']
	for city in cities:
		if offset >= shops_per_page:
			# Skip the fully consumed pages and resume inside the next one.
			current_page = offset // shops_per_page + 1
			index = offset % shops_per_page
		else:
			current_page = 1
			index = offset
		while True:
			print('Going to {} page in {}'.format(current_page, city))
			shop_pages = sp.collect_shop_pages(slug=city, current_page=current_page)
			if not shop_pages:
				break
			for shop_link in shop_pages[index:]:
				try:
					shop_data = sp.collect_shop_data(url=shop_link)
				except Exception:
					# Best effort: a shop that cannot be collected is skipped.
					continue
				print('Scraping {} on {} site'.format(shop_data['name'], shop_data['url']))
				model = sp.get_emails_from_page(page=shop_data)
				print('{} email found'.format(len(model['emails'])))
				if len(model['emails']):
					save_data_to_db(model)
			# The offset only applies to the first page that is visited.
			index = 0
			current_page = current_page + 1
			if current_page > last_page:
				break
Example #2
0
 def test_data(self):
     """Storing the first sample incident persists its id, location, type and vehicle."""
     scraper = Scraper()
     scraper.store_data(INCIDENTS[0])

     stored = Incident.objects.all()[0]
     self.assertEqual("F150020627", stored.incident_id)
     self.assertEqual("2717 Dexter Av N", stored.location_text)
     self.assertEqual("Medic Response", stored.type.type_name)

     first_vehicle = Vehicle.objects.get(id=1)
     self.assertEqual("E9", first_vehicle.name)
Example #3
0
  def post(self):
    """Scrape the flavour page and queue an e-mail task per matching subscriber.

    A subscriber gets an e-mail task only when at least one of their
    flavours appears in one of the three scraped flavour lists. The
    handler finally re-queues itself with a countdown of 86400 seconds.
    """
    logging.info('Running ScrapeHandler')
    scraper = Scraper()
    scraper.scrape("http://www.gdcafe.com/website/index.php/Flavours")

    for subscriber in Subscriber.all().run():
      wanted = subscriber.flavours
      danver = [flavour for flavour in wanted if flavour in scraper.danver_flavours]
      davis = [flavour for flavour in wanted if flavour in scraper.davis_flavours]
      delila = [flavour for flavour in wanted if flavour in scraper.delila_flavours]

      # Nothing this subscriber asked for is available: no e-mail.
      if not (danver or delila or davis):
        continue

      payload = {'email': subscriber.email, 'danver': danver, 'davis': davis, 'delila': delila}
      taskqueue.add(url='/worker/email', params=payload, queue_name='email', countdown=0)
      logging.info('Submitted task to email ' + subscriber.email)

    taskqueue.add(url='/worker/scrape', countdown=86400)
    logging.info('Finished ScrapeHandler')
Example #4
0
def fetch_room():
    """Return the rooms listed for the software named in the request.

    Reads ``software_array`` from the JSON request body. Each scraped table
    row is treated as ``[software_name, room_cell, ...]`` where a room cell
    is a comma-separated string of rooms. When more than one software is
    requested, only rooms that were collected more than once are kept.

    Returns:
        dict with a single ``"result"`` key holding the reformatted rooms.
    """
    from collections import Counter

    scraper = Scraper()
    rows = scraper.get_list_from_tag("tr", "td")
    software_array = request.json['software_array']

    rooms = []
    for row in rows:
        if row[0] not in software_array:
            continue
        # Every remaining cell is a comma-separated list of rooms.
        for cell in row[1:]:
            rooms.extend(room.strip() for room in cell.split(","))

    # If there is more than 1 software required, keep rooms seen repeatedly.
    if len(software_array) > 1:
        occurrences = Counter(rooms)
        rooms = [room for room, seen in occurrences.items() if seen > 1]

    # reformat rooms number
    rooms = reformat_room_number(rooms)
    return {"result": rooms}
Example #5
0
def fetch_software_name():
    """Return the first cell of every scraped table row under the ``"result"`` key."""
    rows = Scraper().get_list_from_tag("tr", "td")
    return {"result": [row[0] for row in rows]}
Example #6
0
 def test_followup_dispatch(self):
     """Storing sample 5 and then sample 0 keeps one incident but adds a second dispatch."""
     scraper = Scraper()
     # (index into INCIDENTS, dispatch count expected after storing it)
     for sample_index, expected_dispatches in ((5, 1), (0, 2)):
         scraper.store_data(INCIDENTS[sample_index])
         self.assertEqual(1, len(Incident.objects.all()))
         self.assertEqual(expected_dispatches, len(Dispatch.objects.all()))
Example #7
0
    def build_keywords_from_links(tree, keyword_file):
        """Collect keywords from each tree entry's links and append them to a file.

        Entries with an empty name, entries already marked ``_visited`` and
        entries without links are skipped. For every other entry the
        keywords of its links are merged into one ``{keyword: [values]}``
        mapping, and the entry name plus that mapping are appended to
        ``keyword_file`` as two lines.

        Args:
            tree: object exposing ``map``, a mapping of entry name to node.
            keyword_file: path of the file the results are appended to.
        """
        scraper = Scraper()

        for entry in tree.map.keys():
            if len(entry) == 0:
                continue

            curr = tree.map.get(entry)
            if curr._visited:
                continue
            if curr.links == []:
                continue

            resp = {}
            counter = 0
            links_keywords_cache = {}

            for link in curr.links:
                # Stop once enough links have contributed keywords.
                if counter > 20:
                    break
                link = link.replace('\'', '')
                if link not in links_keywords_cache:
                    links_keywords_cache[link] = scraper.get_keywords(link)
                tdict = links_keywords_cache[link]

                # A link without keywords contributes nothing.
                if not tdict:
                    continue
                for key, value in tdict.items():
                    resp.setdefault(key, []).append(value)
                counter += 1

            # Append mode: results of earlier entries are preserved.
            with open(keyword_file, 'a') as writer:
                writer.write(entry + '\n')
                writer.write(str(resp) + '\n')
Example #8
0
    def handle(self, *args, **options):
        """Scrape inmates, or check one day for missing inmates when a start date is given.

        Reads the verbose flag and the optional start date (``YYYY-MM-DD``)
        from ``options``. Any exception raised while scraping is logged and
        not propagated.
        """
        try:
            monitor = Monitor(log, verbose_debug_mode=options[self.VERBOSE_MODE])
            monitor.debug("%s - Started scraping inmates from Cook County Sheriff's site." % datetime.now())

            scraper = Scraper(monitor)
            if options[self.START_DATE]:
                scraper.check_for_missing_inmates(datetime.strptime(options[self.START_DATE], "%Y-%m-%d").date())
            else:
                scraper.run()

            monitor.debug("%s - Finished scraping inmates from Cook County Sheriff's site." % datetime.now())
        except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
            log.exception(e)
Example #9
0
 def setUp(self) -> None:
     """Prepare two scrapers and one sample job offer for the tests."""
     shared_args = ('python', 'katowice', 15, 20)
     self.scraper = Scraper(*shared_args, False)
     self.scraper_local = Scraper(*shared_args, True)
     self.scraper.get_content()
     self.scraper_local.get_content()
     # The second scraper works without requests, using the static page in
     # /data/. It is used to check adding to dicts, which seems hard to
     # implement with requests, and it also helps to test scraping of the
     # page structure. Of course it won't help when the page structure
     # changes.
     local = self.scraper_local
     self.job = local.find_jobs_div()[0]
     self.offer = JobOffer(self.job, local.skip)
Example #10
0
def _scraper_init():
    """Create the module-level ``_scraper`` on first use.

    The Chrome driver path, the data directory and the path of the JSON
    file with the website URLs are taken from the environment variables
    ``CHROME_DRIVER_PATH``, ``SCRAPER_DATA_DIR`` and ``WEBSITE_URLS``.
    Does nothing when a scraper already exists.
    """
    global _scraper
    if not _scraper:
        log.info('creating scraper')
        # Close the URL file as soon as it has been parsed.
        with open(os.environ['WEBSITE_URLS']) as urls_file:
            website_urls = json.load(urls_file)
        _scraper = Scraper(os.environ['CHROME_DRIVER_PATH'],
                           os.environ['SCRAPER_DATA_DIR'],
                           website_urls)
Example #11
0
def get_result(term):
    """Fetch the Google result pages for ``term`` and extract their text and code.

    ``Scraper.scrape_google`` is expected to return the result links as one
    space-separated string. Each page is downloaded and the text of its
    heading, paragraph, ``pre`` and ``code`` tags is collected.

    Returns:
        JSON string: a list with one ``{url: {"code_snippets": [...],
        "relevant_text": [...]}}`` object per page.
    """
    result = Scraper.scrape_google(search_term=term)
    # Drop empty tokens produced by repeated or trailing spaces.
    pages = [link for link in result.split(" ") if link]

    required_tags = ["h1", "h2", "h3", "h4", "h5", "pre", "code", "p"]
    code_tags = ("code", "pre")
    json_data = []
    for page in pages:
        res = requests.get(page)
        html_text = BeautifulSoup(res.text, 'html.parser')

        # Fresh lists per page so one page never reports another page's content.
        text_outputs = []
        code_outputs = []
        for tag in html_text.find_all(required_tags):
            if tag.name in code_tags:
                code_outputs.append(tag.get_text())
            else:
                text_outputs.append(tag.get_text())

        json_data.append({
            page: {
                "code_snippets": code_outputs,
                "relevant_text": text_outputs,
            }
        })

    print(json_data)

    return json.dumps(json_data)
Example #12
0
	def __init__ ( self, root ):
		"""Build the main window: menu, toolbar, main frame and status bar.

		root -- the top-level window; its title and menu are set here and
		all frames are created inside it.
		"""
		self.sp = Scraper()
		self.sc = ScreenCapture()
		# Window title and basic state
		root.wm_title("Poker AI")
		self.root = root
		self.icons = []
		self.sp_cnf_region_is_selected = False
		# Menu bar, attached to the root window
		self.menu_bar = self.gui_set_menu()
		self.root.config(menu=self.menu_bar)
		# Root frame: parent of the toolbar, main frame and status bar
		self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)
		# Toolbar, packed at the top of the root frame
		self.toolbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
		self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
		self.toolbar_start = self.gui_pack_toolbar_start()
		self.toolbar_sp_cnf = None
		# Main frame, expands to take the remaining space
		self.frame_main = Tk.Frame(self.frame_root)
		self.frame_main.pack(fill=Tk.BOTH,expand=1)
		self.frame_start = self.gui_pack_frame_start(self.frame_main)
		self.frame_sp_cnf = None
		# Status bar: a label whose text follows self.status_var
		self.statusbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
		self.statusbar.pack(fill=Tk.BOTH)
		self.status_var = Tk.StringVar()
		self.status_label = Tk.Label(self.statusbar, textvariable=self.status_var, anchor=Tk.W)
		self.status_label.pack(fill=Tk.BOTH)
		# Show the assembled root frame and set the initial size
		self.frame_root.pack(fill=Tk.BOTH,expand=1)
		self.resize(300,200)
Example #13
0
def main() -> None:
    """Configure GPIO, start the scraper and controller threads and poll the sensors.

    The sensors are polled every 0.5 seconds until the loop is interrupted.
    On any exit from the loop (Ctrl-C or an unexpected error) the worker
    threads are asked to stop via their ``is_done`` flag, the sensors are
    closed and the GPIO pins are released.
    """
    # Setup GPIO
    MOTOR1_PINS = [11, 13]
    MOTOR2_PINS = [16, 15]
    GPIO.setmode(GPIO.BOARD)
    GPIO.setup(MOTOR1_PINS, GPIO.OUT)
    GPIO.setup(MOTOR2_PINS, GPIO.OUT)
    # -------
    logger.setLevel('INFO')
    scraper: Scraper = Scraper(DBStorage())
    controller: Controller = Controller()
    sensors: List[ISensor] = []
    sentries: List[ISentry] = []
    horizontal_sensor: HorizontalCapacitanceSensor = HorizontalCapacitanceSensor(
    )
    vertical_sensor: VerticalCapacitanceSensor = VerticalCapacitanceSensor()

    # TODO: add LCD display?

    # register all sensors
    sensors.append(PIM486())
    sensors.append(ArduinoSerialInterface())

    # register all sentries
    sentries.append(HumiditySentry())
    sentries.append(LightSentry())
    sentries.append(WaterSentry())
    sentries.append(TemperatureSentry())

    # spawn two threads and run one for any external events (user input etc)
    scraper_runner: threading.Thread = threading.Thread(target=scraper.run,
                                                        name="Scraper")
    controller_runner: threading.Thread = threading.Thread(
        target=controller.run, name="Controller")
    try:
        scraper_runner.start()
        controller_runner.start()
        # main loop: poll the sensors periodically
        while True:
            for s in sensors:
                s.poll()  # poll for data for most of the sensors

            time.sleep(0.5)
    except KeyboardInterrupt:
        # Ctrl-C is the regular way to stop; cleanup happens below.
        pass
    finally:
        scraper.is_done = True
        controller.is_done = True
        for runner in (scraper_runner, controller_runner):
            # join() raises on a thread that was never started.
            if runner.is_alive():
                runner.join()

        # deinit all sensors
        for s in sensors:
            s.close()
        # GPIO CLEANUP
        GPIO.cleanup()

        logInfo("Exiting")
0
    def run(self):
        """Deliver queued vulnerability reports to contract owners as Ethereum transactions.

        Connects through Infura (Ropsten when ``self.testnet`` is truthy,
        mainnet otherwise) and consumes ``(address, message)`` tuples from
        ``self.report_q``. Each report is sent to ``address`` as the data
        payload of a zero-value transaction signed with
        ``self.private_key``. A tuple whose message is ``None`` ends the
        loop.
        """
        # Reports go out as direct transactions. A messaging smart contract
        # (0xCdcDD44f7f617B965983a8C1bB0B845A5766FEbA, function
        # sendMessage(address _to, string _text)) could be used instead;
        # that path is not implemented here.

        # NOTE(review): the Infura project id is embedded in the URL;
        # consider loading it from configuration.
        network = 'ropsten' if self.testnet else 'mainnet'
        web3 = Web3(
            Web3.HTTPProvider(
                'https://{}.infura.io/v3/29e5c62848414895b549aa4befebe614'
                .format(network)))

        acc = web3.eth.account.privateKeyToAccount(self.private_key)

        if not web3.isConnected():
            Scraper.log("Messaging:\tNo connection established")

        Scraper.log("Messaging:\tWaiting for messages")

        while True:
            (address, message) = self.report_q.get()

            # A None message is the shutdown signal.
            if message is None:
                break

            message = "Hello, We scanned a smart contract you deployed and found a vulnrability in it, here is the report:\n" + message

            transaction = {
                'to': web3.toChecksumAddress(address),
                'from': acc.address,
                'value': 0,
                'gasPrice': web3.eth.gasPrice,
                'nonce': web3.eth.getTransactionCount(acc.address),
                'data': message.encode('utf-8').hex()
            }

            # Gas is estimated from the transaction built so far.
            transaction['gas'] = web3.eth.estimateGas(transaction)

            signed = acc.signTransaction(transaction)
            web3.eth.sendRawTransaction(signed.rawTransaction)

            Scraper.log("Messaging:\tSent message")
        Scraper.log("Messaging:\tReceived terminator, shutting down...")
Example #15
0
def start_scraping():
    """Prompt for the search criteria, run the scraper and build the offers template."""
    job_name = input('Enter job name: ')
    place = input('Enter place: ')
    radius = int(input('Enter radius: '))

    scraper = Scraper(job_name, place, radius)
    summary = (
        f'URL: {scraper.page.url}, '
        f'Place: {scraper.location}, '
        f'Job name: {scraper.job_name}\n'
    )
    print(summary)

    template = Template(scraper.offers, scraper.number_of_offers)
    def test_scrap(self):
        """Replay the recorded cassette and validate the first few scraped documents."""
        nb_doc = 4  # to keep test short
        curr_doc = 0
        scraper = Scraper(disconnected=True)
        directory = os.path.dirname(os.path.abspath(__file__))
        cassette = directory + '/vcr_cassettes/test_run_scraper.yaml'
        with vcr.use_cassette(cassette, record_mode='none', ignore_localhost=True):
            for curr_doc, doc in enumerate(scraper.scrap(), 1):
                for attribute in ('url', 'title', 'content'):
                    self.assertIsInstance(getattr(doc, attribute), unicode)
                self.assertNotIn(u'.gif', doc.url)  # check extension filter
                self.assertNotIn(u'youtu', doc.url)  # check regex filter

                if curr_doc == nb_doc:
                    break
            else:
                # Only reached when the scraper ran dry before nb_doc documents.
                self.fail('error: not enough docs extracted from cassette, should be '
                          + str(nb_doc) + ', was ' + str(curr_doc))
Example #17
0
    def post(self):
        """Scrape all seats, store seats, parties and prices, and write the seat tally.

        For every scraped seat the candidate with the lowest price (below
        the 1000 starting value) is counted as the winner. Each party's
        ``num_seats`` is set to the number of seats it wins, and the
        ``{party name: seats won}`` mapping is written to the response.
        """
        logging.info("Running scrape handler")
        scraper = Scraper()
        scrape_result = scraper.scrape_all()

        stored_seats = set()  # names of seats already written
        parties = {}          # party name -> Party entity
        winners = {}          # party name -> number of seats won
        for s in scrape_result:
            # Track seats by name; comparing a name against entities never matches.
            if s.name not in stored_seats:
                Seat(name=s.name, state=s.state, id=s.name).put()
                stored_seats.add(s.name)
            seat_key = ndb.Key(Seat, s.name)

            lowest_price = 1000
            winner = ''
            for c in s.candidates:
                if c.name not in parties:
                    parties[c.name] = Party(name=c.name, id=c.name)
                party_key = ndb.Key(Party, c.name)
                Price(party=party_key, seat=seat_key, price=c.price).put()

                if c.price < lowest_price:
                    lowest_price = c.price
                    winner = c.name

            winners[winner] = winners.get(winner, 0) + 1

        for name, party in parties.items():
            party.num_seats = winners.get(name, 0)
            party.put()

        self.response.out.write(winners)
Example #18
0
 def execute_alternate(self):
     """Run the scraper once per input combination and show a master report.

     For each combination a browser is driven to ``self.url`` with the
     inputs, the code waits until the URL changes (or times out), and a
     record with the observed URL, text and element checks is collected.
     The record is collected even when the wait fails, and the browser
     is always closed.
     """
     input_combinations = Combination(
         self.list_of_input).get_result_reversed()
     master_data = []
     master_id = normalize_string(util.get_uuid())
     for com in input_combinations:
         description = ""
         scr = Scraper()
         browser = scr.dive_plus(self.url, com)
         try:
             wait = WebDriverWait(browser,
                                  GlobalPreferences.setting["timeout"])
             try:
                 # Done as soon as the browser has left the start URL.
                 wait.until_not(lambda drv: drv.current_url == self.url)
             except TimeoutException:
                 print("Timeout")
                 description = "Timeout\n"
             finally:
                 expected = GlobalPreferences.setting["expected"]
                 result = {
                     "url_after":
                     browser.current_url,
                     "text_found":
                     scr.find_text_in_browser(expected["text_after"]),
                     "element_found":
                     scr.find_element_in_browser(expected["element_after"])
                 }
                 master_data.append({
                     "result": result,
                     "expected": expected,
                     "id": str(get_uuid()),
                     "date": get_today(),
                     "title": "Skreepy",
                     "description": description,
                     "tester": GlobalPreferences.setting["tester"],
                     "inputs": com,
                     "master_test_id": master_id
                 })
         finally:
             # Release the browser even if building the record failed.
             browser.close()
     MasterReportWindow(master_data, self).show()
Example #19
0
  def post(self):
    """Scrape all seats, store seats, parties and prices, and write the seat tally.

    For every scraped seat the candidate with the lowest price (below the
    1000 starting value) is counted as the winner. Each party's
    ``num_seats`` is set to the number of seats it wins, and the
    ``{party name: seats won}`` mapping is written to the response.
    """
    logging.info("Running scrape handler")
    scraper = Scraper()
    scrape_result = scraper.scrape_all()

    stored_seats = set()  # names of seats already written
    parties = {}          # party name -> Party entity
    winners = {}          # party name -> number of seats won
    for s in scrape_result:
      # Track seats by name; comparing a name against entities never matches.
      if s.name not in stored_seats:
        Seat(name=s.name, state=s.state, id=s.name).put()
        stored_seats.add(s.name)
      seat_key = ndb.Key(Seat, s.name)

      lowest_price = 1000
      winner = ''
      for c in s.candidates:
        if c.name not in parties:
          parties[c.name] = Party(name=c.name, id=c.name)
        party_key = ndb.Key(Party, c.name)
        Price(party=party_key, seat=seat_key, price=c.price).put()

        if c.price < lowest_price:
          lowest_price = c.price
          winner = c.name

      winners[winner] = winners.get(winner, 0) + 1

    for name, party in parties.items():
      party.num_seats = winners.get(name, 0)
      party.put()

    self.response.out.write(winners)
Example #20
0
    def main(self):
        """Start the analysis workers, the scraper and the messenger, then wait for Ctrl-C.

        ``sys.argv[1]`` is stored as ``Scraper.api`` and ``sys.argv[2]`` as
        the private key. When either is missing, a message is logged and
        the process exits with a non-zero status. On shutdown, ``None``
        terminators are put on both queues.
        """
        if len(sys.argv) > 2:
            Scraper.api = sys.argv[1]
            self.private_key = sys.argv[2]
        else:
            Scraper.log(
                "You did not define an Etherscan API key or Private key.")
            # sys.exit is always available, unlike the site-provided exit().
            sys.exit(1)

        # create different queues
        new_address_q = Queue()
        report_q = Queue()
        # create different threads
        for _ in range(mythril_instances):
            myth_x = MythX(new_address_q, report_q)
            myth_x.start()

        scraper = Scraper(new_address_q)
        scraper.start()

        # NOTE(review): the third argument is presumably the testnet flag — confirm.
        messenger = Messenger(report_q, self.private_key, True)
        messenger.start()

        try:
            # Idle until the user interrupts; the work happens in the threads.
            while True:
                time.sleep(.1)
        except KeyboardInterrupt:
            pass
        finally:
            new_address_q.put(None)
            report_q.put(None)
            report_q.put(None)
Example #21
0
def fill_template(template, urls):
    """Append one scraped row per URL to ``template`` and return the result.

    Each URL is scraped with up to five attempts; a URL that still fails
    is reported on stdout and skipped.

    Args:
        template: table object supporting ``append(row, ignore_index=True)``.
        urls: iterable of URLs to scrape.

    Returns:
        The table with the scraped rows appended (``template`` itself when
        ``urls`` is empty).
    """
    max_attempts = 5
    table = template
    for url in urls:
        scraper = Scraper(url)

        for _ in range(max_attempts):
            try:
                scraper.start()
                row = scraper.get_dictionary()
                table = table.append(row, ignore_index=True)
            except Exception:
                # Failed attempt: it counts toward the limit, then retry.
                continue
            else:
                break
        else:
            print('5 unsuccessful attempts')

    return table
Example #22
0
 def test_url_is_restaurant_page_succeeds(self):
     """
         Asserts the endpoint succeeds when the url is a restaurant url
     """
     from unittest.mock import patch

     sent = {
         "url":
         "https://www.grubhub.com/restaurant/hashbrowns-on-wells-1155-n-wells-st-chicago/287727"
     }
     # patch.object restores the real methods afterwards, so the stubs
     # cannot leak into other tests.
     with patch.object(Scraper, 'get_scraper',
                       MagicMock(return_value=Scraper([]))), \
             patch.object(Scraper, 'get_reviews',
                          MagicMock(return_value=[])), \
             APP.test_client() as client:
         result = client.post('/scrape_reviews', data=sent)
         self.assertEqual(result.status_code, 200)
Example #23
0
def spider(df, n_processes, stopwords):
    """Scrape the pages listed in a data frame and parse them into a corpus table.

    Args:
        df: data frame holding the URLs to scrape
        n_processes: number of processes handed to the scraping job
        stopwords: passed through to the parser's corpus extraction

    Returns:
        table: result of ``pa.Table.from_pandas`` on the parsed data frame
    """
    logger = logging.getLogger('spider')

    scraper = Scraper()
    parser = Parser()

    scraped = scraper.scrape_job(df, n_processes)
    logger.debug("dataframe scraped: {}".format(scraped.head(1)))

    # The parser is given twice as many workers as there are CPUs.
    parser_workers = 2 * multiprocessing.cpu_count()
    parsed = parser.extract_corpus(scraped, parser_workers, stopwords)

    table = pa.Table.from_pandas(parsed)
    logger.debug("dataframe parsed: {}".format(table))

    return table
Example #24
0
def output(url, body, counter):
    """Collect four consecutive response bodies, then scrape them into one table row.

    Calls with ``counter`` 0, 1 and 2 store ``body`` (and, for 0, ``url``)
    in module-level globals and return ``counter + 1``. Any other counter
    treats ``body`` as the comments, builds a ``Scraper`` from the stored
    pieces, appends its dictionary to the global ``table`` and returns 0.
    """
    global main_url, html, counts, updates, table

    if counter == 0:
        main_url, html = url, body
    elif counter == 1:
        counts = body
    elif counter == 2:
        updates = body
    else:
        # Fourth piece: everything needed for one row is now available.
        comments = body
        scraper = Scraper(main_url, html, counts, updates, comments)
        scraper.start()
        row = scraper.get_dictionary()
        table = table.append(row, ignore_index=True)
        return 0
    return counter + 1
Example #25
0
def run():
    """Run the scraper forever, writing each scraped document to its own JSON file.

    Documents and the log file are written into a fixed folder; file names
    are derived from the current UTC time. When the scraper raises, the
    error is logged and scraping restarts after 30 seconds.
    """
    jsonpickle.set_encoder_options('simplejson', indent=4, ensure_ascii=False)

    scraper = Scraper()

    folder = '/media/nico/SAMSUNG/devs/gator/scraping reddit 10-01-2016'

    # The separator keeps the log inside the folder, next to the documents.
    log_file = folder + '/run_scraper-' + str(datetime.datetime.utcnow()) + '.log'

    logging.basicConfig(format=u'%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename=log_file)

    while True:
        try:
            for scraper_document in scraper.scrap():
                filename = folder + '/' + str(datetime.datetime.utcnow()) + '.json'
                encoded = jsonpickle.encode(scraper_document)
                with codecs.open(filename=filename, mode='w', encoding='utf-8') as file_desc:
                    file_desc.write(encoded)

        except Exception as exception:  # pylint: disable=broad-except
            logging.error("The orchestrator crashed! Starting it over ...")
            logging.exception(exception)
            sleep(30)
Example #26
0
 def __init__(self, root):
     """Build the main window: menu, toolbar, main frame and status bar.

     root -- the top-level window; its title and menu are set here and
     all frames are created inside it.
     """
     self.sp = Scraper()
     self.sc = ScreenCapture()
     # Window title and basic state
     root.wm_title("Poker AI")
     self.root = root
     self.icons = []
     self.sp_cnf_region_is_selected = False
     # Menu bar, attached to the root window
     self.menu_bar = self.gui_set_menu()
     self.root.config(menu=self.menu_bar)
     # Root frame: parent of the toolbar, main frame and status bar
     self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)
     # Toolbar, packed at the top of the root frame
     self.toolbar = Tk.Frame(self.frame_root,
                             relief=Gui.RELIEF,
                             bd=Gui.RELIEF_S)
     self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
     self.toolbar_start = self.gui_pack_toolbar_start()
     self.toolbar_sp_cnf = None
     # Main frame, expands to take the remaining space
     self.frame_main = Tk.Frame(self.frame_root)
     self.frame_main.pack(fill=Tk.BOTH, expand=1)
     self.frame_start = self.gui_pack_frame_start(self.frame_main)
     self.frame_sp_cnf = None
     # Status bar: a label whose text follows self.status_var
     self.statusbar = Tk.Frame(self.frame_root,
                               relief=Gui.RELIEF,
                               bd=Gui.RELIEF_S)
     self.statusbar.pack(fill=Tk.BOTH)
     self.status_var = Tk.StringVar()
     self.status_label = Tk.Label(self.statusbar,
                                  textvariable=self.status_var,
                                  anchor=Tk.W)
     self.status_label.pack(fill=Tk.BOTH)
     # Show the assembled root frame and set the initial size
     self.frame_root.pack(fill=Tk.BOTH, expand=1)
     self.resize(300, 200)
Example #27
0
def ng_scraper():
    """Command-line entry point: scrape inmates or check one day for missing inmates.

    ``-d/--day YYYY-MM-DD`` switches to the missing-inmates check for that
    day; ``--verbose`` turns on verbose monitor output. Any exception
    raised while scraping is logged and not propagated.
    """
    parser = argparse.ArgumentParser(description="Scrape inmate data from Cook County Sheriff's site.")
    parser.add_argument('-d', '--day', action='store', dest='start_date', default=None,
                        help=('Specify day to search for missing inmates, format is YYYY-MM-DD. '
                                'If not specified, searches all days.'))
    parser.add_argument('--verbose', action="store_true", dest='verbose', default=False,
                        help='Turn on verbose mode.')

    args = parser.parse_args()

    try:
        monitor = Monitor(log, verbose_debug_mode=args.verbose)
        monitor.debug("%s - Started scraping inmates from Cook County Sheriff's site." % datetime.now())

        scraper = Scraper(monitor)
        if args.start_date:
            scraper.check_for_missing_inmates(datetime.strptime(args.start_date, '%Y-%m-%d').date())
        else:
            scraper.run()

        monitor.debug("%s - Finished scraping inmates from Cook County Sheriff's site." % datetime.now())
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        log.exception(e)
Example #28
0
 def test_happy_path(self):
     """
        Asserts the endpoint returns a dictionary as json
     """
     from unittest.mock import patch

     review_mock_data = {"author": "test"}
     sent = {
         "url":
         "https://www.grubhub.com/restaurant/hashbrowns-on-wells-1155-n-wells-st-chicago/287727"
     }
     # patch.object restores the real methods afterwards, so the stubs
     # cannot leak into other tests.
     with patch.object(Scraper, 'get_scraper',
                       MagicMock(return_value=Scraper([]))), \
             patch.object(Scraper, 'get_reviews',
                          MagicMock(return_value=[review_mock_data])), \
             APP.test_client() as client:
         result = client.post('/scrape_reviews', data=sent)
         self.assertEqual(result.status_code, 200)
Example #29
0
def start_scraping():
    """Read ``config.json``, open a database session and run the scraping manager.

    The session is closed when scraping finishes or fails.
    """
    config_reader = JsonConfigReader()
    engine = create_engine(Config.DB_URI)

    Base = declarative_base()
    Base.metadata.bind = engine

    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    try:
        config = config_reader.read('config.json')
        scraper = Scraper(config)

        scraping_manager = ScrapingManager(scraper, session)
        scraping_manager.start_scraping()
    finally:
        # Release the session's database resources on every exit path.
        session.close()
Example #30
0
def main():
    """Scrape the sofifa player listing with several scrapers and save it as CSV.

    Builds one listing URL per offset (0 to 18000 in steps of 60), splits
    the URLs into chunks of ``pages`` URLs, gives each chunk to its own
    ``Scraper`` and runs them through ``MultiThreading``. The collected
    ``Scraper.players_scraped`` is then written with ``save_data``.
    """
    logging.config.fileConfig("logging.conf")
    logger = logging.getLogger("sLogger")

    ua = UserAgent()

    # Column code -> position of that column in the showCol query parameter.
    columns = {
        "ae": "0",
        "oa": "1",
        "pt": "2",
        "vl": "3",
        "wg": "4",
        "pf": "5",
        "hi": "6",
        "wi": "7",
        "bp": "8",
        "pac": "9",
        "pas": "10",
        "sho": "11",
        "phy": "12",
        "dri": "13",
        "def": "14",
    }

    query = "&".join(
        f"showCol%5B{position}%5D={column}"
        for column, position in columns.items())
    url = f"https://sofifa.com/players?{query}&offset="
    urls = [url + str(offset) for offset in range(0, 18060, 60)]

    # Parameters
    pages = 10
    # Enough scrapers to cover every URL (ceiling division).
    number_of_scraper = (len(urls) + pages - 1) // pages

    scrapers = [
        Scraper(urls[pages * i:pages * (i + 1)], ua.random)
        for i in range(number_of_scraper)
    ]

    logger.info("Scraping started...")
    multi_threading = MultiThreading(scrapers)
    multi_threading.run()
    logger.info("Scraping finished.")

    logger.info("Generating CSV file...")
    save_data(Scraper.players_scraped)
    logger.info("CSV File is generated.")
Example #31
0
def step_impl(context):
    """Populate the environment for the scenario and attach a Scraper to the context."""
    # Notifier settings
    os.environ.update({
        'NOTIFIER_ENABLED': '1',
        'NOTIFIER_TOKEN': '1958942562:AAGKDfy2S7vcXj3cFe0I1-0Hevq8ayM-9U0',
        'NOTIFIER_MESSAGE': 'This is a test notification',
        'NOTIFIER_CHAT_ID': '-545020496',
        'NOTIFIER_LAPSE': '10',
        'NOTIFIER_MAX_RETRY': '5',
    })
    # Storage and error handling
    os.environ.update({
        'DATABASE_STORE': 'localsqlite',
        'LOCAL_SQLITE_FILE': 'test.db',
        'ERROR_HANDLER': 'stdout',
    })
    # The single provider used by the scenario
    os.environ.update({
        'PROVIDER1_NAME': 'argenprop',
        'PROVIDER1_ENABLED': '1',
        'PROVIDER1_BASE_URL': 'https://www.argenprop.com',
        'PROVIDER1_S1': '/departamento-alquiler-barrio-palermo-2-dormitorios-5-o-más-ambientes',
    })
    context.scraper = Scraper(Config("bogus.env"))
Example #32
0
def scrape_other_sites():
	'''
	Collect e-mail addresses from a fixed list of gift shops and save
	every result to the database. A shop whose page cannot be processed
	is reported on stdout and skipped.
	'''
	shops = [
		{'name': 'mindblower', 'url': 'https://mindblower.ro/'},
		{'name': 'MrGift', 'url': 'http://www.mrgift.ro/'},
		{'name': 'Zaragoo', 'url': 'http://www.zaragoo.ro/'},
		{'name': 'TheGift', 'url': 'http://www.thegift.ro/'},
		{'name': 'GiftsBoutique', 'url': 'https://www.giftsboutique.ro/'},
		{'name': 'Tu.Ro', 'url': 'https://www.tu.ro/cadouri.html'},
		{'name': 'Smuff', 'url': 'https://www.smuff.ro/cadouri/cadouri-traznite'},
		{'name': 'Cadouri de decoratiuni', 'url': 'http://www.cadouridecoratiuni.ro/'},
		{'name': 'BlueGifts', 'url': 'https://bluegifts.ro/'},
	]

	for shop in shops:
		try:
			shop_data = sp.get_emails_from_page(page=shop)
		except Exception:
			# Best effort: report the failure and move on to the next shop.
			print('EXCEPT')
			continue
		save_data_to_db(shop_data)
Example #33
0
def get_reviews():
    """
        Accepts a url as form data and attempts to return review data as json

        Responds with status 400 when the url is missing or is not a
        www.grubhub.com restaurant page, and with status 500 when the
        scraper raises.
    """
    def error_response(status, message):
        # Every failure shares the same JSON envelope.
        data = {"success": False, "message": message}
        return APP.response_class(status=status,
                                  mimetype='application/json',
                                  response=json.dumps(data))

    # .get() instead of ['url']: indexing raises on a missing field, so the
    # explicit "must provide a url" answer below could never be produced.
    url = request.form.get('url')
    if not url:
        return error_response(400, "Must provide a url to scrape reviews.")

    parsed_url = urlparse(url)
    main_page_expr = re.compile(r'\/(restaurant)\/(.*)\/([0-9]*)')
    is_grubhub = parsed_url.netloc == "www.grubhub.com"
    if not is_grubhub or main_page_expr.match(parsed_url.path) is None:
        return error_response(400, "Must be a grubhub restaurant page.")

    try:
        scraper = Scraper.get_scraper(url)
        return jsonify(scraper.get_reviews())
    except Exception as exception:
        return error_response(
            500,
            f"An unexpected error occured while scraping reviews: {exception}")
Example #34
0
def scrape_google_search_results(filename='google_results.json'):
	'''
	Scrape e-mail data for the search results stored in a JSON file.

	filename -- JSON file holding a list of queries, each with a
	            'results' list whose items carry 'domain', 'link'
	            and 'link_type'.

	Results whose domain already has a User row, or whose link_type
	is 'ads_main', are not scraped.
	'''
	import json
	# Read everything first so the file is not kept open while scraping.
	with open(filename, 'r') as f:
		data = json.load(f)
	for query in data:
		for result in query['results']:
			try:
				user = User.query.filter_by(name=result['domain']).first()
			except:
				# Re-raised right away: the bare except only makes sure the
				# session is rolled back whatever went wrong.
				db.session.rollback()
				raise
			if user:
				print('Skip this, already in the db')
				continue
			if result['link_type'] == 'ads_main':
				continue
			try:
				page_info = {'name': result['domain'], 'url': result['link']}
				shop_data = sp.get_emails_from_page(page=page_info, fn=save_data_to_db)
				if shop_data['emails'] is not None:
					save_data_to_db(shop_data)
			except Exception:
				# Best effort per result; Ctrl-C is no longer swallowed.
				print('Error')
Example #35
0
def ng_scraper():
    """Command-line entry point of the inmate scraper.

    Options:
      -d / --day YYYY-MM-DD  check that single day for missing inmates;
                             without it a normal run for yesterday is done.
      --verbose              passed to Monitor as verbose_debug_mode.

    Any exception raised while scraping is logged with ``log.exception``
    and not re-raised.
    """
    parser = argparse.ArgumentParser(
        description="Scrape inmate data from Cook County Sheriff's site.")
    parser.add_argument(
        '-d',
        '--day',
        action='store',
        dest='start_date',
        default=None,
        help=(
            'Specify day to search for missing inmates, format is YYYY-MM-DD. '
            'If not specified, searches all days.'))
    parser.add_argument('--verbose',
                        action="store_true",
                        dest='verbose',
                        default=False,
                        help='Turn on verbose mode.')

    args = parser.parse_args()

    try:
        monitor = Monitor(log, verbose_debug_mode=args.verbose)
        monitor.debug(
            "%s - Started scraping inmates from Cook County Sheriff's site." %
            datetime.now())

        scraper = Scraper(monitor)
        if args.start_date:
            scraper.check_for_missing_inmates(
                datetime.strptime(args.start_date, '%Y-%m-%d').date())
        else:
            scraper.run(date.today() - timedelta(1), feature_controls())

        monitor.debug(
            "%s - Finished scraping inmates from Cook County Sheriff's site." %
            datetime.now())
    # "except Exception, e" only parses on Python 2; the "as" form is
    # accepted by Python 2.6+ and Python 3 alike.
    except Exception as e:
        log.exception(e)
Example #36
0
class AutoPScraperTest(TestCase):
    '''
    Tests of the Scraper configured for car adverts (AdvertOptions.CARS).
    '''
    def setUp(self):
        self.scraper = Scraper()
        self.scraper.set_autop(AdvertOptions.CARS)

    def test_car_scraper_selection(self):
        '''
        Test if correct scraper selected by provided option
        '''
        # assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(AutoPScraper, type(self.scraper.type()))

    def test_particular_car_advert_scrape(self):
        '''
        Tests a particular car advertisement scrape
        '''
        self.scraper.set_autop(AdvertOptions.CARS)
        # TODO FIX IT USE PYTHONPATH
        url = 'file:///home/apoluden/Programming/workspace/reseller/scraper/tests/bmw_advertisement.html'
        scraped_advert = self.scraper.scrape_particular_advert(None, path=url)
        vehicle = scraped_advert['vehicle']
        advert = scraped_advert['advert']
        seller = scraped_advert['seller']
        self.assertEqual('+37069157207', seller['number'])
        self.assertEqual('5004458', advert['uid'])
        self.assertEqual('Panevėžys,Lietuva', advert['location'])
        self.assertEqual('10 900 €', advert['price'])
        self.assertEqual('BMW 520, 2.0 l., universalas', advert['name'])

    def test_bad_webpage_url_or_path(self):
        '''
        Tests wrong URL or path
        '''
        wrong_path = 'file://wrong/path'
        wrong_url = 'http://wrong.url'
        scraper = AutoPCarScraper()
        self.assertIsNone(scraper.page_content(wrong_url))
        self.assertIsNone(scraper.page_content(None, wrong_path))
Example #37
0
 def handle(self, *args, **options):
     """Create a Scraper and run its fetch_data once."""
     Scraper().fetch_data()
Example #38
0
 def test_no_dupes(self):
     """Storing INCIDENTS[0] twice must leave exactly one Incident."""
     scraper = Scraper()
     for _ in range(2):
         scraper.store_data(INCIDENTS[0])
         self.assertEqual(1, len(Incident.objects.all()))
Example #39
0
 def test_open_close(self):
     """The first Incident has no end after INCIDENTS[0] and one after INCIDENTS[1]."""
     scraper = Scraper()

     scraper.store_data(INCIDENTS[0])
     stored = Incident.objects.all()[0]
     self.assertIsNone(stored.end)

     scraper.store_data(INCIDENTS[1])
     stored = Incident.objects.all()[0]
     self.assertIsNotNone(stored.end)
Example #40
0
 def test_dupe_vehic(self):
     """After INCIDENTS[0] and again after INCIDENTS[2] there is one Vehicle."""
     scraper = Scraper()
     for index in (0, 2):
         scraper.store_data(INCIDENTS[index])
         self.assertEqual(1, len(Vehicle.objects.all()))
Example #41
0
File: bot.py Project: SliinQ/404Bot
from scraper.scraper import Scraper
from scraper import domains

if __name__ == '__main__':
    # Register the domain, with its own root URL as its first page.
    start_url = 'http://jkjas.com/Magazine'
    domain = domains.Domain(start_url)
    domain.add_page(start_url)
    domains.add_domain(domain)

    # Start the scraper.
    Scraper().run()
Example #42
0
class Gui ( object ):
	path_screenshots = 'scraper/screenshots'
	path_config_files = 'scraper/config_files'
	CONFIG_FILE_EXT = 'scp'
	RELIEF = Tk.RAISED
	RELIEF_S = 2
	GRID_V = Tk.N+Tk.S
	GRID_H = Tk.E+Tk.W
	GRID_BOTH = Tk.N+Tk.S+Tk.E+Tk.W
	
	def __init__ ( self, root ):
		'''
		Build the main window inside root: menu bar, toolbar, start
		screen and status bar, then resize the window to 300x200.

		root -- the Tk root window; its title is set to "Poker AI".
		'''
		# Scraper / screen capture objects used by the sp_* actions
		self.sp = Scraper()
		self.sc = ScreenCapture()
		# Init
		root.wm_title("Poker AI")
		self.root = root
		# gui_pack_btn appends every PhotoImage it creates to this list
		# (presumably to keep the images referenced -- confirm)
		self.icons = []
		self.sp_cnf_region_is_selected = False
		# Menu
		self.menu_bar = self.gui_set_menu()
		self.root.config(menu=self.menu_bar)
		self.frame_root = Tk.Frame(root, relief=Tk.GROOVE, bd=Gui.RELIEF_S)
		# Toolbar
		self.toolbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
		self.toolbar.pack(fill=Tk.BOTH, side=Tk.TOP)
		self.toolbar_start = self.gui_pack_toolbar_start()
		# created on first use by gui_switch_to_sp_cnf
		self.toolbar_sp_cnf = None
		# Main Frame
		self.frame_main = Tk.Frame(self.frame_root)
		self.frame_main.pack(fill=Tk.BOTH,expand=1)
		self.frame_start = self.gui_pack_frame_start(self.frame_main)
		# created on first use by gui_switch_to_sp_cnf
		self.frame_sp_cnf = None
		# Status Bar
		self.statusbar = Tk.Frame(self.frame_root, relief=Gui.RELIEF, bd=Gui.RELIEF_S)
		self.statusbar.pack(fill=Tk.BOTH)
		self.status_var = Tk.StringVar()
		self.status_label = Tk.Label(self.statusbar, textvariable=self.status_var, anchor=Tk.W)
		self.status_label.pack(fill=Tk.BOTH)
		# Show
		self.frame_root.pack(fill=Tk.BOTH,expand=1)
		self.resize(300,200)
		#

	# ====== GENERAL METHODS ====== #
	def resize ( self, w, h ):
		'''Give the root window the size w x h, centred on the screen.'''
		screen_w = self.root.winfo_screenwidth()
		screen_h = self.root.winfo_screenheight()
		left = screen_w/2 - w/2
		top = screen_h/2 - h/2
		geometry = "%dx%d+%d+%d" % (w, h, left, top)
		self.root.geometry(geometry)

	def set_status ( self, text='' ):
		'''Put text into the status bar variable ('' by default).'''
		status = self.status_var
		status.set(text)
		
	# ====== GUI COMPONENTS ====== #
	# --- menu --- #
	def gui_set_menu ( self ):
		'''Create the menu bar ("Scraper" and "Edit" cascades) and return it.'''
		bar = Tk.Menu(self.root)
		# "Scraper" cascade: configuration file commands, then Exit
		scraper_menu = Tk.Menu(bar, tearoff=0)
		file_entries = (
			('New', self.sp_cnf_new, 'Ctrl-n'),
			('Open', self.sp_cnf_open, 'Ctrl-o'),
			('Save', self.sp_cnf_save, 'Ctrl-s'),
			('Save as...', self.sp_cnf_saveas, 'Ctrl-Shift-s'),
		)
		for label, handler, shortcut in file_entries:
			self.gui_add_menu_cmd(scraper_menu, label, handler, shortcut)
		scraper_menu.add_separator()
		self.gui_add_menu_cmd(scraper_menu, 'Exit', self.root.quit)
		bar.add_cascade(label="Scraper", menu=scraper_menu)
		# "Edit" cascade: two entries without commands
		edit_menu = Tk.Menu(bar, tearoff=0)
		edit_menu.add_command(label="LOL")
		edit_menu.add_separator()
		edit_menu.add_command(label="WTF")
		bar.add_cascade(label="Edit", menu=edit_menu)
		# application-wide key binding for "New"
		self.root.bind_all("<Command-n>", self.sp_cnf_new)
		return bar

	def gui_add_menu_cmd ( self, parent, label, command, key=None ):
		'''
		Add a command entry to a menu and bind its keyboard shortcut.

		parent  -- menu receiving the entry
		label   -- text of the entry
		command -- callable run by the entry and by the shortcut
		key     -- optional accelerator text such as 'Ctrl-n' or
		           'Ctrl-Shift-s'; 'Ctrl' is shown as 'Cmd' on macOS
		'''
		if not key:
			# No shortcut: nothing to display, and nothing to bind
			# (bind_all used to be called with key=None).
			parent.add_command(label=label, command=command)
			return
		if platform.system() == "Darwin":
			key = key.replace('Ctrl', 'Cmd')
		parent.add_command(label=label, command=command, accelerator=key)
		# The accelerator is display text only. Tk event patterns look like
		# <Control-n>; a plain string such as 'Ctrl-n' is read as the
		# literal key presses C, t, r, l, -, n.
		tk_names = {'Ctrl':'Control', 'Cmd':'Command'}
		parts = key.split('-')
		modifiers = [ tk_names.get(part, part) for part in parts[:-1] ]
		detail = parts[-1]
		if 'Shift' in modifiers and len(detail) == 1:
			# with Shift held the keysym is the upper-case letter
			detail = detail.upper()
		self.root.bind_all('<%s>' % '-'.join(modifiers + [detail]), command)
		
		
	# --- toolbar --- #
	def gui_pack_toolbar_start ( self ):
		'''
		Build the start-screen toolbar (screenshot / new / open) and
		return its frame. "new" is enabled only when the screenshots
		folder holds a file, "open" only when the config folder does.
		'''
		def state_for ( folder ):
			# NORMAL when the folder contains at least one regular file
			found = [ name for name in os.listdir(folder)
					  if os.path.isfile(os.path.join(folder,name)) ]
			if found:
				return Tk.NORMAL
			return Tk.DISABLED
		#
		new_state = state_for(Gui.path_screenshots)
		open_state = state_for(Gui.path_config_files)
		buttons = [
			{'icon':'screencap', 'command':self.sp_do_screencap,
			 'text':'Take a screenshot'},
			{'icon':'new', 'command':self.sp_cnf_new, 'state':new_state,
			 'text':'New config file (needs 1 screenshot)'},
			{'icon':'open', 'command':self.sp_cnf_open, 'state':open_state,
			 'text':'Open a config file...'},
			]
		return self.gui_pack_tools_btns(buttons)
		
	def gui_pack_toolbar_sp_cnf ( self ):
		'''Build the configuration-screen toolbar and return its frame.'''
		save_btn = {'icon':'save', 'command':self.sp_cnf_save,
					'text':'Save configuration file'}
		load_btn = {'icon':'load_screenshot', 'command':self.sp_open_img,
					'text':'Load a screenshot'}
		# packed on the right-hand side of the toolbar
		back_btn = {'icon':'lol', 'command':self.sp_cnf_return_start, 'side':Tk.RIGHT,
					'text':'Return to start screen'}
		return self.gui_pack_tools_btns([save_btn, load_btn, back_btn])

	def gui_pack_tools_btns ( self, btns, parent=None ):
		'''
		Pack a row of buttons into a new frame and return that frame.

		btns   -- list of dicts, each passed to gui_pack_btn as keyword
		          arguments
		parent -- widget receiving the frame; self.toolbar when omitted

		The created buttons are reachable by icon name through the
		frame's "btns" dict.
		'''
		container = Tk.Frame(self.toolbar if parent is None else parent)
		container.pack(fill=Tk.BOTH)
		container.btns = {}
		for spec in btns:
			button = self.gui_pack_btn(container, **spec)
			container.btns[spec['icon']] = button
		return container

	def gui_pack_btn ( self, frame, icon, command,
					   side=Tk.LEFT, state=Tk.NORMAL, space=2, relief=Tk.FLAT, text='' ):
		'''
		Create an image button (a Canvas showing an icon), pack it into
		frame and return the canvas.

		icon    -- base name of 'gui_icons/<icon>.png'; optional
		           '<icon>_active.png' and '<icon>_disabled.png' are used
		           for the active / disabled states when they can be loaded
		command -- callback bound to <ButtonRelease-1> (receives the event)
		side    -- pack side
		state   -- initial canvas state
		space   -- border width, also the offset of the image
		relief  -- canvas relief
		text    -- status bar text shown while the pointer is over the button
		'''
		def load_variant ( suffix ):
			# Variant artwork is optional: fall back to the main image when
			# it cannot be loaded. "except Exception" instead of a bare
			# except, so KeyboardInterrupt / SystemExit are not swallowed.
			try:
				variant = ImageTk.PhotoImage(Image.open('gui_icons/'+icon+suffix+'.png'))
			except Exception:
				return img
			self.icons.append(variant)
			return variant
		#
		img = ImageTk.PhotoImage(Image.open('gui_icons/'+icon+'.png'))
		self.icons.append(img)
		img_active = load_variant('_active')
		img_disabled = load_variant('_disabled')
		#
		w = img.width()
		h = img.height()
		d = space
		#
		cnv = Tk.Canvas(frame, width=w, height=h, state=state, highlightthickness=0, relief=relief, bd=d)
		cnv.create_image(d, d, anchor='nw',image=img, activeimage=img_active, disabledimage=img_disabled)
		cnv.bind('<ButtonRelease-1>', command)
		cnv.bind('<Enter>', lambda ev: self.set_status(text))
		cnv.bind('<Leave>', lambda ev: self.set_status(''))
		cnv.pack(side=side)
		return cnv


	# --- start screen --- #
	def gui_pack_frame_start ( self, frame ):
		'''Build the start screen (logo on a canvas) inside frame and return it.'''
		start = Tk.Frame(frame,relief=Gui.RELIEF,bd=Gui.RELIEF_S)
		canvas = Tk.Canvas(start,width=160,height=128)
		canvas.pack()
		# the image is also stored on self
		logo_img = ImageTk.PhotoImage(Image.open('gui_icons/logo.png'))
		self.canvas_start_img = logo_img
		canvas.create_image(0, 0, image=logo_img)
		canvas.config(scrollregion=canvas.bbox(Tk.ALL))
		start.pack(fill=Tk.BOTH,expand=1)
		return start

	# --- scraper configuration screen --- #
	def gui_pack_frame_sp_cnf ( self, parent ):
		'''
		Build the scraper configuration screen inside parent and return
		its frame: a scrollable screenshot canvas in column 0 and, in
		column 1, the preview, the tab bar and the current tab content.
		Also resizes the window to 900x600.
		'''
		# resize
		self.resize(900, 600)
		frame = Tk.Frame(parent,relief=Gui.RELIEF,bd=Gui.RELIEF_S)
		frame.pack(fill=Tk.BOTH,expand=1)
		frame.grid_rowconfigure(0, weight=1)
		#
		# screen shot frame
		(frame_sc, canvas_sc) = self.gui_canvas_and_scroll(frame)
		# kept on self: sp_show_image and the mouse handlers draw on it
		self.canvas_sc = canvas_sc
		canvas_sc.configure(cursor='cross')
		frame_sc.grid(row=0, column=0, sticky=Gui.GRID_BOTH)
		frame.grid_columnconfigure(0, weight=1)
		frame.configure(relief=Gui.RELIEF,bd=Gui.RELIEF_S)
		#
		# controls frame
		frame_ctr = Tk.Frame(frame)
		frame_ctr.grid(row=0, column=1, sticky=Gui.GRID_BOTH)
		# - preview and controls
		frame_preview = Tk.Frame(frame_ctr)
		frame_preview.grid(row=0)
		self.gui_pack_preview(frame_preview)
		# - tabs bar
		frame_tabs = Tk.Frame(frame_ctr,relief=Tk.GROOVE,bd=2)
		frame_tabs.grid(row=1, sticky=Gui.GRID_BOTH)
		self.sp_cnf_tabs = self.gui_pack_tools_btns([
			{'icon':'target', 'command':self.sp_cnf_switch_tab_marker, 'relief':Tk.SUNKEN,
			 'text':'Markers'},
			{'icon':'card', 'command':self.do_nothing, 'relief':Tk.RAISED,
			 'text':'Cards templates'},
			{'icon':'text', 'command':self.sp_cnf_switch_tab_ocr, 'relief':Tk.RAISED,
			 'text':'OCR'},
			], frame_tabs)
		# - tab content
		frame_tab = Tk.Frame(frame_ctr)
		self.sp_cnf_frame_tab_content = frame_tab
		# sets self.sp_cnf_tab_marker, which is read two lines below
		self.gui_pack_tab_marker(frame_tab)
		# the OCR tab is built on first use by sp_cnf_switch_tab_ocr
		self.sp_cnf_tab_ocr = None
		self.sp_cnf_tab_current = self.sp_cnf_tab_marker
		self.sp_cnf_tab_icon_current = 'target'
		frame_tab.grid(row=2)
		# - place holder
		Tk.Label(frame_ctr, text=':)').grid(row=3)
		frame_ctr.grid_rowconfigure(3, weight=1)
		#
		return frame

	def gui_canvas_and_scroll ( self, parent ):
		'''
		Create a frame holding a canvas with horizontal and vertical
		scrollbars and return (frame, canvas). The frame is not placed
		in parent here; the caller does that.
		'''
		holder = Tk.Frame(parent)
		h_bar = Tk.Scrollbar(holder, orient=Tk.HORIZONTAL)
		v_bar = Tk.Scrollbar(holder)
		canvas = Tk.Canvas(holder, xscrollcommand=h_bar.set, yscrollcommand=v_bar.set)
		#
		def on_wheel ( ev ):
			# state 0: vertical scroll, anything else: horizontal scroll
			view = canvas.yview if ev.state==0 else canvas.xview
			view("scroll", ev.delta,"units")
		#
		h_bar.config(command=canvas.xview)
		v_bar.config(command=canvas.yview)
		canvas.bind("<MouseWheel>", on_wheel)
		# the canvas cell takes all extra space
		holder.grid_rowconfigure(0, weight=1)
		holder.grid_columnconfigure(0, weight=1)
		canvas.grid(row=0, column=0, sticky=Gui.GRID_BOTH)
		h_bar.grid(row=1, column=0, sticky=Gui.GRID_H)
		v_bar.grid(row=0, column=1, sticky=Gui.GRID_V)
		return (holder, canvas)

	def gui_pack_preview ( self, frame_preview ):
		'''
		Build the preview area inside frame_preview: zoom buttons, a pair
		of arrow buttons on each side calling sp_prev_move, and the
		128x128 preview canvas.
		'''
		# zoom tools
		tools_frame = Tk.Frame(frame_preview)
		self.sp_cnf_preview_toolbar = self.gui_pack_tools_btns([
			{'icon':'zoom_plus', 'command':self.sp_prev_zoom_plus},
			{'icon':'zoom_minus', 'command':self.sp_prev_zoom_minus},
			], tools_frame)
		tools_frame.grid(row=1, columnspan=2, sticky=Gui.GRID_BOTH)
		self.canvas_sp_cnf_prev_img_zoom = 2
		# arrow buttons: (edge, grid row, grid column, pack side, (icon, step) pairs)
		edges = (
			('N', 1, 1, Tk.BOTTOM, (('down', -1), ('up', 1))),
			('W', 2, 0, Tk.RIGHT, (('right', -1), ('left', 1))),
			('E', 2, 2, Tk.LEFT, (('left', -1), ('right', 1))),
			('S', 3, 1, Tk.TOP, (('up', -1), ('down', 1))),
			)
		for edge, row, column, side, arrows in edges:
			holder = Tk.Frame(frame_preview)
			for icon, step in arrows:
				# edge/step are bound as defaults so each button keeps its own values
				self.gui_pack_btn(holder, icon,
								  lambda e, edge=edge, step=step: self.sp_prev_move(e,edge,step),
								  side=side, space=0)
			holder.grid(row=row, column=column)
		# preview canvas
		(canvas_frame, canvas) = self.gui_canvas_and_scroll(frame_preview)
		self.canvas_sp_cnf_prev = canvas
		canvas.configure(width=128, height=128, bg='gray')
		canvas_frame.grid(row=2, column=1, sticky=Tk.N)

	def gui_pack_tab_marker ( self, frame_tab ):
		'''
		Build the "Markers" tab inside frame_tab: a combobox filled with
		self.spCnf.get_list_markers() plus the add, rename and locate
		buttons (all created disabled; rename and locate start hidden).
		'''
		tab = Tk.Frame(frame_tab)
		tab.pack()
		self.sp_cnf_tab_marker = tab
		self.sp_cnf_marker_selected = None
		# marker names combobox
		names = self.spCnf.get_list_markers()
		self.sp_cnf_marker_name = Tk.StringVar()
		if names:
			self.sp_cnf_marker_name.set(names[0])
		self.sp_cnf_next_marker_name = 'marker'+str(len(names)+1)
		self.combobox_markers = combo = ttk.Combobox(tab, values=names,
													 textvariable=self.sp_cnf_marker_name)
		combo.bind('<<ComboboxSelected>>', self.sp_cnf_switch_marker)
		# the check is scheduled 1 ms after the key event
		combo.bind('<Key>', lambda ev: self.root.after(1,self.sp_cnf_editing_marker_name))
		combo.grid(row=0,column=0)
		# command buttons
		commands = Tk.Frame(tab)
		add_edit = Tk.Frame(commands)
		self.sp_cnf_btn_add_marker = self.gui_pack_btn(add_edit, 'plus', self.sp_cnf_add_marker,
													   state=Tk.DISABLED)
		self.sp_cnf_btn_edit_marker = self.gui_pack_btn(add_edit, 'ok', self.sp_cnf_rename_marker,
														state=Tk.DISABLED)
		self.sp_cnf_btn_edit_marker.pack_forget()
		add_edit.pack(side=Tk.LEFT)
		locate = Tk.Frame(commands)
		self.sp_cnf_btn_locate = self.gui_pack_btn(locate, 'target', self.sp_cnf_locate_marker,
												   state=Tk.DISABLED)
		self.sp_cnf_btn_locate.pack_forget()
		locate.pack(side=Tk.LEFT)
		commands.grid(row=1,column=0)
		# when markers already exist, schedule a switch to the one shown
		if names:
			self.root.after(1, self.sp_cnf_switch_marker)
			
	def gui_pack_tab_ocr ( self, frame_tab ):
		'''Build the "OCR" tab inside frame_tab: one button running sp_cnf_do_ocr.'''
		tab = Tk.Frame(frame_tab)
		tab.pack()
		self.sp_cnf_tab_ocr = tab
		self.gui_pack_btn(tab, 'work', self.sp_cnf_do_ocr, side=Tk.TOP)
		
	# --- switch between screens or tabs --- #
	def gui_switch_to_sp_cnf ( self ):
		'''Hide the start screen and show the configuration screen, building it on first use.'''
		self.toolbar_start.pack_forget()
		self.frame_start.pack_forget()
		#
		if self.toolbar_sp_cnf is not None:
			# already built: just show it again
			self.resize(900, 600)
			self.toolbar_sp_cnf.pack(fill=Tk.BOTH)
			self.frame_sp_cnf.pack(fill=Tk.BOTH,expand=1)
			return
		self.toolbar_sp_cnf = self.gui_pack_toolbar_sp_cnf()
		self.frame_sp_cnf = self.gui_pack_frame_sp_cnf(self.frame_main)

	def sp_cnf_return_start ( self, ev=None ):
		'''Hide the configuration screen and show the start screen at 300x200.'''
		for widget in (self.toolbar_sp_cnf, self.frame_sp_cnf):
			widget.pack_forget()
		self.resize(300, 200)
		self.toolbar_start.pack(fill=Tk.BOTH)
		self.frame_start.pack(fill=Tk.BOTH,expand=1)

	def sp_cnf_switch_tab_marker ( self, ev=None ):
		'''Show the "Markers" tab and mark its tab button as the sunken one.'''
		self.sp_cnf_tab_current.pack_forget()
		marker_tab = self.sp_cnf_tab_marker
		marker_tab.pack()
		self.sp_cnf_tab_current = marker_tab
		# raise the previously selected tab button, sink this one
		tab_buttons = self.sp_cnf_tabs.btns
		tab_buttons[self.sp_cnf_tab_icon_current].configure(relief=Tk.RAISED)
		tab_buttons['target'].configure(relief=Tk.SUNKEN)
		self.sp_cnf_tab_icon_current = 'target'

	def sp_cnf_switch_tab_ocr ( self, ev=None ):
		'''Show the "OCR" tab (built on first use) and mark its tab button as the sunken one.'''
		self.sp_cnf_tab_current.pack_forget()
		if not self.sp_cnf_tab_ocr:
			# first use: gui_pack_tab_ocr creates, packs and stores the tab
			self.gui_pack_tab_ocr(self.sp_cnf_frame_tab_content)
		else:
			self.sp_cnf_tab_ocr.pack()
		self.sp_cnf_tab_current = self.sp_cnf_tab_ocr
		# raise the previously selected tab button, sink this one
		tab_buttons = self.sp_cnf_tabs.btns
		tab_buttons[self.sp_cnf_tab_icon_current].configure(relief=Tk.RAISED)
		tab_buttons['text'].configure(relief=Tk.SUNKEN)
		self.sp_cnf_tab_icon_current = 'text'
			
	# ====== GUI ACTION ====== #
	# --- button commands --- #
	def sp_cnf_new ( self, ev=None ):
		'''
		Start a new scraper configuration on the first '.tif' screenshot
		found in Gui.path_screenshots.

		When the folder holds no '.tif' file a status message is shown
		and nothing else changes.
		'''
		screenshots = [ f for f in os.listdir(Gui.path_screenshots)
						if os.path.splitext(f)[1] == '.tif' ]
		if not screenshots:
			# The toolbar enables "new" for any file in the folder, so this
			# can happen; it used to raise IndexError after the screen had
			# already been switched.
			self.set_status('No .tif screenshot found: take a screenshot first')
			return
		self.spCnf = ScraperConfig()
		self.gui_switch_to_sp_cnf()
		self.sp_show_image(os.path.join(Gui.path_screenshots,screenshots[0]))
		
	def sp_cnf_open ( self, ev=None ):
		'''Ask for a configuration file; if one is chosen, load it and show its image.'''
		chosen = tkFileDialog.askopenfilename(parent=self.root,
											  title='Open config file',
											  initialdir=Gui.path_config_files,
											  defaultextension=Gui.CONFIG_FILE_EXT
											  )
		if not chosen:
			# nothing chosen
			return
		self.spCnf = ScraperConfig.load(chosen)
		self.gui_switch_to_sp_cnf()
		self.sp_show_image(self.spCnf.imagename)

	def sp_cnf_save ( self, ev=None ):
		'''Save the configuration; fall back to "save as" when it has no filename.'''
		if self.spCnf.filename:
			self.spCnf.save()
			return
		self.sp_cnf_saveas(ev)

	def sp_cnf_saveas ( self, ev=None ):
		'''Ask for a target file and, if one is chosen, save the configuration to it.'''
		target = tkFileDialog.asksaveasfilename(parent=self.root,
												title='Save config file as',
												initialdir=Gui.path_config_files,
												defaultextension=Gui.CONFIG_FILE_EXT
												)
		if not target:
			return
		self.spCnf.save(target)
	
	def sp_do_screencap ( self, ev=None ):
		'''
		Capture the screen, write it to
		'<path_screenshots>/screenshot_<n>.tif' (n = number of files
		already in the folder + 1) and enable the "new" toolbar button.
		'''
		# The unused Image.fromstring / PhotoImage conversion was removed:
		# its result was never read.
		# NOTE(review): Image.fromstring is believed to raise on current
		# Pillow releases (replaced by frombytes) -- confirm.
		a = self.sc.capture()
		#
		screenshots = [ f for f in os.listdir(Gui.path_screenshots)
						if os.path.isfile(os.path.join(Gui.path_screenshots,f)) ]
		cv2.imwrite('%s/screenshot_%d.tif' % (Gui.path_screenshots, (len(screenshots)+1)), a)
		# enable new button
		self.toolbar_start.btns['new'].configure(state=Tk.NORMAL)

	def sp_open_img ( self, ev=None ):
		'''
		Ask for a '.tif' screenshot and show it.

		Nothing happens when no file is chosen; a file that cannot be
		shown is reported in the status bar.
		'''
		f = tkFileDialog.askopenfilename(parent=self.root,
										title='Open a screenshot',
										initialdir=Gui.path_screenshots,
										filetypes=[("Screenshots", "*.tif")]
										)
		if not f:
			# Without this guard the empty name reached sp_show_image, which
			# passes it to spCnf.set_image_file before the open fails.
			return
		try:
			self.sp_show_image(f)
		except Exception as exc:
			# still best effort, but no longer silent
			self.set_status('Could not open screenshot: %s' % exc)
		
	def sp_show_image ( self, f ):
		'''
		Make file f the image of the current configuration, draw it on
		the screenshot canvas and bind the mouse handlers used to select
		a rectangle.
		'''
		self.spCnf.set_image_file(f)
		photo = ImageTk.PhotoImage(Image.open(f))
		# stored on self; sp_preview copies regions from it
		self.canvas_sc_img = photo
		canvas = self.canvas_sc
		canvas.create_image(0, 0, image=photo, anchor="nw")
		canvas.config(scrollregion=canvas.bbox(Tk.ALL))
		# rectangle selection with the left mouse button
		handlers = (
			('<ButtonPress-1>', self.on_button_press),
			('<ButtonRelease-1>', self.on_button_release),
			('<B1-Motion>', self.on_button_motion),
			)
		for sequence, handler in handlers:
			canvas.bind(sequence, handler)

	# --- mouse event --- #
	def on_button_press ( self, event ):
		'''Start a new selection rectangle at the pressed canvas position.'''
		canvas = self.canvas_sc
		# canvas coordinates of the click, clamped to >= 0
		start_x = max(0, canvas.canvasx(event.x))
		start_y = max(0, canvas.canvasy(event.y))
		self.click_x0 = start_x
		self.click_y0 = start_y
		# replace the previous rectangle, if any, by an empty red one
		if hasattr(self, 'selected_rect'):
			canvas.delete(self.selected_rect)
		self.selected_rect = canvas.create_rectangle(start_x,start_y,start_x,start_y,outline='red')
		# show "add", hide "rename" and "locate", propose the next marker name
		self.sp_cnf_btn_add_marker.pack()
		for hidden in (self.sp_cnf_btn_edit_marker, self.sp_cnf_btn_locate):
			hidden.pack_forget()
		self.sp_cnf_marker_name.set(self.sp_cnf_next_marker_name)
		
	def on_button_motion ( self, event ):
		'''Stretch the selection rectangle from the click point to the pointer.'''
		canvas = self.canvas_sc
		# pointer position in canvas coordinates, clamped to >= 0
		pointer_x = max(0, canvas.canvasx(event.x))
		pointer_y = max(0, canvas.canvasy(event.y))
		canvas.coords(self.selected_rect, self.click_x0, self.click_y0, pointer_x, pointer_y)

	def on_button_release ( self, event ):
		'''
		Finish the selection: normalise the rectangle, show its preview,
		start a new pattern for it and enable the "add marker" button.
		'''
		canvas = self.canvas_sc
		# release position in canvas coordinates, clamped to >= 0
		end_x = max(0, canvas.canvasx(event.x))
		end_y = max(0, canvas.canvasy(event.y))
		# order the corners so that (left, top) <= (right, bottom)
		left = min(self.click_x0, end_x)
		top = min(self.click_y0, end_y)
		right = max(self.click_x0, end_x)
		bottom = max(self.click_y0, end_y)
		canvas.coords(self.selected_rect, left, top, right, bottom)
		self.sp_preview(left, top, right, bottom)
		# the selected region becomes the rect of a new pattern
		self.spCnf.new_pattern()
		self.sp_cnf_select_rect(left, top, right, bottom)
		self.sp_cnf_btn_add_marker.config(state=Tk.NORMAL)
		

	# --- preview --- #
	def sp_preview ( self, x0, y0, x1, y1 ):
		'''Copy the region (x0, y0, x1, y1) of the screenshot and draw it in the preview.'''
		# integer pixel bounds
		left, top, right, bottom = int(x0), int(y0), int(x1), int(y1)
		# copy the region into a fresh PhotoImage through the Tk "copy" command
		region = Tk.PhotoImage()
		region.tk.call(region, 'copy', self.canvas_sc_img, '-from', left, top, right, bottom, '-to', 0, 0)
		# no preview item drawn yet for this source
		self.canvas_sp_cnf_prev_img_id = None
		self.canvas_sp_cnf_prev_img_source = (region, abs(right-left), abs(bottom-top))
		self.sp_preview_draw()
		
	def sp_preview_draw ( self ):
		'''Draw the stored preview source, zoomed by the current factor, centred on the preview canvas.'''
		canvas = self.canvas_sp_cnf_prev
		(source, src_w, src_h) = self.canvas_sp_cnf_prev_img_source
		# remove the previously drawn image, if any
		previous = self.canvas_sp_cnf_prev_img_id
		if previous:
			canvas.delete(previous)
		# zoomed copy and its size
		factor = self.canvas_sp_cnf_prev_img_zoom
		zoomed_w = src_w*factor
		zoomed_h = src_h*factor
		zoomed = source.zoom(factor)
		self.canvas_sp_cnf_prev_img = zoomed
		self.canvas_sp_cnf_prev_img_zoom = factor
		# top-left corner that centres the zoomed image
		canvas_w = int(canvas.cget('width'))
		canvas_h = int(canvas.cget('height'))
		left = canvas_w/2 - zoomed_w/2
		top = canvas_h/2 - zoomed_h/2
		self.canvas_sp_cnf_prev_img_id = canvas.create_image(left, top, image=zoomed, anchor="nw")
		# scroll region: the image plus a 5 pixel margin
		canvas.config(scrollregion=(left-5, top-5, canvas_w/2+zoomed_w/2+5, canvas_h/2+zoomed_h/2+5))

	def sp_prev_zoom_plus ( self, ev=None ):
		'''Double the preview zoom and redraw; does nothing without a selected region.'''
		if self.sp_cnf_region_is_selected:
			self.canvas_sp_cnf_prev_img_zoom = self.canvas_sp_cnf_prev_img_zoom * 2
			self.sp_preview_draw()
			# zooming out is possible again
			zoom_out = self.sp_cnf_preview_toolbar.btns['zoom_minus']
			zoom_out.configure(state=Tk.NORMAL)
		
	def sp_prev_zoom_minus ( self, ev=None ):
		'''Halve the preview zoom and redraw; does nothing without a selected region or below zoom 2.'''
		if not self.sp_cnf_region_is_selected:
			return
		current = self.canvas_sp_cnf_prev_img_zoom
		if current < 2:
			return
		self.canvas_sp_cnf_prev_img_zoom = current / 2
		self.sp_preview_draw()
		if current == 2:
			# the zoom was 2 before halving: disable further zooming out
			self.sp_cnf_preview_toolbar.btns['zoom_minus'].configure(state=Tk.DISABLED)

	def sp_prev_move ( self, ev, orientation, step ):
		'''
		Move one edge of the selected rectangle by step pixels.

		orientation -- 'N', 'S', 'W' or 'E': the edge to move; a positive
		               step moves that edge outwards
		Does nothing when no region is selected.
		'''
		if not self.sp_cnf_region_is_selected:
			return
		canvas = self.canvas_sc
		# rectangle of the current pattern
		(left, top, right, bottom) = self.spCnf.get_pattern().rect
		if orientation=='N':
			top -= step
		elif orientation=='S':
			bottom += step
		elif orientation=='W':
			left -= step
		elif orientation=='E':
			right += step
		# redraw the rectangle and its preview, then store the new rect
		canvas.coords(self.selected_rect, left, top, right, bottom)
		self.sp_preview(left,top,right,bottom)
		self.sp_cnf_select_rect(left, top, right, bottom)

	def sp_cnf_select_rect ( self, *coords ):
		'''Pass the coordinates (as a tuple) to spCnf.select_rect and flag a region as selected.'''
		config = self.spCnf
		config.select_rect(coords)
		self.sp_cnf_region_is_selected = True

	
	# --- markers --- #
	def sp_cnf_add_marker ( self, ev=None ):
		'''Create a marker named after the combobox text; ignored while the add button is disabled.'''
		add_btn = self.sp_cnf_btn_add_marker
		if add_btn.cget('state') == Tk.DISABLED:
			return
		# append the name to the combobox and select it
		marker_name = self.sp_cnf_marker_name.get()
		combo = self.combobox_markers
		names = tuple(combo['values']) + (marker_name,)
		combo['values'] = names
		combo.set(marker_name)
		self.sp_cnf_next_marker_name = 'marker'+str(len(names)+1)
		# register the marker in the configuration
		self.spCnf.add_marker(marker_name)
		# green outline, swap "add" for "rename" and "locate"
		self.canvas_sc.itemconfig(self.selected_rect, outline='green')
		add_btn.pack_forget()
		self.sp_cnf_btn_edit_marker.pack()
		self.sp_cnf_btn_locate.pack()
		self.set_status('Marker created')
		
	def sp_cnf_editing_marker_name ( self, ev=None ):
		'''Enable the rename button only while the typed name differs from the current marker name.'''
		marker = self.spCnf.get_pattern()
		typed = self.sp_cnf_marker_name.get()
		state = Tk.DISABLED if marker.name == typed else Tk.NORMAL
		self.sp_cnf_btn_edit_marker.config(state=state)
		
	def sp_cnf_rename_marker ( self, ev=None ):
		'''Rename the current marker to the combobox text, in the combobox list and in spCnf.'''
		new_name = self.sp_cnf_marker_name.get()
		old_name = self.spCnf.get_pattern().name
		# same list with the old name replaced
		renamed = []
		for name in self.combobox_markers['values']:
			renamed.append(new_name if name==old_name else name)
		self.combobox_markers['values'] = renamed
		self.spCnf.rename_marker(old_name, new_name)
		# nothing left to confirm
		self.sp_cnf_btn_edit_marker.config(state=Tk.DISABLED)
		self.set_status('Marker renamed')
		
	def sp_cnf_switch_marker ( self, ev=None ):
		'''Make the marker selected in the combobox current and preview its rectangle.'''
		selected = self.combobox_markers.get()
		marker = self.spCnf.switch_marker(selected)
		self.sp_preview(*marker.rect)
		# show "rename" and "locate" instead of "add"
		self.sp_cnf_btn_add_marker.pack_forget()
		for shown in (self.sp_cnf_btn_edit_marker, self.sp_cnf_btn_locate):
			shown.pack()
		# drop the rectangle drawn for the previous selection
		if hasattr(self, 'selected_rect'):
			self.canvas_sc.delete(self.selected_rect)

	def sp_cnf_locate_marker ( self, ev=None ):
		'''Locate the current pattern in the configuration image and outline the match in blue.'''
		template = self.spCnf.get_pattern().img_cv2
		(left, top) = self.sp.locate(self.spCnf.img_cv2, template)
		# shape is unpacked as (height, width, depth)
		(height, width, depth) = template.shape
		right = left + width
		bottom = top + height
		# replace any existing rectangle
		if hasattr(self, 'selected_rect'):
			self.canvas_sc.delete(self.selected_rect)
		self.selected_rect = self.canvas_sc.create_rectangle(left, top, right, bottom, outline='blue')

		
	# --- OCR --- #
	def sp_cnf_do_ocr ( self, ev=None ):
		'''Run OCR on the image of the current pattern and print the text.'''
		img = self.spCnf.create_pattern_image()
		txt = self.sp.do_ocr(img)
		# print(...) with one argument gives the same output on Python 2 and
		# also parses on Python 3 (the statement form "print txt" does not)
		print(txt)

	def do_nothing ( self, ev=None ):
		'''Placeholder callback: accepts an optional event and returns None.'''
		return None
Example #43
0
 def test_multi_vehic(self):
     """Storing INCIDENTS[6] must produce two Vehicle rows."""
     scraper = Scraper()
     scraper.store_data(INCIDENTS[6])
     vehicles = Vehicle.objects.all()
     self.assertEqual(2, len(vehicles))
Example #44
0
 def setUp(self):
     """Give every test a fresh Scraper on self.scraper."""
     fresh = Scraper()
     self.scraper = fresh
Example #45
0
 def test_closed(self):
     """Storing only INCIDENTS[1] must not create any Incident."""
     scraper = Scraper()
     scraper.store_data(INCIDENTS[1])
     incidents = Incident.objects.all()
     self.assertEqual(0, len(incidents))
Example #46
0
 def setUp(self):
     """Give every test a fresh Scraper set up with AdvertOptions.CARS."""
     fresh = Scraper()
     self.scraper = fresh
     fresh.set_autop(AdvertOptions.CARS)
Example #47
0
# them if needed.

import hashlib
import time
from models import *
from scraper.scraper import Scraper

encoding = 'UTF-8'

def current_milli_time():
    """Return the current time in whole milliseconds (benchmark helper)."""
    return int(round(time.time() * 1000))


# Crawl the whole index first and collect every article that can be found;
# the crawl is timed with the helper above.
print('Start crawling...')
start_time = current_milli_time()
scraper = Scraper()
articles = scraper.scrape()
time_taken = current_milli_time() - start_time
found_message = 'Found {} articles in {} ms'.format(len(articles), time_taken)
print(found_message)

# Now fill the database
print('Starting database work')

for article in articles:
    # build a hash so we can more easily find out if we have an article already
    h = hashlib.sha256()
    h.update(str(article['date']).encode(encoding))
    h.update(article['place'].encode(encoding))
    h.update(article['description'].encode(encoding))
    digest = h.digest()