コード例 #1
0
 def waitAnswer(self):
     """Block on the receiver queue and return the next item taken from it.

     A "waiting answer" line is printed before blocking and a "response"
     line afterwards; both reuse the timestamp captured on entry.
     """
     stamp = '[{}]'.format(str(now()))
     print(c.green(stamp), self.info(), '| waiting answer')
     # Blocks until something is put on the MSG_RECEIVER queue.
     answer = self.queue[TypeQ.MSG_RECEIVER].get(block=True)
     print(
         c.green(stamp),
         self.info(),
         '| response: ',
         c.orange(msg.readText(answer)),
     )
     return answer
コード例 #2
0
ファイル: scraper.py プロジェクト: MarkNo1/Utils
    def __init__(self, delay=10, log=False, headless=False, tor=False):
        """Launch Firefox through geckodriver and prepare an explicit wait.

        Args:
            delay: Timeout given to ``WebDriverWait``; also kept on
                ``self.delay``.
            log: Stored on ``self.log``.
            headless: When true, ``-headless`` is added to the Firefox
                options.
            tor: When true, the profile is pointed at the SOCKS proxy on
                127.0.0.1:9150 and a reminder is printed.
        """
        # Binary path
        driver_path = pkg_resources.resource_filename(pkg_name,
                                                      geckodriver_file)
        ff_profile = webdriver.FirefoxProfile()
        if tor:
            print(c.red('Warning '), c.blue(
                'Be sure you have Tor browser opened in backgroud!'))
            print('Tor Proxy', c.green('Enabled'))
            proxy_settings = (
                ('network.proxy.type', 1),
                ('network.proxy.socks', '127.0.0.1'),
                ('network.proxy.socks_port', 9150),
            )
            for pref_name, pref_value in proxy_settings:
                ff_profile.set_preference(pref_name, pref_value)

        # Every cache-related preference is switched off.
        cache_prefs = (
            "browser.cache.disk.enable",
            "browser.cache.memory.enable",
            "browser.cache.offline.enable",
            "network.http.use-cache",
        )
        for pref_name in cache_prefs:
            ff_profile.set_preference(pref_name, False)

        ff_options = Options()
        if headless:
            ff_options.add_argument('-headless')

        self.webBrowser = webdriver.Firefox(
            firefox_profile=ff_profile,
            executable_path=driver_path,
            firefox_options=ff_options)

        self.delay = delay
        self.log = log
        self.log_file = 'geckodriver.log'
        self.wait = WebDriverWait(self.webBrowser, timeout=delay)
コード例 #3
0
 def what(self, text=None):
     """Send the search text to the element located by ``Target.Search.name``.

     Args:
         text: Text to type; when ``None`` it is read from standard input.
     """
     tag = c.green('@what')
     query = input('What you want search ->  ') if text is None else text
     field = Page.scraper.get_element_BY(Target.Search.name)
     field.send_keys(query)
     print(Page.I + self.I + tag, c.underline(query))
コード例 #4
0
    def find_elements_BY(self, element, target):
        """Look up nested elements of ``element`` with a one-entry locator.

        Args:
            element: Object handed to the private ``__find_elements_by_*``
                helper selected by the locator type.
            target: Dictionary holding a ``{type: value}`` pair. The type is
                compared against ``CLASS``, ``ID``, ``XPATH``, ``TAG`` and
                ``LINK``; the caller's dictionary is not modified.

        Returns:
            The result of the selected helper, or ``None`` when the type
            matches none of the constants or the lookup raised.

        Raises:
            AssertionError: If ``target`` is not a dictionary.
        """
        assert isinstance(
            target,
            dict), 'Target must be a dictionary with key:value [type:target]'
        nested_element = None
        # Bound up front so the error report can always be formatted, even
        # when unpacking ``target`` itself fails (e.g. an empty dictionary).
        _type = _target = None
        try:
            # popitem() runs on a copy to leave the caller's dict intact.
            _type, _target = target.copy().popitem()

            # Equality, not identity: the check must not rely on the caller
            # passing the very same constant objects.
            if _type == CLASS:
                nested_element = self.__find_elements_by_class(
                    element, _target)
            if _type == ID:
                nested_element = self.__find_elements_by_id(element, _target)
            if _type == XPATH:
                nested_element = self.__find_elements_by_xpath(
                    element, _target)
            if _type == TAG:
                nested_element = self.__find_elements_by_tag(element, _target)
            if _type == LINK:
                nested_element = self.__find_elements_by_partial_link_text(
                    element, _target)

        except Exception as e:
            print(
                'Exception: {}: {} -> {} - is not present in page: {}'.format(
                    e, c.blue(str(_type)), c.red(str(_target)),
                    c.orange(self.webBrowser.current_url)))

        # Deliberately not a ``finally: return``: that form would also
        # discard KeyboardInterrupt/SystemExit raised during the lookup.
        if self.log and nested_element is not None:
            print('{} ---> {}'.format(c.blue(_type), c.green(_target)))
        return nested_element
コード例 #5
0
    def get_single_element_BY(self, target):
        """Fetch one element using a one-entry locator dictionary.

        Args:
            target: Dictionary holding a ``{type: value}`` pair. The type is
                compared against ``CLASS``, ``ID``, ``XPATH`` and ``TAG``;
                the caller's dictionary is not modified.

        Returns:
            The result of the selected ``__get_element_by_*`` helper, or
            ``None`` when the type is unknown or the lookup raised.

        Raises:
            AssertionError: If ``target`` is not a dictionary.
        """
        assert isinstance(
            target,
            dict), 'Target must be a dictionary with key:value [type:target]'
        element = None
        # Bound up front so the error report can always be formatted, even
        # when unpacking ``target`` itself fails (e.g. an empty dictionary).
        _type = _target = None
        try:
            # popitem() runs on a copy to leave the caller's dict intact.
            _type, _target = target.copy().popitem()

            # Equality, not identity: the check must not rely on the caller
            # passing the very same constant objects.
            if _type == CLASS:
                element = self.__get_element_by_class(_target)
            elif _type == ID:
                element = self.__get_element_by_id(_target)
            elif _type == XPATH:
                element = self.__get_element_by_xpath(_target)
            elif _type == TAG:
                element = self.__get_element_by_tag(_target)
            else:
                print('Error - Type ( {} ).'.format(_type))

        except Exception as e:
            print(
                'Exception: {}: {} -> {} - is not present in page: {}'.format(
                    e, c.blue(str(_type)), c.red(str(_target)),
                    c.orange(self.webBrowser.current_url)))

        # Deliberately not a ``finally: return``: that form would also
        # discard KeyboardInterrupt/SystemExit raised during the lookup.
        if self.log and element is not None:
            print('{} ---> {}'.format(c.blue(_type), c.green(_target)))
        return element
コード例 #6
0
 def song(song):
     """Write a one-line description of ``song`` to standard output.

     Nothing is written when ``song`` is falsy. Otherwise its 'Artist',
     'Title', 'Album' and 'Year' entries are read and printed after the
     current time.
     """
     if not song:
         return
     artist, title, album, year = (
         c.purple(str(song[field]))
         for field in ('Artist', 'Title', 'Album', 'Year'))
     clock = c.pink(time.strftime("%X"))
     line = '\n|{}: - Artist: {}, Title: {}, Album: {}, Year: {}.'.format(
         clock, artist, title, album, year)
     sys.stdout.write(line)
コード例 #7
0
        def next(self, check=False):
            """Move to the following result page when one is available.

            Args:
                check: When true, only report whether a next link was found
                    without opening it.

            Returns:
                ``True`` if a next link was found, ``False`` otherwise.
            """
            tag = c.green('@next')
            button = Page.scraper.get_element_BY(Target.ListAd.button_next)
            target_link = Page.scraper.find_elements_BY(
                button, Target.ListAd.link)

            if not target_link:
                print(Page.I + self.I + tag, c.red('Finish'))
                return False

            if not check:
                Page.scraper.openUrl(target_link)
                print(Page.I + self.I + tag, c.underline('Next'))
            return True
コード例 #8
0
 def catagory(self, idx=None):
     """List the category options and click the chosen one.

     Args:
         idx: Index into the listed options; when ``None`` it is read from
             standard input after the list has been printed.
     """
     tag = c.green('@catagory')
     dropdown = Select(Page.scraper.get_element_BY(Target.Search.category))
     choices = dropdown.options
     print(c.underline('Category:'))
     for position, choice in enumerate(choices):
         number = c.orange(str(position))
         print('[{}] - {}'.format(number, c.blue(choice.text)))
     if idx is None:
         idx = int(input('\nInsert number [i] -> '))
     # The label is read before clicking, as the click may change the page.
     label = choices[idx].text
     choices[idx].click()
     print(Page.I + self.I + tag, c.underline(label))
コード例 #9
0
 def area(self, idx=None):
     """List the geographic-area options and click the chosen one.

     Args:
         idx: Index into the listed options; when ``None`` it is read from
             standard input after the list has been printed.
     """
     tag = c.green('@area')
     dropdown = Select(Page.scraper.get_element_BY(Target.Search.area))
     choices = dropdown.options
     print(c.underline('Geographic Area:'))
     for position, choice in enumerate(choices):
         number = c.orange(str(position))
         print('[{}] - {}'.format(number, c.blue(choice.text)))
     if idx is None:
         idx = int(input('insert the number [i] -> '))
     # The label is read before clicking, as the click may change the page.
     label = choices[idx].text
     choices[idx].click()
     print(Page.I + self.I + tag, c.underline(label))
コード例 #10
0
 def links(self):
     """Collect one link lookup result per ad entry of the current page.

     Returns:
         A list with, for every element returned by
         ``get_nested_elements_from_root``, the result of looking up
         ``Target.ListAd.link`` inside its first ``raw_link`` match.
     """
     tag = c.green('@links')
     scraper = Page.scraper
     entries = scraper.get_nested_elements_from_root(
         Target.ListAd.list_link_element)
     found = [
         scraper.find_elements_BY(
             scraper.find_elements_BY(entry, Target.ListAd.raw_link)[0],
             Target.ListAd.link)
         for entry in entries
     ]
     count = c.underline(str(len(found)))
     print(Page.I + self.I + tag, 'Founded links: ', count)
     return found
コード例 #11
0
    class ListAd:
        """Helpers for the paginated list of ads, driven via ``Page.scraper``."""

        I = c.orange('ListAd/')

        def links(self):
            """Collect one link lookup result per ad entry of the current page.

            Returns:
                A list with, for every element returned by
                ``get_nested_elements_from_root``, the result of looking up
                ``Target.ListAd.link`` inside its first ``raw_link`` match.
            """
            D = c.green('@links')
            links = []
            raw_ads = Page.scraper.get_nested_elements_from_root(
                Target.ListAd.list_link_element)
            for raw_ad in raw_ads:
                raw_link = Page.scraper.find_elements_BY(
                    raw_ad, Target.ListAd.raw_link)
                link = Page.scraper.find_elements_BY(raw_link[0],
                                                     Target.ListAd.link)
                links.append(link)
            print(Page.I + self.I + D, 'Founded links: ',
                  c.underline(str(len(links))))
            return links

        def next(self, check=False):
            """Move to the following result page when one is available.

            Args:
                check: When true, only report whether a next link was found
                    without opening it.

            Returns:
                ``True`` if a next link was found, ``False`` otherwise.
            """
            D = c.green('@next')
            next_b = Page.scraper.get_element_BY(Target.ListAd.button_next)
            link = Page.scraper.find_elements_BY(next_b, Target.ListAd.link)
            if link:
                if check:
                    return True

                Page.scraper.openUrl(link)
                print(Page.I + self.I + D, c.underline('Next'))
                return True
            print(Page.I + self.I + D, c.red('Finish'))
            return False

        def index_page(self):
            """Return the text found between ``<strong>`` tags in the pager.

            The ``innerHTML`` of the ``Target.ListAd.number_page`` element is
            passed to ``between`` to extract it.
            """
            idx = Page.scraper.get_element_BY(
                Target.ListAd.number_page).get_attribute('innerHTML')
            idx = between('<strong>', '</strong>', idx)
            return idx

        def pages_links(self, chs=None):
            """Walk the result pages and gather the links found on each.

            Args:
                chs: Number of pages to visit, or ``'all'`` (capped at
                    10000 iterations). When ``None`` it is read from
                    standard input.

            Returns:
                A dictionary mapping each ``index_page()`` value to the list
                returned by ``links()`` for that page. It is empty when
                ``chs`` cannot be parsed as an integer.
            """
            if chs is None:
                chs = input(
                    'How many page you want scrab? type: int or "all" \n')

            if chs == 'all':
                chs = 10000
            else:
                try:
                    chs = int(chs)
                except ValueError:
                    print('type: int or "all"')
                    # Without a usable page count there is nothing to walk;
                    # falling through would hand a string to range().
                    return dict()

            links = dict()
            for _ in tqdm(range(chs), desc='Pages: '):
                # Read the page index first, then the links, in that order.
                idx = self.index_page()
                links[idx] = self.links()
                if not self.next():
                    break
            return links
コード例 #12
0
    class Search:
        """Helpers that fill in and submit the search form via ``Page.scraper``."""

        I = c.orange('Search/')

        def goto_url(self):
            """Open the site's home page in the shared scraper."""
            tag = c.green('@goto_url')
            home = 'https://www.subito.it'
            Page.scraper.openUrl(home)
            print(Page.I + self.I + tag, c.underline(home))

        def what(self, text=None):
            """Send the search text to the ``Target.Search.name`` element.

            Args:
                text: Text to type; when ``None`` it is read from standard
                    input.
            """
            tag = c.green('@what')
            query = (input('What you want search ->  ')
                     if text is None else text)
            field = Page.scraper.get_element_BY(Target.Search.name)
            field.send_keys(query)
            print(Page.I + self.I + tag, c.underline(query))

        def catagory(self, idx=None):
            """List the category options and click the chosen one.

            Args:
                idx: Index into the listed options; when ``None`` it is read
                    from standard input after the list has been printed.
            """
            tag = c.green('@catagory')
            dropdown = Select(
                Page.scraper.get_element_BY(Target.Search.category))
            choices = dropdown.options
            print(c.underline('Category:'))
            for position, choice in enumerate(choices):
                number = c.orange(str(position))
                print('[{}] - {}'.format(number, c.blue(choice.text)))
            if idx is None:
                idx = int(input('\nInsert number [i] -> '))
            label = choices[idx].text
            choices[idx].click()
            print(Page.I + self.I + tag, c.underline(label))

        def area(self, idx=None):
            """List the geographic-area options and click the chosen one.

            Args:
                idx: Index into the listed options; when ``None`` it is read
                    from standard input after the list has been printed.
            """
            tag = c.green('@area')
            dropdown = Select(Page.scraper.get_element_BY(Target.Search.area))
            choices = dropdown.options
            print(c.underline('Geographic Area:'))
            for position, choice in enumerate(choices):
                number = c.orange(str(position))
                print('[{}] - {}'.format(number, c.blue(choice.text)))
            if idx is None:
                idx = int(input('insert the number [i] -> '))
            label = choices[idx].text
            choices[idx].click()
            print(Page.I + self.I + tag, c.underline(label))

        def continue_button(self):
            """Click the element located by ``Target.Search.button_continue``."""
            tag = c.green('@continue_button')
            Page.scraper.get_element_BY(Target.Search.button_continue).click()
            print(Page.I + self.I + tag, c.underline('Continue'))
コード例 #13
0
 def __init__(self, user, queue):
     """Set up the thread as a daemon and attach its message queues.

     Args:
         user: Stored on ``self.user``.
         queue: Mapping of queues to use; when falsy, a fresh mapping with
             one ``Queue`` per ``TypeQ`` channel is created instead.
     """
     Thread.__init__(self)
     self.user = user
     self.entityName = type(self).__name__
     self.daemon = True
     self.timeStart = now()
     stamp = '[{}]'.format(str(self.timeStart))
     print(c.green(stamp), ' Start ', self.info())
     # ``or`` builds the default queues only when none were supplied.
     self.queue = queue or {
         TypeQ.MSG_SENDER: Queue(),
         TypeQ.MSG_RECEIVER: Queue(),
         TypeQ.SWITCH_CONVERSATION: Queue(),
     }
コード例 #14
0
    class Ad:
        """Helpers that read the details of single ad pages via ``Page.scraper``."""

        I = c.orange('Ad/')

        def record(self, url, page_idx):
            """Open ``url`` and extract the ad's fields into a dictionary.

            Args:
                url: Address handed to ``Page.scraper.openUrl``.
                page_idx: Stored unchanged under ``'page_index'``.

            Returns:
                A dictionary with the keys ``page_index``, ``url``, ``info``,
                ``description``, ``date``, ``name`` and ``phone``. ``phone``
                is the string ``'None'`` when no phone button is found.
            """
            record = dict()
            record['page_index'] = page_idx
            record['url'] = url
            Page.scraper.openUrl(url)
            # General Info
            record['info'] = Page.scraper.get_element_BY(
                Target.Ad.summary).text.split('\n')
            # Description
            record['description'] = Page.scraper.get_element_BY(
                Target.Ad.description).text
            # Date - name: both come from the same element, so it is looked
            # up once and its text split into lines.
            date_name = Page.scraper.get_element_BY(
                Target.Ad.date_name).text.split('\n')
            record['date'] = date_name[0]
            record['name'] = date_name[1]
            # Phone
            phone_button = Page.scraper.get_element_BY(Target.Ad.phone_button)
            if phone_button:
                # Clicked through JavaScript rather than element.click().
                Page.scraper.webBrowser.execute_script("arguments[0].click();",
                                                       phone_button)
                # Pause before reading the number element.
                sleep(1)
                record['phone'] = Page.scraper.get_element_BY(
                    Target.Ad.phone_number).text
            else:
                record['phone'] = 'None'

            return record

        def records_from_ad_links(self, ad_links):
            """Build one record per ad URL, skipping the ads that fail.

            Args:
                ad_links: Mapping of page index to an iterable of ad URLs.

            Returns:
                A list of the dictionaries produced by ``record`` for every
                URL that could be processed.
            """
            temp_db = []
            for page, list_link in tqdm(ad_links.items(), desc='Pages: '):
                for url in tqdm(list_link, desc='Links: '):
                    try:
                        temp_db.append(self.record(url, page))
                    except Exception as e:
                        # Best effort: one broken ad must not stop the run,
                        # but the failure is reported instead of hidden.
                        print(c.red('Skipped {}: {}'.format(url, e)))

            return temp_db
コード例 #15
0
class Page:
    """Namespace grouping the page helpers that share a single ``Scraper``.

    The scraper is created when the class body executes; the nested classes
    reach it through ``Page.scraper``.
    """

    scraper = Scraper(log=LOG, headless=HEADLESS, tor=TOR)
    I = c.blue('Page/')

    class Search:
        """Helpers that fill in and submit the search form."""

        I = c.orange('Search/')

        def goto_url(self):
            """Open the site's home page in the shared scraper."""
            D = c.green('@goto_url')
            url = 'https://www.subito.it'
            Page.scraper.openUrl(url)
            print(Page.I + self.I + D, c.underline(url))

        def what(self, text=None):
            """Send the search text to the ``Target.Search.name`` element.

            Args:
                text: Text to type; when ``None`` it is read from standard
                    input.
            """
            D = c.green('@what')
            if text is None:
                text = input('What you want search ->  ')
            search = Page.scraper.get_element_BY(Target.Search.name)
            search.send_keys(text)
            print(Page.I + self.I + D, c.underline(text))

        def catagory(self, idx=None):
            """List the category options and click the chosen one.

            Args:
                idx: Index into the listed options; when ``None`` it is read
                    from standard input after the list has been printed.
            """
            D = c.green('@catagory')
            category = Page.scraper.get_element_BY(Target.Search.category)
            view = Select(category)
            options = view.options
            print(c.underline('Category:'))
            for i, op in enumerate(options):
                print('[{}] - {}'.format(c.orange(str(i)), c.blue(op.text)))
            if idx is None:
                idx = int(input('\nInsert number [i] -> '))
            chs = options[idx].text
            options[idx].click()
            print(Page.I + self.I + D, c.underline(chs))

        def area(self, idx=None):
            """List the geographic-area options and click the chosen one.

            Args:
                idx: Index into the listed options; when ``None`` it is read
                    from standard input after the list has been printed.
            """
            D = c.green('@area')
            area = Page.scraper.get_element_BY(Target.Search.area)
            view = Select(area)
            options = view.options
            print(c.underline('Geographic Area:'))
            for i, op in enumerate(options):
                print('[{}] - {}'.format(c.orange(str(i)), c.blue(op.text)))
            if idx is None:
                idx = int(input('insert the number [i] -> '))
            chs = options[idx].text
            options[idx].click()
            print(Page.I + self.I + D, c.underline(chs))

        def continue_button(self):
            """Click the element located by ``Target.Search.button_continue``."""
            D = c.green('@continue_button')
            b_continue = Page.scraper.get_element_BY(
                Target.Search.button_continue)
            b_continue.click()
            print(Page.I + self.I + D, c.underline('Continue'))

    class ListAd:
        """Helpers for the paginated list of ads."""

        I = c.orange('ListAd/')

        def links(self):
            """Collect one link lookup result per ad entry of the current page.

            Returns:
                A list with, for every element returned by
                ``get_nested_elements_from_root``, the result of looking up
                ``Target.ListAd.link`` inside its first ``raw_link`` match.
            """
            D = c.green('@links')
            links = []
            raw_ads = Page.scraper.get_nested_elements_from_root(
                Target.ListAd.list_link_element)
            for raw_ad in raw_ads:
                raw_link = Page.scraper.find_elements_BY(
                    raw_ad, Target.ListAd.raw_link)
                link = Page.scraper.find_elements_BY(raw_link[0],
                                                     Target.ListAd.link)
                links.append(link)
            print(Page.I + self.I + D, 'Founded links: ',
                  c.underline(str(len(links))))
            return links

        def next(self, check=False):
            """Move to the following result page when one is available.

            Args:
                check: When true, only report whether a next link was found
                    without opening it.

            Returns:
                ``True`` if a next link was found, ``False`` otherwise.
            """
            D = c.green('@next')
            next_b = Page.scraper.get_element_BY(Target.ListAd.button_next)
            link = Page.scraper.find_elements_BY(next_b, Target.ListAd.link)
            if link:
                if check:
                    return True

                Page.scraper.openUrl(link)
                print(Page.I + self.I + D, c.underline('Next'))
                return True
            print(Page.I + self.I + D, c.red('Finish'))
            return False

        def index_page(self):
            """Return the text found between ``<strong>`` tags in the pager.

            The ``innerHTML`` of the ``Target.ListAd.number_page`` element is
            passed to ``between`` to extract it.
            """
            idx = Page.scraper.get_element_BY(
                Target.ListAd.number_page).get_attribute('innerHTML')
            idx = between('<strong>', '</strong>', idx)
            return idx

        def pages_links(self, chs=None):
            """Walk the result pages and gather the links found on each.

            Args:
                chs: Number of pages to visit, or ``'all'`` (capped at
                    10000 iterations). When ``None`` it is read from
                    standard input.

            Returns:
                A dictionary mapping each ``index_page()`` value to the list
                returned by ``links()`` for that page. It is empty when
                ``chs`` cannot be parsed as an integer.
            """
            if chs is None:
                chs = input(
                    'How many page you want scrab? type: int or "all" \n')

            if chs == 'all':
                chs = 10000
            else:
                try:
                    chs = int(chs)
                except ValueError:
                    print('type: int or "all"')
                    # Without a usable page count there is nothing to walk;
                    # falling through would hand a string to range().
                    return dict()

            links = dict()
            for _ in tqdm(range(chs), desc='Pages: '):
                # Read the page index first, then the links, in that order.
                idx = self.index_page()
                links[idx] = self.links()
                if not self.next():
                    break
            return links

    class Ad:
        """Helpers that read the details of single ad pages."""

        I = c.orange('Ad/')

        def record(self, url, page_idx):
            """Open ``url`` and extract the ad's fields into a dictionary.

            Args:
                url: Address handed to ``Page.scraper.openUrl``.
                page_idx: Stored unchanged under ``'page_index'``.

            Returns:
                A dictionary with the keys ``page_index``, ``url``, ``info``,
                ``description``, ``date``, ``name`` and ``phone``. ``phone``
                is the string ``'None'`` when no phone button is found.
            """
            record = dict()
            record['page_index'] = page_idx
            record['url'] = url
            Page.scraper.openUrl(url)
            # General Info
            record['info'] = Page.scraper.get_element_BY(
                Target.Ad.summary).text.split('\n')
            # Description
            record['description'] = Page.scraper.get_element_BY(
                Target.Ad.description).text
            # Date - name: both come from the same element, so it is looked
            # up once and its text split into lines.
            date_name = Page.scraper.get_element_BY(
                Target.Ad.date_name).text.split('\n')
            record['date'] = date_name[0]
            record['name'] = date_name[1]
            # Phone
            phone_button = Page.scraper.get_element_BY(Target.Ad.phone_button)
            if phone_button:
                # Clicked through JavaScript rather than element.click().
                Page.scraper.webBrowser.execute_script("arguments[0].click();",
                                                       phone_button)
                # Pause before reading the number element.
                sleep(1)
                record['phone'] = Page.scraper.get_element_BY(
                    Target.Ad.phone_number).text
            else:
                record['phone'] = 'None'

            return record

        def records_from_ad_links(self, ad_links):
            """Build one record per ad URL, skipping the ads that fail.

            Args:
                ad_links: Mapping of page index to an iterable of ad URLs.

            Returns:
                A list of the dictionaries produced by ``record`` for every
                URL that could be processed.
            """
            temp_db = []
            for page, list_link in tqdm(ad_links.items(), desc='Pages: '):
                for url in tqdm(list_link, desc='Links: '):
                    try:
                        temp_db.append(self.record(url, page))
                    except Exception as e:
                        # Best effort: one broken ad must not stop the run,
                        # but the failure is reported instead of hidden.
                        print(c.red('Skipped {}: {}'.format(url, e)))

            return temp_db
コード例 #16
0
ファイル: subito.py プロジェクト: MarkNo1/Web-Scraper
 def listAdPage(self, n_page):
     """Collect the ad links of ``n_page`` pages into ``self.ad_links``.

     Args:
         n_page: Passed straight to ``self.listad.pages_links``.

     Any exception raised while collecting is printed in red instead of
     being propagated; ``self.ad_links`` is then left unassigned by this
     call.
     """
     try:
         self.ad_links = self.listad.pages_links(n_page)
     except Exception as e:
         # The colour helpers are given strings everywhere else, so the
         # exception is converted explicitly before being coloured.
         print(c.red(str(e)))
コード例 #17
0
 def sendTextWithKeyboard(self, text, buttonList):
     """Queue a keyboard message for the user and wait for the reply.

     Args:
         text: Message text; also echoed in the log line.
         buttonList: Passed to ``msg.createKeyboardMessage``.

     Returns:
         Whatever ``self.waitAnswer()`` returns.
     """
     outgoing = msg.createKeyboardMessage(self.user['id'], text, buttonList)
     self.addMessageToSenderQ(outgoing)
     stamp = '[{}]'.format(str(now()))
     print(c.green(stamp), self.info(), '| sent: ', c.light_blue(text))
     return self.waitAnswer()
コード例 #18
0
 def sendText(self, text):
     """Queue a plain text message for the user and log that it was sent.

     Args:
         text: Message text; also echoed in the log line.
     """
     outgoing = msg.createTextMessage(self.user['id'], text)
     self.addMessageToSenderQ(outgoing)
     stamp = '[{}]'.format(str(now()))
     print(c.green(stamp), self.info(), '| sent: ', c.light_blue(text))
コード例 #19
0
 def goto_url(self):
     """Open the site's home page in the shared scraper and log it."""
     tag = c.green('@goto_url')
     home = 'https://www.subito.it'
     Page.scraper.openUrl(home)
     print(Page.I + self.I + tag, c.underline(home))
コード例 #20
0
 def __del__(self):
     """Record the end time and print how long the entity was alive."""
     self.timeEnd = now()
     stamp = '[{}]'.format(str(self.timeEnd))
     print(
         c.orange(stamp),
         ' End ',
         self.info(),
         '| DT: ',
         delta(self.timeStart, self.timeEnd),
     )
コード例 #21
0
 def station(station):
     """Write the station label, coloured orange, to standard output."""
     line = '\n|Station: {} '.format(c.orange(station))
     sys.stdout.write(line)
コード例 #22
0
 def radio(radio):
     """Write the radio's ``name``, coloured blue, to standard output."""
     line = '\n\n|Radio: {} '.format(c.blue(radio.name))
     sys.stdout.write(line)
コード例 #23
0
 def openUrl(self, url):
     """Load ``url`` in the browser and, when logging is on, print it.

     Args:
         url: Address passed to ``self.webBrowser.get``.
     """
     self.webBrowser.get(url)
     if not self.log:
         return
     print('Page: {}'.format(c.orange(url)))
コード例 #24
0
 def run(self):
     """Execute ``self.task()``, printing any exception instead of raising."""
     try:
         self.task()
     except Exception as error:
         message = str(error)
         print(c.red(message))
コード例 #25
0
 def info(self):
     """Return a log prefix built from the entity name and the username."""
     prefix = '| ' + str(self.entityName) + '| '
     return prefix + c.blue(str(self.user['username']))
コード例 #26
0
 def continue_button(self):
     """Click the element located by ``Target.Search.button_continue``."""
     tag = c.green('@continue_button')
     Page.scraper.get_element_BY(Target.Search.button_continue).click()
     print(Page.I + self.I + tag, c.underline('Continue'))