def __init__(self, use_proxy=False):
     """
     Set up a parser instance.

     The browser is started first via ``browser_setup``; afterwards the
     path stored in ``tg_sources_path`` and a fresh ``Keywords`` object
     are attached to the instance.

     :param use_proxy: bool, forwarded unchanged to ``browser_setup``
     """
     browser = self.browser_setup(use_proxy=use_proxy)
     self.browser = browser
     self.tg_sources_path = 'telegram_parsing/channels.txt'
     self.keywords = Keywords()
Ejemplo n.º 2
0
    def add_keyword(self, new_word):
        """
        Attach ``new_word`` to this user.

        The word is appended to the in-memory ``self.keywords`` list, added
        to a ``Keywords`` object, and finally the user's document (matched
        by ``name``) gets its ``keywords`` field overwritten with the
        updated list.

        :param new_word: keyword to add
        :return: None
        """
        self.keywords.append(new_word)
        Keywords().add(new_word)

        users = mongo.db.users
        users.update(
            {"name": self.username},
            {"$set": {"keywords": self.keywords}},
        )
Ejemplo n.º 3
0
    def check_user_weight(self) -> None:
        """
        Rebuild ``self.weights`` for the 'telegram' and 'twitter' sources.

        For every source, and for every position ``now`` in
        ``range(config.NUMBER_WORDS)``, the entry found at that position in
        each of the user's keywords' ``links[source]`` lists is collected,
        one entry is selected, and it is appended to the list stored in
        ``self.weights[source]``. When nothing can be selected for a
        position, the empty string ``''`` is appended instead.

        Takes no arguments besides ``self``; the result is written to
        ``self.weights`` rather than returned.

        :return: None
        """
        for source in ['telegram', 'twitter']:
            # Entries chosen so far for this source, one per position.
            new_links = []
            words = Keywords()
            for now in range(config.NUMBER_WORDS):

                # dct:  entry[1] -> score used for the selection below.
                # dct1: entry[1] -> the full entry (links[source][now]).
                # presumably entry[1] is the link itself - TODO confirm
                dct = {}
                dct1 = {}
                for link in self.keywords:
                    # `link` is rebound from the keyword name to the object
                    # looked up in `words`.
                    link = words[link]
                    try:
                        # NOTE(review): the value is stored under entry[1] but
                        # the previous value is looked up under entry[0]; if a
                        # per-link occurrence count was intended, the lookup
                        # key looks wrong - confirm before changing.
                        dct[link.links[source][now][1]] = dct.get(link.links[source][now][0], 0) + 1
                        dct1[link.links[source][now][1]] = link.links[source][now]
                    except IndexError:
                        # presumably this keyword has fewer than now + 1
                        # entries for this source; it is skipped.
                        pass
                maxi = 0
                max_link = ''
                for i in dct:
                    try:
                        # Strict '>' means that, among equal scores, the first
                        # key in dict iteration order wins. Keys already chosen
                        # for an earlier position are excluded.
                        # NOTE(review): once a '' placeholder is in new_links,
                        # x[1] raises IndexError for it, which the handler
                        # below swallows - so this candidate is skipped.
                        if dct[i] > maxi and i not in [x[1] for x in new_links]:
                            maxi = dct[i]
                            max_link = dct1[i]
                    except IndexError:
                        pass
                # Stays '' when no candidate was selected for this position.
                new_links.append(max_link)
            self.weights[source] = new_links
Ejemplo n.º 4
0
 def get_full_data(self):
     """
     Collect the info of every keyword of this user.

     Each name in ``self.keywords`` is looked up in a ``Keywords`` object
     and mapped to the result of that entry's ``get_info()``.

     :return: dict mapping keyword name -> ``get_info()`` result
     """
     registry = Keywords()
     return {name: registry[name].get_info() for name in self.keywords}
Ejemplo n.º 5
0
 def get_user_weight(self, link):
     """
     Compute how much ``link`` weighs for this particular user.

     Walks over the user's keywords and adds up the value stored for
     ``link`` in each keyword's ``links_dict``; keywords that do not
     contain the link contribute nothing.

     :param link: str
     :return: int
     """
     registry = Keywords()
     total = 0
     for name in self.keywords:
         per_link = registry.keywords[name].links_dict
         if link not in per_link:
             continue
         total += per_link[link]
     return total
class Parser:
    """
    Main Parser class.

    Holds a Selenium browser session (``self.browser``) and a ``Keywords``
    object (``self.keywords``). The scraping itself is delegated to the
    module-level ``parse_telegram`` / ``parse_twitter`` functions, which
    receive this instance.
    """
    def __init__(self, use_proxy=False):
        """
        Parser initialisation by launching the browser.

        :param use_proxy: bool, forwarded to ``browser_setup``
        """
        self.browser = self.browser_setup(use_proxy=use_proxy)
        self.tg_sources_path = 'telegram_parsing/channels.txt'
        self.keywords = Keywords()

    def parse_telegram(self):
        """
        Launches telegram channels parsing.
        :return: None
        """
        # Resolves to the module-level function, not to this method:
        # method names are not in scope inside a method body.
        parse_telegram(self)

    def parse_twitter(self):
        """
        Launches twitter parsing.
        :return: None
        """
        # Resolves to the module-level function, not to this method.
        parse_twitter(self)

    @staticmethod
    def by_class(search_in, class_name, get_text=False, parse_all=False):
        """
        Search for an element by its class_name.

        :param search_in: Selenium WebElement (or driver) to search in
        :param class_name: str
        :param get_text: bool, return the element's ``text`` instead of the
            element itself; ignored when ``parse_all`` is set
        :param parse_all: bool, return every match (the result of
            ``find_elements_by_class_name``) instead of a single element
        :return: the element, its text, or the collection of matches;
            None if ``NoSuchElementException`` was raised
        """
        result = None
        try:
            if parse_all:
                result = search_in.find_elements_by_class_name(class_name)
            else:
                result = search_in.find_element_by_class_name(class_name)
                if result and get_text:
                    result = result.text
        except NoSuchElementException:
            # A missing element is an expected outcome; report it as None.
            pass
        return result

    def quit(self):
        """
        Quit parsing by safely closing and quitting the browser.
        :return: None
        """
        self.browser.close()
        self.browser.quit()

    def new_link(self, text, link, source, info):
        """
        Forward a newly found post to ``self.keywords.add_new_link``.

        All four arguments are passed through unchanged.

        :param text: str
        :param link: str
        :param source: passed through as-is
            (presumably 'telegram' or 'twitter' - TODO confirm)
        :param info: passed through as-is
        :return: None
        """
        self.keywords.add_new_link(text, link, source, info)

    def browser_setup(self, iterator=0, update_proxies=False, use_proxy=True):
        """
        Initial browser setup.

        When ``use_proxy`` is set, the entries of ``PROXIES`` are tried in
        order, starting at index ``iterator``, and the first one that
        ``is_bad_proxy`` does not reject is installed in Chrome's desired
        capabilities. If no entry passes, no proxy is configured.

        If this instance already has a ``browser`` attribute, the existing
        session is re-attached; otherwise a new Chrome instance is started.

        :param iterator: int, index in ``PROXIES`` to start trying from
        :param update_proxies: bool, refresh ``PROXIES`` from ``REQ_PROXY``
            before choosing
        :param use_proxy: bool, whether to configure a proxy at all
        :return: Selenium WebDriver
        """
        global PROXIES, REQ_PROXY

        if update_proxies:
            # Re-runs the initialiser on the existing REQ_PROXY object
            # before asking it for a new proxy list.
            REQ_PROXY.__init__()
            PROXIES = REQ_PROXY.get_proxy_list()

        # Stays None when proxies are disabled or none of them is usable,
        # so the name is always bound for the attach_to_session call below.
        proxy = None

        if use_proxy:
            for index, current_proxy in enumerate(PROXIES):
                if index < iterator:
                    continue
                # Test the proxy this iteration is actually looking at.
                address = current_proxy.get_address()
                if is_bad_proxy(address):
                    print("%s is a BAD PROXY" % (current_proxy,))
                    continue
                proxy = address
                logger.info("Chosen Proxy: %s", proxy)
                webdriver.DesiredCapabilities.CHROME['proxy'] = {
                    "httpProxy": proxy,
                    "ftpProxy": proxy,
                    "sslProxy": proxy,
                    "proxyType": "MANUAL",
                }
                break

        options = webdriver.ChromeOptions()
        # Value 2 for this content setting - presumably "block
        # notifications"; TODO confirm against Chrome preference docs.
        prefs = {"profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        # options.add_argument('--headless')
        options.add_argument('--start-maximized')
        options.add_argument('disable-infobars')
        options.add_argument('--disable-extensions')
        options.add_argument("user-agent=Chrome/80.0.3800.23")
        options.add_argument('--no-sandbox')
        options.binary_location = GOOGLE_CHROME_BIN
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')

        if hasattr(self, 'browser'):
            # A browser already exists: reuse its session.
            executor_url = self.browser.command_executor._url
            session_id = self.browser.session_id
            browser = attach_to_session(executor_url, session_id, options,
                                        proxy)
            # Imported here rather than at module level - presumably to
            # avoid a circular import; TODO confirm.
            from twitter_parsing.twitter_parse import send
            send("browser succesfully attached end session!")
        else:
            browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH,
                                       chrome_options=options)

        browser.set_window_size(320, 9999)
        return browser
 def setUp(self):
     """
     Initialize several class instances.
     """
     keywords = Keywords()