def __init__(self, use_proxy=False):
    """
    Prepare the parser: launch the browser, remember where the telegram
    channel list lives and create a ``Keywords`` instance.

    :param use_proxy: bool, passed through to ``browser_setup``
    """
    browser = self.browser_setup(use_proxy=use_proxy)
    self.browser = browser
    self.tg_sources_path = 'telegram_parsing/channels.txt'
    self.keywords = Keywords()
def add_keyword(self, new_word):
    """
    Add a keyword to the user.

    The word is appended to ``self.keywords``, added to a ``Keywords``
    instance, and the user's keyword list is then written to the ``users``
    collection under the document matching ``self.username``.

    :param new_word: keyword to add
    :return: None
    """
    self.keywords.append(new_word)
    Keywords().add(new_word)

    selector = {"name": self.username}
    change = {"$set": {"keywords": self.keywords}}
    mongo.db.users.update(selector, change)
def check_user_weight(self):
    """
    Pick the most popular link per rank among the user's keywords.

    For each source ('telegram', 'twitter') and each rank below
    ``config.NUMBER_WORDS``, count how many of the user's keywords hold the
    same link at that rank, then keep the most frequent entry that was not
    already picked at an earlier rank. The resulting list (one item per
    rank, ``''`` where nothing could be picked) is stored in
    ``self.weights[source]``.

    :return: None
    """
    for source in ['telegram', 'twitter']:
        new_links = []
        words = Keywords()
        for now in range(config.NUMBER_WORDS):
            counts = {}   # entry[1] -> number of keywords holding it at this rank
            entries = {}  # entry[1] -> the full link entry
            for name in self.keywords:
                keyword = words[name]
                try:
                    entry = keyword.links[source][now]
                    # presumably entry[1] is the link itself - TODO confirm
                    key = entry[1]
                except IndexError:
                    # This keyword has no (complete) entry at this rank.
                    continue
                # Read and write the counter under the same key; the
                # previous code read it back with entry[0], so a count
                # could never grow past 1.
                counts[key] = counts.get(key, 0) + 1
                entries[key] = entry

            maxi = 0
            max_link = ''
            for key, count in counts.items():
                try:
                    if count > maxi and key not in [x[1] for x in new_links]:
                        maxi = count
                        max_link = entries[key]
                except IndexError:
                    # NOTE(review): an earlier '' placeholder in new_links
                    # makes x[1] raise, which skips every candidate for this
                    # rank. Kept as-is; confirm this is intended.
                    pass
            new_links.append(max_link)
        self.weights[source] = new_links
def get_full_data(self):
    """
    Collect the data handed over to the web page.

    :return: dict mapping each of the user's keywords to the result of
        that keyword's ``get_info()``
    """
    store = Keywords()
    return {word: store[word].get_info() for word in self.keywords}
def get_user_weight(self, link):
    """
    Sum the weight of a link over all keywords of this user.

    Keywords whose ``links_dict`` does not contain the link contribute
    nothing.

    :param link: str
    :return: int
    """
    words = Keywords()
    per_keyword = (words.keywords[name].links_dict for name in self.keywords)
    return sum(weights[link] for weights in per_keyword if link in weights)
class Parser:
    """
    Main Parser class.

    Holds the Selenium browser, the path to the telegram channel list and
    a ``Keywords`` instance.
    """

    def __init__(self, use_proxy=False):
        """
        Parser initialisation by launching the browser.

        :param use_proxy: bool, passed through to ``browser_setup``
        """
        self.browser = self.browser_setup(use_proxy=use_proxy)
        self.tg_sources_path = 'telegram_parsing/channels.txt'
        self.keywords = Keywords()

    def parse_telegram(self):
        """
        Launches telegram channels parsing.

        :return: None
        """
        # The bare name resolves to the module-level ``parse_telegram``
        # function, not to this method.
        parse_telegram(self)

    def parse_twitter(self):
        """
        Launches twitter parsing.

        :return: None
        """
        # The bare name resolves to the module-level ``parse_twitter``
        # function, not to this method.
        parse_twitter(self)

    @staticmethod
    def by_class(search_in, class_name, get_text=False, parse_all=False):
        """
        Search for an element by its class_name.

        :param search_in: Selenium WebElement to search inside
        :param class_name: str
        :param get_text: bool, return text instead of the element(s)
        :param parse_all: bool, return every match instead of the first one
        :return: the WebElement (or list of WebElements when ``parse_all``),
            its text (or list of texts) when ``get_text`` is set, or None
            if the lookup raised NoSuchElementException
        """
        try:
            if parse_all:
                result = search_in.find_elements_by_class_name(class_name)
            else:
                result = search_in.find_element_by_class_name(class_name)
        except NoSuchElementException:
            return None

        if result and get_text:
            if parse_all:
                # A list has no ``.text``; take the text of every match.
                return [element.text for element in result]
            return result.text
        return result

    def quit(self):
        """
        Quit parsing by safely closing and quitting the browser.

        :return: None
        """
        self.browser.close()
        self.browser.quit()

    def new_link(self, text, link, source, info):
        """
        Pass a parsed post on to the keyword store.

        All arguments are forwarded unchanged to
        ``self.keywords.add_new_link``.

        :param text: str
        :param link: str
        :param source: forwarded as-is (presumably 'telegram' or 'twitter'
            - TODO confirm)
        :param info: forwarded as-is
        :return: None
        """
        self.keywords.add_new_link(text, link, source, info)

    def browser_setup(self, iterator=0, update_proxies=False, use_proxy=True):
        """
        Initial browser setup

        :param iterator: int, index in ``PROXIES`` of the first proxy to try
        :param update_proxies: bool, refresh ``PROXIES`` before choosing
        :param use_proxy: bool, configure Chrome with the first proxy that
            passes ``is_bad_proxy``
        :return: Selenium WebDriver
        """
        global PROXIES, REQ_PROXY
        if update_proxies:
            REQ_PROXY.__init__()
            PROXIES = REQ_PROXY.get_proxy_list()

        # Stays None when proxies are disabled or none of them is usable.
        proxy = None
        if use_proxy:
            # Walk forward from ``iterator``; the previous loop re-tested
            # PROXIES[iterator] on every pass and never moved on.
            for current_proxy in PROXIES[iterator:]:
                address = current_proxy.get_address()
                if is_bad_proxy(address):
                    logger.warning("%s is a BAD PROXY", address)
                    continue
                logger.info("Chosen Proxy: %s", address)
                webdriver.DesiredCapabilities.CHROME['proxy'] = {
                    "httpProxy": address,
                    "ftpProxy": address,
                    "sslProxy": address,
                    "proxyType": "MANUAL",
                }
                proxy = address
                break

        options = webdriver.ChromeOptions()
        prefs = {"profile.default_content_setting_values.notifications": 2}
        options.add_experimental_option("prefs", prefs)
        # options.add_argument('--headless')
        options.add_argument('--start-maximized')
        options.add_argument('disable-infobars')
        options.add_argument('--disable-extensions')
        options.add_argument("user-agent=Chrome/80.0.3800.23")
        options.add_argument('--no-sandbox')
        options.binary_location = GOOGLE_CHROME_BIN
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-dev-shm-usage')

        if hasattr(self, 'browser'):
            # A browser already exists on this instance: attach to its
            # session instead of starting a new Chrome.
            executor_url = self.browser.command_executor._url
            session_id = self.browser.session_id
            browser = attach_to_session(executor_url, session_id, options, proxy)
            from twitter_parsing.twitter_parse import send
            send("browser succesfully attached end session!")
        else:
            browser = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH,
                                       chrome_options=options)

        browser.set_window_size(320, 9999)
        return browser
def setUp(self):
    """
    Initialize several class instances.

    NOTE(review): the ``Keywords`` instance is bound to a local name only
    and is not stored on ``self`` - confirm whether it is created purely
    for its side effects.
    """
    keyword_store = Keywords()