def request(self, entry_url):
    try:
        # Some sanity checks
        if self.timeout < 0:
            raise ValueError("Timeout param can take only positive values")
        if not isinstance(self.strict, bool):
            raise ValueError("Strict param can take only boolean values")
        entry_url = clean_url(entry_url)
        # Wait for some time before raising a Timeout exception
        page = requests.get(entry_url, timeout=self.timeout)
        mime_type = page.headers['content-type']
        page.raise_for_status()
        # Stop if the number of pages visited is exceeded
        if self.count_exceeded():
            return
        if page is not None and remove_protocol(entry_url) not in self.visits:
            # Add the page to the visits list
            self.visits.add(clean_url(remove_protocol(entry_url)))
            soup = BeautifulSoup(page.text, 'lxml')
            if 'text/html' in mime_type:
                # If the page is HTML, delegate it to HTMLPageRetriever
                pr = HTMLPageRetriever(self.strict)
                pr.add_links(self.crawler_queue, self.directory, entry_url, soup)
            elif 'text/xml' in mime_type:
                # If the page is XML, delegate it to SiteMapRetriever
                sr = SiteMapRetriever(self.strict)
                sr.add_links(self.crawler_queue, self.directory, entry_url, soup)
            else:
                return
            print "--> " + entry_url
        return
    except requests.exceptions.ConnectionError:
        self.visits.add(clean_url(remove_protocol(entry_url)))
        print "Ignoring " + entry_url + ", URL might be incorrect"
        return None
    except requests.exceptions.Timeout:
        self.visits.add(clean_url(remove_protocol(entry_url)))
        print "Ignoring " + entry_url + ", timeout error"
        return None
    except requests.exceptions.RequestException as e:
        self.visits.add(clean_url(remove_protocol(entry_url)))
        print "Ignoring: " + entry_url + ", " + str(e)
        return None
    except RuntimeError as e:
        print e
        return None
def search(self, query):
    ret = []
    if query is None or len(query) == 0:
        return ret
    url = root + "/search-all?query=" + query
    u = urllib2.urlopen(url)
    page = u.read()
    result = page.find(product[0])
    while result != -1:
        # Locate each field between its opening and closing markers
        link_start = page.find(link[0], result + 1) + len(link[0])
        link_end = page.find(link[1], link_start + 1)
        img_start = page.find(img[0], link_end + 1) + len(img[0])
        img_end = page.find(img[1], img_start + 1)
        name_start = page.find(name[0], img_end + 1) + len(name[0])
        name_end = page.find(name[1], name_start + 1)
        price_start = page.find(price[0], name_end + 1) + len(price[0])
        price_end = page.find(price[1], price_start + 1)
        result = page.find(product[0], result + 1)
        url = root + page[link_start:link_end]
        cleaned_url = clean_url(url)
        d = {
            "link": cleaned_url,
            "img": page[img_start:img_end],
            "name": page[name_start:name_end],
            "price": int(page[price_start:price_end]),
        }
        ret.append(d)
    u.close()
    return ret
def get_uri(self, query, params=None, **kwargs):
    """Get the request url"""
    if isinstance(query, basestring):
        query = YQLQuery(query)
    query_params = self.get_query_params(query, params, **kwargs)
    # Validate the token before touching its attributes
    token = kwargs.get("token")
    if not token:
        raise ValueError("Without a token three-legged-auth cannot be"
                         " carried out")
    if hasattr(token, "yahoo_guid"):
        query_params["oauth_yahoo_guid"] = getattr(token, "yahoo_guid")
    yql_logger.debug("query_params: %s", query_params)
    http_method = query.get_http_method()
    url = self.endpoint
    oauth_request = oauth.Request.from_consumer_and_token(
        self.consumer, http_url=url, token=token,
        parameters=query_params, http_method=http_method)
    yql_logger.debug("oauth_request: %s", oauth_request)
    # Sign the request
    sig = self.get_signature(url)
    oauth_request.sign_request(sig, self.consumer, token)
    yql_logger.debug("oauth_signed_request: %s", oauth_request)
    url = oauth_request.to_url()
    url = clean_url(url)
    return url.replace('+', '%20').replace('%7E', '~')
def _insert_missing_har_urls(self, urls):
    result = dict()
    if len(urls) > 0:
        # Normalise every url and build "WHERE url=? or url=? ..." with
        # one placeholder per url
        urls[0] = utils.clean_url(urls[0], False)
        custom_condition = ' WHERE url=?'
        for i in range(1, len(urls)):
            urls[i] = utils.clean_url(urls[i], False)
            custom_condition += ' or url=? '
        tmp = self.custom_select_from_table('har_urls', ['url', 'id'],
                                            custom_condition, tuple(urls))
        for row in tmp:
            result[row[0]] = row[1]
            urls.remove(row[0])
    if len(urls) > 0:
        # Insert the urls that were not already stored, then re-select their ids
        urls_to_insert = [(url, 0) for url in urls]
        self.insert_data('har_urls', ['url', 'is_advertising'], urls_to_insert)
        custom_condition = ' WHERE url=?'
        for i in range(1, len(urls)):
            custom_condition += ' or url=? '
        tmp = self.custom_select_from_table('har_urls', ['url', 'id'],
                                            custom_condition, tuple(urls))
        for row in tmp:
            result[row[0]] = row[1]
    return result
def compose_tweets(self, message):
    words = message.split(" ")
    tweets = []
    current = ""
    running_total = 0
    for index in range(len(words)):
        word = words[index].strip()
        next_word = None
        current += clean_url(word)
        running_total += self.get_length_for_word(word)
        # print '[{}] "{}"'.format(running_total, current.encode('utf-8'))
        if index + 1 < len(words):
            next_word = words[index + 1].strip()
            current += " "
            running_total += 1
        if next_word:
            if running_total + self.get_length_for_word(next_word) > 130:
                tweets.append(current)
                current = ""
                running_total = 0
        else:
            tweets.append(current)
            break
    if len(tweets) > 1:
        for index in range(len(tweets)):
            tweets[index] += "({}/{})".format(index + 1, len(tweets))
    return tweets
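# A hypothetical usage sketch for compose_tweets, not from the original
# project: it assumes `composer` is an instance of the class above whose
# get_length_for_word(word) simply returns len(word).
message = " ".join("word{}".format(i) for i in range(60))
tweets = composer.compose_tweets(message)
for tweet in tweets:
    print(tweet)  # when split, each chunk ends with an "(i/n)" counter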
def generate_chain(self, message):
    """Generates a Markov chain from a message"""
    words = message.split()
    words.append(self.STOPWORD)
    words.insert(0, self.STOPWORD)
    # find URLs, neaten them up
    for i in range(0, len(words)):
        words[i] = clean_url(words[i])
    if '<{}>'.format(self.users[self.BOT_ID]) in words[1]:
        del words[1]
    if len(words) < 2:
        return ''
    # remove stuff we don't know
    wordpair = ''
    index = 0
    seedcandidates = []
    while index < len(words) - 1:
        wordpair = words[index] + ' ' + words[index + 1]
        if self.dictionary.has_key(wordpair):
            seedcandidates.append(wordpair)
        index = index + 1
    if len(seedcandidates) == 0:
        return ''
    chain = ''
    seed = random.choice(seedcandidates)
    # forwards
    wordpair = seed
    if self.dictionary.has_key(wordpair):
        chain = wordpair
    #print wordpair
    while (wordpair.split()[1] != self.STOPWORD) and (self.dictionary.has_key(wordpair)):
        wordpair = wordpair.split()[1] + ' ' + \
            choose_word_from_list(self.dictionary.get(wordpair)[1])
        #print wordpair
        chain = chain + ' ' + wordpair.split()[1]
    # backwards
    wordpair = seed
    if self.dictionary.has_key(wordpair) and wordpair.split()[0] != self.STOPWORD:
        wordpair = choose_word_from_list(
            self.dictionary.get(wordpair)[0]) + \
            ' ' + wordpair.split()[0]
    # so we don't have the seed twice
    while (wordpair.split()[0] != self.STOPWORD) and (self.dictionary.has_key(wordpair)):
        #print wordpair
        chain = wordpair.split()[0] + ' ' + chain
        wordpair = choose_word_from_list(
            self.dictionary.get(wordpair)[0]) + \
            ' ' + wordpair.split()[0]
    return chain.replace(self.STOPWORD, '')
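# Hypothetical shape of self.dictionary, inferred from the lookups above
# rather than taken from the original project: each key is a two-word
# pair, and the value is a (predecessors, successors) tuple of weighted
# word lists consumed by choose_word_from_list. '[STOP]' stands in for
# whatever self.STOPWORD actually is.
example_dictionary = {
    'hello world': (
        [('[STOP]', 3), ('say', 1)],    # words seen before "hello world"
        [('again', 2), ('[STOP]', 1)],  # words seen after "hello world"
    ),
}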
def get_uri(self, query, params=None, **kwargs):
    """Get the request url"""
    if isinstance(query, basestring):
        query = YQLQuery(query)
    params = self.get_query_params(query, params, **kwargs)
    query_string = urlencode(params)
    uri = '%s?%s' % (self.endpoint, query_string)
    uri = clean_url(uri)
    return uri
def get_username_board(url):
    url = clean_url(url)
    m = re.search(r'pinterest.[a-zA-Z.]+?/([^/]+)/([^#\?]+)', url)
    username, board = m.groups()
    board = urllib.parse.unquote(board).strip()
    while board.endswith('/'):
        board = board[:-1].strip()
    return (username, board)
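# Illustrative call with assumed values rather than project fixtures;
# presumes clean_url leaves an already-clean URL untouched.
username, board = get_username_board(
    'https://www.pinterest.com/someuser/my%20travel%20board/')
print(username)  # 'someuser'
print(board)     # 'my travel board' (unquoted, trailing slash stripped)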
def get_uri(self, query, params=None, **kwargs):
    """Get the request url"""
    if isinstance(query, basestring):
        query = YQLQuery(query)
    query_params = self.get_query_params(query, params, **kwargs)
    http_method = query.get_http_method()
    request = self.__two_legged_request(parameters=query_params,
                                        method=http_method)
    url = request.to_url()
    return clean_url(url)
def same_domain_cleanup(self, entry_url, url):
    if url is None:
        return None
    # Do not parse the URL query string
    url = url.split('?')[0]
    # Do not use tokens following '#', since they redirect to the same page
    url = url.split('#')[0]
    if url.startswith('javascript'):
        url = None
    elif url == '/' or url == '':
        url = None
    elif url.startswith('//'):
        url = 'http:' + url
    elif url.startswith("mailto:"):
        # Ignore email links
        url = None
    # Same-domain or relative URL cleanup and conversion into a proper URL string
    elif url.startswith('/'):
        url = clean_url(get_base_url(entry_url)) + url
    elif url.startswith('./'):
        # Drop only the leading '.' so './path' becomes '/path'
        url = clean_url(entry_url) + clean_url(url[1:])
    return url
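# Illustrative expectations for same_domain_cleanup on a hypothetical
# instance `retriever`; assumes get_base_url('http://example.com/a/b')
# returns 'http://example.com' and that clean_url is a no-op on clean input.
entry = 'http://example.com/a/b'
retriever.same_domain_cleanup(entry, '/contact')              # 'http://example.com/contact'
retriever.same_domain_cleanup(entry, '//cdn.site.com/app.js') # 'http://cdn.site.com/app.js'
retriever.same_domain_cleanup(entry, 'mailto:a@b.com')        # None
retriever.same_domain_cleanup(entry, 'page#top')              # 'page'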
def make_request(url):
    out['text'] = 'Aguarde...Carregando...'  # "Please wait... Loading..."
    response = None
    try:
        response = requests.get(clean_url(url))
    except requests.exceptions.RequestException as e:
        # catastrophic error. bail.
        out['text'] = 'Erro na Requisicao'  # "Request error"
    requestLinkArray = RequestLinkArray(url, response)
    out['text'] = requestLinkArray
def _retrieve_outbound_links(self):
    result = dict()
    principal_domain = utils.get_principal_domain(self._url)
    # XPath: elements with an href that does not contain the principal domain
    xpath_expr = "//*[@href and not(@href [contains(., '%s')])]" % principal_domain
    elements_with_urls = self._tree_explorer.xpath(self.body_node, xpath_expr)
    for element in elements_with_urls:
        href = element.attrib['href']
        if utils.is_valid_url(href):
            href = utils.clean_url(href)
            if href not in result:
                result[href] = ''
    return list(result.keys())
def _prepare_tuple_failed_work(self, work_data_container):
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    error_text = work_data_container.error_text
    return scraped_flag, attempts_count, mime_type, response_code, None, url_to_refer, \
        None, False, None, None, None, \
        None, None, None, None, None, error_text, url, 0
def _get_canonical_url(self):
    result = None
    try:
        tmp_res = self.driver.find_element_by_xpath(
            '//link[@rel="canonical" and @href]')
        if tmp_res:
            href = tmp_res.get_attribute("href")
            if href:
                # domain = utils.get_principal_domain(self.current_url)
                result = href
    except (NoSuchElementException, TimeoutException):
        pass
    except Exception:
        pass
    if result is None:
        # Fall back to Open Graph / Twitter card url metadata
        try:
            tmp_res = self.driver.find_element_by_xpath(
                '//meta[@property="og:url"]|//meta[@name="twitter:url"]')
            result = tmp_res.get_attribute('content')
        except (NoSuchElementException, TimeoutException):
            pass
        except Exception:
            pass
    if result:
        result = utils.clean_url(result, False)
        tmp = utils.clean_url(self.current_url, False)
        scheme, u = utils.split_url_and_scheme(tmp)
        if result.startswith('//'):
            # Protocol-relative URL: reuse the current page's scheme
            result = '{}:{}'.format(scheme, result)
        elif result.startswith('/'):
            # Relative URL: prepend the current scheme and domain
            domain = '{}://{}'.format(scheme,
                                      utils.get_principal_domain_www(tmp))
            result = '{}{}'.format(domain, result)
        if not utils.is_valid_url_to_navigate(result):
            result = None
    return result
def _prepare_tuple_without_article(self, work_data_container):
    har = None
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    pagecontent = work_data_container.page_content_container
    return scraped_flag, attempts_count, mime_type, \
        response_code, pagecontent.language, url_to_refer, pagecontent.text, \
        False, None, None, None, None, None, None, None, har, None, url, 0
def _extract_links_from_a_tags_in_text(self, text):
    """
    Extract supplement links from the html text that contains
    <a> tags with href attribute.

    @param text: HTML text.
    @type text: str

    @return: Dictionary with supplement links grouped by extension.
    @rtype: {
        '<extension1>': [
            ('<link1>', '<title1>'),
            ('<link2>', '<title2>')
        ],
        '<extension2>': [
            ('<link3>', '<title3>'),
            ('<link4>', '<title4>')
        ]
    }
    """
    soup = BeautifulSoup(text)
    links = [item['href'].strip()
             for item in soup.find_all('a') if 'href' in item.attrs]
    links = sorted(list(set(links)))
    supplement_links = {}

    for link in links:
        filename, extension = os.path.splitext(clean_url(link))
        # Some courses put links to sites in supplement section, e.g.:
        # http://pandas.pydata.org/
        if extension == '':
            continue

        # Make lowercase and cut the leading/trailing dot
        extension = clean_filename(
            extension.lower().strip('.').strip(),
            self._unrestricted_filenames)
        basename = clean_filename(
            os.path.basename(filename),
            self._unrestricted_filenames)
        if extension not in supplement_links:
            supplement_links[extension] = []
        # Putting basename into the second slot of the tuple is important
        # because that will allow to download many supplements within a
        # single lecture, e.g.:
        # 01_slides-presented-in-this-module.pdf
        # 01_slides-presented-in-this-module_Dalal-cvpr05.pdf
        # 01_slides-presented-in-this-module_LM-3dtexton.pdf
        supplement_links[extension].append((link, basename))

    return supplement_links
def _add_asset(name, url, destination):
    filename, extension = os.path.splitext(clean_url(name))
    if extension == '':
        return

    # Make lowercase and cut the leading/trailing dot
    extension = clean_filename(extension.lower().strip('.').strip(),
                               self._unrestricted_filenames)
    basename = clean_filename(os.path.basename(filename),
                              self._unrestricted_filenames)
    url = url.strip()

    if extension not in destination:
        destination[extension] = []
    destination[extension].append((url, basename))
def add_links(self, process_queue, directory, entry_url, soup):
    """
    Extract the valid links from the provided URL

    Parameters
    ----------
    process_queue: Queue
    directory: dictionary
        Dictionary to store the parent and child url mappings
    entry_url: string
        URL to extract the hyperlinks
    soup : BeautifulSoup extract
    """
    link_set = soup.find_all("url")
    # If no url tag is found in the XML, it is not a sitemap or it is
    # malformed (find_all returns an empty ResultSet, never None)
    if not link_set:
        raise RuntimeError("Ignoring " + entry_url + ", malformed xml/sitemap")
    directory[entry_url] = set()
    domain = remove_protocol(get_base_url(entry_url))
    for link in link_set:
        # Find the loc tags in the XML and clean them for relative paths
        link = link.findNext("loc")
        if link is not None:
            url = self.same_domain_cleanup(entry_url, link.text)
            if url is not None:
                # If the strict flag is set, ignore urls from other domains
                if self.strict_domain and remove_protocol(get_base_url(url)) != domain:
                    continue
                if url != entry_url:
                    # Load all the child URLs for further processing
                    process_queue.put(clean_url(url))
                # Register a page, along with its child URLs, to be shown / saved as a file
                directory[entry_url].add(url)
    # Convert the set into a list for serialising
    directory[entry_url] = list(directory[entry_url])
def _parse_rss_entry(self, entry, language, feed_sections):
    title = get_attr_dinamically(entry, 'title')
    link = get_attr_dinamically(entry, 'link')
    link = utils.clean_url(link, remove_arguments=False)
    article_date = self._get_parsed_dates_from_object(
        entry, 'published_parsed')
    article_container = ArticleContainer(url=link,
                                         title=title,
                                         publish_date=article_date,
                                         top_img=None,
                                         sections=[feed_sections])
    extracted = PageContentContainer(None,
                                     url=link,
                                     article_c=article_container,
                                     language=language)
    self.data_collector.add_extracted_data(
        link, 0, 0, 'text/html', 0, page_content_container=extracted)
def process(self):
    try:
        if self.url_count is None:
            self.set_maximum()
        # Process until interrupted / the count is matched
        while True:
            # Limit the number of threads to the user-specified value;
            # enumerate always returns the child threads along with the
            # main thread, so ignore the latter
            if (len(threading.enumerate()) - 1) < self.multi:
                # If the count is exceeded, stop processing
                if self.count_exceeded():
                    break
                # Wait up to the timeout to retrieve from the queue
                page = clean_url(
                    self.crawler_queue.get(block=True, timeout=self.timeout))
                # If the node hasn't been visited yet, proceed
                if remove_protocol(page) not in self.visits:
                    try:
                        # Collect URLs from the page
                        threading.Thread(target=self.request, args=[page]).start()
                    except Exception as e:
                        print str(e)
                        break
            else:
                # Give the running threads some time to finish, since the number
                # of spawned threads should stay within the user-specified value
                time.sleep(2)
    except KeyboardInterrupt:
        print '\n\n---------------------------'
        print "\nFinishing the running jobs..."
        print '\n-----------------------------'
    except Queue.Empty:
        print "\nDone.."
    except ValueError as e:
        print str(e)
    finally:
        # Join all existing threads to the main thread
        for thread in threading.enumerate():
            if thread is not threading.currentThread():
                thread.join(self.timeout)
    return self.directory
def retrieve_domain_links(self):
    result = dict()
    # principal_domain = utils.get_principal_domain(self._url)
    # regex = "//*[regexp:test(@href, '^(https?://)?(www\.)?.*%s', 'i')]" % principal_domain
    # elements_with_urls = self.root.xpath(regex)
    expression = "//a[contains(@href, '%s')]" % self.domain
    elements_with_urls = self._tree_explorer.xpath(self.body_node, expression)
    for element in elements_with_urls:
        href = element.attrib['href']
        href = utils.clean_url(href,
                               remove_arguments=False,
                               domain=self.domain,
                               scheme=self.scheme)
        if utils.is_valid_url_to_navigate(href):
            if utils.is_domain_link(href, self.domain):
                if href not in result:
                    result[href] = ''
    return list(result.keys())
def _prepare_tuple_with_article(self, work_data_container):
    har = None
    url = utils.clean_url(work_data_container.url, False)
    scheme, url = utils.split_url_and_scheme(url)
    scraped_flag = work_data_container.scraped
    attempts_count = work_data_container.attempts_count
    mime_type = work_data_container.mime_type
    response_code = work_data_container.http_response_code
    url_to_refer = work_data_container.url_to_refer
    pagecontent = work_data_container.page_content_container
    art_container = pagecontent.article_c
    videos = ','.join(art_container.videos)
    authors = ','.join(art_container.authors)
    sections = ','.join(art_container.sections)
    publish_date = art_container.publish_date
    if publish_date and isinstance(publish_date, datetime.datetime):
        publish_date = utils.convert_datetime_to_format_str(publish_date)
    return scraped_flag, attempts_count, mime_type, response_code, pagecontent.language, url_to_refer, \
        pagecontent.text, True, art_container.title, \
        art_container.text, publish_date, \
        art_container.top_img, videos, authors, sections, har, None, url, 0
def analyze_entry(site, e):
    if hasattr(e, 'published_parsed') and e.published_parsed:
        timestamp = time.mktime(e.published_parsed)
    elif hasattr(e, 'updated_parsed') and e.updated_parsed:
        timestamp = time.mktime(e.updated_parsed)
    else:
        return False
    if timestamp > time.time():
        timestamp = time.time()
    if hasattr(e, "content"):
        content = e.summary
    else:
        content = e.description
    try:
        g = re.search(r'Noticia en Menéame: (http://menea.me/(\w+)) ', content)
    except Exception:
        return False
    if g:
        # The short id is base-36 encoded
        id = int(g.group(2), 36)
        original_url = g.group(1)
        entry = dict()
        entry["site"] = site.encode('ascii', 'xmlcharrefreplace')
        entry["title"] = e.title.encode('ascii', 'xmlcharrefreplace')
        entry['url'] = clean_url(e.link)
        entry['ts'] = int(timestamp)
        entry['id'] = id
        res = store(site, entry)
        if res:
            post = "%s: «%s» %s (%s)" % (
                entry["site"], entry["title"], entry['url'], original_url)
            print "Posting", post
            if not post_note(post):
                print "Error posting"
        return res
    return False
def add_links(self, process_queue, directory, entry_url, soup):
    """
    Extract the valid links from the provided URL

    Parameters
    ----------
    process_queue: Queue
    directory: dictionary
        Dictionary to store the parent and child url mappings
    entry_url: string
        URL to extract the hyperlinks
    soup : BeautifulSoup extract
    """
    link_set = soup.find_all('a', href=True)
    directory[entry_url] = set()
    domain = remove_protocol(get_base_url(entry_url))
    for link in link_set:
        # Find the urls in the soup extract and clean them for relative paths
        url = self.same_domain_cleanup(entry_url, link.get('href'))
        if url is not None:
            # If the strict flag is set, ignore urls from other domains
            if self.strict_domain and remove_protocol(get_base_url(url)) != domain:
                continue
            if url != entry_url:
                # Load all the child URLs for further processing
                process_queue.put(clean_url(url))
            # Register a page, along with its child URLs, to be shown / saved as a file
            directory[entry_url].add(url)
    # Convert the set into a list for serialising
    directory[entry_url] = list(directory[entry_url])
    # (tail of the dictionary-loading method)
    infile = open('Markov_Dict.pkl', 'r')
    self.dictionary = pickle.load(infile)
    infile.close()
    self.dictLock.release()

def toggle_learn(self):
    """Toggles the learning state"""
    self.isLearning = not self.isLearning

def clean_urls_in_dictionary(self):
    self.dictLock.acquire()
    newdict = copy.deepcopy(self.DEFAULT_DICTIONARY)
    for key in self.dictionary:
        # Clean the urls in both the predecessor and successor lists
        firsts = self.dictionary.get(key)[0]
        for i in range(0, len(firsts)):
            firsts[i] = (clean_url(firsts[i][0]), firsts[i][1])
        seconds = self.dictionary.get(key)[1]
        for i in range(0, len(seconds)):
            seconds[i] = (clean_url(seconds[i][0]), seconds[i][1])
        # Clean the urls in the key pair itself
        newkey = clean_url(key.split()[0])
        if len(key.split()) > 1:
            newkey = newkey + ' ' + clean_url(key.split()[1])
        newdict[newkey] = (firsts, seconds)
    self.dictionary = newdict
    self.dictLock.release()

def word_index_in_list(findword, word_list):
    """Get the index of a word in a list"""
    for index in range(len(word_list)):
        if word_list[index][0] == findword:
            return index
def page_ad_matching(self, url):
    """
    Function that matches URL to existing advertisers by relevance scores.

    :param url: Input URL
    :return: Advertisers, sorted by relevance scores.
    """
    self.logger.debug('Keyword_2_company: ' + str(self.keyword_to_company))
    self.logger.debug('ADVECS: ' + str(self.ad_keywords))
    if url in self.articles:
        cleaned = self.articles[url]
    else:
        cleaned = clean_url(url)
    text = cleaned['title'] + '. ' + cleaned['text']
    # if self.params['use_sentiment']:
    #     candidates = self.all_companies.copy()
    # else:
    #     candidates = self.all_companies_scalar.copy()
    # triggers = {}
    doc_vec = {}
    bid_companies = set()
    blacklisted_companies = set()
    scores = self.all_companies_scalar.copy()

    # Fetching target companies.
    targets = self.get_target_company(cleaned['title'])
    self.logger.debug('-----------' + str(targets))
    self.logger.debug('-----------' + cleaned['title'])
    # Reverting back to regular analysis if no targets are found.
    if len(targets) == 0:
        self.params['targeted_sent'] = False

    first_chunk = True
    sentiment = np.array([0, 0])
    for chunk in self.tokenizer.get_chunk(text):
        # Sentiment extraction from the chunk
        if self.params['use_sentiment']:
            sentiment, tokens = self.analyzer_obj.get_sentiment_from_text(chunk)
            if first_chunk:
                # Fixed attention weight for the first chunk.
                sentiment = 2 * sentiment
            self.logger.debug('Sentiment extracted: ' + str(sentiment))
            self.logger.debug(chunk)
        else:
            tokens = self.analyzer_obj.preprocess(chunk, join_negatives=False)
        for idx, token in enumerate(tokens):
            # Bidding companies
            try:
                companies = self.keyword_to_company[token]
                self.logger.debug('Found trigger: ' + token)
                if self.params['use_sentiment']:
                    if not np.any(sentiment):
                        # Neutral sentiment
                        sentiment += np.array([0.5, 0])
                    APNEA.add_to_dict(doc_vec, token, sentiment)
                else:
                    APNEA.add_to_dict(doc_vec, token, 1)
                bid_companies |= companies
            except KeyError:
                pass
            try:
                # Blacklist companies.
                companies = self.blacklist_to_company[token]
                blacklisted_companies |= companies
            except KeyError:
                try:
                    # Handling two-word blacklist phrases.
                    if idx + 1 < len(tokens):
                        companies = self.blacklist_to_company[
                            token + ' ' + tokens[idx + 1]]
                        blacklisted_companies |= companies
                except KeyError:
                    pass
        if first_chunk:
            first_chunk = False

    # Calculating the relevance scores, based on the system configuration.
    for c in bid_companies:
        ad_vec = self.ad_keywords[c]
        doc_vec_copy = doc_vec.copy()
        if self.params['use_sentiment']:
            for k in doc_vec_copy:
                if self.params['targeted_sent']:
                    if c in targets:
                        if self.params['neg_sent'] and k in ad_vec and self.ad_negatives[c][k]:
                            # Targeted but sentiment-insensitive.
                            doc_vec_copy[k] = abs(self.score_sentiment(
                                doc_vec_copy[k], method=self.params['scorer']))
                        else:
                            # Targeted and sentiment-sensitive.
                            doc_vec_copy[k] = self.score_sentiment(
                                doc_vec_copy[k], method=self.params['scorer'])
                    else:
                        # Non-targets are taken as absolute values.
                        doc_vec_copy[k] = abs(self.score_sentiment(
                            doc_vec_copy[k], method=self.params['scorer']))
                else:
                    if self.params['neg_sent'] and k in ad_vec and self.ad_negatives[c][k]:
                        # Sentiment-insensitive.
                        doc_vec_copy[k] = abs(self.score_sentiment(
                            doc_vec_copy[k], method=self.params['scorer']))
                    else:
                        # Sentiment-sensitive.
                        doc_vec_copy[k] = self.score_sentiment(
                            doc_vec_copy[k], method=self.params['scorer'])
        self.logger.debug('Company========>: ' + str(c))
        self.logger.debug('ad_vec: ' + str(ad_vec))
        self.logger.debug('doc_vec: ' + str(doc_vec))
        self.logger.debug('doc_vec_copy: ' + str(doc_vec_copy))
        scores[c] = self.cosine(ad_vec, doc_vec_copy)

    # Handling blacklisted companies.
    if self.params['blacklist']:
        for blacklisted_company in blacklisted_companies:
            scores[blacklisted_company] = -abs(scores[blacklisted_company])
    self.logger.debug('scores: ' + str(scores))
    # Scoring and sorting.
    scored = sorted(scores.iteritems(), key=lambda x: x[1], reverse=True)
    return scored
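# A plausible sketch of the cosine helper called above, shown as a free
# function and assuming both arguments are sparse {token: weight} dicts;
# the project's actual implementation is not included, so this is an
# assumption.
import math

def cosine(a, b):
    dot = sum(a[k] * b[k] for k in set(a) & set(b))
    norm_a = math.sqrt(sum(v * v for v in a.values()))
    norm_b = math.sqrt(sum(v * v for v in b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)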
def start_requests(self):
    for url in self.urls:
        yield scrapy.Request(url=utils.clean_url(url),
                             callback=self.parse_url,
                             meta={'key': url})
def get(self, url, https=False):
    url = clean_url(url, self.base_url, https=https)
    self.driver.get(url)
try:
    source = newspaper.build(url, config=config)
except Exception as e:
    if PY_ENV == 'development':
        print('(SOURCE ERROR) Source Skipped\n')
    insert_log(source_id, 'sourceCrawl', 'error',
               float(time.clock() - src_start_time), {
                   'errorMessage': 'SOURCE ERROR',
                   'crawlerName': 'credible crawler'
               })
    continue

error_articles = []
prev_uuid = ''
for article in source.articles:
    url_uuid = get_uuid(clean_url(article.url))
    article.id = url_uuid
    if prev_uuid == url_uuid:
        continue
    if get_one(url_uuid, 'errorArticles') or get_one(url_uuid, 'articles'):
        print('Skipped: ' + article.url)
        error_articles.append(article.id)
    prev_uuid = url_uuid
source.articles = [a for a in source.articles if a.id not in error_articles]
def engage(self):
    print "\nCrawler engaged, to interrupt press Ctrl+C or Command + dot/period"
    print "-" * 20
    # Initiate the crawler by placing the first URL in the processing queue
    self.crawler_queue.put(clean_url(self.root))
    return self.process()
def do_commands(self, target, sender, message, sentByAdmin):
    if sentByAdmin and ('!saveDict' in message):
        try:
            self.save_dictionary()
            self.send_message(target, 'DICTIONARY SAVED SUCCESSFULLY')
        except IOError:
            self.send_message(target, 'DICTIONARY COULD NOT BE SAVED')
        return True
    elif sentByAdmin and ('!loadDict' in message):
        try:
            self.load_dictionary()
            self.send_message(target, 'DICTIONARY LOADED SUCCESSFULLY')
        except IOError:
            self.send_message(target, 'DICTIONARY COULD NOT BE LOADED')
        return True
    elif sentByAdmin and ('!eraseDict' in message):
        self.dictionary = {self.STOPWORD: ([self.STOPWORD], [self.STOPWORD])}
        self.send_message(target, 'DICTIONARY ERASED (NOT SAVED YET)')
        return True
    elif sentByAdmin and ('!learn' in message):
        self.toggle_learn()
        print_message = 'I AM {} LEARNING'
        self.send_message(target,
                          print_message.format('NOW' if self.isLearning else 'NO LONGER'))
        return True
    elif sentByAdmin and ('!cleanURL' in message):
        self.clean_urls_in_dictionary()
        self.send_message(target, 'LINKS IN DICTIONARY HAVE BEEN CLEANED')
        return True
    elif '!search' in message:
        try:
            message = message.lower()
            searchterms = message.split()[1:]
            for i in range(0, len(searchterms)):
                searchterms[i] = clean_url(searchterms[i])
            if len(searchterms) == 1:
                phrases = []
                for key in self.dictionary:
                    if searchterms[0] == key.split()[0] or \
                            (len(key.split()) > 1 and
                             searchterms[0] == key.split()[1]):
                        phrases.append(key)
                self.send_message(target, '"%s" in pairs: %s' %
                                  (searchterms[0], str(phrases)))
            else:
                key = searchterms[0] + ' ' + searchterms[1]
                if self.dictionary.has_key(key):
                    self.send_message(target, '"%s": %s' %
                                      (key, str(self.dictionary.get(key))))
                else:
                    self.send_message(target, '"%s" not found in dictionary' % key)
        except IndexError:
            self.send_message(target, 'MALFORMED COMMAND')
        return True
    elif '!talkback' in message:
        try:
            self.talkBackFreq = float(message.split()[1])
            self.send_message(target, 'RESPONDING PROBABILITY SET TO %.3f' %
                              self.talkBackFreq)
        except (IndexError, TypeError, ValueError):
            self.send_message(target, 'MALFORMED COMMAND')
        return True
    elif sentByAdmin and ('!quit' in message):
        self.quit()
        return True
    elif '!avatar' in message:
        self.send_message(target, 'SOURCE OF MY CURRENT AVATAR: %s' % self.AVATARSOURCE)
        return True
    elif '!nowplaying' in message:
        songname, songartist = self.generate_song()
        self.send_message(target, 'Now Playing: "%s", by %s' %
                          (string.capwords(songname), string.capwords(songartist)))
        return True
    return False  # did not find a command
def init(self):
    self.url = clean_url(self.url)
def init(self):
    self.url = clean_url(self.url)
    url = self.url

    # Determine the type
    if 'bookmark.php?type=user' in url or url.startswith(headers['following']):
        type = 'following'
    elif 'bookmark.php' in url or url.startswith(headers['bookmark']) or '/bookmarks/' in url:
        type = 'bookmark'
    elif 'illust_id=' in url or url.startswith(headers['illust']) or '/artworks/' in url:
        type = 'illust'
    elif 'search.php' in url or url.startswith(headers['search']):
        type = 'search'
        order = query_url(url).get('order', ['date_d'])[0]  # date_d, date, popular_d, popular_male_d, popular_female_d
        scd = query_url(url).get('scd', [None])[0]  # 2019-09-27
        ecd = query_url(url).get('ecd', [None])[0]  # 2019-09-28
        blt = query_url(url).get('blt', [None])[0]  # 5000
        bgt = query_url(url).get('bgt', [None])[0]  # 9999
        type_ = query_url(url).get('type', [None])[0]  # None (all), illust, manga, ugoira
        self.info = {'order': order, 'scd': scd, 'ecd': ecd,
                     'blt': blt, 'bgt': bgt, 'type': type_}
    elif '/tags/' in url:
        type = 'search'
        order = query_url(url).get('order', ['date_d'])[0]
        scd = query_url(url).get('scd', [None])[0]
        ecd = query_url(url).get('ecd', [None])[0]
        blt = query_url(url).get('blt', [None])[0]
        bgt = query_url(url).get('bgt', [None])[0]
        type_ = query_url(url).get('type', [None])[0]  # None (all), illust, manga, ugoira
        if type_ is None:
            try:
                type_ = url.split('/tags/')[1].split('/')[1]
            except IndexError:
                type_ = None
        type_ = {'illustrations': 'illust'}.get(type_, type_)
        self.info = {'order': order, 'scd': scd, 'ecd': ecd,
                     'blt': blt, 'bgt': bgt, 'type': type_}
    elif 'id=' in url and 'mode=' not in url or url.startswith(headers['user']) or 'pixiv.me' in url or '/users/' in url:
        type = 'user'
    else:
        self.Invalid(u'[pixiv] Can not determine type: {}'.format(url))
        return 'stop'

    header = headers[type]
    if 'pixiv.net' in url or 'pixiv.me' in url:
        if not url.startswith('http://') and not url.startswith('https://'):
            url = u'https://' + url
        self.url = url
    else:
        # A bare id was given; rebuild the full URL for the detected type
        url = url.replace('bmk_', '').replace('illust_', '').replace('pixiv_', '').replace('search_', '')
        if type == 'user':
            url = 'https://www.pixiv.net/member_illust.php?id={}'.format(url)
        elif type == 'bookmark':
            url = 'https://www.pixiv.net/bookmark.php?id={}'.format(url)
        elif type == 'illust':
            url = 'https://www.pixiv.net/member_illust.php?mode=medium&illust_id={}'.format(url)
        elif type == 'search':
            url = 'https://www.pixiv.net/search.php?s_mode=s_tag&word={}'.format(url)
            url = clean_url(url)
        else:
            self.Invalid('{}{}: ???'.format(header, url))
            return 'stop'
        self.url = url

    self.print_('PIXIV_TYPE: {}'.format(type))
    self.pixiv_type = type
    try:
        self.api = pixiv_auth.get_api()
        if 'error' in self.api.user_detail(11):
            self.api = pixiv_auth.get_api(force=True)
    except Exception as e:
        self.print_(print_error(e)[0])
        # "Login failed: ... Please set it up in [Options - Settings - Pixiv settings - Login]"
        self.Invalid(tr_('로그인 실패: {}{}\n[옵션 - 설정 - 픽시브 설정 - 로그인] 에서 설정해주세요.').format(header, url))
        return 'stop'
try:
    meta = requests.get(
        'http://localhost:5000/api/exposed/submit/meta?url=http://' + url).json()
except Exception:
    meta = {
        'aboutUsUrl': '',
        'contactUsUrl': '',
    }
info = {
    'isReliable': False,
    'id': get_uuid(clean_url(domain)),
    'brand': s['name'],
    'url': clean_url(domain),
    'socialScore': get_popularity('http://' + url)['totalScore'],
    'worldRank': world_rank,
    'countryRank': country_rank,
    'aboutUsUrl': '' if meta['aboutUsUrl'] in ['http://#', 'https://#']
                  else meta['aboutUsUrl'],
    'contactUsUrl': '' if meta['contactUsUrl'] in ['http://#', 'https://#']
                    else meta['contactUsUrl'],
}
def fix_url(cls, url):
    url = clean_url(url)
    return url.split('?')[0]
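# None of the snippets above ship with the same clean_url: signatures vary
# from clean_url(url) to clean_url(url, False) and
# clean_url(url, remove_arguments=False, domain=..., scheme=...).
# As a reference point only, a minimal stdlib sketch of the simplest
# variant (assumed behavior, not any project's implementation):
from urllib.parse import urlsplit, urlunsplit

def clean_url(url, remove_arguments=True):
    """Trim whitespace, lowercase the scheme/host, drop the fragment,
    and optionally strip the query string."""
    parts = urlsplit(url.strip())
    query = '' if remove_arguments else parts.query
    return urlunsplit((parts.scheme.lower(), parts.netloc.lower(),
                       parts.path, query, ''))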