def get_ratio_of_discr(df: pd.DataFrame, admissibles, outcome, sensitive):
    """Compute adjusted Ratio of Observational Discrimination."""
    dom_sensitive = sorted(get_domain(df, [sensitive]))
    dom_outcome = sorted(get_domain(df, [outcome]))
    if len(dom_sensitive) != 2 or len(dom_outcome) != 2:
        return np.array([1.0])
    s0 = dom_sensitive[0]
    s1 = dom_sensitive[1]
    o0 = dom_outcome[0]
    o1 = dom_outcome[1]
    cont_matrix_iter = contingency_matrices_iterator(df, outcome, sensitive, admissibles)
    rods = []
    for cm in cont_matrix_iter:
        if cm.shape == (2, 2):
            cb = cm[s0][o1] * cm[s1][o0]
            ad = cm[s0][o0] * cm[s1][o1]
            if ad != 0:
                rods.append(cb / ad)
            else:
                rods.append(1.0)
    rods = np.array(rods)
    return rods
def get_source_node(self):
    if 'source_selector' in self.options:
        if self.options['source_selector']:
            nodes = self.doc.cssselect(self.options['source_selector'])
            if len(nodes) == 1:
                return nodes[0]
            for node in nodes:
                res = self.has_source(node)
                if res is not None:
                    return res
    body = self.doc.find('body')
    if body is None:
        return None
    for node in body.iter():
        res = self.has_source(node)
        if res is not None:
            return res
    domain = get_domain(self.url)
    for a in self.doc.iter('a'):
        link = a.get('href')
        if link and link.startswith('http') and get_domain(link) != domain:
            text = self.get_block_text(a)
            if len(text) > 2 and text.endswith(u'报') and not text.endswith(u'举报'):
                return a
def get_source_node(self):
    if 'source_selector' in self.options:
        if self.options['source_selector']:
            nodes = self.doc.cssselect(self.options['source_selector'])
            if len(nodes) == 1:
                return nodes[0]
            for node in nodes:
                res = self.has_source(node)
                if res is not None:
                    return res
    for node in self.doc.find('body').iter():
        res = self.has_source(node)
        if res is not None:
            return res
    domain = get_domain(self.url)
    for a in self.doc.iter('a'):
        link = a.get('href')
        if link and link.startswith('http') and get_domain(link) != domain:
            text = self.get_block_text(a)
            if len(text) > 2 and text.endswith(u'报') and not text.endswith(u'举报'):
                return a
def get_source_node(self):
    if self.options.get('source_selector', ''):
        nodes = self.article.select(self.options['source_selector'])
        if len(nodes) == 1:
            return nodes[0]
        for node in nodes:
            res = self.has_source(node)
            if res is not None:
                return res
    for node in self.article.doc.find('body').iter():
        res = self.has_source(node)
        if res is not None:
            return res
    domain = get_domain(self.article.url)
    for a in self.article.doc.iter('a'):
        link = a.get('href')
        if link and link.startswith('http') and get_domain(link) != domain:
            text = self.article.get_block_text(a)
            if len(text) >= 2:
                if (text.endswith(u'报') and not text.endswith(u'举报')) \
                        or (text[-2:] in CATES and len(text) == 4):
                    return a
def learn_page(request, theme_name=None, template='theme_page.html'):
    theme = get_object_or_404(Theme, name=theme_name)
    theme.layers = theme.layer_set.all().order_by('name')
    context = {
        'theme': theme,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def data_needs(request, template='needs.html'):
    themes = Theme.objects.all().order_by('display_name')
    ordered_themes, theme_dict = add_ordered_needs_lists(themes)
    context = {
        'themes': themes,
        'theme_dict': theme_dict,
        'ordered_themes': ordered_themes,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def data_needs(request, template="needs.html"): themes = Theme.objects.all().order_by("display_name") ordered_themes, theme_dict = add_ordered_needs_lists(themes) context = { "themes": themes, "theme_dict": theme_dict, "ordered_themes": ordered_themes, "domain": get_domain(8000), "domain8010": get_domain(), } return render_to_response(template, RequestContext(request, context))
def csw_listing(request, template='pycsw_catalog_view.html'):
    if logger:
        logger.info("Start csw_listing")
    csw_recs = pycsw_records.objects.using('pycsw_test').all().order_by('organization')
    html_id = 0
    for rec in csw_recs:
        rec.html_id = html_id
        html_id += 1
    context = {
        'records': csw_recs,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    if logger:
        logger.info("End csw_listing")
    return render_to_response(template, RequestContext(request, context))
def topic_page(request, topic_name=None, template='topic_page.html'):
    topic = get_object_or_404(Topic, name=topic_name)
    views = MapView.objects.filter(topic=topic).order_by('ordering')
    viewsList = simplejson.dumps([view.name for view in views])
    layers = topic.layers.all().order_by('name')
    context = {
        'topic': topic,
        'views': views,
        'views_list': viewsList,
        'initial_view': views[0].name,
        'layers': layers,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def get_har(self,
            remove_domain_request=True,
            domains_to_remove={
                'facebook.com', 'facebook.it', 'youtube.it', 'youtube.com',
                'twitter.it', 'twitter.com'
            },
            file_type_to_remove={'jpg', 'png', 'jpeg'}):
    result = list()
    if self.logging and self.logs:
        domain = None
        if remove_domain_request:
            domain = utils.get_domain(self.current_url)
        for log in self.logs:
            message = json.load(StringIO(log['message']))['message']
            if 'method' in message:
                method = message['method']
                if method and method == 'Network.responseReceived':
                    url = message['params']['response']['url']
                    if utils.is_valid_url(url):
                        to_insert = (domain and not utils.is_domain_link(url, domain)) or domain is None
                        to_insert = to_insert and utils.get_filetype_from_url(url) not in file_type_to_remove
                        if to_insert:
                            for d in domains_to_remove:
                                if utils.is_domain_link(url, d):
                                    to_insert = False
                                    break
                        if to_insert:
                            result.append(url)
    result = list(set(result))
    # print('har len: ' + str(len(result)))
    return result
def __init__(self, link, base_url):
    self.text = self.get_text(link)
    self.class_ = self.get_class(link)
    self.href = self.get_href(link, base_url)
    self.domain = get_domain(self.href)
    self.parent = link.parent
    self.base_url = base_url
def add(self, cate):
    # Insert a category into the nested domain -> subdomain -> path -> query tree.
    url = cate['url']
    domain = get_domain(url)
    subdomains = get_subdomains(url)
    paths = get_path(url).split('/')
    query = urlparse.urlparse(url).query
    if domain not in self.root:
        self.root[domain] = {'sub': {}, 'path': {}}
    node = self.root[domain]
    if len(subdomains) > 1 or len(subdomains) == 1 and subdomains[0] != 'www':
        for sub in subdomains:
            if sub not in node['sub']:
                node['sub'][sub] = {'sub': {}, 'path': {}}
            node = node['sub'][sub]
    for path in paths:
        if path not in node['path']:
            node['path'][path] = {'path': {}}
        node = node['path'][path]
    if query:
        node['path']['query___' + query] = {'path': {}}
        node = node['path']['query___' + query]
    node['cate'] = cate
def bookmark_link(self):
    if not self.bookmark and self.is_sublayer and self.parent.bookmark:
        return self.parent.bookmark.replace('<layer_id>', str(self.id))
    if not self.bookmark:
        domain = get_domain(8000)
        return '%s/planner/#%s' % (domain, self.slug)
    return self.bookmark
def deploy_prometheus_route():
    '''Deploy Prometheus Route'''
    topic = 'Prometheus Operator Route'
    src_file = os.path.join(os.getcwd(),
                            "deploy/monitoring/prometheus/assisted-installer-ocp-prometheus-route.yaml")
    dst_file = os.path.join(os.getcwd(),
                            "build/assisted-installer-ocp-prometheus-route.yaml")
    try:
        # We have permission to read the ingress domain directly
        ingress_domain = utils.get_domain()
    except:
        # We don't have permission (ugly, but it works): fall back to the
        # assisted-installer ingress, which should exist because of the UI deployment
        json_path_ingress = '{.spec.rules[0].host}'
        cmd = "{} get ingress assisted-installer -o jsonpath='{}'".format(CMD_BIN, json_path_ingress)
        assisted_installer_ingress_domain = utils.check_output(cmd)
        if assisted_installer_ingress_domain.split(".")[0] != 'assisted-installer':
            print("Error recovering the ingress route")
            sys.exit(1)
        ingress_domain = assisted_installer_ingress_domain.split(".", maxsplit=1)[1]
    with open(src_file, "r") as src:
        with open(dst_file, "w+") as dst:
            data = src.read()
            data = data.replace("INGRESS_DOMAIN", ingress_domain)
            print("Deploying {}: {}".format(topic, dst_file))
            dst.write(data)
    utils.apply(dst_file)
def top_things(db_file):
    # Tally post counts per domain and per person, and build a link graph between them.
    urls = {}
    people = {}
    graph = {}
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database. (Top things)"
    cur = db.cursor()
    cur.jump_back()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded_rec = loads(rec[1])
        split = get_domain(loaded_rec)
        if urls.get(split, False) == False:
            urls[split] = 1
        else:
            urls[split] = urls[split] + 1
        person = loaded_rec['person']
        if people.get(person, False) == False:
            people[person] = 1
        else:
            people[person] = people[person] + 1
        if split is not None and split != "" and \
                person is not None and person != "":
            # Build a crazy relational graph out of my nosql data
            if graph.get(split, False) == False:
                graph[split] = {
                    "is_person": False,
                    "data": [person],
                    "linked_to_count": 1
                }
            elif person not in graph[split]:
                graph[split]["data"].append(person)
                graph[split]["linked_to_count"] = graph[split]["linked_to_count"] + 1
            if graph.get(person, False) == False:
                graph[person] = {"is_person": True, "data": [split]}
            elif split not in graph[person]:
                graph[person]["data"].append(split)
        cur.step_back()
    cur.disable()
    db.close()

    def get_one(x):
        return x[1]

    return (sorted(urls.items(), key=get_one, reverse=True),
            sorted(people.items(), key=get_one, reverse=True),
            graph)
def main():
    deploy_options = handle_arguments()

    # TODO: delete once rename everything to assisted-installer
    if deploy_options.target == "oc-ingress":
        service_host = "assisted-installer.{}".format(utils.get_domain(deploy_options.domain))
        service_port = "80"
    else:
        service_host = utils.get_service_host(SERVICE, deploy_options.target,
                                              namespace=deploy_options.namespace)
        service_port = utils.get_service_port(SERVICE, deploy_options.target,
                                              namespace=deploy_options.namespace)

    with open(SRC_FILE, "r") as src:
        with open(DST_FILE, "w+") as dst:
            data = src.read()
            data = data.replace("REPLACE_URL", '"{}"'.format(service_host))
            data = data.replace("REPLACE_PORT", '"{}"'.format(service_port))
            data = data.replace("REPLACE_DOMAINS", '"{}"'.format(deploy_options.base_dns_domains))
            data = data.replace('REPLACE_NAMESPACE', deploy_options.namespace)
            print("Deploying {}".format(DST_FILE))

            versions = {
                "IMAGE_BUILDER": "installer-image-build",
                "AGENT_DOCKER_IMAGE": "agent",
                "KUBECONFIG_GENERATE_IMAGE": "ignition-manifests-and-kubeconfig-generate",
                "INSTALLER_IMAGE": "assisted-installer",
                "CONTROLLER_IMAGE": "assisted-installer-controller",
                "CONNECTIVITY_CHECK_IMAGE": "connectivity_check",
                "INVENTORY_IMAGE": "inventory"
            }
            for env_var_name, image_short_name in versions.items():
                image_fqdn = deployment_options.get_image_override(deploy_options, image_short_name, env_var_name)
                versions[env_var_name] = image_fqdn

            # Edge case for controller image override
            if os.environ.get("INSTALLER_IMAGE") and not os.environ.get("CONTROLLER_IMAGE"):
                versions["CONTROLLER_IMAGE"] = deployment_options.IMAGE_FQDN_TEMPLATE.format(
                    "assisted-installer-controller",
                    deployment_options.get_tag(versions["INSTALLER_IMAGE"]))

            versions["SELF_VERSION"] = deployment_options.get_image_override(deploy_options, "assisted-service", "SERVICE")

            deploy_tag = get_deployment_tag(deploy_options)
            if deploy_tag:
                versions["RELEASE_TAG"] = deploy_tag

            y = yaml.load(data)
            y['data'].update(versions)
            data = yaml.dump(y)
            dst.write(data)

    utils.apply(DST_FILE)
def get_allowed_from(self, child_urls):
    """
    :param child_urls: List of child urls to check robots.txt on
    :return: A list of allowed child urls to crawl
    """
    allowed = []
    domains = list(set('{0}'.format(get_domain(url)) for url in child_urls))
    domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls)
                          for domain in domains}
    for domain in domain_to_children:
        try:
            rules = self.robots.fetch(domain)
            for url in domain_to_children[domain]:
                if rules.allowed(url, self._agent):
                    allowed.append(url)
        except:
            allowed.extend(domain_to_children[domain])
    return allowed
def introspect(domain):
    filter_func = lambda x: get_domain(loads(x[1])).lower() in domain.lower()
    pages, requested_page = get_effective_page(request.args.get("page", 0), filter_func)
    items = get_items(filter_func, g.db_file, requested_page)
    return render_template("index.html",
                           items=items,
                           pages=pages,
                           requested_page=requested_page,
                           current_page=request.args.get('page', 0))
def is_image_link(url):
    if url.split('.')[-1] in img_extensions:
        return True
    domain = get_domain(url).split('.')
    for sharer in img_sharers:
        if sharer in domain:
            return True
    return False
def add_learn_links(themes):
    context = []
    domain = get_domain()
    for theme in themes:
        link = '%s/portal/learn/%s' % (domain, linkify(theme.name))
        # print link
        context.append({'theme': theme, 'learn_link': link})
    return context
def get_error_rate(self, response):
    self.out_domains.add(get_domain(response.request.url))
    self.crawler.stats.inc_value("no_requests")
    if not self.domain.check_request_url(response.request.url):
        self.crawler.stats.inc_value('no_new_posts')
    self.sum_download_time += response.meta['request_time']
    urls = [response.urljoin(url.strip())
            for url in response.xpath("//a/@href").getall()
            if fix_url(url)]
    for url in urls:
        yield Request(url=url, callback=self.get_error_rate, errback=self.check_error_back_rate)
def __init__(self, name, url):
    self.name = name
    self.url = url.replace(' ', '')
    self.html = self.get_html()
    self.domain = get_domain(self.url)              # e.g. stackoverflow.com
    self.base_url = self.get_base_url()             # e.g. http://stackoverflow.com/
    self.all_links = self.get_all_links()
    self.some_valid_links = self.get_some_valid_links()
    self.valid_links = self.get_all_valid_links()
def __init__(self, url_list, save_base_dir, header, encoding=None, grab_out_site_link=False,
             to_single_page=False, full_site=False, ref_model=False, framework=None):
    """
    :param url_list:
    :param save_base_dir:
    :param header:
    :param encoding:
    :param grab_out_site_link:
    :param to_single_page:
    :param full_site:
    :param ref_model:
    """
    self.parent_save_dir = save_base_dir
    self.date_str = get_date()
    # zip archive dir, e.g. /xx/xx/archive/2019-00-01/
    self.zip_save_base_abs_dir = f"{self.parent_save_dir}/{config.template_archive_dir}/{self.date_str}/"
    self.download_temp_abs_dir = f"{self.parent_save_dir}/{config.template_temp_dir}/{self.date_str}/"
    self.domain = get_domain(url_list[0])
    self.tpl_dl_dir, self.js_dir, self.img_dir, self.css_dir, self.other_dir = self.__prepare_dirs()
    self.dl_urls = {}  # for deduplication: maps url => absolute path on disk
    self.error_grab_resource = {}  # records http url => disk path; a report is generated and packaged at the end
    self.header = header
    self.charset = encoding
    self.is_grab_outer_link = grab_out_site_link
    self.is_ref_model = ref_model
    if self.is_ref_model:
        # In hotlink (reference) mode, never grab external resources; internal resources are rewritten to absolute paths
        self.is_grab_outer_link = False
    self.is_to_single_page = to_single_page  # whether to inline images, css, js, etc. into a single page
    self.single_page = []  # used when generating the report
    self.is_full_site = full_site  # whether to grab the whole site
    self.html_link_queue = Queue()  # queue of html pages
    for u in url_list:
        self.html_link_queue.put(u)
    # html pages already downloaded, stored as (disk_path, file_name, url); used at the end to fix up links
    self.downloaded_html_url = []
    # items are json dicts: {'cmd': quit/download, "url": 'http://baidu.com', "save_path": '/full/path/file.ext', 'type': 'bin/text'}
    self.download_queue = Queue()
    self.download_finished = False  # the url queue being drained does not mean all network requests have returned
    self.task_finished = False  # all requests returned, event loop finished
    self.zip_result_file = None
    # file_name => url; used to check whether a generated file name is duplicated and must be regenerated
    self.file_name_dup_checker = {}
    self.framework_support = framework
    self.thread = threading.Thread(target=self.__download_thread)
    self.thread.start()
def __init__(self, input, **options):
    self.input = input
    self.url = options.get('url', '')
    self.debug = options.get('debug', False)
    self.title = options.get('title', '^^')
    self.pages = options.get('pages', None)
    self.texts = options.get('texts', None)
    self.domain = get_domain(self.url)
    self.options = options
    self.doc = clean_html(input, return_doc=True)
    self.text = self.doc.text_content()
    self.len = word_count(self.text) if self.text else 0
def get_cookie_syncs_for_multiple_sites(self, sites, cookie_length=8, filepath=''):
    """Get cookie syncing data for multiple sites, and write results to disk.

    Cookies must be at least cookie_length characters long to be considered.
    """
    sites = self._filter_site_list(sites)
    cookie_sync_data = defaultdict(defaultdict)
    for site in sites:
        cookie_sync_data[site] = self.get_cookie_syncs_by_site(site, cookie_length=cookie_length)

    # Write complete output as csv
    with open(os.path.join(filepath, 'full_cookie_syncs.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['site', 'sending_domain', 'receiving_url', 'cookie_value'])
        for site in cookie_sync_data:
            for receiving_url in cookie_sync_data[site]:
                for sending_url, cookie_value in cookie_sync_data[site][receiving_url]:
                    writer.writerow([site, sending_url, receiving_url, cookie_value])

    # Write partial output as CSV, only identifying sending domain and receiving domain
    # (rather than the full receiving URL)
    cooks_just_domains = defaultdict(defaultdict)
    for site in cookie_sync_data:
        cooks_just_domains[site] = defaultdict(set)
        for receiving_url in cookie_sync_data[site]:
            for sending_domain, value in cookie_sync_data[site][receiving_url]:
                cooks_just_domains[site][utils.get_domain(receiving_url)].add(sending_domain)

    with open(os.path.join(filepath, 'condensed_cookie_syncs.csv'), 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['site', 'sending_domain', 'receiving_domain'])
        for site in cooks_just_domains:
            for receiving_domain in cooks_just_domains[site]:
                if len(cooks_just_domains[site][receiving_domain]) > 1 and \
                        'NOT_FOUND' in cooks_just_domains[site][receiving_domain]:
                    cooks_just_domains[site][receiving_domain].discard('NOT_FOUND')
                for sending_domain in cooks_just_domains[site][receiving_domain]:
                    writer.writerow([site, sending_domain, receiving_domain])
def __init__(self, function):
    super(LowAnalyzer, self).__init__()
    self.function = function
    # Domain
    self.domain = get_domain(self.function)
    # Roots
    self.roots = get_roots(self.function)
    # Sign
    self.negative, self.positive = get_sign(self.function)
def render_template(self, template_path, template_context, to_string=False):
    """Render a Template to output"""

    from modules.events.internal import api as events_api

    # Debug - Show what non-js search engines see
    template_context['no_client'] = bool(self.request.get('no_client', False))

    # TODO: This needs to abstract the jinja env out further...
    from main import JINJA_ENVIRONMENT

    template_context['settings_dict'] = {}
    template_context['settings_dict']['is_appspot'] = is_appspot()
    template_context['settings_dict']['domain'] = get_domain()

    # Tack on the google analytics profiles
    if is_appspot():
        # TODO: Remove this before being prod ready
        template_context['settings_dict']['ga_profile_id'] = 'UA-54271335-1'
    else:
        template_context['settings_dict']['ga_profile_id'] = 'UA-54271335-2'

    # Temporary server-side rendering handler - this should likely be done via jinja extensions
    template_context['upcoming_event_resources'] = events_api.get_upcoming_event_resources()
    template_context['ongoing_event_resources'] = events_api.get_ongoing_event_resources()

    # TODO: This should come from some sort of middleware likely
    template_context['settings_dict']['is_authenticated'] = bool(users.get_current_user())

    template_context['settings'] = json.dumps(template_context['settings_dict'])

    template = JINJA_ENVIRONMENT.get_template(template_path)
    rendered_content = template.render(template_context)

    if to_string:
        return rendered_content

    self.response.write(rendered_content)
def tiles_page(request, slug=None, template='tiles_page.html'):
    layer = get_object_or_404(Layer, slug_name=slug)
    orig_url = layer.url
    arctile_url = orig_url.replace('{z}', '{level}').replace('{x}', '{col}').replace('{y}', '{row}')
    arcrest_url = orig_url.replace('/export', '')
    context = {
        'layer': layer,
        'arctile_url': arctile_url,
        'arcrest_url': arcrest_url,
        'domain': get_domain(8000)
    }
    return render_to_response(template, RequestContext(request, context))
def fetch_from(self, urls):
    """
    :param urls: A list of urls to fetch sitemaps of
    :return: A list of urls that were found within each sitemap of the given urls
    """
    unique_domains = list(set(get_domain(u) for u in urls))
    sitemaps = self._try_fetch_sitemaps(unique_domains)
    results = []
    for url in sitemaps:
        sitemaps_content = self.requests_getter.get_content_from(sitemaps[url])
        for content in sitemaps_content:
            locations = self.sitemap_url_extractor.extract_from(content)
            locations = filter(lambda u: not u.endswith('.xml'), locations)
            results.extend(locations)
    return results
def article():
    url = request.args.get('url')
    article = mongo.article.find_one({'_id': url})
    if not article:
        try:
            html = get_or_cache(url)
            article = html2article(html, url, selector=True, merge=True)
            if article and not article['src_name']:
                article['src_name'] = get_domain(url)
            tpl = url2tpl(url)
            urls = html2urls(html, url)
            texts = dict(map(lambda x: (x[0], max(x[1], key=lambda y: len(y))), urls.iteritems()))
            tmp = dict(map(lambda x: (x, url2tpl(x)), texts.iterkeys()))
            urls = {}
            for u, t in tmp.iteritems():
                if u != url and t == tpl:
                    urls[u] = texts[u]
                    if len(urls) >= 10:
                        break
            if article:
                article['urls'] = urls
                article['_id'] = url
                article['view'] = 1
                article['last'] = time.time()
                copy = article.copy()
                copy['urls'] = json.dumps(copy['urls'])
                mongo.article.save(copy)
        except:
            pass
    else:
        article['urls'] = json.loads(article['urls'])
        mongo.article.update({'_id': url}, {'$set': {'view': article['view'] + 1}})
    if article:
        article['pubtime'] = article['pubtime'][:10]
    return render_template('extract/article.html', article=article, url=url)
def deploy_grafana_route():
    # Deploy Grafana Route
    topic = 'Grafana Route'
    src_file = os.path.join(os.getcwd(),
                            "deploy/monitoring/grafana/assisted-installer-ocp-grafana-route.yaml")
    dst_file = os.path.join(os.getcwd(),
                            "build/assisted-installer-ocp-grafana-route.yaml")
    ingress_domain = utils.get_domain()
    with open(src_file, "r") as src:
        with open(dst_file, "w+") as dst:
            data = src.read()
            data = data.replace("INGRESS_DOMAIN", ingress_domain)
            print("Deploying {}: {}".format(topic, dst_file))
            dst.write(data)
    utils.apply(dst_file)
def sortFrontier(frontier, domainurl_count):
    inlinks_count = [len(i.inlinks) for i in frontier]
    inlinks_score = normalize(inlinks_count)
    domain_count = [domainurl_count[get_domain(i.url)] for i in frontier]
    domain_score = normalize(domain_count)
    keyword_score = match_keywords(frontier)
    for idx, obj in enumerate(frontier):
        final_score = inlinks_score[idx] + domain_score[idx] + keyword_score[idx]
        obj.score = final_score
    sortedList = sorted(frontier, key=lambda obj: obj.score, reverse=True)
    logging.info("Returning sorted list of objects (top 30){}".format(
        [(obj.url, obj.score) for obj in sortedList[:30]]))
    return sortedList
def domain_out_domains():
    with open('domain_out_domains.jsonl', mode='w') as f:
        for file_name in os.listdir('url_outlinks'):
            with open(f"url_outlinks/{file_name}") as fp:
                domains = json.load(fp)
                for domain, url_outlinks in domains.items():
                    out_domains_set = set()
                    for outlinks in url_outlinks.values():
                        for outlink in outlinks:
                            out_domains_set.add(get_domain(outlink))
                    f.write(f"{json.dumps({'domain': domain, 'out_domains': list(out_domains_set)})}\n")
                fp.close()
            f.flush()
        f.close()
def fetch_stories(self, correlation_id=-1):
    """Fetches new stories from the datasource.
    Uses the last story external id to fetch only new stories."""
    try:
        url = "http://%s/twitter_sensor/?user=%s&password=%s" % (get_domain(), self.user.user_name, self.user.user_password)
        tweets = urllib.urlopen(url).read()
        tweets = json.loads(tweets)
        print tweets
        for key in tweets:
            try:
                authors = []
                authors.append(tweets[key])
                self.add_read_story(key, authors)
                self.add_user(tweets[key])
            except:
                log_event("fetch_stories_failed", "AgentCell", self.id,
                          "Adding fetched story %s failed, for %s" % (key, self.user),
                          correlation_id)
    except:
        log_event("fetch_stories_failed", "AgentCell", self.id,
                  "Failed to fetch stories for %s" % self.user, correlation_id)
def crawl(self, url, max_page_depth=5, max_external_sites_page_depth=4, request_rate_limit=4):
    """
    Crawl a given url up to max_page_depth and max_external_sites_page_depth,
    at a maximum rate of request_rate_limit.

    :param url: The to-be crawled url
    :param max_page_depth: Max internal (same-domain) depth
    :param max_external_sites_page_depth: Max external (different-domain) depth
    :param request_rate_limit: Up to n requests at once
    :return: List of Url objects (See schemas/url.py)
    """
    self._url_scanner.set_request_limit(request_rate_limit)
    self._max_page_depth = max_page_depth
    self._max_external_sites_page_depth = max_external_sites_page_depth
    self._domain = get_domain(url)
    self._internal_urls_to_scan.append(url)
    self._crawl_internal_urls()
    self._crawl_external_urls()
    return self._get_crawled_urls()
def main():
    # TODO: delete once rename everything to assisted-installer
    if args.target == "oc-ingress":
        service_host = "assisted-installer.{}".format(utils.get_domain(args.domain))
        service_port = "80"
    else:
        service_host = utils.get_service_host(SERVICE, args.target)
        service_port = utils.get_service_port(SERVICE, args.target)

    with open(SRC_FILE, "r") as src:
        with open(DST_FILE, "w+") as dst:
            data = src.read()
            data = data.replace("REPLACE_URL", '"{}"'.format(service_host))
            data = data.replace("REPLACE_PORT", '"{}"'.format(service_port))
            print("Deploying {}".format(DST_FILE))

            if args.deploy_tag != "":
                versions = {
                    "IMAGE_BUILDER": "quay.io/ocpmetal/installer-image-build:",
                    "AGENT_DOCKER_IMAGE": "quay.io/ocpmetal/agent:",
                    "KUBECONFIG_GENERATE_IMAGE": "quay.io/ocpmetal/ignition-manifests-and-kubeconfig-generate:",
                    "INSTALLER_IMAGE": "quay.io/ocpmetal/assisted-installer:",
                    "CONNECTIVITY_CHECK_IMAGE": "quay.io/ocpmetal/connectivity_check:",
                    "INVENTORY_IMAGE": "quay.io/ocpmetal/inventory:",
                    "HARDWARE_INFO_IMAGE": "quay.io/ocpmetal/hardware_info:",
                    "SELF_VERSION": "quay.io/ocpmetal/installer-image-build:"
                }
                versions = {k: v + args.deploy_tag for k, v in versions.items()}
                y = yaml.load(data)
                y['data'].update(versions)
                data = yaml.dump(y)
            else:
                y = yaml.load(data)
                y['data'].update({"SELF_VERSION": os.environ.get("SERVICE")})
                data = yaml.dump(y)
            dst.write(data)

    utils.apply(DST_FILE)
def read_seeds(seedfile):
    logging.info("Reading the seeds")
    frontier_map = {}  # url: url object
    frontier = {}  # waveno: list of url objects
    domain_urlcount = {}
    wave_no = 1
    with open(seedfile, "r") as f:
        for line in f:
            line = line.strip()
            url = clean_url(line)
            if validators.url(url) is True:
                obj = URL(url, wave_no)
                frontier_map[url] = obj
                if wave_no not in frontier:
                    frontier[wave_no] = []
                frontier[wave_no].append(obj)
                domain_urlcount = checkDomain(get_domain(obj.url), domain_urlcount)
    return frontier_map, frontier, domain_urlcount
def deploy_grafana_route():
    '''Deploy Grafana Route'''
    topic = 'Grafana Route'
    src_file = os.path.join(os.getcwd(),
                            'deploy/monitoring/grafana/assisted-installer-ocp-grafana-route.yaml')
    dst_file = os.path.join(os.getcwd(),
                            'build', deploy_options.namespace,
                            'assisted-installer-ocp-grafana-route.yaml')
    try:
        # We have permission to read the ingress domain directly
        ingress_domain = utils.get_domain(target=deploy_options.target,
                                          namespace=deploy_options.namespace,
                                          profile=deploy_options.profile)
    except:
        # We don't have permission (ugly, but it works): fall back to the
        # assisted-installer ingress, which should exist because of the UI deployment
        json_path_ingress = '{.spec.rules[0].host}'
        cmd = "{} -n {} get ingress assisted-installer -o jsonpath='{}'".format(
            CMD_BIN, deploy_options.namespace, json_path_ingress)
        assisted_installer_ingress_domain = utils.check_output(cmd)
        if assisted_installer_ingress_domain.split(".")[0] != 'assisted-installer':
            print("Error recovering the ingress route")
            sys.exit(1)
        ingress_domain = assisted_installer_ingress_domain.split(".", maxsplit=1)[1]
    with open(src_file, "r") as src:
        with open(dst_file, "w+") as dst:
            data = src.read()
            data = data.replace("INGRESS_DOMAIN", ingress_domain)
            data = data.replace('REPLACE_NAMESPACE', f'"{deploy_options.namespace}"')
            print("Deploying {}: {}".format(topic, dst_file))
            dst.write(data)
    utils.apply(target=deploy_options.target,
                namespace=deploy_options.namespace,
                profile=deploy_options.profile,
                file=dst_file)
def extract_essence(self, correlation_id):
    """
    Analyze the story text to extract the essence from it.
    For the essence, look for a matching StoryEssence cell.
    If found, link the story cell to the StoryEssence cell.
    Else create a new StoryEssence cell & link the story to it.
    """
    try:
        print "extract_essence called for story '%s'" % self.core
        client = Client()
        response = client.get('http://%s/text_analyzer/extract_essence/' % get_domain(),
                              {'text': self.core}).content
        print "got extract essence response: ", response
        if response != "":
            try:
                self.add_essence(response)
            except:
                print sys.exc_info()
                print "essence=", response
                log_event("extract_essence_failed", STORY_CELL, self.id,
                          "Adding essence '%s' extracted from story '%s' failed" % (response, self.core),
                          correlation_id)
        # all went well, update the flag
        self.is_essence_extracted = True
        self.save()
    except:
        print "Failed to extract essence", sys.exc_info()
        log_event("extract_essence_failed", STORY_CELL, self.id,
                  "Failed to extract essence from story '%s'" % self.core, correlation_id)
def fetch_stories(self, correlation_id=-1):
    """Fetches new stories from the datasource.
    Uses the last story external id to fetch only new stories."""
    try:
        # url = "http://%s/twitter_sensor/?user=%s&password=%s" % (get_domain(), self.user.user_name, self.user.user_password)
        # tweets = urllib.urlopen(url).read()
        client = Client()
        tweets = client.get('http://%s/twitter_sensor/' % get_domain(),
                            {'user': self.user.user_name, 'password': self.user.user_password}).content
        tweets = json.loads(tweets)
        print tweets
        for key in tweets:
            try:
                authors = []
                for story in StoryCell.objects.all():
                    if story.core == key:
                        return
                authors.append(tweets[key][0])
                self.add_read_story(key, authors)
                self.add_user(tweets[key][0])
            except:
                log_event("fetch_stories_failed", AGENT_CELL, self.id,
                          "Adding fetched story %s failed, for %s" % (key, self.user), correlation_id)
    except:
        print "Failed to fetch stories", sys.exc_info()
        log_event("fetch_stories_failed", AGENT_CELL, self.id,
                  "Failed to fetch stories for %s" % self.user, correlation_id)
def tiles_link(self):
    if self.is_shareable and self.layer_type in ['XYZ', 'ArcRest', 'WMS']:
        domain = get_domain(8000)
        return '%s/explore/%s' % (domain, self.slug)
    return None
def extract_concepts(self, correlation_id):
    """
    Analyze the story text to extract named entities.
    For each named entity, look for a matching Concept cell.
    If found, link the concept cell to the story.
    Else create a new concept cell & link the story to it.
    """
    try:
        client = Client()
        response = client.get('http://%s/text_analyzer/extract_named_entities/' % get_domain(),
                              {'text': self.core}).content
        if response != "[]":
            named_entities = json.loads(response)
            for ne in named_entities:
                try:
                    self.add_concept(ne)
                except:
                    print sys.exc_info()
                    log_event("extract_concepts_failed", STORY_CELL, self.id,
                              "Adding concept '%s' extracted from story '%s' failed" % (ne[0], self.core),
                              correlation_id)
        # all went well, update the flag
        self.is_concepts_extracted = True
        self.save()
    except:
        print "Failed to extract concepts", sys.exc_info()
        log_event("extract_concepts_failed", STORY_CELL, self.id,
                  "Failed to extract concepts from story '%s'" % self.core, correlation_id)
def filter_func(x):
    return get_domain(loads(x[1])).lower() in domain.lower()
        # Create a set of results based upon this result set - iterator??
        ctx['posts'] = entities

        rss_content = self.render_template('./templates/newsfeeds/rss.html', ctx, to_string=True)

        # Set Cache
        ubercache.cache_set(cache_key, rss_content, category='written')

        self.response.headers['Content-Type'] = 'application/xml'
        self.response.write(rss_content)
        return


# Rest Controllers
resource_url = 'http://' + get_domain() + '/api/posts/%s'
category_resource_url = 'http://' + get_domain() + '/api/post_categories/%s'

CATEGORY_REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(category_resource_url, output_only=True),
    SlugField(BlogCategory.slug, required=True),
    RestField(BlogCategory.title, required=True),
]

REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(resource_url, output_only=True),
    SlugField(BlogPost.slug, required=True),
def data_catalog_bs3(request, template='bs3_catalog.html'):
    themes = Theme.objects.all().order_by('display_name')
    themes_with_links = add_learn_links(themes)
    add_ordered_layers_lists(themes_with_links)
    context = {
        'themes': themes_with_links,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def get_user_stats(username, db_file):
    item = {
        "username": username,
        "aliases": [],
        "total_posts": 0,
        "domains": {},
        "first_post_date": None,
        "first_post_date_unix": None,
        "most_recent_post": None,
        "most_recent_post_unix": 0,
        "average_posts_per_hour": 0.0,
        "average_posts_per_day": 0.0,
        "average_posts_per_week": 0.0
    }
    db = DB()
    if not db.open("{0}".format(db_file), DB.OREADER | DB.OCREATE):
        print "Could not open database."
    cur = db.cursor()
    cur.jump()
    while True:
        rec = cur.get(False)
        if not rec:
            break
        loaded_rec = loads(rec[1])
        if loaded_rec['person'] != username:
            cur.step()
            continue
        # Looks like this is a post by the user we're looking for
        split = get_domain(loaded_rec)
        if item['domains'].get(split, False) == False:
            item['domains'][split] = 1
        else:
            item['domains'][split] = item['domains'][split] + 1
        if item['first_post_date_unix'] is None:
            item['first_post_date_unix'] = loaded_rec['created_at']
        if item['most_recent_post_unix'] < loaded_rec['created_at']:
            item['most_recent_post_unix'] = loaded_rec['created_at']
        item['total_posts'] = item['total_posts'] + 1
        cur.step()
    cur.disable()
    db.close()

    # Clean up everything
    first_time = None
    if item['first_post_date_unix'] is not None:
        unix = float(item['first_post_date_unix'])
        first_time = datetime.fromtimestamp(unix)
        item['first_post_date'] = first_time.isoformat()
    recent_time = None
    if item['most_recent_post_unix'] is not None:
        unix = float(item['most_recent_post_unix'])
        recent_time = datetime.fromtimestamp(unix)
        item['most_recent_post'] = recent_time.isoformat()
    if first_time and recent_time:
        delta = recent_time - first_time
        item['user_age_days'] = delta.days
        item['user_age_seconds'] = delta.total_seconds()
        item['average_posts_per_hour'] = item['total_posts'] / (delta.total_seconds() / 60.0)
        item['average_posts_per_day'] = item['total_posts'] / (delta.total_seconds() / 60.0 / 24.0)
        item['average_posts_per_week'] = item['total_posts'] / (delta.total_seconds() / 60.0 / 24.0 / 7.0)
    return item
def description_link(self):
    theme_name = self.themes.all()[0].name
    domain = get_domain(8000)
    return '%s/learn/%s#%s' % (domain, theme_name, self.slug)
from rest.resource import RestField, SlugField, ResourceIdField, ResourceUrlField
from rest.resource import BooleanField, ResourceField
from rest.params import coerce_to_datetime
from rest.utils import get_key_from_resource_id
from files.rest_helpers import REST_RESOURCE_RULES as FILE_REST_RULES
from modules.events.internal import api as events_api
from modules.events.internal.models import Event
from modules.events.constants import CATEGORY, PRIMARY_IMAGE_PROP
from utils import ubercache
from cal.rest_helpers import EventDateField
from utils import get_domain

resource_url = 'http://' + get_domain() + '/api/events/%s'

# TODO: HRM?
# verbosity vs. input vs. output
REST_RULES = [
    ResourceIdField(output_only=True),
    ResourceUrlField(resource_url, output_only=True),
    SlugField(Event.slug, required=True),
    RestField(Event.name, required=True),
    RestField(Event.url, required=False),
    EventDateField(Event.event_dates, required=True),
    RestField(Event.content),
    RestField(Event.summary),
    BooleanField(Event.featured),
    RestField(Event.primary_image_resource_id, required=False),
    ResourceField(PRIMARY_IMAGE_PROP,
def learn_page(request, theme_name=None, template='learn_page.html'):
    topics = Topic.objects.filter(active=True).order_by('ordering')
    context = {
        'topics': topics,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def topic_page(request, topic_name=None, template='topic_page.html'):
    topic = get_object_or_404(Topic, name=topic_name)
    views = MapView.objects.filter(topic=topic).order_by('ordering')
    viewsList = simplejson.dumps([view.name for view in views])
    layers = topic.layers.all().order_by('name')
    context = {
        'topic': topic,
        'views': [views[0]],
        'views_list': [viewsList[0]],
        'initial_view': views[0].name,
        'layers': layers,
        'domain': get_domain(8000),
        'domain8010': get_domain()
    }
    return render_to_response(template, RequestContext(request, context))
def is_filtered_jid(self, user_jid):
    if (self._whitelist and user_jid not in self._whitelist
            and get_domain(user_jid) not in self._whitelist):
        return True
def learn_link(self):
    domain = get_domain(8000)
    return '%s/learn/%s' % (domain, self.name)
def get_absolute_url(self):
    return "http://%s/cells/view/story/%d" % (get_domain(), self.id)