def build_web_domains_redirects(self):
    self.web_domains_redirects = defaultdict(set)
    for k, v in self.web_sites.items():
        if v.redirect_to is not None:
            d1 = urlsplit_pro(k).hostname
            d2 = urlsplit_pro(v.redirect_to).hostname
            if d1 != d2:
                self.web_domains_redirects[d1].add(d2)
                self.web_domains_redirects[d2].add(d1)

def get_web_domain_by_url(self, document_url, site_url):
    # first take the web domain from which the document was downloaded
    web_domain = urlsplit_pro(document_url).hostname
    if self.web_sites.get_first_site_by_web_domain(web_domain) is not None:
        return web_domain
    # if this web domain is unknown, take the web domain from site_url
    web_domain = urlsplit_pro(site_url).hostname
    if self.web_sites.get_first_site_by_web_domain(web_domain) is None:
        if not self.web_sites.is_a_special_domain(web_domain):
            self.logger.error("web domain {} is missing in office.txt".format(site_url))
    return web_domain

def check_href_elementary(href):
    if len(href) == 0:
        return False
    if href.startswith('mailto:'):
        return False
    if href.startswith('tel:'):
        return False
    if href.startswith('javascript:'):
        return False
    if href.startswith('about:'):
        return False
    if href.startswith('consultantplus:'):
        return False
    # spaces are not prohibited, but should be converted
    if href.find('\n') != -1 or href.find('\t') != -1:
        return False
    if href.startswith('#'):
        if not href.startswith('#!'):
            # '#!' is a hashbang (a starter for an AJAX url), e.g. http://minpromtorg.gov.ru/open_ministry/anti/
            return False
    if href.find('?') != -1:
        o = urlsplit_pro(href)
        if o.query != '':
            query = urllib.parse.parse_qs(o.query)
            if 'print' in query:
                return False  # khabkrai.ru
            if 'special' in query.get('version', list()):
                return False  # admkrsk.ru
            if 'accessability' in query:
                return False
    return True

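# Illustrative usage sketch (not from the original test suite) of
# check_href_elementary; the query case assumes urlsplit_pro parses
# "page.html?print=1" with query "print=1", consistent with the tests below.
assert check_href_elementary("/docs/declaration.html")        # ordinary relative link is kept
assert check_href_elementary("#!/open_ministry/anti/")        # hashbang (AJAX) link is kept
assert not check_href_elementary("")                          # empty href
assert not check_href_elementary("mailto:info@example.org")   # mail link
assert not check_href_elementary("#top")                      # pure fragment link
assert not check_href_elementary("page.html?print=1")         # print version of a page
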
def get_region_by_url(self, name, url):
    if name not in self.name2wikidata:
        return None, None
    regions = set()
    (_, netloc1, _, _, _) = urlsplit_pro(url)
    for x in self.name2wikidata.get(name, []):
        (_, netloc2, _, _, _) = urlsplit_pro(x['website'])
        if netloc1 == netloc2:
            region_wikidata_id = os.path.basename(x['oblast'])
            region_id = self.regions.get_region_by_wikidata_id(region_wikidata_id)
            if region_id is None:
                continue
            entry_id = os.path.basename(x['item'])
            regions.add((entry_id, region_id))
    if len(regions) == 1:
        return list(regions)[0]
    return None, None

def print_predicted_as_external(self):
    web_sites = TDeclarationWebSiteList(logger=self.logger, offices=RUSSIA.offices_in_memory)
    for key, src_doc in self.dlrobot_human.get_all_documents():
        if src_doc.calculated_office_id is None:
            continue
        urls = set(r.get_site_url() for r in src_doc.web_references)
        if len(urls) != 1:
            continue
        src_doc_url = list(urls)[0]
        if src_doc_url == "service.nalog.ru":
            continue
        office = RUSSIA.offices_in_memory.get_office_by_id(src_doc.calculated_office_id)
        u: TDeclarationWebSite
        found = False
        origin_hostname = urlsplit_pro(src_doc_url).hostname
        if web_sites.is_a_special_domain(origin_hostname):
            continue
        for u in office.office_web_sites:
            if urlsplit_pro(u.url).hostname == origin_hostname:
                found = True
                break
        if found:
            continue
        ww = web_sites.search_url(src_doc_url)
        if ww is None:
            self.logger.error("cannot find url {} by web domain in offices.txt".format(src_doc_url))
            continue
        r = {
            "sha256": key,
            "predicted_office": {
                "id": office.office_id,
                "name": office.name
            },
            "url_host_office": {
                "id": ww.parent_office.office_id,
                "name": ww.parent_office.name
            },
            "url": src_doc_url,
            "title": src_doc.get_doc_title()
        }
        print(json.dumps(r, indent=4, ensure_ascii=False))

def _prepare_url_before_http_request(url, method):
    THttpRequester.consider_request_policy(url, method)
    url = TUrlUtf8Encode.convert_url_to_idna(url)
    o = urlsplit_pro(url)
    path = urllib.parse.unquote(o.path)
    path = urllib.parse.quote(path)
    url = urllib.parse.urlunsplit((o.scheme, o.netloc, path, o.query, o.fragment))
    return url

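# A minimal standalone sketch (the example path is made up) of why the path is
# unquoted and then re-quoted in _prepare_url_before_http_request: it
# normalizes mixed percent-encoding, so a raw path and an already-quoted path
# yield the same request path.
import urllib.parse

raw_path = "/декларации/2020"
quoted_path = urllib.parse.quote(raw_path)
assert urllib.parse.quote(urllib.parse.unquote(raw_path)) == \
       urllib.parse.quote(urllib.parse.unquote(quoted_path))
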
def set_target(self, target_url, target_title=None):
    if target_url is None or len(target_url) == 0:
        self.target_url = None
        self.target_title = None
        self.url_query = ''
        self.url_path = ''
    else:
        self.target_url = strip_viewer_prefix(target_url).strip(" \r\n\t")
        self.target_title = target_title
        o = urlsplit_pro(self.target_url)
        self.url_query = o.query
        self.url_path = o.path

def get_other_sites_regexp_on_the_same_web_domain(self, morda_url):
    web_domain = urlsplit_pro(morda_url).hostname
    other_sites = list()
    for k in self.get_sites_by_web_domain(web_domain):
        if morda_url.find(k) == -1:
            other_sites.append("((www.)?{}(/|$))".format(k))
    if len(other_sites) == 0:
        return None
    s = "|".join(other_sites)
    self.logger.debug("use regexp {} to prohibit crawling other projects".format(s))
    return re.compile(s)

def urls_html(self):
    site_info: TDeclarationWebSite
    hrefs = list()
    for site_info in self.office_web_sites:
        p = urlsplit_pro(site_info.url)
        anchor = p.netloc + p.path
        if not site_info.can_communicate():
            href = "{} (obsolete)".format(anchor)
        else:
            href = '<a href="{}">{}</a>'.format(site_info.url, anchor)
        hrefs.append(href)
    return "; ".join(hrefs)

def _get_base_url(self):
    base = self.url
    for l in self.soup.findAll('base'):
        href = l.attrs.get('href')
        if href is not None:
            base = href
            break
    if base.startswith('/') and not base.startswith('//'):
        o = urlsplit_pro(self.url)
        scheme_and_web_domain = urllib.parse.urlunsplit((o.scheme, o.netloc, "", "", ""))
        base = THtmlParser.make_link(scheme_and_web_domain, base)
    return base

def export_files(self):
    human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
    if self.args.start_from_empty:
        human_files_db.create_db()
    else:
        human_files_db.open_write_mode()
    document_file_ids = set()
    for sha256, doc in human_files_db.get_all_documents():
        for ref in doc.decl_references:
            if ref.document_file_id is not None:
                document_file_ids.add(ref.document_file_id)
    files_count = 0
    for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
        if document_file_id in document_file_ids:
            continue
        while self.pdf_conversion_client.server_is_too_busy():
            self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                self.pdf_conversion_client.last_pdf_conversion_queue_length))
            time.sleep(5 * 60)
        web_site = urlsplit_pro(link).netloc
        if web_site.startswith('www.'):
            web_site = web_site[len('www.'):]
        if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
            break
        self.logger.debug("export document_file_id={}".format(document_file_id))
        for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path, document_file_id):
            sha256 = build_dislosures_sha256(local_file_path)
            self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
            source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
            ref = TDeclaratorReference()
            ref.document_id = document_id
            ref.document_file_id = document_file_id
            ref._site_url = web_site
            ref.office_id = self.fix_list(sha256, office_id)
            ref.income_year = income_year
            ref.document_file_url = declarator_url
            source_document.add_decl_reference(ref)
            human_files_db.update_source_document(sha256, source_document)
            files_count += 1
    self.logger.debug('added files count: {}'.format(files_count))
    human_files_db.close_db()
    self.send_new_pdfs_to_smart_parser()

def get_url_modifications(url: str):
    o = urllib_parse_pro.urlsplit_pro(url)
    if len(o.scheme) > 0:
        protocols = [o.scheme]
    else:
        protocols = ["http", "https"]
    if o.netloc.startswith("www."):
        # the host already carries a "www." prefix, do not prepend it again
        with_www = [False]
    else:
        with_www = [True, False]
    for only_with_www in with_www:
        for protocol in protocols:
            host = o.netloc
            if only_with_www:
                host = "www." + host
            modified_url = urllib.parse.urlunsplit((protocol, host, o.path, o.query, o.fragment))
            yield modified_url

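# A usage sketch (assumed, not part of the original code) for
# get_url_modifications: when the URL already carries a scheme, only that
# protocol is used, and a "www." variant is generated for a host without it.
for candidate in get_url_modifications("http://petushki.info/test"):
    print(candidate)
# expected output, in loop order:
#   http://www.petushki.info/test
#   http://petushki.info/test
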
def init_main_page_url_from_redirected_url(self, url, title, page_source):
    o = urllib_parse_pro.urlsplit_pro(url)
    netloc = o.netloc
    scheme = o.scheme
    if scheme == 'http' and netloc.endswith(':443'):
        self.logger.debug("coerce url {} to https".format(url))
        netloc = netloc[0:-len(':443')]
        scheme = 'https'
    self.main_page_url = urllib.parse.urlunsplit([
        scheme,
        netloc,
        o.path,  # path
        '',      # query
        ''       # fragment
    ])
    self.logger.debug("main_page_url={}".format(self.main_page_url))
    self.reach_status = TWebSiteReachStatus.normal
    self.main_page_source = page_source
    self.url_nodes[self.main_page_url] = TUrlInfo(title=title)

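# A standalone sketch (the example host is hypothetical) of the ':443' coercion
# in init_main_page_url_from_redirected_url: a redirect to http://host:443/...
# is rewritten as https://host/..., and the query and fragment are dropped.
import urllib.parse

o = urllib.parse.urlsplit("http://example.org:443/declarations?page=2")
netloc, scheme = o.netloc, o.scheme
if scheme == 'http' and netloc.endswith(':443'):
    netloc, scheme = netloc[:-len(':443')], 'https'
assert urllib.parse.urlunsplit([scheme, netloc, o.path, '', '']) == "https://example.org/declarations"
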
def get_web_domain(url):
    web_domain = urlsplit_pro(url).hostname
    if web_domain.startswith("www."):
        web_domain = web_domain[4:]
    return web_domain

def test_url_split(self):
    self.assertEqual(urlsplit_pro('http://petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('https://petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('ftp://petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('mailto://petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('http://petushki.info:99').netloc, 'petushki.info:99')
    self.assertEqual(urlsplit_pro('https:////petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('//petushki.info').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('https:////petushki.info/test').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('petushki.info/test').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('//petushki.info/test').netloc, 'petushki.info')
    self.assertEqual(urlsplit_pro('дагогни.рф').netloc, 'дагогни.рф')
    self.assertEqual(urlsplit_pro('дагогни.рф/test').netloc, 'дагогни.рф')
    self.assertEqual(urlsplit_pro('http://дагогни.рф/test').netloc, 'дагогни.рф')
    self.assertEqual(urlsplit_pro('https://xn--80agabx3af.xn--p1ai').netloc, 'xn--80agabx3af.xn--p1ai')
    self.assertEqual(urlsplit_pro('xn--80agabx3af.xn--p1ai').netloc, 'xn--80agabx3af.xn--p1ai')
    self.assertEqual(urlsplit_pro('xn--80agabx3af.xn--p1ai/test').netloc, 'xn--80agabx3af.xn--p1ai')

def site_url_to_web_domain(site_url):
    return urlsplit_pro(site_url).hostname

def get_main_url_protocol(self):
    return str(urllib_parse_pro.urlsplit_pro(self.main_page_url).scheme)