def build_web_domains_redirects(self):
     self.web_domains_redirects = defaultdict(set)
     for k, v in self.web_sites.items():
         if v.redirect_to is not None:
             d1 = urlsplit_pro(k).hostname
             d2 = urlsplit_pro(v.redirect_to).hostname
             if d1 != d2:
                 self.web_domains_redirects[d1].add(d2)
                 self.web_domains_redirects[d2].add(d1)
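The snippet below is a minimal, self-contained illustration (made-up domains, with urllib.parse.urlsplit standing in for urlsplit_pro) of the symmetric map this method builds: one redirect entry makes each domain discoverable from the other.

from collections import defaultdict
from urllib.parse import urlsplit

web_domains_redirects = defaultdict(set)
# hypothetical redirect: old.example.ru -> new.example.ru
d1 = urlsplit("http://old.example.ru").hostname
d2 = urlsplit("https://new.example.ru").hostname
if d1 != d2:
    # register the redirect in both directions
    web_domains_redirects[d1].add(d2)
    web_domains_redirects[d2].add(d1)
assert web_domains_redirects["new.example.ru"] == {"old.example.ru"}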
Example #2
 def get_web_domain_by_url(self, document_url, site_url):
     # first, take the web domain from which the document was downloaded
     web_domain = urlsplit_pro(document_url).hostname
     if self.web_sites.get_first_site_by_web_domain(web_domain) is not None:
         return web_domain
     # if this web domain is unknown, take web domain from site_url
     web_domain = urlsplit_pro(site_url).hostname
     if self.web_sites.get_first_site_by_web_domain(web_domain) is None:
         if not self.web_sites.is_a_special_domain(web_domain):
             self.logger.error(
                 "web domain {} is missing in office.txt".format(web_domain))
     return web_domain
Example #3
def check_href_elementary(href):
    if len(href) == 0:
        return False
    if href.startswith('mailto:'):
        return False
    if href.startswith('tel:'):
        return False
    if href.startswith('javascript:'):
        return False
    if href.startswith('about:'):
        return False
    if href.startswith('consultantplus:'):
        return False
    # whitespace is not prohibited in hrefs, but it should have been encoded; reject raw newlines and tabs
    if href.find('\n') != -1 or href.find('\t') != -1:
        return False
    if href.startswith('#'):
        if not href.startswith('#!'):  # '#!' is a hashbang, the start of an AJAX URL, e.g. http://minpromtorg.gov.ru/open_ministry/anti/
            return False
    if href.find('?') != -1:
        o = urlsplit_pro(href)
        if o.query != '':
            query = urllib.parse.parse_qs(o.query)
            if 'print' in query:
                return False
            # khabkrai.ru
            if 'special' in query.get('version', list()):
                return False
            # admkrsk.ru
            if 'accessability' in query:
                return False

    return True
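A hedged usage sketch with made-up hrefs, assuming check_href_elementary and its imports are in scope and that urlsplit_pro parses relative hrefs the way the standard urlsplit does; it shows which links pass the filter:

assert not check_href_elementary('mailto:info@example.ru')      # mail links are dropped
assert not check_href_elementary('#top')                        # pure fragment, same page
assert check_href_elementary('#!/open_ministry/anti/')          # hashbang AJAX link is kept
assert not check_href_elementary('/doc?print=1')                # print versions are dropped
assert not check_href_elementary('/doc?version=special')        # accessibility version is dropped
assert check_href_elementary('/declarations/2020.html')         # ordinary relative link is kept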
Example #4
 def get_region_by_url(self, name, url):
     if name not in self.name2wikidata:
         return None, None
     regions = set()
     (_, netloc1, _, _, _) = urlsplit_pro(url)
     for x in self.name2wikidata.get(name, []):
         (_, netloc2, _, _, _) = urlsplit_pro(x['website'])
         if netloc1 == netloc2:
             region_wikidata_id = os.path.basename(x['oblast'])
             region_id = self.regions.get_region_by_wikidata_id(region_wikidata_id)
             if region_id is None:
                 continue
             entry_id = os.path.basename(x['item'])
             regions.add((entry_id, region_id))
     if len(regions) == 1:
         return list(regions)[0]
     return None, None
 def print_predicted_as_external(self):
     web_sites = TDeclarationWebSiteList(logger=self.logger,
                                         offices=RUSSIA.offices_in_memory)
     for key, src_doc in self.dlrobot_human.get_all_documents():
         if src_doc.calculated_office_id is None:
             continue
         urls = set(r.get_site_url() for r in src_doc.web_references)
         if len(urls) != 1:
             continue
         src_doc_url = list(urls)[0]
         if src_doc_url == "service.nalog.ru":
             continue
         office = RUSSIA.offices_in_memory.get_office_by_id(
             src_doc.calculated_office_id)
         u: TDeclarationWebSite
         found = False
         origin_hostname = urlsplit_pro(src_doc_url).hostname
         if web_sites.is_a_special_domain(origin_hostname):
             continue
         for u in office.office_web_sites:
             if urlsplit_pro(u.url).hostname == origin_hostname:
                 found = True
                 break
         if found:
             continue
         ww = web_sites.search_url(src_doc_url)
         if ww is None:
             self.logger.error(
                 "cannot find url {} by web domain in offices.txt".format(
                     src_doc_url))
             continue
         r = {
             "sha256": key,
             "predicted_office": {
                 "id": office.office_id,
                 "name": office.name
             },
             "url_host_office": {
                 "id": ww.parent_office.office_id,
                 "name": ww.parent_office.name
             },
             "url": src_doc_url,
             "title": src_doc.get_doc_title()
         }
         print(json.dumps(r, indent=4, ensure_ascii=False))
Example #6
 def _prepare_url_before_http_request(url, method):
     THttpRequester.consider_request_policy(url, method)
     url = TUrlUtf8Encode.convert_url_to_idna(url)
     o = urlsplit_pro(url)
     path = urllib.parse.unquote(o.path)
     path = urllib.parse.quote(path)
     url = urllib.parse.urlunsplit(
         (o.scheme, o.netloc, path, o.query, o.fragment))
     return url
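The policy check and IDNA conversion above rely on project helpers (THttpRequester, TUrlUtf8Encode) not shown here; the sketch below, using only the standard library and a made-up URL, isolates the path re-quoting step, so raw and already percent-encoded characters end up in one canonical form:

import urllib.parse

def normalize_path_quoting(url):
    # unquote then re-quote the path so that raw Cyrillic letters, spaces and
    # already-escaped sequences all come out consistently percent-encoded
    o = urllib.parse.urlsplit(url)
    path = urllib.parse.quote(urllib.parse.unquote(o.path))
    return urllib.parse.urlunsplit((o.scheme, o.netloc, path, o.query, o.fragment))

# normalize_path_quoting('http://example.ru/отчёт 2020.pdf') and
# normalize_path_quoting('http://example.ru/%D0%BE%D1%82%D1%87%D1%91%D1%82 2020.pdf')
# both return 'http://example.ru/%D0%BE%D1%82%D1%87%D1%91%D1%82%202020.pdf'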
Example #7
 def set_target(self, target_url, target_title=None):
     if target_url is None or len(target_url) == 0:
         self.target_url = None
         self.target_title = None
         self.url_query = ''
         self.url_path = ''
     else:
         self.target_url = strip_viewer_prefix(target_url).strip(" \r\n\t")
         self.target_title = target_title
         o = urlsplit_pro(self.target_url)
         self.url_query = o.query
         self.url_path = o.path
 def get_other_sites_regexp_on_the_same_web_domain(self, morda_url):
     web_domain = urlsplit_pro(morda_url).hostname
     other_sites = list()
     for k in self.get_sites_by_web_domain(web_domain):
         if morda_url.find(k) == -1:
             other_sites.append("((www.)?{}(/|$))".format(k))
     if len(other_sites) == 0:
         return None
     s = "|".join(other_sites)
     self.logger.debug(
         "use regexp {} to prohibit crawling other projects".format(s))
     return re.compile(s)
 def urls_html(self):
     site_info: TDeclarationWebSite
     hrefs = list()
     for site_info in self.office_web_sites:
         p = urlsplit_pro(site_info.url)
         anchor = p.netloc + p.path
         if not site_info.can_communicate():
             href = "{} (obsolete)".format(anchor)
         else:
             href = '<a href="{}">{}</a>'.format(site_info.url, anchor)
         hrefs.append(href)
     return ";&nbsp;&nbsp;&nbsp;".join(hrefs)
Example #10
    def _get_base_url(self):
        base = self.url
        for tag in self.soup.findAll('base'):
            href = tag.attrs.get('href')
            if href is not None:
                base = href
                break
        if base.startswith('/') and not base.startswith('//'):
            o = urlsplit_pro(self.url)
            scheme_and_web_domain = urllib.parse.urlunsplit(
                (o.scheme, o.netloc, "", "", ""))
            base = THtmlParser.make_link(scheme_and_web_domain, base)

        return base
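THtmlParser.make_link is a project helper not shown here; a rough standalone equivalent of the root-relative branch, with a hypothetical function name and urllib.parse only, would look like this:

import urllib.parse

def resolve_root_relative_base(page_url, base_href):
    # a root-relative <base href="/docs/"> is glued to the page's scheme and host
    if base_href.startswith('/') and not base_href.startswith('//'):
        o = urllib.parse.urlsplit(page_url)
        return urllib.parse.urlunsplit((o.scheme, o.netloc, base_href, '', ''))
    return base_href

# resolve_root_relative_base('http://example.ru/news/page.html', '/docs/')
# -> 'http://example.ru/docs/'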
    def export_files(self):
        human_files_db = TDlrobotHumanFileDBM(self.args.dlrobot_human_json)
        if self.args.start_from_empty:
            human_files_db.create_db()
        else:
            human_files_db.open_write_mode()
        document_file_ids = set()
        for sha256, doc in human_files_db.get_all_documents():
            for ref in doc.decl_references:
                if ref.document_file_id is not None:
                    document_file_ids.add(ref.document_file_id)

        files_count = 0
        for document_file_id, document_id, file_path, link, office_id, income_year in self.get_all_file_sql_records():
            if document_file_id in document_file_ids:
                continue

            while self.pdf_conversion_client.server_is_too_busy():
                self.logger.error("wait pdf conversion_server for 5 minutes, last_pdf_conversion_queue_length={}".format(
                    self.pdf_conversion_client.last_pdf_conversion_queue_length
                ))
                time.sleep(5*60)

            web_site = urlsplit_pro(link).netloc
            if web_site.startswith('www.'):
                web_site = web_site[len('www.'):]

            if self.args.max_files_count is not None and files_count >= self.args.max_files_count:
                break
            self.logger.debug("export document_file_id={}".format(document_file_id))
            for local_file_path, declarator_url in self.download_unzip_and_send_file_source_doc_server(file_path,
                                                                                                    document_file_id):
                sha256 = build_dislosures_sha256(local_file_path)
                self.logger.debug("add {}, sha256={}".format(local_file_path, sha256))
                source_document = TSourceDocument(os.path.splitext(local_file_path)[1])
                ref = TDeclaratorReference()
                ref.document_id = document_id
                ref.document_file_id = document_file_id
                ref._site_url = web_site
                ref.office_id = self.fix_list(sha256, office_id)
                ref.income_year = income_year
                ref.document_file_url = declarator_url
                source_document.add_decl_reference(ref)
                human_files_db.update_source_document(sha256, source_document)
                files_count += 1
        self.logger.debug('added files count: {}'.format(files_count))
        human_files_db.close_db()
        self.send_new_pdfs_to_smart_parser()
 def get_url_modifications(url: str):
     o = urllib_parse_pro.urlsplit_pro(url)
     if len(o.scheme) > 0:
         protocols = [o.scheme]
     else:
         protocols = ["http", "https"]
     if o.netloc.startswith("www."):
         # the netloc already carries the "www." prefix, so do not prepend it again below
         with_www = [False]
     else:
         with_www = [True, False]
     for only_with_www in with_www:
         for protocol in protocols:
             host = o.netloc
             if only_with_www:
                 host = "www." + host
             modified_url = urllib.parse.urlunsplit(
                 (protocol, host, o.path, o.query, o.fragment))
             yield modified_url
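A hedged usage example; the domain is taken from the test_url_split example further down, which confirms that urlsplit_pro parses scheme-less URLs:

for candidate in get_url_modifications('petushki.info/test'):
    print(candidate)
# expected output: the "www." variants first, then the bare host, each over http and https:
#   http://www.petushki.info/test
#   https://www.petushki.info/test
#   http://petushki.info/test
#   https://petushki.info/test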
 def init_main_page_url_from_redirected_url(self, url, title, page_source):
     o = urllib_parse_pro.urlsplit_pro(url)
     netloc = o.netloc
     scheme = o.scheme
     if scheme == 'http' and netloc.endswith(':443'):
         self.logger.debug("coerce url {} to https".format(url))
         netloc = netloc[0:-len(':443')]
         scheme = 'https'
     self.main_page_url = urllib.parse.urlunsplit([
         scheme,
         netloc,
         o.path,
         '',  # drop the query
         ''   # drop the fragment
     ])
     self.logger.debug("main_url_page={}".format(self.main_page_url))
     self.reach_status = TWebSiteReachStatus.normal
     self.main_page_source = page_source
     self.url_nodes[self.main_page_url] = TUrlInfo(title=title)
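A small standalone sketch (made-up URL, standard urllib only) of the port-443 coercion applied above:

import urllib.parse

url = 'http://example.ru:443/main'
o = urllib.parse.urlsplit(url)
netloc, scheme = o.netloc, o.scheme
if scheme == 'http' and netloc.endswith(':443'):
    # a plain-http URL on port 443 is really an https URL
    netloc = netloc[:-len(':443')]
    scheme = 'https'
print(urllib.parse.urlunsplit([scheme, netloc, o.path, '', '']))
# -> https://example.ru/main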
Example #14
def get_web_domain(url):
    web_domain = urlsplit_pro(url).hostname
    if web_domain.startswith("www."):
        web_domain = web_domain[4:]
    return web_domain
Example #15
    def test_url_split(self):
        self.assertEqual(urlsplit_pro('http://petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('https://petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('ftp://petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('mailto://petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('http://petushki.info:99').netloc, 'petushki.info:99')

        self.assertEqual(urlsplit_pro('https:////petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('petushki.info').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('//petushki.info').netloc, 'petushki.info')

        self.assertEqual(urlsplit_pro('https:////petushki.info/test').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('petushki.info/test').netloc, 'petushki.info')
        self.assertEqual(urlsplit_pro('//petushki.info/test').netloc, 'petushki.info')

        self.assertEqual(urlsplit_pro('дагогни.рф').netloc, 'дагогни.рф')
        self.assertEqual(urlsplit_pro('дагогни.рф/test').netloc, 'дагогни.рф')
        self.assertEqual(urlsplit_pro('http://дагогни.рф/test').netloc, 'дагогни.рф')

        self.assertEqual(urlsplit_pro('https://xn--80agabx3af.xn--p1ai').netloc, 'xn--80agabx3af.xn--p1ai')
        self.assertEqual(urlsplit_pro('xn--80agabx3af.xn--p1ai').netloc, 'xn--80agabx3af.xn--p1ai')
        self.assertEqual(urlsplit_pro('xn--80agabx3af.xn--p1ai/test').netloc, 'xn--80agabx3af.xn--p1ai')
Example #16
 def site_url_to_web_domain(site_url):
     return urlsplit_pro(site_url).hostname
 def get_main_url_protocol(self):
     return str(urllib_parse_pro.urlsplit_pro(self.main_page_url).scheme)