Example #1
    def get_url_list(self, start_selenium=False):
        web_domains = list()
        if self.args.filter_by_source is not None:
            for k in self.web_sites.web_sites.values():
                if k.parent_office.source_id == self.args.filter_by_source:
                    web_domains.append(get_site_url(k.url))
        elif self.args.url_list is not None:
            web_domains = self.read_web_domains_from_file()
        else:
            # take all web domains
            web_domains = list(self.web_sites.web_sites.keys())

        domains_filtered = list(w for w in web_domains
                                if self.check_web_site_filters(w))

        self.logger.info("we are going to process {} web sites".format(
            len(domains_filtered)))

        if start_selenium:
            TDownloadEnv.FILE_CACHE_FOLDER = TDownloadEnv.FILE_CACHE_FOLDER + "_{}_{}".format(
                time.time(), os.getpid())
            self.logger.info("rm {}".format(TDownloadEnv.FILE_CACHE_FOLDER))
            TDownloadEnv.clear_cache_folder()
            project_path = "project.txt"
            TRobotProject.create_project("dummy.ru", project_path)
            with TRobotProject(
                    self.logger, project_path,
                    export_folder="result") as self.temp_dlrobot_project:
                for w in domains_filtered:
                    yield w
                os.unlink(project_path)
        else:
            for w in domains_filtered:
                yield w
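A minimal sketch of how this generator might be consumed; the surrounding
method and the process_one_site worker are hypothetical, not part of the
original class:

    def process_all_sites(self):
        # Iterate lazily: with start_selenium=True the temporary
        # TRobotProject stays open until the generator is exhausted.
        for web_domain in self.get_url_list(start_selenium=True):
            try:
                self.process_one_site(web_domain)  # hypothetical worker
            except Exception as exp:
                self.logger.error("failed on {}: {}".format(web_domain, exp))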
Example #2
 def create_departments(self):
     o: TOfficeInMemory
     TDownloadEnv.clear_cache_folder()
     project_path = "project.txt"
     TRobotProject.create_project("dummy.ru",
                                  project_path,
                                  web_sites_db=self.web_sites)
     with TRobotProject(
             self.logger, project_path,
             export_folder="result") as self.temp_dlrobot_project:
         for o in self.web_sites.offices.values():
             if o.parent_id == self.args.parent_office_id:
                 self.logger.info("ofiice id = {}, {}".format(
                     o.office_id, o.name))
                 query = self.args.query_template.format(o.name)
                 engine = random.choice(
                     [SearchEngineEnum.GOOGLE, SearchEngineEnum.YANDEX])
                 results = SearchEngine.send_request(
                     engine, query,
                     self.temp_dlrobot_project.selenium_driver)
                 if len(results) == 0:
                     msg = "cannot find results fo query {}".format(query)
                     self.logger.error(msg)
                 else:
                     new_web_site = TDeclarationWebSite(url=results[0])
                     found = False
                     for u in o.office_web_sites:
                         if u.url == new_web_site.url:
                             found = True
                             self.logger.error(
                                 "{} already exists".format(new_web_site.url))
                     if not found:
                         o.office_web_sites.append(new_web_site)
                         self.check_alive_one_url(new_web_site.url)
                 time.sleep(20)
Example #3
 def setup_project(self, morda_url):
     logger = setup_logging('prohibited')
     self.project = TRobotProject(logger,
                                  '',
                                  config=TRobotConfig(),
                                  export_folder="result",
                                  enable_search_engine=False)
     web_site = self.project.add_web_site(morda_url)
     self.robot_step = TRobotStep(web_site)
     self.env = TestDlrobotEnv("data.prohibited")
     TDownloadEnv.FILE_CACHE_FOLDER = self.env.data_folder
Example #4
 def test_unknown_site(self):
     self.project_path = os.path.join(self.env.data_folder, "project.txt")
     TRobotProject.create_project("http://unknown_site.org", self.project_path)
     dlrobot = TDlrobot(TDlrobot.parse_args(['--clear-cache-folder',  '--project', self.project_path]))
     try:
         project = dlrobot.open_project()
     except THttpRequester.RobotHttpException:
         pass
     self.assertEqual(project.web_site_snapshots[0].reach_status, TWebSiteReachStatus.abandoned)
     TDownloadEnv.CONVERSION_CLIENT.stop_conversion_thread()
     TDownloadEnv.CONVERSION_CLIENT = None
Example #5
def copy_files(args, toloka_results):
    assert args.positive_folder is not None
    assert args.negative_folder is not None
    logger = logging.getLogger("")
    with TRobotProject(args.project) as project:
        project.read_project()
        office_info: TWebSiteCrawlSnapshot
        office_info = project.web_site_snapshots[0]
        index = 0
        site_url = office_info.get_site_url()
        for export_record in office_info.exported_files:
            index += 1
            cached_file = export_record['cached_file']
            url = export_record['url']
            print()
            extension = TDownloadedFile(url).file_extension
            out_file = "{}_{}_{}{}".format(site_url.replace('/', '_'), index,
                                           int(time.time()), extension)
            tol_res = toloka_results.get(cached_file)
            if tol_res == "YES":
                folder = args.positive_folder
            elif tol_res == "NO":
                folder = args.negative_folder
            else:
                folder = None
            if folder is not None:
                out_file = os.path.join(folder, out_file)
                print("{} -> {}".format(url, out_file))
                shutil.copy(cached_file, out_file)
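copy_files expects toloka_results to map a cached file path to a "YES"/"NO"
verdict. A minimal loader sketch, assuming a two-column tab-separated verdict
file (the file layout and the function name are assumptions):

import csv

def read_toloka_results(tsv_path):
    # assumed layout: <cached_file>\t<YES|NO> per line
    verdicts = dict()
    with open(tsv_path, newline='') as inp:
        for row in csv.reader(inp, delimiter='\t'):
            if len(row) >= 2:
                verdicts[row[0]] = row[1].strip().upper()
    return verdicts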
Example #6
def create_toloka_pool(project_path, toloka_stream):
    logger = logging.getLogger("")
    with TRobotProject(logger, project_path) as project:
        project.read_project()
        office_info: TWebSiteCrawlSnapshot
        office_info = project.web_site_snapshots[0]
        toloka_stream.write(
            "INPUT:url\tINPUT:file_link\tINPUT:file_extension\tINPUT:html\n")
        ec = TExternalConverters()
        cnt = 0
        all_files = 0
        for export_record in office_info.exported_files:
            all_files += 1
            sys.stderr.write("{}/{}\n".format(all_files,
                                              len(office_info.exported_files)))
            sys.stderr.flush()
            url = export_record['url']
            cached_file = export_record['cached_file']
            extension = TDownloadedFile(url).file_extension
            temp_file = "dummy" + extension
            shutil.copy(cached_file, temp_file)
            html = ec.convert_to_html_with_soffice(temp_file)
            os.unlink(temp_file)
            if html is not None:
                html = html.replace("\t", " ").replace("\n",
                                                       " ").replace("\r", " ")
                toloka_stream.write("\t".join((url, cached_file, extension,
                                               html)) + "\n\n")
                cnt += 1
        sys.stderr.write("written {} lines of of {}".format(cnt, all_files))
Example #7
    def __init__(self, port, website_folder, regional_main_pages=None):
        # avoid the mutable default argument pitfall
        if regional_main_pages is None:
            regional_main_pages = []
        self.dlrobot = None
        self.dlrobot_project = None
        self.web_site_folder = os.path.join(os.path.dirname(__file__), website_folder)
        name = os.path.basename(website_folder)
        self.data_folder = os.path.join(os.path.dirname(__file__), "data.{}".format(name))
        self.dlrobot_result_folder = os.path.join(self.data_folder, "result")
        if os.path.exists(self.data_folder):
            shutil.rmtree(self.data_folder, ignore_errors=True)
        handler = partial(http.server.SimpleHTTPRequestHandler,
                          directory=self.web_site_folder)
        if not is_local_http_port_free(port):
            for p in TTestEnv.additional_ports:
                if is_local_http_port_free(p):
                    port = p
                    break
        assert is_local_http_port_free(port)
        self.web_site = http.server.HTTPServer(server_address=("127.0.0.1", port), RequestHandlerClass=handler)
        os.mkdir(self.data_folder)
        os.chdir(self.data_folder)
        self.project_path = os.path.join(self.data_folder, "project.txt")
        regional = list("http://127.0.0.1:{}/{}".format(port, url) for url in regional_main_pages)

        project = TRobotProject.create_project_str("http://127.0.0.1:{}".format(port),
                                                   regional_main_pages=regional,
                                                   disable_search_engine=True)
        with open(self.project_path, "w") as outp:
            outp.write(project)
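The HTTPServer built in __init__ still has to be started; a common pattern is
to run serve_forever on a daemon thread. A sketch (requires import threading;
the method names start_server and stop_server are assumptions):

    def start_server(self):
        # serve the static website folder in the background so the test
        # can crawl http://127.0.0.1:<port> from the main thread
        self.server_thread = threading.Thread(
            target=self.web_site.serve_forever, daemon=True)
        self.server_thread.start()

    def stop_server(self):
        self.web_site.shutdown()      # unblocks serve_forever
        self.web_site.server_close()  # releases the listening socket
        self.server_thread.join()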
Example #8
    def add_files_of_one_project(self, dlrobot_project):
        self.logger.debug("process {}".format(dlrobot_project))
        project_folder = os.path.join(self.args.input_dlrobot_folder,
                                      dlrobot_project)
        dlrobot_project_without_timestamp = re.sub(r'\.[0-9]+$', '',
                                                   dlrobot_project)
        project_path = os.path.join(project_folder,
                                    dlrobot_project_without_timestamp + ".txt")
        if not os.path.exists(project_path):
            self.logger.error(
                "no dlrobot project file found in {}".format(project_folder))
            return
        try:
            project = TRobotProject(self.logger,
                                    project_path,
                                    config=self.dlrobot_config,
                                    web_sites_db=self.web_sites_db)
            project.read_project(check_step_names=False)
            office_info: TWebSiteCrawlSnapshot
            office_info = project.web_site_snapshots[0]
            site_url = office_info.get_site_url()
            exported_files = dict()
            for export_record in office_info.export_env.exported_files:
                exported_files[export_record.sha256] = export_record
        except Exception as exp:
            self.logger.error("cannot read project {}, exp={}".format(
                project_path, exp))
            return

        file_info: TExportFile
        for sha256, file_info in exported_files.items():
            web_ref = TWebReference(
                url=file_info.url,
                crawl_epoch=self.args.max_ctime,
                site_url=site_url,
                declaration_year=file_info.declaration_year)
            self.add_dlrobot_file(sha256, file_info.file_extension, [web_ref])
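add_files_of_one_project is presumably called once per subfolder of
input_dlrobot_folder; a hypothetical driver loop:

    def add_files_of_all_projects(self):
        # one subfolder per crawled dlrobot project (assumed layout)
        for dlrobot_project in os.listdir(self.args.input_dlrobot_folder):
            self.add_files_of_one_project(dlrobot_project)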
Example #9
 def open_project(self):
     self.logger.debug("hostname={}".format(platform.node()))
     self.logger.debug("use {} as a cache folder".format(
         os.path.realpath(TDownloadEnv.FILE_CACHE_FOLDER)))
     with TRobotProject(self.logger, self.args.project, self.config,
                        self.args.result_folder) as project:
         self.logger.debug("total timeout = {}".format(
             self.config.get_dlrobot_total_timeout()))
         project.read_project()
         project.fetch_main_pages()
         if self.args.only_click_paths:
             project.write_export_stats()
         else:
             self.make_steps(project)
             project.write_export_stats()
             if self.args.click_features_file:
                 project.write_click_features(self.args.click_features_file)
         return project
Example #10
    def collect_links_selenium(
            self,
            start_url,
            link_func=TRobotStep.looks_like_a_declaration_link,
            is_last_step=False):

        TDownloadEnv.clear_cache_folder()
        robot_steps = [{'step_name': "declarations"}]
        with TRobotProject(
                self.logger,
                "project.txt",
                TRobotConfig(passport_steps=robot_steps),
                "result",
                enable_search_engine=False,
        ) as project:
            project.read_project()
            office_info = project.web_site_snapshots[0]
            office_info.create_export_folder()

            step_info = TRobotStep(office_info,
                                   step_name="declarations",
                                   is_last_step=is_last_step)
            if isinstance(start_url, list):
                for x in start_url:
                    step_info.pages_to_process[x] = 0
                    office_info.url_nodes[x] = TUrlInfo(title="",
                                                        step_name=None)
            else:
                office_info.url_nodes[start_url] = TUrlInfo(title="",
                                                            step_name=None)
                step_info.pages_to_process[start_url] = 0

            step_info.processed_pages = set()
            step_info.apply_function_to_links(link_func)
            links = dict()
            for url, weight in step_info.url_to_weight.items():
                u = list(urllib.parse.urlparse(url))
                u[1] = "dummy"
                links[urllib.parse.urlunparse(u)] = weight

            for url_info in office_info.url_nodes.values():
                for d in url_info.downloaded_files:
                    links[d.downloaded_file] = 1
            return links
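collect_links_selenium accepts either a single start URL or a list, and any
link predicate in place of TRobotStep.looks_like_a_declaration_link. A
hypothetical call (the URLs are illustrative):

    links = self.collect_links_selenium(
        ["http://example.com/anti-corruption", "http://example.com/docs"],
        is_last_step=True)
    for url, weight in sorted(links.items(), key=lambda i: -i[1]):
        print(url, weight)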
Example #11
 def calc_project_stats(self, logger, web_sites_db, project_folder,
                        config: TRobotConfig):
     if not self.task_ended():
         return
     try:
         path = os.path.join(project_folder, self.project_file)
         with TRobotProject(logger,
                            path,
                            config=config,
                            start_selenium=False,
                            enable_search_engine=False,
                            web_sites_db=web_sites_db) as project:
             project.read_project(check_step_names=False)
             web_site_snapshot = project.web_site_snapshots[0]
             self.result_files_count = len(
                 web_site_snapshot.export_env.exported_files)
             self.reach_status = web_site_snapshot.reach_status
     except Exception as exp:
         logger.error("Cannot read file {}: exception={}".format(
             self.project_file, str(exp)))
Example #12
class TestProhibitedLinksBase(TestCase):
    def setup_project(self, morda_url):
        logger = setup_logging('prohibited')
        self.project = TRobotProject(logger,
                                     '',
                                     config=TRobotConfig(),
                                     export_folder="result",
                                     enable_search_engine=False)
        web_site = self.project.add_web_site(morda_url)
        self.robot_step = TRobotStep(web_site)
        self.env = TestDlrobotEnv("data.prohibited")
        TDownloadEnv.FILE_CACHE_FOLDER = self.env.data_folder

    def tearDown(self):
        self.env.delete_temp_folder()

    def check_follow(self, src, trg, canon):
        if not src.startswith('http'):
            src = 'http://' + src
        if not trg.startswith('http'):
            trg = 'http://' + trg
        link_info = TLinkInfo(TClickEngine.selenium, src, trg)
        can_follow = self.robot_step.can_follow_this_link(link_info)
        self.assertEqual(canon, can_follow, msg="{} -> {}".format(src, trg))
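A concrete subclass would drive check_follow with (source, target, expected)
triples; the URLs and expected verdicts below are purely illustrative, not
taken from the real prohibited-link rules:

class TestProhibitedLinks(TestProhibitedLinksBase):
    def setUp(self):
        self.setup_project('http://example.com')

    def test_prohibited(self):
        self.check_follow('example.com', 'example.com/declarations', True)
        self.check_follow('example.com', 'instagram.com/someuser', False)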
Example #13
 def get_new_project_to_process(self, worker_host_name, worker_ip):
     site_url = self.web_sites_to_process.pop(0)
     project_file = TRemoteDlrobotCall.web_site_to_project_file(site_url)
     self.logger.info(
         "start job: {} on {} (host name={}), left jobs: {}, running jobs: {}"
         .format(project_file, worker_ip, worker_host_name,
                 len(self.web_sites_to_process),
                 self.get_running_jobs_count()))
     remote_call = TRemoteDlrobotCall(worker_ip=worker_ip,
                                      project_file=project_file,
                                      web_site=site_url)
     remote_call.worker_host_name = worker_host_name
     web_site_passport = self.web_sites_db.get_web_site(site_url)
     regional_main_pages = list()
     if web_site_passport is None:
         self.logger.error(
             "{} is not registered in the web site db, no office information is available for the site"
             .format(site_url))
     project_content_str = TRobotProject.create_project_str(
         site_url,
         regional_main_pages,
         disable_search_engine=not self.args.enable_search_engines)
     self.worker_2_running_tasks[worker_ip].append(remote_call)
     return remote_call, project_content_str.encode("utf8")
Example #14
    def send_files_to_central(self, files):
        web_domains = list()
        for file_name in files:
            web_domain = self.args.web_domain
            if file_name.endswith('.html'):
                web_domain = self.get_url_from_meta_tag(
                    file_name, self.args.web_domain)
            web_domains.append(web_domain)

        robot_project_path = TRobotProject.create_project_from_exported_files(
            self.logger, self.args.web_domain, files, web_domains)

        headers = {
            DLROBOT_HEADER_KEYS.EXIT_CODE: 0,
            DLROBOT_HEADER_KEYS.PROJECT_FILE:
            os.path.basename(robot_project_path),
            DLROBOT_HEADER_KEYS.WORKER_HOST_NAME: platform.node(),
            "Content-Type": "application/binary"
        }
        self.logger.debug(
            "send results back for {}".format(robot_project_path))
        dlrobot_results_file_name = os.path.basename(
            robot_project_path) + ".tar.gz"
        project_folder = self.args.web_domain
        with tarfile.open(dlrobot_results_file_name, "w:gz") as tar:
            for f in os.listdir(project_folder):
                tar.add(os.path.join(project_folder, f), arcname=f)

        self.logger.debug("created file {} size={}".format(
            dlrobot_results_file_name,
            os.stat(dlrobot_results_file_name).st_size))

        max_send_try_count = 3
        for try_id in range(max_send_try_count):
            conn = None
            try:
                conn = http.client.HTTPConnection(
                    self.args.server_address,
                    timeout=self.args.http_put_timeout)
                with open(dlrobot_results_file_name, "rb") as inp:
                    self.logger.debug("put file {} to {}".format(
                        dlrobot_results_file_name, self.args.server_address))
                    conn.request("PUT",
                                 dlrobot_results_file_name,
                                 inp.read(),
                                 headers=headers)
                    response = conn.getresponse()
                    conn.close()
                    conn = None
                    self.logger.debug(
                        "sent dlrobot result file {}, size={}, http_code={}".
                        format(dlrobot_results_file_name,
                               os.stat(dlrobot_results_file_name).st_size,
                               response.status))
                    break
            except Exception as exc:
                self.logger.error('worker got {}'.format(type(exc).__name__))
                self.logger.error('try_id = {} out of {}'.format(
                    try_id, max_send_try_count))
                if conn is not None:
                    conn.close()
                if try_id == max_send_try_count - 1:
                    self.logger.debug(
                        "give up, we cannot send the results back, so the results are useless"
                    )
                else:
                    sleep_seconds = (try_id + 1) * 180
                    self.logger.debug(
                        'sleep for {} seconds'.format(sleep_seconds))
                    time.sleep(sleep_seconds)

        self.logger.debug("delete file {}".format(dlrobot_results_file_name))
        os.unlink(dlrobot_results_file_name)
        shutil.rmtree(project_folder, ignore_errors=True)
        time.sleep(self.args.wait_after_each_doc * len(files))