def test_kato_file_extract(self):
    """Download the KATO source archive and verify extraction yields real files.

    Exercises the full download -> extract pipeline driven by the
    web_statgov_kato.json source config.
    """
    conf_file = os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_kato.json')
    conf = Utils.read_file(conf_file)
    # Download phase: handler is resolved from the source config.
    downloader = Downloader(conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
    archive_path = downloader.download()
    # Extraction phase: feed the downloaded archive to the extractor.
    extractor = Extractor(conf, archive_path, HandlersFactory.get_handler(Extractor.handler_name(conf)))
    extracted = extractor.extract()
    self.assertTrue(Utils.all_exists(extracted))
def test_companies_files_extract(self):
    """Download the companies archives and check that extraction produces every file.

    Same pipeline as the KATO test, but the extractor here also receives
    TEMP_PATH as the destination directory.
    """
    conf_file = os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_companies.json')
    conf = Utils.read_file(conf_file)
    # Download phase.
    downloader = Downloader(conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
    archives = downloader.download()
    # Extraction phase into TEMP_PATH.
    extractor = Extractor(conf, archives, TEMP_PATH,
                          HandlersFactory.get_handler(Extractor.handler_name(conf)))
    extracted = extractor.extract()
    self.assertTrue(Utils.all_exists(extracted))
def run(self):
    """Extract data files from the outputs of the upstream download task(s).

    Reads the source config named by ``self.sourcefile``, resolves the
    extraction handler for it, and runs the extractor over every input
    target's path, writing results under TEMP_PATH.
    """
    # str() for parity with the other task methods in this file: luigi
    # Parameters are not guaranteed to be plain strings.
    src_file = os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile))
    json_raw = Utils.read_file(src_file)
    handler = HandlersFactory.get_handler(Extractor.handler_name(json_raw))
    # self.input() yields luigi targets; the extractor wants filesystem paths.
    service = Extractor(json_raw, [lt.path for lt in self.input()], TEMP_PATH, handler)
    service.extract()
def output(self):
    """Declare a luigi LocalTarget for every file the extractor will produce.

    Builds an Extractor solely to ask it for the list of output paths
    (via ``path``) and wraps each in a LocalTarget.
    """
    conf_path = os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile))
    conf = Utils.read_file(conf_path)
    extract_handler = HandlersFactory.get_handler(Extractor.handler_name(conf))
    # NOTE(review): self.input().path assumes a single upstream target here,
    # while run() in the sibling task iterates self.input() — confirm the
    # dependency shape matches.
    extractor = Extractor(conf, self.input().path, TEMP_PATH, extract_handler)
    targets = extractor.path(conf, TEMP_PATH)
    return [luigi.LocalTarget(target) for target in targets]
def test_companies_download_by_urllist(self):
    """Resolve the companies URL list and download every referenced file."""
    conf_file = os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_companies.json')
    conf = Utils.read_file(conf_file)
    # The handler name is derived from the source config itself.
    downloader = Downloader(conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
    downloaded = downloader.download()
    self.assertTrue(Utils.all_exists(downloaded))
def test_wrong_address_download_by_url(self):
    """Download the wrong-address source file and assert it lands on disk."""
    conf_file = os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_kgdgov_wrong_address.json')
    conf = Utils.read_file(conf_file)
    downloader = Downloader(conf, HandlersFactory.get_handler(Downloader.handler_name(conf)))
    result_path = downloader.download()
    self.assertTrue(os.path.exists(result_path))
def test_kurk_parse_to_csv(self):
    """Download the KURK workbook, parse it to CSV, and check the result.

    Verifies both that the CSV output file exists and that at least one
    data row was parsed.
    """
    source_conf = Utils.read_file(
        os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_statgov_kurk.json'))
    job_conf = Utils.read_file(os.path.join(JOBS_CONFIG_DIR, 'to_csv.json'))
    # Download the raw spreadsheet.
    downloader = Downloader(source_conf,
                            HandlersFactory.get_handler(Downloader.handler_name(source_conf)))
    xls_path = downloader.download()
    # Parse it into CSV under self.data_path.
    parser = XLSParser(source_conf, job_conf, xls_path, self.data_path,
                       HandlersFactory.get_handler(XLSParser.handler_name(source_conf, job_conf)))
    target_csv = parser.path(source_conf, job_conf, self.data_path)
    parsed_rows = parser.parse()
    self.assertTrue(os.path.exists(target_csv))
    self.assertGreater(parsed_rows, 0)
def run(self):
    """Parse the configured source into CSV under WEB_DATA_PATH.

    Loads both the source config (``self.sourcefile``) and the job config
    (``self.jobfile``), resolves the matching parse handler, and runs it.
    """
    source_conf = Utils.read_file(
        os.path.join(WEB_SOURCES_CONFIG_DIR, str(self.sourcefile)))
    job_conf = Utils.read_file(
        os.path.join(JOBS_CONFIG_DIR, str(self.jobfile)))
    parse_handler = HandlersFactory.get_handler(
        Parser.handler_name(source_conf, job_conf))
    Parser(source_conf, job_conf, WEB_DATA_PATH, parse_handler).parse()
def test_pseudo_company_download_by_url(self):
    """Download the pseudo-company source file and verify it was created."""
    src_conf_path = os.path.join(WEB_SOURCES_CONFIG_DIR,
                                 'web_kgdgov_pseudo_company.json')
    # Use the shared Utils.read_file helper, consistent with every sibling
    # test here, instead of an inline open/read.
    json_raw = Utils.read_file(src_conf_path)
    handler = HandlersFactory.get_handler(Downloader.handler_name(json_raw))
    service = Downloader(json_raw, handler)
    file_path = service.download()
    self.assertTrue(os.path.exists(file_path))
def test_address_parse_to_csv(self):
    """Parse the addresses source straight to CSV and verify the file exists.

    Unlike the XLS tests, this source is parsed directly — no separate
    download step is performed here.
    """
    source_conf = Utils.read_file(
        os.path.join(WEB_SOURCES_CONFIG_DIR, 'web_datagov_addresses.json'))
    job_conf = Utils.read_file(os.path.join(JOBS_CONFIG_DIR, 'to_csv.json'))
    parser = Parser(source_conf, job_conf, self.data_path,
                    HandlersFactory.get_handler(Parser.handler_name(source_conf, job_conf)))
    target_csv = parser.path(source_conf, job_conf, self.data_path)
    parser.parse()
    self.assertTrue(os.path.exists(target_csv))
i = -1 for arch in archives: arch_obj = Utils.get_archive_object(arch) data_format = Box(json.loads(instance.srconf)).storage.data_format file_path = path.abspath(path.dirname(arch)) for file in arch_obj.namelist(): if Utils.ext(file) == data_format: i += 1 arch_obj.extract(file, file_path) old_path = path.join(file_path, file).replace('/', os.sep) shutil.move(old_path, targets[i]) return targets @staticmethod def path(srconf, dpath): files_num = Box(json.loads(srconf)).storage.data_files_num archives_num = Box(json.loads(srconf)).storage.data_archives_num name = Box(json.loads(srconf)).name data_format = Box(json.loads(srconf)).storage.data_format files = list() for i in range(archives_num): for j in range(files_num): files.append( path.join(dpath, "{}_{}_{}.{}".format(name, i, j, data_format))) return files HandlersFactory.register("extract_file", ExtractorFile) HandlersFactory.register("extract_files", ExtractorFiles)
json_raw = res.group(2) break return json.loads(json_raw) @staticmethod def write_data(fpath, data, delimiter=";"): with open(fpath, "a", encoding="utf8") as f: csv_writer = csv.writer(f, delimiter=delimiter) for row in data: csv_writer.writerow(row.values()) @staticmethod def parse(instance, fpath): data = ParseJavaScriptJsonToCSV.get_data(instance) ParseJavaScriptJsonToCSV.write_data(fpath, data) @staticmethod def path(srconf, jobconf, dpath): name = Box(json.loads(srconf)).name data_format = Box(json.loads(jobconf)).data_format return os.path.join(dpath, "{}.{}".format(name, data_format)) HandlersFactory.register("xlsparse_to_csv", ParseFromExcelToCSV) HandlersFactory.register("xlsparse_oked_to_csv", ParseOkedToCsv) HandlersFactory.register("web_api_raw_json_parse_to_csv", ParseFromAPIToCSV) HandlersFactory.register("web_html_javascript_json_parse_to_csv", ParseJavaScriptJsonToCSV) HandlersFactory.register("web_html_table_text_parse_to_csv", ParseGosRegisterToCSV)
return fpath except Exception as e: raise e @staticmethod def urls(conf): url = Box(json.loads(conf)).url base_url = Box(json.loads(conf)).base_url js = requests.get(url, verify=False).text regex = Box(json.loads(conf)).storage.html.url_regexp res = re.findall(regex, js) urls = ["{}{}".format(base_url, res) for res in res] return urls @staticmethod def path(conf): name = Box(json.loads(conf)).name ext = Box(json.loads(conf)).storage.type urls = DownloaderByUrlStatGovCompanies().urls(conf) directory = TEMP_PATH os.makedirs(directory, exist_ok=True) return [ os.path.join(directory, "{}_{}.{}".format(name, i, ext)) for i, url in enumerate(urls) ] HandlersFactory.register("download_url", DownloaderByUrlToFile) HandlersFactory.register("download_urllist_ajax", DownloaderByUrlStatGovCompanies)